diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..24ee5b1
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.13
diff --git a/Dockerfile b/Dockerfile
index b9556a5..5775c68 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,30 +1,20 @@
 # Use an official Python runtime as a parent image
-FROM python:3.13-slim
+FROM ghcr.io/astral-sh/uv:debian
 
 # Set environment variables
 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 
-# Install system dependencies
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    && rm -rf /var/lib/apt/lists/*
-
 # Set work directory
 WORKDIR /app
 
-# Install Python dependencies
-COPY requirements.txt .
-RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
-
 # Copy application files
-COPY /app /app
+COPY ./app /app
+# Copy the pyproject.toml and uv.lock because this project's structure is chopped
+COPY ./pyproject.toml ./uv.lock /app/
 
 # Make start.sh executable
 RUN chmod +x /app/start.sh
 
-# Expose the port the app runs on
-EXPOSE 1986
-
 # Specify the entrypoint script
 ENTRYPOINT ["/app/start.sh"]
diff --git a/app/app.py b/app/app.py
index b4ae13e..2a3206f 100644
--- a/app/app.py
+++ b/app/app.py
@@ -1,90 +1,63 @@
 import logging
 import os
 import uuid
-from flask import Flask, render_template, Response, request, session
-from main import yoink, process, user_streams, stream_lock
+from flask import Flask, render_template, request, session
+
+# Only used for the main.process function, but it feels right to have that over just "process"
+import main
 
 app = Flask(__name__, static_folder="website/static", template_folder="website")
 app.secret_key = os.urandom(24)  # Necessary for using sessions
 
-# Configure logging
-logging.basicConfig(
-    filename='./logs/app.log',
-    level=logging.DEBUG,
-    format='%(asctime)s %(levelname)s: %(message)s',
-    datefmt='%Y-%m-%d %H:%M:%S'
-)
 
-def create_session():
+def create_session() -> str:
     """
-    Create a new session by generating a UUID and ensuring it does not collide
+    Create a new session by generating a UUID and ensuring it does not collide
     with an existing session in the user_streams global dictionary.
-    
+
     Returns:
         str: A unique session ID.
     """
-    session_id = str(uuid.uuid4())
-    # Even though collisions are unlikely, we check for safety.
-    try:
-        if user_streams[session_id]:
-            session_id = create_session()
-    except KeyError:
-        pass
-    return session_id
+    return uuid.uuid4().hex
 
-@app.route('/')
+
+@app.get("/")
 def home():
     """
     Render the home page and initialize a session.
-    
+
     Returns:
         Response: The rendered home page with a unique session id.
     """
     session_id = create_session()
-    session['id'] = session_id
+    session["id"] = session_id
    logging.info(f"Home page accessed. Assigned initial session ID: {session_id}")
-    return render_template('index.html', session_id=session_id)
+    return render_template("index.html", session_id=session_id)
 
-@app.route('/process_url', methods=['POST'])
+
+@app.post("/process-url")
 def process_url():
     """
-    Accept a YouTube URL (from a form submission), initialize the session if necessary,
+    Accept a YouTube URL (from a form submission), initialize the session if necessary,
     and trigger the transcript retrieval and AI processing.
-    
+
     Returns:
-        Response: Text response indicating start or error message.
+        Response: The results of processing the URL.
     """
-    session_id = session.get('id')
+    session_id = session.get("id")
     if not session_id:
         session_id = create_session()
-        session['id'] = session_id
+        session["id"] = session_id
         logging.info(f"No existing session. Created new session ID: {session_id}")
-    url = request.form['url']
+    url = request.form["url"]
     logging.info(f"Received URL for processing from session {session_id}: {url}")
-    success, msg, status_code = process(url, session_id)
-    if success:
-        logging.info(f"Processing started successfully for session {session_id}.")
-        return Response("Processing started. Check /stream_output for updates.", content_type='text/plain', status=200)
-    else:
-        logging.error(f"Processing failed for session {session_id}: {msg}")
-        return Response(msg, content_type='text/plain', status=status_code)
+    # Before, I had process return stuff in the form of (success: bool, msg: str, status: int).
+    # I don't know why I was doing all that back then, it's not like it was a library or anything
+    # I planned on using in other projects...
+    return main.process(url, session_id)
 
-@app.route('/stream_output')
-def stream_output():
-    """
-    Stream the AI processing output for the current session.
-    
-    Returns:
-        Response: A streaming response with text/plain content.
-    """
-    session_id = session.get('id')
-    if not session_id or session_id not in user_streams:
-        logging.warning(f"Stream requested without a valid session ID: {session_id}")
-        return Response("No active stream for this session.", content_type='text/plain', status=400)
-    logging.info(f"Streaming output requested for session {session_id}.")
-    return Response(yoink(session_id), content_type='text/plain', status=200)
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     logging.info("Starting Flask application.")
     # Running with threaded=True to handle multiple requests concurrently.
-    app.run(debug=True, threaded=True)
\ No newline at end of file
+    app.run(debug=True, threaded=True)
diff --git a/app/main.py b/app/main.py
index 2f10315..9bead88 100644
--- a/app/main.py
+++ b/app/main.py
@@ -3,15 +3,16 @@ Main module that handles processing of YouTube transcripts and connecting to the
 Each user session has its own output stream and thread to handle the asynchronous AI response.
 """
+from http import HTTPStatus
 import re
 import threading
 import asyncio
-from asyncio import sleep
 from datetime import datetime
+from flask import Response
+from collections.abc import Generator
 import pytz
 import os
 import logging
-import uuid
 
 # Youtube Transcript imports
 import youtube_transcript_api._errors
@@ -19,11 +20,34 @@ from youtube_transcript_api import YouTubeTranscriptApi
 from youtube_transcript_api.formatters import TextFormatter
 
 # OpenAI API imports
-from openai import AssistantEventHandler
 from openai import OpenAI
 from dotenv import load_dotenv
 
-load_dotenv()
+
+# Configure logging
+try:
+    logging.basicConfig(
+        filename="./logs/main.log",
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s: %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+# FIX: Bruh what is this :joy:
+except FileNotFoundError as e:
+    os.makedirs("./logs", exist_ok=True)
+    logging.basicConfig(
+        filename="./logs/main.log",
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s: %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    logging.info(f"No main.log file was found ({e}), so one was created.")
+
+if not load_dotenv():
+    logging.fatal(
+        "Error loading dotenv, your goose is very likely to be cooked (no OpenAI API Key </3)"
+    )
 
 # Global dict for per-user session streams.
 user_streams = {}
@@ -33,270 +57,135 @@ stream_lock = threading.Lock()
 # For running async code in non-async functions.
 awaiter = asyncio.run
 
-# Configure logging
-try:
-    logging.basicConfig(
-        filename='./logs/main.log',
-        level=logging.INFO,
-        format='%(asctime)s %(levelname)s: %(message)s',
-        datefmt='%Y-%m-%d %H:%M:%S'
-    )
-except FileNotFoundError as e:
-    with open("./logs/main.log", "x"):
-        pass
-    logging.basicConfig(
-        filename='./logs/main.log',
-        level=logging.INFO,
-        format='%(asctime)s %(levelname)s: %(message)s',
-        datefmt='%Y-%m-%d %H:%M:%S'
-    )
-    logging.info(f"No main.log file was found ({e}), so one was created.")
-
-class StreamOutput:
-    """
-    Class to encapsulate a session's streaming output.
-
-    Attributes:
-        delta (str): Last delta update.
-        response (str): Cumulative response from the AI.
-        done (bool): Flag indicating if streaming is complete.
-        buffer (list): List of output delta strings pending streaming.
-    """
-    def __init__(self):
-        self.delta: str = ""
-        self.response: str = ""
-        self.done: bool = False
-        self.buffer: list = []
-
-    def reset(self):
-        """
-        Reset the stream output to its initial state.
-        """
-        self.delta = ""
-        self.response = ""
-        self.done = False
-        self.buffer = []
-
-    def send_delta(self, delta):
-        """
-        Process a new delta string. This method is a synchronous wrapper that calls the async
-        method process_delta.
-
-        Args:
-            delta (str): The delta string to process.
-        """
-        awaiter(self.process_delta(delta))
-
-    async def process_delta(self, delta):
-        """
-        Process a new delta chunk asynchronously to update buffering.
-
-        Args:
-            delta (str): The delta portion of the response.
-        """
-        self.delta = delta
-        self.response += delta
-
-        def get_index(lst):
-            return 0 if not lst else len(lst) - 1
-
-        if self.buffer:
-            try:
-                if self.delta != self.buffer[get_index(self.buffer)]:
-                    self.buffer.append(delta)
-            except IndexError as index_error:
-                logging.error(f"Caught IndexError: {str(index_error)}")
-                self.buffer.append(delta)
-        else:
-            self.buffer.append(delta)
-        return
 
 # OpenAI Client configuration
 client = OpenAI(
-    organization='org-7ANUFsqOVIXLLNju8Rvmxu3h',
-    project="proj_NGz8Kux8CSka7DRJucAlDCz6",
-    api_key=os.getenv("OPENAI_API_KEY")
+    organization=os.getenv("OPENAI_ORG"),
+    project=os.getenv("OPENAI_PROJ"),
+    api_key=os.getenv("OPENAI_API_KEY"),
 )
 
+prompt_env = os.getenv("OPENAI_PROMPT")
+# No .unwrap or .expect function </3
+if prompt_env is None or not prompt_env.strip():
+    logging.fatal(
+        "No ENV var set for OPENAI_PROMPT, unable to submit instructions to AI."
+    )
+    raise ValueError("The OPENAI_PROMPT environment variable is not set")
 
-asst_screw_bardo_id = "asst_JGFaX6uOIotqy5mIJnu3Yyp7"  # Assistant ID for processing
+prompt_id = prompt_env
 
-class EventHandler(AssistantEventHandler):
-    """
-    Event handler for processing OpenAI assistant events.
-
-    Attributes:
-        output_stream (StreamOutput): The output stream to write updates to.
-    """
-    def __init__(self, output_stream: StreamOutput):
-        """
-        Initialize the event handler with a specific output stream.
-
-        Args:
-            output_stream (StreamOutput): The session specific stream output instance.
-        """
-        super().__init__()
-        self.output_stream = output_stream
-
-    def on_text_created(self, text) -> None:
-        """
-        Event triggered when text is first created.
-
-        Args:
-            text (str): The initial response text.
-        """
-        self.output_stream.send_delta("Response Received:\n\nScrew-Bardo:\n\n")
-        logging.info("Text created event handled.")
+def create_and_stream(transcript: str) -> Generator[str, None, str]:
+    stream = client.responses.create(
+        model="gpt-4.1-mini",
+        prompt={
+            "id": prompt_id,
+            "version": "1",
+        },
+        input=[{"role": "user", "content": transcript}],
+        stream=True,
+    )
+    for event in stream:
+        # behold, one of the most bloated match statements ever.
+        # because of my unneeded comments, of course
+        # wouldn't have it any other way
+        match event.type:
+            case "response.created":
+                # TODO: Should the user really see this?
+                # Logging is fine, but there has to be some kind of identifier for responses, right?
+                logging.info("Stream created for response.")
+                yield "Transcript submitted to AI.\n\n"
+            case "response.output_text.delta":
+                # This is where the cash money money cash is.
+                # Could put a diabolical debug statement here
+                yield event.delta
+            case "response.output_text.done":
+                # TODO: Again, should the user really see this?
+                # Newsflash: they don't, since it's not yielded!
+                logging.info("Stream completed.")
+                return "\nAI response end."
+            case "error":
+                # HACK: In order to abide by the type checking, since I don't know how it'll handle errors.
+                # Since the loop is handled by Flask and not me, idk what it'll do with that iterator.
+                # No, I'm not writing another Generator for it.
+                err_msg = event.message
+                logging.error(f"Error while streaming: {err_msg}")
+                return str(ValueError(err_msg))
+            # NOTE: For debug, really.
+            # There are many events that I likely don't care about and would bloat a log
+            case _:
+                logging.warning(
+                    f"Unhandled event type: {event.type}\nEvent contents: {event}"
+                )
+                continue
+    # TODO: Decide the severity
+    logging.critical(
+        "Generator returned early, likely an error with the stream that wasn't reported"
+    )
+    # HACK: Same deal as the "error" case.
+    return str(
+        ValueError(
+            "OpenAI never reported response done, so response may be incomplete."
+        )
+    )
 
-    def on_text_delta(self, delta, snapshot):
-        """
-        Event triggered when a new text delta is available.
-
-        Args:
-            delta (Any): Object that contains the new delta information.
-            snapshot (Any): A snapshot of the current output (if applicable).
-        """
-        self.output_stream.send_delta(delta.value)
-        logging.debug(f"Text delta received: {delta.value}")
-
-    def on_tool_call_created(self, tool_call):
-        """
-        Handle the case when the assistant attempts to call a tool.
-        Raises an exception as this behavior is unexpected.
-
-        Args:
-            tool_call (Any): The tool call info.
-
-        Raises:
-            Exception: Always, since tool calls are not allowed.
-        """
-        error_msg = "Assistant shouldn't be calling tools."
-        logging.error(error_msg)
-        raise Exception(error_msg)
-
-def create_and_stream(transcript, session_id):
-    """
-    Create a new thread that runs the OpenAI stream for a given session and transcript.
-
-    Args:
-        transcript (str): The transcript from the YouTube video.
-        session_id (str): The unique session identifier.
-    """
-    logging.info(f"Starting OpenAI stream thread for session {session_id}.")
-    event_handler = EventHandler(user_streams[session_id]['output_stream'])
-    try:
-        with client.beta.threads.create_and_run_stream(
-            assistant_id=asst_screw_bardo_id,
-            thread={
-                "messages": [{"role": "user", "content": transcript}]
-            },
-            event_handler=event_handler
-        ) as stream:
-            stream.until_done()
-        with stream_lock:
-            user_streams[session_id]['output_stream'].done = True
-        logging.info(f"OpenAI stream completed for session {session_id}.")
-    except Exception as e:
-        logging.exception(f"Exception occurred during create_and_stream for session {session_id}.")
-
-def yoink(session_id):
-    """
-    Generator that yields streaming output for a session.
-
-    This function starts the AI response thread, then continuously yields data from the session's output buffer
-    until the response is marked as done.
-
-    Args:
-        session_id (str): The unique session identifier.
-
-    Yields:
-        bytes: Chunks of the AI generated response.
-    """
-    logging.info(f"Starting stream for session {session_id}...")
-    with stream_lock:
-        user_data = user_streams.get(session_id)
-        if not user_data:
-            logging.critical(f"User data not found for session id {session_id}?")
-            return
-        output_stream: StreamOutput = user_data.get('output_stream')
-        thread: threading.Thread = user_data.get('thread')
-        thread.start()
-    while True:
-        if not output_stream or not thread:
-            logging.error(f"No output stream/thread for session {session_id}.")
-            break
-        # Stop streaming when done and there is no pending buffered output.
-        if output_stream.done and not output_stream.buffer:
-            break
-        try:
-            if output_stream.buffer:
-                delta = output_stream.buffer.pop(0)
-                yield bytes(delta, encoding="utf-8")
-            else:
-                # A short sleep before looping again
-                asyncio.run(sleep(0.018))
-        except Exception as e:
-            logging.exception(f"Exception occurred during streaming for session {session_id}: {e}")
-            break
-    logging.info(f"Stream completed successfully for session {session_id}.")
-    logging.info(f"Completed Assistant Response for session {session_id}:\n{output_stream.response}")
-    with stream_lock:
-        thread.join()
-        # Clean up the session data once done.
-        del user_streams[session_id]
-    logging.info(f"Stream thread joined and resources cleaned up for session {session_id}.")
 
-def process(url, session_id):
+def process(url: str, session_id: str) -> Response:
     """
     Process a YouTube URL: parse the video id, retrieve its transcript, and prepare the session
     for AI processing.
-    
+
     Args:
         url (str): The YouTube URL provided by the user.
         session_id (str): The unique session identifier.
-    
+
     Returns:
-        tuple: (success (bool), message (str or None), status_code (int or None))
+        Response: The proper HTTP response based on what goes on in this here backend.
     """
-    current_time = datetime.now(pytz.timezone('America/New_York')).strftime('%Y-%m-%d %H:%M:%S')
+    # Current time for logging, I assume
+    current_time = datetime.now(pytz.timezone("America/New_York")).strftime(
+        "%Y-%m-%d %H:%M:%S"
+    )
+    # hey wadda ya know
     logging.info(f"New Entry at {current_time} for session {session_id}")
     logging.info(f"URL: {url}")
+    # Parse the video id out of the user-submitted url
     video_id = get_video_id(url)
+    # If there is no video id
     if not video_id:
         logging.warning(f"Could not parse video id from URL: {url}")
-        return (False, "Couldn't parse video ID from URL. (Are you sure you entered a valid YouTube.com or YouTu.be URL?)", 400)
+        return Response(
+            "Couldn't parse video ID from URL. (Are you sure you entered a valid YouTube.com or YouTu.be URL?)",
+            HTTPStatus.BAD_REQUEST,
+        )
     logging.info(f"Parsed Video ID: {video_id}")
     transcript = get_auto_transcript(video_id)
     if not transcript:
-        logging.error(f"Error: could not retrieve transcript for session {session_id}. Assistant won't be called.")
-        return (False, "Successfully parsed video ID from URL, however the transcript was disabled by the video owner or invalid.", 200)
-
-    # Initialize session data for streaming.
-    user_streams[session_id] = {
-        'output_stream': None,
-        'thread': None
-    }
-    with stream_lock:
-        user_streams[session_id]['output_stream'] = StreamOutput()
-        thread = threading.Thread(
-            name=f"create_stream_{session_id}",
-            target=create_and_stream,
-            args=(transcript, session_id)
+        logging.error(
+            f"Error: could not retrieve transcript for session {session_id}. Assistant won't be called."
+        )
+        return Response(
+            "Successfully parsed video ID from URL, however the transcript was disabled by the video owner or invalid.",
+            HTTPStatus.INTERNAL_SERVER_ERROR,
         )
-        user_streams[session_id]['thread'] = thread
-    logging.info(f"Stream preparation complete for session {session_id}, sending reply.")
-    return (True, None, None)
 
-def get_video_id(url):
+    # Hello
+    return Response(create_and_stream(transcript), HTTPStatus.OK)
+
+
+def get_video_id(url: str):
     """
     Extract the YouTube video ID from a URL.
-    
+
     Args:
         url (str): The YouTube URL.
-    
+
     Returns:
         str or None: The video ID if found, otherwise None.
     """
-    youtu_be = r'(?<=youtu.be/)([A-Za-z0-9_-]{11})'
-    youtube_com = r'(?<=youtube\.com\/watch\?v=)([A-Za-z0-9_-]{11})'
+    # I was going to add trimming but I think the JavaScript does it,
+    # and I hate JavaScript too much to go look for it
+    youtu_be = r"(?<=youtu\.be/)([A-Za-z0-9_-]{11})"
+    youtube_com = r"(?<=youtube\.com\/watch\?v=)([A-Za-z0-9_-]{11})"
     id_match = re.search(youtu_be, url)
     if not id_match:
         id_match = re.search(youtube_com, url)
@@ -305,19 +194,21 @@ def get_video_id(url):
         return None
     return id_match.group(1)
 
-def get_auto_transcript(video_id):
+
+def get_auto_transcript(video_id: str):
     """
     Retrieve and format the transcript from a YouTube video.
-    
+
     Args:
         video_id (str): The YouTube video identifier.
-    
+
     Returns:
         str or None: The formatted transcript if successful; otherwise None.
     """
     trans_api_errors = youtube_transcript_api._errors
     try:
-        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'], proxies=None, cookies=None, preserve_formatting=False)
+        ytt_api = YouTubeTranscriptApi()
+        transcript = ytt_api.fetch(video_id)
     except trans_api_errors.TranscriptsDisabled as e:
         logging.exception(f"Exception while fetching transcript: {e}")
         return None
@@ -326,6 +217,7 @@
     logging.info("Transcript successfully retrieved and formatted.")
     return txt_transcript
 
-# Initialize a global output_stream just for main module logging (not used for per-session streaming).
-output_stream = StreamOutput()
-logging.info(f"Main initialized at {datetime.now(pytz.timezone('America/New_York')).strftime('%Y-%m-%d %H:%M:%S')}. Application starting.")
\ No newline at end of file
+
+logging.info(
+    f"Main initialized at {datetime.now(pytz.timezone('America/New_York')).strftime('%Y-%m-%d %H:%M:%S')}. Application starting."
+)
diff --git a/app/start.sh b/app/start.sh
old mode 100644
new mode 100755
index 44ebc81..5eee2eb
--- a/app/start.sh
+++ b/app/start.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-exec gunicorn -b 0.0.0.0:1986 -w 4 --thread 2 --log-level debug app:app --timeout 120 --worker-class gthread --access-logfile - --error-logfile - --capture-output
\ No newline at end of file
+exec uv run gunicorn -b 0.0.0.0:1986 -w 4 --threads 2 --log-level debug app:app --timeout 120 --worker-class gthread --access-logfile - --error-logfile - --capture-output
diff --git a/app/website/index.html b/app/website/index.html
index 7f3185a..ebf322e 100644
--- a/app/website/index.html
+++ b/app/website/index.html
@@ -1,32 +1,30 @@
-Response will appear here.
+Response will appear here.
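
For reference, a minimal .env sketch matching the os.getenv lookups this diff introduces (the variable names come from the diff itself; the values below are placeholders, not real credentials):

OPENAI_API_KEY=sk-...
OPENAI_ORG=org-...
OPENAI_PROJ=proj_...
OPENAI_PROMPT=pmpt_...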