From 1fd6711da023e44133189f146a6f3a0f5a2ed7ac Mon Sep 17 00:00:00 2001 From: foreverpyrite <51493121+ForeverPyrite@users.noreply.github.com> Date: Mon, 3 Nov 2025 22:43:15 -0600 Subject: [PATCH] Modernized and simplified the app. This wasn't a crazy rewrite or anything, I just updated it to the new YouTube Transcript and OpenAI API's, as well as super simplifying the code. On top of that, it now works single threaded, just using multiple gunicorn threads for concurrency. It's a lot simplier and cleaner, although not up to my current standards. --- .python-version | 1 + Dockerfile | 18 +- app/app.py | 81 ++--- app/main.py | 376 ++++++++--------------- app/start.sh | 2 +- app/website/index.html | 52 ++-- app/website/static/script.js | 115 ++++--- docker-compose.yml | 10 +- pyproject.toml | 29 ++ requirements.txt | Bin 1214 -> 0 bytes start.sh | 3 - uv.lock | 572 +++++++++++++++++++++++++++++++++++ 12 files changed, 850 insertions(+), 409 deletions(-) create mode 100644 .python-version mode change 100644 => 100755 app/start.sh create mode 100644 pyproject.toml delete mode 100644 requirements.txt delete mode 100644 start.sh create mode 100644 uv.lock diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..24ee5b1 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.13 diff --git a/Dockerfile b/Dockerfile index b9556a5..5775c68 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,30 +1,20 @@ # Use an official Python runtime as a parent image -FROM python:3.13-slim +FROM ghcr.io/astral-sh/uv:debian # Set environment variables ENV PYTHONDONTWRITEBYTECODE=1 ENV PYTHONUNBUFFERED=1 -# Install system dependencies -RUN apt-get update && apt-get install -y \ - build-essential \ - && rm -rf /var/lib/apt/lists/* - # Set work directory WORKDIR /app -# Install Python dependencies -COPY requirements.txt . -RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt - # Copy application files -COPY /app /app +COPY ./app /app +# Copy the pyproject.toml and uv.lock because this project's structure is chopped +COPY ./pyproject.toml ./uv.lock /app # Make start.sh executable RUN chmod +x /app/start.sh -# Expose the port the app runs on -EXPOSE 1986 - # Specify the entrypoint script ENTRYPOINT ["/app/start.sh"] diff --git a/app/app.py b/app/app.py index b4ae13e..2a3206f 100644 --- a/app/app.py +++ b/app/app.py @@ -1,90 +1,63 @@ import logging import os import uuid -from flask import Flask, render_template, Response, request, session -from main import yoink, process, user_streams, stream_lock +from flask import Flask, render_template, request, session + +# Only used for the main.process function, but it feels right to have that over just "process" +import main app = Flask(__name__, static_folder="website/static", template_folder="website") app.secret_key = os.urandom(24) # Necessary for using sessions -# Configure logging -logging.basicConfig( - filename='./logs/app.log', - level=logging.DEBUG, - format='%(asctime)s %(levelname)s: %(message)s', - datefmt='%Y-%m-%d %H:%M:%S' -) -def create_session(): +def create_session() -> str: """ - Create a new session by generating a UUID and ensuring it does not collide + Create a new session by generating a UUID and ensuring it does not collide with an existing session in the user_streams global dictionary. - + Returns: str: A unique session ID. """ - session_id = str(uuid.uuid4()) - # Even though collisions are unlikely, we check for safety. 
- try: - if user_streams[session_id]: - session_id = create_session() - except KeyError: - pass - return session_id + return uuid.uuid4().hex -@app.route('/') + +@app.get("/") def home(): """ Render the home page and initialize a session. - + Returns: Response: The rendered home page with a unique session id. """ session_id = create_session() - session['id'] = session_id + session["id"] = session_id logging.info(f"Home page accessed. Assigned initial session ID: {session_id}") - return render_template('index.html', session_id=session_id) + return render_template("index.html", session_id=session_id) -@app.route('/process_url', methods=['POST']) + +@app.post("/process-url") def process_url(): """ - Accept a YouTube URL (from a form submission), initialize the session if necessary, + Accept a YouTube URL (from a form submission), initialize the session if necessary, and trigger the transcript retrieval and AI processing. - + Returns: - Response: Text response indicating start or error message. + Response: The results of processing the url. """ - session_id = session.get('id') + session_id = session.get("id") if not session_id: session_id = create_session() - session['id'] = session_id + session["id"] = session_id logging.info(f"No existing session. Created new session ID: {session_id}") - url = request.form['url'] + url = request.form["url"] logging.info(f"Received URL for processing from session {session_id}: {url}") - success, msg, status_code = process(url, session_id) - if success: - logging.info(f"Processing started successfully for session {session_id}.") - return Response("Processing started. Check /stream_output for updates.", content_type='text/plain', status=200) - else: - logging.error(f"Processing failed for session {session_id}: {msg}") - return Response(msg, content_type='text/plain', status=status_code) + # Before I had process return stuff in the form of (success: bool, msg: str, status: int) + # I don't know why I was doing all that back then, it's not like it was a library or anything + # I planned on using on other projects... + return main.process(url, session_id) -@app.route('/stream_output') -def stream_output(): - """ - Stream the AI processing output for the current session. - - Returns: - Response: A streaming response with text/plain content. - """ - session_id = session.get('id') - if not session_id or session_id not in user_streams: - logging.warning(f"Stream requested without a valid session ID: {session_id}") - return Response("No active stream for this session.", content_type='text/plain', status=400) - logging.info(f"Streaming output requested for session {session_id}.") - return Response(yoink(session_id), content_type='text/plain', status=200) -if __name__ == '__main__': +if __name__ == "__main__": logging.info("Starting Flask application.") # Running with threaded=True to handle multiple requests concurrently. - app.run(debug=True, threaded=True) \ No newline at end of file + app.run(debug=True, threaded=True) diff --git a/app/main.py b/app/main.py index 2f10315..9bead88 100644 --- a/app/main.py +++ b/app/main.py @@ -3,15 +3,16 @@ Main module that handles processing of YouTube transcripts and connecting to the Each user session has its own output stream and thread to handle the asynchronous AI response. 
""" +from http import HTTPStatus import re import threading import asyncio -from asyncio import sleep from datetime import datetime +from flask import Response +from collections.abc import Generator import pytz import os import logging -import uuid # Youtube Transcript imports import youtube_transcript_api._errors @@ -19,11 +20,34 @@ from youtube_transcript_api import YouTubeTranscriptApi from youtube_transcript_api.formatters import TextFormatter # OpenAI API imports -from openai import AssistantEventHandler from openai import OpenAI from dotenv import load_dotenv -load_dotenv() + +# Configure logging +try: + logging.basicConfig( + filename="./logs/main.log", + level=logging.INFO, + format="%(asctime)s %(levelname)s: %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) +# FIX: Bruh what is this :joy: +except FileNotFoundError as e: + with open("./logs/main.log", "x"): + pass + logging.basicConfig( + filename="./logs/main.log", + level=logging.INFO, + format="%(asctime)s %(levelname)s: %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + logging.info(f"No main.log file was found ({e}), so one was created.") + +if not load_dotenv(): + logging.fatal( + "Error loading dotenv, your goose is very likely to be cooked (no OpenAI API Key None: - """ - Event triggered when text is first created. - - Args: - text (str): The initial response text. - """ - self.output_stream.send_delta("Response Received:\n\nScrew-Bardo:\n\n") - logging.info("Text created event handled.") +def create_and_stream(transcript: str) -> Generator[str, None, str]: + stream = client.responses.create( + model="gpt-4.1-mini", + prompt={ + "id": "pmpt_69097600a25c8190ba77a32457973dcd087a89928ce72d22", + "version": "1", + }, + input=[{"role": "user", "content": transcript}], + stream=True, + ) + for event in stream: + # behold, one of the most bloated match statements ever. + # because of my uneeded comments, of course + # wouldn't have it any other way + match event.type: + case "response.created": + # TODO: Should the user really see this? + # Logging is fine, but there has to be some kind of idenfitier for responses, right? + logging.info("Stream {stream_id} created for response.") + yield "Transcript submitted to AI.\n\n" + case "response.output_text.delta": + # This is where the cash money money cash is + # Could put a diobolical debug statement here + yield event.delta + case "response.output_text.done": + # TODO: Again, should the user really see this? + # Newsflash: they don't since it's not yeilded! + logging.info("Stream {stream_id} completed") + return "\nAI response end." + case "error": + # HACK: In order to abide by the type checking, since I don't know how it'll handle errors + # Since the loop is handled by Flask and not me idk what it'll do with that iterator + # No I'm not writing another Generator for it. + err_msg = event.message + logging.error(f"Error while streaming: {err_msg}") + return str(ValueError(err_msg)) + # NOTE: For debug, really. + # There are many events that I likely don't care about and would bloat a log + case _: + logging.warning( + f"Unhandled event type: {event.type}\nEvent contents: {event}" + ) + continue + # TODO: Decide the severity + logging.critical( + "Generator returned early, likely an error with the stream that wasn't reported" + ) + # HACK: Same deal as the "error" case. + return str( + ValueError( + "OpenAI never reported response done, so response may be incomplete." + ) + ) - def on_text_delta(self, delta, snapshot): - """ - Event triggered when a new text delta is available. 
- - Args: - delta (Any): Object that contains the new delta information. - snapshot (Any): A snapshot of the current output (if applicable). - """ - self.output_stream.send_delta(delta.value) - logging.debug(f"Text delta received: {delta.value}") - def on_tool_call_created(self, tool_call): - """ - Handle the case when the assistant attempts to call a tool. - Raises an exception as this behavior is unexpected. - - Args: - tool_call (Any): The tool call info. - - Raises: - Exception: Always, since tool calls are not allowed. - """ - error_msg = "Assistant shouldn't be calling tools." - logging.error(error_msg) - raise Exception(error_msg) - -def create_and_stream(transcript, session_id): - """ - Create a new thread that runs the OpenAI stream for a given session and transcript. - - Args: - transcript (str): The transcript from the YouTube video. - session_id (str): The unique session identifier. - """ - logging.info(f"Starting OpenAI stream thread for session {session_id}.") - event_handler = EventHandler(user_streams[session_id]['output_stream']) - try: - with client.beta.threads.create_and_run_stream( - assistant_id=asst_screw_bardo_id, - thread={ - "messages": [{"role": "user", "content": transcript}] - }, - event_handler=event_handler - ) as stream: - stream.until_done() - with stream_lock: - user_streams[session_id]['output_stream'].done = True - logging.info(f"OpenAI stream completed for session {session_id}.") - except Exception as e: - logging.exception(f"Exception occurred during create_and_stream for session {session_id}.") - -def yoink(session_id): - """ - Generator that yields streaming output for a session. - - This function starts the AI response thread, then continuously yields data from the session's output buffer - until the response is marked as done. - - Args: - session_id (str): The unique session identifier. - - Yields: - bytes: Chunks of the AI generated response. - """ - logging.info(f"Starting stream for session {session_id}...") - with stream_lock: - user_data = user_streams.get(session_id) - if not user_data: - logging.critical(f"User data not found for session id {session_id}?") - return - output_stream: StreamOutput = user_data.get('output_stream') - thread: threading.Thread = user_data.get('thread') - thread.start() - while True: - if not output_stream or not thread: - logging.error(f"No output stream/thread for session {session_id}.") - break - # Stop streaming when done and there is no pending buffered output. - if output_stream.done and not output_stream.buffer: - break - try: - if output_stream.buffer: - delta = output_stream.buffer.pop(0) - yield bytes(delta, encoding="utf-8") - else: - # A short sleep before looping again - asyncio.run(sleep(0.018)) - except Exception as e: - logging.exception(f"Exception occurred during streaming for session {session_id}: {e}") - break - logging.info(f"Stream completed successfully for session {session_id}.") - logging.info(f"Completed Assistant Response for session {session_id}:\n{output_stream.response}") - with stream_lock: - thread.join() - # Clean up the session data once done. - del user_streams[session_id] - logging.info(f"Stream thread joined and resources cleaned up for session {session_id}.") - -def process(url, session_id): +def process(url: str, session_id: str) -> Response: """ Process a YouTube URL: parse the video id, retrieve its transcript, and prepare the session for AI processing. - + Args: url (str): The YouTube URL provided by the user. session_id (str): The unique session identifier. 
- + Returns: - tuple: (success (bool), message (str or None), status_code (int or None)) + Response: The proper HTTP response based off what goes on in this here backend """ - current_time = datetime.now(pytz.timezone('America/New_York')).strftime('%Y-%m-%d %H:%M:%S') + # Current time for logging I assume + current_time = datetime.now(pytz.timezone("America/New_York")).strftime( + "%Y-%m-%d %H:%M:%S" + ) + # hey wadda ya know logging.info(f"New Entry at {current_time} for session {session_id}") logging.info(f"URL: {url}") + # Parse video id out of user submitted url video_id = get_video_id(url) + # If there is no video id if not video_id: logging.warning(f"Could not parse video id from URL: {url}") - return (False, "Couldn't parse video ID from URL. (Are you sure you entered a valid YouTube.com or YouTu.be URL?)", 400) + return Response( + "Couldn't parse video ID from URL. (Are you sure you entered a valid YouTube.com or YouTu.be URL?)", + HTTPStatus.BAD_REQUEST, + ) logging.info(f"Parsed Video ID: {video_id}") transcript = get_auto_transcript(video_id) if not transcript: - logging.error(f"Error: could not retrieve transcript for session {session_id}. Assistant won't be called.") - return (False, "Successfully parsed video ID from URL, however the transcript was disabled by the video owner or invalid.", 200) - - # Initialize session data for streaming. - user_streams[session_id] = { - 'output_stream': None, - 'thread': None - } - with stream_lock: - user_streams[session_id]['output_stream'] = StreamOutput() - thread = threading.Thread( - name=f"create_stream_{session_id}", - target=create_and_stream, - args=(transcript, session_id) + logging.error( + f"Error: could not retrieve transcript for session {session_id}. Assistant won't be called." + ) + return Response( + "Successfully parsed video ID from URL, however the transcript was disabled by the video owner or invalid.", + HTTPStatus.INTERNAL_SERVER_ERROR, ) - user_streams[session_id]['thread'] = thread - logging.info(f"Stream preparation complete for session {session_id}, sending reply.") - return (True, None, None) -def get_video_id(url): + # Hello + return Response(create_and_stream(transcript), HTTPStatus.OK) + + +def get_video_id(url: str): """ Extract the YouTube video ID from a URL. - + Args: url (str): The YouTube URL. - + Returns: str or None: The video ID if found, otherwise None. """ - youtu_be = r'(?<=youtu.be/)([A-Za-z0-9_-]{11})' - youtube_com = r'(?<=youtube\.com\/watch\?v=)([A-Za-z0-9_-]{11})' + # I was going to add trimming but I think the JavaScript does it + # and I hate JavaScript too much to go look for it + youtu_be = r"(?<=youtu.be/)([A-Za-z0-9_-]{11})" + youtube_com = r"(?<=youtube\.com\/watch\?v=)([A-Za-z0-9_-]{11})" id_match = re.search(youtu_be, url) if not id_match: id_match = re.search(youtube_com, url) @@ -305,19 +194,21 @@ def get_video_id(url): return None return id_match.group(1) -def get_auto_transcript(video_id): + +def get_auto_transcript(video_id: str): """ Retrieve and format the transcript from a YouTube video. - + Args: video_id (str): The YouTube video identifier. - + Returns: str or None: The formatted transcript if successful; otherwise None. 
""" trans_api_errors = youtube_transcript_api._errors try: - transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'], proxies=None, cookies=None, preserve_formatting=False) + ytt_api = YouTubeTranscriptApi() + transcript = ytt_api.fetch(video_id) except trans_api_errors.TranscriptsDisabled as e: logging.exception(f"Exception while fetching transcript: {e}") return None @@ -326,6 +217,7 @@ def get_auto_transcript(video_id): logging.info("Transcript successfully retrieved and formatted.") return txt_transcript -# Initialize a global output_stream just for main module logging (not used for per-session streaming). -output_stream = StreamOutput() -logging.info(f"Main initialized at {datetime.now(pytz.timezone('America/New_York')).strftime('%Y-%m-%d %H:%M:%S')}. Application starting.") \ No newline at end of file + +logging.info( + f"Main initialized at {datetime.now(pytz.timezone('America/New_York')).strftime('%Y-%m-%d %H:%M:%S')}. Application starting." +) diff --git a/app/start.sh b/app/start.sh old mode 100644 new mode 100755 index 44ebc81..5eee2eb --- a/app/start.sh +++ b/app/start.sh @@ -1,2 +1,2 @@ #!/bin/bash -exec gunicorn -b 0.0.0.0:1986 -w 4 --thread 2 --log-level debug app:app --timeout 120 --worker-class gthread --access-logfile - --error-logfile - --capture-output \ No newline at end of file +exec uv run gunicorn -b 0.0.0.0:1986 -w 4 --thread 2 --log-level debug app:app --timeout 120 --worker-class gthread --access-logfile - --error-logfile - --capture-output diff --git a/app/website/index.html b/app/website/index.html index 7f3185a..ebf322e 100644 --- a/app/website/index.html +++ b/app/website/index.html @@ -1,32 +1,30 @@ - - - - Screw You Bardo - - - - - + + + + Screw You Bardo + + + + + - -
[The markup for the rest of this index.html hunk did not survive text extraction. What is recoverable: the page title "Screw You Bardo", the "Response will appear here." response area, and the elements script.js relies on (the url-form form, the url_box input, the submit button, and the response-section/response-area containers). The old page wired the form submission through HTMX, while the new page drops that and leaves submission to the fetch-based script.js in the next hunk.]
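Aside on the backend change above, before the script.js diff: process() now hands Flask a Response built directly around the create_and_stream() generator, so the body is sent to the browser chunk by chunk and the front end can render it as it arrives. Below is a minimal self-contained sketch of that pattern (fake_tokens() and its timing are illustrative stand-ins, not code from the patch; only the /process-url route and the Response-around-a-generator shape mirror the change above):

from http import HTTPStatus
from time import sleep

from flask import Flask, Response

app = Flask(__name__)


def fake_tokens():
    # Stand-in for main.create_and_stream(): any generator of str/bytes works here.
    # Werkzeug iterates it and sends each yielded chunk as part of the response body.
    for word in ("streamed ", "one ", "chunk ", "at ", "a ", "time\n"):
        yield word
        sleep(0.1)  # imitate the gap between OpenAI stream events


@app.post("/process-url")
def process_url():
    # Returning an iterable instead of a string produces a streamed response,
    # which is what the fetch() reader in script.js (next hunk) consumes.
    return Response(fake_tokens(), HTTPStatus.OK, mimetype="text/plain")


if __name__ == "__main__":
    app.run(debug=True)

One consequence of this design under the gthread worker class in start.sh: each in-flight streaming response occupies a worker thread for its whole duration, so concurrency is bounded by the 4 workers x 2 threads configured there.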
+ + diff --git a/app/website/static/script.js b/app/website/static/script.js index e9c66cf..2b58509 100644 --- a/app/website/static/script.js +++ b/app/website/static/script.js @@ -1,71 +1,68 @@ + document.addEventListener("DOMContentLoaded", () => { const responseArea = document.getElementById('response-area'); const responseSection = document.getElementById('response-section'); const submitButton = document.getElementById('submit'); const urlBox = document.getElementById('url_box'); + const form = document.getElementById('url-form'); - // Before sending HTMX request, prepare UI and handle empty input - document.body.addEventListener('htmx:beforeRequest', function(evt) { - if (evt.detail.elt.id === 'url-form') { - const url = urlBox.value.trim(); - if (!url) { - evt.detail.shouldCancel = true; - responseArea.innerText = 'Please enter a URL.'; - return; - } - urlBox.value = ''; - submitButton.disabled = true; - responseArea.innerText = 'Processing...'; + // Simple URL validation regex (covers http/https and domains) + const urlPattern = /^(https?:\/\/)[\w.-]+(\.[\w\.-]+)+[/#?]?.*$/i; + + form.addEventListener('submit', async (evt) => { + evt.preventDefault(); + + const url = urlBox.value.trim(); + if (!url) { + responseArea.textContent = 'Please enter a URL.'; + return; } - }); - - document.body.addEventListener('htmx:afterRequest', function(evt) { - if (evt.detail.elt.id === 'url-form') { - const text = evt.detail.xhr.responseText.trim(); - if (text === "Processing started. Check /stream_output for updates.") { - streamOutput(responseArea, responseSection, submitButton); - } else { - responseArea.innerText = text; - submitButton.disabled = false; - } + if (!urlPattern.test(url)) { + responseArea.textContent = 'Please enter a valid URL (must start with http:// or https://).'; + return; } - }); - function streamOutput(responseArea, responseSection, submitButton) { - // Fetch the streaming output - fetch('/stream_output') - .then(response => { - if (!response.ok) { - throw new Error('Network response was not ok'); - } - const reader = response.body.getReader(); - const decoder = new TextDecoder("utf-8"); + // Prepare UI + urlBox.value = ''; + submitButton.disabled = true; + responseArea.textContent = 'Processing...\n'; - responseArea.innerHTML = ""; - - function readStream() { - reader.read().then(({ done, value }) => { - if (done) { - submitButton.disabled = false; - return; - } - const chunk = decoder.decode(value, { stream: true }); - responseArea.innerHTML += chunk; - responseSection.scrollTop = responseSection.scrollHeight; - readStream(); - }).catch(error => { - console.error('Error reading stream:', error); - responseArea.innerText = 'Error reading stream: ' + error.message; - submitButton.disabled = false; - }); - } - - readStream(); - }) - .catch(error => { - console.error('Error fetching stream:', error); - responseArea.innerText = 'Error fetching stream: ' + error.message; - submitButton.disabled = false; + try { + const response = await fetch('/process-url', { + method: 'POST', + headers: { 'Content-Type': 'application/x-www-form-urlencoded' }, + body: new URLSearchParams({ url }), }); - } + + if (!response.ok) { + throw new Error(`Server returned ${response.status}`); + } + + const reader = response.body.getReader(); + const decoder = new TextDecoder('utf-8'); + responseArea.textContent = ''; // clear before streaming + + async function readChunk() { + const { done, value } = await reader.read(); + if (done) { + submitButton.disabled = false; + return; + } + + const chunk = 
decoder.decode(value, { stream: true }); + responseArea.innerHTML += chunk; + responseSection.scrollTop = responseSection.scrollHeight; + + await readChunk(); + } + + await readChunk(); + + } catch (err) { + console.error(err); + responseArea.textContent = `Error: ${err.message}`; + submitButton.disabled = false; + } + }); }); + diff --git a/docker-compose.yml b/docker-compose.yml index 802b547..522addf 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,17 +3,9 @@ services: build: . container_name: screw-bardo ports: -<<<<<<< HEAD - - "$PORT:1986" + - "1986:1986" env_file: - .env volumes: - ./app/logs:/app/app/logs/:rw restart: unless-stopped - -======= - - "1986:1986" - volumes: - - ./app/logs:/app/logs - restart: unless-stopped ->>>>>>> b5a2b4e6d1b958dbb3ad702026889172514c1fd6 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7ebee02 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,29 @@ +[project] +name = "screw-bardo" +version = "0.7.0" +description = "A simple, single-threaded web app to automate an assignment." +readme = "README.md" +requires-python = ">=3.13" +dependencies = [ + # Import environment from .env + "dotenv>=0.9.9", + # Web server + "flask>=3.1.2", + # Fetch the transcripts of YouTube videos + "youtube-transcript-api>=1.2.3", + # OpenAI API to get AI response + "openai>=2.7.0", + # Easiest way to install it in the venv rip + "gunicorn>=23.0.0", + # just for logging I think? who knows what I was doing lol + "pytz>=2025.2", +] + +# I think these are just the default settings for the most part... +[tool.basedpyright] +typeCheckingMode = "recommended" +reportMissingImports = true +reportUnusedImport = true +reportUnusedVariable = true +reportUndefinedVariable = true +reportPrivateUsage = true diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 11688ceba03d120e01db8a95c7abda6c8caacaf6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1214 zcmZvcOK;mi41{?u(4Uebl9MJqAOZ#LuoJ;#?AIM5Xdny#W;`?V#%{lRHq_)a= zqix9>z}a~2ikw?c#2@THG}$BZ!Q&ONFP!Mp`uB;P4M)9dBDHrE%4*5m!D*oLn%)<# zWw2j-6aMppuI^%6&=qazF63Y(l~UNElf5KfsvXjMiB>c(d8+1Z(_!;+;*g~E;w zn#Yi5^;X|kY8v9fK!goO>2eM_29ZH|llwH0?6m72V`{ zxo-qlw?g+KZlUmF#)?lMba+y=;n|nl8#i0q+1fo_&Y9=vSSfc@RplLn)yz%~JiR^8 zr*L{
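A closing note on the youtube-transcript-api migration in app/main.py: the old code called the static YouTubeTranscriptApi.get_transcript(...), while the 1.x releases of the library expect an instance whose fetch() method returns a transcript object, which TextFormatter can still flatten to plain text. A small sketch of the new call shape, assuming TextFormatter accepts the object returned by fetch() as the surrounding code implies; fetch_plain_transcript() and the sample video id are illustrative, not part of the patch:

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter


def fetch_plain_transcript(video_id: str) -> str:
    # 1.x style: build an instance, then fetch() the transcript for one video.
    # This raises (e.g. TranscriptsDisabled) instead of returning None on failure.
    fetched = YouTubeTranscriptApi().fetch(video_id)
    # Flatten the fetched snippets into one plain-text string, which is what
    # main.process() ultimately passes to the OpenAI prompt as the user message.
    return TextFormatter().format_transcript(fetched)


if __name__ == "__main__":
    print(fetch_plain_transcript("dQw4w9WgXcQ")[:200])  # placeholder video id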