78 changes: 78 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,78 @@
name: CI

on:
  push:
    branches: ["main"]
  pull_request:

jobs:
  unit:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.11', '3.12']
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Cache uv
        uses: actions/cache@v3
        with:
          path: ~/.cache/uv
          key: ${{ runner.os }}-uv-${{ hashFiles('requirements.txt', 'requirements-test.txt') }}
      - name: Install uv
        run: pip install uv
      - name: Install dependencies
        run: uv pip install --system -r requirements.txt -r requirements-test.txt
      - name: Ruff
        run: ruff check .
      - name: Run unit tests
        run: pytest --cov=questions --cov-report=xml -q
      - name: Upload coverage
        uses: actions/upload-artifact@v4
        with:
          name: coverage-xml
          path: coverage.xml
  offline-integration:
    runs-on: ubuntu-latest
    needs: unit
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Cache uv
        uses: actions/cache@v3
        with:
          path: ~/.cache/uv
          key: ${{ runner.os }}-uv-${{ hashFiles('requirements.txt', 'requirements-test.txt') }}
      - name: Install uv
        run: pip install uv
      - name: Install dependencies
        run: uv pip install --system -r requirements.txt -r requirements-test.txt
      - name: Download NLTK data
        run: python -m nltk.downloader punkt
      - name: Run offline integration tests
        run: pytest -m "integration and not internet" -q
  integration:
    runs-on: ubuntu-latest
    needs: offline-integration
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Cache uv
        uses: actions/cache@v3
        with:
          path: ~/.cache/uv
          key: ${{ runner.os }}-uv-${{ hashFiles('requirements.txt', 'requirements-test.txt') }}
      - name: Install uv
        run: pip install uv
      - name: Install dependencies
        run: uv pip install --system -r requirements.txt -r requirements-test.txt
      - name: Download NLTK data
        run: python -m nltk.downloader punkt
      - name: Run integration tests
        run: pytest -m "integration and internet" -q
1 change: 1 addition & 0 deletions .gitignore
@@ -48,3 +48,4 @@ __pycache__
flagged
mainserver.log

nltk_data/
12 changes: 12 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,12 @@
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.0.5
    hooks:
      - id: ruff
  - repo: local
    hooks:
      - id: pytest
        name: pytest
        entry: pytest -q
        language: system
        pass_filenames: false
13 changes: 13 additions & 0 deletions Makefile
@@ -0,0 +1,13 @@
.PHONY: lint test offline-integration integration

lint:
	ruff check .

test:
	pytest -q

offline-integration:
	pytest -m "integration and not internet" -q || true

integration:
	pytest -m "integration and internet" -q || true
10 changes: 10 additions & 0 deletions README.md
@@ -83,6 +83,16 @@ Using cuda is important to speed up inference.
python -m nltk.downloader punkt
```

### Running offline integration tests

Offline integration tests exercise functionality that does not require internet
access but may load heavy dependencies. After installing the `punkt` dataset
you can run them with:

```shell
pytest -m "integration and not internet"
```

Set up some environment variables in this file (fake ones are okay for local dev)

```shell
23 changes: 19 additions & 4 deletions pytest.ini
@@ -1,7 +1,19 @@
-[tool:pytest]
-norecursedirs = '.*', 'build', 'dist', 'CVS', '_darcs', '{arch}', '*.egg', 'static', "models", "templates", "kuber", gameon questions/tools
-
+[pytest]
+norecursedirs =
+    .*
+    build
+    dist
+    CVS
+    _darcs
+    {arch}
+    *.egg
+    static
+    models
+    templates
+    kuber
+    gameon
+    questions/tools
+
 asyncio_mode=auto
 pythonpath = .

@@ -10,6 +22,9 @@ testpaths = tests

 # cwd
 workdir = .

-addopts = -s -v
+addopts = -s -v --ignore=tests/integration --ignore=tests/performance
+markers =
+    integration: integration tests
+    internet: tests that require internet access
 env =
     GOOGLE_APPLICATION_CREDENTIALS = secrets/google-credentials.json
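The `-m` expressions used by the Makefile targets and the CI jobs are boolean filters over these marker names. A rough stdlib-only sketch of the selection semantics (the `matches` helper is hypothetical; pytest's real matcher parses the expression itself, but for plain `and`/`or`/`not` expressions the result is the same as Python's boolean operators):

```python
def matches(expr: str, markers: set) -> bool:
    """Hypothetical sketch: map each known marker name to True/False
    depending on whether the test carries it, then evaluate the
    and/or/not expression over those booleans."""
    names = {name: (name in markers) for name in ("integration", "internet")}
    return eval(expr, {"__builtins__": {}}, names)

# A test marked only `integration` is selected by the offline run,
# but not by the internet run:
print(matches("integration and not internet", {"integration"}))  # True
print(matches("integration and internet", {"integration"}))      # False
```

This is why the offline and internet jobs partition the integration suite: each test module picks its side by which markers it assigns to `pytestmark`.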
6 changes: 3 additions & 3 deletions requirements-test.txt
@@ -1,5 +1,5 @@
 pytest==7.3.1
-httpx==0.24.0
-fastapi==0.95.2
 pytest-cov==4.1.0
-colorama==0.4.6
+ruff
+httpx
+colorama
3 changes: 2 additions & 1 deletion requirements.in
@@ -25,8 +25,9 @@ google-cloud-storage

 google-cloud-ndb
 #jinja2
+# we need websockets <11 for pyppeteer compatibility
 jinja2
-websockets
+websockets<11
 #nltk
 nltk
 stripe
2 changes: 1 addition & 1 deletion requirements.txt
@@ -194,7 +194,7 @@ urllib3==2.2.3
     # via requests
 uvicorn==0.34.2
     # via -r requirements.in
-websockets==12.0
+websockets==10.4
     # via -r requirements.in
 wrapt==1.11.2
     # via
2 changes: 2 additions & 0 deletions ruff.toml
@@ -0,0 +1,2 @@
line-length = 120
exclude = [".venv"]
File renamed without changes.
File renamed without changes.
@@ -1,12 +1,15 @@
 import pytest

+pytestmark = [pytest.mark.integration, pytest.mark.internet]
+
 from questions.ai_wrapper import generate_with_claude

 @pytest.mark.asyncio
 async def test_generate_with_claude():
     prompt = "What is the capital of France?"
     response = await generate_with_claude(prompt)

     assert response is not None
     assert isinstance(response, str)
     assert len(response) > 0
-    assert "Paris" in response
\ No newline at end of file
+    assert "Paris" in response
@@ -1,6 +1,8 @@
import os
import pytest

pytestmark = pytest.mark.integration

from starlette.testclient import TestClient

from questions.inference_server.inference_server import app, audio_process
@@ -1,5 +1,10 @@
import requests
import os
import pytest

pytestmark = [pytest.mark.integration, pytest.mark.internet]

pytest.skip("manual script", allow_module_level=True)

from sellerinfo import TEXT_GENERATOR_SECRET

@@ -1,5 +1,8 @@
import dataclasses
import os
import pytest

pytestmark = [pytest.mark.integration, pytest.mark.internet]

from fastapi import UploadFile

@@ -1,4 +1,7 @@
import os
import pytest

pytestmark = [pytest.mark.integration, pytest.mark.internet]

from questions.utils import log_time

3 changes: 3 additions & 0 deletions tests/test_main.py → tests/integration/test_main.py
@@ -1,4 +1,7 @@
import dataclasses
import pytest

pytestmark = pytest.mark.integration

from starlette.testclient import TestClient

@@ -1,4 +1,7 @@
import pytest

pytestmark = pytest.mark.integration

from starlette.testclient import TestClient

from questions.inference_server.inference_server import app, generate_route, openai_route
2 changes: 2 additions & 0 deletions tests/performance/test_e2e_perf.py
@@ -1,6 +1,8 @@
import traceback

import pytest

pytestmark = [pytest.mark.integration, pytest.mark.internet]
import requests
import logging
from questions.logging_config import setup_logging
8 changes: 8 additions & 0 deletions tests/test_doc_api.py
@@ -1,6 +1,14 @@
import json
from unittest.mock import patch, MagicMock, AsyncMock
import pytest
import os

if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"):
    pytest.skip("integration test requires Google credentials", allow_module_level=True)

pytest.importorskip("google.cloud.ndb", reason="google cloud ndb required for document api tests")

pytestmark = [pytest.mark.integration, pytest.mark.internet]

from main import list_documents, get_document, save_document, autosave_document

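Several of the test modules in this PR guard heavyweight imports with `pytest.importorskip` or a module-level `pytest.skip`. A minimal stdlib-only sketch of the pattern (the `import_or_skip` helper and `Skipped` exception are illustrative stand-ins, not pytest's implementation):

```python
import importlib


class Skipped(Exception):
    """Stand-in for the exception pytest raises to skip a whole module."""


def import_or_skip(name: str):
    # Roughly what pytest.importorskip does: return the module if it
    # imports cleanly, otherwise abort collection of the calling module
    # with a skip rather than an error.
    try:
        return importlib.import_module(name)
    except ImportError:
        raise Skipped(f"{name} not available")


mod = import_or_skip("json")  # stdlib module, always importable
print(mod.__name__)           # json
```

The effect in CI is that the unit job stays green on a machine without `torch`, `transformers`, or `bs4` installed; those modules simply report as skipped instead of failing at import time.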
7 changes: 6 additions & 1 deletion tests/unit/questions/test_link_enricher.py
@@ -1,7 +1,12 @@
-from questions.link_enricher import get_urls, enrich_links
 import logging
 import pytest

+bs4 = pytest.importorskip("bs4", reason="bs4 required for link enrichment tests")
+from questions.link_enricher import get_urls, enrich_links
 from questions.logging_config import setup_logging

+pytestmark = pytest.mark.integration
+
 setup_logging()
 logger = logging.getLogger(__name__)
17 changes: 15 additions & 2 deletions tests/unit/test_perplexity.py
@@ -1,11 +1,24 @@
-from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2TokenizerFast, BloomTokenizerFast, BloomForCausalLM
 import pytest

+transformers = pytest.importorskip(
+    "transformers", reason="transformers is required for perplexity tests"
+)
+from transformers import (
+    GPT2LMHeadModel,
+    GPT2Tokenizer,
+    GPT2TokenizerFast,
+    BloomTokenizerFast,
+    BloomForCausalLM,
+)
 import logging
 from questions.logging_config import setup_logging

+pytestmark = pytest.mark.integration
+
 setup_logging()
 logger = logging.getLogger(__name__)

-import torch
+torch = pytest.importorskip("torch", reason="torch is required for perplexity tests")

 from questions.perplexity import get_perplexity
7 changes: 7 additions & 0 deletions tests/unit/test_post_process_results.py
@@ -1,5 +1,12 @@
import pytest

transformers = pytest.importorskip(
    "transformers", reason="transformers required for post-process tests"
)
from transformers import AutoTokenizer

pytestmark = pytest.mark.integration

from questions.models import GenerateParams
from questions.post_process_results import post_process_results

5 changes: 5 additions & 0 deletions tests/unit/test_summarization.py
@@ -1,6 +1,11 @@
import pytest
pytest.importorskip("torch", reason="torch required for summarization tests")
from questions.inference_server.inference_server import MODEL_CACHE
from questions.summarization import get_extractive_summary
from questions.utils import log_time
import pytest

pytestmark = pytest.mark.integration

text = """
# classification = summarizer("James Joseph Norton is an American comedian, radio personality, actor, author, and television and podcast host. Norton has been the co-host of the podcast UFC Unfiltered with Matt Serra and the morning radio show Jim Norton & Sam Roberts on SiriusXM Radio since 2016, and The Chip Chipperson Podacast since 2017. He gained initial prominence as third mic on the radio show Opie and Anthony, with Gregg \"Opie\" Hughes and Anthony Cumia, from 2001 to 2014. After becoming a stand-up comedian in 1990, Norton spent his early years developing his act. His appearances on The Louie Show caught the attention of comedian Andrew Dice Clay in 1997, who chose Norton to open for him for his shows. In 2000, Norton made his debut on Opie and Anthony and joined the show as a third mic in 2001 which increased his national exposure. He went on to have a recurring role on the sitcom Lucky Louie and featured as a regular panellist on Tough Crowd with Colin Quinn. Since he joined SiriusXM with Opie and Anthony in 2004, Norton hosted Opie with Jim Norton from 2014 to 2016, and The Jim Norton Advice Show. Since 2003, Norton has released four comedy albums and seven comedy specials, including three on Epix and one on Netflix. In 2014, Norton hosted The Jim Norton Show, a talk show on Vice.")