diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/get/mappings.py b/src/core/tasks/scheduled/impl/huggingface/queries/get/mappings.py index a6ceb233..ed4a7da2 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/get/mappings.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/get/mappings.py @@ -44,6 +44,7 @@ RecordType.COURT_CASES: RecordTypeCoarse.JAILS_AND_COURTS, RecordType.INCARCERATION_RECORDS: RecordTypeCoarse.JAILS_AND_COURTS, # Other + RecordType.OTHER: RecordTypeCoarse.OTHER, None: RecordTypeCoarse.NOT_RELEVANT } diff --git a/src/external/huggingface/hub/format.py b/src/external/huggingface/hub/format.py index c870ec17..e1eb32b6 100644 --- a/src/external/huggingface/hub/format.py +++ b/src/external/huggingface/hub/format.py @@ -16,8 +16,8 @@ def format_as_huggingface_dataset(outputs: list[GetForLoadingToHuggingFaceOutput d['url_id'].append(output.url_id) d['url'].append(output.url) d['relevant'].append(output.relevant) - d['record_type_fine'].append(output.record_type_fine) - d['record_type_coarse'].append(output.record_type_coarse) + d['record_type_fine'].append(output.record_type_fine.value) + d['record_type_coarse'].append(output.record_type_coarse.value) d['html'].append(output.html) return Dataset.from_dict(d) diff --git a/tests/manual/migration_with_prod_data/__init__.py b/tests/manual/core/tasks/scheduled/__init__.py similarity index 100% rename from tests/manual/migration_with_prod_data/__init__.py rename to tests/manual/core/tasks/scheduled/__init__.py diff --git a/tests/manual/core/tasks/scheduled/test_push_to_huggingface.py b/tests/manual/core/tasks/scheduled/test_push_to_huggingface.py new file mode 100644 index 00000000..a091ff5c --- /dev/null +++ b/tests/manual/core/tasks/scheduled/test_push_to_huggingface.py @@ -0,0 +1,26 @@ +import pytest + +from environs import Env + +from src.core.env_var_manager import EnvVarManager +from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.external.huggingface.hub.client import HuggingFaceHubClient + +env = Env() +env.read_env() + +@pytest.mark.asyncio +@pytest.mark.manual +async def test_push_to_huggingface(): + operator = PushToHuggingFaceTaskOperator( + adb_client=AsyncDatabaseClient( + db_url=env.str("PROD_DATABASE_URL") + ), + hf_client=HuggingFaceHubClient( + env.str("HUGGINGFACE_HUB_TOKEN") + ) + ) + + await operator.inner_task_logic() + diff --git a/tests/manual/core/tasks/url/__init__.py b/tests/manual/core/tasks/url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/manual/core/tasks/test_url_html_task_operator.py b/tests/manual/core/tasks/url/test_url_html_task_operator.py similarity index 100% rename from tests/manual/core/tasks/test_url_html_task_operator.py rename to tests/manual/core/tasks/url/test_url_html_task_operator.py diff --git a/tests/manual/migration_with_prod_data/README.md b/tests/manual/migration_with_prod_data/README.md deleted file mode 100644 index 89e88a47..00000000 --- a/tests/manual/migration_with_prod_data/README.md +++ /dev/null @@ -1,3 +0,0 @@ -This directory is designed to test that the migration works on a copy of the production data. - -For these tests to work properly, the local database must have the most recent production data, including the alembic version table. \ No newline at end of file diff --git a/tests/manual/unsorted/test_root_url_cache_unit.py b/tests/manual/unsorted/test_root_url_cache_unit.py deleted file mode 100644 index c19261b9..00000000 --- a/tests/manual/unsorted/test_root_url_cache_unit.py +++ /dev/null @@ -1,83 +0,0 @@ -import json -import os -import tempfile -from unittest.mock import mock_open, patch - -import pytest - - -@pytest.fixture -def temp_file(): - # Setup: Create a temporary file and immediately close it to avoid locking issues - temp_file = tempfile.NamedTemporaryFile(delete=False) - temp_file.close() # Close the file so it's not locked by the current process - yield temp_file.name # This is used by the test - # Teardown: Delete the temporary file - os.remove(temp_file.name) - - -@pytest.fixture -def cache(temp_file): - # Setup: Create a cache instance with a temporary file - cache = RootURLCache(cache_file=temp_file) - return cache - - -def test_load_cache_no_file(mocker): - """Test loading the cache when the file does not exist.""" - mocker.patch('os.path.exists', return_value=False) - cache = RootURLCache().load_cache() - assert cache == {}, "Cache should be empty if file does not exist" - - -def test_load_cache_with_file(mocker): - """Test loading the cache from an existing file.""" - mock_data = '{"https://example.com": "Example Domain"}' - mocker.patch('os.path.exists', return_value=True) - mocker.patch('builtins.open', mock_open(read_data=mock_data)) - cache = RootURLCache().load_cache() - assert cache == json.loads(mock_data), "Cache should match the content of the file" - - -def test_save_cache(temp_file): - """Test saving the cache to a file.""" - with patch('os.path.exists', return_value=False): - cache = RootURLCache(cache_file=temp_file) - cache.cache = {'https://example.com': 'Example Domain'} - cache.save_cache() - - with open(temp_file, 'r') as f: - file_contents = f.read() - expected_contents = json.dumps(cache.cache, indent=4) - assert file_contents == expected_contents - - -def test_get_title_not_in_cache(mocker, cache): - """Test retrieving a title not in cache, simulating a web request.""" - mock_response = mocker.Mock() - mock_response.text = 'Example Domain' - mocker.patch('requests.get', return_value=mock_response) - title = cache.get_title('https://example.com') - assert title == 'Example Domain', "Title should be retrieved from the web" - - -def test_get_title_in_cache(cache): - """Test retrieving a title that is already in cache.""" - cache.cache = {'https://example.com': 'Example Domain'} - title = cache.get_title('https://example.com') - assert title == 'Example Domain', "Title should be retrieved from the cache" - - -@pytest.mark.parametrize("url,expected_title", [ - ('http://www.example.com', 'Example Domain'), - ('http://www.google.com', 'Google'), - ('https://books.toscrape.com/catalogue/category/books/womens-fiction_9/index.html', - 'All products | Books to Scrape - Sandbox'), - ( - 'https://books.toscrape.com/catalogue/i-had-a-nice-time-and-other-lies-how-to-find-love-sht-like-that_814/index.html', - 'All products | Books to Scrape - Sandbox') -]) -def test_actual_urls(url, expected_title, cache): - """Test retrieving titles from actual URLs.""" - title = cache.get_title(url) - assert title.strip() == expected_title