diff --git a/ENV.md b/ENV.md index 22f84cb8..2a203d7d 100644 --- a/ENV.md +++ b/ENV.md @@ -2,26 +2,27 @@ This page provides a full list, with description, of all the environment variabl Please ensure these are properly defined in a `.env` file in the root directory. -| Name | Description | Example | -|--------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------| -| `GOOGLE_API_KEY` | The API key required for accessing the Google Custom Search API | `abc123` | -| `GOOGLE_CSE_ID` | The CSE ID required for accessing the Google Custom Search API | `abc123` | -| `POSTGRES_USER` | The username for the test database | `test_source_collector_user` | -| `POSTGRES_PASSWORD` | The password for the test database | `HanviliciousHamiltonHilltops` | -| `POSTGRES_DB` | The database name for the test database | `source_collector_test_db` | -| `POSTGRES_HOST` | The host for the test database | `127.0.0.1` | -| `POSTGRES_PORT` | The port for the test database | `5432` | -| `DS_APP_SECRET_KEY` | The secret key used for decoding JWT tokens produced by the Data Sources App. Must match the secret token `JWT_SECRET_KEY` that is used in the Data Sources App for encoding. | `abc123` | -| `DEV` | Set to any value to run the application in development mode. | `true` | -| `DEEPSEEK_API_KEY` | The API key required for accessing the DeepSeek API. | `abc123` | -| `OPENAI_API_KEY` | The API key required for accessing the OpenAI API. | `abc123` | -| `PDAP_EMAIL` | An email address for accessing the PDAP API.[^1] | `abc123@test.com` | -| `PDAP_PASSWORD` | A password for accessing the PDAP API.[^1] | `abc123` | -| `PDAP_API_KEY` | An API key for accessing the PDAP API. 
| `abc123` | -| `PDAP_API_URL` | The URL for the PDAP API | `https://data-sources-v2.pdap.dev/api` | -| `DISCORD_WEBHOOK_URL` | The URL for the Discord webhook used for notifications | `abc123` | +| Name | Description | Example | +|---------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------| +| `GOOGLE_API_KEY` | The API key required for accessing the Google Custom Search API | `abc123` | +| `GOOGLE_CSE_ID` | The CSE ID required for accessing the Google Custom Search API | `abc123` | +| `POSTGRES_USER` | The username for the test database | `test_source_collector_user` | +| `POSTGRES_PASSWORD` | The password for the test database | `HanviliciousHamiltonHilltops` | +| `POSTGRES_DB` | The database name for the test database | `source_collector_test_db` | +| `POSTGRES_HOST` | The host for the test database | `127.0.0.1` | +| `POSTGRES_PORT` | The port for the test database | `5432` | +| `DS_APP_SECRET_KEY` | The secret key used for decoding JWT tokens produced by the Data Sources App. Must match the secret token `JWT_SECRET_KEY` that is used in the Data Sources App for encoding. | `abc123` | +| `DEV` | Set to any value to run the application in development mode. | `true` | +| `DEEPSEEK_API_KEY` | The API key required for accessing the DeepSeek API. | `abc123` | +| `OPENAI_API_KEY` | The API key required for accessing the OpenAI API. | `abc123` | +| `PDAP_EMAIL` | An email address for accessing the PDAP API.[^1] | `abc123@test.com` | +| `PDAP_PASSWORD` | A password for accessing the PDAP API.[^1] | `abc123` | +| `PDAP_API_KEY` | An API key for accessing the PDAP API. 
| `abc123` | +| `PDAP_API_URL` | The URL for the PDAP API | `https://data-sources-v2.pdap.dev/api` | +| `DISCORD_WEBHOOK_URL` | The URL for the Discord webhook used for notifications | `abc123` | | `HUGGINGFACE_INFERENCE_API_KEY` | The API key required for accessing the Hugging Face Inference API. | `abc123` | -| `HUGGINGFACE_HUB_TOKEN` | `abc123` | The API key required for uploading to the PDAP HuggingFace account via Hugging Face Hub API. | +| `HUGGINGFACE_HUB_TOKEN` | The API key required for uploading to the PDAP HuggingFace account via Hugging Face Hub API. | `abc123` | +| `SCHEDULED_TASKS_FLAG` | Set to `1` to enable running scheduled tasks or `0` to disable them. Scheduled tasks are enabled by default when this variable is unset. | `1` | [^1]: The user account in question will require elevated permissions to access certain endpoints. At a minimum, the user will require the `source_collector` and `db_write` permissions. diff --git a/alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py b/alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py new file mode 100644 index 00000000..846329ca --- /dev/null +++ b/alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py @@ -0,0 +1,124 @@ +"""Remove functional duplicates and setup constraints on fragments and nbsp + +Revision ID: 8cd5aa7670ff +Revises: 571ada5b81b9 +Create Date: 2025-08-09 20:31:58.865231 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic.
+revision: str = '8cd5aa7670ff'
+down_revision: Union[str, None] = '571ada5b81b9'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+# Table / foreign-key name pairs for the child tables whose `url_id` foreign
+# key to `urls.id` is dropped and recreated by this revision.
+COMPRESSED_HTML_FOREIGN_KEY_NAME = 'fk_url_compressed_html_url_id'
+COMPRESSED_HTML_TABLE_NAME = 'url_compressed_html'
+
+URL_HTML_CONTENT_FOREIGN_KEY_NAME = 'url_html_content_url_id_fkey'
+URL_HTML_CONTENT_TABLE_NAME = 'url_html_content'
+
+URL_ERROR_INFO_TABLE_NAME = 'url_error_info'
+URL_ERROR_INFO_FOREIGN_KEY_NAME = 'url_error_info_url_id_fkey'
+
+# Names of the CHECK constraints this revision adds to the `urls` table.
+URLS_NBSP_CHECK_CONSTRAINT_NAME = 'urls_nbsp_check'
+URLS_FRAGMENTS_CHECK_CONSTRAINT_NAME = 'urls_fragments_check'
+
+AUTOMATED_URL_AGENCY_SUGGESTION_TABLE_NAME = 'automated_url_agency_suggestions'
+AUTOMATED_URL_AGENCY_SUGGESTION_FOREIGN_KEY_NAME = 'automated_url_agency_suggestions_url_id_fkey'
+
+
+def upgrade() -> None:
+    """Clean up `urls` and constrain it against NBSPs and fragments.
+
+    Steps, in order:
+
+    1. Recreate the `url_id` foreign keys of the four child tables with
+       ``ON DELETE CASCADE``. This runs first so that the URL deletions
+       below also remove the rows that reference them.
+    2. Delete every URL whose source is ``data_sources_app`` and clear
+       ``data_sources_sync_state``.
+    3. Add the CHECK constraint forbidding no-break spaces in URLs.
+    4. Delete the hard-coded duplicate URL rows.
+    5. Strip fragments from the remaining URLs, then add the CHECK
+       constraint forbidding ``#`` in URLs.
+    """
+    _add_cascade_foreign_key(URL_HTML_CONTENT_TABLE_NAME, foreign_key_name=URL_HTML_CONTENT_FOREIGN_KEY_NAME)
+    _add_cascade_foreign_key(COMPRESSED_HTML_TABLE_NAME, foreign_key_name=COMPRESSED_HTML_FOREIGN_KEY_NAME)
+    _add_cascade_foreign_key(URL_ERROR_INFO_TABLE_NAME, foreign_key_name=URL_ERROR_INFO_FOREIGN_KEY_NAME)
+    _add_cascade_foreign_key(AUTOMATED_URL_AGENCY_SUGGESTION_TABLE_NAME, foreign_key_name=AUTOMATED_URL_AGENCY_SUGGESTION_FOREIGN_KEY_NAME)
+    _remove_data_source_urls()
+    _reset_data_sources_sync_state()
+    _add_constraint_forbidding_nbsp()
+    _delete_duplicate_urls()
+    _remove_fragments_from_urls()
+    _add_constraint_forbidding_fragments()
+
+
+def downgrade() -> None:
+    """Drop both CHECK constraints and recreate the foreign keys without cascade.
+
+    Only schema changes are reverted: the rows deleted and the fragments
+    stripped by `upgrade` are not restored.
+    """
+    _remove_constraint_forbidding_fragments()
+    _remove_constraint_forbidding_nbsp()
+    _remove_cascade_foreign_key(URL_ERROR_INFO_TABLE_NAME, foreign_key_name=URL_ERROR_INFO_FOREIGN_KEY_NAME)
+    _remove_cascade_foreign_key(COMPRESSED_HTML_TABLE_NAME, foreign_key_name=COMPRESSED_HTML_FOREIGN_KEY_NAME)
+    _remove_cascade_foreign_key(URL_HTML_CONTENT_TABLE_NAME, foreign_key_name=URL_HTML_CONTENT_FOREIGN_KEY_NAME)
+    _remove_cascade_foreign_key(AUTOMATED_URL_AGENCY_SUGGESTION_TABLE_NAME, foreign_key_name=AUTOMATED_URL_AGENCY_SUGGESTION_FOREIGN_KEY_NAME)
+
+def _delete_duplicate_urls() -> None:
+    """Delete the `urls` rows with the hard-coded ids listed below.
+
+    NOTE(review): the ids are literal primary keys, presumably taken from
+    one specific database. On any other database this deletes whatever
+    rows happen to hold those ids -- confirm that is acceptable.
+    """
+    op.execute('delete from urls where id in (2341,2343,2344,2347,2348,2349,2354,2359,2361,2501,2504,2505,2506,2507)')
+
+def _create_url_foreign_key_with_cascade(table_name: str, foreign_key_name: str) -> None:
+    """Create `table_name.url_id -> urls.id` as `foreign_key_name` with ON DELETE CASCADE."""
+    op.create_foreign_key(
+        foreign_key_name,
+        table_name,
+        referent_table='urls',
+        local_cols=['url_id'], remote_cols=['id'],
+        ondelete='CASCADE'
+    )
+
+def _create_url_foreign_key_without_cascade(table_name: str, foreign_key_name: str) -> None:
+    """Create `table_name.url_id -> urls.id` as `foreign_key_name` with no ON DELETE action."""
+    op.create_foreign_key(
+        foreign_key_name,
+        table_name,
+        referent_table='urls',
+        local_cols=['url_id'], remote_cols=['id']
+    )
+
+def _remove_cascade_foreign_key(table_name: str, foreign_key_name: str) -> None:
+    """Drop `foreign_key_name` on `table_name` and recreate it without cascade."""
+    op.drop_constraint(foreign_key_name, table_name=table_name, type_='foreignkey')
+    _create_url_foreign_key_without_cascade(table_name, foreign_key_name=foreign_key_name)
+
+def _add_cascade_foreign_key(table_name: str, foreign_key_name: str) -> None:
+    """Drop `foreign_key_name` on `table_name` and recreate it with ON DELETE CASCADE."""
+    op.drop_constraint(foreign_key_name, table_name=table_name, type_='foreignkey')
+    _create_url_foreign_key_with_cascade(table_name, foreign_key_name=foreign_key_name)
+
+def _remove_data_source_urls() -> None:
+    """Delete every `urls` row whose `source` is 'data_sources_app'."""
+    op.execute("""
+    delete from urls
+    where source = 'data_sources_app'
+    """
+    )
+
+def _reset_data_sources_sync_state() -> None:
+    """Delete all rows from `data_sources_sync_state`."""
+    op.execute("""
+    delete from data_sources_sync_state
+    """
+    )
+
+def _add_constraint_forbidding_nbsp() -> None:
+    """Add a CHECK constraint rejecting URLs that contain U+00A0 (no-break space)."""
+    op.create_check_constraint(
+        constraint_name=URLS_NBSP_CHECK_CONSTRAINT_NAME,
+        table_name='urls',
+        # The no-break space is spelled as an escape so that it stays visible
+        # and cannot be silently normalized to an ordinary space by an editor
+        # or formatter; Python expands it to the literal character before the
+        # condition reaches the database.
+        condition="url not like '%\u00a0%'"
+    )
+
+def _add_constraint_forbidding_fragments() -> None:
+    """Add a CHECK constraint rejecting URLs that contain '#'."""
+    op.create_check_constraint(
+        constraint_name=URLS_FRAGMENTS_CHECK_CONSTRAINT_NAME,
+        table_name='urls',
+        condition="url not like '%#%'"
+    )
+
+def _remove_constraint_forbidding_nbsp() -> None:
+    """Drop the no-break-space CHECK constraint from `urls`."""
+    op.drop_constraint(URLS_NBSP_CHECK_CONSTRAINT_NAME, table_name='urls', type_='check')
+
+def _remove_constraint_forbidding_fragments() -> None:
+    """Drop the fragment CHECK constraint from `urls`."""
+    op.drop_constraint(URLS_FRAGMENTS_CHECK_CONSTRAINT_NAME, table_name='urls', type_='check')
+
+def _remove_fragments_from_urls() -> None:
+    """Truncate every URL containing '#' just before its first '#'.
+
+    NOTE(review): URLs that differ only by fragment become identical after
+    this update, and a URL that starts with '#' becomes an empty string --
+    confirm neither case can violate a constraint on `urls.url`.
+    """
+    # Remove fragments and everything after them
+    op.execute("""
+    update urls
+    set url = substring(url from 1 for position('#' in url) - 1)
+    where url like '%#%'
+    """)
\ No newline at end of file
diff --git a/local_database/DockerInfos.py b/local_database/DockerInfos.py
index 654b59bc..4d1d2a8f 100644
--- a/local_database/DockerInfos.py
+++ b/local_database/DockerInfos.py
@@ -28,7 +28,7 @@ def get_database_docker_info() -> DockerInfo:
 def get_source_collector_data_dumper_info() -> DockerInfo:
     return DockerInfo(
         dockerfile_info=DockerfileInfo(
-            image_tag="datadumper",
+            image_tag="datadumper_sc",
             dockerfile_directory=str(project_path(
                 "local_database",
                 "DataDumper"
@@ -42,7 +42,7 @@ def get_source_collector_data_dumper_info() -> DockerInfo:
             )),
             container_path="/dump"
         ),
-        name="datadumper",
+        name="datadumper_sc",
         environment={
             "DUMP_HOST": get_from_env("DUMP_HOST"),
             "DUMP_USER": get_from_env("DUMP_USER"),
diff --git a/src/core/tasks/scheduled/manager.py b/src/core/tasks/scheduled/manager.py
index a5cb5bf1..e946b590 100644
--- a/src/core/tasks/scheduled/manager.py
+++ b/src/core/tasks/scheduled/manager.py
@@ -12,6 +12,8 @@ from src.core.tasks.scheduled.models.entry import ScheduledTaskEntry
 from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase
 
 
+from environs import Env
+
 class AsyncScheduledTaskManager:
 
 
@@ -34,6 +36,14 @@ def __init__(
 
 
     async def setup(self):
+        env = Env()
+        env.read_env()
+
+        scheduled_task_flag = env.bool("SCHEDULED_TASKS_FLAG", default=True)
+        if not scheduled_task_flag:
+            print("Scheduled tasks are disabled.")
+            return
+
         self.scheduler.start()
         await self.add_scheduled_tasks()
 