diff --git a/alembic/versions/2025_08_03_1800-571ada5b81b9_add_link_urls_redirect_url_table.py b/alembic/versions/2025_08_03_1800-571ada5b81b9_add_link_urls_redirect_url_table.py new file mode 100644 index 00000000..33c2a8c6 --- /dev/null +++ b/alembic/versions/2025_08_03_1800-571ada5b81b9_add_link_urls_redirect_url_table.py @@ -0,0 +1,110 @@ +"""Add link_urls_redirect_url table + +Revision ID: 571ada5b81b9 +Revises: 99eceed6e614 +Create Date: 2025-08-03 18:00:06.345733 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import id_column, created_at_column, updated_at_column + +# revision identifiers, used by Alembic. +revision: str = '571ada5b81b9' +down_revision: Union[str, None] = '99eceed6e614' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +URLS_TABLE = 'urls' +LINK_URLS_REDIRECT_URL_TABLE = 'link_urls_redirect_url' + +SOURCE_ENUM = sa.Enum( + 'collector', + 'data_sources_app', + 'redirect', + 'root_url', + 'manual', + name='url_source' +) + +def upgrade() -> None: + _create_link_urls_redirect_url_table() + _add_source_column_to_urls_table() + + + +def downgrade() -> None: + _drop_link_urls_redirect_url_table() + _drop_source_column_from_urls_table() + + +def _create_link_urls_redirect_url_table(): + op.create_table( + LINK_URLS_REDIRECT_URL_TABLE, + id_column(), + sa.Column('source_url_id', sa.Integer(), nullable=False), + sa.Column('destination_url_id', sa.Integer(), nullable=False), + created_at_column(), + updated_at_column(), + sa.ForeignKeyConstraint(['source_url_id'], [URLS_TABLE + '.id'], ), + sa.ForeignKeyConstraint(['destination_url_id'], [URLS_TABLE + '.id'], ), + sa.UniqueConstraint( + 'source_url_id', + 'destination_url_id', + name='link_urls_redirect_url_uq_source_url_id_destination_url_id' + ), + ) + + +def _add_source_column_to_urls_table(): + # Create enum + SOURCE_ENUM.create(op.get_bind(), checkfirst=True) + op.add_column( + URLS_TABLE, + sa.Column( + 'source', + SOURCE_ENUM, + nullable=True, + comment='The source of the URL.' + ) + ) + # Add sources to existing URLs + op.execute( + f"""UPDATE {URLS_TABLE} + SET source = 'collector'::url_source + """ + ) + op.execute( + f"""UPDATE {URLS_TABLE} + SET source = 'data_sources_app'::url_source + FROM url_data_sources WHERE url_data_sources.url_id = {URLS_TABLE}.id + AND url_data_sources.data_source_id IS NOT NULL; + """ + ) + op.execute( + f"""UPDATE {URLS_TABLE} + SET source = 'collector'::url_source + FROM link_batch_urls WHERE link_batch_urls.url_id = {URLS_TABLE}.id + AND link_batch_urls.batch_id IS NOT NULL; + """ + ) + + # Make source required + op.alter_column( + URLS_TABLE, + 'source', + nullable=False + ) + + +def _drop_link_urls_redirect_url_table(): + op.drop_table(LINK_URLS_REDIRECT_URL_TABLE) + + +def _drop_source_column_from_urls_table(): + op.drop_column(URLS_TABLE, 'source') + # Drop enum + SOURCE_ENUM.drop(op.get_bind(), checkfirst=True) diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py index 03e2cc36..5dcd3977 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -7,6 +7,7 @@ from src.core.enums import BatchStatus from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.queries.base.builder import QueryBuilderBase @@ -48,6 +49,7 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: collector_metadata=entry.collector_metadata, outcome=URLStatus.PENDING.value, record_type=entry.record_type.value if entry.record_type is not None else None, + source=URLSource.MANUAL ) async with session.begin_nested(): diff --git a/src/core/preprocessors/autogoogler.py b/src/core/preprocessors/autogoogler.py index b41eba76..dd76218f 100644 --- a/src/core/preprocessors/autogoogler.py +++ b/src/core/preprocessors/autogoogler.py @@ -1,6 +1,7 @@ from typing import List from src.core.preprocessors.base import PreprocessorBase +from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.pydantic.info import URLInfo @@ -18,6 +19,7 @@ def preprocess_entry(self, entry: dict) -> list[URLInfo]: "snippet": qr["snippet"], "title": qr["title"] }, + source=URLSource.COLLECTOR )) return url_infos diff --git a/src/core/preprocessors/common_crawler.py b/src/core/preprocessors/common_crawler.py index d2f0d988..18afd3e3 100644 --- a/src/core/preprocessors/common_crawler.py +++ b/src/core/preprocessors/common_crawler.py @@ -1,6 +1,7 @@ from typing import List from src.core.preprocessors.base import PreprocessorBase +from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.pydantic.info import URLInfo @@ -12,6 +13,7 @@ def preprocess(self, data: dict) -> List[URLInfo]: for url in data["urls"]: url_info = URLInfo( url=url, + source=URLSource.COLLECTOR ) url_infos.append(url_info) diff --git a/src/core/preprocessors/example.py b/src/core/preprocessors/example.py index 580b739e..5228c241 100644 --- a/src/core/preprocessors/example.py +++ b/src/core/preprocessors/example.py @@ -2,6 +2,7 @@ from src.collectors.source_collectors.example.dtos.output import ExampleOutputDTO from src.core.preprocessors.base import PreprocessorBase +from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.pydantic.info import URLInfo @@ -12,6 +13,7 @@ def preprocess(self, data: ExampleOutputDTO) -> List[URLInfo]: for url in data.urls: url_info = URLInfo( url=url, + source=URLSource.COLLECTOR ) url_infos.append(url_info) diff --git a/src/core/preprocessors/muckrock.py b/src/core/preprocessors/muckrock.py index b0f1d9bc..660dd028 100644 --- a/src/core/preprocessors/muckrock.py +++ b/src/core/preprocessors/muckrock.py @@ -1,6 +1,7 @@ from typing import List from src.core.preprocessors.base import PreprocessorBase +from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.pydantic.info import URLInfo @@ -12,6 +13,7 @@ def preprocess(self, data: dict) -> List[URLInfo]: url_info = URLInfo( url=entry["url"], collector_metadata=entry["metadata"], + source=URLSource.COLLECTOR ) url_infos.append(url_info) diff --git a/src/core/tasks/scheduled/convert.py b/src/core/tasks/scheduled/convert.py new file mode 100644 index 00000000..866e536a --- /dev/null +++ b/src/core/tasks/scheduled/convert.py @@ -0,0 +1,11 @@ +from src.core.tasks.scheduled.enums import IntervalEnum + + +def convert_interval_enum_to_hours(interval: IntervalEnum) -> int: + match interval: + case IntervalEnum.DAILY: + return 24 + case IntervalEnum.HOURLY: + return 1 + case _: + raise ValueError(f"Invalid interval: {interval}") \ No newline at end of file diff --git a/src/core/tasks/scheduled/enums.py b/src/core/tasks/scheduled/enums.py new file mode 100644 index 00000000..27d03be6 --- /dev/null +++ b/src/core/tasks/scheduled/enums.py @@ -0,0 +1,6 @@ +from enum import Enum + + +class IntervalEnum(Enum): + DAILY = "DAILY" + HOURLY = "HOURLY" \ No newline at end of file diff --git a/src/core/tasks/scheduled/manager.py b/src/core/tasks/scheduled/manager.py index ac16eb31..a5cb5bf1 100644 --- a/src/core/tasks/scheduled/manager.py +++ b/src/core/tasks/scheduled/manager.py @@ -1,11 +1,15 @@ from datetime import datetime, timedelta +from apscheduler.job import Job from apscheduler.schedulers.asyncio import AsyncIOScheduler from apscheduler.triggers.interval import IntervalTrigger from src.core.core import AsyncCore from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.handler import TaskHandler +from src.core.tasks.scheduled.convert import convert_interval_enum_to_hours +from src.core.tasks.scheduled.enums import IntervalEnum from src.core.tasks.scheduled.loader import ScheduledTaskOperatorLoader +from src.core.tasks.scheduled.models.entry import ScheduledTaskEntry from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase @@ -26,71 +30,72 @@ def __init__( self.scheduler = AsyncIOScheduler() # Jobs - self.run_cycles_job = None - self.delete_logs_job = None - self.populate_backlog_snapshot_job = None - self.sync_agencies_job = None - self.sync_data_sources_job = None - self.push_to_hugging_face_job = None + self._jobs: dict[str, Job] = {} + async def setup(self): self.scheduler.start() await self.add_scheduled_tasks() - async def add_scheduled_tasks(self): - self.run_cycles_job = self.scheduler.add_job( - self.async_core.run_tasks, - trigger=IntervalTrigger( - hours=1, - start_date=datetime.now() + timedelta(minutes=1) + async def _get_entries(self) -> list[ScheduledTaskEntry]: + return [ + ScheduledTaskEntry( + name="Run Task Cycles", + function=self.async_core.run_tasks, + interval=IntervalEnum.HOURLY ), - misfire_grace_time=60 - ) - self.delete_logs_job = self.scheduler.add_job( - self.async_core.adb_client.delete_old_logs, - trigger=IntervalTrigger( - days=1, - start_date=datetime.now() + timedelta(minutes=10) - ) - ) - self.populate_backlog_snapshot_job = self.scheduler.add_job( - self.async_core.adb_client.populate_backlog_snapshot, - trigger=IntervalTrigger( - days=1, - start_date=datetime.now() + timedelta(minutes=20) - ) - ) - self.sync_agencies_job = self.scheduler.add_job( - self.run_task, - trigger=IntervalTrigger( - days=1, - start_date=datetime.now() + timedelta(minutes=2) + ScheduledTaskEntry( + name="Delete Old Logs", + function=self.async_core.adb_client.delete_old_logs, + interval=IntervalEnum.DAILY + ), + ScheduledTaskEntry( + name="Populate Backlog Snapshot", + function=self.async_core.adb_client.populate_backlog_snapshot, + interval=IntervalEnum.DAILY ), - kwargs={ - "operator": await self.loader.get_sync_agencies_task_operator() - } - ) - self.sync_data_sources_job = self.scheduler.add_job( - self.run_task, - trigger=IntervalTrigger( - days=1, - start_date=datetime.now() + timedelta(minutes=3) + ScheduledTaskEntry( + name="Sync Agencies", + function=self.run_task, + interval=IntervalEnum.DAILY, + kwargs={ + "operator": await self.loader.get_sync_agencies_task_operator() + } ), - kwargs={ - "operator": await self.loader.get_sync_data_sources_task_operator() - } - ) - # TODO: enable once more URLs with HTML have been added to the database. - # self.push_to_hugging_face_job = self.scheduler.add_job( - # self.run_task, - # trigger=IntervalTrigger( - # days=1, - # start_date=datetime.now() + timedelta(minutes=4) - # ), - # kwargs={ - # "operator": await self.loader.get_push_to_hugging_face_task_operator() - # } - # ) + ScheduledTaskEntry( + name="Sync Data Sources", + function=self.run_task, + interval=IntervalEnum.DAILY, + kwargs={ + "operator": await self.loader.get_sync_data_sources_task_operator() + } + ), + # ScheduledTaskEntry( + # name="Push to Hugging Face", + # function=self.run_task, + # interval=IntervalEnum.DAILY, + # kwargs={ + # "operator": await self.loader.get_push_to_hugging_face_task_operator() + # } + # ) + ] + + async def add_scheduled_tasks(self): + """ + Modifies: + self._jobs + """ + entries: list[ScheduledTaskEntry] = await self._get_entries() + for idx, entry in enumerate(entries): + self._jobs[entry.name] = self.scheduler.add_job( + entry.function, + trigger=IntervalTrigger( + hours=convert_interval_enum_to_hours(entry.interval), + start_date=datetime.now() + timedelta(minutes=idx + 1) + ), + misfire_grace_time=60, + kwargs=entry.kwargs + ) def shutdown(self): if self.scheduler.running: diff --git a/tests/automated/integration/tasks/url/probe/setup/mocks/__init__.py b/src/core/tasks/scheduled/models/__init__.py similarity index 100% rename from tests/automated/integration/tasks/url/probe/setup/mocks/__init__.py rename to src/core/tasks/scheduled/models/__init__.py diff --git a/src/core/tasks/scheduled/models/entry.py b/src/core/tasks/scheduled/models/entry.py new file mode 100644 index 00000000..8413baea --- /dev/null +++ b/src/core/tasks/scheduled/models/entry.py @@ -0,0 +1,16 @@ +from typing import Any + +from pydantic import BaseModel + +from src.core.tasks.scheduled.enums import IntervalEnum + + +class ScheduledTaskEntry(BaseModel): + + class Config: + arbitrary_types_allowed = True + + name: str + function: Any + interval: IntervalEnum + kwargs: dict[str, Any] = {} \ No newline at end of file diff --git a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/insert/params.py b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/insert/params.py index 1cab6e0d..f0e4a570 100644 --- a/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/insert/params.py +++ b/src/core/tasks/scheduled/sync/data_sources/queries/upsert/url/insert/params.py @@ -1,5 +1,6 @@ from src.collectors.enums import URLStatus from src.core.enums import RecordType +from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.templates.markers.bulk.insert import BulkInsertableModel @@ -10,6 +11,7 @@ class InsertURLForDataSourcesSyncParams(BulkInsertableModel): description: str | None outcome: URLStatus record_type: RecordType + source: URLSource = URLSource.DATA_SOURCES @classmethod def sa_model(cls) -> type[URL]: diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index 59896f94..bee76770 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -111,10 +111,10 @@ async def get_url_probe_task_operator(self): async def get_task_operators(self) -> list[URLTaskOperatorBase]: return [ - await self.get_url_probe_task_operator(), + # await self.get_url_probe_task_operator(), await self.get_url_html_task_operator(), await self.get_url_duplicate_task_operator(), - await self.get_url_404_probe_task_operator(), + # await self.get_url_404_probe_task_operator(), await self.get_url_record_type_task_operator(), await self.get_agency_identification_task_operator(), await self.get_url_miscellaneous_metadata_task_operator(), diff --git a/src/core/tasks/url/operators/probe/convert.py b/src/core/tasks/url/operators/probe/convert.py new file mode 100644 index 00000000..8de86587 --- /dev/null +++ b/src/core/tasks/url/operators/probe/convert.py @@ -0,0 +1,18 @@ +from src.core.tasks.url.operators.probe.tdo import URLProbeTDO +from src.db.models.instantiations.url.web_metadata.insert import URLWebMetadataPydantic + + +def convert_tdo_to_web_metadata_list(tdos: list[URLProbeTDO]) -> list[URLWebMetadataPydantic]: + results: list[URLWebMetadataPydantic] = [] + for tdo in tdos: + response = tdo.response.response + web_metadata_object = URLWebMetadataPydantic( + url_id=tdo.url_mapping.url_id, + accessed=response.status_code != 404, + status_code=response.status_code, + content_type=response.content_type, + error_message=response.error + ) + results.append(web_metadata_object) + return results + diff --git a/src/core/tasks/url/operators/probe/core.py b/src/core/tasks/url/operators/probe/core.py index 98d4f8ab..ab518bcd 100644 --- a/src/core/tasks/url/operators/probe/core.py +++ b/src/core/tasks/url/operators/probe/core.py @@ -2,8 +2,10 @@ from typing_extensions import override from src.core.tasks.url.operators.base import URLTaskOperatorBase +from src.core.tasks.url.operators.probe.convert import convert_tdo_to_web_metadata_list +from src.core.tasks.url.operators.probe.filter import filter_non_redirect_tdos, filter_redirect_tdos +from src.core.tasks.url.operators.probe.queries.insert_redirects.query import InsertRedirectsQueryBuilder from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from src.db.models.instantiations.url.web_metadata.pydantic import URLWebMetadataPydantic from src.external.url_request.core import URLRequestInterface from src.db.client.async_ import AsyncDatabaseClient from src.db.dtos.url.mapping import URLMapping @@ -57,21 +59,18 @@ async def probe_urls(self, tdos: list[URLProbeTDO]) -> None: ) # Re-associate the responses with the URL mappings for response in responses: - tdo = url_to_tdo[response.url] + tdo = url_to_tdo[response.original_url] tdo.response = response async def update_database(self, tdos: list[URLProbeTDO]) -> None: - web_metadata_objects: list[URLWebMetadataPydantic] = [] - for tdo in tdos: - response = tdo.response - web_metadata_object = URLWebMetadataPydantic( - url_id=tdo.url_mapping.url_id, - accessed=response.status_code is not None, - status_code=response.status_code, - content_type=response.content_type, - error_message=response.error - ) - web_metadata_objects.append(web_metadata_object) + non_redirect_tdos = filter_non_redirect_tdos(tdos) + web_metadata_objects = convert_tdo_to_web_metadata_list(non_redirect_tdos) await self.adb_client.bulk_insert(web_metadata_objects) + redirect_tdos = filter_redirect_tdos(tdos) + + query_builder = InsertRedirectsQueryBuilder(tdos=redirect_tdos) + await self.adb_client.run_query_builder(query_builder) + + diff --git a/src/core/tasks/url/operators/probe/filter.py b/src/core/tasks/url/operators/probe/filter.py new file mode 100644 index 00000000..4a129676 --- /dev/null +++ b/src/core/tasks/url/operators/probe/filter.py @@ -0,0 +1,8 @@ +from src.core.tasks.url.operators.probe.tdo import URLProbeTDO + + +def filter_non_redirect_tdos(tdos: list[URLProbeTDO]) -> list[URLProbeTDO]: + return [tdo for tdo in tdos if not tdo.response.is_redirect] + +def filter_redirect_tdos(tdos: list[URLProbeTDO]) -> list[URLProbeTDO]: + return [tdo for tdo in tdos if tdo.response.is_redirect] \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/insert.py b/src/core/tasks/url/operators/probe/queries/insert.py deleted file mode 100644 index 2b312e36..00000000 --- a/src/core/tasks/url/operators/probe/queries/insert.py +++ /dev/null @@ -1,15 +0,0 @@ -from sqlalchemy.ext.asyncio import AsyncSession -from typing_extensions import override, final - -from src.db.queries.base.builder import QueryBuilderBase - -@final -class InsertURLMetadataInfoQueryBuilder(QueryBuilderBase): - - def __init__( - self, - - ): - - @override - async def run(self, session: AsyncSession) -> None: diff --git a/tests/automated/integration/tasks/url/probe/setup/models/__init__.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/__init__.py similarity index 100% rename from tests/automated/integration/tasks/url/probe/setup/models/__init__.py rename to src/core/tasks/url/operators/probe/queries/insert_redirects/__init__.py diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py new file mode 100644 index 00000000..62de2ae1 --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py @@ -0,0 +1,56 @@ +from src.core.tasks.url.operators.probe.queries.insert_redirects.models.url_response_map import URLResponseMapping +from src.core.tasks.url.operators.probe.queries.urls.exist.model import UrlExistsResult +from src.core.tasks.url.operators.probe.tdo import URLProbeTDO +from src.db.dtos.url.mapping import URLMapping +from src.db.models.instantiations.url.core.enums import URLSource +from src.db.models.instantiations.url.core.pydantic.insert import URLInsertModel +from src.db.models.instantiations.url.web_metadata.insert import URLWebMetadataPydantic + + +def convert_url_response_mapping_to_web_metadata_list( + url_response_mappings: list[URLResponseMapping] +) -> list[URLWebMetadataPydantic]: + results: list[URLWebMetadataPydantic] = [] + for url_response_mapping in url_response_mappings: + response = url_response_mapping.response + web_metadata_object = URLWebMetadataPydantic( + url_id=url_response_mapping.url_mapping.url_id, + accessed=response.status_code is not None, + status_code=response.status_code, + content_type=response.content_type, + error_message=response.error + ) + results.append(web_metadata_object) + return results + + +def convert_to_url_mappings(url_exists_results: list[UrlExistsResult]) -> list[URLMapping]: + return [ + URLMapping( + url=url_exists_result.url, + url_id=url_exists_result.url_id + ) for url_exists_result in url_exists_results + ] + + +def convert_to_url_insert_models(urls: list[str]) -> list[URLInsertModel]: + results = [] + for url in urls: + results.append( + URLInsertModel( + url=url, + source=URLSource.REDIRECT + ) + ) + return results + +def convert_tdo_to_url_response_mappings(tdos: list[URLProbeTDO]) -> list[URLResponseMapping]: + results = [] + for tdo in tdos: + results.append( + URLResponseMapping( + url_mapping=tdo.url_mapping, + response=tdo.response.response.source + ) + ) + return results \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/extract.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/extract.py new file mode 100644 index 00000000..c44e1a83 --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/extract.py @@ -0,0 +1,16 @@ +from src.core.tasks.url.operators.probe.tdo import URLProbeTDO +from src.db.dtos.url.mapping import URLMapping +from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair + + +def extract_response_pairs(tdos: list[URLProbeTDO]) -> list[URLProbeRedirectResponsePair]: + results = [] + for tdo in tdos: + if not tdo.response.is_redirect: + raise ValueError(f"Expected {tdo.url_mapping.url} to be a redirect.") + + response: URLProbeRedirectResponsePair = tdo.response.response + if not isinstance(response, URLProbeRedirectResponsePair): + raise ValueError(f"Expected {tdo.url_mapping.url} to be {URLProbeRedirectResponsePair.__name__}.") + results.append(response) + return results diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/filter.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/filter.py new file mode 100644 index 00000000..1f36893d --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/filter.py @@ -0,0 +1,14 @@ +from src.db.dtos.url.mapping import URLMapping + + +def filter_new_dest_urls( + url_mappings_in_db: list[URLMapping], + all_dest_urls: list[str] +) -> list[str]: + extant_destination_urls: set[str] = set([url_mapping.url for url_mapping in url_mappings_in_db]) + new_dest_urls: list[str] = [ + url + for url in all_dest_urls + if url not in extant_destination_urls + ] + return new_dest_urls \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/map.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/map.py new file mode 100644 index 00000000..53f2b2e1 --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/map.py @@ -0,0 +1,19 @@ +from src.core.tasks.url.operators.probe.queries.insert_redirects.models.url_response_map import URLResponseMapping +from src.db.dtos.url.mapping import URLMapping +from src.external.url_request.probe.models.response import URLProbeResponse + + +def map_url_mappings_to_probe_responses( + url_mappings: list[URLMapping], + url_to_probe_responses: dict[str, URLProbeResponse] +) -> list[URLResponseMapping]: + results = [] + for url_mapping in url_mappings: + response = url_to_probe_responses[url_mapping.url] + results.append( + URLResponseMapping( + url_mapping=url_mapping, + response=response + ) + ) + return results \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/setup/queries/__init__.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/models/__init__.py similarity index 100% rename from tests/automated/integration/tasks/url/probe/setup/queries/__init__.py rename to src/core/tasks/url/operators/probe/queries/insert_redirects/models/__init__.py diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/models/url_response_map.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/models/url_response_map.py new file mode 100644 index 00000000..efbd5db8 --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/models/url_response_map.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +from src.db.dtos.url.mapping import URLMapping +from src.external.url_request.probe.models.response import URLProbeResponse + + +class URLResponseMapping(BaseModel): + url_mapping: URLMapping + response: URLProbeResponse \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py new file mode 100644 index 00000000..a79cca77 --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py @@ -0,0 +1,76 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.probe.queries.insert_redirects.extract import extract_response_pairs +from src.core.tasks.url.operators.probe.queries.insert_redirects.filter import filter_new_dest_urls +from src.core.tasks.url.operators.probe.queries.insert_redirects.request_manager import InsertRedirectsRequestManager +from src.core.tasks.url.operators.probe.tdo import URLProbeTDO +from src.db.dtos.url.mapping import URLMapping +from src.db.queries.base.builder import QueryBuilderBase +from src.external.url_request.probe.models.response import URLProbeResponse +from src.util.url_mapper import URLMapper + + +class InsertRedirectsQueryBuilder(QueryBuilderBase): + def __init__( + self, + tdos: list[URLProbeTDO], + ): + super().__init__() + self.tdos = tdos + self.source_url_mappings = [tdo.url_mapping for tdo in self.tdos] + self._mapper = URLMapper(self.source_url_mappings) + + self._response_pairs = extract_response_pairs(self.tdos) + + self._destination_probe_responses: list[URLProbeResponse] = [ + pair.destination + for pair in self._response_pairs + ] + self._destination_urls: list[str] = [ + response.url + for response in self._destination_probe_responses + ] + + self._destination_url_to_probe_response_mapping: dict[str, URLProbeResponse] = { + response.url: response + for response in self._destination_probe_responses + } + + + + + async def run(self, session: AsyncSession) -> None: + """ + Modifies: + self._mapper + """ + + rm = InsertRedirectsRequestManager( + session=session + ) + + dest_url_mappings_in_db: list[URLMapping] = await rm.get_url_mappings_in_db( + urls=self._destination_urls + ) + + new_dest_urls: list[str] = filter_new_dest_urls( + url_mappings_in_db=dest_url_mappings_in_db, + all_dest_urls=self._destination_urls + ) + new_dest_url_mappings: list[URLMapping] = await rm.insert_new_urls( + urls=new_dest_urls + ) + all_dest_url_mappings: list[URLMapping] = dest_url_mappings_in_db + new_dest_url_mappings + + self._mapper.add_mappings(all_dest_url_mappings) + + await rm.add_web_metadata( + all_dest_url_mappings=all_dest_url_mappings, + dest_url_to_probe_response_mappings=self._destination_url_to_probe_response_mapping, + tdos=self.tdos + ) + + await rm.add_redirect_links( + response_pairs=self._response_pairs, + mapper=self._mapper + ) diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py new file mode 100644 index 00000000..924de9ef --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py @@ -0,0 +1,81 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.probe.queries.insert_redirects.convert import convert_to_url_mappings, \ + convert_to_url_insert_models, convert_tdo_to_url_response_mappings, \ + convert_url_response_mapping_to_web_metadata_list +from src.core.tasks.url.operators.probe.queries.insert_redirects.map import map_url_mappings_to_probe_responses +from src.core.tasks.url.operators.probe.queries.insert_redirects.models.url_response_map import URLResponseMapping +from src.core.tasks.url.operators.probe.queries.urls.exist.model import UrlExistsResult +from src.core.tasks.url.operators.probe.queries.urls.exist.query import URLsExistInDBQueryBuilder +from src.core.tasks.url.operators.probe.tdo import URLProbeTDO +from src.db.dtos.url.mapping import URLMapping +from src.db.helpers.session import session_helper as sh +from src.db.models.instantiations.link.url_redirect_url.pydantic import LinkURLRedirectURLPydantic +from src.db.models.instantiations.url.web_metadata.insert import URLWebMetadataPydantic +from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair +from src.external.url_request.probe.models.response import URLProbeResponse +from src.util.url_mapper import URLMapper + + +class InsertRedirectsRequestManager: + + def __init__(self, session: AsyncSession): + self.session = session + + async def get_url_mappings_in_db( + self, + urls: list[str], + ): + results: list[UrlExistsResult] = await URLsExistInDBQueryBuilder( + urls=urls + ).run(self.session) + extant_urls = [result for result in results if result.exists] + return convert_to_url_mappings(extant_urls) + + async def insert_new_urls(self, urls: list[str]) -> list[URLMapping]: + if len(urls) == 0: + return [] + deduplicated_urls = list(set(urls)) + insert_models = convert_to_url_insert_models(deduplicated_urls) + url_ids = await sh.bulk_insert(self.session, models=insert_models, return_ids=True) + url_mappings = [ + URLMapping(url=url, url_id=url_id) + for url, url_id + in zip(deduplicated_urls, url_ids) + ] + return url_mappings + + async def add_web_metadata( + self, + all_dest_url_mappings: list[URLMapping], + dest_url_to_probe_response_mappings: dict[str, URLProbeResponse], + tdos: list[URLProbeTDO], + ) -> None: + dest_url_response_mappings = map_url_mappings_to_probe_responses( + url_mappings=all_dest_url_mappings, + url_to_probe_responses=dest_url_to_probe_response_mappings + ) + src_url_response_mappings: list[URLResponseMapping] = convert_tdo_to_url_response_mappings( + tdos=tdos + ) + all_url_response_mappings: list[URLResponseMapping] = src_url_response_mappings + dest_url_response_mappings + web_metadata_list: list[URLWebMetadataPydantic] = convert_url_response_mapping_to_web_metadata_list( + all_url_response_mappings + ) + await sh.bulk_upsert(self.session, models=web_metadata_list) + + async def add_redirect_links( + self, + response_pairs: list[URLProbeRedirectResponsePair], + mapper: URLMapper + ) -> None: + links: list[LinkURLRedirectURLPydantic] = [] + for pair in response_pairs: + source_url_id = mapper.get_id(pair.source.url) + destination_url_id = mapper.get_id(pair.destination.url) + link = LinkURLRedirectURLPydantic( + source_url_id=source_url_id, + destination_url_id=destination_url_id + ) + links.append(link) + await sh.bulk_insert(self.session, models=links) diff --git a/src/core/tasks/url/operators/probe/queries/urls/__init__.py b/src/core/tasks/url/operators/probe/queries/urls/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/probe/queries/urls/exist/__init__.py b/src/core/tasks/url/operators/probe/queries/urls/exist/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/probe/queries/urls/exist/model.py b/src/core/tasks/url/operators/probe/queries/urls/exist/model.py new file mode 100644 index 00000000..1245044c --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/urls/exist/model.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + + +class UrlExistsResult(BaseModel): + url: str + url_id: int | None + + @property + def exists(self): + return self.url_id is not None \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/urls/exist/query.py b/src/core/tasks/url/operators/probe/queries/urls/exist/query.py new file mode 100644 index 00000000..207648cc --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/urls/exist/query.py @@ -0,0 +1,29 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.probe.queries.urls.exist.model import UrlExistsResult +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase +from src.db.helpers.session import session_helper as sh + +class URLsExistInDBQueryBuilder(QueryBuilderBase): + """Checks if URLs exist in the database.""" + + def __init__(self, urls: list[str]): + super().__init__() + self.urls = urls + + async def run(self, session: AsyncSession) -> list[UrlExistsResult]: + query = select(URL.id, URL.url).where(URL.url.in_(self.urls)) + db_mappings = await sh.mappings(session, query=query) + + url_to_id_map: dict[str, int] = { + row["url"]: row["id"] + for row in db_mappings + } + return [ + UrlExistsResult( + url=url, + url_id=url_to_id_map.get(url) + ) for url in self.urls + ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/urls/not_probed/__init__.py b/src/core/tasks/url/operators/probe/queries/urls/not_probed/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/probe/queries/has_urls.py b/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py similarity index 100% rename from src/core/tasks/url/operators/probe/queries/has_urls.py rename to src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py diff --git a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/__init__.py b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/clean.py b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/clean.py new file mode 100644 index 00000000..3beae86a --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/clean.py @@ -0,0 +1,9 @@ + + +def clean_url(url: str) -> str: + # Remove Non-breaking spaces + url = url.replace("\u00A0", "") + url = url.replace(" ", "") + url = url.replace("%C2%A0", "") + return url + diff --git a/src/core/tasks/url/operators/probe/queries/get_urls.py b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py similarity index 78% rename from src/core/tasks/url/operators/probe/queries/get_urls.py rename to src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py index 971d1974..aa0f4d5b 100644 --- a/src/core/tasks/url/operators/probe/queries/get_urls.py +++ b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py @@ -2,6 +2,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from typing_extensions import override, final +from src.core.tasks.url.operators.probe.queries.urls.not_probed.get.clean import clean_url from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata @@ -29,4 +30,9 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: .limit(500) ) db_mappings = await sh.mappings(session, query=query) - return [URLMapping(**mapping) for mapping in db_mappings] \ No newline at end of file + return [ + URLMapping( + url_id=mapping["url_id"], + url=clean_url(mapping["url"]) + ) for mapping in db_mappings + ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/tdo.py b/src/core/tasks/url/operators/probe/tdo.py index 8af513c1..5208fd80 100644 --- a/src/core/tasks/url/operators/probe/tdo.py +++ b/src/core/tasks/url/operators/probe/tdo.py @@ -1,9 +1,9 @@ from pydantic import BaseModel -from src.external.url_request.probe.model import URLProbeResponse from src.db.dtos.url.mapping import URLMapping +from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper class URLProbeTDO(BaseModel): url_mapping: URLMapping - response: URLProbeResponse | None = None + response: URLProbeResponseOuterWrapper | None = None diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 25b40852..475d8404 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -77,8 +77,8 @@ HasURLsWithoutAgencySuggestionsQueryBuilder from src.core.tasks.url.operators.auto_relevant.models.tdo import URLRelevantTDO from src.core.tasks.url.operators.auto_relevant.queries.get_tdos import GetAutoRelevantTDOsQueryBuilder -from src.core.tasks.url.operators.probe.queries.get_urls import GetURLsWithoutProbeQueryBuilder -from src.core.tasks.url.operators.probe.queries.has_urls import HasURLsWithoutProbeQueryBuilder +from src.core.tasks.url.operators.probe.queries.urls.not_probed.get.query import GetURLsWithoutProbeQueryBuilder +from src.core.tasks.url.operators.probe.queries.urls.not_probed.exists import HasURLsWithoutProbeQueryBuilder from src.core.tasks.url.operators.probe_404.tdo import URL404ProbeTDO from src.core.tasks.url.operators.submit_approved.queries.get import GetValidatedURLsQueryBuilder from src.core.tasks.url.operators.submit_approved.queries.has_validated import HasValidatedURLsQueryBuilder @@ -262,6 +262,10 @@ async def scalars(self, session: AsyncSession, statement): async def mapping(self, session: AsyncSession, statement): return await sh.mapping(session, statement) + @session_manager + async def one_or_none(self, session: AsyncSession, statement): + return await sh.one_or_none(session, statement) + @session_manager async def run_query_builder( self, @@ -901,7 +905,8 @@ async def insert_url( url_entry = URL( url=url_info.url, collector_metadata=url_info.collector_metadata, - outcome=url_info.outcome.value + outcome=url_info.outcome.value, + source=url_info.source ) if url_info.created_at is not None: url_entry.created_at = url_info.created_at diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 613c335b..b893abc1 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -120,7 +120,8 @@ def insert_url(self, session, url_info: URLInfo) -> int: url=url_info.url, collector_metadata=url_info.collector_metadata, outcome=url_info.outcome, - name=url_info.name + name=url_info.name, + source=url_info.source ) if url_info.created_at is not None: url_entry.created_at = url_info.created_at @@ -143,7 +144,7 @@ def insert_urls(self, url_infos: List[URLInfo], batch_id: int) -> InsertURLsInfo try: url_id = self.insert_url(url_info) url_mappings.append(URLMapping(url_id=url_id, url=url_info.url)) - except IntegrityError: + except IntegrityError as e: orig_url_info = self.get_url_info_by_url(url_info.url) duplicate_info = DuplicateInsertInfo( duplicate_batch_id=batch_id, diff --git a/src/db/models/helpers.py b/src/db/models/helpers.py index 6295415d..f205f0b9 100644 --- a/src/db/models/helpers.py +++ b/src/db/models/helpers.py @@ -30,4 +30,11 @@ def enum_column( nullable=nullable ) +def url_id_column() -> Column[int]: + return Column( + Integer(), + ForeignKey('urls.id', ondelete='CASCADE'), + nullable=False + ) + CURRENT_TIME_SERVER_DEFAULT = func.now() diff --git a/src/db/models/instantiations/link/url_redirect_url/__init__.py b/src/db/models/instantiations/link/url_redirect_url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/link/url_redirect_url/pydantic.py b/src/db/models/instantiations/link/url_redirect_url/pydantic.py new file mode 100644 index 00000000..30799391 --- /dev/null +++ b/src/db/models/instantiations/link/url_redirect_url/pydantic.py @@ -0,0 +1,12 @@ +from src.db.models.instantiations.link.url_redirect_url.sqlalchemy import LinkURLRedirectURL +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class LinkURLRedirectURLPydantic(BulkInsertableModel): + source_url_id: int + destination_url_id: int + + @classmethod + def sa_model(cls) -> type[LinkURLRedirectURL]: + return LinkURLRedirectURL + diff --git a/src/db/models/instantiations/link/url_redirect_url/sqlalchemy.py b/src/db/models/instantiations/link/url_redirect_url/sqlalchemy.py new file mode 100644 index 00000000..312cbb57 --- /dev/null +++ b/src/db/models/instantiations/link/url_redirect_url/sqlalchemy.py @@ -0,0 +1,10 @@ +from src.db.models.helpers import url_id_column +from src.db.models.templates_.standard import StandardBase + + + +class LinkURLRedirectURL(StandardBase): + __tablename__ = "link_urls_redirect_url" + source_url_id = url_id_column() + destination_url_id = url_id_column() + diff --git a/src/db/models/instantiations/url/core/enums.py b/src/db/models/instantiations/url/core/enums.py new file mode 100644 index 00000000..88fe5bc4 --- /dev/null +++ b/src/db/models/instantiations/url/core/enums.py @@ -0,0 +1,9 @@ +from enum import Enum + + +class URLSource(Enum): + COLLECTOR = "collector" + MANUAL = "manual" + DATA_SOURCES = "data_sources_app" + REDIRECT = "redirect" + ROOT_URL = "root_url" \ No newline at end of file diff --git a/src/db/models/instantiations/url/core/pydantic/info.py b/src/db/models/instantiations/url/core/pydantic/info.py index 6099db29..d0130c88 100644 --- a/src/db/models/instantiations/url/core/pydantic/info.py +++ b/src/db/models/instantiations/url/core/pydantic/info.py @@ -4,6 +4,7 @@ from pydantic import BaseModel from src.collectors.enums import URLStatus +from src.db.models.instantiations.url.core.enums import URLSource class URLInfo(BaseModel): @@ -15,3 +16,4 @@ class URLInfo(BaseModel): updated_at: datetime.datetime | None = None created_at: datetime.datetime | None = None name: str | None = None + source: URLSource | None = None diff --git a/src/db/models/instantiations/url/core/pydantic/insert.py b/src/db/models/instantiations/url/core/pydantic/insert.py index e384416e..438294f6 100644 --- a/src/db/models/instantiations/url/core/pydantic/insert.py +++ b/src/db/models/instantiations/url/core/pydantic/insert.py @@ -1,5 +1,6 @@ from src.collectors.enums import URLStatus from src.core.enums import RecordType +from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.insert import BulkInsertableModel @@ -14,6 +15,7 @@ def sa_model(cls) -> type[Base]: url: str collector_metadata: dict | None = None - name: str - outcome: URLStatus - record_type: RecordType \ No newline at end of file + name: str | None = None + outcome: URLStatus = URLStatus.PENDING + record_type: RecordType | None = None + source: URLSource \ No newline at end of file diff --git a/src/db/models/instantiations/url/core/sqlalchemy.py b/src/db/models/instantiations/url/core/sqlalchemy.py index 4b4c0159..d0af49b1 100644 --- a/src/db/models/instantiations/url/core/sqlalchemy.py +++ b/src/db/models/instantiations/url/core/sqlalchemy.py @@ -4,6 +4,7 @@ from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.db.models.helpers import enum_column +from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin from src.db.models.templates_.with_id import WithIDBase @@ -18,7 +19,7 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): # The metadata from the collector collector_metadata = Column(JSON) # The outcome of the URL: submitted, human_labeling, rejected, duplicate, etc. - outcome: Column = enum_column( + outcome = enum_column( URLStatus, name='url_status', nullable=False @@ -28,6 +29,11 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): name='record_type', nullable=True ) + source = enum_column( + URLSource, + name='url_source', + nullable=False + ) # Relationships batch = relationship( diff --git a/src/db/models/instantiations/url/error_info/pydantic.py b/src/db/models/instantiations/url/error_info/pydantic.py index c8596a13..74baf5e3 100644 --- a/src/db/models/instantiations/url/error_info/pydantic.py +++ b/src/db/models/instantiations/url/error_info/pydantic.py @@ -1,7 +1,4 @@ import datetime -from typing import Optional - -from pydantic import BaseModel from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo from src.db.models.templates_.base import Base diff --git a/src/db/models/instantiations/url/web_metadata/pydantic.py b/src/db/models/instantiations/url/web_metadata/insert.py similarity index 69% rename from src/db/models/instantiations/url/web_metadata/pydantic.py rename to src/db/models/instantiations/url/web_metadata/insert.py index 0dc25f2d..430ed798 100644 --- a/src/db/models/instantiations/url/web_metadata/pydantic.py +++ b/src/db/models/instantiations/url/web_metadata/insert.py @@ -3,15 +3,22 @@ from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.insert import BulkInsertableModel +from src.db.templates.markers.bulk.upsert import BulkUpsertableModel -class URLWebMetadataPydantic(BulkInsertableModel): +class URLWebMetadataPydantic( + BulkInsertableModel, + BulkUpsertableModel +): @classmethod def sa_model(cls) -> type[Base]: """Defines the SQLAlchemy model.""" return URLWebMetadata + @classmethod + def id_field(cls) -> str: + return "url_id" url_id: int accessed: bool diff --git a/src/external/url_request/core.py b/src/external/url_request/core.py index d17164d7..093fe1ab 100644 --- a/src/external/url_request/core.py +++ b/src/external/url_request/core.py @@ -2,7 +2,8 @@ from src.external.url_request.dtos.url_response import URLResponseInfo from src.external.url_request.probe.core import URLProbeManager -from src.external.url_request.probe.model import URLProbeResponse +from src.external.url_request.probe.models.response import URLProbeResponse +from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper from src.external.url_request.request import fetch_urls @@ -15,7 +16,7 @@ async def make_requests_with_html( return await fetch_urls(urls) @staticmethod - async def probe_urls(urls: list[str]) -> list[URLProbeResponse]: + async def probe_urls(urls: list[str]) -> list[URLProbeResponseOuterWrapper]: async with ClientSession(timeout=ClientTimeout(total=30)) as session: manager = URLProbeManager(session=session) return await manager.probe_urls(urls=urls) diff --git a/src/external/url_request/probe/convert.py b/src/external/url_request/probe/convert.py new file mode 100644 index 00000000..eafb7158 --- /dev/null +++ b/src/external/url_request/probe/convert.py @@ -0,0 +1,112 @@ +from http import HTTPStatus +from typing import Sequence + +from aiohttp import ClientResponse, ClientResponseError + +from src.external.url_request.probe.models.response import URLProbeResponse +from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair +from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper + + +def _process_client_response_history(history: Sequence[ClientResponse]) -> list[str]: + return [str(cr.url) for cr in history] + + +def _extract_content_type(cr: ClientResponse, error: str | None) -> str | None: + if error is None: + return cr.content_type + return None + + +def _extract_redirect_probe_response(cr: ClientResponse) -> URLProbeResponse | None: + """Returns the probe response for the first redirect. + + This is the original URL that was probed.""" + if len(cr.history) == 0: + return None + + all_urls = [str(cr.url) for cr in cr.history] + first_url = all_urls[0] + + return URLProbeResponse( + url=first_url, + status_code=HTTPStatus.FOUND.value, + content_type=None, + error=None, + ) + + +def _extract_error(cr: ClientResponse) -> str | None: + try: + cr.raise_for_status() + return None + except ClientResponseError as e: + return str(e) + +def _has_redirect(cr: ClientResponse) -> bool: + return len(cr.history) > 0 + +def _extract_source_url(cr: ClientResponse) -> str: + return str(cr.history[0].url) + +def _extract_destination_url(cr: ClientResponse) -> str: + return str(cr.url) + +def convert_client_response_to_probe_response( + cr: ClientResponse +) -> URLProbeResponse | URLProbeRedirectResponsePair: + error = _extract_error(cr) + content_type = _extract_content_type(cr, error=error) + if not _has_redirect(cr): + return URLProbeResponse( + url=str(cr.url), + status_code=cr.status, + content_type=content_type, + error=error, + ) + + # Extract into separate probe responses + source_cr = cr.history[0] # Source CR is the first in the history + destination_cr = cr + + source_url = str(source_cr.url) + destination_url = str(destination_cr.url) + + source_error = _extract_error(source_cr) + source_content_type = _extract_content_type(source_cr, error=source_error) + source_probe_response = URLProbeResponse( + url=source_url, + status_code=source_cr.status, + content_type=source_content_type, + error=source_error, + ) + + + destination_error = _extract_error(destination_cr) + destination_content_type = _extract_content_type(destination_cr, error=destination_error) + destination_probe_response = URLProbeResponse( + url=destination_url, + status_code=destination_cr.status, + content_type=destination_content_type, + error=destination_error, + ) + + return URLProbeRedirectResponsePair( + source=source_probe_response, + destination=destination_probe_response + ) + +def convert_to_error_response( + url: str, + error: str, + status_code: int | None = None +) -> URLProbeResponseOuterWrapper: + return URLProbeResponseOuterWrapper( + original_url=url, + response=URLProbeResponse( + url=url, + status_code=status_code, + content_type=None, + error=error + ) + ) diff --git a/src/external/url_request/probe/core.py b/src/external/url_request/probe/core.py index 0b5bb934..a6eb9b99 100644 --- a/src/external/url_request/probe/core.py +++ b/src/external/url_request/probe/core.py @@ -1,11 +1,14 @@ -import asyncio +from http import HTTPStatus -from aiohttp import ClientSession, ClientResponseError - -from src.external.url_request.probe.format import format_client_response, format_client_response_error, format_error -from src.external.url_request.probe.model import URLProbeResponse +from aiohttp import ClientSession, InvalidUrlClientError, ClientConnectorSSLError, ClientConnectorDNSError, \ + ClientConnectorCertificateError, ClientResponseError, ClientConnectorError from tqdm.asyncio import tqdm_asyncio +from src.external.url_request.probe.convert import convert_client_response_to_probe_response, convert_to_error_response +from src.external.url_request.probe.models.response import URLProbeResponse +from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper + + class URLProbeManager: def __init__( @@ -14,30 +17,53 @@ def __init__( ): self.session = session - async def probe_urls(self, urls: list[str]) -> list[URLProbeResponse]: - return await tqdm_asyncio.gather(*[self.probe_url(url) for url in urls]) + async def probe_urls(self, urls: list[str]) -> list[URLProbeResponseOuterWrapper]: + return await tqdm_asyncio.gather(*[self._probe(url) for url in urls]) + + async def _probe(self, url: str) -> URLProbeResponseOuterWrapper: + try: + response = await self._head(url) + if not response.is_redirect and response.response.status_code == HTTPStatus.OK: + return response + # Fallback to GET if HEAD fails + return await self._get(url) + except InvalidUrlClientError: + return convert_to_error_response(url, error="Invalid URL") + except ( + ClientConnectorError, + ClientConnectorSSLError, + ClientConnectorDNSError, + ClientConnectorCertificateError + ) as e: + return convert_to_error_response(url, error=str(e)) + - async def probe_url(self, url: str) -> URLProbeResponse: - result = await self.head(url) - if result.error is None: - return result - return await self.get(url) - async def head(self, url: str) -> URLProbeResponse: + async def _head(self, url: str) -> URLProbeResponseOuterWrapper: try: - async with self.session.head(url) as response: - return format_client_response(url, response=response) + async with self.session.head(url, allow_redirects=True) as response: + return URLProbeResponseOuterWrapper( + original_url=url, + response=convert_client_response_to_probe_response(response) + ) except ClientResponseError as e: - return format_client_response_error(url, error=e) - except Exception as e: - return format_error(url, error=e) + return convert_to_error_response( + url, + error=str(e), + status_code=e.status + ) - async def get(self, url: str) -> URLProbeResponse: + async def _get(self, url: str) -> URLProbeResponseOuterWrapper: try: - async with self.session.get(url) as response: - return format_client_response(url, response=response) + async with self.session.get(url, allow_redirects=True) as response: + return URLProbeResponseOuterWrapper( + original_url=url, + response=convert_client_response_to_probe_response(response) + ) except ClientResponseError as e: - return format_client_response_error(url, error=e) - except Exception as e: - return format_error(url, error=e) \ No newline at end of file + return convert_to_error_response( + url, + error=str(e), + status_code=e.status + ) diff --git a/src/external/url_request/probe/format.py b/src/external/url_request/probe/format.py index 65430c1e..b528de4d 100644 --- a/src/external/url_request/probe/format.py +++ b/src/external/url_request/probe/format.py @@ -1,32 +1,7 @@ from aiohttp import ClientResponse, ClientResponseError -from src.external.url_request.probe.model import URLProbeResponse +from src.external.url_request.probe.models.response import URLProbeResponse def format_content_type(content_type: str) -> str: return content_type.split(";")[0].strip() - -def format_client_response(url: str, response: ClientResponse) -> URLProbeResponse: - return URLProbeResponse( - url=url, - status_code=response.status, - content_type=format_content_type( - response.headers.get("content-type") - ) - ) - -def format_client_response_error(url: str, error: ClientResponseError) -> URLProbeResponse: - return URLProbeResponse( - url=url, - status_code=error.status, - content_type=None, - error=str(error) - ) - -def format_error(url: str, error: Exception) -> URLProbeResponse: - return URLProbeResponse( - url=url, - status_code=None, - content_type=None, - error=str(error) - ) \ No newline at end of file diff --git a/src/external/url_request/probe/models/__init__.py b/src/external/url_request/probe/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/url_request/probe/models/redirect.py b/src/external/url_request/probe/models/redirect.py new file mode 100644 index 00000000..56c9f227 --- /dev/null +++ b/src/external/url_request/probe/models/redirect.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from src.external.url_request.probe.models.response import URLProbeResponse + + +class URLProbeRedirectResponsePair(BaseModel): + source: URLProbeResponse + destination: URLProbeResponse \ No newline at end of file diff --git a/src/external/url_request/probe/model.py b/src/external/url_request/probe/models/response.py similarity index 76% rename from src/external/url_request/probe/model.py rename to src/external/url_request/probe/models/response.py index 0af80ea4..967f1c4f 100644 --- a/src/external/url_request/probe/model.py +++ b/src/external/url_request/probe/models/response.py @@ -1,4 +1,5 @@ -from pydantic import BaseModel, model_validator, Field +from pydantic import BaseModel, Field, model_validator + class URLProbeResponse(BaseModel): @@ -10,8 +11,6 @@ class URLProbeResponse(BaseModel): @model_validator(mode='after') def check_error_mutually_exclusive_with_content(self): if self.error is None: - if self.content_type is None: - raise ValueError('Content type required if no error') if self.status_code is None: raise ValueError('Status code required if no error') return self @@ -20,3 +19,4 @@ def check_error_mutually_exclusive_with_content(self): raise ValueError('Content type mutually exclusive with error') return self + diff --git a/src/external/url_request/probe/models/wrapper.py b/src/external/url_request/probe/models/wrapper.py new file mode 100644 index 00000000..04dbc9c4 --- /dev/null +++ b/src/external/url_request/probe/models/wrapper.py @@ -0,0 +1,13 @@ +from pydantic import BaseModel + +from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair +from src.external.url_request.probe.models.response import URLProbeResponse + + +class URLProbeResponseOuterWrapper(BaseModel): + original_url: str + response: URLProbeResponse | URLProbeRedirectResponsePair + + @property + def is_redirect(self) -> bool: + return isinstance(self.response, URLProbeRedirectResponsePair) diff --git a/src/util/url_mapper.py b/src/util/url_mapper.py new file mode 100644 index 00000000..15ac6918 --- /dev/null +++ b/src/util/url_mapper.py @@ -0,0 +1,28 @@ +from src.db.dtos.url.mapping import URLMapping + + +class URLMapper: + + def __init__(self, mappings: list[URLMapping]): + self._url_to_id = { + mapping.url: mapping.url_id + for mapping in mappings + } + self._id_to_url = { + mapping.url_id: mapping.url + for mapping in mappings + } + + def get_id(self, url: str) -> int: + return self._url_to_id[url] + + def get_url(self, url_id: int) -> str: + return self._id_to_url[url_id] + + def add_mapping(self, mapping: URLMapping) -> None: + self._url_to_id[mapping.url] = mapping.url_id + self._id_to_url[mapping.url_id] = mapping.url + + def add_mappings(self, mappings: list[URLMapping]) -> None: + for mapping in mappings: + self.add_mapping(mapping) \ No newline at end of file diff --git a/tests/automated/integration/db/client/test_insert_urls.py b/tests/automated/integration/db/client/test_insert_urls.py index 28a2483d..644261b2 100644 --- a/tests/automated/integration/db/client/test_insert_urls.py +++ b/tests/automated/integration/db/client/test_insert_urls.py @@ -3,6 +3,7 @@ from src.core.enums import BatchStatus from src.db.models.instantiations.batch.pydantic import BatchInfo from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.core.sqlalchemy import URL @@ -25,14 +26,17 @@ async def test_insert_urls( URLInfo( url="https://example.com/1", collector_metadata={"name": "example_1"}, + source=URLSource.COLLECTOR ), URLInfo( url="https://example.com/2", + source=URLSource.COLLECTOR ), # Duplicate URLInfo( url="https://example.com/1", collector_metadata={"name": "example_duplicate"}, + source=URLSource.COLLECTOR ) ] insert_urls_info = await adb_client_test.insert_urls( diff --git a/tests/automated/integration/db/structure/test_url.py b/tests/automated/integration/db/structure/test_url.py deleted file mode 100644 index 1c14d519..00000000 --- a/tests/automated/integration/db/structure/test_url.py +++ /dev/null @@ -1,45 +0,0 @@ -import sqlalchemy as sa -from sqlalchemy.dialects import postgresql - -from src.collectors.enums import URLStatus -from src.util.helper_functions import get_enum_values -from tests.automated.integration.db.structure.testers.models.column import ColumnTester -from tests.automated.integration.db.structure.testers.table import TableTester -from tests.helpers.data_creator.core import DBDataCreator - - -def test_url(db_data_creator: DBDataCreator): - batch_id = db_data_creator.batch() - table_tester = TableTester( - table_name="urls", - columns=[ - ColumnTester( - column_name="batch_id", - type_=sa.Integer, - allowed_values=[batch_id], - ), - ColumnTester( - column_name="url", - type_=sa.String, - allowed_values=["https://example.com"], - ), - ColumnTester( - column_name="collector_metadata", - type_=sa.JSON, - allowed_values=[{}] - ), - ColumnTester( - column_name="outcome", - type_=postgresql.ENUM, - allowed_values=get_enum_values(URLStatus) - ), - ColumnTester( - column_name="name", - type_=sa.String, - allowed_values=['test'], - ) - ], - engine=db_data_creator.db_client.engine - ) - - table_tester.run_column_tests() diff --git a/tests/automated/integration/db/test_change_log.py b/tests/automated/integration/db/test_change_log.py deleted file mode 100644 index dde2d702..00000000 --- a/tests/automated/integration/db/test_change_log.py +++ /dev/null @@ -1,96 +0,0 @@ -import pytest -from sqlalchemy import update, delete - -from src.db.client.async_ import AsyncDatabaseClient -from src.db.enums import ChangeLogOperationType -from src.db.models.instantiations.change_log import ChangeLog -from src.db.models.instantiations.url.core.sqlalchemy import URL - - -class _TestChangeGetter: - - def __init__(self, adb: AsyncDatabaseClient): - self.adb = adb - - async def get_change_log_entries(self): - return await self.adb.get_all(ChangeLog) - -@pytest.mark.asyncio -async def test_change_log(wiped_database, adb_client_test: AsyncDatabaseClient): - getter = _TestChangeGetter(adb_client_test) - - # Confirm no entries in the change log table - entries = await getter.get_change_log_entries() - assert len(entries) == 0 - - # Add entry to URL table - url = URL( - url="test_url", - name="test_name", - description="test_description", - outcome='pending' - ) - url_id = await adb_client_test.add(url, return_id=True) - - # Choose a single logged table -- URL -- for testing - entries = await getter.get_change_log_entries() - assert len(entries) == 1 - entry: ChangeLog = entries[0] - assert entry.operation_type == ChangeLogOperationType.INSERT - assert entry.table_name == "urls" - assert entry.affected_id == url_id - assert entry.old_data is None - assert entry.new_data is not None - nd = entry.new_data - assert nd["id"] == url_id - assert nd["url"] == "test_url" - assert nd["name"] == "test_name" - assert nd["description"] == "test_description" - assert nd["outcome"] == "pending" - assert nd["created_at"] is not None - assert nd["updated_at"] is not None - assert nd['record_type'] is None - assert nd['collector_metadata'] is None - - # Update URL - - await adb_client_test.execute( - update(URL).where(URL.id == url_id).values( - name="new_name", - description="new_description" - ) - ) - - # Confirm change log entry - entries = await getter.get_change_log_entries() - assert len(entries) == 2 - entry: ChangeLog = entries[1] - assert entry.operation_type == ChangeLogOperationType.UPDATE - assert entry.table_name == "urls" - assert entry.affected_id == url_id - assert entry.old_data is not None - assert entry.new_data is not None - od = entry.old_data - nd = entry.new_data - assert nd['description'] == "new_description" - assert od['description'] == "test_description" - assert nd['name'] == "new_name" - assert od['name'] == "test_name" - assert nd['updated_at'] is not None - assert od['updated_at'] is not None - - # Delete URL - await adb_client_test.execute( - delete(URL).where(URL.id == url_id) - ) - - # Confirm change log entry - entries = await getter.get_change_log_entries() - assert len(entries) == 3 - entry: ChangeLog = entries[2] - assert entry.operation_type == ChangeLogOperationType.DELETE - assert entry.table_name == "urls" - assert entry.affected_id == url_id - assert entry.old_data is not None - assert entry.new_data is None - diff --git a/tests/automated/integration/tasks/scheduled/huggingface/setup/queries/setup.py b/tests/automated/integration/tasks/scheduled/huggingface/setup/queries/setup.py index 8e345d51..d4fd84ad 100644 --- a/tests/automated/integration/tasks/scheduled/huggingface/setup/queries/setup.py +++ b/tests/automated/integration/tasks/scheduled/huggingface/setup/queries/setup.py @@ -1,5 +1,6 @@ from sqlalchemy.ext.asyncio import AsyncSession +from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase @@ -35,6 +36,7 @@ async def run(self, session: AsyncSession) -> list[Record]: name=name, description=description, record_type=inp.record_type, + source=URLSource.COLLECTOR ) session.add(url) await session.flush() diff --git a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/url.py b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/url.py index 2c563f09..a4bd93f8 100644 --- a/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/url.py +++ b/tests/automated/integration/tasks/scheduled/sync/data_sources/setup/manager/url.py @@ -2,6 +2,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.sqlalchemy import URL from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo from tests.automated.integration.tasks.scheduled.sync.data_sources.setup.enums import AgencyAssigned @@ -67,6 +68,7 @@ async def setup_sc_entry( collector_metadata={}, outcome=entry.url_status.value, record_type=entry.record_type.value if entry.record_type is not None else None, + source=URLSource.COLLECTOR ) url_id = await self.adb_client.add(url, return_id=True) links = [] diff --git a/tests/automated/integration/tasks/url/html/setup/manager.py b/tests/automated/integration/tasks/url/html/setup/manager.py index 8e679a57..7cfac879 100644 --- a/tests/automated/integration/tasks/url/html/setup/manager.py +++ b/tests/automated/integration/tasks/url/html/setup/manager.py @@ -5,8 +5,9 @@ from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.pydantic.insert import URLInsertModel -from src.db.models.instantiations.url.web_metadata.pydantic import URLWebMetadataPydantic +from src.db.models.instantiations.url.web_metadata.insert import URLWebMetadataPydantic from tests.automated.integration.tasks.url.html.mocks.methods import mock_get_from_cache, mock_parse from tests.automated.integration.tasks.url.html.mocks.url_request_interface.core import MockURLRequestInterface from tests.automated.integration.tasks.url.html.setup.data import TEST_ENTRIES @@ -32,7 +33,8 @@ async def _setup_urls(self) -> list[TestURLHTMLTaskSetupRecord]: outcome=entry.url_info.status, url=entry.url_info.url, name=f"Test for {entry.url_info.url}", - record_type=RecordType.RESOURCES + record_type=RecordType.RESOURCES, + source=URLSource.COLLECTOR ) url_insert_models.append(url_insert_model) url_ids = await self.adb_client.bulk_insert(url_insert_models, return_ids=True) diff --git a/tests/automated/integration/tasks/url/probe/check/__init__.py b/tests/automated/integration/tasks/url/probe/check/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/check/manager.py b/tests/automated/integration/tasks/url/probe/check/manager.py new file mode 100644 index 00000000..e8486838 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/check/manager.py @@ -0,0 +1,56 @@ +from sqlalchemy import select + +from src.collectors.enums import URLStatus +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.instantiations.link.url_redirect_url.sqlalchemy import LinkURLRedirectURL +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata + + +class TestURLProbeCheckManager: + + def __init__( + self, + adb_client: AsyncDatabaseClient + ): + self.adb_client = adb_client + + async def check_url( + self, + url_id: int, + expected_status: URLStatus + ): + url: URL = await self.adb_client.one_or_none(select(URL).where(URL.id == url_id)) + assert url is not None + assert url.outcome == expected_status + + async def check_web_metadata( + self, + url_id: int, + status_code: int | None, + content_type: str | None, + error: str | None, + accessed: bool + ): + web_metadata: URLWebMetadata = await self.adb_client.one_or_none( + select(URLWebMetadata).where(URLWebMetadata.url_id == url_id) + ) + assert web_metadata is not None + assert web_metadata.url_id == url_id + assert web_metadata.status_code == status_code + assert web_metadata.content_type == content_type + assert web_metadata.error_message == error + assert web_metadata.accessed == accessed + + async def check_redirect( + self, + source_url_id: int, + ) -> int: + """ + Check existence of redirect link using source_url_id and return destination_url_id + """ + redirect: LinkURLRedirectURL = await self.adb_client.one_or_none( + select(LinkURLRedirectURL).where(LinkURLRedirectURL.source_url_id == source_url_id) + ) + assert redirect is not None + return redirect.destination_url_id \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/conftest.py b/tests/automated/integration/tasks/url/probe/conftest.py index b8836a4b..45d3d820 100644 --- a/tests/automated/integration/tasks/url/probe/conftest.py +++ b/tests/automated/integration/tasks/url/probe/conftest.py @@ -1,15 +1,23 @@ -import pytest_asyncio +import pytest -from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator -from src.external.url_request.core import URLRequestInterface -from tests.automated.integration.tasks.url.probe.constants import PATCH_ROOT -from tests.automated.integration.tasks.url.probe.setup.mocks.probe_manager import MockURLProbeManager +from src.db.client.async_ import AsyncDatabaseClient +from tests.automated.integration.tasks.url.probe.check.manager import TestURLProbeCheckManager +from tests.automated.integration.tasks.url.probe.setup.manager import TestURLProbeSetupManager -@pytest_asyncio.fixture -async def operator(adb_client_test, monkeypatch): - monkeypatch.setattr(PATCH_ROOT, MockURLProbeManager) - yield URLProbeTaskOperator( - adb_client=adb_client_test, - url_request_interface=URLRequestInterface() +@pytest.fixture +def setup_manager( + adb_client_test: AsyncDatabaseClient +) -> TestURLProbeSetupManager: + return TestURLProbeSetupManager( + adb_client=adb_client_test ) + + +@pytest.fixture +def check_manager( + adb_client_test: AsyncDatabaseClient +) -> TestURLProbeCheckManager: + return TestURLProbeCheckManager( + adb_client=adb_client_test + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/constants.py b/tests/automated/integration/tasks/url/probe/constants.py index 6bc307e5..1a6e0e7b 100644 --- a/tests/automated/integration/tasks/url/probe/constants.py +++ b/tests/automated/integration/tasks/url/probe/constants.py @@ -1,3 +1,6 @@ +from src.db.models.instantiations.url.core.enums import URLSource - -PATCH_ROOT = "src.external.url_request.core.URLProbeManager" \ No newline at end of file +PATCH_ROOT = "src.external.url_request.core.URLProbeManager" +TEST_URL = "https://www.example.com" +TEST_DEST_URL = "https://www.example.com/redirect" +TEST_SOURCE = URLSource.COLLECTOR diff --git a/tests/automated/integration/tasks/url/probe/mocks/__init__.py b/tests/automated/integration/tasks/url/probe/mocks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/mocks/url_request_interface.py b/tests/automated/integration/tasks/url/probe/mocks/url_request_interface.py new file mode 100644 index 00000000..cc493274 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/mocks/url_request_interface.py @@ -0,0 +1,22 @@ +from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper + + +class MockURLRequestInterface: + + def __init__( + self, + response_or_responses: URLProbeResponseOuterWrapper | list[URLProbeResponseOuterWrapper] + ): + if not isinstance(response_or_responses, list): + responses = [response_or_responses] + else: + responses = response_or_responses + + self._url_to_response = { + response.original_url: response for response in responses + } + + async def probe_urls(self, urls: list[str]) -> list[URLProbeResponseOuterWrapper]: + return [ + self._url_to_response[url] for url in urls + ] diff --git a/tests/automated/integration/tasks/url/probe/models/__init__.py b/tests/automated/integration/tasks/url/probe/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/models/entry.py b/tests/automated/integration/tasks/url/probe/models/entry.py new file mode 100644 index 00000000..810f40ea --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/models/entry.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + +from src.collectors.enums import URLStatus +from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper + + +class TestURLProbeTaskEntry(BaseModel): + url: str + url_status: URLStatus + planned_response: URLProbeResponseOuterWrapper \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/no_redirect/__init__.py b/tests/automated/integration/tasks/url/probe/no_redirect/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/no_redirect/test_error.py b/tests/automated/integration/tasks/url/probe/no_redirect/test_error.py new file mode 100644 index 00000000..c62498c2 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/no_redirect/test_error.py @@ -0,0 +1,46 @@ +import pytest + +from src.collectors.enums import URLStatus +from tests.automated.integration.tasks.asserts import assert_task_ran_without_error +from tests.automated.integration.tasks.url.probe.check.manager import TestURLProbeCheckManager +from tests.automated.integration.tasks.url.probe.setup.manager import TestURLProbeSetupManager + + +@pytest.mark.asyncio +async def test_url_probe_task_error( + setup_manager: TestURLProbeSetupManager, + check_manager: TestURLProbeCheckManager +): + """ + If a URL returns a 500 error response (or any other error), + the task should add web metadata response to the database + with + - the correct status + - content_type = None + - accessed = True + - the expected error message + """ + operator = setup_manager.setup_operator( + response_or_responses=setup_manager.setup_no_redirect_probe_response( + status_code=500, + content_type=None, + error="Something went wrong" + ) + ) + assert not await operator.meets_task_prerequisites() + url_id = await setup_manager.setup_url(URLStatus.SUBMITTED) + assert await operator.meets_task_prerequisites() + run_info = await operator.run_task(1) + assert_task_ran_without_error(run_info) + assert not await operator.meets_task_prerequisites() + await check_manager.check_url( + url_id=url_id, + expected_status=URLStatus.SUBMITTED + ) + await check_manager.check_web_metadata( + url_id=url_id, + status_code=500, + content_type=None, + error="Something went wrong", + accessed=True + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/no_redirect/test_not_found.py b/tests/automated/integration/tasks/url/probe/no_redirect/test_not_found.py new file mode 100644 index 00000000..44dab7f5 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/no_redirect/test_not_found.py @@ -0,0 +1,47 @@ +import pytest + +from src.collectors.enums import URLStatus +from tests.automated.integration.tasks.asserts import assert_task_ran_without_error +from tests.automated.integration.tasks.url.probe.check.manager import TestURLProbeCheckManager +from tests.automated.integration.tasks.url.probe.setup.manager import TestURLProbeSetupManager + + +@pytest.mark.asyncio +async def test_url_probe_task_not_found( + setup_manager: TestURLProbeSetupManager, + check_manager: TestURLProbeCheckManager +): + """ + If a URL returns a 404 error response, + the task should add web metadata response to the database + with + - the correct status + - content_type = None + - accessed = False + - error_message = "Not found." + """ + + operator = setup_manager.setup_operator( + response_or_responses=setup_manager.setup_no_redirect_probe_response( + status_code=404, + content_type=None, + error="Not found." + ) + ) + assert not await operator.meets_task_prerequisites() + url_id = await setup_manager.setup_url(URLStatus.NOT_RELEVANT) + assert await operator.meets_task_prerequisites() + run_info = await operator.run_task(1) + assert_task_ran_without_error(run_info) + assert not await operator.meets_task_prerequisites() + await check_manager.check_url( + url_id=url_id, + expected_status=URLStatus.NOT_RELEVANT + ) + await check_manager.check_web_metadata( + url_id=url_id, + status_code=404, + content_type=None, + error="Not found.", + accessed=False + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/no_redirect/test_ok.py b/tests/automated/integration/tasks/url/probe/no_redirect/test_ok.py new file mode 100644 index 00000000..607e503d --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/no_redirect/test_ok.py @@ -0,0 +1,51 @@ +import pytest + +from src.collectors.enums import URLStatus +from tests.automated.integration.tasks.asserts import assert_task_ran_without_error +from tests.automated.integration.tasks.url.probe.check.manager import TestURLProbeCheckManager +from tests.automated.integration.tasks.url.probe.setup.manager import TestURLProbeSetupManager + + +@pytest.mark.asyncio +async def test_url_probe_task_no_redirect_ok( + setup_manager: TestURLProbeSetupManager, + check_manager: TestURLProbeCheckManager +): + """ + If a URL returns a 200 OK response, + the task should add web metadata response to the database + with + - the correct status + - the correct content_type + - accessed = True + - error_message = None + """ + operator = setup_manager.setup_operator( + response_or_responses=setup_manager.setup_no_redirect_probe_response( + status_code=200, + content_type="text/html", + error=None + ) + ) + assert not await operator.meets_task_prerequisites() + url_id = await setup_manager.setup_url(URLStatus.PENDING) + assert await operator.meets_task_prerequisites() + run_info = await operator.run_task(1) + assert_task_ran_without_error(run_info) + assert not await operator.meets_task_prerequisites() + await check_manager.check_url( + url_id=url_id, + expected_status=URLStatus.PENDING + ) + await check_manager.check_web_metadata( + url_id=url_id, + status_code=200, + content_type="text/html", + accessed=True, + error=None + ) + + + + + diff --git a/tests/automated/integration/tasks/url/probe/no_redirect/test_two_urls.py b/tests/automated/integration/tasks/url/probe/no_redirect/test_two_urls.py new file mode 100644 index 00000000..a67d7713 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/no_redirect/test_two_urls.py @@ -0,0 +1,42 @@ +import pytest + +from src.collectors.enums import URLStatus +from src.db.models.instantiations.url.core.sqlalchemy import URL +from tests.automated.integration.tasks.asserts import assert_task_ran_without_error +from tests.automated.integration.tasks.url.probe.check.manager import TestURLProbeCheckManager +from tests.automated.integration.tasks.url.probe.setup.manager import TestURLProbeSetupManager + + +@pytest.mark.asyncio +async def test_two_urls( + setup_manager: TestURLProbeSetupManager, + check_manager: TestURLProbeCheckManager +): + url_1 = "https://example.com/1" + url_2 = "https://example.com/2" + operator = setup_manager.setup_operator( + response_or_responses=[ + setup_manager.setup_no_redirect_probe_response( + status_code=200, + content_type="text/html", + error=None, + url=url_1 + ), + setup_manager.setup_no_redirect_probe_response( + status_code=200, + content_type="text/html", + error=None, + url=url_2 + ) + ] + ) + assert not await operator.meets_task_prerequisites() + url_id_1 = await setup_manager.setup_url(URLStatus.PENDING, url=url_1) + url_id_2 = await setup_manager.setup_url(URLStatus.NOT_RELEVANT, url=url_2) + assert await operator.meets_task_prerequisites() + run_info = await operator.run_task(1) + assert_task_ran_without_error(run_info) + assert not await operator.meets_task_prerequisites() + + urls = await check_manager.adb_client.get_all(URL) + assert len(urls) == 2 diff --git a/tests/automated/integration/tasks/url/probe/redirect/__init__.py b/tests/automated/integration/tasks/url/probe/redirect/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/redirect/dest_new/README.md b/tests/automated/integration/tasks/url/probe/redirect/dest_new/README.md new file mode 100644 index 00000000..bb03c102 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/redirect/dest_new/README.md @@ -0,0 +1 @@ +Tests for when the destination is a new URL not in the database. \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/redirect/dest_new/__init__.py b/tests/automated/integration/tasks/url/probe/redirect/dest_new/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/probe/redirect/dest_new/test_dest_ok.py b/tests/automated/integration/tasks/url/probe/redirect/dest_new/test_dest_ok.py new file mode 100644 index 00000000..acb7c1a8 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/redirect/dest_new/test_dest_ok.py @@ -0,0 +1,56 @@ +import pytest + +from src.collectors.enums import URLStatus +from tests.automated.integration.tasks.asserts import assert_task_ran_without_error +from tests.automated.integration.tasks.url.probe.check.manager import TestURLProbeCheckManager +from tests.automated.integration.tasks.url.probe.setup.manager import TestURLProbeSetupManager + + +@pytest.mark.asyncio +async def test_url_probe_task_redirect_dest_new_ok( + setup_manager: TestURLProbeSetupManager, + check_manager: TestURLProbeCheckManager +): + """ + If a URL + - returns a redirect response to a new URL, + - and the new URL returns a 200 OK response and does not exist in the database, + the task + - should add the new URL to the database + - along with web metadata response to the database + - and the link between the original URL and the new URL. + """ + operator = setup_manager.setup_operator( + response_or_responses=setup_manager.setup_redirect_probe_response( + redirect_status_code=301, + dest_status_code=200, + dest_content_type="text/html", + dest_error=None + ) + ) + source_url_id = await setup_manager.setup_url(URLStatus.PENDING) + run_info = await operator.run_task(1) + assert_task_ran_without_error(run_info) + await check_manager.check_url( + url_id=source_url_id, + expected_status=URLStatus.PENDING + ) + await check_manager.check_web_metadata( + url_id=source_url_id, + status_code=301, + content_type=None, + error=None, + accessed=True + ) + dest_url_id = await check_manager.check_redirect(source_url_id) + await check_manager.check_url( + url_id=dest_url_id, + expected_status=URLStatus.PENDING + ) + await check_manager.check_web_metadata( + url_id=dest_url_id, + status_code=200, + content_type="text/html", + error=None, + accessed=True + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/redirect/test_dest_exists_in_db.py b/tests/automated/integration/tasks/url/probe/redirect/test_dest_exists_in_db.py new file mode 100644 index 00000000..9dbb03d6 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/redirect/test_dest_exists_in_db.py @@ -0,0 +1,70 @@ +import pytest + +from src.collectors.enums import URLStatus +from src.db.models.instantiations.url.web_metadata.insert import URLWebMetadataPydantic +from tests.automated.integration.tasks.asserts import assert_task_ran_without_error +from tests.automated.integration.tasks.url.probe.check.manager import TestURLProbeCheckManager +from tests.automated.integration.tasks.url.probe.constants import TEST_DEST_URL +from tests.automated.integration.tasks.url.probe.setup.manager import TestURLProbeSetupManager + + +@pytest.mark.asyncio +async def test_url_probe_task_redirect_dest_exists_in_db( + setup_manager: TestURLProbeSetupManager, + check_manager: TestURLProbeCheckManager +): + """ + If a URL: + - returns a redirect response to a new URL, + - and the new URL already exists in the database, + the task should add web metadata response to the database URL + and a link between the original URL and the new URL. + + """ + operator = setup_manager.setup_operator( + response_or_responses=setup_manager.setup_redirect_probe_response( + redirect_status_code=302, + dest_status_code=200, + dest_content_type="text/html", + dest_error=None + ) + ) + source_url_id = await setup_manager.setup_url(URLStatus.INDIVIDUAL_RECORD) + dest_url_id = await setup_manager.setup_url(URLStatus.PENDING, url=TEST_DEST_URL) + # Add web metadata for destination URL, to prevent it from being pulled + web_metadata = URLWebMetadataPydantic( + url_id=dest_url_id, + status_code=200, + content_type="text/html", + error_message=None, + accessed=True + ) + await setup_manager.adb_client.bulk_insert([web_metadata]) + run_info = await operator.run_task(1) + assert_task_ran_without_error(run_info) + await check_manager.check_url( + url_id=source_url_id, + expected_status=URLStatus.INDIVIDUAL_RECORD + ) + await check_manager.check_url( + url_id=dest_url_id, + expected_status=URLStatus.PENDING + ) + await check_manager.check_web_metadata( + url_id=source_url_id, + status_code=302, + content_type=None, + error=None, + accessed=True + ) + await check_manager.check_web_metadata( + url_id=dest_url_id, + status_code=200, + content_type="text/html", + error=None, + accessed=True + ) + redirect_url_id = await check_manager.check_redirect( + source_url_id=source_url_id + ) + assert redirect_url_id == dest_url_id \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/redirect/test_redirect_infinite.py b/tests/automated/integration/tasks/url/probe/redirect/test_redirect_infinite.py new file mode 100644 index 00000000..637c3a63 --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/redirect/test_redirect_infinite.py @@ -0,0 +1,46 @@ +import pytest + +from src.collectors.enums import URLStatus +from tests.automated.integration.tasks.url.probe.check.manager import TestURLProbeCheckManager +from tests.automated.integration.tasks.url.probe.constants import TEST_URL +from tests.automated.integration.tasks.url.probe.setup.manager import TestURLProbeSetupManager + + +@pytest.mark.asyncio +async def test_url_probe_task_redirect_infinite( + setup_manager: TestURLProbeSetupManager, + check_manager: TestURLProbeCheckManager +): + """ + If a URL: + - returns a redirect response to itself + The task should add a link that points to itself + as well as web metadata response to the database URL + """ + + operator = setup_manager.setup_operator( + response_or_responses=setup_manager.setup_redirect_probe_response( + redirect_status_code=303, + dest_status_code=303, + dest_content_type=None, + dest_error=None, + redirect_url=TEST_URL + ) + ) + url_id = await setup_manager.setup_url(URLStatus.PENDING) + run_info = await operator.run_task(1) + await check_manager.check_url( + url_id=url_id, + expected_status=URLStatus.PENDING + ) + await check_manager.check_web_metadata( + url_id=url_id, + status_code=303, + content_type=None, + error=None, + accessed=True + ) + redirect_url_id = await check_manager.check_redirect( + source_url_id=url_id, + ) + assert redirect_url_id == url_id diff --git a/tests/automated/integration/tasks/url/probe/redirect/test_two_urls_same_dest.py b/tests/automated/integration/tasks/url/probe/redirect/test_two_urls_same_dest.py new file mode 100644 index 00000000..0104b5ee --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/redirect/test_two_urls_same_dest.py @@ -0,0 +1,56 @@ +import pytest + +from src.collectors.enums import URLStatus +from tests.automated.integration.tasks.asserts import assert_task_ran_without_error +from tests.automated.integration.tasks.url.probe.check.manager import TestURLProbeCheckManager +from tests.automated.integration.tasks.url.probe.setup.manager import TestURLProbeSetupManager + + +@pytest.mark.asyncio +async def test_url_probe_task_redirect_two_urls_same_dest( + setup_manager: TestURLProbeSetupManager, + check_manager: TestURLProbeCheckManager +): + """ + If two URLs: + - return a redirect response to the same URL + Two links to that URL should be added to the database, one for each URL + """ + + operator = setup_manager.setup_operator( + response_or_responses=[ + setup_manager.setup_redirect_probe_response( + redirect_status_code=307, + dest_status_code=200, + dest_content_type=None, + dest_error=None, + ), + setup_manager.setup_redirect_probe_response( + redirect_status_code=308, + dest_status_code=200, + dest_content_type=None, + dest_error=None, + source_url="https://example.com/2", + ), + ] + ) + source_url_id_1 = await setup_manager.setup_url(URLStatus.PENDING) + source_url_id_2 = await setup_manager.setup_url(URLStatus.PENDING, url="https://example.com/2") + run_info = await operator.run_task(1) + assert_task_ran_without_error(run_info) + await check_manager.check_url( + url_id=source_url_id_1, + expected_status=URLStatus.PENDING + ) + await check_manager.check_url( + url_id=source_url_id_2, + expected_status=URLStatus.PENDING + ) + redirect_url_id_1 = await check_manager.check_redirect( + source_url_id=source_url_id_1 + ) + redirect_url_id_2 = await check_manager.check_redirect( + source_url_id=source_url_id_2 + ) + assert redirect_url_id_1 == redirect_url_id_2 + diff --git a/tests/automated/integration/tasks/url/probe/setup/core.py b/tests/automated/integration/tasks/url/probe/setup/core.py deleted file mode 100644 index 1884798b..00000000 --- a/tests/automated/integration/tasks/url/probe/setup/core.py +++ /dev/null @@ -1,22 +0,0 @@ -from src.core.enums import RecordType -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.pydantic.insert import URLInsertModel -from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata -from tests.automated.integration.tasks.url.probe.setup.data import SETUP_ENTRIES - - -async def create_urls_in_db( - adb_client: AsyncDatabaseClient, -) -> None: - record_types = [rt for rt in RecordType] - urls = [] - for idx, entry in enumerate(SETUP_ENTRIES): - url = URLInsertModel( - url=entry.url, - outcome=entry.url_status, - name=f"test-url-probe-task-url-{idx}", - record_type=record_types[idx] - ) - urls.append(url) - await adb_client.bulk_insert(urls) - diff --git a/tests/automated/integration/tasks/url/probe/setup/data.py b/tests/automated/integration/tasks/url/probe/setup/data.py deleted file mode 100644 index 85ad2547..00000000 --- a/tests/automated/integration/tasks/url/probe/setup/data.py +++ /dev/null @@ -1,36 +0,0 @@ -from src.collectors.enums import URLStatus -from tests.automated.integration.tasks.url.probe.setup.models.entry import TestURLProbeTaskEntry -from tests.automated.integration.tasks.url.probe.setup.models.planned_response import URLProbePlannedResponse - -SETUP_ENTRIES: list[TestURLProbeTaskEntry] = [ - TestURLProbeTaskEntry( - url="https://pending.com", - url_status=URLStatus.PENDING, - url_probe_response=URLProbePlannedResponse( - status_code=200, - content_type="text/html", - error=None - ), - expected_accessed=True - ), - TestURLProbeTaskEntry( - url="https://submitted.com", - url_status=URLStatus.SUBMITTED, - url_probe_response=URLProbePlannedResponse( - status_code=500, - content_type=None, - error="test error" - ), - expected_accessed=True - ), - TestURLProbeTaskEntry( - url="https://failure.com", - url_status=URLStatus.ERROR, - url_probe_response=URLProbePlannedResponse( - status_code=None, - content_type=None, - error="URL not found" - ), - expected_accessed=False - ) -] \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/setup/format.py b/tests/automated/integration/tasks/url/probe/setup/format.py deleted file mode 100644 index 8cb2fdb0..00000000 --- a/tests/automated/integration/tasks/url/probe/setup/format.py +++ /dev/null @@ -1,24 +0,0 @@ -from src.external.url_request.probe.model import URLProbeResponse -from tests.automated.integration.tasks.url.probe.setup.data import SETUP_ENTRIES -from tests.automated.integration.tasks.url.probe.setup.models.entry import TestURLProbeTaskEntry - - -def build_url_to_probe_response_map( -) -> dict[str, URLProbeResponse]: - d = {} - for entry in SETUP_ENTRIES: - probe_response = URLProbeResponse( - url=entry.url, - status_code=entry.url_probe_response.status_code, - content_type=entry.url_probe_response.content_type, - error=entry.url_probe_response.error - ) - d[entry.url] = probe_response - return d - -def build_url_to_entry_map( -) -> dict[str, TestURLProbeTaskEntry]: - d = {} - for entry in SETUP_ENTRIES: - d[entry.url] = entry - return d \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/setup/manager.py b/tests/automated/integration/tasks/url/probe/setup/manager.py new file mode 100644 index 00000000..3e0635ed --- /dev/null +++ b/tests/automated/integration/tasks/url/probe/setup/manager.py @@ -0,0 +1,100 @@ +from typing import cast, Literal + +from src.collectors.enums import URLStatus +from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.instantiations.url.core.pydantic.insert import URLInsertModel +from src.external.url_request.core import URLRequestInterface +from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair +from src.external.url_request.probe.models.response import URLProbeResponse +from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper +from tests.automated.integration.tasks.url.probe.constants import TEST_URL, TEST_DEST_URL, TEST_SOURCE +from tests.automated.integration.tasks.url.probe.mocks.url_request_interface import MockURLRequestInterface + + +class TestURLProbeSetupManager: + + def __init__( + self, + adb_client: AsyncDatabaseClient + ): + self.adb_client = adb_client + + async def setup_url( + self, + url_status: URLStatus, + url: str = TEST_URL + ) -> int: + url_insert_model = URLInsertModel( + url=url, + outcome=url_status, + source=TEST_SOURCE + ) + return ( + await self.adb_client.bulk_insert( + models=[url_insert_model], + return_ids=True + ) + )[0] + + def setup_operator( + self, + response_or_responses: URLProbeResponseOuterWrapper | list[URLProbeResponseOuterWrapper] + ) -> URLProbeTaskOperator: + operator = URLProbeTaskOperator( + adb_client=self.adb_client, + url_request_interface=cast( + URLRequestInterface, + MockURLRequestInterface( + response_or_responses=response_or_responses + ) + ) + ) + return operator + + @staticmethod + def setup_no_redirect_probe_response( + status_code: int | None, + content_type: str | None, + error: str | None, + url: str = TEST_URL + ) -> URLProbeResponseOuterWrapper: + return URLProbeResponseOuterWrapper( + original_url=url, + response=URLProbeResponse( + url=url, + status_code=status_code, + content_type=content_type, + error=error + ) + ) + + @staticmethod + def setup_redirect_probe_response( + redirect_status_code: Literal[301, 302, 303, 307, 308], + dest_status_code: int, + dest_content_type: str | None, + dest_error: str | None, + source_url: str = TEST_URL, + redirect_url: str = TEST_DEST_URL + ) -> URLProbeResponseOuterWrapper: + if redirect_status_code not in (301, 302, 303, 307, 308): + raise ValueError('Redirect response must be one of 301, 302, 303, 307, 308') + return URLProbeResponseOuterWrapper( + original_url=source_url, + response=URLProbeRedirectResponsePair( + source=URLProbeResponse( + url=source_url, + status_code=redirect_status_code, + content_type=None, + error=None + ), + destination=URLProbeResponse( + url=redirect_url, + status_code=dest_status_code, + content_type=dest_content_type, + error=dest_error + ) + ) + ) + diff --git a/tests/automated/integration/tasks/url/probe/setup/mocks/probe_manager.py b/tests/automated/integration/tasks/url/probe/setup/mocks/probe_manager.py deleted file mode 100644 index ac65ea9b..00000000 --- a/tests/automated/integration/tasks/url/probe/setup/mocks/probe_manager.py +++ /dev/null @@ -1,20 +0,0 @@ -from aiohttp import ClientSession - -from src.external.url_request.probe.model import URLProbeResponse -from tests.automated.integration.tasks.url.probe.setup.format import build_url_to_probe_response_map - - -class MockURLProbeManager: - - def __init__( - self, - session: ClientSession - ): - self.session = session - self._url_to_probe_response: dict[str, URLProbeResponse] = build_url_to_probe_response_map() - - async def probe_urls(self, urls: list[str]) -> list[URLProbeResponse]: - return [ - self._url_to_probe_response[url] - for url in urls - ] \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/setup/models/entry.py b/tests/automated/integration/tasks/url/probe/setup/models/entry.py deleted file mode 100644 index 6432de9c..00000000 --- a/tests/automated/integration/tasks/url/probe/setup/models/entry.py +++ /dev/null @@ -1,11 +0,0 @@ -from pydantic import BaseModel - -from src.collectors.enums import URLStatus -from tests.automated.integration.tasks.url.probe.setup.models.planned_response import URLProbePlannedResponse - - -class TestURLProbeTaskEntry(BaseModel): - url: str - url_status: URLStatus - url_probe_response: URLProbePlannedResponse - expected_accessed: bool diff --git a/tests/automated/integration/tasks/url/probe/setup/models/planned_response.py b/tests/automated/integration/tasks/url/probe/setup/models/planned_response.py deleted file mode 100644 index 41f17883..00000000 --- a/tests/automated/integration/tasks/url/probe/setup/models/planned_response.py +++ /dev/null @@ -1,7 +0,0 @@ -from pydantic import BaseModel - - -class URLProbePlannedResponse(BaseModel): - status_code: int | None - content_type: str | None - error: str | None \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/probe/setup/queries/check.py b/tests/automated/integration/tasks/url/probe/setup/queries/check.py deleted file mode 100644 index 988efffc..00000000 --- a/tests/automated/integration/tasks/url/probe/setup/queries/check.py +++ /dev/null @@ -1,43 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.db.helpers.session import session_helper as sh -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata -from src.db.queries.base.builder import QueryBuilderBase -from tests.automated.integration.tasks.url.probe.setup.data import SETUP_ENTRIES -from tests.automated.integration.tasks.url.probe.setup.format import build_url_to_entry_map -from tests.automated.integration.tasks.url.probe.setup.models.entry import TestURLProbeTaskEntry - - -class CheckURLsInDBForURLProbeTaskQueryBuilder(QueryBuilderBase): - - def __init__(self): - super().__init__() - self._entries = SETUP_ENTRIES - self._url_to_entry_map: dict[ - str, TestURLProbeTaskEntry - ] = build_url_to_entry_map() - - async def run(self, session: AsyncSession) -> None: - - query = ( - select( - URL.url, - URLWebMetadata.accessed, - URLWebMetadata.status_code, - URLWebMetadata.content_type, - URLWebMetadata.error_message - ) - .join(URLWebMetadata, URL.id == URLWebMetadata.url_id) - ) - mappings = await sh.mappings(session, query=query) - assert len(mappings) == len(self._entries) - for mapping in mappings: - url = mapping["url"] - entry = self._url_to_entry_map[url] - assert entry.expected_accessed == mapping["accessed"] - assert entry.url_probe_response.status_code == mapping["status_code"] - assert entry.url_probe_response.content_type == mapping["content_type"] - assert entry.url_probe_response.error == mapping["error_message"] - diff --git a/tests/automated/integration/tasks/url/probe/test_core.py b/tests/automated/integration/tasks/url/probe/test_core.py deleted file mode 100644 index ee3fe50c..00000000 --- a/tests/automated/integration/tasks/url/probe/test_core.py +++ /dev/null @@ -1,33 +0,0 @@ -import pytest - -from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator -from tests.automated.integration.tasks.asserts import assert_task_ran_without_error -from tests.automated.integration.tasks.url.probe.setup.core import create_urls_in_db -from tests.automated.integration.tasks.url.probe.setup.queries.check import CheckURLsInDBForURLProbeTaskQueryBuilder - - -@pytest.mark.asyncio -async def test_url_probe_task( - operator: URLProbeTaskOperator -): - adb_client = operator.adb_client - # Check task does not yet meet pre-requisites - assert not await operator.meets_task_prerequisites() - - # Set up URLs - await create_urls_in_db(adb_client=adb_client) - - # Check task meets pre-requisites - assert await operator.meets_task_prerequisites() - - # Run task - run_info = await operator.run_task(1) - assert_task_ran_without_error(run_info) - - # Check task no longer meets pre-requisites - assert not await operator.meets_task_prerequisites() - - # Check results as expected - await adb_client.run_query_builder( - CheckURLsInDBForURLProbeTaskQueryBuilder() - ) diff --git a/tests/automated/integration/tasks/url/test_url_404_probe.py b/tests/automated/integration/tasks/url/test_url_404_probe.py index 2022a8f3..0f445486 100644 --- a/tests/automated/integration/tasks/url/test_url_404_probe.py +++ b/tests/automated/integration/tasks/url/test_url_404_probe.py @@ -18,7 +18,9 @@ @pytest.mark.asyncio -async def test_url_404_probe_task(db_data_creator: DBDataCreator): +async def test_url_404_probe_task( + db_data_creator: DBDataCreator +): mock_html_content = "" mock_content_type = "text/html" diff --git a/tests/automated/unit/source_collectors/test_autogoogler_collector.py b/tests/automated/unit/source_collectors/test_autogoogler_collector.py index 20ddc362..096ea3eb 100644 --- a/tests/automated/unit/source_collectors/test_autogoogler_collector.py +++ b/tests/automated/unit/source_collectors/test_autogoogler_collector.py @@ -7,6 +7,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.core.logger import AsyncCoreLogger from src.collectors.source_collectors.auto_googler.collector import AutoGooglerCollector +from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.pydantic.info import URLInfo @@ -37,6 +38,12 @@ async def test_auto_googler_collector(patch_get_query_results): mock.assert_called_once_with("keyword") collector.adb_client.insert_urls.assert_called_once_with( - url_infos=[URLInfo(url="https://include.com/1", collector_metadata={"query": "keyword", "title": "keyword", "snippet": "snippet 1"})], + url_infos=[ + URLInfo( + url="https://include.com/1", + collector_metadata={"query": "keyword", "title": "keyword", "snippet": "snippet 1"}, + source=URLSource.COLLECTOR + ) + ], batch_id=1 ) \ No newline at end of file diff --git a/tests/automated/unit/source_collectors/test_common_crawl_collector.py b/tests/automated/unit/source_collectors/test_common_crawl_collector.py index 622da31b..4e69d1ad 100644 --- a/tests/automated/unit/source_collectors/test_common_crawl_collector.py +++ b/tests/automated/unit/source_collectors/test_common_crawl_collector.py @@ -6,6 +6,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.core.logger import AsyncCoreLogger from src.collectors.source_collectors.common_crawler.collector import CommonCrawlerCollector +from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.pydantic.info import URLInfo @@ -39,8 +40,8 @@ async def test_common_crawl_collector(mock_get_common_crawl_search_results): collector.adb_client.insert_urls.assert_called_once_with( url_infos=[ - URLInfo(url="http://keyword.com"), - URLInfo(url="http://keyword.com/page3") + URLInfo(url="http://keyword.com", source=URLSource.COLLECTOR), + URLInfo(url="http://keyword.com/page3", source=URLSource.COLLECTOR), ], batch_id=1 ) diff --git a/tests/automated/unit/source_collectors/test_muckrock_collectors.py b/tests/automated/unit/source_collectors/test_muckrock_collectors.py index a8afe591..d0a10982 100644 --- a/tests/automated/unit/source_collectors/test_muckrock_collectors.py +++ b/tests/automated/unit/source_collectors/test_muckrock_collectors.py @@ -10,6 +10,7 @@ from src.collectors.source_collectors.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO from src.collectors.source_collectors.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO from src.collectors.source_collectors.muckrock.fetch_requests.foia import FOIAFetchRequest +from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.pydantic.info import URLInfo PATCH_ROOT = "src.collectors.source_collectors.muckrock" @@ -55,10 +56,12 @@ async def test_muckrock_simple_collector(patch_muckrock_fetcher): URLInfo( url='https://include.com/1', collector_metadata={'absolute_url': 'https://include.com/1', 'title': 'keyword'}, + source=URLSource.COLLECTOR ), URLInfo( url='https://include.com/2', collector_metadata={'absolute_url': 'https://include.com/2', 'title': 'keyword'}, + source=URLSource.COLLECTOR ) ], batch_id=1 @@ -111,14 +114,17 @@ async def test_muckrock_county_search_collector(patch_muckrock_county_level_sear URLInfo( url='https://include.com/1', collector_metadata={'absolute_url': 'https://include.com/1', 'title': 'keyword'}, + source=URLSource.COLLECTOR ), URLInfo( url='https://include.com/2', collector_metadata={'absolute_url': 'https://include.com/2', 'title': 'keyword'}, + source=URLSource.COLLECTOR ), URLInfo( url='https://include.com/3', collector_metadata={'absolute_url': 'https://include.com/3', 'title': 'lemon'}, + source=URLSource.COLLECTOR ), ], batch_id=1 diff --git a/tests/helpers/data_creator/commands/impl/url_metadata.py b/tests/helpers/data_creator/commands/impl/url_metadata.py index 9d3cf4ff..608bc403 100644 --- a/tests/helpers/data_creator/commands/impl/url_metadata.py +++ b/tests/helpers/data_creator/commands/impl/url_metadata.py @@ -1,6 +1,6 @@ from http import HTTPStatus -from src.db.models.instantiations.url.web_metadata.pydantic import URLWebMetadataPydantic +from src.db.models.instantiations.url.web_metadata.insert import URLWebMetadataPydantic from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase diff --git a/tests/helpers/data_creator/commands/impl/urls.py b/tests/helpers/data_creator/commands/impl/urls.py index e4602dee..ab727bef 100644 --- a/tests/helpers/data_creator/commands/impl/urls.py +++ b/tests/helpers/data_creator/commands/impl/urls.py @@ -3,6 +3,7 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.db.dtos.url.insert import InsertURLsInfo +from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.pydantic.info import URLInfo from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.simple_test_data_functions import generate_test_urls @@ -38,7 +39,8 @@ def run_sync(self) -> InsertURLsInfo: outcome=self.outcome, name="Test Name" if self.outcome == URLStatus.VALIDATED else None, collector_metadata=self.collector_metadata, - created_at=self.created_at + created_at=self.created_at, + source=URLSource.COLLECTOR ) ) diff --git a/tests/manual/external/url_request/test_url_probe.py b/tests/manual/external/url_request/test_url_probe.py index d13d0f80..b987aa45 100644 --- a/tests/manual/external/url_request/test_url_probe.py +++ b/tests/manual/external/url_request/test_url_probe.py @@ -3,20 +3,18 @@ from src.external.url_request.probe.core import URLProbeManager URLS = [ - "https://www.google.com", - "https://www.example.com", - "https://www.example.org", - "https://www.nonexistent.com", +'https://citydocs.longbeach.gov/LBPDPublicDocs/DocView.aspx?id=162830&dbid=0&repo=LBPD-PUBDOCS%C2%A0' + # "https://tableau.alleghenycounty.us/t/PublicSite/views/PublicBudgetDashboard_17283931835700/OperatingOverview?%3Aembed=y&%3AisGuestRedirectFromVizportal=y" + # "data.austintexas.gov/resource/sc6h-qr9f.json" + # "https://albanyoregon.gov/police/crime/statistics-crime-analysis", + # "https://www.example.com", + # "https://www.example.org", + # "https://www.nonexistent.com", ] -@pytest.mark.asyncio -async def test_url_probe_head(test_client_session): - manager = URLProbeManager(session=test_client_session) - result = await manager.head(url=URLS[0]) - print(result) @pytest.mark.asyncio async def test_url_probe(test_client_session): manager = URLProbeManager(session=test_client_session) results = await manager.probe_urls(urls=URLS) - print(results) \ No newline at end of file + print(results)