diff --git a/ENV.md b/ENV.md index b0811247..f7e0e533 100644 --- a/ENV.md +++ b/ENV.md @@ -39,7 +39,6 @@ The following flags are available: | `URL_RECORD_TYPE_TASK_FLAG` | Automatically assigns Record Types to URLs. | | `URL_AGENCY_IDENTIFICATION_TASK_FLAG` | Automatically assigns and suggests Agencies for URLs. | | `URL_SUBMIT_APPROVED_TASK_FLAG` | Submits approved URLs to the Data Sources App. | -| `URL_DUPLICATE_TASK_FLAG` | Identifies duplicate URLs. | | `URL_MISC_METADATA_TASK_FLAG` | Adds misc metadata to URLs. | | `URL_404_PROBE_TASK_FLAG` | Probes URLs for 404 errors. | | `URL_AUTO_RELEVANCE_TASK_FLAG` | Automatically assigns Relevances to URLs. | diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index 6b55a157..b2bc1e14 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -9,7 +9,6 @@ from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator -from src.core.tasks.url.operators.duplicate.core import URLDuplicateTaskOperator from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser from src.core.tasks.url.operators.misc_metadata.core import URLMiscellaneousMetadataTaskOperator @@ -114,19 +113,6 @@ async def _get_url_miscellaneous_metadata_task_operator(self) -> URLTaskEntry: ) ) - async def _get_url_duplicate_task_operator(self) -> URLTaskEntry: - operator = URLDuplicateTaskOperator( - adb_client=self.adb_client, - pdap_client=self.pdap_client - ) - return URLTaskEntry( - operator=operator, - enabled=self.env.bool( - "URL_DUPLICATE_TASK_FLAG", - default=True - ) - ) - async def _get_url_404_probe_task_operator(self) -> URLTaskEntry: operator = URL404ProbeTaskOperator( adb_client=self.adb_client, @@ -170,7 +156,6 @@ async def load_entries(self) -> list[URLTaskEntry]: return [ await self._get_url_probe_task_operator(), await self._get_url_html_task_operator(), - await self._get_url_duplicate_task_operator(), await self._get_url_404_probe_task_operator(), await self._get_url_record_type_task_operator(), await self._get_agency_identification_task_operator(), diff --git a/src/core/tasks/url/operators/duplicate/__init__.py b/src/core/tasks/url/operators/duplicate/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/url/operators/duplicate/core.py b/src/core/tasks/url/operators/duplicate/core.py deleted file mode 100644 index dba0147c..00000000 --- a/src/core/tasks/url/operators/duplicate/core.py +++ /dev/null @@ -1,47 +0,0 @@ -from http import HTTPStatus - -from aiohttp import ClientResponseError - -from src.db.client.async_ import AsyncDatabaseClient -from src.db.enums import TaskType -from src.core.tasks.url.operators.duplicate.tdo import URLDuplicateTDO -from src.core.tasks.url.operators.base import URLTaskOperatorBase -from src.external.pdap.client import PDAPClient - - -class URLDuplicateTaskOperator(URLTaskOperatorBase): - - def __init__( - self, - adb_client: AsyncDatabaseClient, - pdap_client: PDAPClient - ): - super().__init__(adb_client) - self.pdap_client = pdap_client - - @property - def task_type(self): - return TaskType.DUPLICATE_DETECTION - - async def meets_task_prerequisites(self): - return await self.adb_client.has_pending_urls_not_checked_for_duplicates() - - async def inner_task_logic(self): - tdos: list[URLDuplicateTDO] = await self.adb_client.get_pending_urls_not_checked_for_duplicates() - url_ids = [tdo.url_id for tdo in tdos] - await self.link_urls_to_task(url_ids=url_ids) - checked_tdos = [] - for tdo in tdos: - try: - tdo.is_duplicate = await self.pdap_client.is_url_duplicate(tdo.url) - checked_tdos.append(tdo) - except ClientResponseError as e: - print("ClientResponseError:", e.status) - if e.status == HTTPStatus.TOO_MANY_REQUESTS: - break - raise e - - duplicate_url_ids = [tdo.url_id for tdo in checked_tdos if tdo.is_duplicate] - checked_url_ids = [tdo.url_id for tdo in checked_tdos] - await self.adb_client.mark_all_as_duplicates(duplicate_url_ids) - await self.adb_client.mark_as_checked_for_duplicates(checked_url_ids) diff --git a/src/core/tasks/url/operators/duplicate/tdo.py b/src/core/tasks/url/operators/duplicate/tdo.py deleted file mode 100644 index af00ce38..00000000 --- a/src/core/tasks/url/operators/duplicate/tdo.py +++ /dev/null @@ -1,9 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - - -class URLDuplicateTDO(BaseModel): - url_id: int - url: str - is_duplicate: Optional[bool] = None diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 136fea8a..ffb7738b 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -5,7 +5,6 @@ from sqlalchemy import select, exists, func, case, Select, and_, update, delete, literal, Row from sqlalchemy.dialects.postgresql import insert as pg_insert -from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload, QueryableAttribute @@ -60,11 +59,13 @@ from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters from src.core.tasks.scheduled.impl.sync.agency.queries.get_sync_params import GetAgenciesSyncParametersQueryBuilder from src.core.tasks.scheduled.impl.sync.agency.queries.mark_full_sync import get_mark_full_agencies_sync_query -from src.core.tasks.scheduled.impl.sync.agency.queries.update_sync_progress import get_update_agencies_sync_progress_query +from src.core.tasks.scheduled.impl.sync.agency.queries.update_sync_progress import \ + get_update_agencies_sync_progress_query from src.core.tasks.scheduled.impl.sync.agency.queries.upsert import \ convert_agencies_sync_response_to_agencies_upsert from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters -from src.core.tasks.scheduled.impl.sync.data_sources.queries.get_sync_params import GetDataSourcesSyncParametersQueryBuilder +from src.core.tasks.scheduled.impl.sync.data_sources.queries.get_sync_params import \ + GetDataSourcesSyncParametersQueryBuilder from src.core.tasks.scheduled.impl.sync.data_sources.queries.mark_full_sync import get_mark_full_data_sources_sync_query from src.core.tasks.scheduled.impl.sync.data_sources.queries.update_sync_progress import \ get_update_data_sources_sync_progress_query @@ -78,14 +79,6 @@ HasURLsWithoutAgencySuggestionsQueryBuilder from src.core.tasks.url.operators.auto_relevant.models.tdo import URLRelevantTDO from src.core.tasks.url.operators.auto_relevant.queries.get_tdos import GetAutoRelevantTDOsQueryBuilder -from src.core.tasks.url.operators.probe.queries.urls.not_probed.get.query import GetURLsWithoutProbeQueryBuilder -from src.core.tasks.url.operators.probe.queries.urls.not_probed.exists import HasURLsWithoutProbeQueryBuilder -from src.core.tasks.url.operators.probe_404.tdo import URL404ProbeTDO -from src.core.tasks.url.operators.submit_approved.queries.get import GetValidatedURLsQueryBuilder -from src.core.tasks.url.operators.submit_approved.queries.has_validated import HasValidatedURLsQueryBuilder -from src.core.tasks.url.operators.submit_approved.queries.mark_submitted import MarkURLsAsSubmittedQueryBuilder -from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo -from src.core.tasks.url.operators.duplicate.tdo import URLDuplicateTDO from src.core.tasks.url.operators.html.queries.get import \ GetPendingURLsWithoutHTMLDataQueryBuilder from src.core.tasks.url.operators.misc_metadata.queries.get_pending_urls_missing_miscellaneous_data import \ @@ -93,6 +86,13 @@ from src.core.tasks.url.operators.misc_metadata.queries.has_pending_urls_missing_miscellaneous_data import \ HasPendingURsMissingMiscellaneousDataQueryBuilder from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO +from src.core.tasks.url.operators.probe.queries.urls.not_probed.exists import HasURLsWithoutProbeQueryBuilder +from src.core.tasks.url.operators.probe.queries.urls.not_probed.get.query import GetURLsWithoutProbeQueryBuilder +from src.core.tasks.url.operators.probe_404.tdo import URL404ProbeTDO +from src.core.tasks.url.operators.submit_approved.queries.get import GetValidatedURLsQueryBuilder +from src.core.tasks.url.operators.submit_approved.queries.has_validated import HasValidatedURLsQueryBuilder +from src.core.tasks.url.operators.submit_approved.queries.mark_submitted import MarkURLsAsSubmittedQueryBuilder +from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo from src.db.client.helpers import add_standard_limit_and_offset from src.db.client.types import UserSuggestionModel from src.db.config_manager import ConfigManager @@ -109,9 +109,6 @@ from src.db.models.instantiations.batch.pydantic import BatchInfo from src.db.models.instantiations.batch.sqlalchemy import Batch from src.db.models.instantiations.duplicate.pydantic.info import DuplicateInfo -from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo -from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate -from src.db.models.instantiations.link.batch_url import LinkBatchURL from src.db.models.instantiations.link.task_url import LinkTaskURL from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.instantiations.log.pydantic.info import LogInfo @@ -121,12 +118,12 @@ from src.db.models.instantiations.task.core import Task from src.db.models.instantiations.task.error import TaskError from src.db.models.instantiations.url.checked_for_duplicate import URLCheckedForDuplicate -from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.instantiations.url.core.pydantic.info import URLInfo from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo +from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.instantiations.url.probed_for_404 import URLProbedFor404 @@ -1336,44 +1333,6 @@ async def populate_backlog_snapshot( session.add(snapshot) - @session_manager - async def has_pending_urls_not_checked_for_duplicates(self, session: AsyncSession) -> bool: - query = (select( - URL.id - ).outerjoin( - URLCheckedForDuplicate, - URL.id == URLCheckedForDuplicate.url_id - ).where( - URL.status == URLStatus.PENDING.value, - URLCheckedForDuplicate.id == None - ).limit(1) - ) - - raw_result = await session.execute(query) - result = raw_result.one_or_none() - return result is not None - - @session_manager - async def get_pending_urls_not_checked_for_duplicates(self, session: AsyncSession) -> List[URLDuplicateTDO]: - query = (select( - URL - ).outerjoin( - URLCheckedForDuplicate, - URL.id == URLCheckedForDuplicate.url_id - ).where( - URL.status == URLStatus.PENDING.value, - URLCheckedForDuplicate.id == None - ).limit(100) - ) - - raw_result = await session.execute(query) - urls = raw_result.scalars().all() - return [URLDuplicateTDO(url=url.url, url_id=url.id) for url in urls] - - async def mark_all_as_duplicates(self, url_ids: List[int]): - query = update(URL).where(URL.id.in_(url_ids)).values(status=URLStatus.DUPLICATE.value) - await self.execute(query) - async def mark_all_as_404(self, url_ids: List[int]): query = update(URL).where(URL.id.in_(url_ids)).values(status=URLStatus.NOT_FOUND.value) await self.execute(query) diff --git a/tests/automated/integration/tasks/url/impl/duplicate/__init__.py b/tests/automated/integration/tasks/url/impl/duplicate/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/tasks/url/impl/duplicate/constants.py b/tests/automated/integration/tasks/url/impl/duplicate/constants.py deleted file mode 100644 index 01682c3a..00000000 --- a/tests/automated/integration/tasks/url/impl/duplicate/constants.py +++ /dev/null @@ -1,16 +0,0 @@ -from src.collectors.enums import URLStatus -from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters -from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters - -BATCH_CREATION_PARAMETERS = TestBatchCreationParameters( - urls=[ - TestURLCreationParameters( - count=1, - status=URLStatus.ERROR - ), - TestURLCreationParameters( - count=2, - status=URLStatus.PENDING - ), - ] -) \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/duplicate/test_url_duplicate_task.py b/tests/automated/integration/tasks/url/impl/duplicate/test_url_duplicate_task.py deleted file mode 100644 index ceb4abc1..00000000 --- a/tests/automated/integration/tasks/url/impl/duplicate/test_url_duplicate_task.py +++ /dev/null @@ -1,84 +0,0 @@ -from http import HTTPStatus -from unittest.mock import MagicMock - -import pytest - -from src.core.tasks.url.operators.duplicate.core import URLDuplicateTaskOperator -from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.url.checked_for_duplicate import URLCheckedForDuplicate -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.collectors.enums import URLStatus -from src.core.tasks.url.enums import TaskOperatorOutcome -from tests.automated.integration.tasks.url.impl.duplicate.constants import BATCH_CREATION_PARAMETERS -from tests.helpers.data_creator.core import DBDataCreator -from pdap_access_manager import ResponseInfo -from src.external.pdap.client import PDAPClient - - -@pytest.mark.asyncio -async def test_url_duplicate_task( - db_data_creator: DBDataCreator, - mock_pdap_client: PDAPClient -): - operator = URLDuplicateTaskOperator( - adb_client=db_data_creator.adb_client, - pdap_client=mock_pdap_client - ) - - assert not await operator.meets_task_prerequisites() - make_request_mock: MagicMock = mock_pdap_client.access_manager.make_request - - make_request_mock.assert_not_called() - - # Add three URLs to the database, one of which is in error, the other two pending - creation_info = await db_data_creator.batch_v2(BATCH_CREATION_PARAMETERS) - pending_urls: list[URLMapping] = creation_info.urls_by_status[URLStatus.PENDING].url_mappings - duplicate_url = pending_urls[0] - non_duplicate_url = pending_urls[1] - assert await operator.meets_task_prerequisites() - make_request_mock.assert_not_called() - - make_request_mock.side_effect = [ - ResponseInfo( - data={ - "duplicates": [ - { - "original_url": duplicate_url.url, - "approval_status": "approved" - } - ], - }, - status_code=HTTPStatus.OK - ), - ResponseInfo( - data={ - "duplicates": [], - }, - status_code=HTTPStatus.OK - ), - ] - run_info = await operator.run_task(1) - assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message - assert make_request_mock.call_count == 2 - - adb_client = db_data_creator.adb_client - urls: list[URL] = await adb_client.get_all(URL) - assert len(urls) == 3 - url_ids = [url.id for url in urls] - assert duplicate_url.url_id in url_ids - for url in urls: - if url.id == duplicate_url.url_id: - assert url.status == URLStatus.DUPLICATE - - checked_for_duplicates: list[URLCheckedForDuplicate] = await adb_client.get_all(URLCheckedForDuplicate) - assert len(checked_for_duplicates) == 2 - checked_for_duplicate_url_ids = [url.url_id for url in checked_for_duplicates] - assert duplicate_url.url_id in checked_for_duplicate_url_ids - assert non_duplicate_url.url_id in checked_for_duplicate_url_ids - - assert not await operator.meets_task_prerequisites() - - - - - diff --git a/tests/automated/integration/tasks/url/loader/test_flags.py b/tests/automated/integration/tasks/url/loader/test_flags.py index f184397d..68e8862a 100644 --- a/tests/automated/integration/tasks/url/loader/test_flags.py +++ b/tests/automated/integration/tasks/url/loader/test_flags.py @@ -6,7 +6,6 @@ from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator from src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator from src.core.tasks.url.operators.base import URLTaskOperatorBase -from src.core.tasks.url.operators.duplicate.core import URLDuplicateTaskOperator from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator from src.core.tasks.url.operators.misc_metadata.core import URLMiscellaneousMetadataTaskOperator from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator @@ -40,10 +39,6 @@ class Config: env_var="URL_SUBMIT_APPROVED_TASK_FLAG", operator=SubmitApprovedURLTaskOperator ), - FlagTestParams( - env_var="URL_DUPLICATE_TASK_FLAG", - operator=URLDuplicateTaskOperator - ), FlagTestParams( env_var="URL_MISC_METADATA_TASK_FLAG", operator=URLMiscellaneousMetadataTaskOperator diff --git a/tests/automated/integration/tasks/url/loader/test_happy_path.py b/tests/automated/integration/tasks/url/loader/test_happy_path.py index 769204d7..639eb0ae 100644 --- a/tests/automated/integration/tasks/url/loader/test_happy_path.py +++ b/tests/automated/integration/tasks/url/loader/test_happy_path.py @@ -2,7 +2,7 @@ from src.core.tasks.url.loader import URLTaskOperatorLoader -NUMBER_OF_TASK_OPERATORS = 9 +NUMBER_OF_TASK_OPERATORS = 8 @pytest.mark.asyncio async def test_happy_path(