Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion ENV.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ The following flags are available:
| `URL_RECORD_TYPE_TASK_FLAG` | Automatically assigns Record Types to URLs. |
| `URL_AGENCY_IDENTIFICATION_TASK_FLAG` | Automatically assigns and suggests Agencies for URLs. |
| `URL_SUBMIT_APPROVED_TASK_FLAG` | Submits approved URLs to the Data Sources App. |
| `URL_DUPLICATE_TASK_FLAG` | Identifies duplicate URLs. |
| `URL_MISC_METADATA_TASK_FLAG` | Adds misc metadata to URLs. |
| `URL_404_PROBE_TASK_FLAG` | Probes URLs for 404 errors. |
| `URL_AUTO_RELEVANCE_TASK_FLAG` | Automatically assigns Relevances to URLs. |
Expand Down
15 changes: 0 additions & 15 deletions src/core/tasks/url/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator
from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader
from src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator
from src.core.tasks.url.operators.duplicate.core import URLDuplicateTaskOperator
from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator
from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser
from src.core.tasks.url.operators.misc_metadata.core import URLMiscellaneousMetadataTaskOperator
Expand Down Expand Up @@ -114,19 +113,6 @@ async def _get_url_miscellaneous_metadata_task_operator(self) -> URLTaskEntry:
)
)

async def _get_url_duplicate_task_operator(self) -> URLTaskEntry:
operator = URLDuplicateTaskOperator(
adb_client=self.adb_client,
pdap_client=self.pdap_client
)
return URLTaskEntry(
operator=operator,
enabled=self.env.bool(
"URL_DUPLICATE_TASK_FLAG",
default=True
)
)

async def _get_url_404_probe_task_operator(self) -> URLTaskEntry:
operator = URL404ProbeTaskOperator(
adb_client=self.adb_client,
Expand Down Expand Up @@ -170,7 +156,6 @@ async def load_entries(self) -> list[URLTaskEntry]:
return [
await self._get_url_probe_task_operator(),
await self._get_url_html_task_operator(),
await self._get_url_duplicate_task_operator(),
await self._get_url_404_probe_task_operator(),
await self._get_url_record_type_task_operator(),
await self._get_agency_identification_task_operator(),
Expand Down
Empty file.
47 changes: 0 additions & 47 deletions src/core/tasks/url/operators/duplicate/core.py

This file was deleted.

9 changes: 0 additions & 9 deletions src/core/tasks/url/operators/duplicate/tdo.py

This file was deleted.

65 changes: 12 additions & 53 deletions src/db/client/async_.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from sqlalchemy import select, exists, func, case, Select, and_, update, delete, literal, Row
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker
from sqlalchemy.orm import selectinload, QueryableAttribute

Expand Down Expand Up @@ -60,11 +59,13 @@
from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters
from src.core.tasks.scheduled.impl.sync.agency.queries.get_sync_params import GetAgenciesSyncParametersQueryBuilder
from src.core.tasks.scheduled.impl.sync.agency.queries.mark_full_sync import get_mark_full_agencies_sync_query
from src.core.tasks.scheduled.impl.sync.agency.queries.update_sync_progress import get_update_agencies_sync_progress_query
from src.core.tasks.scheduled.impl.sync.agency.queries.update_sync_progress import \
get_update_agencies_sync_progress_query
from src.core.tasks.scheduled.impl.sync.agency.queries.upsert import \
convert_agencies_sync_response_to_agencies_upsert
from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters
from src.core.tasks.scheduled.impl.sync.data_sources.queries.get_sync_params import GetDataSourcesSyncParametersQueryBuilder
from src.core.tasks.scheduled.impl.sync.data_sources.queries.get_sync_params import \
GetDataSourcesSyncParametersQueryBuilder
from src.core.tasks.scheduled.impl.sync.data_sources.queries.mark_full_sync import get_mark_full_data_sources_sync_query
from src.core.tasks.scheduled.impl.sync.data_sources.queries.update_sync_progress import \
get_update_data_sources_sync_progress_query
Expand All @@ -78,21 +79,20 @@
HasURLsWithoutAgencySuggestionsQueryBuilder
from src.core.tasks.url.operators.auto_relevant.models.tdo import URLRelevantTDO
from src.core.tasks.url.operators.auto_relevant.queries.get_tdos import GetAutoRelevantTDOsQueryBuilder
from src.core.tasks.url.operators.probe.queries.urls.not_probed.get.query import GetURLsWithoutProbeQueryBuilder
from src.core.tasks.url.operators.probe.queries.urls.not_probed.exists import HasURLsWithoutProbeQueryBuilder
from src.core.tasks.url.operators.probe_404.tdo import URL404ProbeTDO
from src.core.tasks.url.operators.submit_approved.queries.get import GetValidatedURLsQueryBuilder
from src.core.tasks.url.operators.submit_approved.queries.has_validated import HasValidatedURLsQueryBuilder
from src.core.tasks.url.operators.submit_approved.queries.mark_submitted import MarkURLsAsSubmittedQueryBuilder
from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo
from src.core.tasks.url.operators.duplicate.tdo import URLDuplicateTDO
from src.core.tasks.url.operators.html.queries.get import \
GetPendingURLsWithoutHTMLDataQueryBuilder
from src.core.tasks.url.operators.misc_metadata.queries.get_pending_urls_missing_miscellaneous_data import \
GetPendingURLsMissingMiscellaneousDataQueryBuilder
from src.core.tasks.url.operators.misc_metadata.queries.has_pending_urls_missing_miscellaneous_data import \
HasPendingURsMissingMiscellaneousDataQueryBuilder
from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO
from src.core.tasks.url.operators.probe.queries.urls.not_probed.exists import HasURLsWithoutProbeQueryBuilder
from src.core.tasks.url.operators.probe.queries.urls.not_probed.get.query import GetURLsWithoutProbeQueryBuilder
from src.core.tasks.url.operators.probe_404.tdo import URL404ProbeTDO
from src.core.tasks.url.operators.submit_approved.queries.get import GetValidatedURLsQueryBuilder
from src.core.tasks.url.operators.submit_approved.queries.has_validated import HasValidatedURLsQueryBuilder
from src.core.tasks.url.operators.submit_approved.queries.mark_submitted import MarkURLsAsSubmittedQueryBuilder
from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo
from src.db.client.helpers import add_standard_limit_and_offset
from src.db.client.types import UserSuggestionModel
from src.db.config_manager import ConfigManager
Expand All @@ -109,9 +109,6 @@
from src.db.models.instantiations.batch.pydantic import BatchInfo
from src.db.models.instantiations.batch.sqlalchemy import Batch
from src.db.models.instantiations.duplicate.pydantic.info import DuplicateInfo
from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo
from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate
from src.db.models.instantiations.link.batch_url import LinkBatchURL
from src.db.models.instantiations.link.task_url import LinkTaskURL
from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency
from src.db.models.instantiations.log.pydantic.info import LogInfo
Expand All @@ -121,12 +118,12 @@
from src.db.models.instantiations.task.core import Task
from src.db.models.instantiations.task.error import TaskError
from src.db.models.instantiations.url.checked_for_duplicate import URLCheckedForDuplicate
from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML
from src.db.models.instantiations.url.core.pydantic.info import URLInfo
from src.db.models.instantiations.url.core.sqlalchemy import URL
from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource
from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo
from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo
from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML
from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent
from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata
from src.db.models.instantiations.url.probed_for_404 import URLProbedFor404
Expand Down Expand Up @@ -1336,44 +1333,6 @@ async def populate_backlog_snapshot(

session.add(snapshot)

@session_manager
async def has_pending_urls_not_checked_for_duplicates(self, session: AsyncSession) -> bool:
query = (select(
URL.id
).outerjoin(
URLCheckedForDuplicate,
URL.id == URLCheckedForDuplicate.url_id
).where(
URL.status == URLStatus.PENDING.value,
URLCheckedForDuplicate.id == None
).limit(1)
)

raw_result = await session.execute(query)
result = raw_result.one_or_none()
return result is not None

@session_manager
async def get_pending_urls_not_checked_for_duplicates(self, session: AsyncSession) -> List[URLDuplicateTDO]:
query = (select(
URL
).outerjoin(
URLCheckedForDuplicate,
URL.id == URLCheckedForDuplicate.url_id
).where(
URL.status == URLStatus.PENDING.value,
URLCheckedForDuplicate.id == None
).limit(100)
)

raw_result = await session.execute(query)
urls = raw_result.scalars().all()
return [URLDuplicateTDO(url=url.url, url_id=url.id) for url in urls]

async def mark_all_as_duplicates(self, url_ids: List[int]):
query = update(URL).where(URL.id.in_(url_ids)).values(status=URLStatus.DUPLICATE.value)
await self.execute(query)

async def mark_all_as_404(self, url_ids: List[int]):
query = update(URL).where(URL.id.in_(url_ids)).values(status=URLStatus.NOT_FOUND.value)
await self.execute(query)
Expand Down
Empty file.

This file was deleted.

This file was deleted.

5 changes: 0 additions & 5 deletions tests/automated/integration/tasks/url/loader/test_flags.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator
from src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator
from src.core.tasks.url.operators.base import URLTaskOperatorBase
from src.core.tasks.url.operators.duplicate.core import URLDuplicateTaskOperator
from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator
from src.core.tasks.url.operators.misc_metadata.core import URLMiscellaneousMetadataTaskOperator
from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator
Expand Down Expand Up @@ -40,10 +39,6 @@ class Config:
env_var="URL_SUBMIT_APPROVED_TASK_FLAG",
operator=SubmitApprovedURLTaskOperator
),
FlagTestParams(
env_var="URL_DUPLICATE_TASK_FLAG",
operator=URLDuplicateTaskOperator
),
FlagTestParams(
env_var="URL_MISC_METADATA_TASK_FLAG",
operator=URLMiscellaneousMetadataTaskOperator
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from src.core.tasks.url.loader import URLTaskOperatorLoader

NUMBER_OF_TASK_OPERATORS = 9
NUMBER_OF_TASK_OPERATORS = 8

@pytest.mark.asyncio
async def test_happy_path(
Expand Down