diff --git a/ENV.md b/ENV.md index f7e0e533..b9d08ed1 100644 --- a/ENV.md +++ b/ENV.md @@ -43,9 +43,10 @@ The following flags are available: | `URL_404_PROBE_TASK_FLAG` | Probes URLs for 404 errors. | | `URL_AUTO_RELEVANCE_TASK_FLAG` | Automatically assigns Relevances to URLs. | | `URL_PROBE_TASK_FLAG` | Probes URLs for web metadata. | +| `URL_ROOT_URL_TASK_FLAG` | Extracts and links Root URLs to URLs. | | `SYNC_AGENCIES_TASK_FLAG` | Synchonize agencies from Data Sources App. | | `SYNC_DATA_SOURCES_TASK_FLAG` | Synchonize data sources from Data Sources App. | -| `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. | +| `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. | | `POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG` | Populates the backlog snapshot. | | `DELETE_OLD_LOGS_TASK_FLAG` | Deletes old logs. | | `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. | diff --git a/alembic/versions/2025_08_12_0819-49fd9f295b8d_refine_root_table_logic.py b/alembic/versions/2025_08_12_0819-49fd9f295b8d_refine_root_table_logic.py new file mode 100644 index 00000000..28b1f049 --- /dev/null +++ b/alembic/versions/2025_08_12_0819-49fd9f295b8d_refine_root_table_logic.py @@ -0,0 +1,147 @@ +"""Refine root table logic + +Revision ID: 49fd9f295b8d +Revises: 9a56916ea7d8 +Create Date: 2025-08-12 08:19:08.170835 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import id_column, updated_at_column, url_id_column, created_at_column, switch_enum_type + +# revision identifiers, used by Alembic. 
+revision: str = '49fd9f295b8d' +down_revision: Union[str, None] = '9a56916ea7d8' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +ROOT_URLS_TABLE_NAME = "root_urls" +ROOT_URL_CACHE_TABLE_NAME = "root_url_cache" + +LINK_URLS_ROOT_URL_TABLE_NAME = "link_urls_root_url" +FLAG_ROOT_URL_TABLE_NAME = "flag_root_url" + + + + +def upgrade() -> None: + _drop_root_url_cache() + _drop_root_urls() + _create_flag_root_url() + _create_link_urls_root_url() + _add_root_url_task_enum() + + +def downgrade() -> None: + _create_root_url_cache() + _create_root_urls() + _drop_link_urls_root_url() + _drop_flag_root_url() + _remove_root_url_task_enum() + +def _add_root_url_task_enum(): + switch_enum_type( + table_name='tasks', + column_name='task_type', + enum_name='task_type', + new_enum_values=[ + 'HTML', + 'Relevancy', + 'Record Type', + 'Agency Identification', + 'Misc Metadata', + 'Submit Approved URLs', + 'Duplicate Detection', + '404 Probe', + 'Sync Agencies', + 'Sync Data Sources', + 'Push to Hugging Face', + 'URL Probe', + 'Populate Backlog Snapshot', + 'Delete Old Logs', + 'Run URL Task Cycles', + 'Root URL' + ] + ) + + +def _remove_root_url_task_enum(): + switch_enum_type( + table_name='tasks', + column_name='task_type', + enum_name='task_type', + new_enum_values=[ + 'HTML', + 'Relevancy', + 'Record Type', + 'Agency Identification', + 'Misc Metadata', + 'Submit Approved URLs', + 'Duplicate Detection', + '404 Probe', + 'Sync Agencies', + 'Sync Data Sources', + 'Push to Hugging Face', + 'URL Probe', + 'Populate Backlog Snapshot', + 'Delete Old Logs', + 'Run URL Task Cycles' + ] + ) + + +def _drop_root_url_cache(): + op.drop_table(ROOT_URL_CACHE_TABLE_NAME) + +def _drop_root_urls(): + op.drop_table(ROOT_URLS_TABLE_NAME) + +def _create_root_url_cache(): + op.create_table( + ROOT_URL_CACHE_TABLE_NAME, + id_column(), + sa.Column('url', sa.String(), nullable=False), + sa.Column('page_title', sa.String(), nullable=False), + 
sa.Column('page_description', sa.String(), nullable=True), + updated_at_column(), + sa.UniqueConstraint('url', name='root_url_cache_uq_url') + ) + +def _create_root_urls(): + op.create_table( + ROOT_URLS_TABLE_NAME, + id_column(), + sa.Column('url', sa.String(), nullable=False), + sa.Column('page_title', sa.String(), nullable=False), + sa.Column('page_description', sa.String(), nullable=True), + updated_at_column(), + sa.UniqueConstraint('url', name='uq_root_url_url') + ) + +def _create_link_urls_root_url(): + op.create_table( + LINK_URLS_ROOT_URL_TABLE_NAME, + id_column(), + url_id_column(), + url_id_column('root_url_id'), + created_at_column(), + updated_at_column(), + sa.UniqueConstraint('url_id', 'root_url_id') + ) + +def _drop_link_urls_root_url(): + op.drop_table(LINK_URLS_ROOT_URL_TABLE_NAME) + +def _create_flag_root_url(): + op.create_table( + FLAG_ROOT_URL_TABLE_NAME, + url_id_column(), + created_at_column(), + sa.PrimaryKeyConstraint('url_id') + ) + +def _drop_flag_root_url(): + op.drop_table(FLAG_ROOT_URL_TABLE_NAME) \ No newline at end of file diff --git a/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py b/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py index 4e29e2f3..9b3ffdeb 100644 --- a/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py +++ b/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py @@ -5,8 +5,8 @@ from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo from src.collectors.enums import URLStatus -from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer from src.db.types import UserSuggestionType diff --git 
a/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py b/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py index 8e41373a..a6a5b69d 100644 --- a/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py +++ b/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py @@ -5,9 +5,9 @@ from src.collectors.enums import URLStatus from src.core.enums import SuggestedStatus from src.db.client.types import UserSuggestionModel -from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/api/endpoints/annotate/agency/get/dto.py b/src/api/endpoints/annotate/agency/get/dto.py index f2dda0f5..35288969 100644 --- a/src/api/endpoints/annotate/agency/get/dto.py +++ b/src/api/endpoints/annotate/agency/get/dto.py @@ -7,11 +7,11 @@ class GetNextURLForAgencyAgencyInfo(BaseModel): suggestion_type: SuggestionType - pdap_agency_id: Optional[int] = None - agency_name: Optional[str] = None - state: Optional[str] = None - county: Optional[str] = None - locality: Optional[str] = None + pdap_agency_id: int | None = None + agency_name: str | None = None + state: str | None = None + county: str | None = None + locality: str | None = None class GetNextURLForAgencyAnnotationInnerResponse(AnnotationInnerResponseInfoBase): agency_suggestions: list[ @@ -19,5 +19,5 @@ class GetNextURLForAgencyAnnotationInnerResponse(AnnotationInnerResponseInfoBase ] class GetNextURLForAgencyAnnotationResponse(BaseModel): - 
next_annotation: Optional[GetNextURLForAgencyAnnotationInnerResponse] + next_annotation: GetNextURLForAgencyAnnotationInnerResponse | None diff --git a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py index 14a00260..1f202263 100644 --- a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py +++ b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py @@ -3,8 +3,8 @@ from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo from src.core.enums import SuggestionType -from src.db.models.instantiations.agency.sqlalchemy import Agency -from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py index d529616b..70ae112a 100644 --- a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py +++ b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py @@ -9,12 +9,12 @@ from src.core.enums import SuggestedStatus from src.core.tasks.url.operators.html.scraper.parser.util import convert_to_response_html_info from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion -from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.instantiations.url.suggestion.relevant.user import 
UserRelevantSuggestion +from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.implementations.core.get.html_content_info import GetHTMLContentInfoQueryBuilder diff --git a/src/api/endpoints/annotate/agency/post/dto.py b/src/api/endpoints/annotate/agency/post/dto.py index 1d0ade02..dc41720a 100644 --- a/src/api/endpoints/annotate/agency/post/dto.py +++ b/src/api/endpoints/annotate/agency/post/dto.py @@ -5,4 +5,4 @@ class URLAgencyAnnotationPostInfo(BaseModel): is_new: bool = False - suggested_agency: Optional[int] = None + suggested_agency: int | None = None diff --git a/src/api/endpoints/annotate/all/get/dto.py b/src/api/endpoints/annotate/all/get/dto.py index 63d46ce6..26bb5e07 100644 --- a/src/api/endpoints/annotate/all/get/dto.py +++ b/src/api/endpoints/annotate/all/get/dto.py @@ -21,4 +21,4 @@ class GetNextURLForAllAnnotationInnerResponse(AnnotationInnerResponseInfoBase): class GetNextURLForAllAnnotationResponse(BaseModel): - next_annotation: Optional[GetNextURLForAllAnnotationInnerResponse] \ No newline at end of file + next_annotation: GetNextURLForAllAnnotationInnerResponse | None \ No newline at end of file diff --git a/src/api/endpoints/annotate/all/get/query.py b/src/api/endpoints/annotate/all/get/query.py index a9e39753..a2afafd9 100644 --- a/src/api/endpoints/annotate/all/get/query.py +++ b/src/api/endpoints/annotate/all/get/query.py @@ -10,11 +10,11 @@ from src.collectors.enums import URLStatus from src.db.dto_converter import DTOConverter from src.db.dtos.url.mapping import URLMapping -from 
src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/api/endpoints/annotate/all/post/dto.py b/src/api/endpoints/annotate/all/post/dto.py index 293dcd7a..73c21606 100644 --- a/src/api/endpoints/annotate/all/post/dto.py +++ b/src/api/endpoints/annotate/all/post/dto.py @@ -9,8 +9,8 @@ class AllAnnotationPostInfo(BaseModel): suggested_status: SuggestedStatus - record_type: Optional[RecordType] = None - agency: Optional[URLAgencyAnnotationPostInfo] = None + record_type: RecordType | None = None + agency: URLAgencyAnnotationPostInfo | None = None @model_validator(mode="after") def allow_record_type_and_agency_only_if_relevant(self): diff --git a/src/api/endpoints/annotate/dtos/record_type/response.py b/src/api/endpoints/annotate/dtos/record_type/response.py index d46c8e12..188d6500 100644 --- a/src/api/endpoints/annotate/dtos/record_type/response.py +++ b/src/api/endpoints/annotate/dtos/record_type/response.py @@ -9,11 +9,11 @@ class GetNextRecordTypeAnnotationResponseInfo( AnnotationInnerResponseInfoBase ): - suggested_record_type: Optional[RecordType] = Field( + suggested_record_type: RecordType | None = Field( 
title="What record type, if any, the auto-labeler identified the URL as" ) class GetNextRecordTypeAnnotationResponseOuterInfo( BaseModel ): - next_annotation: Optional[GetNextRecordTypeAnnotationResponseInfo] + next_annotation: GetNextRecordTypeAnnotationResponseInfo | None diff --git a/src/api/endpoints/annotate/dtos/shared/base/response.py b/src/api/endpoints/annotate/dtos/shared/base/response.py index 1e9fc5fa..edcc80e1 100644 --- a/src/api/endpoints/annotate/dtos/shared/base/response.py +++ b/src/api/endpoints/annotate/dtos/shared/base/response.py @@ -14,6 +14,6 @@ class AnnotationInnerResponseInfoBase(BaseModel): html_info: ResponseHTMLInfo = Field( title="HTML information about the URL" ) - batch_info: Optional[AnnotationBatchInfo] = Field( + batch_info: AnnotationBatchInfo | None = Field( title="Information about the annotation batch" ) \ No newline at end of file diff --git a/src/api/endpoints/annotate/relevance/get/dto.py b/src/api/endpoints/annotate/relevance/get/dto.py index b4467365..649367f4 100644 --- a/src/api/endpoints/annotate/relevance/get/dto.py +++ b/src/api/endpoints/annotate/relevance/get/dto.py @@ -22,4 +22,4 @@ class GetNextRelevanceAnnotationResponseInfo(AnnotationInnerResponseInfoBase): ) class GetNextRelevanceAnnotationResponseOuterInfo(BaseModel): - next_annotation: Optional[GetNextRelevanceAnnotationResponseInfo] + next_annotation: GetNextRelevanceAnnotationResponseInfo | None diff --git a/src/api/endpoints/annotate/relevance/get/query.py b/src/api/endpoints/annotate/relevance/get/query.py index 11e509d0..2c616b7b 100644 --- a/src/api/endpoints/annotate/relevance/get/query.py +++ b/src/api/endpoints/annotate/relevance/get/query.py @@ -7,9 +7,9 @@ RelevanceAnnotationResponseInfo from src.db.dto_converter import DTOConverter from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion -from 
src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/annotate/routes.py b/src/api/endpoints/annotate/routes.py index fb5b117e..ddcc24ca 100644 --- a/src/api/endpoints/annotate/routes.py +++ b/src/api/endpoints/annotate/routes.py @@ -31,7 +31,7 @@ async def get_next_url_for_relevance_annotation( access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), - batch_id: Optional[int] = Query( + batch_id: int | None = Query( description="The batch id of the next URL to get. " "If not specified, defaults to first qualifying URL", default=None), @@ -48,7 +48,7 @@ async def annotate_url_for_relevance_and_get_next_url( url_id: int = Path(description="The URL id to annotate"), async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), - batch_id: Optional[int] = batch_query + batch_id: int | None = batch_query ) -> GetNextRelevanceAnnotationResponseOuterInfo: """ Post URL annotation and get next URL to annotate @@ -67,7 +67,7 @@ async def annotate_url_for_relevance_and_get_next_url( async def get_next_url_for_record_type_annotation( access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), - batch_id: Optional[int] = batch_query + batch_id: int | None = batch_query ) -> GetNextRecordTypeAnnotationResponseOuterInfo: return await async_core.get_next_url_for_record_type_annotation( user_id=access_info.user_id, @@ -80,7 +80,7 @@ async def annotate_url_for_record_type_and_get_next_url( url_id: int = Path(description="The URL id to annotate"), async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = 
Depends(get_access_info), - batch_id: Optional[int] = batch_query + batch_id: int | None = batch_query ) -> GetNextRecordTypeAnnotationResponseOuterInfo: """ Post URL annotation and get next URL to annotate @@ -99,7 +99,7 @@ async def annotate_url_for_record_type_and_get_next_url( async def get_next_url_for_agency_annotation( access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), - batch_id: Optional[int] = batch_query + batch_id: int | None = batch_query ) -> GetNextURLForAgencyAnnotationResponse: return await async_core.get_next_url_agency_for_annotation( user_id=access_info.user_id, @@ -112,7 +112,7 @@ async def annotate_url_for_agency_and_get_next_url( agency_annotation_post_info: URLAgencyAnnotationPostInfo, async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), - batch_id: Optional[int] = batch_query + batch_id: int | None = batch_query ) -> GetNextURLForAgencyAnnotationResponse: """ Post URL annotation and get next URL to annotate @@ -131,7 +131,7 @@ async def annotate_url_for_agency_and_get_next_url( async def get_next_url_for_all_annotations( access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), - batch_id: Optional[int] = batch_query + batch_id: int | None = batch_query ) -> GetNextURLForAllAnnotationResponse: return await async_core.get_next_url_for_all_annotations( batch_id=batch_id @@ -143,7 +143,7 @@ async def annotate_url_for_all_annotations_and_get_next_url( all_annotation_post_info: AllAnnotationPostInfo, async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), - batch_id: Optional[int] = batch_query + batch_id: int | None = batch_query ) -> GetNextURLForAllAnnotationResponse: """ Post URL annotation and get next URL to annotate diff --git a/src/api/endpoints/batch/dtos/get/logs.py b/src/api/endpoints/batch/dtos/get/logs.py index 437e53cd..09ac7bba 100644 --- 
a/src/api/endpoints/batch/dtos/get/logs.py +++ b/src/api/endpoints/batch/dtos/get/logs.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.db.models.instantiations.log.pydantic.output import LogOutputInfo +from src.db.models.impl.log.pydantic.output import LogOutputInfo class GetBatchLogsResponse(BaseModel): diff --git a/src/api/endpoints/batch/dtos/get/summaries/summary.py b/src/api/endpoints/batch/dtos/get/summaries/summary.py index f00a42a5..4ca06768 100644 --- a/src/api/endpoints/batch/dtos/get/summaries/summary.py +++ b/src/api/endpoints/batch/dtos/get/summaries/summary.py @@ -13,6 +13,6 @@ class BatchSummary(BaseModel): status: BatchStatus parameters: dict user_id: int - compute_time: Optional[float] + compute_time: float | None date_generated: datetime.datetime url_counts: BatchSummaryURLCounts diff --git a/src/api/endpoints/batch/duplicates/dto.py b/src/api/endpoints/batch/duplicates/dto.py index b3fe5f17..dce8ae02 100644 --- a/src/api/endpoints/batch/duplicates/dto.py +++ b/src/api/endpoints/batch/duplicates/dto.py @@ -2,7 +2,7 @@ from pydantic import BaseModel -from src.db.models.instantiations.duplicate.pydantic.info import DuplicateInfo +from src.db.models.impl.duplicate.pydantic.info import DuplicateInfo class GetDuplicatesByBatchResponse(BaseModel): diff --git a/src/api/endpoints/batch/duplicates/query.py b/src/api/endpoints/batch/duplicates/query.py index 2be9189f..2d8edff9 100644 --- a/src/api/endpoints/batch/duplicates/query.py +++ b/src/api/endpoints/batch/duplicates/query.py @@ -2,11 +2,11 @@ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import aliased -from src.db.models.instantiations.duplicate.pydantic.info import DuplicateInfo -from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate -from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.url.core.sqlalchemy import URL +from 
src.db.models.impl.duplicate.pydantic.info import DuplicateInfo +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.duplicate.sqlalchemy import Duplicate +from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/batch/routes.py b/src/api/endpoints/batch/routes.py index 879c643d..a681759b 100644 --- a/src/api/endpoints/batch/routes.py +++ b/src/api/endpoints/batch/routes.py @@ -25,15 +25,15 @@ @batch_router.get("") async def get_batch_status( - collector_type: Optional[CollectorType] = Query( + collector_type: CollectorType | None = Query( description="Filter by collector type", default=None ), - status: Optional[BatchStatus] = Query( + status: BatchStatus | None = Query( description="Filter by status", default=None ), - has_pending_urls: Optional[bool] = Query( + has_pending_urls: bool | None = Query( description="Filter by whether the batch has pending URLs", default=None ), diff --git a/src/api/endpoints/batch/urls/dto.py b/src/api/endpoints/batch/urls/dto.py index 90f9b209..5e671e4b 100644 --- a/src/api/endpoints/batch/urls/dto.py +++ b/src/api/endpoints/batch/urls/dto.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.pydantic.info import URLInfo class GetURLsByBatchResponse(BaseModel): diff --git a/src/api/endpoints/batch/urls/query.py b/src/api/endpoints/batch/urls/query.py index 980b4c81..6a88448f 100644 --- a/src/api/endpoints/batch/urls/query.py +++ b/src/api/endpoints/batch/urls/query.py @@ -1,9 +1,9 @@ from sqlalchemy import Select from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.url.core.pydantic.info import URLInfo -from src.db.models.instantiations.url.core.sqlalchemy import 
URL +from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/collector/dtos/manual_batch/post.py b/src/api/endpoints/collector/dtos/manual_batch/post.py index f7de1ecf..6ec62579 100644 --- a/src/api/endpoints/collector/dtos/manual_batch/post.py +++ b/src/api/endpoints/collector/dtos/manual_batch/post.py @@ -7,13 +7,13 @@ class ManualBatchInnerInputDTO(BaseModel): url: str - name: Optional[str] = None - description: Optional[str] = None - collector_metadata: Optional[dict] = None - record_type: Optional[RecordType] = None - record_formats: Optional[list[str]] = None - data_portal_type: Optional[str] = None - supplying_entity: Optional[str] = None + name: str | None = None + description: str | None = None + collector_metadata: dict | None = None + record_type: RecordType | None = None + record_formats: list[str] | None = None + data_portal_type: str | None = None + supplying_entity: str | None = None class ManualBatchInputDTO(BaseModel): diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py index 9280fdb9..12b17ad3 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -5,11 +5,11 @@ from src.api.endpoints.collector.dtos.manual_batch.response import ManualBatchResponseDTO from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus -from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.url.core.enums import URLSource -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from 
src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/metrics/batches/aggregated/query.py b/src/api/endpoints/metrics/batches/aggregated/query.py index a6c6c3df..e7de65fb 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query.py @@ -6,9 +6,9 @@ GetMetricsBatchesAggregatedInnerResponseDTO from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus -from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/api/endpoints/metrics/batches/breakdown/query.py b/src/api/endpoints/metrics/batches/breakdown/query.py index 2d4b50e7..6fe0eb71 100644 --- a/src/api/endpoints/metrics/batches/breakdown/query.py +++ b/src/api/endpoints/metrics/batches/breakdown/query.py @@ -6,9 +6,9 @@ GetMetricsBatchesBreakdownInnerResponseDTO from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus -from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.batch.sqlalchemy import Batch +from 
src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/api/endpoints/review/approve/dto.py b/src/api/endpoints/review/approve/dto.py index 0d9628f7..639868ca 100644 --- a/src/api/endpoints/review/approve/dto.py +++ b/src/api/endpoints/review/approve/dto.py @@ -7,37 +7,37 @@ class FinalReviewApprovalInfo(FinalReviewBaseInfo): - record_type: Optional[RecordType] = Field( + record_type: RecordType | None = Field( title="The final record type of the URL." "If none, defers to the existing value from the auto-labeler only if it exists.", default=None ) - agency_ids: Optional[list[int]] = Field( + agency_ids: list[int] | None = Field( title="The final confirmed agencies for the URL. " "If none, defers to an existing confirmed agency only if that exists.", default=None ) - name: Optional[str] = Field( + name: str | None = Field( title="The name of the source. " "If none, defers to an existing name only if that exists.", default=None ) - description: Optional[str] = Field( + description: str | None = Field( title="The description of the source. " "If none, defers to an existing description only if that exists.", default=None ) - record_formats: Optional[list[str]] = Field( + record_formats: list[str] | None = Field( title="The record formats of the source. " "If none, defers to an existing record formats only if that exists.", default=None ) - data_portal_type: Optional[str] = Field( + data_portal_type: str | None = Field( title="The data portal type of the source. " "If none, defers to an existing data portal type only if that exists.", default=None ) - supplying_entity: Optional[str] = Field( + supplying_entity: str | None = Field( title="The supplying entity of the source. 
" "If none, defers to an existing supplying entity only if that exists.", default=None diff --git a/src/api/endpoints/review/approve/query_/core.py b/src/api/endpoints/review/approve/query_/core.py index eeea3da1..af810a2b 100644 --- a/src/api/endpoints/review/approve/query_/core.py +++ b/src/api/endpoints/review/approve/query_/core.py @@ -8,11 +8,11 @@ from src.api.endpoints.review.approve.query_.util import update_if_not_none from src.collectors.enums import URLStatus from src.db.constants import PLACEHOLDER_AGENCY_NAME -from src.db.models.instantiations.agency.sqlalchemy import Agency -from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata -from src.db.models.instantiations.url.reviewing_user import ReviewingUserURL +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.reviewing_user import ReviewingUserURL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/review/next/dto.py b/src/api/endpoints/review/next/dto.py index a9c378b9..e1fa2f74 100644 --- a/src/api/endpoints/review/next/dto.py +++ b/src/api/endpoints/review/next/dto.py @@ -9,16 +9,16 @@ class FinalReviewAnnotationRelevantInfo(BaseModel): - auto: Optional[RelevanceAnnotationResponseInfo] = Field(title="Whether the auto-labeler has marked the URL as relevant") - user: Optional[SuggestedStatus] = Field( + auto: RelevanceAnnotationResponseInfo | None = Field(title="Whether the auto-labeler has marked the URL as relevant") + user: SuggestedStatus | None = Field( title="The status marked by a user, if any", ) class 
FinalReviewAnnotationRecordTypeInfo(BaseModel): - auto: Optional[RecordType] = Field( + auto: RecordType | None = Field( title="The record type suggested by the auto-labeler" ) - user: Optional[RecordType] = Field( + user: RecordType | None = Field( title="The record type suggested by a user", ) @@ -26,17 +26,17 @@ class FinalReviewAnnotationRecordTypeInfo(BaseModel): class FinalReviewAnnotationAgencyAutoInfo(BaseModel): unknown: bool = Field(title="Whether the auto-labeler suggested the URL as unknown") - suggestions: Optional[list[GetNextURLForAgencyAgencyInfo]] = Field( + suggestions: list[GetNextURLForAgencyAgencyInfo] | None = Field( title="A list of agencies, if any, suggested by the auto-labeler", ) class FinalReviewAnnotationAgencyInfo(BaseModel): - confirmed: Optional[list[GetNextURLForAgencyAgencyInfo]] = Field( + confirmed: list[GetNextURLForAgencyAgencyInfo] | None = Field( title="The confirmed agency for the URL", ) - auto: Optional[FinalReviewAnnotationAgencyAutoInfo] = Field( + auto: FinalReviewAnnotationAgencyAutoInfo | None = Field( title="A single agency or a list of agencies suggested by the auto-labeler",) - user: Optional[GetNextURLForAgencyAgencyInfo] = Field( + user: GetNextURLForAgencyAgencyInfo | None = Field( title="A single agency suggested by a user", ) # endregion @@ -53,15 +53,15 @@ class FinalReviewAnnotationInfo(BaseModel): ) class FinalReviewOptionalMetadata(BaseModel): - record_formats: Optional[list[str]] = Field( + record_formats: list[str] | None = Field( title="The record formats of the source", default=None ) - data_portal_type: Optional[str] = Field( + data_portal_type: str | None = Field( title="The data portal type of the source", default=None ) - supplying_entity: Optional[str] = Field( + supplying_entity: str | None = Field( title="The supplying entity of the source", default=None ) @@ -77,8 +77,8 @@ class FinalReviewBatchInfo(BaseModel): class GetNextURLForFinalReviewResponse(BaseModel): id: int = Field(title="The id of 
the URL") url: str = Field(title="The URL") - name: Optional[str] = Field(title="The name of the source") - description: Optional[str] = Field(title="The description of the source") + name: str | None = Field(title="The name of the source") + description: str | None = Field(title="The description of the source") html_info: ResponseHTMLInfo = Field(title="The HTML content of the URL") annotations: FinalReviewAnnotationInfo = Field( title="The annotations for the URL, from both users and the auto-labeler", @@ -86,12 +86,12 @@ class GetNextURLForFinalReviewResponse(BaseModel): optional_metadata: FinalReviewOptionalMetadata = Field( title="Optional metadata for the source", ) - batch_info: Optional[FinalReviewBatchInfo] = Field( + batch_info: FinalReviewBatchInfo | None = Field( title="Information about the batch", ) class GetNextURLForFinalReviewOuterResponse(BaseModel): - next_source: Optional[GetNextURLForFinalReviewResponse] = Field( + next_source: GetNextURLForFinalReviewResponse | None = Field( title="The next source to be reviewed", ) remaining: int = Field( diff --git a/src/api/endpoints/review/next/query.py b/src/api/endpoints/review/next/query.py index e2de4f07..7cb4670b 100644 --- a/src/api/endpoints/review/next/query.py +++ b/src/api/endpoints/review/next/query.py @@ -12,12 +12,12 @@ from src.db.dto_converter import DTOConverter from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.exceptions import FailedQueryException -from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion -from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.batch.sqlalchemy import Batch 
+from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.implementations.core.common.annotation_exists import AnnotationExistsCTEQueryBuilder @@ -27,7 +27,7 @@ class GetNextURLForFinalReviewQueryBuilder(QueryBuilderBase): - def __init__(self, batch_id: Optional[int] = None): + def __init__(self, batch_id: int | None = None): super().__init__() self.batch_id = batch_id self.anno_exists_builder = AnnotationExistsCTEQueryBuilder() @@ -107,7 +107,7 @@ def _sum_exists_query(self, query, models: list[Type[URLDependentMixin]]): ).label(TOTAL_DISTINCT_ANNOTATION_COUNT_LABEL) - async def _apply_batch_id_filter(self, url_query: Select, batch_id: Optional[int]): + async def _apply_batch_id_filter(self, url_query: Select, batch_id: int | None): if batch_id is None: return url_query return url_query.where(URL.batch_id == batch_id) @@ -150,7 +150,7 @@ async def _extract_optional_metadata(self, url: URL) -> FinalReviewOptionalMetad supplying_entity=url.optional_data_source_metadata.supplying_entity ) - async def get_batch_info(self, session: AsyncSession) -> Optional[FinalReviewBatchInfo]: + async def get_batch_info(self, session: AsyncSession) -> FinalReviewBatchInfo | None: if self.batch_id is None: return None diff --git a/src/api/endpoints/review/reject/query.py b/src/api/endpoints/review/reject/query.py index 00bf26d3..7d603fe1 100644 --- a/src/api/endpoints/review/reject/query.py +++ b/src/api/endpoints/review/reject/query.py @@ -5,8 +5,8 @@ from src.api.endpoints.review.enums import RejectionReason from src.collectors.enums import URLStatus -from 
src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.reviewing_user import ReviewingUserURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.reviewing_user import ReviewingUserURL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/search/dtos/response.py b/src/api/endpoints/search/dtos/response.py index 1a46c0be..c2283ea4 100644 --- a/src/api/endpoints/search/dtos/response.py +++ b/src/api/endpoints/search/dtos/response.py @@ -5,4 +5,4 @@ class SearchURLResponse(BaseModel): found: bool - url_id: Optional[int] = None \ No newline at end of file + url_id: int | None = None \ No newline at end of file diff --git a/src/api/endpoints/task/by_id/dto.py b/src/api/endpoints/task/by_id/dto.py index eba6cece..1cac74d1 100644 --- a/src/api/endpoints/task/by_id/dto.py +++ b/src/api/endpoints/task/by_id/dto.py @@ -3,8 +3,8 @@ from pydantic import BaseModel -from src.db.models.instantiations.url.core.pydantic.info import URLInfo -from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.impl.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.core.enums import BatchStatus @@ -13,6 +13,6 @@ class TaskInfo(BaseModel): task_type: TaskType task_status: BatchStatus updated_at: datetime.datetime - error_info: Optional[str] = None + error_info: str | None = None urls: list[URLInfo] url_errors: list[URLErrorPydanticInfo] \ No newline at end of file diff --git a/src/api/endpoints/task/by_id/query.py b/src/api/endpoints/task/by_id/query.py index e66001f5..45917d3a 100644 --- a/src/api/endpoints/task/by_id/query.py +++ b/src/api/endpoints/task/by_id/query.py @@ -5,11 +5,11 @@ from src.api.endpoints.task.by_id.dto import TaskInfo from src.collectors.enums import URLStatus from src.core.enums import BatchStatus -from 
src.db.models.instantiations.url.core.pydantic.info import URLInfo -from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.impl.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType -from src.db.models.instantiations.task.core import Task -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.task.core import Task +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/task/routes.py b/src/api/endpoints/task/routes.py index a719d6b9..23f52999 100644 --- a/src/api/endpoints/task/routes.py +++ b/src/api/endpoints/task/routes.py @@ -25,11 +25,11 @@ async def get_tasks( description="The page number", default=1 ), - task_status: Optional[BatchStatus] = Query( + task_status: BatchStatus | None = Query( description="Filter by task status", default=None ), - task_type: Optional[TaskType] = Query( + task_type: TaskType | None = Query( description="Filter by task type", default=None ), diff --git a/src/api/endpoints/url/get/dto.py b/src/api/endpoints/url/get/dto.py index 3b3e980e..eef8da2d 100644 --- a/src/api/endpoints/url/get/dto.py +++ b/src/api/endpoints/url/get/dto.py @@ -25,7 +25,7 @@ class GetURLsResponseInnerInfo(BaseModel): batch_id: int | None url: str status: URLStatus - collector_metadata: Optional[dict] + collector_metadata: dict | None updated_at: datetime.datetime created_at: datetime.datetime errors: list[GetURLsResponseErrorInfo] diff --git a/src/api/endpoints/url/get/query.py b/src/api/endpoints/url/get/query.py index b7ef6119..be4801bf 100644 --- a/src/api/endpoints/url/get/query.py +++ b/src/api/endpoints/url/get/query.py @@ -5,8 +5,8 @@ from src.api.endpoints.url.get.dto import GetURLsResponseInfo, GetURLsResponseErrorInfo, GetURLsResponseInnerInfo from src.collectors.enums import 
URLStatus from src.db.client.helpers import add_standard_limit_and_offset -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.error_info.sqlalchemy import URLErrorInfo from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/main.py b/src/api/main.py index 735c5f6f..384cb680 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -16,11 +16,11 @@ from src.api.endpoints.search.routes import search_router from src.api.endpoints.task.routes import task_router from src.api.endpoints.url.routes import url_router -from src.collectors.manager import AsyncCollectorManager from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface +from src.collectors.manager import AsyncCollectorManager from src.core.core import AsyncCore -from src.core.logger import AsyncCoreLogger from src.core.env_var_manager import EnvVarManager +from src.core.logger import AsyncCoreLogger from src.core.tasks.handler import TaskHandler from src.core.tasks.scheduled.loader import ScheduledTaskOperatorLoader from src.core.tasks.scheduled.manager import AsyncScheduledTaskManager @@ -28,13 +28,12 @@ from src.core.tasks.url.loader import URLTaskOperatorLoader from src.core.tasks.url.manager import TaskManager from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser -from src.external.url_request.core import URLRequestInterface from src.db.client.async_ import AsyncDatabaseClient from src.db.client.sync import DatabaseClient -from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache from src.external.huggingface.hub.client import HuggingFaceHubClient from src.external.huggingface.inference.client import HuggingFaceInferenceClient from src.external.pdap.client import PDAPClient +from src.external.url_request.core import 
URLRequestInterface @asynccontextmanager @@ -74,9 +73,7 @@ async def lifespan(app: FastAPI): loader=URLTaskOperatorLoader( adb_client=adb_client, url_request_interface=URLRequestInterface(), - html_parser=HTMLResponseParser( - root_url_cache=RootURLCache() - ), + html_parser=HTMLResponseParser(), pdap_client=pdap_client, muckrock_api_interface=MuckrockAPIInterface( session=session diff --git a/src/collectors/impl/base.py b/src/collectors/impl/base.py index d4910b8a..6dcaac7c 100644 --- a/src/collectors/impl/base.py +++ b/src/collectors/impl/base.py @@ -8,13 +8,13 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.dtos.url.insert import InsertURLsInfo -from src.db.models.instantiations.log.pydantic.info import LogInfo +from src.db.models.impl.log.pydantic.info import LogInfo from src.collectors.enums import CollectorType from src.core.logger import AsyncCoreLogger from src.core.function_trigger import FunctionTrigger from src.core.enums import BatchStatus from src.core.preprocessors.base import PreprocessorBase -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.pydantic.info import URLInfo class AsyncCollectorBase(ABC): diff --git a/src/collectors/queries/get_url_info.py b/src/collectors/queries/get_url_info.py index d72fc6af..9dc9fc24 100644 --- a/src/collectors/queries/get_url_info.py +++ b/src/collectors/queries/get_url_info.py @@ -1,8 +1,8 @@ from sqlalchemy import Select from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.instantiations.url.core.pydantic.info import URLInfo -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/collectors/queries/insert/url.py b/src/collectors/queries/insert/url.py index f8c2bc75..96365107 100644 --- a/src/collectors/queries/insert/url.py 
+++ b/src/collectors/queries/insert/url.py @@ -1,8 +1,8 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.url.core.pydantic.info import URLInfo -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/collectors/queries/insert/urls/query.py b/src/collectors/queries/insert/urls/query.py index ddab0582..75176158 100644 --- a/src/collectors/queries/insert/urls/query.py +++ b/src/collectors/queries/insert/urls/query.py @@ -5,8 +5,8 @@ from src.util.clean import clean_url from src.db.dtos.url.insert import InsertURLsInfo from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo +from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/collectors/queries/insert/urls/request_manager.py b/src/collectors/queries/insert/urls/request_manager.py index cd8a3399..22f6ff66 100644 --- a/src/collectors/queries/insert/urls/request_manager.py +++ b/src/collectors/queries/insert/urls/request_manager.py @@ -2,8 +2,8 @@ from src.collectors.queries.get_url_info import GetURLInfoByURLQueryBuilder from src.collectors.queries.insert.url import InsertURLQueryBuilder -from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo +from 
src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.helpers.session import session_helper as sh diff --git a/src/core/core.py b/src/core/core.py index f2c084c5..c597a591 100644 --- a/src/core/core.py +++ b/src/core/core.py @@ -35,7 +35,7 @@ from src.api.endpoints.task.dtos.get.tasks import GetTasksResponse from src.api.endpoints.url.get.dto import GetURLsResponseInfo from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic import BatchInfo from src.api.endpoints.task.dtos.get.task_status import GetTaskStatusResponseInfo from src.db.enums import TaskType from src.collectors.manager import AsyncCollectorManager @@ -93,9 +93,9 @@ async def get_duplicate_urls_by_batch(self, batch_id: int, page: int = 1) -> Get async def get_batch_statuses( self, - collector_type: Optional[CollectorType], - status: Optional[BatchStatus], - has_pending_urls: Optional[bool], + collector_type: CollectorType | None, + status: BatchStatus | None, + has_pending_urls: bool | None, page: int ) -> GetBatchSummariesResponse: results = await self.adb_client.get_batch_summaries( @@ -117,7 +117,7 @@ async def initiate_collector( self, collector_type: CollectorType, user_id: int, - dto: Optional[BaseModel] = None, + dto: BaseModel | None = None, ) -> CollectorStartInfo: """ Reserves a batch ID from the database diff --git a/src/core/logger.py b/src/core/logger.py index 804edffd..22f35492 100644 --- a/src/core/logger.py +++ b/src/core/logger.py @@ -1,7 +1,7 @@ import asyncio from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.log.pydantic.info import LogInfo +from src.db.models.impl.log.pydantic.info import LogInfo class AsyncCoreLogger: diff --git a/src/core/preprocessors/autogoogler.py b/src/core/preprocessors/autogoogler.py index dd76218f..e3771f2c 100644 --- a/src/core/preprocessors/autogoogler.py +++ b/src/core/preprocessors/autogoogler.py @@ 
-1,8 +1,8 @@ from typing import List from src.core.preprocessors.base import PreprocessorBase -from src.db.models.instantiations.url.core.enums import URLSource -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.info import URLInfo class AutoGooglerPreprocessor(PreprocessorBase): diff --git a/src/core/preprocessors/base.py b/src/core/preprocessors/base.py index 2f777d5f..16d9432b 100644 --- a/src/core/preprocessors/base.py +++ b/src/core/preprocessors/base.py @@ -2,7 +2,7 @@ from abc import ABC from typing import List -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.pydantic.info import URLInfo class PreprocessorBase(ABC): diff --git a/src/core/preprocessors/ckan.py b/src/core/preprocessors/ckan.py index 0b1cef2e..671134c2 100644 --- a/src/core/preprocessors/ckan.py +++ b/src/core/preprocessors/ckan.py @@ -1,7 +1,7 @@ from datetime import datetime from typing import List -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.pydantic.info import URLInfo class CKANPreprocessor: diff --git a/src/core/preprocessors/common_crawler.py b/src/core/preprocessors/common_crawler.py index 18afd3e3..d831c520 100644 --- a/src/core/preprocessors/common_crawler.py +++ b/src/core/preprocessors/common_crawler.py @@ -1,8 +1,8 @@ from typing import List from src.core.preprocessors.base import PreprocessorBase -from src.db.models.instantiations.url.core.enums import URLSource -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.info import URLInfo class CommonCrawlerPreprocessor(PreprocessorBase): diff --git a/src/core/preprocessors/example.py b/src/core/preprocessors/example.py index 34c1e3a4..31e68e44 100644 --- 
a/src/core/preprocessors/example.py +++ b/src/core/preprocessors/example.py @@ -2,8 +2,8 @@ from src.collectors.impl.example.dtos.output import ExampleOutputDTO from src.core.preprocessors.base import PreprocessorBase -from src.db.models.instantiations.url.core.enums import URLSource -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.info import URLInfo class ExamplePreprocessor(PreprocessorBase): diff --git a/src/core/preprocessors/muckrock.py b/src/core/preprocessors/muckrock.py index 660dd028..1e05395a 100644 --- a/src/core/preprocessors/muckrock.py +++ b/src/core/preprocessors/muckrock.py @@ -1,8 +1,8 @@ from typing import List from src.core.preprocessors.base import PreprocessorBase -from src.db.models.instantiations.url.core.enums import URLSource -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.info import URLInfo class MuckrockPreprocessor(PreprocessorBase): diff --git a/src/core/tasks/base/run_info.py b/src/core/tasks/base/run_info.py index b822c59f..78e6b357 100644 --- a/src/core/tasks/base/run_info.py +++ b/src/core/tasks/base/run_info.py @@ -7,7 +7,7 @@ class TaskOperatorRunInfo(BaseModel): - task_id: Optional[int] + task_id: int | None task_type: TaskType outcome: TaskOperatorOutcome message: str = "" \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py b/src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py index a349233c..23e0b0b6 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py @@ -6,9 +6,9 @@ from src.collectors.enums import URLStatus from src.db.helpers.session import session_helper as sh -from 
src.db.models.instantiations.state.huggingface import HuggingFaceUploadState -from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.state.huggingface import HuggingFaceUploadState +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.core.sqlalchemy import URL class CheckValidURLsUpdatedRequester: diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py index 27f206b7..30cfa234 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py @@ -5,8 +5,8 @@ from src.core.tasks.scheduled.impl.huggingface.queries.get.convert import convert_url_status_to_relevant, \ convert_fine_to_coarse_record_type from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput -from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.compression import decompress_html from src.db.helpers.session import session_helper as sh diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/state.py b/src/core/tasks/scheduled/impl/huggingface/queries/state.py index 5e04c809..3abebc71 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/state.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/state.py @@ -3,7 +3,7 @@ from sqlalchemy import delete, insert from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.instantiations.state.huggingface import HuggingFaceUploadState +from 
src.db.models.impl.state.huggingface import HuggingFaceUploadState from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/get_sync_params.py b/src/core/tasks/scheduled/impl/sync/agency/queries/get_sync_params.py index 106211df..0e81e97d 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/get_sync_params.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/get_sync_params.py @@ -3,7 +3,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters -from src.db.models.instantiations.state.sync.agencies import AgenciesSyncState +from src.db.models.impl.state.sync.agencies import AgenciesSyncState from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/mark_full_sync.py b/src/core/tasks/scheduled/impl/sync/agency/queries/mark_full_sync.py index f92a8798..c578c4ea 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/mark_full_sync.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/mark_full_sync.py @@ -1,6 +1,6 @@ from sqlalchemy import update, func, text, Update -from src.db.models.instantiations.state.sync.agencies import AgenciesSyncState +from src.db.models.impl.state.sync.agencies import AgenciesSyncState def get_mark_full_agencies_sync_query() -> Update: diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/update_sync_progress.py b/src/core/tasks/scheduled/impl/sync/agency/queries/update_sync_progress.py index 6cc88398..2cebb046 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/update_sync_progress.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/update_sync_progress.py @@ -1,6 +1,6 @@ from sqlalchemy import Update, update -from src.db.models.instantiations.state.sync.agencies import AgenciesSyncState +from src.db.models.impl.state.sync.agencies import AgenciesSyncState def 
get_update_agencies_sync_progress_query(page: int) -> Update: diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert.py index 64988cba..61a0b104 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.agency.pydantic.upsert import AgencyUpsertModel +from src.db.models.impl.agency.pydantic.upsert import AgencyUpsertModel from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/get_sync_params.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/get_sync_params.py index 26e76921..114eb758 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/get_sync_params.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/get_sync_params.py @@ -3,7 +3,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters -from src.db.models.instantiations.state.sync.data_sources import DataSourcesSyncState +from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/mark_full_sync.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/mark_full_sync.py index f2966c69..8d6e0bdb 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/mark_full_sync.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/mark_full_sync.py @@ -1,6 +1,6 @@ from sqlalchemy import Update, update, func, text -from src.db.models.instantiations.state.sync.data_sources import DataSourcesSyncState +from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState def get_mark_full_data_sources_sync_query() -> Update: diff --git 
a/src/core/tasks/scheduled/impl/sync/data_sources/queries/update_sync_progress.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/update_sync_progress.py index 51962fff..d6aaebe0 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/update_sync_progress.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/update_sync_progress.py @@ -1,6 +1,6 @@ from sqlalchemy import update, Update -from src.db.models.instantiations.state.sync.data_sources import DataSourcesSyncState +from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState def get_update_data_sources_sync_progress_query(page: int) -> Update: diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/convert.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/convert.py index 05b6ec75..a265def5 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/convert.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/convert.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic def convert_to_link_url_agency_models( diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/query.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/query.py index fa807acc..a81be905 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/query.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/query.py @@ -5,9 +5,9 @@ from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.convert import convert_to_link_url_agency_models from src.db.helpers.session import session_helper as sh -from src.db.models.instantiations.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.db.models.impl.link.url_agency.pydantic import 
LinkURLAgencyPydantic from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import UpdateLinkURLAgencyForDataSourcesSyncParams -from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py index ffbe61f9..7ca8ebad 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py @@ -10,8 +10,8 @@ from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.update.params import \ UpdateURLForDataSourcesSyncParams from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.link.url_agency.pydantic import LinkURLAgencyPydantic -from src.db.models.instantiations.url.data_source.pydantic import URLDataSourcePydantic +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic class UpsertURLsFromDataSourcesParamManager: diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py index c0d6eaa1..08b5df22 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py @@ -14,8 +14,8 @@ UpdateURLForDataSourcesSyncParams from src.db.dtos.url.mapping import URLMapping from src.db.helpers.session import session_helper as sh -from src.db.models.instantiations.link.url_agency.pydantic import LinkURLAgencyPydantic -from src.db.models.instantiations.url.data_source.pydantic import 
URLDataSourcePydantic +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic class UpsertURLsFromDataSourcesDBRequester: diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/insert/params.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/insert/params.py index 2be5d539..50b8e586 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/insert/params.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/insert/params.py @@ -1,7 +1,7 @@ from src.collectors.enums import URLStatus from src.core.enums import RecordType -from src.db.models.instantiations.url.core.enums import URLSource -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.templates.markers.bulk.insert import BulkInsertableModel diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/lookup/query.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/lookup/query.py index cf232a4a..d77be0ab 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/lookup/query.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/lookup/query.py @@ -5,9 +5,9 @@ from src.db.helpers.session import session_helper as sh from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.lookup.response import \ LookupURLForDataSourcesSyncResponse, URLDataSyncInfo -from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy 
import URL +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/update/params.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/update/params.py index 0bbf0be2..c8d20afb 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/update/params.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/url/update/params.py @@ -1,6 +1,6 @@ from src.collectors.enums import URLStatus from src.core.enums import RecordType -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.templates.markers.bulk.update import BulkUpdatableModel diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index b2bc1e14..2203674d 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -16,6 +16,7 @@ from src.core.tasks.url.operators.probe_404.core import URL404ProbeTaskOperator from src.core.tasks.url.operators.record_type.core import URLRecordTypeTaskOperator from src.core.tasks.url.operators.record_type.llm_api.record_classifier.openai import OpenAIRecordClassifier +from src.core.tasks.url.operators.root_url.core import URLRootURLTaskOperator from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.external.huggingface.inference.client import HuggingFaceInferenceClient @@ -152,8 +153,21 @@ async def _get_url_probe_task_operator(self) -> URLTaskEntry: ) ) + async def _get_url_root_url_task_operator(self) -> URLTaskEntry: + operator = URLRootURLTaskOperator( + adb_client=self.adb_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.env.bool( + "URL_ROOT_URL_TASK_FLAG", + default=True + ) + ) + async def load_entries(self) -> list[URLTaskEntry]: return 
[ + await self._get_url_root_url_task_operator(), await self._get_url_probe_task_operator(), await self._get_url_html_task_operator(), await self._get_url_404_probe_task_operator(), diff --git a/src/core/tasks/url/operators/agency_identification/core.py b/src/core/tasks/url/operators/agency_identification/core.py index 759cfe81..8ac1f632 100644 --- a/src/core/tasks/url/operators/agency_identification/core.py +++ b/src/core/tasks/url/operators/agency_identification/core.py @@ -8,7 +8,7 @@ from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType -from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo class AgencyIdentificationTaskOperator(URLTaskOperatorBase): diff --git a/src/core/tasks/url/operators/agency_identification/dtos/output.py b/src/core/tasks/url/operators/agency_identification/dtos/output.py index 46f3aa97..d7381129 100644 --- a/src/core/tasks/url/operators/agency_identification/dtos/output.py +++ b/src/core/tasks/url/operators/agency_identification/dtos/output.py @@ -1,7 +1,7 @@ from pydantic import BaseModel from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo class GetAgencySuggestionsOutput(BaseModel): diff --git a/src/core/tasks/url/operators/agency_identification/dtos/tdo.py b/src/core/tasks/url/operators/agency_identification/dtos/tdo.py index 35f22844..72f24d97 100644 --- a/src/core/tasks/url/operators/agency_identification/dtos/tdo.py +++ b/src/core/tasks/url/operators/agency_identification/dtos/tdo.py @@ -7,5 +7,5 @@ class AgencyIdentificationTDO(BaseModel): url_id: int - collector_metadata: Optional[dict] = None + collector_metadata: 
dict | None = None collector_type: CollectorType | None diff --git a/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py b/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py index 521fa8c0..5eeb4355 100644 --- a/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py +++ b/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py @@ -3,9 +3,9 @@ from src.collectors.enums import URLStatus, CollectorType from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO -from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py b/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py index ab5429fb..e8a0e8ce 100644 --- a/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py +++ b/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py @@ -2,7 +2,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.collectors.enums import URLStatus -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git 
a/src/core/tasks/url/operators/auto_relevant/core.py b/src/core/tasks/url/operators/auto_relevant/core.py index d696cc31..53ff101f 100644 --- a/src/core/tasks/url/operators/auto_relevant/core.py +++ b/src/core/tasks/url/operators/auto_relevant/core.py @@ -3,8 +3,8 @@ from src.core.tasks.url.operators.auto_relevant.sort import separate_success_and_error_subsets from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput -from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.impl.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput +from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.external.huggingface.inference.client import HuggingFaceInferenceClient from src.external.huggingface.inference.models.input import BasicInput diff --git a/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py b/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py index 1a5fafc1..570f087c 100644 --- a/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py +++ b/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py @@ -6,9 +6,9 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.operators.auto_relevant.models.tdo import URLRelevantTDO -from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion 
from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer from src.db.utils.compression import decompress_html diff --git a/src/core/tasks/url/operators/html/content_info_getter.py b/src/core/tasks/url/operators/html/content_info_getter.py index fb7bdd59..bee7183c 100644 --- a/src/core/tasks/url/operators/html/content_info_getter.py +++ b/src/core/tasks/url/operators/html/content_info_getter.py @@ -1,6 +1,6 @@ from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.db.dtos.url.html_content import URLHTMLContentInfo -from src.db.models.instantiations.url.html.content.enums import HTMLContentType +from src.db.models.impl.url.html.content.enums import HTMLContentType class HTMLContentInfoGetter: diff --git a/src/core/tasks/url/operators/html/core.py b/src/core/tasks/url/operators/html/core.py index 00c1d1c3..26f70cdb 100644 --- a/src/core/tasks/url/operators/html/core.py +++ b/src/core/tasks/url/operators/html/core.py @@ -5,7 +5,7 @@ from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.pydantic.info import URLInfo from src.external.url_request.core import URLRequestInterface diff --git a/src/core/tasks/url/operators/html/queries/get.py b/src/core/tasks/url/operators/html/queries/get.py index 8ea70bed..832d9917 100644 --- a/src/core/tasks/url/operators/html/queries/get.py +++ b/src/core/tasks/url/operators/html/queries/get.py @@ -1,7 +1,7 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.instantiations.url.core.pydantic.info import URLInfo -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.sqlalchemy import URL from 
src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/core/tasks/url/operators/html/queries/insert/convert.py b/src/core/tasks/url/operators/html/queries/insert/convert.py index 9c9906d8..b07118bb 100644 --- a/src/core/tasks/url/operators/html/queries/insert/convert.py +++ b/src/core/tasks/url/operators/html/queries/insert/convert.py @@ -3,10 +3,10 @@ from src.core.tasks.url.operators.html.content_info_getter import HTMLContentInfoGetter from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO from src.db.dtos.url.html_content import URLHTMLContentInfo -from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo -from src.db.models.instantiations.url.html.compressed.pydantic import URLCompressedHTMLPydantic -from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus -from src.db.models.instantiations.url.scrape_info.pydantic import URLScrapeInfoInsertModel +from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.impl.url.html.compressed.pydantic import URLCompressedHTMLPydantic +from src.db.models.impl.url.scrape_info.enums import ScrapeStatus +from src.db.models.impl.url.scrape_info.pydantic import URLScrapeInfoInsertModel from src.db.utils.compression import compress_html from src.external.url_request.dtos.url_response import URLResponseInfo diff --git a/src/core/tasks/url/operators/html/scraper/parser/core.py b/src/core/tasks/url/operators/html/scraper/parser/core.py index c209ba27..d79ab1f6 100644 --- a/src/core/tasks/url/operators/html/scraper/parser/core.py +++ b/src/core/tasks/url/operators/html/scraper/parser/core.py @@ -1,25 +1,20 @@ import json -from typing import Optional from bs4 import BeautifulSoup +from src.core.tasks.url.operators.html.scraper.parser.constants import HEADER_TAGS from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo from 
src.core.tasks.url.operators.html.scraper.parser.enums import ParserTypeEnum -from src.core.tasks.url.operators.html.scraper.parser.constants import HEADER_TAGS -from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache -from src.core.tasks.url.operators.html.scraper.parser.util import remove_excess_whitespace, add_https, remove_trailing_backslash, \ +from src.core.tasks.url.operators.html.scraper.parser.util import remove_excess_whitespace, add_https, \ + remove_trailing_backslash, \ drop_hostname class HTMLResponseParser: - def __init__(self, root_url_cache: RootURLCache): - self.root_url_cache = root_url_cache - async def parse(self, url: str, html_content: str, content_type: str) -> ResponseHTMLInfo: html_info = ResponseHTMLInfo() self.add_url_and_path(html_info, html_content=html_content, url=url) - await self.add_root_page_titles(html_info) parser_type = self.get_parser_type(content_type) if parser_type is None: return html_info @@ -116,16 +111,6 @@ def add_url_and_path( url_path = remove_trailing_backslash(url_path) html_info.url_path = url_path - async def add_root_page_titles(self, html_info: ResponseHTMLInfo) -> None: - """ - Modifies: - html_info.root_page_title - """ - root_page_title = await self.root_url_cache.get_title(html_info.url) - html_info.root_page_title = remove_excess_whitespace( - root_page_title - ) - def get_parser_type(self, content_type: str) -> ParserTypeEnum | None: try: # If content type does not contain "html" or "xml" then we can assume that the content is unreadable diff --git a/src/core/tasks/url/operators/html/scraper/parser/dtos/response_html.py b/src/core/tasks/url/operators/html/scraper/parser/dtos/response_html.py index dfa34510..0df614ce 100644 --- a/src/core/tasks/url/operators/html/scraper/parser/dtos/response_html.py +++ b/src/core/tasks/url/operators/html/scraper/parser/dtos/response_html.py @@ -7,7 +7,6 @@ class ResponseHTMLInfo(BaseModel): url_path: str = "" title: str = "" description: 
str = "" - root_page_title: str = "" http_response: int = -1 h1: str = "" h2: str = "" diff --git a/src/core/tasks/url/operators/html/scraper/parser/mapping.py b/src/core/tasks/url/operators/html/scraper/parser/mapping.py index 641af779..b4bb4f4a 100644 --- a/src/core/tasks/url/operators/html/scraper/parser/mapping.py +++ b/src/core/tasks/url/operators/html/scraper/parser/mapping.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.url.html.content.enums import HTMLContentType +from src.db.models.impl.url.html.content.enums import HTMLContentType ENUM_TO_ATTRIBUTE_MAPPING = { HTMLContentType.TITLE: "title", diff --git a/src/core/tasks/url/operators/html/scraper/root_url_cache/constants.py b/src/core/tasks/url/operators/html/scraper/root_url_cache/constants.py deleted file mode 100644 index 52d392e0..00000000 --- a/src/core/tasks/url/operators/html/scraper/root_url_cache/constants.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -Some websites refuse the connection of automated requests, -setting the User-Agent will circumvent that. 
-""" -USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36" -REQUEST_HEADERS = { - "User-Agent": USER_AGENT, - # Make sure there's no pre-mature closing of responses before a redirect completes - "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", - } diff --git a/src/core/tasks/url/operators/html/scraper/root_url_cache/core.py b/src/core/tasks/url/operators/html/scraper/root_url_cache/core.py deleted file mode 100644 index 1bf15638..00000000 --- a/src/core/tasks/url/operators/html/scraper/root_url_cache/core.py +++ /dev/null @@ -1,81 +0,0 @@ -from typing import Optional -from urllib.parse import urlparse - -from aiohttp import ClientSession -from bs4 import BeautifulSoup - -from src.db.client.async_ import AsyncDatabaseClient -from src.core.tasks.url.operators.html.scraper.root_url_cache.constants import REQUEST_HEADERS -from src.core.tasks.url.operators.html.scraper.root_url_cache.dtos.response import RootURLCacheResponseInfo - -DEBUG = False - - -class RootURLCache: - def __init__(self, adb_client: AsyncDatabaseClient | None = None): - if adb_client is None: - adb_client = AsyncDatabaseClient() - self.adb_client = adb_client - self.cache = None - - async def save_to_cache(self, url: str, title: str) -> None: - if url in self.cache: - return - self.cache[url] = title - await self.adb_client.add_to_root_url_cache(url=url, page_title=title) - - async def get_from_cache(self, url: str) -> str | None: - if self.cache is None: - self.cache = await self.adb_client.load_root_url_cache() - - if url in self.cache: - return self.cache[url] - return None - - async def get_request(self, url: str) -> RootURLCacheResponseInfo: - async with ClientSession() as session: - try: - async with session.get(url, headers=REQUEST_HEADERS, timeout=120) as response: - response.raise_for_status() - text = await 
response.text() - return RootURLCacheResponseInfo(text=text) - except Exception as e: - return RootURLCacheResponseInfo(exception=e) - - async def get_title(self, url) -> str: - if not url.startswith('http'): - url = "https://" + url - - parsed_url = urlparse(url) - root_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - - title = await self.get_from_cache(root_url) - if title is not None: - return title - - response_info = await self.get_request(root_url) - if response_info.exception is not None: - return self.handle_exception(response_info.exception) - - title = await self.get_title_from_soup(response_info.text) - - await self.save_to_cache(url=root_url, title=title) - - return title - - async def get_title_from_soup(self, text: str) -> str: - soup = BeautifulSoup(text, 'html.parser') - try: - title = soup.find('title').text - except AttributeError: - title = "" - # Prevents most bs4 memory leaks - if soup.html: - soup.html.decompose() - return title - - def handle_exception(self, e): - if DEBUG: - return f"Error retrieving title: {e}" - else: - return "" diff --git a/src/core/tasks/url/operators/html/scraper/root_url_cache/dtos/response.py b/src/core/tasks/url/operators/html/scraper/root_url_cache/dtos/response.py deleted file mode 100644 index 6ea1d21c..00000000 --- a/src/core/tasks/url/operators/html/scraper/root_url_cache/dtos/response.py +++ /dev/null @@ -1,11 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - - -class RootURLCacheResponseInfo(BaseModel): - class Config: - arbitrary_types_allowed = True - - text: Optional[str] = None - exception: Optional[Exception] = None diff --git a/src/core/tasks/url/operators/html/tdo.py b/src/core/tasks/url/operators/html/tdo.py index 6395e363..00d5b9af 100644 --- a/src/core/tasks/url/operators/html/tdo.py +++ b/src/core/tasks/url/operators/html/tdo.py @@ -3,7 +3,7 @@ from pydantic import BaseModel from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo 
-from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.pydantic.info import URLInfo from src.external.url_request.dtos.url_response import URLResponseInfo diff --git a/src/core/tasks/url/operators/misc_metadata/core.py b/src/core/tasks/url/operators/misc_metadata/core.py index 8e423c0e..20e2fcd2 100644 --- a/src/core/tasks/url/operators/misc_metadata/core.py +++ b/src/core/tasks/url/operators/misc_metadata/core.py @@ -1,7 +1,7 @@ from typing import Optional from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.collectors.enums import CollectorType from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO diff --git a/src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py b/src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py index ed411bd6..0efbfceb 100644 --- a/src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py +++ b/src/core/tasks/url/operators/misc_metadata/queries/get_pending_urls_missing_miscellaneous_data.py @@ -3,8 +3,8 @@ from src.collectors.enums import CollectorType from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo -from src.db.models.instantiations.url.html.content.enums import HTMLContentType -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.content.enums import HTMLContentType +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/core/tasks/url/operators/probe/convert.py 
b/src/core/tasks/url/operators/probe/convert.py index 8de86587..dcb211f0 100644 --- a/src/core/tasks/url/operators/probe/convert.py +++ b/src/core/tasks/url/operators/probe/convert.py @@ -1,5 +1,5 @@ from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from src.db.models.instantiations.url.web_metadata.insert import URLWebMetadataPydantic +from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic def convert_tdo_to_web_metadata_list(tdos: list[URLProbeTDO]) -> list[URLWebMetadataPydantic]: diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py index 62de2ae1..eb0597ba 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py @@ -2,9 +2,9 @@ from src.core.tasks.url.operators.probe.queries.urls.exist.model import UrlExistsResult from src.core.tasks.url.operators.probe.tdo import URLProbeTDO from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.url.core.enums import URLSource -from src.db.models.instantiations.url.core.pydantic.insert import URLInsertModel -from src.db.models.instantiations.url.web_metadata.insert import URLWebMetadataPydantic +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.insert import URLInsertModel +from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic def convert_url_response_mapping_to_web_metadata_list( diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py index 924de9ef..d866106a 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py @@ -10,8 +10,8 @@ from 
src.core.tasks.url.operators.probe.tdo import URLProbeTDO from src.db.dtos.url.mapping import URLMapping from src.db.helpers.session import session_helper as sh -from src.db.models.instantiations.link.url_redirect_url.pydantic import LinkURLRedirectURLPydantic -from src.db.models.instantiations.url.web_metadata.insert import URLWebMetadataPydantic +from src.db.models.impl.link.url_redirect_url.pydantic import LinkURLRedirectURLPydantic +from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.response import URLProbeResponse from src.util.url_mapper import URLMapper diff --git a/src/core/tasks/url/operators/probe/queries/urls/exist/query.py b/src/core/tasks/url/operators/probe/queries/urls/exist/query.py index 207648cc..5176add9 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/exist/query.py +++ b/src/core/tasks/url/operators/probe/queries/urls/exist/query.py @@ -2,7 +2,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.core.tasks.url.operators.probe.queries.urls.exist.model import UrlExistsResult -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh diff --git a/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py b/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py index 1ae7835b..99c4cc67 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py +++ b/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py @@ -3,8 +3,8 @@ from typing_extensions import override, final from src.db.helpers.session import session_helper as sh -from src.db.models.instantiations.url.core.sqlalchemy import URL -from 
src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.queries.base.builder import QueryBuilderBase @final diff --git a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py index b39d8947..8e29adc6 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py +++ b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py @@ -4,8 +4,8 @@ from src.util.clean import clean_url from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.helpers.session import session_helper as sh from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/url/operators/record_type/core.py b/src/core/tasks/url/operators/record_type/core.py index 56abc6fc..2efbe28f 100644 --- a/src/core/tasks/url/operators/record_type/core.py +++ b/src/core/tasks/url/operators/record_type/core.py @@ -1,5 +1,5 @@ from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.core.tasks.url.operators.record_type.tdo import URLRecordTypeTDO from src.core.tasks.url.operators.base import URLTaskOperatorBase diff --git a/src/core/tasks/url/operators/record_type/tdo.py b/src/core/tasks/url/operators/record_type/tdo.py index 43a32bab..3effcf53 100644 --- a/src/core/tasks/url/operators/record_type/tdo.py +++ 
b/src/core/tasks/url/operators/record_type/tdo.py @@ -8,8 +8,8 @@ class URLRecordTypeTDO(BaseModel): url_with_html: URLWithHTML - record_type: Optional[RecordType] = None - error: Optional[str] = None + record_type: RecordType | None = None + error: str | None = None def is_errored(self): return self.error is not None \ No newline at end of file diff --git a/src/core/tasks/url/operators/html/scraper/root_url_cache/__init__.py b/src/core/tasks/url/operators/root_url/__init__.py similarity index 100% rename from src/core/tasks/url/operators/html/scraper/root_url_cache/__init__.py rename to src/core/tasks/url/operators/root_url/__init__.py diff --git a/src/core/tasks/url/operators/root_url/convert.py b/src/core/tasks/url/operators/root_url/convert.py new file mode 100644 index 00000000..405cbc49 --- /dev/null +++ b/src/core/tasks/url/operators/root_url/convert.py @@ -0,0 +1,49 @@ +from src.core.tasks.url.operators.root_url.extract import extract_root_url +from src.core.tasks.url.operators.root_url.models.root_mapping import URLRootURLMapping +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.root_url.pydantic import FlagRootURLPydantic +from src.db.models.impl.link.urls_root_url.pydantic import LinkURLRootURLPydantic +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.insert import URLInsertModel +from src.util.url_mapper import URLMapper + + +def convert_to_flag_root_url_pydantic(url_ids: list[int]) -> list[FlagRootURLPydantic]: + return [FlagRootURLPydantic(url_id=url_id) for url_id in url_ids] + +def convert_to_url_root_url_mapping(url_mappings: list[URLMapping]) -> list[URLRootURLMapping]: + return [ + URLRootURLMapping( + url=mapping.url, + root_url=extract_root_url(mapping.url) + ) for mapping in url_mappings + ] + +def convert_to_url_insert_models(urls: list[str]) -> list[URLInsertModel]: + return [ + URLInsertModel( + url=url, + source=URLSource.ROOT_URL + ) for url in urls + ] + +def 
convert_to_root_url_links( + root_db_mappings: list[URLMapping], + branch_db_mappings: list[URLMapping], + url_root_url_mappings: list[URLRootURLMapping] +) -> list[LinkURLRootURLPydantic]: + root_mapper = URLMapper(root_db_mappings) + branch_mapper = URLMapper(branch_db_mappings) + results: list[LinkURLRootURLPydantic] = [] + + for url_root_url_mapping in url_root_url_mappings: + root_url_id = root_mapper.get_id(url_root_url_mapping.root_url) + branch_url_id = branch_mapper.get_id(url_root_url_mapping.url) + + results.append( + LinkURLRootURLPydantic( + root_url_id=root_url_id, + url_id=branch_url_id) + ) + + return results diff --git a/src/core/tasks/url/operators/root_url/core.py b/src/core/tasks/url/operators/root_url/core.py new file mode 100644 index 00000000..e32654da --- /dev/null +++ b/src/core/tasks/url/operators/root_url/core.py @@ -0,0 +1,162 @@ +from typing import final + +from typing_extensions import override + +from src.core.tasks.url.operators.base import URLTaskOperatorBase +from src.core.tasks.url.operators.root_url.convert import convert_to_flag_root_url_pydantic, \ + convert_to_url_root_url_mapping, convert_to_url_insert_models, convert_to_root_url_links +from src.core.tasks.url.operators.root_url.models.root_mapping import URLRootURLMapping +from src.core.tasks.url.operators.root_url.queries.get import GetURLsForRootURLTaskQueryBuilder +from src.core.tasks.url.operators.root_url.queries.lookup.query import LookupRootURLsQueryBuilder +from src.core.tasks.url.operators.root_url.queries.lookup.response import LookupRootsURLResponse +from src.core.tasks.url.operators.root_url.queries.prereq import CheckPrereqsForRootURLTaskQueryBuilder +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.enums import TaskType +from src.db.models.impl.flag.root_url.pydantic import FlagRootURLPydantic +from src.db.models.impl.link.urls_root_url.pydantic import LinkURLRootURLPydantic +from 
@final
class URLRootURLTaskOperator(URLTaskOperatorBase):
    """URL task that flags root URLs and links all other task URLs to their root URL.

    For the URLs returned by ``GetURLsForRootURLTaskQueryBuilder`` the operator:

    1. maps each URL to its root URL (``convert_to_url_root_url_mapping``),
    2. inserts derived root URLs that are not yet in the database,
    3. flags the root URLs that are not yet flagged, and
    4. inserts a link from every non-root ("branch") URL to its root URL.
    """

    def __init__(self, adb_client: AsyncDatabaseClient):
        super().__init__(adb_client)

    @override
    async def meets_task_prerequisites(self) -> bool:
        """Return the result of ``CheckPrereqsForRootURLTaskQueryBuilder``."""
        builder = CheckPrereqsForRootURLTaskQueryBuilder()
        return await self.adb_client.run_query_builder(builder)

    @property
    @override
    def task_type(self) -> TaskType:
        return TaskType.ROOT_URL

    @override
    async def inner_task_logic(self) -> None:
        """Flag root URLs and link branch URLs to them for all pending task URLs."""
        all_task_mappings: list[URLMapping] = await self._get_urls_for_root_url_task()

        await self.link_urls_to_task(
            url_ids=[mapping.url_id for mapping in all_task_mappings]
        )

        # URL -> URL ID lookup for the task URLs; newly inserted root URLs
        # are added to it further down.
        mapper = URLMapper(all_task_mappings)

        # -- Identify and Derive Root URLs --

        root_url_mappings: list[URLRootURLMapping] = convert_to_url_root_url_mapping(all_task_mappings)

        # For those where the URL is also the Root URL, separate them
        original_root_urls: list[str] = [mapping.url for mapping in root_url_mappings if mapping.is_root_url]
        derived_root_urls: list[str] = [mapping.root_url for mapping in root_url_mappings if not mapping.is_root_url]

        # -- Add new Derived Root URLs --

        # For derived Root URLs, we need to check if they are already in the database
        derived_root_url_lookup_responses: list[LookupRootsURLResponse] = await self._lookup_root_urls(derived_root_urls)

        # For those not already in the database, we need to add them and get their mappings
        derived_root_urls_not_in_db: list[str] = [
            response.url
            for response in derived_root_url_lookup_responses
            if response.url_id is None
        ]
        new_derived_root_url_mappings: list[URLMapping] = await self._add_new_urls(derived_root_urls_not_in_db)

        # Add these to the mapper
        mapper.add_mappings(new_derived_root_url_mappings)

        # -- Flag Root URLs --

        # Of those we obtain, we need to get those that are not yet flagged as Root URLs
        extant_derived_root_url_ids_not_flagged: list[int] = [
            response.url_id
            for response in derived_root_url_lookup_responses
            if response.url_id is not None and not response.flagged_as_root
        ]
        original_root_url_ids_not_flagged: list[int] = [
            mapper.get_id(url)
            for url in original_root_urls
        ]
        new_derived_root_url_ids_not_flagged: list[int] = [
            mapping.url_id
            for mapping in new_derived_root_url_mappings
        ]

        # A task URL that is itself a root can also be the derived root of
        # another task URL, so the three lists may overlap: de-duplicate.
        all_root_url_ids_not_flagged: list[int] = list(set(
            extant_derived_root_url_ids_not_flagged +
            new_derived_root_url_ids_not_flagged +
            original_root_url_ids_not_flagged
        ))

        await self._flag_root_urls(all_root_url_ids_not_flagged)

        # -- Add Root URL Links --

        branch_url_mappings: list[URLRootURLMapping] = [mapping for mapping in root_url_mappings if not mapping.is_root_url]
        await self._add_root_url_links(
            mapper,
            root_url_mappings=branch_url_mappings,
        )

    async def _add_root_url_links(
        self,
        mapper: URLMapper,
        root_url_mappings: list[URLRootURLMapping],
    ):
        """Insert a root URL link for every branch URL in ``root_url_mappings``.

        :param mapper: URL -> URL ID lookup covering the branch URLs.
        :param root_url_mappings: URL/root URL pairs for the task URLs that
            are not root URLs themselves.
        """
        # For all task URLs that are not root URLs (i.e. 'branch' URLs):
        # - Connect them to the Root URL
        # - Add the link

        branch_urls: list[str] = [mapping.url for mapping in root_url_mappings]
        root_urls: list[str] = [mapping.root_url for mapping in root_url_mappings]

        # _lookup_root_urls returns lookup responses, not URLMapping objects.
        # NOTE(review): they are handed to convert_to_root_url_links as
        # ``root_db_mappings`` exactly as before; this presumably works because
        # URLMapper only needs ``url`` and ``url_id`` - confirm.
        root_url_db_mappings: list[LookupRootsURLResponse] = await self._lookup_root_urls(root_urls)
        task_url_db_mappings: list[URLMapping] = mapper.get_mappings_by_url(branch_urls)

        links: list[LinkURLRootURLPydantic] = convert_to_root_url_links(
            root_db_mappings=root_url_db_mappings,
            branch_db_mappings=task_url_db_mappings,
            url_root_url_mappings=root_url_mappings
        )
        await self._add_link_url_root_urls(links)

    async def _flag_root_urls(
        self,
        url_ids: list[int]
    ):
        """Flag the given URL IDs as root URLs."""
        await self._flag_as_root_urls(url_ids)

    async def _get_urls_for_root_url_task(self) -> list[URLMapping]:
        """Fetch the URL mappings this task should process."""
        builder = GetURLsForRootURLTaskQueryBuilder()
        return await self.adb_client.run_query_builder(builder)

    async def _lookup_root_urls(self, urls: list[str]) -> list[LookupRootsURLResponse]:
        """Look up the distinct ``urls`` in the database (one response per distinct URL)."""
        builder = LookupRootURLsQueryBuilder(urls=list(set(urls)))
        return await self.adb_client.run_query_builder(builder)

    async def _add_new_urls(self, urls: list[str]) -> list[URLMapping]:
        """Insert ``urls`` as new URL rows and return their URL -> ID mappings.

        :raises ValueError: if the number of returned IDs differs from the
            number of URLs inserted.
        """
        if not urls:
            return []
        insert_models: list[URLInsertModel] = convert_to_url_insert_models(urls)
        url_ids: list[int] = await self.adb_client.bulk_insert(insert_models, return_ids=True)
        # Assumes bulk_insert returns IDs in the order the models were passed
        # - TODO confirm. strict=True at least turns a count mismatch into an
        # error instead of silently dropping URLs from the mapping.
        return [
            URLMapping(
                url=url,
                url_id=url_id
            )
            for url, url_id in zip(urls, url_ids, strict=True)
        ]

    async def _flag_as_root_urls(self, url_ids: list[int]) -> None:
        """Bulk-insert one root URL flag per URL ID."""
        flag_root_urls: list[FlagRootURLPydantic] = convert_to_flag_root_url_pydantic(url_ids)
        await self.adb_client.bulk_insert(flag_root_urls)

    async def _add_link_url_root_urls(self, links: list[LinkURLRootURLPydantic]) -> None:
        """Bulk-insert the given URL -> root URL links."""
        await self.adb_client.bulk_insert(links)
def extract_root_url(url: str) -> str:
    """Return the root of *url*: its scheme and network location, without path.

    ``"https://example.com/a/b?c=1"`` -> ``"https://example.com"``.

    ``urlparse`` only recognises a network location when it is introduced by
    ``//``. For input without ``scheme://`` (e.g. ``"example.com/a"`` or
    ``"example.com:8080/a"``) the host is therefore re-parsed explicitly and
    returned without a scheme, instead of producing values such as ``"://"``.

    :param url: The URL to reduce to its root.
    :return: ``"<scheme>://<netloc>"`` when a scheme is present, otherwise
        just ``"<netloc>"`` (an empty string for empty input).
    """
    parsed_url: ParseResult = urlparse(url)
    scheme: str = parsed_url.scheme
    netloc: str = parsed_url.netloc

    if not netloc and "://" not in url:
        # Scheme-less input: urlparse put the host into `path` (or mistook
        # "host:port" for "scheme:path"). Prefixing "//" makes it parse the
        # leading component as the network location.
        scheme = ""
        netloc = urlparse(f"//{url}").netloc

    if not scheme:
        return netloc
    return f"{scheme}://{netloc}"
"""
A query to retrieve URLs that satisfy both of the following:
- they are not flagged as a root URL
- they are not already linked to a root URL

"""

from sqlalchemy import select

from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL
from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL
from src.db.models.impl.url.core.sqlalchemy import URL

# Both tables are outer-joined on the URL's ID and only rows where neither
# join found a match are kept. The two criteria passed to where() are
# combined with AND, so a URL is selected only if it has no root flag AND
# no root URL link.
URLS_WITHOUT_ROOT_ID_QUERY = (
    select(
        URL.id,
        URL.url
    ).outerjoin(
        FlagRootURL,
        URL.id == FlagRootURL.url_id
    ).outerjoin(
        LinkURLRootURL,
        URL.id == LinkURLRootURL.url_id
    ).where(
        FlagRootURL.url_id.is_(None),
        LinkURLRootURL.url_id.is_(None)
    )
)
class LookupRootURLsQueryBuilder(QueryBuilderBase):
    """
    Looks up URLs in the database and reports, for each requested URL,
    its ID (if it exists) and whether it is flagged as a root URL.
    """

    def __init__(self, urls: list[str]):
        super().__init__()
        self.urls = urls

    async def run(self, session: AsyncSession) -> list[LookupRootsURLResponse]:
        """Return one response per entry of ``self.urls``, in the same order.

        URLs that are not found in the database get a response with
        ``url_id=None`` and ``flagged_as_root=False``.
        """
        # Nothing to look up: the result is empty either way, so skip the
        # database round trip (and the empty IN () clause).
        if not self.urls:
            return []

        # Run query
        query = select(
            URL.id,
            URL.url,
            # No matching flag row after the outer join -> not flagged
            case(
                (FlagRootURL.url_id.is_(None), False),
                else_=True
            ).label("flagged_as_root")
        ).outerjoin(FlagRootURL).where(
            URL.url.in_(self.urls),
        )
        mappings = await sh.mappings(session, query=query)

        # Index the rows that were found by URL
        url_to_response_map: dict[str, LookupRootsURLResponse] = {
            mapping["url"]: LookupRootsURLResponse(
                url=mapping["url"],
                url_id=mapping["id"],
                flagged_as_root=mapping["flagged_as_root"]
            )
            for mapping in mappings
        }

        # Iterate through original URLs and add missing responses
        results: list[LookupRootsURLResponse] = []
        for url in self.urls:
            response = url_to_response_map.get(url)
            if response is None:
                response = LookupRootsURLResponse(
                    url=url,
                    url_id=None,
                    flagged_as_root=False
                )
            results.append(response)

        return results
class LookupRootsURLResponse(BaseModel):
    """Result of looking up a single URL for the root URL task."""

    # The URL that was looked up
    url: str
    # ID of the URL; None when no ID is available for it
    url_id: int | None
    # Whether the URL is flagged as a root URL
    flagged_as_root: bool

    @property
    def exists_in_db(self) -> bool:
        """Whether a URL ID is present."""
        has_id: bool = self.url_id is not None
        return has_id

    @model_validator(mode='after')
    def validate_flagged_as_root(self):
        """Reject responses that are flagged as root but carry no URL ID."""
        id_missing = self.url_id is None
        if not (self.flagged_as_root and id_missing):
            return self
        raise ValueError('URL ID should be provided if flagged as root')
src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo from src.db.enums import TaskType from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO from src.core.tasks.url.operators.base import URLTaskOperatorBase diff --git a/src/core/tasks/url/operators/submit_approved/queries/get.py b/src/core/tasks/url/operators/submit_approved/queries/get.py index 484a9aec..6c22c731 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/get.py +++ b/src/core/tasks/url/operators/submit_approved/queries/get.py @@ -4,7 +4,7 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh diff --git a/src/core/tasks/url/operators/submit_approved/queries/has_validated.py b/src/core/tasks/url/operators/submit_approved/queries/has_validated.py index 7c2d0509..abd94d20 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/has_validated.py +++ b/src/core/tasks/url/operators/submit_approved/queries/has_validated.py @@ -2,7 +2,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.collectors.enums import URLStatus -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py b/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py index e1f9e382..d2563335 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py +++ b/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py @@ -3,8 +3,8 @@ 
from src.collectors.enums import URLStatus from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/db/client/async_.py b/src/db/client/async_.py index ffb7738b..ebe1b772 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -104,37 +104,36 @@ from src.db.dtos.url.raw_html import RawHTMLInfo from src.db.enums import TaskType from src.db.helpers.session import session_helper as sh -from src.db.models.instantiations.agency.sqlalchemy import Agency -from src.db.models.instantiations.backlog_snapshot import BacklogSnapshot -from src.db.models.instantiations.batch.pydantic import BatchInfo -from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.duplicate.pydantic.info import DuplicateInfo -from src.db.models.instantiations.link.task_url import LinkTaskURL -from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.instantiations.log.pydantic.info import LogInfo -from src.db.models.instantiations.log.pydantic.output import LogOutputInfo -from src.db.models.instantiations.log.sqlalchemy import Log -from src.db.models.instantiations.root_url_cache import RootURL -from src.db.models.instantiations.task.core import Task -from src.db.models.instantiations.task.error import TaskError -from src.db.models.instantiations.url.checked_for_duplicate import URLCheckedForDuplicate -from src.db.models.instantiations.url.core.pydantic.info import URLInfo -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource -from 
src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo -from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo -from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML -from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent -from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata -from src.db.models.instantiations.url.probed_for_404 import URLProbedFor404 -from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion -from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput -from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion -from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion -from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.backlog_snapshot import BacklogSnapshot +from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.duplicate.pydantic.info import DuplicateInfo +from src.db.models.impl.link.task_url import LinkTaskURL +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.log.pydantic.info import LogInfo +from src.db.models.impl.log.pydantic.output import LogOutputInfo +from src.db.models.impl.log.sqlalchemy import Log +from src.db.models.impl.task.core import Task +from 
src.db.models.impl.task.error import TaskError +from src.db.models.impl.url.checked_for_duplicate import URLCheckedForDuplicate +from src.db.models.impl.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.impl.url.error_info.sqlalchemy import URLErrorInfo +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent +from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.probed_for_404 import URLProbedFor404 +from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput +from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.models.templates_.base import Base from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.implementations.core.get.html_content_info import GetHTMLContentInfoQueryBuilder @@ -151,7 +150,7 @@ class AsyncDatabaseClient: - def __init__(self, db_url: Optional[str] = None): + def __init__(self, db_url: str | None = None): if db_url is None: db_url = EnvVarManager.get().get_postgres_connection_string(is_async=True) self.db_url = db_url @@ -300,7 +299,7 @@ async def 
get_user_suggestion( model: UserSuggestionModel, user_id: int, url_id: int - ) -> Optional[UserSuggestionModel]: + ) -> UserSuggestionModel | None: statement = Select(model).where( and_( model.url_id == url_id, @@ -314,7 +313,7 @@ async def get_next_url_for_user_annotation( self, user_suggestion_model_to_exclude: UserSuggestionModel, auto_suggestion_relationship: QueryableAttribute, - batch_id: Optional[int], + batch_id: int | None, check_if_annotated_not_relevant: bool = False ) -> URL: return await self.run_query_builder( @@ -370,8 +369,8 @@ async def get_next_url_for_record_type_annotation( self, session: AsyncSession, user_id: int, - batch_id: Optional[int] - ) -> Optional[GetNextRecordTypeAnnotationResponseInfo]: + batch_id: int | None + ) -> GetNextRecordTypeAnnotationResponseInfo | None: url = await GetNextURLForUserAnnotationQueryBuilder( user_suggestion_model_to_exclude=UserRecordTypeSuggestion, @@ -610,20 +609,6 @@ async def get_all( """Get all records of a model. Used primarily in testing.""" return await sh.get_all(session=session, model=model, order_by_attribute=order_by_attribute) - @session_manager - async def load_root_url_cache(self, session: AsyncSession) -> dict[str, str]: - statement = select(RootURL) - scalar_result = await session.scalars(statement) - model_result = scalar_result.all() - d = {} - for result in model_result: - d[result.url] = result.page_title - return d - - async def add_to_root_url_cache(self, url: str, page_title: str) -> None: - cache = RootURL(url=url, page_title=page_title) - await self.add(cache) - async def get_urls( self, page: int, diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 17483542..03a45d3b 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -7,19 +7,19 @@ from src.collectors.enums import URLStatus from src.db.config_manager import ConfigManager -from src.db.models.instantiations.batch.pydantic import BatchInfo -from src.db.models.instantiations.duplicate.pydantic.insert import 
DuplicateInsertInfo +from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo -from src.db.models.instantiations.log.pydantic.info import LogInfo +from src.db.models.impl.log.pydantic.info import LogInfo from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.templates_.base import Base -from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate -from src.db.models.instantiations.log.sqlalchemy import Log -from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.batch.sqlalchemy import Batch +from src.db.models.impl.duplicate.sqlalchemy import Duplicate +from src.db.models.impl.log.sqlalchemy import Log +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.batch.sqlalchemy import Batch from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.core.env_var_manager import EnvVarManager from src.core.enums import BatchStatus @@ -27,7 +27,7 @@ # Database Client class DatabaseClient: - def __init__(self, db_url: Optional[str] = None): + def __init__(self, db_url: str | None = None): """Initialize the DatabaseClient.""" if db_url is None: db_url = EnvVarManager.get().get_postgres_connection_string(is_async=True) diff --git a/src/db/client/types.py b/src/db/client/types.py index 8b004e19..efdfdc72 100644 --- a/src/db/client/types.py +++ b/src/db/client/types.py @@ -1,9 +1,9 @@ -from 
src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion -from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion -from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion UserSuggestionModel = UserRelevantSuggestion or UserRecordTypeSuggestion or UserUrlAgencySuggestion AutoSuggestionModel = AutoRelevantSuggestion or AutoRecordTypeSuggestion or AutomatedUrlAgencySuggestion diff --git a/src/db/constants.py b/src/db/constants.py index 0b2379ef..505a6e58 100644 --- a/src/db/constants.py +++ b/src/db/constants.py @@ -1,9 +1,9 @@ -from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion -from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion -from 
src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion PLACEHOLDER_AGENCY_NAME = "PLACEHOLDER_AGENCY_NAME" diff --git a/src/db/dto_converter.py b/src/db/dto_converter.py index 869b8978..979a3b51 100644 --- a/src/db/dto_converter.py +++ b/src/db/dto_converter.py @@ -8,17 +8,17 @@ from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.core.tasks.url.operators.html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING from src.db.dtos.url.html_content import URLHTMLContentInfo -from src.db.models.instantiations.url.html.content.enums import HTMLContentType +from src.db.models.impl.url.html.content.enums import HTMLContentType from src.db.dtos.url.with_html import URLWithHTML -from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion -from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import 
AutoRelevantSuggestion -from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion class DTOConverter: @@ -109,7 +109,7 @@ def final_review_annotation_agency_auto_info( @staticmethod def user_url_agency_suggestion_to_final_review_annotation_agency_user_info( user_url_agency_suggestion: UserUrlAgencySuggestion - ) -> Optional[GetNextURLForAgencyAgencyInfo]: + ) -> GetNextURLForAgencyAgencyInfo | None: suggestion = user_url_agency_suggestion if suggestion is None: return None diff --git a/src/db/dtos/url/html_content.py b/src/db/dtos/url/html_content.py index 1d3d67bf..d7fb560e 100644 --- a/src/db/dtos/url/html_content.py +++ b/src/db/dtos/url/html_content.py @@ -1,5 +1,5 @@ -from src.db.models.instantiations.url.html.content.enums import HTMLContentType -from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent +from src.db.models.impl.url.html.content.enums import HTMLContentType +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.insert import BulkInsertableModel diff --git a/src/db/enums.py b/src/db/enums.py index 27d64402..dee42c2e 100644 --- a/src/db/enums.py +++ b/src/db/enums.py 
@@ -44,6 +44,7 @@ class TaskType(PyEnum): IDLE = "Idle" PROBE_404 = "404 Probe" PROBE_URL = "URL Probe" + ROOT_URL = "Root URL" # Scheduled Tasks PUSH_TO_HUGGINGFACE = "Push to Hugging Face" diff --git a/src/db/models/instantiations/batch/__init__.py b/src/db/models/impl/__init__.py similarity index 100% rename from src/db/models/instantiations/batch/__init__.py rename to src/db/models/impl/__init__.py diff --git a/src/db/models/instantiations/duplicate/__init__.py b/src/db/models/impl/agency/__init__.py similarity index 100% rename from src/db/models/instantiations/duplicate/__init__.py rename to src/db/models/impl/agency/__init__.py diff --git a/src/db/models/instantiations/duplicate/pydantic/__init__.py b/src/db/models/impl/agency/pydantic/__init__.py similarity index 100% rename from src/db/models/instantiations/duplicate/pydantic/__init__.py rename to src/db/models/impl/agency/pydantic/__init__.py diff --git a/src/db/models/instantiations/agency/pydantic/upsert.py b/src/db/models/impl/agency/pydantic/upsert.py similarity index 87% rename from src/db/models/instantiations/agency/pydantic/upsert.py rename to src/db/models/impl/agency/pydantic/upsert.py index c9d81336..099e8451 100644 --- a/src/db/models/instantiations/agency/pydantic/upsert.py +++ b/src/db/models/impl/agency/pydantic/upsert.py @@ -1,6 +1,6 @@ from datetime import datetime -from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.impl.agency.sqlalchemy import Agency from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.upsert import BulkUpsertableModel diff --git a/src/db/models/instantiations/agency/sqlalchemy.py b/src/db/models/impl/agency/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/agency/sqlalchemy.py rename to src/db/models/impl/agency/sqlalchemy.py diff --git a/src/db/models/instantiations/backlog_snapshot.py b/src/db/models/impl/backlog_snapshot.py similarity index 100% rename from 
src/db/models/instantiations/backlog_snapshot.py rename to src/db/models/impl/backlog_snapshot.py diff --git a/src/db/models/instantiations/link/__init__.py b/src/db/models/impl/batch/__init__.py similarity index 100% rename from src/db/models/instantiations/link/__init__.py rename to src/db/models/impl/batch/__init__.py diff --git a/src/db/models/instantiations/batch/pydantic.py b/src/db/models/impl/batch/pydantic.py similarity index 60% rename from src/db/models/instantiations/batch/pydantic.py rename to src/db/models/impl/batch/pydantic.py index 3e1d265b..3272ceef 100644 --- a/src/db/models/instantiations/batch/pydantic.py +++ b/src/db/models/impl/batch/pydantic.py @@ -7,11 +7,11 @@ class BatchInfo(BaseModel): - id: Optional[int] = None + id: int | None = None strategy: str status: BatchStatus parameters: dict user_id: int - total_url_count: Optional[int] = None - compute_time: Optional[float] = None - date_generated: Optional[datetime] = None + total_url_count: int | None = None + compute_time: float | None = None + date_generated: datetime | None = None diff --git a/src/db/models/instantiations/batch/sqlalchemy.py b/src/db/models/impl/batch/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/batch/sqlalchemy.py rename to src/db/models/impl/batch/sqlalchemy.py diff --git a/src/db/models/instantiations/change_log.py b/src/db/models/impl/change_log.py similarity index 100% rename from src/db/models/instantiations/change_log.py rename to src/db/models/impl/change_log.py diff --git a/src/db/models/instantiations/link/url_agency/__init__.py b/src/db/models/impl/duplicate/__init__.py similarity index 100% rename from src/db/models/instantiations/link/url_agency/__init__.py rename to src/db/models/impl/duplicate/__init__.py diff --git a/src/db/models/instantiations/link/url_redirect_url/__init__.py b/src/db/models/impl/duplicate/pydantic/__init__.py similarity index 100% rename from 
src/db/models/instantiations/link/url_redirect_url/__init__.py rename to src/db/models/impl/duplicate/pydantic/__init__.py diff --git a/src/db/models/instantiations/duplicate/pydantic/info.py b/src/db/models/impl/duplicate/pydantic/info.py similarity index 62% rename from src/db/models/instantiations/duplicate/pydantic/info.py rename to src/db/models/impl/duplicate/pydantic/info.py index 3a020e04..627f5d54 100644 --- a/src/db/models/instantiations/duplicate/pydantic/info.py +++ b/src/db/models/impl/duplicate/pydantic/info.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo +from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo class DuplicateInfo(DuplicateInsertInfo): diff --git a/src/db/models/instantiations/duplicate/pydantic/insert.py b/src/db/models/impl/duplicate/pydantic/insert.py similarity index 77% rename from src/db/models/instantiations/duplicate/pydantic/insert.py rename to src/db/models/impl/duplicate/pydantic/insert.py index a8854cf3..7de4974a 100644 --- a/src/db/models/instantiations/duplicate/pydantic/insert.py +++ b/src/db/models/impl/duplicate/pydantic/insert.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate +from src.db.models.impl.duplicate.sqlalchemy import Duplicate from src.db.templates.markers.bulk.insert import BulkInsertableModel diff --git a/src/db/models/instantiations/duplicate/sqlalchemy.py b/src/db/models/impl/duplicate/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/duplicate/sqlalchemy.py rename to src/db/models/impl/duplicate/sqlalchemy.py diff --git a/src/db/models/instantiations/log/__init__.py b/src/db/models/impl/flag/__init__.py similarity index 100% rename from src/db/models/instantiations/log/__init__.py rename to src/db/models/impl/flag/__init__.py diff --git a/src/db/models/instantiations/log/pydantic/__init__.py b/src/db/models/impl/flag/root_url/__init__.py similarity index 
100% rename from src/db/models/instantiations/log/pydantic/__init__.py rename to src/db/models/impl/flag/root_url/__init__.py diff --git a/src/db/models/impl/flag/root_url/pydantic.py b/src/db/models/impl/flag/root_url/pydantic.py new file mode 100644 index 00000000..a840192a --- /dev/null +++ b/src/db/models/impl/flag/root_url/pydantic.py @@ -0,0 +1,11 @@ +from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class FlagRootURLPydantic(BulkInsertableModel): + + url_id: int + + @classmethod + def sa_model(cls) -> type[FlagRootURL]: + return FlagRootURL \ No newline at end of file diff --git a/src/db/models/impl/flag/root_url/sqlalchemy.py b/src/db/models/impl/flag/root_url/sqlalchemy.py new file mode 100644 index 00000000..8c8afbed --- /dev/null +++ b/src/db/models/impl/flag/root_url/sqlalchemy.py @@ -0,0 +1,17 @@ +from sqlalchemy import PrimaryKeyConstraint + +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin +from src.db.models.templates_.base import Base + + +class FlagRootURL( + CreatedAtMixin, + URLDependentMixin, + Base +): + __tablename__ = 'flag_root_url' + __table_args__ = ( + PrimaryKeyConstraint( + 'url_id', + ), + ) diff --git a/src/db/models/instantiations/state/__init__.py b/src/db/models/impl/link/__init__.py similarity index 100% rename from src/db/models/instantiations/state/__init__.py rename to src/db/models/impl/link/__init__.py diff --git a/src/db/models/instantiations/link/batch_url.py b/src/db/models/impl/link/batch_url.py similarity index 100% rename from src/db/models/instantiations/link/batch_url.py rename to src/db/models/impl/link/batch_url.py diff --git a/src/db/models/instantiations/link/task_url.py b/src/db/models/impl/link/task_url.py similarity index 100% rename from src/db/models/instantiations/link/task_url.py rename to src/db/models/impl/link/task_url.py diff --git a/src/db/models/instantiations/state/sync/__init__.py 
b/src/db/models/impl/link/url_agency/__init__.py similarity index 100% rename from src/db/models/instantiations/state/sync/__init__.py rename to src/db/models/impl/link/url_agency/__init__.py diff --git a/src/db/models/instantiations/link/url_agency/pydantic.py b/src/db/models/impl/link/url_agency/pydantic.py similarity index 80% rename from src/db/models/instantiations/link/url_agency/pydantic.py rename to src/db/models/impl/link/url_agency/pydantic.py index 75c02119..77522a64 100644 --- a/src/db/models/instantiations/link/url_agency/pydantic.py +++ b/src/db/models/impl/link/url_agency/pydantic.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.templates.markers.bulk.delete import BulkDeletableModel from src.db.templates.markers.bulk.insert import BulkInsertableModel diff --git a/src/db/models/instantiations/link/url_agency/sqlalchemy.py b/src/db/models/impl/link/url_agency/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/link/url_agency/sqlalchemy.py rename to src/db/models/impl/link/url_agency/sqlalchemy.py diff --git a/src/db/models/instantiations/task/__init__.py b/src/db/models/impl/link/url_redirect_url/__init__.py similarity index 100% rename from src/db/models/instantiations/task/__init__.py rename to src/db/models/impl/link/url_redirect_url/__init__.py diff --git a/src/db/models/instantiations/link/url_redirect_url/pydantic.py b/src/db/models/impl/link/url_redirect_url/pydantic.py similarity index 75% rename from src/db/models/instantiations/link/url_redirect_url/pydantic.py rename to src/db/models/impl/link/url_redirect_url/pydantic.py index 30799391..b7b5dff3 100644 --- a/src/db/models/instantiations/link/url_redirect_url/pydantic.py +++ b/src/db/models/impl/link/url_redirect_url/pydantic.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.link.url_redirect_url.sqlalchemy import 
LinkURLRedirectURL +from src.db.models.impl.link.url_redirect_url.sqlalchemy import LinkURLRedirectURL from src.db.templates.markers.bulk.insert import BulkInsertableModel diff --git a/src/db/models/instantiations/link/url_redirect_url/sqlalchemy.py b/src/db/models/impl/link/url_redirect_url/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/link/url_redirect_url/sqlalchemy.py rename to src/db/models/impl/link/url_redirect_url/sqlalchemy.py diff --git a/src/db/models/instantiations/url/__init__.py b/src/db/models/impl/link/urls_root_url/__init__.py similarity index 100% rename from src/db/models/instantiations/url/__init__.py rename to src/db/models/impl/link/urls_root_url/__init__.py diff --git a/src/db/models/impl/link/urls_root_url/pydantic.py b/src/db/models/impl/link/urls_root_url/pydantic.py new file mode 100644 index 00000000..c3037567 --- /dev/null +++ b/src/db/models/impl/link/urls_root_url/pydantic.py @@ -0,0 +1,12 @@ +from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class LinkURLRootURLPydantic(BulkInsertableModel): + + url_id: int + root_url_id: int + + @classmethod + def sa_model(cls) -> type[LinkURLRootURL]: + return LinkURLRootURL \ No newline at end of file diff --git a/src/db/models/impl/link/urls_root_url/sqlalchemy.py b/src/db/models/impl/link/urls_root_url/sqlalchemy.py new file mode 100644 index 00000000..a856dd31 --- /dev/null +++ b/src/db/models/impl/link/urls_root_url/sqlalchemy.py @@ -0,0 +1,14 @@ +from src.db.models.helpers import url_id_column +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin +from src.db.models.templates_.with_id import WithIDBase + + +class LinkURLRootURL( + UpdatedAtMixin, + CreatedAtMixin, + URLDependentMixin, + WithIDBase +): + __tablename__ = "link_urls_root_url" + + root_url_id = url_id_column() \ No newline at end of file diff --git 
a/src/db/models/instantiations/url/core/__init__.py b/src/db/models/impl/log/__init__.py similarity index 100% rename from src/db/models/instantiations/url/core/__init__.py rename to src/db/models/impl/log/__init__.py diff --git a/src/db/models/instantiations/url/core/pydantic/__init__.py b/src/db/models/impl/log/pydantic/__init__.py similarity index 100% rename from src/db/models/instantiations/url/core/pydantic/__init__.py rename to src/db/models/impl/log/pydantic/__init__.py diff --git a/src/db/models/instantiations/log/pydantic/info.py b/src/db/models/impl/log/pydantic/info.py similarity index 67% rename from src/db/models/instantiations/log/pydantic/info.py rename to src/db/models/impl/log/pydantic/info.py index aa9b06ee..76af0dd7 100644 --- a/src/db/models/instantiations/log/pydantic/info.py +++ b/src/db/models/impl/log/pydantic/info.py @@ -5,7 +5,7 @@ class LogInfo(BaseModel): - id: Optional[int] = None + id: int | None = None log: str batch_id: int - created_at: Optional[datetime] = None + created_at: datetime | None = None diff --git a/src/db/models/instantiations/log/pydantic/output.py b/src/db/models/impl/log/pydantic/output.py similarity index 65% rename from src/db/models/instantiations/log/pydantic/output.py rename to src/db/models/impl/log/pydantic/output.py index c58eab0f..36ea843b 100644 --- a/src/db/models/instantiations/log/pydantic/output.py +++ b/src/db/models/impl/log/pydantic/output.py @@ -5,6 +5,6 @@ class LogOutputInfo(BaseModel): - id: Optional[int] = None + id: int | None = None log: str - created_at: Optional[datetime] = None + created_at: datetime | None = None diff --git a/src/db/models/instantiations/log/sqlalchemy.py b/src/db/models/impl/log/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/log/sqlalchemy.py rename to src/db/models/impl/log/sqlalchemy.py diff --git a/src/db/models/instantiations/missing.py b/src/db/models/impl/missing.py similarity index 100% rename from 
src/db/models/instantiations/missing.py rename to src/db/models/impl/missing.py diff --git a/src/db/models/instantiations/url/data_source/__init__.py b/src/db/models/impl/state/__init__.py similarity index 100% rename from src/db/models/instantiations/url/data_source/__init__.py rename to src/db/models/impl/state/__init__.py diff --git a/src/db/models/instantiations/state/huggingface.py b/src/db/models/impl/state/huggingface.py similarity index 100% rename from src/db/models/instantiations/state/huggingface.py rename to src/db/models/impl/state/huggingface.py diff --git a/src/db/models/instantiations/url/error_info/__init__.py b/src/db/models/impl/state/sync/__init__.py similarity index 100% rename from src/db/models/instantiations/url/error_info/__init__.py rename to src/db/models/impl/state/sync/__init__.py diff --git a/src/db/models/instantiations/state/sync/agencies.py b/src/db/models/impl/state/sync/agencies.py similarity index 100% rename from src/db/models/instantiations/state/sync/agencies.py rename to src/db/models/impl/state/sync/agencies.py diff --git a/src/db/models/instantiations/state/sync/data_sources.py b/src/db/models/impl/state/sync/data_sources.py similarity index 100% rename from src/db/models/instantiations/state/sync/data_sources.py rename to src/db/models/impl/state/sync/data_sources.py diff --git a/src/db/models/instantiations/url/html/__init__.py b/src/db/models/impl/task/__init__.py similarity index 100% rename from src/db/models/instantiations/url/html/__init__.py rename to src/db/models/impl/task/__init__.py diff --git a/src/db/models/instantiations/task/core.py b/src/db/models/impl/task/core.py similarity index 100% rename from src/db/models/instantiations/task/core.py rename to src/db/models/impl/task/core.py diff --git a/src/db/models/instantiations/task/error.py b/src/db/models/impl/task/error.py similarity index 100% rename from src/db/models/instantiations/task/error.py rename to src/db/models/impl/task/error.py diff --git 
a/src/db/models/instantiations/url/html/compressed/__init__.py b/src/db/models/impl/url/__init__.py similarity index 100% rename from src/db/models/instantiations/url/html/compressed/__init__.py rename to src/db/models/impl/url/__init__.py diff --git a/src/db/models/instantiations/url/checked_for_duplicate.py b/src/db/models/impl/url/checked_for_duplicate.py similarity index 100% rename from src/db/models/instantiations/url/checked_for_duplicate.py rename to src/db/models/impl/url/checked_for_duplicate.py diff --git a/src/db/models/instantiations/url/html/content/__init__.py b/src/db/models/impl/url/core/__init__.py similarity index 100% rename from src/db/models/instantiations/url/html/content/__init__.py rename to src/db/models/impl/url/core/__init__.py diff --git a/src/db/models/instantiations/url/core/enums.py b/src/db/models/impl/url/core/enums.py similarity index 100% rename from src/db/models/instantiations/url/core/enums.py rename to src/db/models/impl/url/core/enums.py diff --git a/src/db/models/instantiations/url/scrape_info/__init__.py b/src/db/models/impl/url/core/pydantic/__init__.py similarity index 100% rename from src/db/models/instantiations/url/scrape_info/__init__.py rename to src/db/models/impl/url/core/pydantic/__init__.py diff --git a/src/db/models/instantiations/url/core/pydantic/info.py b/src/db/models/impl/url/core/pydantic/info.py similarity index 87% rename from src/db/models/instantiations/url/core/pydantic/info.py rename to src/db/models/impl/url/core/pydantic/info.py index f53297c1..07df21fe 100644 --- a/src/db/models/instantiations/url/core/pydantic/info.py +++ b/src/db/models/impl/url/core/pydantic/info.py @@ -4,7 +4,7 @@ from pydantic import BaseModel from src.collectors.enums import URLStatus -from src.db.models.instantiations.url.core.enums import URLSource +from src.db.models.impl.url.core.enums import URLSource class URLInfo(BaseModel): diff --git a/src/db/models/instantiations/url/core/pydantic/insert.py 
b/src/db/models/impl/url/core/pydantic/insert.py similarity index 80% rename from src/db/models/instantiations/url/core/pydantic/insert.py rename to src/db/models/impl/url/core/pydantic/insert.py index caac3128..b893e9fa 100644 --- a/src/db/models/instantiations/url/core/pydantic/insert.py +++ b/src/db/models/impl/url/core/pydantic/insert.py @@ -1,7 +1,7 @@ from src.collectors.enums import URLStatus from src.core.enums import RecordType -from src.db.models.instantiations.url.core.enums import URLSource -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.insert import BulkInsertableModel diff --git a/src/db/models/instantiations/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py similarity index 98% rename from src/db/models/instantiations/url/core/sqlalchemy.py rename to src/db/models/impl/url/core/sqlalchemy.py index 992187dc..b9c38732 100644 --- a/src/db/models/instantiations/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -4,7 +4,7 @@ from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.db.models.helpers import enum_column -from src.db.models.instantiations.url.core.enums import URLSource +from src.db.models.impl.url.core.enums import URLSource from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin from src.db.models.templates_.with_id import WithIDBase diff --git a/src/db/models/instantiations/url/suggestion/__init__.py b/src/db/models/impl/url/data_source/__init__.py similarity index 100% rename from src/db/models/instantiations/url/suggestion/__init__.py rename to src/db/models/impl/url/data_source/__init__.py diff --git a/src/db/models/instantiations/url/data_source/pydantic.py b/src/db/models/impl/url/data_source/pydantic.py similarity index 75% rename from 
src/db/models/instantiations/url/data_source/pydantic.py rename to src/db/models/impl/url/data_source/pydantic.py index 00da8c5e..7d02c5df 100644 --- a/src/db/models/instantiations/url/data_source/pydantic.py +++ b/src/db/models/impl/url/data_source/pydantic.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource from src.db.templates.markers.bulk.insert import BulkInsertableModel diff --git a/src/db/models/instantiations/url/data_source/sqlalchemy.py b/src/db/models/impl/url/data_source/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/url/data_source/sqlalchemy.py rename to src/db/models/impl/url/data_source/sqlalchemy.py diff --git a/src/db/models/instantiations/url/suggestion/agency/__init__.py b/src/db/models/impl/url/error_info/__init__.py similarity index 100% rename from src/db/models/instantiations/url/suggestion/agency/__init__.py rename to src/db/models/impl/url/error_info/__init__.py diff --git a/src/db/models/instantiations/url/error_info/pydantic.py b/src/db/models/impl/url/error_info/pydantic.py similarity index 81% rename from src/db/models/instantiations/url/error_info/pydantic.py rename to src/db/models/impl/url/error_info/pydantic.py index 74baf5e3..2de814c8 100644 --- a/src/db/models/instantiations/url/error_info/pydantic.py +++ b/src/db/models/impl/url/error_info/pydantic.py @@ -1,6 +1,6 @@ import datetime -from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo +from src.db.models.impl.url.error_info.sqlalchemy import URLErrorInfo from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.insert import BulkInsertableModel diff --git a/src/db/models/instantiations/url/error_info/sqlalchemy.py b/src/db/models/impl/url/error_info/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/url/error_info/sqlalchemy.py rename to 
src/db/models/impl/url/error_info/sqlalchemy.py diff --git a/src/db/models/instantiations/url/suggestion/record_type/__init__.py b/src/db/models/impl/url/html/__init__.py similarity index 100% rename from src/db/models/instantiations/url/suggestion/record_type/__init__.py rename to src/db/models/impl/url/html/__init__.py diff --git a/src/db/models/instantiations/url/suggestion/relevant/__init__.py b/src/db/models/impl/url/html/compressed/__init__.py similarity index 100% rename from src/db/models/instantiations/url/suggestion/relevant/__init__.py rename to src/db/models/impl/url/html/compressed/__init__.py diff --git a/src/db/models/instantiations/url/html/compressed/pydantic.py b/src/db/models/impl/url/html/compressed/pydantic.py similarity index 79% rename from src/db/models/instantiations/url/html/compressed/pydantic.py rename to src/db/models/impl/url/html/compressed/pydantic.py index b626b5c2..1409d924 100644 --- a/src/db/models/instantiations/url/html/compressed/pydantic.py +++ b/src/db/models/impl/url/html/compressed/pydantic.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.insert import BulkInsertableModel diff --git a/src/db/models/instantiations/url/html/compressed/sqlalchemy.py b/src/db/models/impl/url/html/compressed/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/url/html/compressed/sqlalchemy.py rename to src/db/models/impl/url/html/compressed/sqlalchemy.py diff --git a/src/db/models/instantiations/url/suggestion/relevant/auto/__init__.py b/src/db/models/impl/url/html/content/__init__.py similarity index 100% rename from src/db/models/instantiations/url/suggestion/relevant/auto/__init__.py rename to src/db/models/impl/url/html/content/__init__.py diff --git 
a/src/db/models/instantiations/url/html/content/enums.py b/src/db/models/impl/url/html/content/enums.py similarity index 100% rename from src/db/models/instantiations/url/html/content/enums.py rename to src/db/models/impl/url/html/content/enums.py diff --git a/src/db/models/instantiations/url/html/content/pydantic.py b/src/db/models/impl/url/html/content/pydantic.py similarity index 100% rename from src/db/models/instantiations/url/html/content/pydantic.py rename to src/db/models/impl/url/html/content/pydantic.py diff --git a/src/db/models/instantiations/url/html/content/sqlalchemy.py b/src/db/models/impl/url/html/content/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/url/html/content/sqlalchemy.py rename to src/db/models/impl/url/html/content/sqlalchemy.py diff --git a/src/db/models/instantiations/url/optional_data_source_metadata.py b/src/db/models/impl/url/optional_data_source_metadata.py similarity index 100% rename from src/db/models/instantiations/url/optional_data_source_metadata.py rename to src/db/models/impl/url/optional_data_source_metadata.py diff --git a/src/db/models/instantiations/url/probed_for_404.py b/src/db/models/impl/url/probed_for_404.py similarity index 100% rename from src/db/models/instantiations/url/probed_for_404.py rename to src/db/models/impl/url/probed_for_404.py diff --git a/src/db/models/instantiations/url/reviewing_user.py b/src/db/models/impl/url/reviewing_user.py similarity index 100% rename from src/db/models/instantiations/url/reviewing_user.py rename to src/db/models/impl/url/reviewing_user.py diff --git a/src/db/models/instantiations/url/suggestion/relevant/auto/pydantic/__init__.py b/src/db/models/impl/url/scrape_info/__init__.py similarity index 100% rename from src/db/models/instantiations/url/suggestion/relevant/auto/pydantic/__init__.py rename to src/db/models/impl/url/scrape_info/__init__.py diff --git a/src/db/models/instantiations/url/scrape_info/enums.py 
b/src/db/models/impl/url/scrape_info/enums.py similarity index 100% rename from src/db/models/instantiations/url/scrape_info/enums.py rename to src/db/models/impl/url/scrape_info/enums.py diff --git a/src/db/models/instantiations/url/scrape_info/pydantic.py b/src/db/models/impl/url/scrape_info/pydantic.py similarity index 65% rename from src/db/models/instantiations/url/scrape_info/pydantic.py rename to src/db/models/impl/url/scrape_info/pydantic.py index f41b1642..1aaf2205 100644 --- a/src/db/models/instantiations/url/scrape_info/pydantic.py +++ b/src/db/models/impl/url/scrape_info/pydantic.py @@ -1,5 +1,5 @@ -from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus -from src.db.models.instantiations.url.scrape_info.sqlalchemy import URLScrapeInfo +from src.db.models.impl.url.scrape_info.enums import ScrapeStatus +from src.db.models.impl.url.scrape_info.sqlalchemy import URLScrapeInfo from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.insert import BulkInsertableModel diff --git a/src/db/models/instantiations/url/scrape_info/sqlalchemy.py b/src/db/models/impl/url/scrape_info/sqlalchemy.py similarity index 82% rename from src/db/models/instantiations/url/scrape_info/sqlalchemy.py rename to src/db/models/impl/url/scrape_info/sqlalchemy.py index d97e0b93..b50f2903 100644 --- a/src/db/models/instantiations/url/scrape_info/sqlalchemy.py +++ b/src/db/models/impl/url/scrape_info/sqlalchemy.py @@ -1,5 +1,5 @@ from src.db.models.helpers import enum_column -from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus +from src.db.models.impl.url.scrape_info.enums import ScrapeStatus from src.db.models.mixins import URLDependentMixin from src.db.models.templates_.standard import StandardBase diff --git a/src/db/models/instantiations/url/suggestion/README.md b/src/db/models/impl/url/suggestion/README.md similarity index 100% rename from src/db/models/instantiations/url/suggestion/README.md rename to 
src/db/models/impl/url/suggestion/README.md diff --git a/src/db/models/instantiations/url/web_metadata/__init__.py b/src/db/models/impl/url/suggestion/__init__.py similarity index 100% rename from src/db/models/instantiations/url/web_metadata/__init__.py rename to src/db/models/impl/url/suggestion/__init__.py diff --git a/tests/automated/integration/html_tag_collector/__init__.py b/src/db/models/impl/url/suggestion/agency/__init__.py similarity index 100% rename from tests/automated/integration/html_tag_collector/__init__.py rename to src/db/models/impl/url/suggestion/agency/__init__.py diff --git a/src/db/models/instantiations/url/suggestion/agency/auto.py b/src/db/models/impl/url/suggestion/agency/auto.py similarity index 100% rename from src/db/models/instantiations/url/suggestion/agency/auto.py rename to src/db/models/impl/url/suggestion/agency/auto.py diff --git a/src/db/models/instantiations/url/suggestion/agency/user.py b/src/db/models/impl/url/suggestion/agency/user.py similarity index 100% rename from src/db/models/instantiations/url/suggestion/agency/user.py rename to src/db/models/impl/url/suggestion/agency/user.py diff --git a/src/db/models/impl/url/suggestion/record_type/__init__.py b/src/db/models/impl/url/suggestion/record_type/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/suggestion/record_type/auto.py b/src/db/models/impl/url/suggestion/record_type/auto.py similarity index 100% rename from src/db/models/instantiations/url/suggestion/record_type/auto.py rename to src/db/models/impl/url/suggestion/record_type/auto.py diff --git a/src/db/models/instantiations/url/suggestion/record_type/user.py b/src/db/models/impl/url/suggestion/record_type/user.py similarity index 100% rename from src/db/models/instantiations/url/suggestion/record_type/user.py rename to src/db/models/impl/url/suggestion/record_type/user.py diff --git a/src/db/models/impl/url/suggestion/relevant/__init__.py 
b/src/db/models/impl/url/suggestion/relevant/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/relevant/auto/__init__.py b/src/db/models/impl/url/suggestion/relevant/auto/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/relevant/auto/pydantic/__init__.py b/src/db/models/impl/url/suggestion/relevant/auto/pydantic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/suggestion/relevant/auto/pydantic/input.py b/src/db/models/impl/url/suggestion/relevant/auto/pydantic/input.py similarity index 100% rename from src/db/models/instantiations/url/suggestion/relevant/auto/pydantic/input.py rename to src/db/models/impl/url/suggestion/relevant/auto/pydantic/input.py diff --git a/src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py b/src/db/models/impl/url/suggestion/relevant/auto/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/url/suggestion/relevant/auto/sqlalchemy.py rename to src/db/models/impl/url/suggestion/relevant/auto/sqlalchemy.py diff --git a/src/db/models/instantiations/url/suggestion/relevant/user.py b/src/db/models/impl/url/suggestion/relevant/user.py similarity index 100% rename from src/db/models/instantiations/url/suggestion/relevant/user.py rename to src/db/models/impl/url/suggestion/relevant/user.py diff --git a/src/db/models/impl/url/web_metadata/__init__.py b/src/db/models/impl/url/web_metadata/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/instantiations/url/web_metadata/insert.py b/src/db/models/impl/url/web_metadata/insert.py similarity index 88% rename from src/db/models/instantiations/url/web_metadata/insert.py rename to src/db/models/impl/url/web_metadata/insert.py index 430ed798..4467b9da 100644 --- a/src/db/models/instantiations/url/web_metadata/insert.py +++ 
b/src/db/models/impl/url/web_metadata/insert.py @@ -1,6 +1,6 @@ from pydantic import Field -from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.models.templates_.base import Base from src.db.templates.markers.bulk.insert import BulkInsertableModel from src.db.templates.markers.bulk.upsert import BulkUpsertableModel diff --git a/src/db/models/instantiations/url/web_metadata/sqlalchemy.py b/src/db/models/impl/url/web_metadata/sqlalchemy.py similarity index 100% rename from src/db/models/instantiations/url/web_metadata/sqlalchemy.py rename to src/db/models/impl/url/web_metadata/sqlalchemy.py diff --git a/src/db/models/instantiations/root_url_cache.py b/src/db/models/instantiations/root_url_cache.py deleted file mode 100644 index f79e4b5c..00000000 --- a/src/db/models/instantiations/root_url_cache.py +++ /dev/null @@ -1,17 +0,0 @@ -from sqlalchemy import UniqueConstraint, Column, String - -from src.db.models.mixins import UpdatedAtMixin -from src.db.models.templates_.with_id import WithIDBase - - -class RootURL(UpdatedAtMixin, WithIDBase): - __tablename__ = 'root_url_cache' - __table_args__ = ( - UniqueConstraint( - "url", - name="uq_root_url_url"), - ) - - url = Column(String, nullable=False) - page_title = Column(String, nullable=False) - page_description = Column(String, nullable=True) diff --git a/src/db/queries/base/builder.py b/src/db/queries/base/builder.py index 4b5fd118..f0ef345c 100644 --- a/src/db/queries/base/builder.py +++ b/src/db/queries/base/builder.py @@ -9,8 +9,8 @@ class QueryBuilderBase(Generic[LabelsType]): - def __init__(self, labels: Optional[LabelsType] = None): - self.query: Optional[FromClause] = None + def __init__(self, labels: LabelsType | None = None): + self.query: FromClause | None = None self.labels = labels def get(self, key: str) -> ColumnClause: diff --git 
a/src/db/queries/implementations/core/common/annotation_exists.py b/src/db/queries/implementations/core/common/annotation_exists.py index bb6bf57a..f8dfa654 100644 --- a/src/db/queries/implementations/core/common/annotation_exists.py +++ b/src/db/queries/implementations/core/common/annotation_exists.py @@ -18,7 +18,7 @@ from src.collectors.enums import URLStatus from src.db.constants import ALL_ANNOTATION_MODELS -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/db/queries/implementations/core/get/html_content_info.py b/src/db/queries/implementations/core/get/html_content_info.py index d647acc1..3d2ad559 100644 --- a/src/db/queries/implementations/core/get/html_content_info.py +++ b/src/db/queries/implementations/core/get/html_content_info.py @@ -2,7 +2,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.db.dtos.url.html_content import URLHTMLContentInfo -from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py index bd16f149..f9bb2ef8 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py @@ -7,7 +7,7 @@ from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary from src.collectors.enums import CollectorType from src.core.enums import BatchStatus -from src.db.models.instantiations.batch.sqlalchemy import Batch +from src.db.models.impl.batch.sqlalchemy import Batch from src.db.queries.base.builder import QueryBuilderBase 
from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.builder import URLCountsCTEQueryBuilder from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.labels import URLCountsLabels @@ -18,10 +18,10 @@ class GetRecentBatchSummariesQueryBuilder(QueryBuilderBase): def __init__( self, page: int = 1, - has_pending_urls: Optional[bool] = None, - collector_type: Optional[CollectorType] = None, - status: Optional[BatchStatus] = None, - batch_id: Optional[int] = None, + has_pending_urls: bool | None = None, + collector_type: CollectorType | None = None, + status: BatchStatus | None = None, + batch_id: int | None = None, ): super().__init__() self.url_counts_cte = URLCountsCTEQueryBuilder( diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py index f2192307..72a33336 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py @@ -5,9 +5,9 @@ from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus -from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.batch.sqlalchemy import Batch from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.helpers import add_page_offset from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.labels import URLCountsLabels @@ -18,10 +18,10 @@ class URLCountsCTEQueryBuilder(QueryBuilderBase): def __init__( self, page: int = 1, - has_pending_urls: Optional[bool] = None, - 
collector_type: Optional[CollectorType] = None, - status: Optional[BatchStatus] = None, - batch_id: Optional[int] = None + has_pending_urls: bool | None = None, + collector_type: CollectorType | None = None, + status: BatchStatus | None = None, + batch_id: int | None = None ): super().__init__(URLCountsLabels()) self.page = page diff --git a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py index 5e6751ca..269dfced 100644 --- a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py +++ b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py @@ -5,10 +5,10 @@ from src.api.endpoints.metrics.dtos.get.urls.aggregated.pending import GetMetricsURLsAggregatedPendingResponseDTO from src.collectors.enums import URLStatus -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.implementations.core.common.annotation_exists import AnnotationExistsCTEQueryBuilder diff --git a/src/db/queries/protocols.py b/src/db/queries/protocols.py index 0098e953..b1a2ce20 100644 --- a/src/db/queries/protocols.py +++ b/src/db/queries/protocols.py @@ -6,4 +6,4 @@ class HasQuery(Protocol): def __init__(self): - self.query: Optional[Select] = None + 
self.query: Select | None = None diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 6f00f7ff..45a281de 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -8,17 +8,17 @@ from src.core.enums import BatchStatus from src.db.constants import STANDARD_ROW_LIMIT from src.db.enums import TaskType -from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.link.task_url import LinkTaskURL -from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.instantiations.task.core import Task -from src.db.models.instantiations.url.html.content.sqlalchemy import URLHTMLContent -from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.batch.sqlalchemy import Batch -from src.db.models.instantiations.url.scrape_info.sqlalchemy import URLScrapeInfo -from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion -from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.task_url import LinkTaskURL +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.task.core import Task +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent +from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.url.scrape_info.sqlalchemy import URLScrapeInfo +from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.types import 
UserSuggestionType diff --git a/src/db/types.py b/src/db/types.py index dadef2f1..3c24919b 100644 --- a/src/db/types.py +++ b/src/db/types.py @@ -1,8 +1,8 @@ from typing import TypeVar -from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.queries.base.labels import LabelsBase UserSuggestionType = UserUrlAgencySuggestion | UserRelevantSuggestion | UserRecordTypeSuggestion diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index 0b2b9ed8..ee357ad4 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -24,9 +24,9 @@ def __init__( async def match_agency( self, name: str, - state: Optional[str] = None, - county: Optional[str] = None, - locality: Optional[str] = None + state: str | None = None, + county: str | None = None, + locality: str | None = None ) -> MatchAgencyResponse: """ Returns agencies, if any, that match or partially match the search criteria diff --git a/src/util/alembic_helpers.py b/src/util/alembic_helpers.py index 13327bfd..47a24cac 100644 --- a/src/util/alembic_helpers.py +++ b/src/util/alembic_helpers.py @@ -86,9 +86,9 @@ def updated_at_column() -> sa.Column: comment='The last time the row was updated.' 
) -def url_id_column() -> sa.Column: +def url_id_column(name: str = 'url_id') -> sa.Column: return sa.Column( - 'url_id', + name, sa.Integer(), sa.ForeignKey( 'urls.id', diff --git a/src/util/url_mapper.py b/src/util/url_mapper.py index 15ac6918..17ddb3e6 100644 --- a/src/util/url_mapper.py +++ b/src/util/url_mapper.py @@ -16,9 +16,23 @@ def __init__(self, mappings: list[URLMapping]): def get_id(self, url: str) -> int: return self._url_to_id[url] + def get_ids(self, urls: list[str]) -> list[int]: + return [ + self._url_to_id[url] + for url in urls + ] + def get_url(self, url_id: int) -> str: return self._id_to_url[url_id] + def get_mappings_by_url(self, urls: list[str]) -> list[URLMapping]: + return [ + URLMapping( + url_id=self._url_to_id[url], + url=url + ) for url in urls + ] + def add_mapping(self, mapping: URLMapping) -> None: self._url_to_id[mapping.url] = mapping.url_id self._id_to_url[mapping.url_id] = mapping.url diff --git a/tests/alembic/helpers.py b/tests/alembic/helpers.py index b835c7a9..a284e0fc 100644 --- a/tests/alembic/helpers.py +++ b/tests/alembic/helpers.py @@ -13,7 +13,7 @@ def table_creation_check( alembic_runner: AlembicRunner, tables: list[str], end_revision: str, - start_revision: Optional[str] = None, + start_revision: str | None = None, ) -> None: if start_revision is not None: alembic_runner.upgrade(start_revision) diff --git a/tests/automated/integration/api/example_collector/test_happy_path.py b/tests/automated/integration/api/example_collector/test_happy_path.py index 78d20dce..bbb52789 100644 --- a/tests/automated/integration/api/example_collector/test_happy_path.py +++ b/tests/automated/integration/api/example_collector/test_happy_path.py @@ -6,7 +6,7 @@ from src.api.endpoints.batch.dtos.get.summaries.response import GetBatchSummariesResponse from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.batch.pydantic import 
BatchInfo +from src.db.models.impl.batch.pydantic import BatchInfo from src.collectors.impl.example.dtos.input import ExampleInputDTO from src.collectors.enums import CollectorType from src.core.logger import AsyncCoreLogger diff --git a/tests/automated/integration/api/review/rejection/helpers.py b/tests/automated/integration/api/review/rejection/helpers.py index cd6c8c74..f9619747 100644 --- a/tests/automated/integration/api/review/rejection/helpers.py +++ b/tests/automated/integration/api/review/rejection/helpers.py @@ -2,7 +2,7 @@ from src.api.endpoints.review.next.dto import GetNextURLForFinalReviewOuterResponse from src.api.endpoints.review.reject.dto import FinalReviewRejectionInfo from src.collectors.enums import URLStatus -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.core.sqlalchemy import URL from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review diff --git a/tests/automated/integration/api/review/test_approve_and_get_next_source.py b/tests/automated/integration/api/review/test_approve_and_get_next_source.py index 61ed4add..bfa126b1 100644 --- a/tests/automated/integration/api/review/test_approve_and_get_next_source.py +++ b/tests/automated/integration/api/review/test_approve_and_get_next_source.py @@ -5,10 +5,10 @@ from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.db.constants import PLACEHOLDER_AGENCY_NAME -from src.db.models.instantiations.agency.sqlalchemy import Agency -from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from 
src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review diff --git a/tests/automated/integration/api/test_annotate.py b/tests/automated/integration/api/test_annotate.py index 78dd0f55..51688765 100644 --- a/tests/automated/integration/api/test_annotate.py +++ b/tests/automated/integration/api/test_annotate.py @@ -12,12 +12,12 @@ from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.dtos.url.mapping import URLMapping -from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.core.error_manager.enums import ErrorTypes from src.core.enums import RecordType, SuggestionType, SuggestedStatus from src.core.exceptions import FailedValidationException -from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion from tests.helpers.setup.annotate_agency.model import AnnotateAgencySetupInfo from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency diff --git a/tests/automated/integration/api/test_batch.py b/tests/automated/integration/api/test_batch.py index fc140453..4dd21a49 100644 --- a/tests/automated/integration/api/test_batch.py +++ b/tests/automated/integration/api/test_batch.py @@ -1,6 +1,6 @@ import pytest -from src.db.models.instantiations.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic 
import BatchInfo from src.db.dtos.url.insert import InsertURLsInfo from src.collectors.impl.example.dtos.input import ExampleInputDTO from src.collectors.enums import CollectorType, URLStatus diff --git a/tests/automated/integration/api/test_manual_batch.py b/tests/automated/integration/api/test_manual_batch.py index bdf858f7..9b3fb326 100644 --- a/tests/automated/integration/api/test_manual_batch.py +++ b/tests/automated/integration/api/test_manual_batch.py @@ -2,10 +2,10 @@ import pytest from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInnerInputDTO, ManualBatchInputDTO -from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.batch.sqlalchemy import Batch from src.collectors.enums import CollectorType from src.core.enums import RecordType diff --git a/tests/automated/integration/core/async_/run_task/test_prereq_met.py b/tests/automated/integration/core/async_/run_task/test_prereq_met.py index fa8ed93b..03e3e74c 100644 --- a/tests/automated/integration/core/async_/run_task/test_prereq_met.py +++ b/tests/automated/integration/core/async_/run_task/test_prereq_met.py @@ -9,7 +9,7 @@ from src.core.tasks.url.models.entry import URLTaskEntry from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.db.enums import TaskType -from src.db.models.instantiations.task.core import Task +from src.db.models.impl.task.core import Task from tests.automated.integration.core.async_.helpers import setup_async_core from tests.helpers.data_creator.core import 
DBDataCreator diff --git a/tests/automated/integration/db/client/annotate_url/test_agency_not_in_db.py b/tests/automated/integration/db/client/annotate_url/test_agency_not_in_db.py index 0c261097..c419fb70 100644 --- a/tests/automated/integration/db/client/annotate_url/test_agency_not_in_db.py +++ b/tests/automated/integration/db/client/annotate_url/test_agency_not_in_db.py @@ -1,7 +1,7 @@ import pytest from src.db.constants import PLACEHOLDER_AGENCY_NAME -from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.impl.agency.sqlalchemy import Agency from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency from tests.helpers.data_creator.core import DBDataCreator diff --git a/tests/automated/integration/db/client/approve_url/test_basic.py b/tests/automated/integration/db/client/approve_url/test_basic.py index fb7abae9..2a7f9569 100644 --- a/tests/automated/integration/db/client/approve_url/test_basic.py +++ b/tests/automated/integration/db/client/approve_url/test_basic.py @@ -3,10 +3,10 @@ from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo from src.collectors.enums import URLStatus from src.core.enums import RecordType -from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata -from src.db.models.instantiations.url.reviewing_user import ReviewingUserURL +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.reviewing_user import ReviewingUserURL from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review from tests.helpers.data_creator.core import DBDataCreator diff --git 
a/tests/automated/integration/db/client/test_add_url_error_info.py b/tests/automated/integration/db/client/test_add_url_error_info.py index 55e84836..32564f6b 100644 --- a/tests/automated/integration/db/client/test_add_url_error_info.py +++ b/tests/automated/integration/db/client/test_add_url_error_info.py @@ -1,7 +1,7 @@ import pytest from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo from tests.helpers.data_creator.core import DBDataCreator diff --git a/tests/automated/integration/db/client/test_delete_old_logs.py b/tests/automated/integration/db/client/test_delete_old_logs.py index 61f94af0..44c96075 100644 --- a/tests/automated/integration/db/client/test_delete_old_logs.py +++ b/tests/automated/integration/db/client/test_delete_old_logs.py @@ -2,7 +2,7 @@ import pytest -from src.db.models.instantiations.log.pydantic.info import LogInfo +from src.db.models.impl.log.pydantic.info import LogInfo from tests.helpers.data_creator.core import DBDataCreator diff --git a/tests/automated/integration/db/client/test_delete_url_updated_at.py b/tests/automated/integration/db/client/test_delete_url_updated_at.py index f0bebaaf..3c50c505 100644 --- a/tests/automated/integration/db/client/test_delete_url_updated_at.py +++ b/tests/automated/integration/db/client/test_delete_url_updated_at.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.pydantic.info import URLInfo from tests.helpers.data_creator.core import DBDataCreator diff --git a/tests/automated/integration/db/client/test_insert_logs.py b/tests/automated/integration/db/client/test_insert_logs.py index dff43790..5ac9b9be 100644 --- a/tests/automated/integration/db/client/test_insert_logs.py +++ b/tests/automated/integration/db/client/test_insert_logs.py @@ -1,6 +1,6 @@ import pytest 
-from src.db.models.instantiations.log.pydantic.info import LogInfo +from src.db.models.impl.log.pydantic.info import LogInfo from tests.helpers.data_creator.core import DBDataCreator diff --git a/tests/automated/integration/db/client/test_insert_urls.py b/tests/automated/integration/db/client/test_insert_urls.py index 644261b2..78578c6b 100644 --- a/tests/automated/integration/db/client/test_insert_urls.py +++ b/tests/automated/integration/db/client/test_insert_urls.py @@ -1,11 +1,11 @@ import pytest from src.core.enums import BatchStatus -from src.db.models.instantiations.batch.pydantic import BatchInfo -from src.db.models.instantiations.link.batch_url import LinkBatchURL -from src.db.models.instantiations.url.core.enums import URLSource -from src.db.models.instantiations.url.core.pydantic.info import URLInfo -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.sqlalchemy import URL @pytest.mark.asyncio diff --git a/tests/automated/integration/db/structure/test_upsert_new_agencies.py b/tests/automated/integration/db/structure/test_upsert_new_agencies.py index 0993c7a7..6b377974 100644 --- a/tests/automated/integration/db/structure/test_upsert_new_agencies.py +++ b/tests/automated/integration/db/structure/test_upsert_new_agencies.py @@ -2,7 +2,7 @@ from src.core.enums import SuggestionType from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.impl.agency.sqlalchemy import Agency from tests.helpers.data_creator.core import DBDataCreator diff --git a/tests/automated/integration/html_tag_collector/test_root_url_cache.py 
b/tests/automated/integration/html_tag_collector/test_root_url_cache.py deleted file mode 100644 index 0add726e..00000000 --- a/tests/automated/integration/html_tag_collector/test_root_url_cache.py +++ /dev/null @@ -1,19 +0,0 @@ -import pytest - -from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache -from src.core.tasks.url.operators.html.scraper.root_url_cache.dtos.response import RootURLCacheResponseInfo - - -async def mock_get_request(url: str) -> RootURLCacheResponseInfo: - return RootURLCacheResponseInfo(text="Test Title") - -@pytest.mark.asyncio -async def test_root_url_cache_happy_path(wiped_database): - cache = RootURLCache() - cache.get_request = mock_get_request - title = await cache.get_title("https://example.com") - assert title == "Test Title" - - # Check that entry is in database - d = await cache.adb_client.load_root_url_cache() - assert d["https://example.com"] == "Test Title" \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py index e782bd42..8e01c86b 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py @@ -1,8 +1,8 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.instantiations.url.core.enums import URLSource -from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.compression import compress_html from 
tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.entry import \ diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/existence_checker.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/existence_checker.py index 44da9b6f..a38cbaa6 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/existence_checker.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/existence_checker.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.impl.agency.sqlalchemy import Agency from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo from tests.automated.integration.tasks.scheduled.impl.sync.agency.data import FIRST_CALL_RESPONSE, SECOND_CALL_RESPONSE diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/helpers.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/helpers.py index 0fbe64bc..6b1a8544 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/helpers.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/helpers.py @@ -5,8 +5,8 @@ from sqlalchemy import select, func, TIMESTAMP, cast, update from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.agency.sqlalchemy import Agency -from src.db.models.instantiations.state.sync.agencies import AgenciesSyncState +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.state.sync.agencies import AgenciesSyncState from src.external.pdap.client import PDAPClient from tests.automated.integration.tasks.scheduled.impl.sync.agency.data import PREEXISTING_AGENCIES diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_happy_path.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_happy_path.py index 8b3d8294..9fadf6ca 100644 --- 
a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_happy_path.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_happy_path.py @@ -5,7 +5,7 @@ from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator -from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.impl.agency.sqlalchemy import Agency from tests.automated.integration.tasks.scheduled.impl.sync.agency.data import AGENCIES_SYNC_RESPONSES from tests.automated.integration.tasks.scheduled.impl.sync.agency.existence_checker import AgencyChecker from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import check_sync_concluded, patch_sync_agencies diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_interruption.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_interruption.py index d1af6417..db7f74b5 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_interruption.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_interruption.py @@ -3,8 +3,8 @@ from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator from src.core.tasks.url.enums import TaskOperatorOutcome -from src.db.models.instantiations.agency.sqlalchemy import Agency -from src.db.models.instantiations.state.sync.agencies import AgenciesSyncState +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.state.sync.agencies import AgenciesSyncState from tests.automated.integration.tasks.scheduled.impl.sync.agency.data import FIRST_CALL_RESPONSE, \ THIRD_CALL_RESPONSE, SECOND_CALL_RESPONSE from tests.automated.integration.tasks.scheduled.impl.sync.agency.existence_checker import AgencyChecker diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_no_new_results.py 
b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_no_new_results.py index 9fdd88bb..68225a51 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_no_new_results.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_no_new_results.py @@ -6,8 +6,8 @@ from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator -from src.db.models.instantiations.agency.sqlalchemy import Agency -from src.db.models.instantiations.state.sync.agencies import AgenciesSyncState +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.state.sync.agencies import AgenciesSyncState from tests.automated.integration.tasks.scheduled.impl.sync.agency.data import THIRD_CALL_RESPONSE from tests.automated.integration.tasks.scheduled.impl.sync.agency.existence_checker import AgencyChecker from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import patch_sync_agencies, check_sync_concluded diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/check.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/check.py index e5a3c4ba..12428d7d 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/check.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/check.py @@ -3,8 +3,8 @@ from sqlalchemy import select, cast, func, TIMESTAMP from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.state.sync.data_sources import DataSourcesSyncState -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState +from src.db.models.impl.url.core.sqlalchemy import URL async def check_sync_concluded( diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/existence_checker.py 
b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/existence_checker.py index d034def8..4007c38d 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/existence_checker.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/existence_checker.py @@ -1,8 +1,8 @@ from collections import defaultdict -from src.db.models.instantiations.link.url_agency_.sqlalchemy import LinkURLAgency -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.link.url_agency_.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/agency.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/agency.py index c7a0ad41..0321aec9 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/agency.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/agency.py @@ -1,7 +1,7 @@ from sqlalchemy import select from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.agency.sqlalchemy import Agency +from src.db.models.impl.agency.sqlalchemy import Agency from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import AgencyAssigned diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/check.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/check.py index 8ed045e8..ad1bc4c0 100644 --- 
a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/check.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/check.py @@ -2,8 +2,8 @@ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import selectinload -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource from src.db.queries.base.builder import QueryBuilderBase from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.post import TestURLPostSetupRecord from src.db.helpers.session import session_helper as sh diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/url.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/url.py index 0a5d15b9..81eaa50f 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/url.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/url.py @@ -1,9 +1,9 @@ from pendulum import today from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.instantiations.url.core.enums import URLSource -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import AgencyAssigned from 
tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.manager.agency import AgencyAssignmentManager diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_interruption.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_interruption.py index 4b98094f..997859b5 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_interruption.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_interruption.py @@ -3,7 +3,7 @@ from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator from src.core.tasks.url.enums import TaskOperatorOutcome -from src.db.models.instantiations.state.sync.data_sources import DataSourcesSyncState +from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.check import check_sync_concluded from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import patch_sync_data_sources from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.data import ENTRIES diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_no_new_results.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_no_new_results.py index d3181f90..fe69cc57 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_no_new_results.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_no_new_results.py @@ -5,7 +5,7 @@ from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters -from src.db.models.instantiations.state.sync.data_sources import DataSourcesSyncState +from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState from 
tests.automated.integration.tasks.scheduled.impl.sync.data_sources.check import check_sync_concluded from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import patch_sync_data_sources from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.data import ENTRIES diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/asserts.py b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/asserts.py index c96aa4db..b3a24dc3 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/asserts.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/asserts.py @@ -1,6 +1,6 @@ from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.agency.sqlalchemy import Agency -from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion async def assert_expected_confirmed_and_auto_suggestions(adb_client: AsyncDatabaseClient): diff --git a/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py b/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py index 9b7d2274..cfa60cf8 100644 --- a/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py +++ b/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py @@ -4,9 +4,9 @@ from src.collectors.enums import URLStatus from src.db.enums import TaskType -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo -from src.db.models.instantiations.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.error_info.sqlalchemy import 
URLErrorInfo +from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from tests.automated.integration.tasks.url.impl.asserts import assert_prereqs_not_met, assert_url_task_has_expected_run_info, \ assert_prereqs_met from tests.automated.integration.tasks.url.impl.auto_relevant.setup import setup_operator, setup_urls diff --git a/tests/automated/integration/tasks/url/impl/html/check/manager.py b/tests/automated/integration/tasks/url/impl/html/check/manager.py index 489d7cd8..deb0fa11 100644 --- a/tests/automated/integration/tasks/url/impl/html/check/manager.py +++ b/tests/automated/integration/tasks/url/impl/html/check/manager.py @@ -1,8 +1,8 @@ from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.html.compressed.sqlalchemy import URLCompressedHTML -from src.db.models.instantiations.url.scrape_info.sqlalchemy import URLScrapeInfo -from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.scrape_info.sqlalchemy import URLScrapeInfo +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from tests.automated.integration.tasks.url.impl.html.setup.models.record import TestURLHTMLTaskSetupRecord diff --git a/tests/automated/integration/tasks/url/impl/html/setup/data.py b/tests/automated/integration/tasks/url/impl/html/setup/data.py index 7d3f0028..e9495ad4 100644 --- a/tests/automated/integration/tasks/url/impl/html/setup/data.py +++ b/tests/automated/integration/tasks/url/impl/html/setup/data.py @@ -1,7 +1,7 @@ from http import HTTPStatus from src.collectors.enums import URLStatus -from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus +from src.db.models.impl.url.scrape_info.enums import ScrapeStatus 
from tests.automated.integration.tasks.url.impl.html.setup.models.entry import TestURLHTMLTaskSetupEntry, TestURLInfo, \ TestWebMetadataInfo, ExpectedResult, TestErrorType diff --git a/tests/automated/integration/tasks/url/impl/html/setup/manager.py b/tests/automated/integration/tasks/url/impl/html/setup/manager.py index 718149b9..986a9f7e 100644 --- a/tests/automated/integration/tasks/url/impl/html/setup/manager.py +++ b/tests/automated/integration/tasks/url/impl/html/setup/manager.py @@ -3,12 +3,11 @@ from src.core.enums import RecordType from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser -from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.enums import URLSource -from src.db.models.instantiations.url.core.pydantic.insert import URLInsertModel -from src.db.models.instantiations.url.web_metadata.insert import URLWebMetadataPydantic -from tests.automated.integration.tasks.url.impl.html.mocks.methods import mock_get_from_cache, mock_parse +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.insert import URLInsertModel +from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic +from tests.automated.integration.tasks.url.impl.html.mocks.methods import mock_parse from tests.automated.integration.tasks.url.impl.html.mocks.url_request_interface.core import MockURLRequestInterface from tests.automated.integration.tasks.url.impl.html.setup.data import TEST_ENTRIES from tests.automated.integration.tasks.url.impl.html.setup.models.record import TestURLHTMLTaskSetupRecord @@ -68,18 +67,8 @@ async def setup_web_metadata( models.append(model) await self.adb_client.bulk_insert(models) - - -async def setup_mocked_root_url_cache() -> RootURLCache: - mock_root_url_cache = 
RootURLCache() - mock_root_url_cache.get_from_cache = types.MethodType(mock_get_from_cache, mock_root_url_cache) - return mock_root_url_cache - - async def setup_operator() -> URLHTMLTaskOperator: - html_parser = HTMLResponseParser( - root_url_cache=await setup_mocked_root_url_cache() - ) + html_parser = HTMLResponseParser() html_parser.parse = types.MethodType(mock_parse, html_parser) operator = URLHTMLTaskOperator( adb_client=AsyncDatabaseClient(), diff --git a/tests/automated/integration/tasks/url/impl/html/setup/models/entry.py b/tests/automated/integration/tasks/url/impl/html/setup/models/entry.py index 8cc2a8ad..287bb52c 100644 --- a/tests/automated/integration/tasks/url/impl/html/setup/models/entry.py +++ b/tests/automated/integration/tasks/url/impl/html/setup/models/entry.py @@ -4,7 +4,7 @@ from pydantic import BaseModel from src.collectors.enums import URLStatus -from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus +from src.db.models.impl.url.scrape_info.enums import ScrapeStatus class TestErrorType(Enum): diff --git a/tests/automated/integration/tasks/url/impl/probe/check/manager.py b/tests/automated/integration/tasks/url/impl/probe/check/manager.py index 01c835c9..a8d89ba5 100644 --- a/tests/automated/integration/tasks/url/impl/probe/check/manager.py +++ b/tests/automated/integration/tasks/url/impl/probe/check/manager.py @@ -2,9 +2,9 @@ from src.collectors.enums import URLStatus from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.link.url_redirect_url.sqlalchemy import LinkURLRedirectURL -from src.db.models.instantiations.url.core.sqlalchemy import URL -from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.models.impl.link.url_redirect_url.sqlalchemy import LinkURLRedirectURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata class TestURLProbeCheckManager: diff --git 
a/tests/automated/integration/tasks/url/impl/probe/constants.py b/tests/automated/integration/tasks/url/impl/probe/constants.py index 1a6e0e7b..6c218e25 100644 --- a/tests/automated/integration/tasks/url/impl/probe/constants.py +++ b/tests/automated/integration/tasks/url/impl/probe/constants.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.url.core.enums import URLSource +from src.db.models.impl.url.core.enums import URLSource PATCH_ROOT = "src.external.url_request.core.URLProbeManager" TEST_URL = "https://www.example.com" diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py index aa531de0..75595ed4 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py @@ -1,7 +1,7 @@ import pytest from src.collectors.enums import URLStatus -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.core.sqlalchemy import URL from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py index 398b6828..75847c4a 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py @@ -1,7 +1,7 @@ import pytest from src.collectors.enums import URLStatus -from src.db.models.instantiations.url.web_metadata.insert import URLWebMetadataPydantic +from src.db.models.impl.url.web_metadata.insert 
import URLWebMetadataPydantic from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager from tests.automated.integration.tasks.url.impl.probe.constants import TEST_DEST_URL diff --git a/tests/automated/integration/tasks/url/impl/probe/setup/manager.py b/tests/automated/integration/tasks/url/impl/probe/setup/manager.py index 746e3ca1..50405970 100644 --- a/tests/automated/integration/tasks/url/impl/probe/setup/manager.py +++ b/tests/automated/integration/tasks/url/impl/probe/setup/manager.py @@ -3,7 +3,7 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.pydantic.insert import URLInsertModel +from src.db.models.impl.url.core.pydantic.insert import URLInsertModel from src.external.url_request.core import URLRequestInterface from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.response import URLProbeResponse diff --git a/tests/automated/integration/tasks/url/impl/root_url/__init__.py b/tests/automated/integration/tasks/url/impl/root_url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/root_url/conftest.py b/tests/automated/integration/tasks/url/impl/root_url/conftest.py new file mode 100644 index 00000000..16b7012e --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/root_url/conftest.py @@ -0,0 +1,9 @@ +import pytest + +from src.core.tasks.url.operators.root_url.core import URLRootURLTaskOperator +from src.db.client.async_ import AsyncDatabaseClient + + +@pytest.fixture +def operator(adb_client_test: AsyncDatabaseClient) -> URLRootURLTaskOperator: + return URLRootURLTaskOperator(adb_client=adb_client_test) \ No newline at 
@pytest.mark.asyncio
async def test_branch_root_url_in_db(
    operator: URLRootURLTaskOperator
):
    """
    If a URL is a branch URL,
    with the root URL in the database,
    it should be linked to that root URL and not pulled again.

    Steps:
    1. Insert the root URL and flag it as a root URL.
    2. Insert a branch URL that sits under that root.
    3. Run the task and verify that exactly one link (branch -> root)
       exists and that no additional root flag was created.
    """
    # Check prerequisites not yet met
    assert not await operator.meets_task_prerequisites()

    # Add URL that is a root URL, and mark as such
    url_insert_model_root = URLInsertModel(
        url=ROOT_URL,
        source=URLSource.DATA_SOURCES
    )
    root_url_id = (await operator.adb_client.bulk_insert([url_insert_model_root], return_ids=True))[0]
    root_model_flag_insert = FlagRootURLPydantic(
        url_id=root_url_id
    )
    await operator.adb_client.bulk_insert([root_model_flag_insert])

    # Add URL that is a branch of the root URL
    url_insert_model = URLInsertModel(
        url=BRANCH_URL,
        source=URLSource.COLLECTOR
    )
    branch_url_id = (await operator.adb_client.bulk_insert([url_insert_model], return_ids=True))[0]

    # Check prerequisites are now met
    assert await operator.meets_task_prerequisites()

    # Run task
    run_info = await operator.run_task(1)
    assert_task_ran_without_error(run_info)

    # Check task prerequisites no longer met
    assert not await operator.meets_task_prerequisites()

    # Check for a single link, going from the branch URL to the existing root URL
    links: list[LinkURLRootURL] = await operator.adb_client.get_all(LinkURLRootURL)
    assert len(links) == 1
    assert links[0].url_id == branch_url_id
    # The link must point at the pre-existing root, not merely exist
    assert links[0].root_url_id == root_url_id

    # Check for only one flag, for the root URL
    flags: list[FlagRootURL] = await operator.adb_client.get_all(FlagRootURL)
    assert len(flags) == 1
    assert flags[0].url_id == root_url_id
BRANCH_URL, ROOT_URL + + +@pytest.mark.asyncio +async def test_branch_root_url_not_in_db( + operator: URLRootURLTaskOperator +): + """ + If a URL is a branch URL, + with the root URL not in the database, + Add the root URL and mark it as such + and add the link to the root URL for the branch + """ + # Check prerequisites not yet met + assert not await operator.meets_task_prerequisites() + + # Add URL that is a branch of a root URL + url_insert_model = URLInsertModel( + url=BRANCH_URL, + source=URLSource.COLLECTOR + ) + branch_url_id = (await operator.adb_client.bulk_insert([url_insert_model], return_ids=True))[0] + + # Check prerequisites are now met + assert await operator.meets_task_prerequisites() + + # Run task + run_info = await operator.run_task(1) + assert_task_ran_without_error(run_info) + + # Check task prerequisites no longer met + assert not await operator.meets_task_prerequisites() + + # Check for presence of root URL with proper source and flag + urls: list[URL] = await operator.adb_client.get_all(URL) + root_url = next(url for url in urls if url.url == ROOT_URL) + assert root_url.source == URLSource.ROOT_URL + + # Check for presence of link for branch URL + links: list[LinkURLRootURL] = await operator.adb_client.get_all(LinkURLRootURL) + assert len(links) == 1 + link = next(link for link in links if link.url_id == branch_url_id) + assert link.root_url_id == root_url.id + + # Check for absence of flag for branch URL + flags: list[FlagRootURL] = await operator.adb_client.get_all(FlagRootURL) + assert len(flags) == 1 + flag = next(flag for flag in flags if flag.url_id == root_url.id) + assert flag \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_is_root_url.py b/tests/automated/integration/tasks/url/impl/root_url/test_is_root_url.py new file mode 100644 index 00000000..e815f564 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/root_url/test_is_root_url.py @@ -0,0 +1,47 @@ +import pytest + 
@pytest.mark.asyncio
async def test_is_root_url(
    operator: URLRootURLTaskOperator
):
    """
    A URL that is itself a root URL should be flagged as a root URL,
    receive no root-URL link, and no longer satisfy the task prerequisites
    once the task has run.
    """
    adb_client = operator.adb_client

    # Nothing in the database yet, so there is nothing for the task to do
    assert not await operator.meets_task_prerequisites()

    # Insert a single URL that is already a root URL
    inserted_ids = await adb_client.bulk_insert(
        [
            URLInsertModel(
                url=ROOT_URL,
                source=URLSource.DATA_SOURCES
            )
        ],
        return_ids=True
    )
    root_id = inserted_ids[0]

    # The unprocessed URL makes the task eligible to run
    assert await operator.meets_task_prerequisites()

    # Run the task and confirm it completed without error
    task_result = await operator.run_task(1)
    assert_task_ran_without_error(task_result)

    # The URL has been processed, so the task is no longer eligible
    assert not await operator.meets_task_prerequisites()

    # A root URL is not linked to any other root URL
    link_rows: list[LinkURLRootURL] = await adb_client.get_all(LinkURLRootURL)
    assert len(link_rows) == 0

    # Exactly one flag exists, and it belongs to the inserted URL
    flag_rows: list[FlagRootURL] = await adb_client.get_all(FlagRootURL)
    assert [flag.url_id for flag in flag_rows] == [root_id]
@@ +import pytest + +from src.core.tasks.url.operators.root_url.core import URLRootURLTaskOperator +from src.db.models.impl.flag.root_url.pydantic import FlagRootURLPydantic +from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.insert import URLInsertModel +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error +from tests.automated.integration.tasks.url.impl.root_url.constants import ROOT_URL, BRANCH_URL, SECOND_BRANCH_URL + + +@pytest.mark.asyncio +async def test_two_branches_one_root_in_db( + operator: URLRootURLTaskOperator +): + """ + If two URLs are branches of a ROOT URL that is already in the database, + Both URLs should be linked to the ROOT URL + """ + # Check prerequisites not yet met + assert not await operator.meets_task_prerequisites() + + # Add root URL and mark as such + url_insert_model_root = URLInsertModel( + url=ROOT_URL, + source=URLSource.DATA_SOURCES + ) + url_id_root = (await operator.adb_client.bulk_insert([url_insert_model_root], return_ids=True))[0] + root_model_flag_insert = FlagRootURLPydantic( + url_id=url_id_root + ) + await operator.adb_client.bulk_insert([root_model_flag_insert]) + + # Add two URLs that are branches of that root URL + url_insert_model_branch_1 = URLInsertModel( + url=BRANCH_URL, + source=URLSource.COLLECTOR + ) + url_id_branch_1 = (await operator.adb_client.bulk_insert([url_insert_model_branch_1], return_ids=True))[0] + + url_insert_model_branch_2 = URLInsertModel( + url=SECOND_BRANCH_URL, + source=URLSource.COLLECTOR + ) + url_id_branch_2 = (await operator.adb_client.bulk_insert([url_insert_model_branch_2], return_ids=True))[0] + + # Check prerequisites are now met + assert await operator.meets_task_prerequisites() + + # Run task + run_info = await operator.run_task(1) + assert_task_ran_without_error(run_info) + + # Check task prerequisites no longer met + 
@pytest.mark.asyncio
async def test_two_branches_one_root_in_db_not_flagged(
    operator: URLRootURLTaskOperator
):
    """
    If two URLs are branches of a ROOT URL that is already in the database
    but not flagged as such,
    Both URLs should be linked to the ROOT URL
    and the Root URL should be flagged
    """
    # Check prerequisites not yet met
    assert not await operator.meets_task_prerequisites()

    # Add root URL but do not mark as such
    url_insert_model_root = URLInsertModel(
        url=ROOT_URL,
        source=URLSource.DATA_SOURCES
    )
    url_id_root = (await operator.adb_client.bulk_insert([url_insert_model_root], return_ids=True))[0]

    # Add two URLs that are branches of that root URL
    url_insert_model_branch_1 = URLInsertModel(
        url=BRANCH_URL,
        source=URLSource.COLLECTOR
    )
    url_id_branch_1 = (await operator.adb_client.bulk_insert([url_insert_model_branch_1], return_ids=True))[0]

    url_insert_model_branch_2 = URLInsertModel(
        url=SECOND_BRANCH_URL,
        source=URLSource.COLLECTOR
    )
    url_id_branch_2 = (await operator.adb_client.bulk_insert([url_insert_model_branch_2], return_ids=True))[0]

    # Check prerequisites are now met
    assert await operator.meets_task_prerequisites()

    # Run task
    run_info = await operator.run_task(1)
    assert_task_ran_without_error(run_info)

    # Check task prerequisites no longer met
    assert not await operator.meets_task_prerequisites()

    # Check for presence of separate links for both branch URLs
    links: list[LinkURLRootURL] = await operator.adb_client.get_all(LinkURLRootURL)
    assert len(links) == 2
    # Check both URLs are present
    assert {link.url_id for link in links} == {url_id_branch_1, url_id_branch_2}
    # Check both URLs are linked to the root URL
    # (every link must target the root; a membership check would pass
    # even if only one of the two links pointed at it)
    assert all(link.root_url_id == url_id_root for link in links)

    # The pre-existing root URL must now carry the only root flag
    flags: list[FlagRootURL] = await operator.adb_client.get_all(FlagRootURL)
    assert len(flags) == 1
    assert flags[0].url_id == url_id_root
@pytest.mark.asyncio
async def test_two_branches_one_root_in_db_not_flagged(
    operator: URLRootURLTaskOperator
):
    """
    If two URLs are branches of a ROOT URL that is not already in the database,
    Both URLs, along with the Root URL, should be added to the database
    and the Root URL should be flagged as such
    """
    # NOTE(review): this function name duplicates the test in
    # test_two_branches_one_root_in_db_not_flagged.py; this module covers the
    # "root not in db" case, so consider renaming it to
    # test_two_branches_one_root_not_in_db.

    # Imported locally because this module does not import these names
    # at file level.
    from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL
    from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL
    from src.db.models.impl.url.core.sqlalchemy import URL
    from tests.automated.integration.tasks.url.impl.root_url.constants import ROOT_URL

    # Check prerequisites not yet met
    assert not await operator.meets_task_prerequisites()

    # Add two URLs that are branches of a root URL
    url_insert_model_branch_1 = URLInsertModel(
        url=BRANCH_URL,
        source=URLSource.COLLECTOR
    )
    url_id_branch_1 = (await operator.adb_client.bulk_insert([url_insert_model_branch_1], return_ids=True))[0]

    url_insert_model_branch_2 = URLInsertModel(
        url=SECOND_BRANCH_URL,
        source=URLSource.COLLECTOR
    )
    url_id_branch_2 = (await operator.adb_client.bulk_insert([url_insert_model_branch_2], return_ids=True))[0]

    # Check prerequisites are now met
    assert await operator.meets_task_prerequisites()

    # Run task
    run_info = await operator.run_task(1)
    assert_task_ran_without_error(run_info)

    # Check task prerequisites no longer met
    assert not await operator.meets_task_prerequisites()

    # Check the shared root URL was added exactly once, with the root source
    urls: list[URL] = await operator.adb_client.get_all(URL)
    root_urls = [url for url in urls if url.url == ROOT_URL]
    assert len(root_urls) == 1
    root_url = root_urls[0]
    assert root_url.source == URLSource.ROOT_URL

    # Check both branch URLs are linked to that single root URL
    links: list[LinkURLRootURL] = await operator.adb_client.get_all(LinkURLRootURL)
    assert len(links) == 2
    assert {link.url_id for link in links} == {url_id_branch_1, url_id_branch_2}
    assert all(link.root_url_id == root_url.id for link in links)

    # Check only the root URL is flagged as a root URL
    flags: list[FlagRootURL] = await operator.adb_client.get_all(FlagRootURL)
    assert len(flags) == 1
    assert flags[0].url_id == root_url.id
src.db.models.instantiations.url.error_info.sqlalchemy import URLErrorInfo -from src.db.models.instantiations.url.data_source.sqlalchemy import URLDataSource -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.error_info.sqlalchemy import URLErrorInfo +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.core.sqlalchemy import URL from src.collectors.enums import URLStatus from src.core.tasks.url.enums import TaskOperatorOutcome from tests.automated.integration.tasks.url.impl.submit_approved.mock import mock_make_request diff --git a/tests/automated/integration/tasks/url/impl/test_url_404_probe.py b/tests/automated/integration/tasks/url/impl/test_url_404_probe.py index 5c2d4d7e..698c9c59 100644 --- a/tests/automated/integration/tasks/url/impl/test_url_404_probe.py +++ b/tests/automated/integration/tasks/url/impl/test_url_404_probe.py @@ -7,8 +7,8 @@ from src.core.tasks.url.operators.probe_404.core import URL404ProbeTaskOperator from src.external.url_request.core import URLRequestInterface -from src.db.models.instantiations.url.probed_for_404 import URLProbedFor404 -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.probed_for_404 import URLProbedFor404 +from src.db.models.impl.url.core.sqlalchemy import URL from src.collectors.enums import URLStatus from src.core.tasks.url.enums import TaskOperatorOutcome from src.external.url_request.dtos.url_response import URLResponseInfo diff --git a/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py b/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py index 6e95fccb..5c6e32ac 100644 --- a/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py +++ b/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py @@ -3,8 +3,8 @@ import pytest from 
src.core.tasks.url.operators.misc_metadata.core import URLMiscellaneousMetadataTaskOperator -from src.db.models.instantiations.url.optional_data_source_metadata import URLOptionalDataSourceMetadata -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.core.sqlalchemy import URL from src.collectors.enums import CollectorType from src.core.tasks.url.enums import TaskOperatorOutcome from tests.helpers.data_creator.core import DBDataCreator diff --git a/tests/automated/integration/tasks/url/impl/test_url_record_type_task.py b/tests/automated/integration/tasks/url/impl/test_url_record_type_task.py index 3ea95811..1259441e 100644 --- a/tests/automated/integration/tasks/url/impl/test_url_record_type_task.py +++ b/tests/automated/integration/tasks/url/impl/test_url_record_type_task.py @@ -3,7 +3,7 @@ import pytest from src.db.enums import TaskType -from src.db.models.instantiations.url.suggestion.record_type.auto import AutoRecordTypeSuggestion +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.core.tasks.url.enums import TaskOperatorOutcome from src.core.tasks.url.operators.record_type.core import URLRecordTypeTaskOperator from src.core.enums import RecordType diff --git a/tests/automated/integration/tasks/url/loader/test_flags.py b/tests/automated/integration/tasks/url/loader/test_flags.py index 68e8862a..43164d9e 100644 --- a/tests/automated/integration/tasks/url/loader/test_flags.py +++ b/tests/automated/integration/tasks/url/loader/test_flags.py @@ -11,6 +11,7 @@ from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator from src.core.tasks.url.operators.probe_404.core import URL404ProbeTaskOperator from src.core.tasks.url.operators.record_type.core import URLRecordTypeTaskOperator +from src.core.tasks.url.operators.root_url.core import URLRootURLTaskOperator from 
src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator @@ -55,6 +56,10 @@ class Config: env_var="URL_PROBE_TASK_FLAG", operator=URLProbeTaskOperator ), + FlagTestParams( + env_var="URL_ROOT_URL_TASK_FLAG", + operator=URLRootURLTaskOperator + ) ] @pytest.mark.asyncio diff --git a/tests/automated/integration/tasks/url/loader/test_happy_path.py b/tests/automated/integration/tasks/url/loader/test_happy_path.py index 639eb0ae..769204d7 100644 --- a/tests/automated/integration/tasks/url/loader/test_happy_path.py +++ b/tests/automated/integration/tasks/url/loader/test_happy_path.py @@ -2,7 +2,7 @@ from src.core.tasks.url.loader import URLTaskOperatorLoader -NUMBER_OF_TASK_OPERATORS = 8 +NUMBER_OF_TASK_OPERATORS = 9 @pytest.mark.asyncio async def test_happy_path( diff --git a/tests/automated/unit/core/test_core_logger.py b/tests/automated/unit/core/test_core_logger.py index 580f18bd..6c4f0375 100644 --- a/tests/automated/unit/core/test_core_logger.py +++ b/tests/automated/unit/core/test_core_logger.py @@ -3,7 +3,7 @@ import pytest -from src.db.models.instantiations.log.pydantic.info import LogInfo +from src.db.models.impl.log.pydantic.info import LogInfo from src.core.logger import AsyncCoreLogger diff --git a/tests/automated/unit/source_collectors/test_autogoogler_collector.py b/tests/automated/unit/source_collectors/test_autogoogler_collector.py index 99395476..cc191dc3 100644 --- a/tests/automated/unit/source_collectors/test_autogoogler_collector.py +++ b/tests/automated/unit/source_collectors/test_autogoogler_collector.py @@ -7,8 +7,8 @@ from src.db.client.async_ import AsyncDatabaseClient from src.core.logger import AsyncCoreLogger from src.collectors.impl.auto_googler.collector import AutoGooglerCollector -from src.db.models.instantiations.url.core.enums import URLSource -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.enums import URLSource +from 
src.db.models.impl.url.core.pydantic.info import URLInfo @pytest.fixture diff --git a/tests/automated/unit/source_collectors/test_common_crawl_collector.py b/tests/automated/unit/source_collectors/test_common_crawl_collector.py index 2757227b..0a10680f 100644 --- a/tests/automated/unit/source_collectors/test_common_crawl_collector.py +++ b/tests/automated/unit/source_collectors/test_common_crawl_collector.py @@ -6,8 +6,8 @@ from src.db.client.async_ import AsyncDatabaseClient from src.core.logger import AsyncCoreLogger from src.collectors.impl.common_crawler.collector import CommonCrawlerCollector -from src.db.models.instantiations.url.core.enums import URLSource -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.info import URLInfo @pytest.fixture diff --git a/tests/automated/unit/source_collectors/test_muckrock_collectors.py b/tests/automated/unit/source_collectors/test_muckrock_collectors.py index bb194d22..6c845b8e 100644 --- a/tests/automated/unit/source_collectors/test_muckrock_collectors.py +++ b/tests/automated/unit/source_collectors/test_muckrock_collectors.py @@ -10,8 +10,8 @@ from src.collectors.impl.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO from src.collectors.impl.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO from src.collectors.impl.muckrock.fetch_requests.foia import FOIAFetchRequest -from src.db.models.instantiations.url.core.enums import URLSource -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.info import URLInfo PATCH_ROOT = "src.collectors.impl.muckrock" diff --git a/tests/helpers/data_creator/commands/impl/batch.py b/tests/helpers/data_creator/commands/impl/batch.py index 09cdbe61..69583a45 100644 --- 
a/tests/helpers/data_creator/commands/impl/batch.py +++ b/tests/helpers/data_creator/commands/impl/batch.py @@ -3,7 +3,7 @@ from src.collectors.enums import CollectorType from src.core.enums import BatchStatus -from src.db.models.instantiations.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic import BatchInfo from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase diff --git a/tests/helpers/data_creator/commands/impl/html_data.py b/tests/helpers/data_creator/commands/impl/html_data.py index dd947d65..c548eb5a 100644 --- a/tests/helpers/data_creator/commands/impl/html_data.py +++ b/tests/helpers/data_creator/commands/impl/html_data.py @@ -1,8 +1,8 @@ from src.db.dtos.url.html_content import URLHTMLContentInfo -from src.db.models.instantiations.url.html.content.enums import HTMLContentType +from src.db.models.impl.url.html.content.enums import HTMLContentType from src.db.dtos.url.raw_html import RawHTMLInfo -from src.db.models.instantiations.url.scrape_info.enums import ScrapeStatus -from src.db.models.instantiations.url.scrape_info.pydantic import URLScrapeInfoInsertModel +from src.db.models.impl.url.scrape_info.enums import ScrapeStatus +from src.db.models.impl.url.scrape_info.pydantic import URLScrapeInfoInsertModel from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.data_creator.models.clients import DBDataCreatorClientContainer diff --git a/tests/helpers/data_creator/commands/impl/suggestion/auto/relevant.py b/tests/helpers/data_creator/commands/impl/suggestion/auto/relevant.py index 58dfc8fb..2e31491d 100644 --- a/tests/helpers/data_creator/commands/impl/suggestion/auto/relevant.py +++ b/tests/helpers/data_creator/commands/impl/suggestion/auto/relevant.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput +from src.db.models.impl.url.suggestion.relevant.auto.pydantic.input import 
AutoRelevancyAnnotationInput from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase diff --git a/tests/helpers/data_creator/commands/impl/url_metadata.py b/tests/helpers/data_creator/commands/impl/url_metadata.py index 608bc403..161d5631 100644 --- a/tests/helpers/data_creator/commands/impl/url_metadata.py +++ b/tests/helpers/data_creator/commands/impl/url_metadata.py @@ -1,6 +1,6 @@ from http import HTTPStatus -from src.db.models.instantiations.url.web_metadata.insert import URLWebMetadataPydantic +from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase diff --git a/tests/helpers/data_creator/commands/impl/urls.py b/tests/helpers/data_creator/commands/impl/urls.py index 3e886e34..ee9ef954 100644 --- a/tests/helpers/data_creator/commands/impl/urls.py +++ b/tests/helpers/data_creator/commands/impl/urls.py @@ -3,8 +3,8 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.db.dtos.url.insert import InsertURLsInfo -from src.db.models.instantiations.url.core.enums import URLSource -from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.info import URLInfo from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.simple_test_data_functions import generate_test_urls diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index d22fc1f9..096bad32 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -5,9 +5,9 @@ from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.db.client.async_ import AsyncDatabaseClient -from 
src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo +from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo -from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo from src.db.client.sync import DatabaseClient from src.db.enums import TaskType from src.collectors.enums import CollectorType, URLStatus diff --git a/tests/helpers/setup/populate.py b/tests/helpers/setup/populate.py index 6b214bf2..02c364d6 100644 --- a/tests/helpers/setup/populate.py +++ b/tests/helpers/setup/populate.py @@ -1,5 +1,5 @@ from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.models.impl.url.core.sqlalchemy import URL async def populate_database(adb_client: AsyncDatabaseClient) -> None: diff --git a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py index 0536a1d9..584facdd 100644 --- a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py +++ b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py @@ -2,7 +2,7 @@ import dotenv -from src.db.models.instantiations.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic import BatchInfo from src.collectors import CollectorType from src.core.enums import BatchStatus from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion diff --git a/tests/manual/core/lifecycle/test_ckan_lifecycle.py b/tests/manual/core/lifecycle/test_ckan_lifecycle.py index 84c4c430..9a896392 100644 --- a/tests/manual/core/lifecycle/test_ckan_lifecycle.py +++ b/tests/manual/core/lifecycle/test_ckan_lifecycle.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic import 
BatchInfo from src.collectors import CollectorType from src.core.enums import BatchStatus from src.collectors.impl.ckan import group_search, package_search, organization_search diff --git a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py index 2e4e0227..417e7240 100644 --- a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py +++ b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py @@ -1,4 +1,4 @@ -from src.db.models.instantiations.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic import BatchInfo from src.collectors import CollectorType from src.core.enums import BatchStatus from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion diff --git a/tests/manual/core/tasks/test_url_html_task_operator.py b/tests/manual/core/tasks/test_url_html_task_operator.py index b6031d77..e0a409e3 100644 --- a/tests/manual/core/tasks/test_url_html_task_operator.py +++ b/tests/manual/core/tasks/test_url_html_task_operator.py @@ -4,7 +4,6 @@ from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser from src.external.url_request.core import URLRequestInterface -from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache @pytest.mark.asyncio @@ -24,11 +23,7 @@ async def test_url_html_task_operator( "https://www.albanyca.org/departments/police-department/policies-procedures-training-sb978", "https://www.yelp.com/biz/albany-police-department-albany-3", ] - parser = HTMLResponseParser( - root_url_cache=RootURLCache( - adb_client=adb_client_test - ) - ) + parser = HTMLResponseParser() manual_batch_dto = ManualBatchInputDTO( name="Test Batch", entries=[ diff --git a/tests/manual/html_collector/test_html_tag_collector_integration.py b/tests/manual/html_collector/test_html_tag_collector_integration.py index 
d7942b4a..6cdaf118 100644 --- a/tests/manual/html_collector/test_html_tag_collector_integration.py +++ b/tests/manual/html_collector/test_html_tag_collector_integration.py @@ -1,11 +1,10 @@ import pytest +from src.db.models.impl.url.core.pydantic_.info import URLInfo from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser -from src.external.url_request.core import URLRequestInterface -from src.core.tasks.url.operators.html.scraper.root_url_cache.core import RootURLCache from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.instantiations.url.core.pydantic_.info import URLInfo +from src.external.url_request.core import URLRequestInterface from tests.helpers.data_creator.core import DBDataCreator URLS = [ @@ -57,9 +56,7 @@ async def test_url_html_cycle_live_data( operator = URLHTMLTaskOperator( adb_client=AsyncDatabaseClient(), url_request_interface=URLRequestInterface(), - html_parser=HTMLResponseParser( - root_url_cache=RootURLCache() - ) + html_parser=HTMLResponseParser() ) await operator.run_task() @@ -77,8 +74,6 @@ async def test_url_html_cycle( operator = URLHTMLTaskOperator( adb_client=adb_client, url_request_interface=URLRequestInterface(), - html_parser=HTMLResponseParser( - root_url_cache=RootURLCache() - ) + html_parser=HTMLResponseParser() ) await operator.run_task() \ No newline at end of file diff --git a/tests/manual/llm_api_logic/test_deepseek_record_classifier.py b/tests/manual/llm_api_logic/test_deepseek_record_classifier.py index f3050d7b..f26f2a6f 100644 --- a/tests/manual/llm_api_logic/test_deepseek_record_classifier.py +++ b/tests/manual/llm_api_logic/test_deepseek_record_classifier.py @@ -6,7 +6,7 @@ @pytest.mark.asyncio async def test_deepseek_record_classifier(): - from src.db.models.instantiations.url.html.content.enums import HTMLContentType as hct + from src.db.models.impl.url.html.content.enums import HTMLContentType as 
hct d = { hct.TITLE: "Oath of Office for Newly Promoted Corporal Lumpkin with Acworth Police – City of Acworth, GA", diff --git a/tests/manual/llm_api_logic/test_openai_record_classifier.py b/tests/manual/llm_api_logic/test_openai_record_classifier.py index b0105437..3b3ec08b 100644 --- a/tests/manual/llm_api_logic/test_openai_record_classifier.py +++ b/tests/manual/llm_api_logic/test_openai_record_classifier.py @@ -6,7 +6,7 @@ @pytest.mark.asyncio async def test_openai_record_classifier(): - from src.db.models.instantiations.url.html.content.enums import HTMLContentType as hct + from src.db.models.impl.url.html.content.enums import HTMLContentType as hct d = { hct.TITLE: "Oath of Office for Newly Promoted Corporal Lumpkin with Acworth Police – City of Acworth, GA",