diff --git a/src/api/endpoints/batch/duplicates/query.py b/src/api/endpoints/batch/duplicates/query.py index 1f958a62..2be9189f 100644 --- a/src/api/endpoints/batch/duplicates/query.py +++ b/src/api/endpoints/batch/duplicates/query.py @@ -50,7 +50,7 @@ async def run(self, session: AsyncSession) -> list[DuplicateInfo]: final_results.append( DuplicateInfo( source_url=result.source_url, - duplicate_batch_id=result.duplicate_batch_id, + batch_id=result.duplicate_batch_id, duplicate_metadata=result.duplicate_batch_parameters, original_batch_id=result.original_batch_id, original_metadata=result.original_batch_parameters, diff --git a/src/api/endpoints/collector/routes.py b/src/api/endpoints/collector/routes.py index 6f39d27f..4818dc63 100644 --- a/src/api/endpoints/collector/routes.py +++ b/src/api/endpoints/collector/routes.py @@ -5,17 +5,17 @@ from src.api.endpoints.collector.dtos.collector_start import CollectorStartInfo from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInputDTO from src.api.endpoints.collector.dtos.manual_batch.response import ManualBatchResponseDTO -from src.collectors.source_collectors.auto_googler.dtos.input import AutoGooglerInputDTO -from src.collectors.source_collectors.common_crawler.input import CommonCrawlerInputDTO -from src.collectors.source_collectors.example.dtos.input import ExampleInputDTO +from src.collectors.impl.auto_googler.dtos.input import AutoGooglerInputDTO +from src.collectors.impl.common_crawler.input import CommonCrawlerInputDTO +from src.collectors.impl.example.dtos.input import ExampleInputDTO from src.collectors.enums import CollectorType from src.core.core import AsyncCore from src.security.manager import get_access_info from src.security.dtos.access_info import AccessInfo -from src.collectors.source_collectors.ckan.dtos.input import CKANInputDTO -from src.collectors.source_collectors.muckrock.collectors.all_foia.dto import MuckrockAllFOIARequestsCollectorInputDTO -from src.collectors.source_collectors.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO -from src.collectors.source_collectors.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO +from src.collectors.impl.ckan.dtos.input import CKANInputDTO +from src.collectors.impl.muckrock.collectors.all_foia.dto import MuckrockAllFOIARequestsCollectorInputDTO +from src.collectors.impl.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO +from src.collectors.impl.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO collector_router = APIRouter( prefix="/collector", diff --git a/src/api/main.py b/src/api/main.py index 4e587a2a..735c5f6f 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -17,7 +17,7 @@ from src.api.endpoints.task.routes import task_router from src.api.endpoints.url.routes import url_router from src.collectors.manager import AsyncCollectorManager -from src.collectors.source_collectors.muckrock.api_interface.core import MuckrockAPIInterface +from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.core import AsyncCore from src.core.logger import AsyncCoreLogger from src.core.env_var_manager import EnvVarManager diff --git a/src/collectors/source_collectors/README.md b/src/collectors/impl/README.md similarity index 100% rename from src/collectors/source_collectors/README.md rename to src/collectors/impl/README.md diff --git a/src/collectors/source_collectors/__init__.py b/src/collectors/impl/__init__.py similarity index 100% rename from src/collectors/source_collectors/__init__.py rename to src/collectors/impl/__init__.py diff --git a/src/collectors/source_collectors/auto_googler/README.md b/src/collectors/impl/auto_googler/README.md similarity index 100% rename from src/collectors/source_collectors/auto_googler/README.md rename to src/collectors/impl/auto_googler/README.md diff --git a/src/collectors/source_collectors/auto_googler/__init__.py b/src/collectors/impl/auto_googler/__init__.py similarity index 100% rename from src/collectors/source_collectors/auto_googler/__init__.py rename to src/collectors/impl/auto_googler/__init__.py diff --git a/src/collectors/source_collectors/auto_googler/auto_googler.py b/src/collectors/impl/auto_googler/auto_googler.py similarity index 77% rename from src/collectors/source_collectors/auto_googler/auto_googler.py rename to src/collectors/impl/auto_googler/auto_googler.py index 49cdc2de..c8cddb08 100644 --- a/src/collectors/source_collectors/auto_googler/auto_googler.py +++ b/src/collectors/impl/auto_googler/auto_googler.py @@ -1,6 +1,6 @@ -from src.collectors.source_collectors.auto_googler.dtos.query_results import GoogleSearchQueryResultsInnerDTO -from src.collectors.source_collectors.auto_googler.searcher import GoogleSearcher -from src.collectors.source_collectors.auto_googler.dtos.config import SearchConfig +from src.collectors.impl.auto_googler.dtos.query_results import GoogleSearchQueryResultsInnerDTO +from src.collectors.impl.auto_googler.searcher import GoogleSearcher +from src.collectors.impl.auto_googler.dtos.config import SearchConfig class AutoGoogler: diff --git a/src/collectors/source_collectors/auto_googler/collector.py b/src/collectors/impl/auto_googler/collector.py similarity index 74% rename from src/collectors/source_collectors/auto_googler/collector.py rename to src/collectors/impl/auto_googler/collector.py index 718bdfb7..bec62c3d 100644 --- a/src/collectors/source_collectors/auto_googler/collector.py +++ b/src/collectors/impl/auto_googler/collector.py @@ -1,13 +1,13 @@ -from src.collectors.source_collectors.base import AsyncCollectorBase +from src.collectors.impl.base import AsyncCollectorBase from src.collectors.enums import CollectorType from src.core.env_var_manager import EnvVarManager from src.core.preprocessors.autogoogler import AutoGooglerPreprocessor -from src.collectors.source_collectors.auto_googler.auto_googler import AutoGoogler -from src.collectors.source_collectors.auto_googler.dtos.output import AutoGooglerInnerOutputDTO -from src.collectors.source_collectors.auto_googler.dtos.input import AutoGooglerInputDTO -from src.collectors.source_collectors.auto_googler.searcher import GoogleSearcher -from src.collectors.source_collectors.auto_googler.dtos.config import SearchConfig +from src.collectors.impl.auto_googler.auto_googler import AutoGoogler +from src.collectors.impl.auto_googler.dtos.output import AutoGooglerInnerOutputDTO +from src.collectors.impl.auto_googler.dtos.input import AutoGooglerInputDTO +from src.collectors.impl.auto_googler.searcher import GoogleSearcher +from src.collectors.impl.auto_googler.dtos.config import SearchConfig from src.util.helper_functions import base_model_list_dump diff --git a/src/collectors/source_collectors/auto_googler/dtos/__init__.py b/src/collectors/impl/auto_googler/dtos/__init__.py similarity index 100% rename from src/collectors/source_collectors/auto_googler/dtos/__init__.py rename to src/collectors/impl/auto_googler/dtos/__init__.py diff --git a/src/collectors/source_collectors/auto_googler/dtos/config.py b/src/collectors/impl/auto_googler/dtos/config.py similarity index 100% rename from src/collectors/source_collectors/auto_googler/dtos/config.py rename to src/collectors/impl/auto_googler/dtos/config.py diff --git a/src/collectors/source_collectors/auto_googler/dtos/input.py b/src/collectors/impl/auto_googler/dtos/input.py similarity index 100% rename from src/collectors/source_collectors/auto_googler/dtos/input.py rename to src/collectors/impl/auto_googler/dtos/input.py diff --git a/src/collectors/source_collectors/auto_googler/dtos/output.py b/src/collectors/impl/auto_googler/dtos/output.py similarity index 100% rename from src/collectors/source_collectors/auto_googler/dtos/output.py rename to src/collectors/impl/auto_googler/dtos/output.py diff --git a/src/collectors/source_collectors/auto_googler/dtos/query_results.py b/src/collectors/impl/auto_googler/dtos/query_results.py similarity index 100% rename from src/collectors/source_collectors/auto_googler/dtos/query_results.py rename to src/collectors/impl/auto_googler/dtos/query_results.py diff --git a/src/collectors/source_collectors/auto_googler/exceptions.py b/src/collectors/impl/auto_googler/exceptions.py similarity index 100% rename from src/collectors/source_collectors/auto_googler/exceptions.py rename to src/collectors/impl/auto_googler/exceptions.py diff --git a/src/collectors/source_collectors/auto_googler/searcher.py b/src/collectors/impl/auto_googler/searcher.py similarity index 93% rename from src/collectors/source_collectors/auto_googler/searcher.py rename to src/collectors/impl/auto_googler/searcher.py index aa8a0bb6..cb877e25 100644 --- a/src/collectors/source_collectors/auto_googler/searcher.py +++ b/src/collectors/impl/auto_googler/searcher.py @@ -3,8 +3,8 @@ import aiohttp from googleapiclient.errors import HttpError -from src.collectors.source_collectors.auto_googler.dtos.query_results import GoogleSearchQueryResultsInnerDTO -from src.collectors.source_collectors.auto_googler.exceptions import QuotaExceededError +from src.collectors.impl.auto_googler.dtos.query_results import GoogleSearchQueryResultsInnerDTO +from src.collectors.impl.auto_googler.exceptions import QuotaExceededError class GoogleSearcher: diff --git a/src/collectors/source_collectors/base.py b/src/collectors/impl/base.py similarity index 95% rename from src/collectors/source_collectors/base.py rename to src/collectors/impl/base.py index 32cd3a48..d4910b8a 100644 --- a/src/collectors/source_collectors/base.py +++ b/src/collectors/impl/base.py @@ -14,6 +14,7 @@ from src.core.function_trigger import FunctionTrigger from src.core.enums import BatchStatus from src.core.preprocessors.base import PreprocessorBase +from src.db.models.instantiations.url.core.pydantic.info import URLInfo class AsyncCollectorBase(ABC): @@ -73,8 +74,8 @@ async def handle_error(self, e: Exception) -> None: async def process(self) -> None: await self.log("Processing collector...") - preprocessor = self.preprocessor() - url_infos = preprocessor.preprocess(self.data) + preprocessor: PreprocessorBase = self.preprocessor() + url_infos: list[URLInfo] = preprocessor.preprocess(self.data) await self.log(f"URLs processed: {len(url_infos)}") await self.log("Inserting URLs...") diff --git a/src/collectors/source_collectors/ckan/README.md b/src/collectors/impl/ckan/README.md similarity index 100% rename from src/collectors/source_collectors/ckan/README.md rename to src/collectors/impl/ckan/README.md diff --git a/src/collectors/source_collectors/ckan/__init__.py b/src/collectors/impl/ckan/__init__.py similarity index 100% rename from src/collectors/source_collectors/ckan/__init__.py rename to src/collectors/impl/ckan/__init__.py diff --git a/src/collectors/source_collectors/ckan/collector.py b/src/collectors/impl/ckan/collector.py similarity index 79% rename from src/collectors/source_collectors/ckan/collector.py rename to src/collectors/impl/ckan/collector.py index 3239e83b..42390306 100644 --- a/src/collectors/source_collectors/ckan/collector.py +++ b/src/collectors/impl/ckan/collector.py @@ -1,13 +1,13 @@ from pydantic import BaseModel -from src.collectors.source_collectors.base import AsyncCollectorBase +from src.collectors.impl.base import AsyncCollectorBase from src.collectors.enums import CollectorType from src.core.preprocessors.ckan import CKANPreprocessor -from src.collectors.source_collectors.ckan.dtos.input import CKANInputDTO -from src.collectors.source_collectors.ckan.scraper_toolkit.search_funcs.group import ckan_group_package_search -from src.collectors.source_collectors.ckan.scraper_toolkit.search_funcs.organization import ckan_package_search_from_organization -from src.collectors.source_collectors.ckan.scraper_toolkit.search_funcs.package import ckan_package_search -from src.collectors.source_collectors.ckan.scraper_toolkit.search import perform_search, get_flat_list, deduplicate_entries, \ +from src.collectors.impl.ckan.dtos.input import CKANInputDTO +from src.collectors.impl.ckan.scraper_toolkit.search_funcs.group import ckan_group_package_search +from src.collectors.impl.ckan.scraper_toolkit.search_funcs.organization import ckan_package_search_from_organization +from src.collectors.impl.ckan.scraper_toolkit.search_funcs.package import ckan_package_search +from src.collectors.impl.ckan.scraper_toolkit.search import perform_search, get_flat_list, deduplicate_entries, \ get_collections, filter_result, parse_result from src.util.helper_functions import base_model_list_dump diff --git a/src/collectors/source_collectors/ckan/constants.py b/src/collectors/impl/ckan/constants.py similarity index 100% rename from src/collectors/source_collectors/ckan/constants.py rename to src/collectors/impl/ckan/constants.py diff --git a/src/collectors/source_collectors/ckan/dtos/__init__.py b/src/collectors/impl/ckan/dtos/__init__.py similarity index 100% rename from src/collectors/source_collectors/ckan/dtos/__init__.py rename to src/collectors/impl/ckan/dtos/__init__.py diff --git a/src/collectors/source_collectors/ckan/dtos/input.py b/src/collectors/impl/ckan/dtos/input.py similarity index 73% rename from src/collectors/source_collectors/ckan/dtos/input.py rename to src/collectors/impl/ckan/dtos/input.py index b835999e..315bcafd 100644 --- a/src/collectors/source_collectors/ckan/dtos/input.py +++ b/src/collectors/impl/ckan/dtos/input.py @@ -1,7 +1,7 @@ from pydantic import BaseModel, Field -from src.collectors.source_collectors.ckan.dtos.search.group_and_organization import GroupAndOrganizationSearchDTO -from src.collectors.source_collectors.ckan.dtos.search.package import CKANPackageSearchDTO +from src.collectors.impl.ckan.dtos.search.group_and_organization import GroupAndOrganizationSearchDTO +from src.collectors.impl.ckan.dtos.search.package import CKANPackageSearchDTO class CKANInputDTO(BaseModel): diff --git a/src/collectors/source_collectors/ckan/dtos/package.py b/src/collectors/impl/ckan/dtos/package.py similarity index 100% rename from src/collectors/source_collectors/ckan/dtos/package.py rename to src/collectors/impl/ckan/dtos/package.py diff --git a/src/collectors/source_collectors/ckan/dtos/search/__init__.py b/src/collectors/impl/ckan/dtos/search/__init__.py similarity index 100% rename from src/collectors/source_collectors/ckan/dtos/search/__init__.py rename to src/collectors/impl/ckan/dtos/search/__init__.py diff --git a/src/collectors/source_collectors/ckan/dtos/search/_helpers.py b/src/collectors/impl/ckan/dtos/search/_helpers.py similarity index 100% rename from src/collectors/source_collectors/ckan/dtos/search/_helpers.py rename to src/collectors/impl/ckan/dtos/search/_helpers.py diff --git a/src/collectors/source_collectors/ckan/dtos/search/group_and_organization.py b/src/collectors/impl/ckan/dtos/search/group_and_organization.py similarity index 76% rename from src/collectors/source_collectors/ckan/dtos/search/group_and_organization.py rename to src/collectors/impl/ckan/dtos/search/group_and_organization.py index da413ce1..4a352321 100644 --- a/src/collectors/source_collectors/ckan/dtos/search/group_and_organization.py +++ b/src/collectors/impl/ckan/dtos/search/group_and_organization.py @@ -2,7 +2,7 @@ from pydantic import BaseModel, Field -from src.collectors.source_collectors.ckan.dtos.search._helpers import url_field +from src.collectors.impl.ckan.dtos.search._helpers import url_field class GroupAndOrganizationSearchDTO(BaseModel): diff --git a/src/collectors/source_collectors/ckan/dtos/search/package.py b/src/collectors/impl/ckan/dtos/search/package.py similarity index 80% rename from src/collectors/source_collectors/ckan/dtos/search/package.py rename to src/collectors/impl/ckan/dtos/search/package.py index 43fcbda5..3ef73d1a 100644 --- a/src/collectors/source_collectors/ckan/dtos/search/package.py +++ b/src/collectors/impl/ckan/dtos/search/package.py @@ -2,7 +2,7 @@ from pydantic import BaseModel, Field -from src.collectors.source_collectors.ckan.dtos.search._helpers import url_field +from src.collectors.impl.ckan.dtos.search._helpers import url_field class CKANPackageSearchDTO(BaseModel): diff --git a/src/collectors/source_collectors/ckan/exceptions.py b/src/collectors/impl/ckan/exceptions.py similarity index 100% rename from src/collectors/source_collectors/ckan/exceptions.py rename to src/collectors/impl/ckan/exceptions.py diff --git a/src/collectors/source_collectors/ckan/scraper_toolkit/README.md b/src/collectors/impl/ckan/scraper_toolkit/README.md similarity index 100% rename from src/collectors/source_collectors/ckan/scraper_toolkit/README.md rename to src/collectors/impl/ckan/scraper_toolkit/README.md diff --git a/src/collectors/source_collectors/ckan/scraper_toolkit/__init__.py b/src/collectors/impl/ckan/scraper_toolkit/__init__.py similarity index 100% rename from src/collectors/source_collectors/ckan/scraper_toolkit/__init__.py rename to src/collectors/impl/ckan/scraper_toolkit/__init__.py diff --git a/src/collectors/source_collectors/ckan/scraper_toolkit/_api_interface.py b/src/collectors/impl/ckan/scraper_toolkit/_api_interface.py similarity index 96% rename from src/collectors/source_collectors/ckan/scraper_toolkit/_api_interface.py rename to src/collectors/impl/ckan/scraper_toolkit/_api_interface.py index d94c1516..8f557f3f 100644 --- a/src/collectors/source_collectors/ckan/scraper_toolkit/_api_interface.py +++ b/src/collectors/impl/ckan/scraper_toolkit/_api_interface.py @@ -3,7 +3,7 @@ import aiohttp from aiohttp import ContentTypeError -from src.collectors.source_collectors.ckan.exceptions import CKANAPIError +from src.collectors.impl.ckan.exceptions import CKANAPIError class CKANAPIInterface: diff --git a/src/collectors/source_collectors/ckan/scraper_toolkit/search.py b/src/collectors/impl/ckan/scraper_toolkit/search.py similarity index 96% rename from src/collectors/source_collectors/ckan/scraper_toolkit/search.py rename to src/collectors/impl/ckan/scraper_toolkit/search.py index 5bf686d1..7cd24b27 100644 --- a/src/collectors/source_collectors/ckan/scraper_toolkit/search.py +++ b/src/collectors/impl/ckan/scraper_toolkit/search.py @@ -7,9 +7,9 @@ from from_root import from_root from tqdm import tqdm -from src.collectors.source_collectors.ckan.scraper_toolkit.search_funcs.collection import ckan_collection_search -from src.collectors.source_collectors.ckan.dtos.package import Package -from src.collectors.source_collectors.ckan.constants import CKAN_DATA_TYPES, CKAN_TYPE_CONVERSION_MAPPING +from src.collectors.impl.ckan.scraper_toolkit.search_funcs.collection import ckan_collection_search +from src.collectors.impl.ckan.dtos.package import Package +from src.collectors.impl.ckan.constants import CKAN_DATA_TYPES, CKAN_TYPE_CONVERSION_MAPPING p = from_root(".pydocstyle").parent sys.path.insert(1, str(p)) diff --git a/src/collectors/source_collectors/ckan/scraper_toolkit/search_funcs/__init__.py b/src/collectors/impl/ckan/scraper_toolkit/search_funcs/__init__.py similarity index 100% rename from src/collectors/source_collectors/ckan/scraper_toolkit/search_funcs/__init__.py rename to src/collectors/impl/ckan/scraper_toolkit/search_funcs/__init__.py diff --git a/src/collectors/source_collectors/ckan/scraper_toolkit/search_funcs/collection.py b/src/collectors/impl/ckan/scraper_toolkit/search_funcs/collection.py similarity index 98% rename from src/collectors/source_collectors/ckan/scraper_toolkit/search_funcs/collection.py rename to src/collectors/impl/ckan/scraper_toolkit/search_funcs/collection.py index 07fcd0f9..cd275fc0 100644 --- a/src/collectors/source_collectors/ckan/scraper_toolkit/search_funcs/collection.py +++ b/src/collectors/impl/ckan/scraper_toolkit/search_funcs/collection.py @@ -7,7 +7,7 @@ import aiohttp from bs4 import ResultSet, Tag, BeautifulSoup -from src.collectors.source_collectors.ckan.dtos.package import Package +from src.collectors.impl.ckan.dtos.package import Package async def ckan_collection_search(base_url: str, collection_id: str) -> list[Package]: diff --git a/src/collectors/source_collectors/ckan/scraper_toolkit/search_funcs/group.py b/src/collectors/impl/ckan/scraper_toolkit/search_funcs/group.py similarity index 88% rename from src/collectors/source_collectors/ckan/scraper_toolkit/search_funcs/group.py rename to src/collectors/impl/ckan/scraper_toolkit/search_funcs/group.py index 1c0a296d..b74d32f2 100644 --- a/src/collectors/source_collectors/ckan/scraper_toolkit/search_funcs/group.py +++ b/src/collectors/impl/ckan/scraper_toolkit/search_funcs/group.py @@ -1,7 +1,7 @@ import sys from typing import Optional, Any -from src.collectors.source_collectors.ckan.scraper_toolkit._api_interface import CKANAPIInterface +from src.collectors.impl.ckan.scraper_toolkit._api_interface import CKANAPIInterface async def ckan_group_package_search( diff --git a/src/collectors/source_collectors/ckan/scraper_toolkit/search_funcs/organization.py b/src/collectors/impl/ckan/scraper_toolkit/search_funcs/organization.py similarity index 82% rename from src/collectors/source_collectors/ckan/scraper_toolkit/search_funcs/organization.py rename to src/collectors/impl/ckan/scraper_toolkit/search_funcs/organization.py index 45ff6767..6f53ce52 100644 --- a/src/collectors/source_collectors/ckan/scraper_toolkit/search_funcs/organization.py +++ b/src/collectors/impl/ckan/scraper_toolkit/search_funcs/organization.py @@ -1,7 +1,7 @@ from typing import Any -from src.collectors.source_collectors.ckan.scraper_toolkit._api_interface import CKANAPIInterface -from src.collectors.source_collectors.ckan.scraper_toolkit.search_funcs.package import ckan_package_search +from src.collectors.impl.ckan.scraper_toolkit._api_interface import CKANAPIInterface +from src.collectors.impl.ckan.scraper_toolkit.search_funcs.package import ckan_package_search async def ckan_package_search_from_organization( diff --git a/src/collectors/source_collectors/ckan/scraper_toolkit/search_funcs/package.py b/src/collectors/impl/ckan/scraper_toolkit/search_funcs/package.py similarity index 95% rename from src/collectors/source_collectors/ckan/scraper_toolkit/search_funcs/package.py rename to src/collectors/impl/ckan/scraper_toolkit/search_funcs/package.py index f5737b35..e6bb2495 100644 --- a/src/collectors/source_collectors/ckan/scraper_toolkit/search_funcs/package.py +++ b/src/collectors/impl/ckan/scraper_toolkit/search_funcs/package.py @@ -1,7 +1,7 @@ import sys from typing import Optional, Any -from src.collectors.source_collectors.ckan.scraper_toolkit._api_interface import CKANAPIInterface +from src.collectors.impl.ckan.scraper_toolkit._api_interface import CKANAPIInterface async def ckan_package_search( diff --git a/src/collectors/source_collectors/common_crawler/__init__.py b/src/collectors/impl/common_crawler/__init__.py similarity index 100% rename from src/collectors/source_collectors/common_crawler/__init__.py rename to src/collectors/impl/common_crawler/__init__.py diff --git a/src/collectors/source_collectors/common_crawler/collector.py b/src/collectors/impl/common_crawler/collector.py similarity index 76% rename from src/collectors/source_collectors/common_crawler/collector.py rename to src/collectors/impl/common_crawler/collector.py index e5e65dfe..f390ef71 100644 --- a/src/collectors/source_collectors/common_crawler/collector.py +++ b/src/collectors/impl/common_crawler/collector.py @@ -1,8 +1,8 @@ -from src.collectors.source_collectors.base import AsyncCollectorBase +from src.collectors.impl.base import AsyncCollectorBase from src.collectors.enums import CollectorType from src.core.preprocessors.common_crawler import CommonCrawlerPreprocessor -from src.collectors.source_collectors.common_crawler.crawler import CommonCrawler -from src.collectors.source_collectors.common_crawler.input import CommonCrawlerInputDTO +from src.collectors.impl.common_crawler.crawler import CommonCrawler +from src.collectors.impl.common_crawler.input import CommonCrawlerInputDTO class CommonCrawlerCollector(AsyncCollectorBase): diff --git a/src/collectors/source_collectors/common_crawler/crawler.py b/src/collectors/impl/common_crawler/crawler.py similarity index 98% rename from src/collectors/source_collectors/common_crawler/crawler.py rename to src/collectors/impl/common_crawler/crawler.py index ca4f7ca9..f963aa4a 100644 --- a/src/collectors/source_collectors/common_crawler/crawler.py +++ b/src/collectors/impl/common_crawler/crawler.py @@ -6,7 +6,7 @@ import aiohttp -from src.collectors.source_collectors.common_crawler.utils import URLWithParameters +from src.collectors.impl.common_crawler.utils import URLWithParameters async def async_make_request( search_url: 'URLWithParameters' diff --git a/src/collectors/source_collectors/common_crawler/input.py b/src/collectors/impl/common_crawler/input.py similarity index 100% rename from src/collectors/source_collectors/common_crawler/input.py rename to src/collectors/impl/common_crawler/input.py diff --git a/src/collectors/source_collectors/common_crawler/utils.py b/src/collectors/impl/common_crawler/utils.py similarity index 100% rename from src/collectors/source_collectors/common_crawler/utils.py rename to src/collectors/impl/common_crawler/utils.py diff --git a/src/collectors/source_collectors/example/__init__.py b/src/collectors/impl/example/__init__.py similarity index 100% rename from src/collectors/source_collectors/example/__init__.py rename to src/collectors/impl/example/__init__.py diff --git a/src/collectors/source_collectors/example/core.py b/src/collectors/impl/example/core.py similarity index 79% rename from src/collectors/source_collectors/example/core.py rename to src/collectors/impl/example/core.py index 988caa09..4bccf242 100644 --- a/src/collectors/source_collectors/example/core.py +++ b/src/collectors/impl/example/core.py @@ -5,9 +5,9 @@ """ import asyncio -from src.collectors.source_collectors.base import AsyncCollectorBase -from src.collectors.source_collectors.example.dtos.input import ExampleInputDTO -from src.collectors.source_collectors.example.dtos.output import ExampleOutputDTO +from src.collectors.impl.base import AsyncCollectorBase +from src.collectors.impl.example.dtos.input import ExampleInputDTO +from src.collectors.impl.example.dtos.output import ExampleOutputDTO from src.collectors.enums import CollectorType from src.core.preprocessors.example import ExamplePreprocessor diff --git a/src/collectors/source_collectors/example/dtos/__init__.py b/src/collectors/impl/example/dtos/__init__.py similarity index 100% rename from src/collectors/source_collectors/example/dtos/__init__.py rename to src/collectors/impl/example/dtos/__init__.py diff --git a/src/collectors/source_collectors/example/dtos/input.py b/src/collectors/impl/example/dtos/input.py similarity index 100% rename from src/collectors/source_collectors/example/dtos/input.py rename to src/collectors/impl/example/dtos/input.py diff --git a/src/collectors/source_collectors/example/dtos/output.py b/src/collectors/impl/example/dtos/output.py similarity index 100% rename from src/collectors/source_collectors/example/dtos/output.py rename to src/collectors/impl/example/dtos/output.py diff --git a/src/collectors/source_collectors/muckrock/README.md b/src/collectors/impl/muckrock/README.md similarity index 100% rename from src/collectors/source_collectors/muckrock/README.md rename to src/collectors/impl/muckrock/README.md diff --git a/src/collectors/source_collectors/muckrock/__init__.py b/src/collectors/impl/muckrock/__init__.py similarity index 100% rename from src/collectors/source_collectors/muckrock/__init__.py rename to src/collectors/impl/muckrock/__init__.py diff --git a/src/collectors/source_collectors/muckrock/api_interface/__init__.py b/src/collectors/impl/muckrock/api_interface/__init__.py similarity index 100% rename from src/collectors/source_collectors/muckrock/api_interface/__init__.py rename to src/collectors/impl/muckrock/api_interface/__init__.py diff --git a/src/collectors/source_collectors/muckrock/api_interface/core.py b/src/collectors/impl/muckrock/api_interface/core.py similarity index 86% rename from src/collectors/source_collectors/muckrock/api_interface/core.py rename to src/collectors/impl/muckrock/api_interface/core.py index 3b174cf5..4dd97572 100644 --- a/src/collectors/source_collectors/muckrock/api_interface/core.py +++ b/src/collectors/impl/muckrock/api_interface/core.py @@ -3,8 +3,8 @@ import requests from aiohttp import ClientSession -from src.collectors.source_collectors.muckrock.api_interface.lookup_response import AgencyLookupResponse -from src.collectors.source_collectors.muckrock.enums import AgencyLookupResponseType +from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse +from src.collectors.impl.muckrock.enums import AgencyLookupResponseType class MuckrockAPIInterface: diff --git a/src/collectors/source_collectors/muckrock/api_interface/lookup_response.py b/src/collectors/impl/muckrock/api_interface/lookup_response.py similarity index 69% rename from src/collectors/source_collectors/muckrock/api_interface/lookup_response.py rename to src/collectors/impl/muckrock/api_interface/lookup_response.py index a714eeb5..47ea855b 100644 --- a/src/collectors/source_collectors/muckrock/api_interface/lookup_response.py +++ b/src/collectors/impl/muckrock/api_interface/lookup_response.py @@ -2,7 +2,7 @@ from pydantic import BaseModel -from src.collectors.source_collectors.muckrock.enums import AgencyLookupResponseType +from src.collectors.impl.muckrock.enums import AgencyLookupResponseType class AgencyLookupResponse(BaseModel): diff --git a/src/collectors/source_collectors/muckrock/collectors/__init__.py b/src/collectors/impl/muckrock/collectors/__init__.py similarity index 100% rename from src/collectors/source_collectors/muckrock/collectors/__init__.py rename to src/collectors/impl/muckrock/collectors/__init__.py diff --git a/src/collectors/source_collectors/muckrock/collectors/all_foia/__init__.py b/src/collectors/impl/muckrock/collectors/all_foia/__init__.py similarity index 100% rename from src/collectors/source_collectors/muckrock/collectors/all_foia/__init__.py rename to src/collectors/impl/muckrock/collectors/all_foia/__init__.py diff --git a/src/collectors/source_collectors/muckrock/collectors/all_foia/core.py b/src/collectors/impl/muckrock/collectors/all_foia/core.py similarity index 82% rename from src/collectors/source_collectors/muckrock/collectors/all_foia/core.py rename to src/collectors/impl/muckrock/collectors/all_foia/core.py index 0033d242..f4249b2a 100644 --- a/src/collectors/source_collectors/muckrock/collectors/all_foia/core.py +++ b/src/collectors/impl/muckrock/collectors/all_foia/core.py @@ -1,8 +1,8 @@ from src.collectors.enums import CollectorType -from src.collectors.source_collectors.base import AsyncCollectorBase -from src.collectors.source_collectors.muckrock.collectors.all_foia.dto import MuckrockAllFOIARequestsCollectorInputDTO -from src.collectors.source_collectors.muckrock.fetchers.foia.core import FOIAFetcher -from src.collectors.source_collectors.muckrock.exceptions import MuckrockNoMoreDataError +from src.collectors.impl.base import AsyncCollectorBase +from src.collectors.impl.muckrock.collectors.all_foia.dto import MuckrockAllFOIARequestsCollectorInputDTO +from src.collectors.impl.muckrock.fetchers.foia.core import FOIAFetcher +from src.collectors.impl.muckrock.exceptions import MuckrockNoMoreDataError from src.core.preprocessors.muckrock import MuckrockPreprocessor diff --git a/src/collectors/source_collectors/muckrock/collectors/all_foia/dto.py b/src/collectors/impl/muckrock/collectors/all_foia/dto.py similarity index 100% rename from src/collectors/source_collectors/muckrock/collectors/all_foia/dto.py rename to src/collectors/impl/muckrock/collectors/all_foia/dto.py diff --git a/src/collectors/source_collectors/muckrock/collectors/county/__init__.py b/src/collectors/impl/muckrock/collectors/county/__init__.py similarity index 100% rename from src/collectors/source_collectors/muckrock/collectors/county/__init__.py rename to src/collectors/impl/muckrock/collectors/county/__init__.py diff --git a/src/collectors/source_collectors/muckrock/collectors/county/core.py b/src/collectors/impl/muckrock/collectors/county/core.py similarity index 78% rename from src/collectors/source_collectors/muckrock/collectors/county/core.py rename to src/collectors/impl/muckrock/collectors/county/core.py index 9a429d5d..50c79470 100644 --- a/src/collectors/source_collectors/muckrock/collectors/county/core.py +++ b/src/collectors/impl/muckrock/collectors/county/core.py @@ -1,11 +1,11 @@ from src.collectors.enums import CollectorType -from src.collectors.source_collectors.base import AsyncCollectorBase -from src.collectors.source_collectors.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO -from src.collectors.source_collectors.muckrock.fetch_requests.foia_loop import FOIALoopFetchRequest -from src.collectors.source_collectors.muckrock.fetch_requests.jurisdiction_loop import \ +from src.collectors.impl.base import AsyncCollectorBase +from src.collectors.impl.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO +from src.collectors.impl.muckrock.fetch_requests.foia_loop import FOIALoopFetchRequest +from src.collectors.impl.muckrock.fetch_requests.jurisdiction_loop import \ JurisdictionLoopFetchRequest -from src.collectors.source_collectors.muckrock.fetchers.foia.loop import FOIALoopFetcher -from src.collectors.source_collectors.muckrock.fetchers.jurisdiction.generator import \ +from src.collectors.impl.muckrock.fetchers.foia.loop import FOIALoopFetcher +from src.collectors.impl.muckrock.fetchers.jurisdiction.generator import \ JurisdictionGeneratorFetcher from src.core.preprocessors.muckrock import MuckrockPreprocessor diff --git a/src/collectors/source_collectors/muckrock/collectors/county/dto.py b/src/collectors/impl/muckrock/collectors/county/dto.py similarity index 100% rename from src/collectors/source_collectors/muckrock/collectors/county/dto.py rename to src/collectors/impl/muckrock/collectors/county/dto.py diff --git a/src/collectors/source_collectors/muckrock/collectors/simple/__init__.py b/src/collectors/impl/muckrock/collectors/simple/__init__.py similarity index 100% rename from src/collectors/source_collectors/muckrock/collectors/simple/__init__.py rename to src/collectors/impl/muckrock/collectors/simple/__init__.py diff --git a/src/collectors/source_collectors/muckrock/collectors/simple/core.py b/src/collectors/impl/muckrock/collectors/simple/core.py similarity index 80% rename from src/collectors/source_collectors/muckrock/collectors/simple/core.py rename to src/collectors/impl/muckrock/collectors/simple/core.py index 2776a69e..1470b7c1 100644 --- a/src/collectors/source_collectors/muckrock/collectors/simple/core.py +++ b/src/collectors/impl/muckrock/collectors/simple/core.py @@ -1,11 +1,11 @@ import itertools from src.collectors.enums import CollectorType -from src.collectors.source_collectors.base import AsyncCollectorBase -from src.collectors.source_collectors.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO -from src.collectors.source_collectors.muckrock.collectors.simple.searcher import FOIASearcher -from src.collectors.source_collectors.muckrock.fetchers.foia.core import FOIAFetcher -from src.collectors.source_collectors.muckrock.exceptions import SearchCompleteException +from src.collectors.impl.base import AsyncCollectorBase +from src.collectors.impl.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO +from src.collectors.impl.muckrock.collectors.simple.searcher import FOIASearcher +from src.collectors.impl.muckrock.fetchers.foia.core import FOIAFetcher +from src.collectors.impl.muckrock.exceptions import SearchCompleteException from src.core.preprocessors.muckrock import MuckrockPreprocessor diff --git a/src/collectors/source_collectors/muckrock/collectors/simple/dto.py b/src/collectors/impl/muckrock/collectors/simple/dto.py similarity index 100% rename from src/collectors/source_collectors/muckrock/collectors/simple/dto.py rename to src/collectors/impl/muckrock/collectors/simple/dto.py diff --git a/src/collectors/source_collectors/muckrock/collectors/simple/searcher.py b/src/collectors/impl/muckrock/collectors/simple/searcher.py similarity index 87% rename from src/collectors/source_collectors/muckrock/collectors/simple/searcher.py rename to src/collectors/impl/muckrock/collectors/simple/searcher.py index 3bb13617..2f326a5d 100644 --- a/src/collectors/source_collectors/muckrock/collectors/simple/searcher.py +++ b/src/collectors/impl/muckrock/collectors/simple/searcher.py @@ -1,7 +1,7 @@ from typing import Optional -from src.collectors.source_collectors.muckrock.fetchers.foia.core import FOIAFetcher -from src.collectors.source_collectors.muckrock.exceptions import SearchCompleteException +from src.collectors.impl.muckrock.fetchers.foia.core import FOIAFetcher +from src.collectors.impl.muckrock.exceptions import SearchCompleteException class FOIASearcher: diff --git a/src/collectors/source_collectors/muckrock/constants.py b/src/collectors/impl/muckrock/constants.py similarity index 100% rename from src/collectors/source_collectors/muckrock/constants.py rename to src/collectors/impl/muckrock/constants.py diff --git a/src/collectors/source_collectors/muckrock/enums.py b/src/collectors/impl/muckrock/enums.py similarity index 100% rename from src/collectors/source_collectors/muckrock/enums.py rename to src/collectors/impl/muckrock/enums.py diff --git a/src/collectors/source_collectors/muckrock/exceptions.py b/src/collectors/impl/muckrock/exceptions.py similarity index 100% rename from src/collectors/source_collectors/muckrock/exceptions.py rename to src/collectors/impl/muckrock/exceptions.py diff --git a/src/collectors/source_collectors/muckrock/fetch_requests/__init__.py b/src/collectors/impl/muckrock/fetch_requests/__init__.py similarity index 100% rename from src/collectors/source_collectors/muckrock/fetch_requests/__init__.py rename to src/collectors/impl/muckrock/fetch_requests/__init__.py diff --git a/src/collectors/source_collectors/muckrock/fetch_requests/base.py b/src/collectors/impl/muckrock/fetch_requests/base.py similarity index 100% rename from src/collectors/source_collectors/muckrock/fetch_requests/base.py rename to src/collectors/impl/muckrock/fetch_requests/base.py diff --git a/src/collectors/impl/muckrock/fetch_requests/foia.py b/src/collectors/impl/muckrock/fetch_requests/foia.py new file mode 100644 index 00000000..87a66811 --- /dev/null +++ b/src/collectors/impl/muckrock/fetch_requests/foia.py @@ -0,0 +1,6 @@ +from src.collectors.impl.muckrock.fetch_requests.base import FetchRequest + + +class FOIAFetchRequest(FetchRequest): + page: int + page_size: int diff --git a/src/collectors/impl/muckrock/fetch_requests/foia_loop.py b/src/collectors/impl/muckrock/fetch_requests/foia_loop.py new file mode 100644 index 00000000..0371eeae --- /dev/null +++ b/src/collectors/impl/muckrock/fetch_requests/foia_loop.py @@ -0,0 +1,5 @@ +from src.collectors.impl.muckrock.fetch_requests.base import FetchRequest + + +class FOIALoopFetchRequest(FetchRequest): + jurisdiction: int diff --git a/src/collectors/impl/muckrock/fetch_requests/jurisdiction_by_id.py b/src/collectors/impl/muckrock/fetch_requests/jurisdiction_by_id.py new file mode 100644 index 00000000..22d23f74 --- /dev/null +++ b/src/collectors/impl/muckrock/fetch_requests/jurisdiction_by_id.py @@ -0,0 +1,5 @@ +from src.collectors.impl.muckrock.fetch_requests.base import FetchRequest + + +class JurisdictionByIDFetchRequest(FetchRequest): + jurisdiction_id: int diff --git a/src/collectors/source_collectors/muckrock/fetch_requests/jurisdiction_loop.py b/src/collectors/impl/muckrock/fetch_requests/jurisdiction_loop.py similarity index 54% rename from src/collectors/source_collectors/muckrock/fetch_requests/jurisdiction_loop.py rename to src/collectors/impl/muckrock/fetch_requests/jurisdiction_loop.py index a39da62d..369fbeed 100644 --- a/src/collectors/source_collectors/muckrock/fetch_requests/jurisdiction_loop.py +++ b/src/collectors/impl/muckrock/fetch_requests/jurisdiction_loop.py @@ -1,4 +1,4 @@ -from src.collectors.source_collectors.muckrock.fetch_requests.base import FetchRequest +from src.collectors.impl.muckrock.fetch_requests.base import FetchRequest class JurisdictionLoopFetchRequest(FetchRequest): diff --git a/src/collectors/source_collectors/muckrock/fetchers/__init__.py b/src/collectors/impl/muckrock/fetchers/__init__.py similarity index 100% rename from src/collectors/source_collectors/muckrock/fetchers/__init__.py rename to src/collectors/impl/muckrock/fetchers/__init__.py diff --git a/src/collectors/source_collectors/muckrock/fetchers/foia/__init__.py b/src/collectors/impl/muckrock/fetchers/foia/__init__.py similarity index 100% rename from src/collectors/source_collectors/muckrock/fetchers/foia/__init__.py rename to src/collectors/impl/muckrock/fetchers/foia/__init__.py diff --git a/src/collectors/source_collectors/muckrock/fetchers/foia/core.py b/src/collectors/impl/muckrock/fetchers/foia/core.py similarity index 79% rename from src/collectors/source_collectors/muckrock/fetchers/foia/core.py rename to src/collectors/impl/muckrock/fetchers/foia/core.py index 5717f112..c6c51d94 100644 --- a/src/collectors/source_collectors/muckrock/fetchers/foia/core.py +++ b/src/collectors/impl/muckrock/fetchers/foia/core.py @@ -1,6 +1,6 @@ -from src.collectors.source_collectors.muckrock.fetch_requests.foia import FOIAFetchRequest -from src.collectors.source_collectors.muckrock.fetchers.templates.fetcher import MuckrockFetcherBase -from src.collectors.source_collectors.muckrock.constants import BASE_MUCKROCK_URL +from src.collectors.impl.muckrock.fetch_requests.foia import FOIAFetchRequest +from src.collectors.impl.muckrock.fetchers.templates.fetcher import MuckrockFetcherBase +from src.collectors.impl.muckrock.constants import BASE_MUCKROCK_URL FOIA_BASE_URL = f"{BASE_MUCKROCK_URL}/foia" diff --git a/src/collectors/source_collectors/muckrock/fetchers/foia/generator.py b/src/collectors/impl/muckrock/fetchers/foia/generator.py similarity index 62% rename from src/collectors/source_collectors/muckrock/fetchers/foia/generator.py rename to src/collectors/impl/muckrock/fetchers/foia/generator.py index 8e4fa7ac..9260f43b 100644 --- a/src/collectors/source_collectors/muckrock/fetchers/foia/generator.py +++ b/src/collectors/impl/muckrock/fetchers/foia/generator.py @@ -1,6 +1,6 @@ -from src.collectors.source_collectors.muckrock.fetch_requests import FOIALoopFetchRequest -from src.collectors.source_collectors.muckrock.fetchers.foia.manager import FOIAFetchManager -from src.collectors.source_collectors.muckrock.fetchers.templates.generator import MuckrockGeneratorFetcher +from src.collectors.impl.muckrock.fetch_requests import FOIALoopFetchRequest +from src.collectors.impl.muckrock.fetchers.foia.manager import FOIAFetchManager +from src.collectors.impl.muckrock.fetchers.templates.generator import MuckrockGeneratorFetcher class FOIAGeneratorFetcher(MuckrockGeneratorFetcher): diff --git a/src/collectors/source_collectors/muckrock/fetchers/foia/loop.py b/src/collectors/impl/muckrock/fetchers/foia/loop.py similarity index 68% rename from src/collectors/source_collectors/muckrock/fetchers/foia/loop.py rename to src/collectors/impl/muckrock/fetchers/foia/loop.py index ec21810e..44b4b845 100644 --- a/src/collectors/source_collectors/muckrock/fetchers/foia/loop.py +++ b/src/collectors/impl/muckrock/fetchers/foia/loop.py @@ -1,8 +1,8 @@ from datasets import tqdm -from src.collectors.source_collectors.muckrock.fetch_requests.foia_loop import FOIALoopFetchRequest -from src.collectors.source_collectors.muckrock.fetchers.foia.manager import FOIAFetchManager -from src.collectors.source_collectors.muckrock.fetchers.templates.loop import MuckrockLoopFetcher +from src.collectors.impl.muckrock.fetch_requests.foia_loop import FOIALoopFetchRequest +from src.collectors.impl.muckrock.fetchers.foia.manager import FOIAFetchManager +from src.collectors.impl.muckrock.fetchers.templates.loop import MuckrockLoopFetcher class FOIALoopFetcher(MuckrockLoopFetcher): diff --git a/src/collectors/source_collectors/muckrock/fetchers/foia/manager.py b/src/collectors/impl/muckrock/fetchers/foia/manager.py similarity index 74% rename from src/collectors/source_collectors/muckrock/fetchers/foia/manager.py rename to src/collectors/impl/muckrock/fetchers/foia/manager.py index 7a38caaa..09f71a59 100644 --- a/src/collectors/source_collectors/muckrock/fetchers/foia/manager.py +++ b/src/collectors/impl/muckrock/fetchers/foia/manager.py @@ -1,5 +1,5 @@ -from src.collectors.source_collectors.muckrock.fetch_requests.foia_loop import FOIALoopFetchRequest -from src.collectors.source_collectors.muckrock.constants import BASE_MUCKROCK_URL +from src.collectors.impl.muckrock.fetch_requests.foia_loop import FOIALoopFetchRequest +from src.collectors.impl.muckrock.constants import BASE_MUCKROCK_URL class FOIAFetchManager: diff --git a/src/collectors/source_collectors/muckrock/fetchers/jurisdiction/__init__.py b/src/collectors/impl/muckrock/fetchers/jurisdiction/__init__.py similarity index 100% rename from src/collectors/source_collectors/muckrock/fetchers/jurisdiction/__init__.py rename to src/collectors/impl/muckrock/fetchers/jurisdiction/__init__.py diff --git a/src/collectors/source_collectors/muckrock/fetchers/jurisdiction/core.py b/src/collectors/impl/muckrock/fetchers/jurisdiction/core.py similarity index 59% rename from src/collectors/source_collectors/muckrock/fetchers/jurisdiction/core.py rename to src/collectors/impl/muckrock/fetchers/jurisdiction/core.py index befbc3e9..8f21bca3 100644 --- a/src/collectors/source_collectors/muckrock/fetchers/jurisdiction/core.py +++ b/src/collectors/impl/muckrock/fetchers/jurisdiction/core.py @@ -1,7 +1,7 @@ -from src.collectors.source_collectors.muckrock.fetch_requests.jurisdiction_by_id import \ +from src.collectors.impl.muckrock.fetch_requests.jurisdiction_by_id import \ JurisdictionByIDFetchRequest -from src.collectors.source_collectors.muckrock.fetchers.templates.fetcher import MuckrockFetcherBase -from src.collectors.source_collectors.muckrock.constants import BASE_MUCKROCK_URL +from src.collectors.impl.muckrock.fetchers.templates.fetcher import MuckrockFetcherBase +from src.collectors.impl.muckrock.constants import BASE_MUCKROCK_URL class JurisdictionByIDFetcher(MuckrockFetcherBase): diff --git a/src/collectors/source_collectors/muckrock/fetchers/jurisdiction/generator.py b/src/collectors/impl/muckrock/fetchers/jurisdiction/generator.py similarity index 58% rename from src/collectors/source_collectors/muckrock/fetchers/jurisdiction/generator.py rename to src/collectors/impl/muckrock/fetchers/jurisdiction/generator.py index b285e852..394a6801 100644 --- a/src/collectors/source_collectors/muckrock/fetchers/jurisdiction/generator.py +++ b/src/collectors/impl/muckrock/fetchers/jurisdiction/generator.py @@ -1,6 +1,6 @@ -from src.collectors.source_collectors.muckrock.fetch_requests.jurisdiction_loop import JurisdictionLoopFetchRequest -from src.collectors.source_collectors.muckrock.fetchers.jurisdiction.manager import JurisdictionFetchManager -from src.collectors.source_collectors.muckrock.fetchers.templates.generator import MuckrockGeneratorFetcher +from src.collectors.impl.muckrock.fetch_requests.jurisdiction_loop import JurisdictionLoopFetchRequest +from src.collectors.impl.muckrock.fetchers.jurisdiction.manager import JurisdictionFetchManager +from src.collectors.impl.muckrock.fetchers.templates.generator import MuckrockGeneratorFetcher class JurisdictionGeneratorFetcher(MuckrockGeneratorFetcher): diff --git a/src/collectors/source_collectors/muckrock/fetchers/jurisdiction/loop.py b/src/collectors/impl/muckrock/fetchers/jurisdiction/loop.py similarity index 78% rename from src/collectors/source_collectors/muckrock/fetchers/jurisdiction/loop.py rename to src/collectors/impl/muckrock/fetchers/jurisdiction/loop.py index 5ca4b900..16ecdaa3 100644 --- a/src/collectors/source_collectors/muckrock/fetchers/jurisdiction/loop.py +++ b/src/collectors/impl/muckrock/fetchers/jurisdiction/loop.py @@ -1,8 +1,8 @@ from tqdm import tqdm -from src.collectors.source_collectors.muckrock.fetch_requests.jurisdiction_loop import JurisdictionLoopFetchRequest -from src.collectors.source_collectors.muckrock.fetchers.jurisdiction.manager import JurisdictionFetchManager -from src.collectors.source_collectors.muckrock.fetchers.templates.loop import MuckrockLoopFetcher +from src.collectors.impl.muckrock.fetch_requests.jurisdiction_loop import JurisdictionLoopFetchRequest +from src.collectors.impl.muckrock.fetchers.jurisdiction.manager import JurisdictionFetchManager +from src.collectors.impl.muckrock.fetchers.templates.loop import MuckrockLoopFetcher class JurisdictionLoopFetcher(MuckrockLoopFetcher): diff --git a/src/collectors/source_collectors/muckrock/fetchers/jurisdiction/manager.py b/src/collectors/impl/muckrock/fetchers/jurisdiction/manager.py similarity index 80% rename from src/collectors/source_collectors/muckrock/fetchers/jurisdiction/manager.py rename to src/collectors/impl/muckrock/fetchers/jurisdiction/manager.py index dfd27569..9cd24df2 100644 --- a/src/collectors/source_collectors/muckrock/fetchers/jurisdiction/manager.py +++ b/src/collectors/impl/muckrock/fetchers/jurisdiction/manager.py @@ -1,5 +1,5 @@ -from src.collectors.source_collectors.muckrock.fetch_requests.jurisdiction_loop import JurisdictionLoopFetchRequest -from src.collectors.source_collectors.muckrock.constants import BASE_MUCKROCK_URL +from src.collectors.impl.muckrock.fetch_requests.jurisdiction_loop import JurisdictionLoopFetchRequest +from src.collectors.impl.muckrock.constants import BASE_MUCKROCK_URL class JurisdictionFetchManager: diff --git a/src/collectors/source_collectors/muckrock/fetchers/templates/__init__.py b/src/collectors/impl/muckrock/fetchers/templates/__init__.py similarity index 100% rename from src/collectors/source_collectors/muckrock/fetchers/templates/__init__.py rename to src/collectors/impl/muckrock/fetchers/templates/__init__.py diff --git a/src/collectors/source_collectors/muckrock/fetchers/templates/fetcher.py b/src/collectors/impl/muckrock/fetchers/templates/fetcher.py similarity index 83% rename from src/collectors/source_collectors/muckrock/fetchers/templates/fetcher.py rename to src/collectors/impl/muckrock/fetchers/templates/fetcher.py index 6661c04a..1c41f6fd 100644 --- a/src/collectors/source_collectors/muckrock/fetchers/templates/fetcher.py +++ b/src/collectors/impl/muckrock/fetchers/templates/fetcher.py @@ -4,8 +4,8 @@ import requests import aiohttp -from src.collectors.source_collectors.muckrock.fetch_requests.base import FetchRequest -from src.collectors.source_collectors.muckrock.exceptions import MuckrockNoMoreDataError, MuckrockServerError +from src.collectors.impl.muckrock.fetch_requests.base import FetchRequest +from src.collectors.impl.muckrock.exceptions import MuckrockNoMoreDataError, MuckrockServerError class MuckrockFetcherBase(ABC): diff --git a/src/collectors/source_collectors/muckrock/fetchers/templates/generator.py b/src/collectors/impl/muckrock/fetchers/templates/generator.py similarity index 79% rename from src/collectors/source_collectors/muckrock/fetchers/templates/generator.py rename to src/collectors/impl/muckrock/fetchers/templates/generator.py index 3a6a0e01..55fa62ec 100644 --- a/src/collectors/source_collectors/muckrock/fetchers/templates/generator.py +++ b/src/collectors/impl/muckrock/fetchers/templates/generator.py @@ -1,5 +1,5 @@ -from src.collectors.source_collectors.muckrock.fetchers.templates.iter_fetcher import MuckrockIterFetcherBase -from src.collectors.source_collectors.muckrock.exceptions import RequestFailureException +from src.collectors.impl.muckrock.fetchers.templates.iter_fetcher import MuckrockIterFetcherBase +from src.collectors.impl.muckrock.exceptions import RequestFailureException class MuckrockGeneratorFetcher(MuckrockIterFetcherBase): diff --git a/src/collectors/source_collectors/muckrock/fetchers/templates/iter_fetcher.py b/src/collectors/impl/muckrock/fetchers/templates/iter_fetcher.py similarity index 83% rename from src/collectors/source_collectors/muckrock/fetchers/templates/iter_fetcher.py rename to src/collectors/impl/muckrock/fetchers/templates/iter_fetcher.py index cc397242..66ee4cd3 100644 --- a/src/collectors/source_collectors/muckrock/fetchers/templates/iter_fetcher.py +++ b/src/collectors/impl/muckrock/fetchers/templates/iter_fetcher.py @@ -3,8 +3,8 @@ import aiohttp import requests -from src.collectors.source_collectors.muckrock.fetch_requests.base import FetchRequest -from src.collectors.source_collectors.muckrock.exceptions import RequestFailureException +from src.collectors.impl.muckrock.fetch_requests.base import FetchRequest +from src.collectors.impl.muckrock.exceptions import RequestFailureException class MuckrockIterFetcherBase(ABC): diff --git a/src/collectors/source_collectors/muckrock/fetchers/templates/loop.py b/src/collectors/impl/muckrock/fetchers/templates/loop.py similarity index 78% rename from src/collectors/source_collectors/muckrock/fetchers/templates/loop.py rename to src/collectors/impl/muckrock/fetchers/templates/loop.py index c3b5dc0f..427564c2 100644 --- a/src/collectors/source_collectors/muckrock/fetchers/templates/loop.py +++ b/src/collectors/impl/muckrock/fetchers/templates/loop.py @@ -1,8 +1,8 @@ from abc import abstractmethod from time import sleep -from src.collectors.source_collectors.muckrock.fetchers.templates.iter_fetcher import MuckrockIterFetcherBase -from src.collectors.source_collectors.muckrock.exceptions import RequestFailureException +from src.collectors.impl.muckrock.fetchers.templates.iter_fetcher import MuckrockIterFetcherBase +from src.collectors.impl.muckrock.exceptions import RequestFailureException class MuckrockLoopFetcher(MuckrockIterFetcherBase): diff --git a/src/collectors/manager.py b/src/collectors/manager.py index b90e03a6..a493b92c 100644 --- a/src/collectors/manager.py +++ b/src/collectors/manager.py @@ -6,7 +6,7 @@ from pydantic import BaseModel from src.db.client.async_ import AsyncDatabaseClient -from src.collectors.source_collectors.base import AsyncCollectorBase +from src.collectors.impl.base import AsyncCollectorBase from src.collectors.exceptions import InvalidCollectorError from src.collectors.mapping import COLLECTOR_MAPPING from src.collectors.enums import CollectorType diff --git a/src/collectors/mapping.py b/src/collectors/mapping.py index e07cac09..32aeda5a 100644 --- a/src/collectors/mapping.py +++ b/src/collectors/mapping.py @@ -1,11 +1,11 @@ from src.collectors.enums import CollectorType -from src.collectors.source_collectors.auto_googler.collector import AutoGooglerCollector -from src.collectors.source_collectors.ckan.collector import CKANCollector -from src.collectors.source_collectors.common_crawler.collector import CommonCrawlerCollector -from src.collectors.source_collectors.example.core import ExampleCollector -from src.collectors.source_collectors.muckrock.collectors.all_foia.core import MuckrockAllFOIARequestsCollector -from src.collectors.source_collectors.muckrock.collectors.county.core import MuckrockCountyLevelSearchCollector -from src.collectors.source_collectors.muckrock.collectors.simple.core import MuckrockSimpleSearchCollector +from src.collectors.impl.auto_googler.collector import AutoGooglerCollector +from src.collectors.impl.ckan.collector import CKANCollector +from src.collectors.impl.common_crawler.collector import CommonCrawlerCollector +from src.collectors.impl.example.core import ExampleCollector +from src.collectors.impl.muckrock.collectors.all_foia.core import MuckrockAllFOIARequestsCollector +from src.collectors.impl.muckrock.collectors.county.core import MuckrockCountyLevelSearchCollector +from src.collectors.impl.muckrock.collectors.simple.core import MuckrockSimpleSearchCollector COLLECTOR_MAPPING = { CollectorType.EXAMPLE: ExampleCollector, diff --git a/src/collectors/queries/__init__.py b/src/collectors/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/collectors/queries/get_url_info.py b/src/collectors/queries/get_url_info.py new file mode 100644 index 00000000..d72fc6af --- /dev/null +++ b/src/collectors/queries/get_url_info.py @@ -0,0 +1,19 @@ +from sqlalchemy import Select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetURLInfoByURLQueryBuilder(QueryBuilderBase): + + def __init__(self, url: str): + super().__init__() + self.url = url + + async def run(self, session: AsyncSession) -> URLInfo | None: + query = Select(URL).where(URL.url == self.url) + raw_result = await session.execute(query) + url = raw_result.scalars().first() + return URLInfo(**url.__dict__) \ No newline at end of file diff --git a/src/collectors/queries/insert/__init__.py b/src/collectors/queries/insert/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/collectors/queries/insert/url.py b/src/collectors/queries/insert/url.py new file mode 100644 index 00000000..44e7c612 --- /dev/null +++ b/src/collectors/queries/insert/url.py @@ -0,0 +1,33 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.instantiations.link.batch_url import LinkBatchURL +from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.models.instantiations.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class InsertURLQueryBuilder(QueryBuilderBase): + + + def __init__(self, url_info: URLInfo): + super().__init__() + self.url_info = url_info + + async def run(self, session: AsyncSession) -> int: + """Insert a new URL into the database.""" + url_entry = URL( + url=self.url_info.url, + collector_metadata=self.url_info.collector_metadata, + outcome=self.url_info.outcome.value, + source=self.url_info.source + ) + if self.url_info.created_at is not None: + url_entry.created_at = self.url_info.created_at + session.add(url_entry) + await session.flush() + link = LinkBatchURL( + batch_id=self.url_info.batch_id, + url_id=url_entry.id + ) + session.add(link) + return url_entry.id \ No newline at end of file diff --git a/src/collectors/queries/insert/urls/__init__.py b/src/collectors/queries/insert/urls/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/collectors/queries/insert/urls/query.py b/src/collectors/queries/insert/urls/query.py new file mode 100644 index 00000000..ddab0582 --- /dev/null +++ b/src/collectors/queries/insert/urls/query.py @@ -0,0 +1,56 @@ +from sqlalchemy.exc import IntegrityError +from sqlalchemy.ext.asyncio import AsyncSession + +from src.collectors.queries.insert.urls.request_manager import InsertURLsRequestManager +from src.util.clean import clean_url +from src.db.dtos.url.insert import InsertURLsInfo +from src.db.dtos.url.mapping import URLMapping +from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo +from src.db.queries.base.builder import QueryBuilderBase + + +class InsertURLsQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_infos: list[URLInfo], + batch_id: int + ): + super().__init__() + self.url_infos = url_infos + self.batch_id = batch_id + + async def run(self, session: AsyncSession) -> InsertURLsInfo: + url_mappings = [] + duplicates = [] + rm = InsertURLsRequestManager(session=session) + for url_info in self.url_infos: + url_info.url = clean_url(url_info.url) + url_info.batch_id = self.batch_id + try: + async with session.begin_nested() as sp: + url_id = await rm.insert_url(url_info) + url_mappings.append( + URLMapping( + url_id=url_id, + url=url_info.url + ) + ) + except IntegrityError: + sp.rollback() + orig_url_info = await rm.get_url_info_by_url(url_info.url) + duplicate_info = DuplicateInsertInfo( + batch_id=self.batch_id, + original_url_id=orig_url_info.id + ) + duplicates.append(duplicate_info) + await rm.insert_duplicates(duplicates) + + return InsertURLsInfo( + url_mappings=url_mappings, + total_count=len(self.url_infos), + original_count=len(url_mappings), + duplicate_count=len(duplicates), + url_ids=[url_mapping.url_id for url_mapping in url_mappings] + ) diff --git a/src/collectors/queries/insert/urls/request_manager.py b/src/collectors/queries/insert/urls/request_manager.py new file mode 100644 index 00000000..cd8a3399 --- /dev/null +++ b/src/collectors/queries/insert/urls/request_manager.py @@ -0,0 +1,33 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.collectors.queries.get_url_info import GetURLInfoByURLQueryBuilder +from src.collectors.queries.insert.url import InsertURLQueryBuilder +from src.db.models.instantiations.duplicate.pydantic.insert import DuplicateInsertInfo +from src.db.models.instantiations.url.core.pydantic.info import URLInfo + +from src.db.helpers.session import session_helper as sh + + +class InsertURLsRequestManager: + + def __init__( + self, + session: AsyncSession + ): + self.session = session + + async def insert_url(self, url_info: URLInfo) -> int: + return await InsertURLQueryBuilder( + url_info=url_info + ).run(self.session) + + async def get_url_info_by_url(self, url: str) -> URLInfo | None: + return await GetURLInfoByURLQueryBuilder( + url=url + ).run(self.session) + + async def insert_duplicates( + self, + duplicates: list[DuplicateInsertInfo] + ) -> None: + await sh.bulk_insert(self.session, models=duplicates) \ No newline at end of file diff --git a/src/collectors/source_collectors/muckrock/fetch_requests/foia.py b/src/collectors/source_collectors/muckrock/fetch_requests/foia.py deleted file mode 100644 index 1f0bffec..00000000 --- a/src/collectors/source_collectors/muckrock/fetch_requests/foia.py +++ /dev/null @@ -1,6 +0,0 @@ -from src.collectors.source_collectors.muckrock.fetch_requests.base import FetchRequest - - -class FOIAFetchRequest(FetchRequest): - page: int - page_size: int diff --git a/src/collectors/source_collectors/muckrock/fetch_requests/foia_loop.py b/src/collectors/source_collectors/muckrock/fetch_requests/foia_loop.py deleted file mode 100644 index 54c063b6..00000000 --- a/src/collectors/source_collectors/muckrock/fetch_requests/foia_loop.py +++ /dev/null @@ -1,5 +0,0 @@ -from src.collectors.source_collectors.muckrock.fetch_requests.base import FetchRequest - - -class FOIALoopFetchRequest(FetchRequest): - jurisdiction: int diff --git a/src/collectors/source_collectors/muckrock/fetch_requests/jurisdiction_by_id.py b/src/collectors/source_collectors/muckrock/fetch_requests/jurisdiction_by_id.py deleted file mode 100644 index 7825ade6..00000000 --- a/src/collectors/source_collectors/muckrock/fetch_requests/jurisdiction_by_id.py +++ /dev/null @@ -1,5 +0,0 @@ -from src.collectors.source_collectors.muckrock.fetch_requests.base import FetchRequest - - -class JurisdictionByIDFetchRequest(FetchRequest): - jurisdiction_id: int diff --git a/src/core/preprocessors/example.py b/src/core/preprocessors/example.py index 5228c241..34c1e3a4 100644 --- a/src/core/preprocessors/example.py +++ b/src/core/preprocessors/example.py @@ -1,6 +1,6 @@ from typing import List -from src.collectors.source_collectors.example.dtos.output import ExampleOutputDTO +from src.collectors.impl.example.dtos.output import ExampleOutputDTO from src.core.preprocessors.base import PreprocessorBase from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.pydantic.info import URLInfo diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index e381c486..6b55a157 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -4,7 +4,7 @@ from environs import Env -from src.collectors.source_collectors.muckrock.api_interface.core import MuckrockAPIInterface +from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.models.entry import URLTaskEntry from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py index fd3b9ec2..633d84ac 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py @@ -2,9 +2,9 @@ from typing_extensions import override -from src.collectors.source_collectors.muckrock.api_interface.core import MuckrockAPIInterface -from src.collectors.source_collectors.muckrock.api_interface.lookup_response import AgencyLookupResponse -from src.collectors.source_collectors.muckrock.enums import AgencyLookupResponseType +from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface +from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse +from src.collectors.impl.muckrock.enums import AgencyLookupResponseType from src.core.exceptions import MuckrockAPIError from src.core.helpers import process_match_agency_response_to_suggestions from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py index 71f53568..6ef84149 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py @@ -1,5 +1,5 @@ from src.collectors.enums import CollectorType -from src.collectors.source_collectors.muckrock.api_interface.core import MuckrockAPIInterface +from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan import CKANAgencyIdentificationSubtask from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock import \ diff --git a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py index aa0f4d5b..b39d8947 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py +++ b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py @@ -2,7 +2,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from typing_extensions import override, final -from src.core.tasks.url.operators.probe.queries.urls.not_probed.get.clean import clean_url +from src.util.clean import clean_url from src.db.dtos.url.mapping import URLMapping from src.db.models.instantiations.url.core.sqlalchemy import URL from src.db.models.instantiations.url.web_metadata.sqlalchemy import URLWebMetadata diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 1fa4376e..40a0a4e1 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -50,6 +50,7 @@ from src.api.endpoints.url.get.dto import GetURLsResponseInfo from src.api.endpoints.url.get.query import GetURLsQueryBuilder from src.collectors.enums import URLStatus, CollectorType +from src.collectors.queries.insert.urls.query import InsertURLsQueryBuilder from src.core.enums import BatchStatus, SuggestionType, RecordType, SuggestedStatus from src.core.env_var_manager import EnvVarManager from src.core.tasks.scheduled.impl.huggingface.queries.check.core import CheckValidURLsUpdatedQueryBuilder @@ -895,52 +896,6 @@ async def get_urls_by_batch(self, batch_id: int, page: int = 1) -> list[URLInfo] page=page )) - @session_manager - async def insert_url( - self, - session: AsyncSession, - url_info: URLInfo - ) -> int: - """Insert a new URL into the database.""" - url_entry = URL( - url=url_info.url, - collector_metadata=url_info.collector_metadata, - outcome=url_info.outcome.value, - source=url_info.source - ) - if url_info.created_at is not None: - url_entry.created_at = url_info.created_at - session.add(url_entry) - await session.flush() - link = LinkBatchURL( - batch_id=url_info.batch_id, - url_id=url_entry.id - ) - session.add(link) - return url_entry.id - - @session_manager - async def get_url_info_by_url( - self, - session: AsyncSession, - url: str - ) -> URLInfo | None: - query = Select(URL).where(URL.url == url) - raw_result = await session.execute(query) - url = raw_result.scalars().first() - return URLInfo(**url.__dict__) - - @session_manager - async def get_url_info_by_id( - self, - session: AsyncSession, - url_id: int - ) -> URLInfo | None: - query = Select(URL).where(URL.id == url_id) - raw_result = await session.execute(query) - url = raw_result.scalars().first() - return URLInfo(**url.__dict__) - @session_manager async def insert_logs( self, @@ -953,19 +908,6 @@ async def insert_logs( log.created_at = log_info.created_at session.add(log) - @session_manager - async def insert_duplicates( - self, - session: AsyncSession, - duplicate_infos: list[DuplicateInsertInfo] - ) -> None: - for duplicate_info in duplicate_infos: - duplicate = Duplicate( - batch_id=duplicate_info.duplicate_batch_id, - original_url_id=duplicate_info.original_url_id, - ) - session.add(duplicate) - @session_manager async def insert_batch( self, @@ -996,29 +938,13 @@ async def insert_urls( url_infos: list[URLInfo], batch_id: int ) -> InsertURLsInfo: - url_mappings = [] - duplicates = [] - for url_info in url_infos: - url_info.batch_id = batch_id - try: - url_id = await self.insert_url(url_info) - url_mappings.append(URLMapping(url_id=url_id, url=url_info.url)) - except IntegrityError: - orig_url_info = await self.get_url_info_by_url(url_info.url) - duplicate_info = DuplicateInsertInfo( - duplicate_batch_id=batch_id, - original_url_id=orig_url_info.id - ) - duplicates.append(duplicate_info) - await self.insert_duplicates(duplicates) - - return InsertURLsInfo( - url_mappings=url_mappings, - total_count=len(url_infos), - original_count=len(url_mappings), - duplicate_count=len(duplicates), - url_ids=[url_mapping.url_id for url_mapping in url_mappings] + builder = InsertURLsQueryBuilder( + url_infos=url_infos, + batch_id=batch_id ) + return await self.run_query_builder(builder) + + @session_manager async def update_batch_post_collection( diff --git a/src/db/client/sync.py b/src/db/client/sync.py index b893abc1..62e45f08 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -99,7 +99,7 @@ def insert_duplicates( ): for duplicate_info in duplicate_infos: duplicate = Duplicate( - batch_id=duplicate_info.duplicate_batch_id, + batch_id=duplicate_info.batch_id, original_url_id=duplicate_info.original_url_id, ) session.add(duplicate) @@ -147,7 +147,7 @@ def insert_urls(self, url_infos: List[URLInfo], batch_id: int) -> InsertURLsInfo except IntegrityError as e: orig_url_info = self.get_url_info_by_url(url_info.url) duplicate_info = DuplicateInsertInfo( - duplicate_batch_id=batch_id, + batch_id=batch_id, original_url_id=orig_url_info.id ) duplicates.append(duplicate_info) diff --git a/src/db/models/instantiations/duplicate/pydantic/insert.py b/src/db/models/instantiations/duplicate/pydantic/insert.py index f753e217..a8854cf3 100644 --- a/src/db/models/instantiations/duplicate/pydantic/insert.py +++ b/src/db/models/instantiations/duplicate/pydantic/insert.py @@ -1,7 +1,11 @@ -from pydantic import BaseModel +from src.db.models.instantiations.duplicate.sqlalchemy import Duplicate +from src.db.templates.markers.bulk.insert import BulkInsertableModel -class DuplicateInsertInfo(BaseModel): +class DuplicateInsertInfo(BulkInsertableModel): original_url_id: int - duplicate_batch_id: int + batch_id: int + @classmethod + def sa_model(self) -> type[Duplicate]: + return Duplicate \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/clean.py b/src/util/clean.py similarity index 69% rename from src/core/tasks/url/operators/probe/queries/urls/not_probed/get/clean.py rename to src/util/clean.py index 3beae86a..874aa665 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/clean.py +++ b/src/util/clean.py @@ -5,5 +5,8 @@ def clean_url(url: str) -> str: url = url.replace("\u00A0", "") url = url.replace(" ", "") url = url.replace("%C2%A0", "") + + # Remove any fragments and everything after them + url = url.split("#")[0] return url diff --git a/tests/automated/integration/api/_helpers/RequestValidator.py b/tests/automated/integration/api/_helpers/RequestValidator.py index 33c3120d..afa19afe 100644 --- a/tests/automated/integration/api/_helpers/RequestValidator.py +++ b/tests/automated/integration/api/_helpers/RequestValidator.py @@ -37,7 +37,7 @@ from src.api.endpoints.task.dtos.get.task_status import GetTaskStatusResponseInfo from src.api.endpoints.url.get.dto import GetURLsResponseInfo from src.db.enums import TaskType -from src.collectors.source_collectors.example.dtos.input import ExampleInputDTO +from src.collectors.impl.example.dtos.input import ExampleInputDTO from src.collectors.enums import CollectorType from src.core.enums import BatchStatus from src.util.helper_functions import update_if_not_none diff --git a/tests/automated/integration/api/example_collector/__init__.py b/tests/automated/integration/api/example_collector/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/example_collector/test_error.py b/tests/automated/integration/api/example_collector/test_error.py new file mode 100644 index 00000000..39f0ede7 --- /dev/null +++ b/tests/automated/integration/api/example_collector/test_error.py @@ -0,0 +1,54 @@ +from unittest.mock import AsyncMock + +import pytest + +from src.api.endpoints.batch.dtos.get.logs import GetBatchLogsResponse +from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary +from src.collectors.impl.example.core import ExampleCollector +from src.collectors.impl.example.dtos.input import ExampleInputDTO +from src.core.enums import BatchStatus +from src.core.logger import AsyncCoreLogger +from src.db.client.async_ import AsyncDatabaseClient + + +@pytest.mark.asyncio +async def test_example_collector_error(api_test_helper, monkeypatch): + """ + Test that when an error occurs in a collector, the batch is properly update + """ + ath = api_test_helper + + logger = AsyncCoreLogger(adb_client=AsyncDatabaseClient(), flush_interval=1) + await logger.__aenter__() + ath.async_core.collector_manager.logger = logger + + # Patch the collector to raise an exception during run_implementation + mock = AsyncMock() + mock.side_effect = Exception("Collector failed!") + monkeypatch.setattr(ExampleCollector, 'run_implementation', mock) + + dto = ExampleInputDTO( + sleep_time=1 + ) + + data = ath.request_validator.example_collector( + dto=dto + ) + batch_id = data["batch_id"] + assert batch_id is not None + assert data["message"] == "Started example collector." + + await ath.wait_for_all_batches_to_complete() + + bi: BatchSummary = ath.request_validator.get_batch_info(batch_id=batch_id) + + assert bi.status == BatchStatus.ERROR + + # Check there are logs + assert not logger.log_queue.empty() + await logger.flush_all() + assert logger.log_queue.empty() + + gbl: GetBatchLogsResponse = ath.request_validator.get_batch_logs(batch_id=batch_id) + assert gbl.logs[-1].log == "Error: Collector failed!" + await logger.__aexit__(None, None, None) diff --git a/tests/automated/integration/api/test_example_collector.py b/tests/automated/integration/api/example_collector/test_happy_path.py similarity index 65% rename from tests/automated/integration/api/test_example_collector.py rename to tests/automated/integration/api/example_collector/test_happy_path.py index 2903c528..78d20dce 100644 --- a/tests/automated/integration/api/test_example_collector.py +++ b/tests/automated/integration/api/example_collector/test_happy_path.py @@ -1,5 +1,4 @@ import asyncio -from unittest.mock import AsyncMock import pytest @@ -8,8 +7,7 @@ from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary from src.db.client.async_ import AsyncDatabaseClient from src.db.models.instantiations.batch.pydantic import BatchInfo -from src.collectors.source_collectors.example.dtos.input import ExampleInputDTO -from src.collectors.source_collectors.example.core import ExampleCollector +from src.collectors.impl.example.dtos.input import ExampleInputDTO from src.collectors.enums import CollectorType from src.core.logger import AsyncCoreLogger from src.core.enums import BatchStatus @@ -95,48 +93,6 @@ async def test_example_collector(api_test_helper, monkeypatch): await logger.__aexit__(None, None, None) -@pytest.mark.asyncio -async def test_example_collector_error(api_test_helper, monkeypatch): - """ - Test that when an error occurs in a collector, the batch is properly update - """ - ath = api_test_helper - - logger = AsyncCoreLogger(adb_client=AsyncDatabaseClient(), flush_interval=1) - await logger.__aenter__() - ath.async_core.collector_manager.logger = logger - - # Patch the collector to raise an exception during run_implementation - mock = AsyncMock() - mock.side_effect = Exception("Collector failed!") - monkeypatch.setattr(ExampleCollector, 'run_implementation', mock) - - dto = ExampleInputDTO( - sleep_time=1 - ) - - data = ath.request_validator.example_collector( - dto=dto - ) - batch_id = data["batch_id"] - assert batch_id is not None - assert data["message"] == "Started example collector." - - await ath.wait_for_all_batches_to_complete() - - bi: BatchSummary = ath.request_validator.get_batch_info(batch_id=batch_id) - - assert bi.status == BatchStatus.ERROR - - # Check there are logs - assert not logger.log_queue.empty() - await logger.flush_all() - assert logger.log_queue.empty() - - gbl: GetBatchLogsResponse = ath.request_validator.get_batch_logs(batch_id=batch_id) - assert gbl.logs[-1].log == "Error: Collector failed!" - await logger.__aexit__(None, None, None) - diff --git a/tests/automated/integration/api/test_batch.py b/tests/automated/integration/api/test_batch.py index 07408ff0..fc140453 100644 --- a/tests/automated/integration/api/test_batch.py +++ b/tests/automated/integration/api/test_batch.py @@ -2,7 +2,7 @@ from src.db.models.instantiations.batch.pydantic import BatchInfo from src.db.dtos.url.insert import InsertURLsInfo -from src.collectors.source_collectors.example.dtos.input import ExampleInputDTO +from src.collectors.impl.example.dtos.input import ExampleInputDTO from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/conftest.py b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/conftest.py index 68e33158..b6787899 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/conftest.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/conftest.py @@ -2,7 +2,7 @@ import pytest -from src.collectors.source_collectors.muckrock.api_interface.core import MuckrockAPIInterface +from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.db.client.async_ import AsyncDatabaseClient diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_muckrock.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_muckrock.py index 87bc6614..80f92ec4 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_muckrock.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_muckrock.py @@ -2,9 +2,9 @@ import pytest -from src.collectors.source_collectors.muckrock.api_interface.core import MuckrockAPIInterface -from src.collectors.source_collectors.muckrock.api_interface.lookup_response import AgencyLookupResponse -from src.collectors.source_collectors.muckrock.enums import AgencyLookupResponseType +from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface +from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse +from src.collectors.impl.muckrock.enums import AgencyLookupResponseType from src.core.enums import SuggestionType from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock import MuckrockAgencyIdentificationSubtask diff --git a/tests/automated/integration/tasks/url/loader/conftest.py b/tests/automated/integration/tasks/url/loader/conftest.py index 1e5c69ae..814dd48a 100644 --- a/tests/automated/integration/tasks/url/loader/conftest.py +++ b/tests/automated/integration/tasks/url/loader/conftest.py @@ -2,7 +2,7 @@ import pytest -from src.collectors.source_collectors.muckrock.api_interface.core import MuckrockAPIInterface +from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.loader import URLTaskOperatorLoader from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser from src.db.client.async_ import AsyncDatabaseClient diff --git a/tests/automated/unit/source_collectors/test_autogoogler_collector.py b/tests/automated/unit/source_collectors/test_autogoogler_collector.py index 096ea3eb..99395476 100644 --- a/tests/automated/unit/source_collectors/test_autogoogler_collector.py +++ b/tests/automated/unit/source_collectors/test_autogoogler_collector.py @@ -2,18 +2,18 @@ import pytest -from src.collectors.source_collectors.auto_googler.dtos.query_results import GoogleSearchQueryResultsInnerDTO -from src.collectors.source_collectors.auto_googler.dtos.input import AutoGooglerInputDTO +from src.collectors.impl.auto_googler.dtos.query_results import GoogleSearchQueryResultsInnerDTO +from src.collectors.impl.auto_googler.dtos.input import AutoGooglerInputDTO from src.db.client.async_ import AsyncDatabaseClient from src.core.logger import AsyncCoreLogger -from src.collectors.source_collectors.auto_googler.collector import AutoGooglerCollector +from src.collectors.impl.auto_googler.collector import AutoGooglerCollector from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.pydantic.info import URLInfo @pytest.fixture def patch_get_query_results(monkeypatch): - patch_path = "src.collectors.source_collectors.auto_googler.searcher.GoogleSearcher.get_query_results" + patch_path = "src.collectors.impl.auto_googler.searcher.GoogleSearcher.get_query_results" mock = AsyncMock() mock.side_effect = [ [GoogleSearchQueryResultsInnerDTO(url="https://include.com/1", title="keyword", snippet="snippet 1"),], diff --git a/tests/automated/unit/source_collectors/test_common_crawl_collector.py b/tests/automated/unit/source_collectors/test_common_crawl_collector.py index 4e69d1ad..2757227b 100644 --- a/tests/automated/unit/source_collectors/test_common_crawl_collector.py +++ b/tests/automated/unit/source_collectors/test_common_crawl_collector.py @@ -2,17 +2,17 @@ import pytest -from src.collectors.source_collectors.common_crawler.input import CommonCrawlerInputDTO +from src.collectors.impl.common_crawler.input import CommonCrawlerInputDTO from src.db.client.async_ import AsyncDatabaseClient from src.core.logger import AsyncCoreLogger -from src.collectors.source_collectors.common_crawler.collector import CommonCrawlerCollector +from src.collectors.impl.common_crawler.collector import CommonCrawlerCollector from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.pydantic.info import URLInfo @pytest.fixture def mock_get_common_crawl_search_results(): - mock_path = "src.collectors.source_collectors.common_crawler.crawler.get_common_crawl_search_results" + mock_path = "src.collectors.impl.common_crawler.crawler.get_common_crawl_search_results" # Results contain other keys, but those are not relevant and thus # can be ignored mock_results = [ diff --git a/tests/automated/unit/source_collectors/test_example_collector.py b/tests/automated/unit/source_collectors/test_example_collector.py index d9d5b17a..632a6293 100644 --- a/tests/automated/unit/source_collectors/test_example_collector.py +++ b/tests/automated/unit/source_collectors/test_example_collector.py @@ -1,8 +1,8 @@ from unittest.mock import AsyncMock from src.db.client.sync import DatabaseClient -from src.collectors.source_collectors.example.dtos.input import ExampleInputDTO -from src.collectors.source_collectors.example.core import ExampleCollector +from src.collectors.impl.example.dtos.input import ExampleInputDTO +from src.collectors.impl.example.core import ExampleCollector from src.core.logger import AsyncCoreLogger diff --git a/tests/automated/unit/source_collectors/test_muckrock_collectors.py b/tests/automated/unit/source_collectors/test_muckrock_collectors.py index d0a10982..bb194d22 100644 --- a/tests/automated/unit/source_collectors/test_muckrock_collectors.py +++ b/tests/automated/unit/source_collectors/test_muckrock_collectors.py @@ -3,17 +3,17 @@ import pytest -from src.collectors.source_collectors.muckrock.collectors.county.core import MuckrockCountyLevelSearchCollector -from src.collectors.source_collectors.muckrock.collectors.simple.core import MuckrockSimpleSearchCollector +from src.collectors.impl.muckrock.collectors.county.core import MuckrockCountyLevelSearchCollector +from src.collectors.impl.muckrock.collectors.simple.core import MuckrockSimpleSearchCollector from src.db.client.async_ import AsyncDatabaseClient from src.core.logger import AsyncCoreLogger -from src.collectors.source_collectors.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO -from src.collectors.source_collectors.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO -from src.collectors.source_collectors.muckrock.fetch_requests.foia import FOIAFetchRequest +from src.collectors.impl.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO +from src.collectors.impl.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO +from src.collectors.impl.muckrock.fetch_requests.foia import FOIAFetchRequest from src.db.models.instantiations.url.core.enums import URLSource from src.db.models.instantiations.url.core.pydantic.info import URLInfo -PATCH_ROOT = "src.collectors.source_collectors.muckrock" +PATCH_ROOT = "src.collectors.impl.muckrock" @pytest.fixture def patch_muckrock_fetcher(monkeypatch): diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index fed9c970..2baac69c 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -285,7 +285,7 @@ def duplicate_urls(self, duplicate_batch_id: int, url_ids: list[int]): duplicate_infos = [] for url_id in url_ids: dup_info = DuplicateInsertInfo( - duplicate_batch_id=duplicate_batch_id, + batch_id=duplicate_batch_id, original_url_id=url_id ) duplicate_infos.append(dup_info) diff --git a/tests/helpers/patch_functions.py b/tests/helpers/patch_functions.py index 8a42c9dc..170a2062 100644 --- a/tests/helpers/patch_functions.py +++ b/tests/helpers/patch_functions.py @@ -4,7 +4,7 @@ async def block_sleep(monkeypatch) -> AwaitableBarrier: barrier = AwaitableBarrier() monkeypatch.setattr( - "src.collectors.source_collectors.example.core.ExampleCollector.sleep", + "src.collectors.impl.example.core.ExampleCollector.sleep", barrier ) return barrier diff --git a/tests/manual/agency_identifier/test_muckrock_api_interface.py b/tests/manual/agency_identifier/test_muckrock_api_interface.py index 1b809718..31fafa23 100644 --- a/tests/manual/agency_identifier/test_muckrock_api_interface.py +++ b/tests/manual/agency_identifier/test_muckrock_api_interface.py @@ -1,7 +1,7 @@ import pytest from aiohttp import ClientSession -from src.collectors.source_collectors.muckrock.api_interface.core import MuckrockAPIInterface +from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface @pytest.mark.asyncio diff --git a/tests/manual/core/lifecycle/test_ckan_lifecycle.py b/tests/manual/core/lifecycle/test_ckan_lifecycle.py index 37e71666..84c4c430 100644 --- a/tests/manual/core/lifecycle/test_ckan_lifecycle.py +++ b/tests/manual/core/lifecycle/test_ckan_lifecycle.py @@ -1,7 +1,7 @@ from src.db.models.instantiations.batch.pydantic import BatchInfo from src.collectors import CollectorType from src.core.enums import BatchStatus -from src.collectors.source_collectors.ckan import group_search, package_search, organization_search +from src.collectors.impl.ckan import group_search, package_search, organization_search from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion diff --git a/tests/manual/source_collectors/test_autogoogler_collector.py b/tests/manual/source_collectors/test_autogoogler_collector.py index 320434e1..39d1f8e7 100644 --- a/tests/manual/source_collectors/test_autogoogler_collector.py +++ b/tests/manual/source_collectors/test_autogoogler_collector.py @@ -2,10 +2,10 @@ import pytest -from src.collectors.source_collectors.auto_googler.dtos.input import AutoGooglerInputDTO +from src.collectors.impl.auto_googler.dtos.input import AutoGooglerInputDTO from src.core.env_var_manager import EnvVarManager from src.core.logger import AsyncCoreLogger -from src.collectors.source_collectors.auto_googler.collector import AutoGooglerCollector +from src.collectors.impl.auto_googler.collector import AutoGooglerCollector from src.db.client.async_ import AsyncDatabaseClient from environs import Env diff --git a/tests/manual/source_collectors/test_ckan_collector.py b/tests/manual/source_collectors/test_ckan_collector.py index bfe065dc..9b5edc9f 100644 --- a/tests/manual/source_collectors/test_ckan_collector.py +++ b/tests/manual/source_collectors/test_ckan_collector.py @@ -3,10 +3,10 @@ import pytest from marshmallow import Schema, fields -from src.collectors.source_collectors.ckan.collector import CKANCollector +from src.collectors.impl.ckan.collector import CKANCollector from src.core.logger import AsyncCoreLogger -from src.collectors.source_collectors.ckan import collector -from src.collectors.source_collectors.ckan.dtos.input import CKANInputDTO +from src.collectors.impl.ckan import collector +from src.collectors.impl.ckan.dtos.input import CKANInputDTO class CKANSchema(Schema): diff --git a/tests/manual/source_collectors/test_common_crawler_collector.py b/tests/manual/source_collectors/test_common_crawler_collector.py index 144bfc6e..e508c2ac 100644 --- a/tests/manual/source_collectors/test_common_crawler_collector.py +++ b/tests/manual/source_collectors/test_common_crawler_collector.py @@ -4,8 +4,8 @@ from marshmallow import Schema, fields from src.core.logger import AsyncCoreLogger -from src.collectors.source_collectors.common_crawler import collector -from src.collectors.source_collectors.common_crawler import CommonCrawlerInputDTO +from src.collectors.impl.common_crawler import collector +from src.collectors.impl.common_crawler import CommonCrawlerInputDTO class CommonCrawlerSchema(Schema): diff --git a/tests/manual/source_collectors/test_muckrock_collectors.py b/tests/manual/source_collectors/test_muckrock_collectors.py index caf2274c..d8153c6b 100644 --- a/tests/manual/source_collectors/test_muckrock_collectors.py +++ b/tests/manual/source_collectors/test_muckrock_collectors.py @@ -4,10 +4,10 @@ from marshmallow import Schema, fields from src.core.logger import AsyncCoreLogger -from src.collectors.source_collectors.muckrock.collectors.all_foia.dto import MuckrockAllFOIARequestsCollectorInputDTO -from src.collectors.source_collectors.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO -from src.collectors.source_collectors.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO -from src.collectors.source_collectors import MuckrockSimpleSearchCollector, \ +from src.collectors.impl.muckrock.collectors.all_foia.dto import MuckrockAllFOIARequestsCollectorInputDTO +from src.collectors.impl.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO +from src.collectors.impl.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO +from src.collectors.impl import MuckrockSimpleSearchCollector, \ MuckrockCountyLevelSearchCollector, MuckrockAllFOIARequestsCollector from src.db.client.async_ import AsyncDatabaseClient from tests.automated.integration.core.helpers.constants import ALLEGHENY_COUNTY_MUCKROCK_ID, \