From 06dec6e474aab7fc68d3877a62c1f0185a66e629 Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Mon, 11 Aug 2025 16:46:25 -0400
Subject: [PATCH] Finesse new URL Probe logic

---
 src/core/env_var_manager.py                   |  3 +-
 src/core/tasks/scheduled/loader.py            | 16 +++----
 .../url/operators/html/scraper/parser/core.py | 31 ++++++++++---
 .../url/operators/html/scraper/parser/util.py |  8 ++--
 .../html/scraper/root_url_cache/core.py       |  6 +--
 .../tasks/url/operators/misc_metadata/core.py | 13 ++++--
 .../tasks/url/operators/probe_404/core.py     | 24 +++++++---
 src/db/helpers/connect.py                     |  2 +-
 src/db/models/helpers.py                      |  2 +-
 .../core/common/annotation_exists.py          |  2 +-
 src/external/huggingface/hub/client.py        | 15 +++++-
 src/external/pdap/client.py                   |  9 ++--
 src/external/pdap/dtos/match_agency/post.py   |  6 +--
 src/external/pdap/dtos/sync/agencies.py       |  6 +--
 .../pdap/dtos/unique_url_duplicate.py         |  2 +-
 src/external/url_request/core.py              |  1 -
 src/external/url_request/probe/convert.py     |  4 +-
 src/external/url_request/probe/core.py        | 39 ++++++++++++----
 src/security/manager.py                       |  4 +-
 src/util/clean.py                             |  4 +-
 src/util/db_manager.py                        | 46 -------------------
 src/util/helper_functions.py                  |  8 +++-
 src/util/miscellaneous_functions.py           |  4 +-
 tests/alembic/conftest.py                     | 12 +++--
 tests/alembic/helpers.py                      |  3 +-
 .../external/url_request/test_url_probe.py    |  2 +-
 26 files changed, 150 insertions(+), 122 deletions(-)
 delete mode 100644 src/util/db_manager.py

diff --git a/src/core/env_var_manager.py b/src/core/env_var_manager.py
index 98a78b69..cbf424ec 100644
--- a/src/core/env_var_manager.py
+++ b/src/core/env_var_manager.py
@@ -16,7 +16,8 @@ def __init__(self, env: dict = os.environ):
         self.env = env
         self._load()
 
-    def _load(self):
+    def _load(self) -> None:
+        """Load environment variables from environment"""
         self.google_api_key = self.require_env("GOOGLE_API_KEY")
         self.google_cse_id = self.require_env("GOOGLE_CSE_ID")
 
diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py
index b738a0c9..193a368f 100644
--- a/src/core/tasks/scheduled/loader.py
+++ b/src/core/tasks/scheduled/loader.py
@@ -41,6 +41,14 @@ async def load_entries(self) -> list[ScheduledTaskEntry]:
 
 
         return [
+            ScheduledTaskEntry(
+                operator=SyncDataSourcesTaskOperator(
+                    adb_client=self.async_core.adb_client,
+                    pdap_client=self.pdap_client
+                ),
+                interval=IntervalEnum.DAILY,
+                enabled=self.env.bool("SYNC_DATA_SOURCES_TASK_FLAG", default=True)
+            ),
             ScheduledTaskEntry(
                 operator=RunURLTasksTaskOperator(async_core=self.async_core),
                 interval=IntervalEnum.HOURLY,
@@ -57,14 +65,6 @@
                 interval=IntervalEnum.DAILY,
                 enabled=self.env.bool("POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG", default=True)
             ),
-            ScheduledTaskEntry(
-                operator=SyncDataSourcesTaskOperator(
-                    adb_client=self.async_core.adb_client,
-                    pdap_client=self.pdap_client
-                ),
-                interval=IntervalEnum.DAILY,
-                enabled=self.env.bool("SYNC_DATA_SOURCES_TASK_FLAG", default=True)
-            ),
             ScheduledTaskEntry(
                 operator=SyncAgenciesTaskOperator(
                     adb_client=self.async_core.adb_client,
diff --git a/src/core/tasks/url/operators/html/scraper/parser/core.py b/src/core/tasks/url/operators/html/scraper/parser/core.py
index a212b951..c209ba27 100644
--- a/src/core/tasks/url/operators/html/scraper/parser/core.py
+++ b/src/core/tasks/url/operators/html/scraper/parser/core.py
@@ -35,7 +35,12 @@ def add_html_from_beautiful_soup(
         html_info: ResponseHTMLInfo,
         parser_type: ParserTypeEnum,
         html_content: str
-    ):
+    ) -> None:
+        """
+        Modifies:
+            html_info
+        """
+
         soup = BeautifulSoup(
             markup=html_content,
             features=parser_type.value,
@@ -48,7 +53,7 @@ def add_html_from_beautiful_soup(
         if soup.html is not None:
             soup.html.decompose()
 
-    def get_div_text(self, soup):
+    def get_div_text(self, soup: BeautifulSoup) -> str:
         div_text = ""
         MAX_WORDS = 500
         for div in soup.find_all("div"):
@@ -85,7 +90,7 @@ def add_header_tags(self, html_info: ResponseHTMLInfo, soup: BeautifulSoup):
                 continue
             setattr(html_info, header_tag, tag_content)
 
-    def get_html_title(self, soup: BeautifulSoup) -> Optional[str]:
+    def get_html_title(self, soup: BeautifulSoup) -> str | None:
         if soup.title is None:
             return None
         if soup.title.string is None:
@@ -93,7 +98,17 @@
 
         return remove_excess_whitespace(soup.title.string)
 
-    def add_url_and_path(self, html_info: ResponseHTMLInfo, html_content: str, url: str):
+    def add_url_and_path(
+        self,
+        html_info: ResponseHTMLInfo,
+        html_content: str,
+        url: str
+    ) -> None:
+        """
+        Modifies:
+            html_info.url
+            html_info.url_path
+        """
         url = add_https(url)
         html_info.url = url
 
@@ -101,13 +116,17 @@ def add_url_and_path(self, html_info: ResponseHTMLInfo, html_content: str, url:
         url_path = remove_trailing_backslash(url_path)
         html_info.url_path = url_path
 
-    async def add_root_page_titles(self, html_info: ResponseHTMLInfo):
+    async def add_root_page_titles(self, html_info: ResponseHTMLInfo) -> None:
+        """
+        Modifies:
+            html_info.root_page_title
+        """
         root_page_title = await self.root_url_cache.get_title(html_info.url)
         html_info.root_page_title = remove_excess_whitespace(
             root_page_title
         )
 
-    def get_parser_type(self, content_type: str) -> ParserTypeEnum or None:
+    def get_parser_type(self, content_type: str) -> ParserTypeEnum | None:
         try:
             # If content type does not contain "html" or "xml" then we can assume that the content is unreadable
             if "html" in content_type:
diff --git a/src/core/tasks/url/operators/html/scraper/parser/util.py b/src/core/tasks/url/operators/html/scraper/parser/util.py
index a4ea2d1b..924506a1 100644
--- a/src/core/tasks/url/operators/html/scraper/parser/util.py
+++ b/src/core/tasks/url/operators/html/scraper/parser/util.py
@@ -5,7 +5,9 @@
 from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo
 
 
-def convert_to_response_html_info(html_content_infos: list[URLHTMLContentInfo]):
+def convert_to_response_html_info(
+    html_content_infos: list[URLHTMLContentInfo]
+) -> ResponseHTMLInfo:
     response_html_info = ResponseHTMLInfo()
 
     for html_content_info in html_content_infos:
@@ -32,12 +34,12 @@ def add_https(url: str) -> str:
     return url
 
 
-def remove_trailing_backslash(url_path):
+def remove_trailing_backslash(url_path: str) -> str:
     if url_path and url_path[-1] == "/":
         url_path = url_path[:-1]
     return url_path
 
 
-def drop_hostname(new_url):
+def drop_hostname(new_url: str) -> str:
     url_path = urlparse(new_url).path[1:]
     return url_path
diff --git a/src/core/tasks/url/operators/html/scraper/root_url_cache/core.py b/src/core/tasks/url/operators/html/scraper/root_url_cache/core.py
index 284ad678..1bf15638 100644
--- a/src/core/tasks/url/operators/html/scraper/root_url_cache/core.py
+++ b/src/core/tasks/url/operators/html/scraper/root_url_cache/core.py
@@ -12,19 +12,19 @@
 
 class RootURLCache:
 
-    def __init__(self, adb_client: Optional[AsyncDatabaseClient] = None):
+    def __init__(self, adb_client: AsyncDatabaseClient | None = None):
         if adb_client is None:
             adb_client = AsyncDatabaseClient()
         self.adb_client = adb_client
         self.cache = None
 
-    async def save_to_cache(self, url: str, title: str):
+    async def save_to_cache(self, url: str, title: str) -> None:
         if url in self.cache:
             return
         self.cache[url] = title
         await self.adb_client.add_to_root_url_cache(url=url, page_title=title)
 
-    async def get_from_cache(self, url: str) -> Optional[str]:
+    async def get_from_cache(self, url: str) -> str | None:
         if self.cache is None:
             self.cache = await self.adb_client.load_root_url_cache()
 
diff --git a/src/core/tasks/url/operators/misc_metadata/core.py b/src/core/tasks/url/operators/misc_metadata/core.py
index 9921846b..8e423c0e 100644
--- a/src/core/tasks/url/operators/misc_metadata/core.py
+++ b/src/core/tasks/url/operators/misc_metadata/core.py
@@ -22,16 +22,16 @@ def __init__(
         super().__init__(adb_client)
 
     @property
-    def task_type(self):
+    def task_type(self) -> TaskType:
         return TaskType.MISC_METADATA
 
-    async def meets_task_prerequisites(self):
+    async def meets_task_prerequisites(self) -> bool:
         return await self.adb_client.has_pending_urls_missing_miscellaneous_metadata()
 
     async def get_subtask(
         self,
         collector_type: CollectorType
-    ) -> Optional[MiscellaneousMetadataSubtaskBase]:
+    ) -> MiscellaneousMetadataSubtaskBase | None:
         match collector_type:
             case CollectorType.MUCKROCK_SIMPLE_SEARCH:
                 return MuckrockMiscMetadataSubtask()
@@ -47,12 +47,17 @@ async def get_subtask(
             return None
 
     async def html_default_logic(self, tdo: URLMiscellaneousMetadataTDO):
+        """
+        Modifies:
+            tdo.name
+            tdo.description
+        """
         if tdo.name is None:
             tdo.name = tdo.html_metadata_info.title
         if tdo.description is None:
             tdo.description = tdo.html_metadata_info.description
 
-    async def inner_task_logic(self):
+    async def inner_task_logic(self) -> None:
         tdos: list[URLMiscellaneousMetadataTDO] = await self.adb_client.get_pending_urls_missing_miscellaneous_metadata()
 
         await self.link_urls_to_task(url_ids=[tdo.url_id for tdo in tdos])
diff --git a/src/core/tasks/url/operators/probe_404/core.py b/src/core/tasks/url/operators/probe_404/core.py
index 6600d17d..ecfed6f5 100644
--- a/src/core/tasks/url/operators/probe_404/core.py
+++ b/src/core/tasks/url/operators/probe_404/core.py
@@ -26,13 +26,17 @@ def __init__(
         self.url_request_interface = url_request_interface
 
     @property
-    def task_type(self):
+    def task_type(self) -> TaskType:
         return TaskType.PROBE_404
 
-    async def meets_task_prerequisites(self):
+    async def meets_task_prerequisites(self) -> bool:
         return await self.adb_client.has_pending_urls_not_recently_probed_for_404()
 
-    async def probe_urls_for_404(self, tdos: list[URL404ProbeTDO]):
+    async def probe_urls_for_404(self, tdos: list[URL404ProbeTDO]) -> None:
+        """
+        Modifies:
+            URL404ProbeTDO.is_404
+        """
         responses = await self.url_request_interface.make_simple_requests(
             urls=[tdo.url for tdo in tdos]
         )
@@ -42,7 +46,7 @@
 
             tdo.is_404 = response.status == HTTPStatus.NOT_FOUND
 
-    async def inner_task_logic(self):
+    async def inner_task_logic(self) -> None:
         tdos = await self.get_pending_urls_not_recently_probed_for_404()
         url_ids = [task_info.url_id for task_info in tdos]
         await self.link_urls_to_task(url_ids=url_ids)
@@ -55,9 +59,17 @@
     async def get_pending_urls_not_recently_probed_for_404(self) -> list[URL404ProbeTDO]:
         return await self.adb_client.get_pending_urls_not_recently_probed_for_404()
 
-    async def update_404s_in_database(self, url_ids_404: list[int]):
+    async def update_404s_in_database(self, url_ids_404: list[int]) -> None:
+        """
+        Modifies:
+            URL data in DB
+        """
         await self.adb_client.mark_all_as_404(url_ids_404)
 
-    async def mark_as_recently_probed_for_404(self, url_ids: list[int]):
+    async def mark_as_recently_probed_for_404(self, url_ids: list[int]) -> None:
+        """
+        Modifies:
+            URL data in DB
+        """
         await self.adb_client.mark_all_as_recently_probed_for_404(url_ids)
 
diff --git a/src/db/helpers/connect.py b/src/db/helpers/connect.py
index 618b2e6d..2a15cba5 100644
--- a/src/db/helpers/connect.py
+++ b/src/db/helpers/connect.py
@@ -1,5 +1,5 @@
 from src.core.env_var_manager import EnvVarManager
 
 
-def get_postgres_connection_string(is_async = False):
+def get_postgres_connection_string(is_async = False) -> str:
     return EnvVarManager.get().get_postgres_connection_string(is_async)
diff --git a/src/db/models/helpers.py b/src/db/models/helpers.py
index f205f0b9..50f3d43e 100644
--- a/src/db/models/helpers.py
+++ b/src/db/models/helpers.py
@@ -1,7 +1,7 @@
 from sqlalchemy import Column, TIMESTAMP, func, Integer, ForeignKey, Enum as SAEnum
 from enum import Enum as PyEnum
 
-def get_created_at_column():
+def get_created_at_column() -> Column:
     return Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT)
 
 
diff --git a/src/db/queries/implementations/core/common/annotation_exists.py b/src/db/queries/implementations/core/common/annotation_exists.py
index 253d0b57..bb6bf57a 100644
--- a/src/db/queries/implementations/core/common/annotation_exists.py
+++ b/src/db/queries/implementations/core/common/annotation_exists.py
@@ -41,7 +41,7 @@ def get_all(self) -> list[Any]:
 
     async def _annotation_exists_case(
         self,
-    ):
+    ) -> list[Any]:
         cases = []
         for model in ALL_ANNOTATION_MODELS:
             cases.append(
diff --git a/src/external/huggingface/hub/client.py b/src/external/huggingface/hub/client.py
index 9cb2ba34..ef9d1cc7 100644
--- a/src/external/huggingface/hub/client.py
+++ b/src/external/huggingface/hub/client.py
@@ -11,10 +11,21 @@ class HuggingFaceHubClient:
     def __init__(self, token: str):
         self.token = token
 
-    def _push_dataset_to_hub(self, repo_id: str, dataset: Dataset):
+    def _push_dataset_to_hub(self, repo_id: str, dataset: Dataset) -> None:
+        """
+        Modifies:
+        - repository on Hugging Face, identified by `repo_id`
+        """
         dataset.push_to_hub(repo_id=repo_id, token=self.token)
 
-    def push_data_sources_raw_to_hub(self, outputs: list[GetForLoadingToHuggingFaceOutput]):
+    def push_data_sources_raw_to_hub(
+        self,
+        outputs: list[GetForLoadingToHuggingFaceOutput]
+    ) -> None:
+        """
+        Modifies:
+        - repository on Hugging Face, identified by `DATA_SOURCES_RAW_REPO_ID`
+        """
         dataset = format_as_huggingface_dataset(outputs)
         print(dataset)
         self._push_dataset_to_hub(repo_id=DATA_SOURCES_RAW_REPO_ID, dataset=dataset)
\ No newline at end of file
diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py
index 29f99154..0b2b9ed8 100644
--- a/src/external/pdap/client.py
+++ b/src/external/pdap/client.py
@@ -192,14 +192,15 @@ async def sync_data_sources(
         )
         headers = await self.access_manager.jwt_header()
         headers['Content-Type'] = "application/json"
+        params_dict = {"page": params.page}
+        if params.cutoff_date is not None:
+            params_dict["updated_at"] = params.cutoff_date
+
         request_info = RequestInfo(
             type_=RequestType.GET,
             url=url,
             headers=headers,
-            params={
-                "page": params.page,
-                "updated_at": params.cutoff_date
-            }
+            params=params_dict
         )
         response_info = await self.access_manager.make_request(request_info)
         return DataSourcesSyncResponseInfo(
diff --git a/src/external/pdap/dtos/match_agency/post.py b/src/external/pdap/dtos/match_agency/post.py
index 14870796..2be0b90e 100644
--- a/src/external/pdap/dtos/match_agency/post.py
+++ b/src/external/pdap/dtos/match_agency/post.py
@@ -6,6 +6,6 @@ class MatchAgencyInfo(BaseModel):
 
     id: int
     submitted_name: str
-    state: Optional[str] = None
-    county: Optional[str] = None
-    locality: Optional[str] = None
+    state: str | None = None
+    county: str | None = None
+    locality: str | None = None
diff --git a/src/external/pdap/dtos/sync/agencies.py b/src/external/pdap/dtos/sync/agencies.py
index 7f2b5ad0..99483107 100644
--- a/src/external/pdap/dtos/sync/agencies.py
+++ b/src/external/pdap/dtos/sync/agencies.py
@@ -6,9 +6,9 @@ class AgenciesSyncResponseInnerInfo(BaseModel):
 
     display_name: str
     agency_id: int
-    state_name: Optional[str]
-    county_name: Optional[str]
-    locality_name: Optional[str]
+    state_name: str | None
+    county_name: str | None
+    locality_name: str | None
     updated_at: datetime.datetime
 
 class AgenciesSyncResponseInfo(BaseModel):
diff --git a/src/external/pdap/dtos/unique_url_duplicate.py b/src/external/pdap/dtos/unique_url_duplicate.py
index 096622fe..51e327f1 100644
--- a/src/external/pdap/dtos/unique_url_duplicate.py
+++ b/src/external/pdap/dtos/unique_url_duplicate.py
@@ -8,4 +8,4 @@ class UniqueURLDuplicateInfo(BaseModel):
 
     original_url: str
     approval_status: ApprovalStatus
-    rejection_note: Optional[str] = None
+    rejection_note: str | None = None
diff --git a/src/external/url_request/core.py b/src/external/url_request/core.py
index 093fe1ab..2f37f90d 100644
--- a/src/external/url_request/core.py
+++ b/src/external/url_request/core.py
@@ -2,7 +2,6 @@
 
 from src.external.url_request.dtos.url_response import URLResponseInfo
 from src.external.url_request.probe.core import URLProbeManager
-from src.external.url_request.probe.models.response import URLProbeResponse
 from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper
 from src.external.url_request.request import fetch_urls
 
diff --git a/src/external/url_request/probe/convert.py b/src/external/url_request/probe/convert.py
index eafb7158..3b15268a 100644
--- a/src/external/url_request/probe/convert.py
+++ b/src/external/url_request/probe/convert.py
@@ -53,6 +53,7 @@ def _extract_destination_url(cr: ClientResponse) -> str:
     return str(cr.url)
 
 def convert_client_response_to_probe_response(
+    url: str,
     cr: ClientResponse
 ) -> URLProbeResponse | URLProbeRedirectResponsePair:
     error = _extract_error(cr)
@@ -69,13 +70,12 @@
     source_cr = cr.history[0] # Source CR is the first in the history
     destination_cr = cr
 
-    source_url = str(source_cr.url)
     destination_url = str(destination_cr.url)
 
     source_error = _extract_error(source_cr)
     source_content_type = _extract_content_type(source_cr, error=source_error)
     source_probe_response = URLProbeResponse(
-        url=source_url,
+        url=url,
         status_code=source_cr.status,
         content_type=source_content_type,
         error=source_error,
diff --git a/src/external/url_request/probe/core.py b/src/external/url_request/probe/core.py
index a6eb9b99..f196e6fb 100644
--- a/src/external/url_request/probe/core.py
+++ b/src/external/url_request/probe/core.py
@@ -1,11 +1,12 @@
+import asyncio.exceptions
 from http import HTTPStatus
 
 from aiohttp import ClientSession, InvalidUrlClientError, ClientConnectorSSLError, ClientConnectorDNSError, \
-    ClientConnectorCertificateError, ClientResponseError, ClientConnectorError
+    ClientConnectorCertificateError, ClientResponseError, ClientConnectorError, TooManyRedirects, ClientOSError
+from pydantic import ValidationError
 from tqdm.asyncio import tqdm_asyncio
 
 from src.external.url_request.probe.convert import convert_client_response_to_probe_response, convert_to_error_response
-from src.external.url_request.probe.models.response import URLProbeResponse
 from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper
 
 
@@ -18,7 +19,10 @@ def __init__(
    ):
         self.session = session
 
     async def probe_urls(self, urls: list[str]) -> list[URLProbeResponseOuterWrapper]:
-        return await tqdm_asyncio.gather(*[self._probe(url) for url in urls])
+        return await tqdm_asyncio.gather(
+            *[self._probe(url) for url in urls],
+            timeout=60 * 10 # 10 minutes
+        )
 
     async def _probe(self, url: str) -> URLProbeResponseOuterWrapper:
         try:
@@ -36,17 +40,28 @@ async def _probe(self, url: str) -> URLProbeResponseOuterWrapper:
             ClientConnectorCertificateError
         ) as e:
             return convert_to_error_response(url, error=str(e))
-
-
-
+        except asyncio.exceptions.TimeoutError:
+            return convert_to_error_response(url, error="Timeout Error")
+        except ValidationError as e:
+            raise ValueError(f"Validation Error for {url}.") from e
+        except ClientOSError as e:
+            return convert_to_error_response(url, error=f"Client OS Error: {e.errno}. {str(e)}")
 
     async def _head(self, url: str) -> URLProbeResponseOuterWrapper:
         try:
             async with self.session.head(url, allow_redirects=True) as response:
                 return URLProbeResponseOuterWrapper(
                     original_url=url,
-                    response=convert_client_response_to_probe_response(response)
+                    response=convert_client_response_to_probe_response(
+                        url,
+                        response
+                    )
                 )
+        except TooManyRedirects:
+            return convert_to_error_response(
+                url,
+                error="Too many redirects (> 10)",
+            )
         except ClientResponseError as e:
             return convert_to_error_response(
                 url,
@@ -59,8 +74,16 @@ async def _get(self, url: str) -> URLProbeResponseOuterWrapper:
             async with self.session.get(url, allow_redirects=True) as response:
                 return URLProbeResponseOuterWrapper(
                     original_url=url,
-                    response=convert_client_response_to_probe_response(response)
+                    response=convert_client_response_to_probe_response(
+                        url,
+                        response
+                    )
                 )
+        except TooManyRedirects:
+            return convert_to_error_response(
+                url,
+                error="Too many redirects (> 10)",
+            )
         except ClientResponseError as e:
             return convert_to_error_response(
                 url,
diff --git a/src/security/manager.py b/src/security/manager.py
index 97bc0da8..16f0519e 100644
--- a/src/security/manager.py
+++ b/src/security/manager.py
@@ -16,9 +16,7 @@
 
 class SecurityManager:
 
-    def __init__(
-        self
-    ):
+    def __init__(self):
         dotenv.load_dotenv()
 
         self.secret_key = os.getenv("DS_APP_SECRET_KEY")
diff --git a/src/util/clean.py b/src/util/clean.py
index 874aa665..3c0a0f92 100644
--- a/src/util/clean.py
+++ b/src/util/clean.py
@@ -2,9 +2,7 @@
 
 def clean_url(url: str) -> str:
     # Remove Non-breaking spaces
-    url = url.replace("\u00A0", "")
-    url = url.replace(" ", "")
-    url = url.replace("%C2%A0", "")
+    url = url.strip(" ")
 
     # Remove any fragments and everything after them
     url = url.split("#")[0]
diff --git a/src/util/db_manager.py b/src/util/db_manager.py
deleted file mode 100644
index b03708a0..00000000
--- a/src/util/db_manager.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import os
-
-import psycopg2
-from dotenv import load_dotenv
-
-
-class DBManager:
-
-    def __init__(self, db_name, user, password, host, port):
-        self.conn = psycopg2.connect(
-            dbname=db_name,
-            user=user,
-            password=password,
-            host=host,
-            port=port
-        )
-        self.cursor = self.conn.cursor()
-
-    def __del__(self):
-        self.conn.close()
-
-    def execute(self, query, params=None):
-        self.cursor.execute(query, params)
-        self.conn.commit()
-        return self.cursor.fetchall()
-
-    def fetchall(self):
-        return self.cursor.fetchall()
-
-    def fetchone(self):
-        return self.cursor.fetchone()
-
-    def fetchmany(self, size):
-        return self.cursor.fetchmany(size)
-
-    def close(self):
-        self.conn.close()
-
-
-if __name__ == "__main__":
-    # Note: This is test code to evaluate whether the connection url works. Will be removed in final version.
-    load_dotenv()
-    conn_url = os.getenv("DIGITAL_OCEAN_DB_CONNECTION_URL")
-    conn = psycopg2.connect(conn_url)
-
-    pass
\ No newline at end of file
diff --git a/src/util/helper_functions.py b/src/util/helper_functions.py
index deb6830b..4e33985f 100644
--- a/src/util/helper_functions.py
+++ b/src/util/helper_functions.py
@@ -16,7 +16,7 @@ def get_project_root(marker_files=(".project-root",)) -> Path:
 def project_path(*parts: str) -> Path:
     return get_project_root().joinpath(*parts)
 
-def get_enum_values(enum: Type[Enum]):
+def get_enum_values(enum: Type[Enum]) -> list[str]:
     return [item.value for item in enum]
 
 def get_from_env(key: str, allow_none: bool = False):
@@ -42,7 +42,11 @@ def load_from_environment(keys: list[str]) -> dict[str, str]:
 def base_model_list_dump(model_list: list[BaseModel]) -> list[dict]:
     return [model.model_dump() for model in model_list]
 
-def update_if_not_none(target: dict, source: dict):
+def update_if_not_none(target: dict, source: dict) -> None:
+    """
+    Modifies:
+        target
+    """
     for key, value in source.items():
         if value is not None:
             target[key] = value
\ No newline at end of file
diff --git a/src/util/miscellaneous_functions.py b/src/util/miscellaneous_functions.py
index 4b0bc88b..88e7a6a7 100644
--- a/src/util/miscellaneous_functions.py
+++ b/src/util/miscellaneous_functions.py
@@ -16,8 +16,8 @@ def create_directories_if_not_exist(file_path: str):
     Create directories if they don't exist
     Args:
         file_path:
-
-    Returns:
+    Modifies:
+        file_path
 
     """
     directory = os.path.dirname(file_path)
diff --git a/tests/alembic/conftest.py b/tests/alembic/conftest.py
index f50dee14..e8c5dc9f 100644
--- a/tests/alembic/conftest.py
+++ b/tests/alembic/conftest.py
@@ -1,6 +1,8 @@
+from typing import Any, Generator
+
 import pytest
 from alembic.config import Config
-from sqlalchemy import create_engine, inspect, MetaData
+from sqlalchemy import create_engine, inspect, MetaData, Engine, Connection
 from sqlalchemy.orm import scoped_session, sessionmaker
 
 from src.db.helpers.connect import get_postgres_connection_string
@@ -8,27 +10,27 @@
 
 
 @pytest.fixture()
-def alembic_config():
+def alembic_config() -> Generator[Config, Any, None]:
     alembic_cfg = Config("alembic.ini")
     yield alembic_cfg
 
 
 @pytest.fixture()
-def db_engine():
+def db_engine() -> Generator[Engine, Any, None]:
     engine = create_engine(get_postgres_connection_string())
     yield engine
     engine.dispose()
 
 
 @pytest.fixture()
-def connection(db_engine):
+def connection(db_engine) -> Generator[Connection, Any, None]:
     connection = db_engine.connect()
     yield connection
     connection.close()
 
 
 @pytest.fixture()
-def alembic_runner(connection, alembic_config) -> AlembicRunner:
+def alembic_runner(connection, alembic_config) -> Generator[AlembicRunner, Any, None]:
     alembic_config.attributes["connection"] = connection
     alembic_config.set_main_option(
         "sqlalchemy.url",
diff --git a/tests/alembic/helpers.py b/tests/alembic/helpers.py
index 96e7f62a..b835c7a9 100644
--- a/tests/alembic/helpers.py
+++ b/tests/alembic/helpers.py
@@ -14,8 +14,7 @@ def table_creation_check(
     tables: list[str],
     end_revision: str,
     start_revision: Optional[str] = None,
-
-):
+) -> None:
     if start_revision is not None:
         alembic_runner.upgrade(start_revision)
     for table_name in tables:
diff --git a/tests/manual/external/url_request/test_url_probe.py b/tests/manual/external/url_request/test_url_probe.py
index b987aa45..b2ec71f2 100644
--- a/tests/manual/external/url_request/test_url_probe.py
+++ b/tests/manual/external/url_request/test_url_probe.py
@@ -3,7 +3,7 @@
 from src.external.url_request.probe.core import URLProbeManager
 
 URLS = [
-'https://citydocs.longbeach.gov/LBPDPublicDocs/DocView.aspx?id=162830&dbid=0&repo=LBPD-PUBDOCS%C2%A0'
+'https://www.opendataphilly.org/dataset?q=crime+map&sort=score+desc%2C+metadata_modified+desc'
 # "https://tableau.alleghenycounty.us/t/PublicSite/views/PublicBudgetDashboard_17283931835700/OperatingOverview?%3Aembed=y&%3AisGuestRedirectFromVizportal=y"
 # "data.austintexas.gov/resource/sc6h-qr9f.json"
 # "https://albanyoregon.gov/police/crime/statistics-crime-analysis",
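
-- 
Usage sketch (reviewer note, not part of the patch): a minimal driver for the
reworked probe API, assuming URLProbeManager is constructed with the session
stored in the __init__ shown above; the main() coroutine, per-request timeout,
and sample URL are illustrative only.

    import asyncio

    from aiohttp import ClientSession, ClientTimeout

    from src.external.url_request.probe.core import URLProbeManager


    async def main() -> None:
        # probe_urls() already bounds the whole batch at 10 minutes via
        # tqdm_asyncio.gather(timeout=...); this per-request timeout is an
        # assumed extra safeguard, not something the patch configures.
        async with ClientSession(timeout=ClientTimeout(total=30)) as session:
            manager = URLProbeManager(session=session)
            wrappers = await manager.probe_urls(["https://example.com"])
            for wrapper in wrappers:
                # Each wrapper pairs the original URL with its probe (or
                # error) response, per the models in probe/models/.
                print(wrapper.original_url, wrapper.response)


    if __name__ == "__main__":
        asyncio.run(main())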