3 changes: 2 additions & 1 deletion src/core/env_var_manager.py
@@ -16,7 +16,8 @@ def __init__(self, env: dict = os.environ):
self.env = env
self._load()

-    def _load(self):
+    def _load(self) -> None:
+        """Load environment variables from environment"""

self.google_api_key = self.require_env("GOOGLE_API_KEY")
self.google_cse_id = self.require_env("GOOGLE_CSE_ID")
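Note on the helper used above: `require_env` isn't shown in this diff, but the call sites imply a fail-fast lookup, returning the value or raising if the variable is unset. A minimal sketch of that pattern (the helper below is hypothetical, not the project's actual implementation):

```python
import os


def require_env(key: str, env: dict = os.environ) -> str:
    """Hypothetical fail-fast lookup: return the value or raise if unset."""
    value = env.get(key)
    if value is None:
        raise RuntimeError(f"Required environment variable {key!r} is not set")
    return value


google_api_key = require_env("GOOGLE_API_KEY")  # raises early if missing
```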
16 changes: 8 additions & 8 deletions src/core/tasks/scheduled/loader.py
@@ -41,6 +41,14 @@ async def load_entries(self) -> list[ScheduledTaskEntry]:


return [
+            ScheduledTaskEntry(
+                operator=SyncDataSourcesTaskOperator(
+                    adb_client=self.async_core.adb_client,
+                    pdap_client=self.pdap_client
+                ),
+                interval=IntervalEnum.DAILY,
+                enabled=self.env.bool("SYNC_DATA_SOURCES_TASK_FLAG", default=True)
+            ),
ScheduledTaskEntry(
operator=RunURLTasksTaskOperator(async_core=self.async_core),
interval=IntervalEnum.HOURLY,
@@ -57,14 +65,6 @@ async def load_entries(self) -> list[ScheduledTaskEntry]:
interval=IntervalEnum.DAILY,
enabled=self.env.bool("POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG", default=True)
),
-            ScheduledTaskEntry(
-                operator=SyncDataSourcesTaskOperator(
-                    adb_client=self.async_core.adb_client,
-                    pdap_client=self.pdap_client
-                ),
-                interval=IntervalEnum.DAILY,
-                enabled=self.env.bool("SYNC_DATA_SOURCES_TASK_FLAG", default=True)
-            ),
ScheduledTaskEntry(
operator=SyncAgenciesTaskOperator(
adb_client=self.async_core.adb_client,
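The two hunks above only move the SyncDataSources entry to the head of the returned list, presumably so it registers ahead of the other daily tasks. Worth noting for operators: each entry's `enabled` flag comes from `env.bool(...)`, which looks like the `environs` package's API; if so, any task can be switched off per deployment without a code change. A short sketch, assuming `environs` is indeed the library behind `self.env`:

```python
from environs import Env

env = Env()
env.read_env()  # pick up a .env file if present

# Truthy strings ("1", "true", "yes") enable the task; the default
# keeps it enabled when the variable is absent.
sync_enabled = env.bool("SYNC_DATA_SOURCES_TASK_FLAG", default=True)
```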
31 changes: 25 additions & 6 deletions src/core/tasks/url/operators/html/scraper/parser/core.py
@@ -35,7 +35,12 @@
html_info: ResponseHTMLInfo,
parser_type: ParserTypeEnum,
html_content: str
-    ):
+    ) -> None:
+        """
+        Modifies:
+            html_info
+        """

soup = BeautifulSoup(
markup=html_content,
features=parser_type.value,
@@ -48,7 +53,7 @@
if soup.html is not None:
soup.html.decompose()

-    def get_div_text(self, soup):
+    def get_div_text(self, soup: BeautifulSoup) -> str:

[flake8] ./src/core/tasks/url/operators/html/scraper/parser/core.py:56:1: D102 Missing docstring in public method
div_text = ""
MAX_WORDS = 500
for div in soup.find_all("div"):
@@ -85,29 +90,43 @@
continue
setattr(html_info, header_tag, tag_content)

-    def get_html_title(self, soup: BeautifulSoup) -> Optional[str]:
+    def get_html_title(self, soup: BeautifulSoup) -> str | None:

[flake8] ./src/core/tasks/url/operators/html/scraper/parser/core.py:93:1: D102 Missing docstring in public method
if soup.title is None:
return None
if soup.title.string is None:
return None
return remove_excess_whitespace(soup.title.string)


-    def add_url_and_path(self, html_info: ResponseHTMLInfo, html_content: str, url: str):
+    def add_url_and_path(

[flake8] ./src/core/tasks/url/operators/html/scraper/parser/core.py:101:5: E303 too many blank lines (2)
+        self,
+        html_info: ResponseHTMLInfo,
+        html_content: str,

[flake8] ./src/core/tasks/url/operators/html/scraper/parser/core.py:104:9: U100 Unused argument 'html_content'
+        url: str
+    ) -> None:
+        """
+        Modifies:
+            html_info.url
+            html_info.url_path
+        """
url = add_https(url)
html_info.url = url

url_path = drop_hostname(url)
url_path = remove_trailing_backslash(url_path)
html_info.url_path = url_path

-    async def add_root_page_titles(self, html_info: ResponseHTMLInfo):
+    async def add_root_page_titles(self, html_info: ResponseHTMLInfo) -> None:
+        """
+        Modifies:
+            html_info.root_page_title
+        """
root_page_title = await self.root_url_cache.get_title(html_info.url)
html_info.root_page_title = remove_excess_whitespace(
root_page_title
)

-    def get_parser_type(self, content_type: str) -> ParserTypeEnum or None:
+    def get_parser_type(self, content_type: str) -> ParserTypeEnum | None:

[flake8] ./src/core/tasks/url/operators/html/scraper/parser/core.py:129:1: D102 Missing docstring in public method
try:
# If content type does not contain "html" or "xml" then we can assume that the content is unreadable
if "html" in content_type:
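One change above deserves a callout: `-> ParserTypeEnum or None:` becoming `-> ParserTypeEnum | None:` is a correctness fix, not cosmetics. Annotations are ordinary expressions, and `X or None` short-circuits to just `X`, so the old annotation silently claimed the method always returns a `ParserTypeEnum`. A quick illustration (the enum here is a stand-in, not the project's):

```python
from enum import Enum


class ParserTypeEnum(Enum):  # stand-in for the real enum
    HTML = "html.parser"
    XML = "xml"


print(ParserTypeEnum or None)  # <enum 'ParserTypeEnum'>; the None is lost
print(ParserTypeEnum | None)   # a real PEP 604 union: ParserTypeEnum | None
```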
8 changes: 5 additions & 3 deletions src/core/tasks/url/operators/html/scraper/parser/util.py
@@ -5,7 +5,9 @@
from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo


-def convert_to_response_html_info(html_content_infos: list[URLHTMLContentInfo]):
+def convert_to_response_html_info(

[flake8] ./src/core/tasks/url/operators/html/scraper/parser/util.py:8:1: D103 Missing docstring in public function
+    html_content_infos: list[URLHTMLContentInfo]
+) -> ResponseHTMLInfo:
response_html_info = ResponseHTMLInfo()

for html_content_info in html_content_infos:
@@ -32,12 +34,12 @@
return url


-def remove_trailing_backslash(url_path):
+def remove_trailing_backslash(url_path: str) -> str:

[flake8] ./src/core/tasks/url/operators/html/scraper/parser/util.py:37:1: D103 Missing docstring in public function
if url_path and url_path[-1] == "/":
url_path = url_path[:-1]
return url_path


-def drop_hostname(new_url):
+def drop_hostname(new_url: str) -> str:

[flake8] ./src/core/tasks/url/operators/html/scraper/parser/util.py:43:1: D103 Missing docstring in public function
url_path = urlparse(new_url).path[1:]
return url_path
src/core/tasks/url/operators/html/scraper/root_url_cache/core.py
@@ -12,19 +12,19 @@


class RootURLCache:
-    def __init__(self, adb_client: Optional[AsyncDatabaseClient] = None):
+    def __init__(self, adb_client: AsyncDatabaseClient | None = None):

[flake8] ./src/core/tasks/url/operators/html/scraper/root_url_cache/core.py:15:1: D107 Missing docstring in __init__
if adb_client is None:
adb_client = AsyncDatabaseClient()
self.adb_client = adb_client
self.cache = None

-    async def save_to_cache(self, url: str, title: str):
+    async def save_to_cache(self, url: str, title: str) -> None:

[flake8] ./src/core/tasks/url/operators/html/scraper/root_url_cache/core.py:21:1: D102 Missing docstring in public method
if url in self.cache:
return
self.cache[url] = title
await self.adb_client.add_to_root_url_cache(url=url, page_title=title)

-    async def get_from_cache(self, url: str) -> Optional[str]:
+    async def get_from_cache(self, url: str) -> str | None:

[flake8] ./src/core/tasks/url/operators/html/scraper/root_url_cache/core.py:27:1: D102 Missing docstring in public method
if self.cache is None:
self.cache = await self.adb_client.load_root_url_cache()

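A caveat on the cache above: `self.cache` starts as `None` and is only populated inside `get_from_cache`, so a `save_to_cache` call that arrives before any read would evaluate `url in self.cache` against `None` and raise a `TypeError`. Whether callers guarantee that ordering isn't visible in this diff; a defensive variant of the method (a sketch, not the PR's code) could load lazily in both paths:

```python
async def save_to_cache(self, url: str, title: str) -> None:
    # Mirror get_from_cache: load the cache on first touch,
    # so the call order of reads and writes no longer matters.
    if self.cache is None:
        self.cache = await self.adb_client.load_root_url_cache()
    if url in self.cache:
        return
    self.cache[url] = title
    await self.adb_client.add_to_root_url_cache(url=url, page_title=title)
```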
13 changes: 9 additions & 4 deletions src/core/tasks/url/operators/misc_metadata/core.py
@@ -22,16 +22,16 @@
super().__init__(adb_client)

@property
-    def task_type(self):
+    def task_type(self) -> TaskType:

[flake8] ./src/core/tasks/url/operators/misc_metadata/core.py:25:1: D102 Missing docstring in public method
return TaskType.MISC_METADATA

-    async def meets_task_prerequisites(self):
+    async def meets_task_prerequisites(self) -> bool:

[flake8] ./src/core/tasks/url/operators/misc_metadata/core.py:28:1: D102 Missing docstring in public method
return await self.adb_client.has_pending_urls_missing_miscellaneous_metadata()

async def get_subtask(
self,
collector_type: CollectorType
-    ) -> Optional[MiscellaneousMetadataSubtaskBase]:
+    ) -> MiscellaneousMetadataSubtaskBase | None:
match collector_type:
case CollectorType.MUCKROCK_SIMPLE_SEARCH:
return MuckrockMiscMetadataSubtask()
Expand All @@ -47,12 +47,17 @@
return None

async def html_default_logic(self, tdo: URLMiscellaneousMetadataTDO):
"""
Modifies:
tdo.name
tdo.description
"""
if tdo.name is None:
tdo.name = tdo.html_metadata_info.title
if tdo.description is None:
tdo.description = tdo.html_metadata_info.description

-    async def inner_task_logic(self):
+    async def inner_task_logic(self) -> None:

[flake8] ./src/core/tasks/url/operators/misc_metadata/core.py:60:1: D102 Missing docstring in public method
tdos: list[URLMiscellaneousMetadataTDO] = await self.adb_client.get_pending_urls_missing_miscellaneous_metadata()
await self.link_urls_to_task(url_ids=[tdo.url_id for tdo in tdos])

24 changes: 18 additions & 6 deletions src/core/tasks/url/operators/probe_404/core.py
@@ -26,13 +26,17 @@
self.url_request_interface = url_request_interface

@property
-    def task_type(self):
+    def task_type(self) -> TaskType:

[flake8] ./src/core/tasks/url/operators/probe_404/core.py:29:1: D102 Missing docstring in public method
return TaskType.PROBE_404

-    async def meets_task_prerequisites(self):
+    async def meets_task_prerequisites(self) -> bool:

[flake8] ./src/core/tasks/url/operators/probe_404/core.py:32:1: D102 Missing docstring in public method
return await self.adb_client.has_pending_urls_not_recently_probed_for_404()

-    async def probe_urls_for_404(self, tdos: list[URL404ProbeTDO]):
+    async def probe_urls_for_404(self, tdos: list[URL404ProbeTDO]) -> None:
+        """
+        Modifies:
+            URL404ProbeTDO.is_404
+        """
responses = await self.url_request_interface.make_simple_requests(
urls=[tdo.url for tdo in tdos]
)
Expand All @@ -42,7 +46,7 @@
tdo.is_404 = response.status == HTTPStatus.NOT_FOUND


-    async def inner_task_logic(self):
+    async def inner_task_logic(self) -> None:

[flake8] ./src/core/tasks/url/operators/probe_404/core.py:49:1: D102 Missing docstring in public method
[flake8] ./src/core/tasks/url/operators/probe_404/core.py:49:5: E303 too many blank lines (2)
tdos = await self.get_pending_urls_not_recently_probed_for_404()
url_ids = [task_info.url_id for task_info in tdos]
await self.link_urls_to_task(url_ids=url_ids)
Expand All @@ -55,9 +59,17 @@
async def get_pending_urls_not_recently_probed_for_404(self) -> list[URL404ProbeTDO]:
return await self.adb_client.get_pending_urls_not_recently_probed_for_404()

-    async def update_404s_in_database(self, url_ids_404: list[int]):
+    async def update_404s_in_database(self, url_ids_404: list[int]) -> None:
+        """
+        Modifies:
+            URL data in DB
+        """
await self.adb_client.mark_all_as_404(url_ids_404)

-    async def mark_as_recently_probed_for_404(self, url_ids: list[int]):
+    async def mark_as_recently_probed_for_404(self, url_ids: list[int]) -> None:
+        """
+        Modifies:
+            URL data in DB
+        """
await self.adb_client.mark_all_as_recently_probed_for_404(url_ids)

2 changes: 1 addition & 1 deletion src/db/helpers/connect.py
@@ -1,5 +1,5 @@
from src.core.env_var_manager import EnvVarManager


-def get_postgres_connection_string(is_async = False):
+def get_postgres_connection_string(is_async = False) -> str:

[flake8] ./src/db/helpers/connect.py:4:1: D103 Missing docstring in public function
[flake8] ./src/db/helpers/connect.py:4:44: E251 unexpected spaces around keyword / parameter equals
[flake8] ./src/db/helpers/connect.py:4:46: E251 unexpected spaces around keyword / parameter equals
return EnvVarManager.get().get_postgres_connection_string(is_async)
2 changes: 1 addition & 1 deletion src/db/models/helpers.py
@@ -1,7 +1,7 @@
from sqlalchemy import Column, TIMESTAMP, func, Integer, ForeignKey, Enum as SAEnum
from enum import Enum as PyEnum

-def get_created_at_column():
+def get_created_at_column() -> Column:

[flake8] ./src/db/models/helpers.py:4:1: D103 Missing docstring in public function
return Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT)


(file name not captured in this view)
@@ -41,7 +41,7 @@ def get_all(self) -> list[Any]:

async def _annotation_exists_case(
self,
-    ):
+    ) -> list[Any]:
cases = []
for model in ALL_ANNOTATION_MODELS:
cases.append(
15 changes: 13 additions & 2 deletions src/external/huggingface/hub/client.py
@@ -11,10 +11,21 @@ class HuggingFaceHubClient:
def __init__(self, token: str):
self.token = token

-    def _push_dataset_to_hub(self, repo_id: str, dataset: Dataset):
+    def _push_dataset_to_hub(self, repo_id: str, dataset: Dataset) -> None:
+        """
+        Modifies:
+            - repository on Hugging Face, identified by `repo_id`
+        """
dataset.push_to_hub(repo_id=repo_id, token=self.token)

-    def push_data_sources_raw_to_hub(self, outputs: list[GetForLoadingToHuggingFaceOutput]):
+    def push_data_sources_raw_to_hub(
+        self,
+        outputs: list[GetForLoadingToHuggingFaceOutput]
+    ) -> None:
+        """
+        Modifies:
+            - repository on Hugging Face, identified by `DATA_SOURCES_RAW_REPO_ID`
+        """
dataset = format_as_huggingface_dataset(outputs)
print(dataset)
self._push_dataset_to_hub(repo_id=DATA_SOURCES_RAW_REPO_ID, dataset=dataset)
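For context on the calls above: `push_to_hub` is the `datasets` library's upload API, and this client is a thin wrapper around it. A minimal end-to-end sketch (the repo id, token, and rows are placeholders, and `format_as_huggingface_dataset` presumably produces something along the lines of `Dataset.from_list`):

```python
from datasets import Dataset

# Placeholder rows standing in for format_as_huggingface_dataset(outputs).
rows = [{"url": "https://example.com/records", "name": "Example records"}]
dataset = Dataset.from_list(rows)

# Creates or updates the dataset repository on the Hub.
dataset.push_to_hub(repo_id="your-org/data-sources-raw", token="hf_...")
```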
9 changes: 5 additions & 4 deletions src/external/pdap/client.py
@@ -192,14 +192,15 @@ async def sync_data_sources(
)
headers = await self.access_manager.jwt_header()
headers['Content-Type'] = "application/json"
params_dict = {"page": params.page}
if params.cutoff_date is not None:
params_dict["updated_at"] = params.cutoff_date

request_info = RequestInfo(
type_=RequestType.GET,
url=url,
headers=headers,
-            params={
-                "page": params.page,
-                "updated_at": params.cutoff_date
-            }
+            params=params_dict
)
response_info = await self.access_manager.make_request(request_info)
return DataSourcesSyncResponseInfo(
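This hunk changes behavior, not just shape: previously the `updated_at` key was always sent, even when `params.cutoff_date` was `None` (which, depending on the HTTP client, is either rejected outright or serialized as a literal null); now the key is omitted unless a cutoff is set. The general pattern, as a standalone sketch:

```python
from typing import Any


def build_params(page: int, cutoff_date: str | None = None) -> dict[str, Any]:
    # Include optional filters only when they carry a value, so the
    # query string never contains something like "updated_at=None".
    params: dict[str, Any] = {"page": page}
    if cutoff_date is not None:
        params["updated_at"] = cutoff_date
    return params


assert build_params(1) == {"page": 1}
assert build_params(2, "2024-01-01") == {"page": 2, "updated_at": "2024-01-01"}
```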
6 changes: 3 additions & 3 deletions src/external/pdap/dtos/match_agency/post.py
@@ -6,6 +6,6 @@
class MatchAgencyInfo(BaseModel):
id: int
submitted_name: str
-    state: Optional[str] = None
-    county: Optional[str] = None
-    locality: Optional[str] = None
+    state: str | None = None
+    county: str | None = None
+    locality: str | None = None
6 changes: 3 additions & 3 deletions src/external/pdap/dtos/sync/agencies.py
@@ -6,9 +6,9 @@
class AgenciesSyncResponseInnerInfo(BaseModel):
display_name: str
agency_id: int
-    state_name: Optional[str]
-    county_name: Optional[str]
-    locality_name: Optional[str]
+    state_name: str | None
+    county_name: str | None
+    locality_name: str | None
updated_at: datetime.datetime

class AgenciesSyncResponseInfo(BaseModel):
2 changes: 1 addition & 1 deletion src/external/pdap/dtos/unique_url_duplicate.py
@@ -8,4 +8,4 @@
class UniqueURLDuplicateInfo(BaseModel):
original_url: str
approval_status: ApprovalStatus
-    rejection_note: Optional[str] = None
+    rejection_note: str | None = None
1 change: 0 additions & 1 deletion src/external/url_request/core.py
@@ -2,7 +2,6 @@

from src.external.url_request.dtos.url_response import URLResponseInfo
from src.external.url_request.probe.core import URLProbeManager
-from src.external.url_request.probe.models.response import URLProbeResponse
from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper
from src.external.url_request.request import fetch_urls

4 changes: 2 additions & 2 deletions src/external/url_request/probe/convert.py
@@ -53,6 +53,7 @@ def _extract_destination_url(cr: ClientResponse) -> str:
return str(cr.url)

def convert_client_response_to_probe_response(
+    url: str,
cr: ClientResponse
) -> URLProbeResponse | URLProbeRedirectResponsePair:
error = _extract_error(cr)
@@ -69,13 +70,12 @@ def convert_client_response_to_probe_response(
source_cr = cr.history[0] # Source CR is the first in the history
destination_cr = cr

-    source_url = str(source_cr.url)
destination_url = str(destination_cr.url)

source_error = _extract_error(source_cr)
source_content_type = _extract_content_type(source_cr, error=source_error)
source_probe_response = URLProbeResponse(
-        url=source_url,
+        url=url,
status_code=source_cr.status,
content_type=source_content_type,
error=source_error,
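Finally, threading `url` through `convert_client_response_to_probe_response` replaces the derived `source_url = str(source_cr.url)`. The likely motivation (not stated in the diff): aiohttp represents request URLs with `yarl`, which normalizes them as it parses, so `cr.history[0].url` can differ from the exact string the caller requested. Passing the caller's `url` keeps the source probe response keyed to the original input. A small illustration of the normalization:

```python
from yarl import URL

# yarl case-folds the host and percent-encodes unsafe characters,
# so the stored URL may not match the caller's original string.
print(str(URL("http://EXAMPLE.com/some path")))  # http://example.com/some%20path
```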