Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
3 changes: 2 additions & 1 deletion ENV.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,10 @@ The following flags are available:
| `URL_404_PROBE_TASK_FLAG` | Probes URLs for 404 errors. |
| `URL_AUTO_RELEVANCE_TASK_FLAG` | Automatically assigns Relevances to URLs. |
| `URL_PROBE_TASK_FLAG` | Probes URLs for web metadata. |
| `URL_ROOT_URL_TASK_FLAG` | Extracts and links Root URLs to URLs. |
| `SYNC_AGENCIES_TASK_FLAG` | Synchronize agencies from Data Sources App. |
| `SYNC_DATA_SOURCES_TASK_FLAG` | Synchronize data sources from Data Sources App. |
| `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. |
| `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. |
| `POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG` | Populates the backlog snapshot. |
| `DELETE_OLD_LOGS_TASK_FLAG` | Deletes old logs. |
| `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. |
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""Refine root table logic

Revision ID: 49fd9f295b8d
Revises: 9a56916ea7d8
Create Date: 2025-08-12 08:19:08.170835

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa

from src.util.alembic_helpers import id_column, updated_at_column, url_id_column, created_at_column, switch_enum_type

# revision identifiers, used by Alembic.
revision: str = '49fd9f295b8d'
down_revision: Union[str, None] = '9a56916ea7d8'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

ROOT_URLS_TABLE_NAME = "root_urls"
ROOT_URL_CACHE_TABLE_NAME = "root_url_cache"

LINK_URLS_ROOT_URL_TABLE_NAME = "link_urls_root_url"
FLAG_ROOT_URL_TABLE_NAME = "flag_root_url"




def upgrade() -> None:
    """Apply the root-URL schema refinement.

    Steps, in order:

    1. Drop the ``root_url_cache`` table.
    2. Drop the ``root_urls`` table.
    3. Create the ``flag_root_url`` table.
    4. Create the ``link_urls_root_url`` table.
    5. Hand the ``task_type`` value list, now including ``'Root URL'``,
       to ``switch_enum_type``.

    Rows held in the two dropped tables are not copied anywhere by this
    migration.
    """
    _drop_root_url_cache()
    _drop_root_urls()
    _create_flag_root_url()
    _create_link_urls_root_url()
    _add_root_url_task_enum()


def downgrade() -> None:
    """Revert the root-URL schema refinement.

    Steps, in order:

    1. Recreate the ``root_url_cache`` table.
    2. Recreate the ``root_urls`` table.
    3. Drop the ``link_urls_root_url`` table.
    4. Drop the ``flag_root_url`` table.
    5. Hand the ``task_type`` value list without ``'Root URL'`` to
       ``switch_enum_type``.

    The recreated tables start empty; nothing here restores the rows that
    ``upgrade`` discarded.
    """
    _create_root_url_cache()
    _create_root_urls()
    _drop_link_urls_root_url()
    _drop_flag_root_url()
    _remove_root_url_task_enum()

def _add_root_url_task_enum():
    """Call ``switch_enum_type`` for ``tasks.task_type`` with 'Root URL' included."""
    # Full set of values for the ``task_type`` enum after this migration;
    # 'Root URL' is the only entry not present in the downgrade list.
    upgraded_task_types = [
        'HTML',
        'Relevancy',
        'Record Type',
        'Agency Identification',
        'Misc Metadata',
        'Submit Approved URLs',
        'Duplicate Detection',
        '404 Probe',
        'Sync Agencies',
        'Sync Data Sources',
        'Push to Hugging Face',
        'URL Probe',
        'Populate Backlog Snapshot',
        'Delete Old Logs',
        'Run URL Task Cycles',
        'Root URL',
    ]
    switch_enum_type(
        table_name='tasks',
        column_name='task_type',
        enum_name='task_type',
        new_enum_values=upgraded_task_types,
    )


def _remove_root_url_task_enum():
    """Call ``switch_enum_type`` for ``tasks.task_type`` with 'Root URL' left out."""
    # Values of the ``task_type`` enum as they stood before this migration.
    # NOTE(review): presumably this fails if any ``tasks`` row still uses
    # 'Root URL' -- confirm against ``switch_enum_type``.
    previous_task_types = [
        'HTML',
        'Relevancy',
        'Record Type',
        'Agency Identification',
        'Misc Metadata',
        'Submit Approved URLs',
        'Duplicate Detection',
        '404 Probe',
        'Sync Agencies',
        'Sync Data Sources',
        'Push to Hugging Face',
        'URL Probe',
        'Populate Backlog Snapshot',
        'Delete Old Logs',
        'Run URL Task Cycles',
    ]
    switch_enum_type(
        table_name='tasks',
        column_name='task_type',
        enum_name='task_type',
        new_enum_values=previous_task_types,
    )


def _drop_root_url_cache():
    """Drop the table named by ``ROOT_URL_CACHE_TABLE_NAME``."""
    op.drop_table(ROOT_URL_CACHE_TABLE_NAME)

def _drop_root_urls():
    """Drop the table named by ``ROOT_URLS_TABLE_NAME``."""
    op.drop_table(ROOT_URLS_TABLE_NAME)

def _create_root_url_cache():
    """Recreate the ``root_url_cache`` table (used by ``downgrade``).

    Columns: the project's ``id_column()``, a required ``url``, a required
    ``page_title``, a nullable ``page_description`` and the project's
    ``updated_at_column()``.  ``url`` is unique via the
    ``root_url_cache_uq_url`` constraint.
    """
    columns = (
        id_column(),
        sa.Column('url', sa.String(), nullable=False),
        sa.Column('page_title', sa.String(), nullable=False),
        sa.Column('page_description', sa.String(), nullable=True),
        updated_at_column(),
    )
    unique_url = sa.UniqueConstraint('url', name='root_url_cache_uq_url')
    op.create_table(ROOT_URL_CACHE_TABLE_NAME, *columns, unique_url)

def _create_root_urls():
    """Recreate the ``root_urls`` table (used by ``downgrade``).

    Columns: the project's ``id_column()``, a required ``url``, a required
    ``page_title``, a nullable ``page_description`` and the project's
    ``updated_at_column()``.  ``url`` is unique via the ``uq_root_url_url``
    constraint.
    """
    columns = (
        id_column(),
        sa.Column('url', sa.String(), nullable=False),
        sa.Column('page_title', sa.String(), nullable=False),
        sa.Column('page_description', sa.String(), nullable=True),
        updated_at_column(),
    )
    unique_url = sa.UniqueConstraint('url', name='uq_root_url_url')
    op.create_table(ROOT_URLS_TABLE_NAME, *columns, unique_url)

def _create_link_urls_root_url():
    """Create the ``link_urls_root_url`` table.

    Each row pairs a ``url_id`` with a ``root_url_id``; both columns come
    from the project's ``url_id_column`` helper.  The
    ``(url_id, root_url_id)`` pair is unique (unnamed constraint).
    """
    columns = (
        id_column(),
        url_id_column(),
        url_id_column('root_url_id'),
        created_at_column(),
        updated_at_column(),
    )
    unique_pair = sa.UniqueConstraint('url_id', 'root_url_id')
    op.create_table(LINK_URLS_ROOT_URL_TABLE_NAME, *columns, unique_pair)

def _drop_link_urls_root_url():
    """Drop the table named by ``LINK_URLS_ROOT_URL_TABLE_NAME``."""
    op.drop_table(LINK_URLS_ROOT_URL_TABLE_NAME)

def _create_flag_root_url():
    """Create the ``flag_root_url`` table.

    The table holds only ``url_id`` (from the project's ``url_id_column``
    helper) and a ``created_at`` column; ``url_id`` is the primary key.
    """
    columns = (
        url_id_column(),
        created_at_column(),
    )
    primary_key = sa.PrimaryKeyConstraint('url_id')
    op.create_table(FLAG_ROOT_URL_TABLE_NAME, *columns, primary_key)

def _drop_flag_root_url():
    """Drop the table named by ``FLAG_ROOT_URL_TABLE_NAME``."""
    op.drop_table(FLAG_ROOT_URL_TABLE_NAME)

Check warning on line 147 in alembic/versions/2025_08_12_0819-49fd9f295b8d_refine_root_table_logic.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_08_12_0819-49fd9f295b8d_refine_root_table_logic.py#L147 <292>

no newline at end of file
Raw output
./alembic/versions/2025_08_12_0819-49fd9f295b8d_refine_root_table_logic.py:147:44: W292 no newline at end of file
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo
from src.collectors.enums import URLStatus
from src.db.models.instantiations.link.batch_url import LinkBatchURL
from src.db.models.instantiations.url.core.sqlalchemy import URL
from src.db.models.impl.link.batch_url import LinkBatchURL
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.queries.base.builder import QueryBuilderBase
from src.db.statement_composer import StatementComposer
from src.db.types import UserSuggestionType
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
from src.collectors.enums import URLStatus
from src.core.enums import SuggestedStatus
from src.db.client.types import UserSuggestionModel
from src.db.models.instantiations.link.batch_url import LinkBatchURL
from src.db.models.instantiations.url.core.sqlalchemy import URL
from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion
from src.db.models.impl.link.batch_url import LinkBatchURL
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion
from src.db.queries.base.builder import QueryBuilderBase
from src.db.statement_composer import StatementComposer

Expand Down
12 changes: 6 additions & 6 deletions src/api/endpoints/annotate/agency/get/dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@

class GetNextURLForAgencyAgencyInfo(BaseModel):
suggestion_type: SuggestionType
pdap_agency_id: Optional[int] = None
agency_name: Optional[str] = None
state: Optional[str] = None
county: Optional[str] = None
locality: Optional[str] = None
pdap_agency_id: int | None = None
agency_name: str | None = None
state: str | None = None
county: str | None = None
locality: str | None = None

class GetNextURLForAgencyAnnotationInnerResponse(AnnotationInnerResponseInfoBase):
agency_suggestions: list[
GetNextURLForAgencyAgencyInfo
]

class GetNextURLForAgencyAnnotationResponse(BaseModel):
next_annotation: Optional[GetNextURLForAgencyAnnotationInnerResponse]
next_annotation: GetNextURLForAgencyAnnotationInnerResponse | None

Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo
from src.core.enums import SuggestionType
from src.db.models.instantiations.agency.sqlalchemy import Agency
from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion
from src.db.models.impl.agency.sqlalchemy import Agency
from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion
from src.db.queries.base.builder import QueryBuilderBase


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@
from src.core.enums import SuggestedStatus
from src.core.tasks.url.operators.html.scraper.parser.util import convert_to_response_html_info
from src.db.dtos.url.mapping import URLMapping
from src.db.models.instantiations.link.batch_url import LinkBatchURL
from src.db.models.instantiations.link.url_agency.sqlalchemy import LinkURLAgency
from src.db.models.instantiations.url.core.sqlalchemy import URL
from src.db.models.instantiations.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion
from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion
from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion
from src.db.models.impl.link.batch_url import LinkBatchURL
from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion
from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion
from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion
from src.db.queries.base.builder import QueryBuilderBase
from src.db.queries.implementations.core.get.html_content_info import GetHTMLContentInfoQueryBuilder

Expand Down
2 changes: 1 addition & 1 deletion src/api/endpoints/annotate/agency/post/dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@

class URLAgencyAnnotationPostInfo(BaseModel):
is_new: bool = False
suggested_agency: Optional[int] = None
suggested_agency: int | None = None
2 changes: 1 addition & 1 deletion src/api/endpoints/annotate/all/get/dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@


class GetNextURLForAllAnnotationResponse(BaseModel):
next_annotation: Optional[GetNextURLForAllAnnotationInnerResponse]
next_annotation: GetNextURLForAllAnnotationInnerResponse | None

Check warning on line 24 in src/api/endpoints/annotate/all/get/dto.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/annotate/all/get/dto.py#L24 <292>

no newline at end of file
Raw output
./src/api/endpoints/annotate/all/get/dto.py:24:68: W292 no newline at end of file
10 changes: 5 additions & 5 deletions src/api/endpoints/annotate/all/get/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@
from src.collectors.enums import URLStatus
from src.db.dto_converter import DTOConverter
from src.db.dtos.url.mapping import URLMapping
from src.db.models.instantiations.link.batch_url import LinkBatchURL
from src.db.models.instantiations.url.core.sqlalchemy import URL
from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion
from src.db.models.instantiations.url.suggestion.record_type.user import UserRecordTypeSuggestion
from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion
from src.db.models.impl.link.batch_url import LinkBatchURL
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion
from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion
from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion
from src.db.queries.base.builder import QueryBuilderBase
from src.db.statement_composer import StatementComposer

Expand Down
4 changes: 2 additions & 2 deletions src/api/endpoints/annotate/all/post/dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@

class AllAnnotationPostInfo(BaseModel):
suggested_status: SuggestedStatus
record_type: Optional[RecordType] = None
agency: Optional[URLAgencyAnnotationPostInfo] = None
record_type: RecordType | None = None
agency: URLAgencyAnnotationPostInfo | None = None

@model_validator(mode="after")
def allow_record_type_and_agency_only_if_relevant(self):
Expand Down
4 changes: 2 additions & 2 deletions src/api/endpoints/annotate/dtos/record_type/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
class GetNextRecordTypeAnnotationResponseInfo(
AnnotationInnerResponseInfoBase
):
suggested_record_type: Optional[RecordType] = Field(
suggested_record_type: RecordType | None = Field(
title="What record type, if any, the auto-labeler identified the URL as"
)

class GetNextRecordTypeAnnotationResponseOuterInfo(
BaseModel
):
next_annotation: Optional[GetNextRecordTypeAnnotationResponseInfo]
next_annotation: GetNextRecordTypeAnnotationResponseInfo | None
2 changes: 1 addition & 1 deletion src/api/endpoints/annotate/dtos/shared/base/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@ class AnnotationInnerResponseInfoBase(BaseModel):
html_info: ResponseHTMLInfo = Field(
title="HTML information about the URL"
)
batch_info: Optional[AnnotationBatchInfo] = Field(
batch_info: AnnotationBatchInfo | None = Field(
title="Information about the annotation batch"
)
2 changes: 1 addition & 1 deletion src/api/endpoints/annotate/relevance/get/dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ class GetNextRelevanceAnnotationResponseInfo(AnnotationInnerResponseInfoBase):
)

class GetNextRelevanceAnnotationResponseOuterInfo(BaseModel):
next_annotation: Optional[GetNextRelevanceAnnotationResponseInfo]
next_annotation: GetNextRelevanceAnnotationResponseInfo | None
6 changes: 3 additions & 3 deletions src/api/endpoints/annotate/relevance/get/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
RelevanceAnnotationResponseInfo
from src.db.dto_converter import DTOConverter
from src.db.dtos.url.mapping import URLMapping
from src.db.models.instantiations.url.core.sqlalchemy import URL
from src.db.models.instantiations.url.suggestion.agency.user import UserUrlAgencySuggestion
from src.db.models.instantiations.url.suggestion.relevant.user import UserRelevantSuggestion
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion
from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion
from src.db.queries.base.builder import QueryBuilderBase


Expand Down
16 changes: 8 additions & 8 deletions src/api/endpoints/annotate/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
async def get_next_url_for_relevance_annotation(
access_info: AccessInfo = Depends(get_access_info),
async_core: AsyncCore = Depends(get_async_core),
batch_id: Optional[int] = Query(
batch_id: int | None = Query(
description="The batch id of the next URL to get. "
"If not specified, defaults to first qualifying URL",
default=None),
Expand All @@ -48,7 +48,7 @@ async def annotate_url_for_relevance_and_get_next_url(
url_id: int = Path(description="The URL id to annotate"),
async_core: AsyncCore = Depends(get_async_core),
access_info: AccessInfo = Depends(get_access_info),
batch_id: Optional[int] = batch_query
batch_id: int | None = batch_query
) -> GetNextRelevanceAnnotationResponseOuterInfo:
"""
Post URL annotation and get next URL to annotate
Expand All @@ -67,7 +67,7 @@ async def annotate_url_for_relevance_and_get_next_url(
async def get_next_url_for_record_type_annotation(
access_info: AccessInfo = Depends(get_access_info),
async_core: AsyncCore = Depends(get_async_core),
batch_id: Optional[int] = batch_query
batch_id: int | None = batch_query
) -> GetNextRecordTypeAnnotationResponseOuterInfo:
return await async_core.get_next_url_for_record_type_annotation(
user_id=access_info.user_id,
Expand All @@ -80,7 +80,7 @@ async def annotate_url_for_record_type_and_get_next_url(
url_id: int = Path(description="The URL id to annotate"),
async_core: AsyncCore = Depends(get_async_core),
access_info: AccessInfo = Depends(get_access_info),
batch_id: Optional[int] = batch_query
batch_id: int | None = batch_query
) -> GetNextRecordTypeAnnotationResponseOuterInfo:
"""
Post URL annotation and get next URL to annotate
Expand All @@ -99,7 +99,7 @@ async def annotate_url_for_record_type_and_get_next_url(
async def get_next_url_for_agency_annotation(
access_info: AccessInfo = Depends(get_access_info),
async_core: AsyncCore = Depends(get_async_core),
batch_id: Optional[int] = batch_query
batch_id: int | None = batch_query
) -> GetNextURLForAgencyAnnotationResponse:
return await async_core.get_next_url_agency_for_annotation(
user_id=access_info.user_id,
Expand All @@ -112,7 +112,7 @@ async def annotate_url_for_agency_and_get_next_url(
agency_annotation_post_info: URLAgencyAnnotationPostInfo,
async_core: AsyncCore = Depends(get_async_core),
access_info: AccessInfo = Depends(get_access_info),
batch_id: Optional[int] = batch_query
batch_id: int | None = batch_query
) -> GetNextURLForAgencyAnnotationResponse:
"""
Post URL annotation and get next URL to annotate
Expand All @@ -131,7 +131,7 @@ async def annotate_url_for_agency_and_get_next_url(
async def get_next_url_for_all_annotations(
access_info: AccessInfo = Depends(get_access_info),
async_core: AsyncCore = Depends(get_async_core),
batch_id: Optional[int] = batch_query
batch_id: int | None = batch_query
) -> GetNextURLForAllAnnotationResponse:
return await async_core.get_next_url_for_all_annotations(
batch_id=batch_id
Expand All @@ -143,7 +143,7 @@ async def annotate_url_for_all_annotations_and_get_next_url(
all_annotation_post_info: AllAnnotationPostInfo,
async_core: AsyncCore = Depends(get_async_core),
access_info: AccessInfo = Depends(get_access_info),
batch_id: Optional[int] = batch_query
batch_id: int | None = batch_query
) -> GetNextURLForAllAnnotationResponse:
"""
Post URL annotation and get next URL to annotate
Expand Down
Loading