Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions ENV.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@ Please ensure these are properly defined in a `.env` file in the root directory.

[^1:] The user account in question will require elevated permissions to access certain endpoints. At a minimum, the user will require the `source_collector` and `db_write` permissions.

# Variables With Defaults

The following environment variables have default values that will be used if not otherwise defined.

| Variable | Description | Default |
|-------------------------------|------------------------------------------------------------------|---------|
| `URL_TASKS_FREQUENCY_MINUTES` | The frequency for the `RUN_URL_TASKS` Scheduled Task, in minutes | `60` |

# Flags

Flags are used to enable/disable certain features. They are set to `1` to enable the feature and `0` to disable the feature. By default, all flags are enabled.
Expand Down Expand Up @@ -77,6 +85,7 @@ URL Task Flags are collectively controlled by the `RUN_URL_TASKS_TASK_FLAG` flag
| `URL_AUTO_RELEVANCE_TASK_FLAG` | Automatically assigns Relevances to URLs. |
| `URL_PROBE_TASK_FLAG` | Probes URLs for web metadata. |
| `URL_ROOT_URL_TASK_FLAG` | Extracts and links Root URLs to URLs. |
| `URL_SCREENSHOT_TASK_FLAG` | Takes screenshots of URLs. |

### Agency ID Subtasks

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""Create url screenshot task

Revision ID: e7189dc92a83
Revises: 70baaee0dd79
Create Date: 2025-09-12 20:40:45.950204

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa

from src.util.alembic_helpers import switch_enum_type, id_column, url_id_column, created_at_column, updated_at_column

Check warning on line 13 in alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py#L13 <401>

'src.util.alembic_helpers.id_column' imported but unused
Raw output
./alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py:13:1: F401 'src.util.alembic_helpers.id_column' imported but unused

# revision identifiers, used by Alembic.
revision: str = 'e7189dc92a83'
down_revision: Union[str, None] = '70baaee0dd79'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

URL_SCREENSHOT_TABLE_NAME = "url_screenshot"
SCREENSHOT_ERROR_TABLE_NAME = "error_url_screenshot"



def upgrade() -> None:

Check warning on line 26 in alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py#L26 <103>

Missing docstring in public function
Raw output
./alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py:26:1: D103 Missing docstring in public function

Check failure on line 26 in alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py#L26 <303>

too many blank lines (3)
Raw output
./alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py:26:1: E303 too many blank lines (3)
_add_url_screenshot_task()
_add_url_screenshot_table()
_add_screenshot_error_table()



def downgrade() -> None:

Check warning on line 33 in alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py#L33 <103>

Missing docstring in public function
Raw output
./alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py:33:1: D103 Missing docstring in public function

Check failure on line 33 in alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py#L33 <303>

too many blank lines (3)
Raw output
./alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py:33:1: E303 too many blank lines (3)
_remove_url_screenshot_task()
_remove_url_screenshot_table()
_remove_screenshot_error_table()


def _add_screenshot_error_table():
op.create_table(
SCREENSHOT_ERROR_TABLE_NAME,
url_id_column(),
sa.Column('error', sa.String(), nullable=False),
created_at_column(),
sa.PrimaryKeyConstraint('url_id')
)


def _add_url_screenshot_table():
op.create_table(
URL_SCREENSHOT_TABLE_NAME,
url_id_column(),
sa.Column('content', sa.LargeBinary(), nullable=False),
sa.Column('file_size', sa.Integer(), nullable=False),
created_at_column(),
updated_at_column(),
sa.UniqueConstraint('url_id', name='uq_url_id_url_screenshot')
)


def _remove_url_screenshot_table():
op.drop_table(URL_SCREENSHOT_TABLE_NAME)


def _remove_screenshot_error_table():
op.drop_table(SCREENSHOT_ERROR_TABLE_NAME)


def _add_url_screenshot_task():
switch_enum_type(
table_name='tasks',
column_name='task_type',
enum_name='task_type',
new_enum_values=[
'HTML',
'Relevancy',
'Record Type',
'Agency Identification',
'Misc Metadata',
'Submit Approved URLs',
'Duplicate Detection',
'404 Probe',
'Sync Agencies',
'Sync Data Sources',
'Push to Hugging Face',
'URL Probe',
'Populate Backlog Snapshot',
'Delete Old Logs',
'Run URL Task Cycles',
'Root URL',
'Internet Archives Probe',
'Internet Archives Archive',
'Screenshot'
]
)

def _remove_url_screenshot_task():
switch_enum_type(
table_name='tasks',
column_name='task_type',
enum_name='task_type',
new_enum_values=[
'HTML',
'Relevancy',
'Record Type',
'Agency Identification',
'Misc Metadata',
'Submit Approved URLs',
'Duplicate Detection',
'404 Probe',
'Sync Agencies',
'Sync Data Sources',
'Push to Hugging Face',
'URL Probe',
'Populate Backlog Snapshot',
'Delete Old Logs',
'Run URL Task Cycles',
'Root URL',
'Internet Archives Probe',
'Internet Archives Archive'
]
)

Check warning on line 122 in alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py#L122 <292>

no newline at end of file
Raw output
./alembic/versions/2025_09_12_2040-e7189dc92a83_create_url_screenshot_task.py:122:6: W292 no newline at end of file
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ dependencies = [
"marshmallow~=3.23.2",
"openai~=1.60.1",
"pdap-access-manager==0.3.6",
"pillow>=11.3.0",
"pip>=25.2",
"playwright~=1.49.1",
"psycopg2-binary~=2.9.6",
Expand Down
4 changes: 2 additions & 2 deletions src/api/endpoints/task/by_id/dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pydantic import BaseModel

from src.db.models.impl.url.core.pydantic.info import URLInfo
from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo
from src.db.models.impl.url.error_info.pydantic import URLErrorInfoPydantic
from src.db.enums import TaskType
from src.core.enums import BatchStatus

Expand All @@ -15,4 +15,4 @@
updated_at: datetime.datetime
error_info: str | None = None
urls: list[URLInfo]
url_errors: list[URLErrorPydanticInfo]
url_errors: list[URLErrorInfoPydantic]

Check warning on line 18 in src/api/endpoints/task/by_id/dto.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/task/by_id/dto.py#L18 <292>

no newline at end of file
Raw output
./src/api/endpoints/task/by_id/dto.py:18:43: W292 no newline at end of file
4 changes: 2 additions & 2 deletions src/api/endpoints/task/by_id/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from src.collectors.enums import URLStatus
from src.core.enums import BatchStatus
from src.db.models.impl.url.core.pydantic.info import URLInfo
from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo
from src.db.models.impl.url.error_info.pydantic import URLErrorInfoPydantic
from src.db.enums import TaskType
from src.db.models.impl.task.core import Task
from src.db.models.impl.url.core.sqlalchemy import URL
Expand Down Expand Up @@ -50,7 +50,7 @@ async def run(self, session: AsyncSession) -> TaskInfo:

errored_urls = []
for url in task.errored_urls:
url_error_info = URLErrorPydanticInfo(
url_error_info = URLErrorInfoPydantic(
task_id=url.task_id,
url_id=url.url_id,
error=url.error,
Expand Down
Empty file.
Empty file.
28 changes: 28 additions & 0 deletions src/api/endpoints/url/by_id/screenshot/query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from typing import Any

Check warning on line 1 in src/api/endpoints/url/by_id/screenshot/query.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/url/by_id/screenshot/query.py#L1 <100>

Missing docstring in public module
Raw output
./src/api/endpoints/url/by_id/screenshot/query.py:1:1: D100 Missing docstring in public module

Check warning on line 1 in src/api/endpoints/url/by_id/screenshot/query.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/url/by_id/screenshot/query.py#L1 <401>

'typing.Any' imported but unused
Raw output
./src/api/endpoints/url/by_id/screenshot/query.py:1:1: F401 'typing.Any' imported but unused

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from src.db.models.impl.url.screenshot.sqlalchemy import URLScreenshot
from src.db.queries.base.builder import QueryBuilderBase

from src.db.helpers.session import session_helper as sh

class GetURLScreenshotQueryBuilder(QueryBuilderBase):

Check warning on line 11 in src/api/endpoints/url/by_id/screenshot/query.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/url/by_id/screenshot/query.py#L11 <101>

Missing docstring in public class
Raw output
./src/api/endpoints/url/by_id/screenshot/query.py:11:1: D101 Missing docstring in public class

def __init__(self, url_id: int):

Check warning on line 13 in src/api/endpoints/url/by_id/screenshot/query.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/url/by_id/screenshot/query.py#L13 <107>

Missing docstring in __init__
Raw output
./src/api/endpoints/url/by_id/screenshot/query.py:13:1: D107 Missing docstring in __init__
super().__init__()
self.url_id = url_id

async def run(self, session: AsyncSession) -> bytes | None:

Check warning on line 17 in src/api/endpoints/url/by_id/screenshot/query.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/url/by_id/screenshot/query.py#L17 <102>

Missing docstring in public method
Raw output
./src/api/endpoints/url/by_id/screenshot/query.py:17:1: D102 Missing docstring in public method

query = (
select(URLScreenshot.content)
.where(URLScreenshot.url_id == self.url_id)
)

return await sh.one_or_none(
session=session,
query=query
)

Check warning on line 28 in src/api/endpoints/url/by_id/screenshot/query.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/url/by_id/screenshot/query.py#L28 <391>

blank line at end of file
Raw output
./src/api/endpoints/url/by_id/screenshot/query.py:28:1: W391 blank line at end of file
22 changes: 22 additions & 0 deletions src/api/endpoints/url/by_id/screenshot/wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from http import HTTPStatus

Check warning on line 1 in src/api/endpoints/url/by_id/screenshot/wrapper.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/url/by_id/screenshot/wrapper.py#L1 <100>

Missing docstring in public module
Raw output
./src/api/endpoints/url/by_id/screenshot/wrapper.py:1:1: D100 Missing docstring in public module

from fastapi import HTTPException

from src.api.endpoints.url.by_id.screenshot.query import GetURLScreenshotQueryBuilder
from src.db.client.async_ import AsyncDatabaseClient


async def get_url_screenshot_wrapper(

Check warning on line 9 in src/api/endpoints/url/by_id/screenshot/wrapper.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/url/by_id/screenshot/wrapper.py#L9 <103>

Missing docstring in public function
Raw output
./src/api/endpoints/url/by_id/screenshot/wrapper.py:9:1: D103 Missing docstring in public function
url_id: int,
adb_client: AsyncDatabaseClient,
) -> bytes:

raw_result: bytes | None = await adb_client.run_query_builder(
GetURLScreenshotQueryBuilder(url_id=url_id)
)
if raw_result is None:
raise HTTPException(
status_code=HTTPStatus.NOT_FOUND,
detail="URL not found"
)
return raw_result

Check warning on line 22 in src/api/endpoints/url/by_id/screenshot/wrapper.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/url/by_id/screenshot/wrapper.py#L22 <292>

no newline at end of file
Raw output
./src/api/endpoints/url/by_id/screenshot/wrapper.py:22:22: W292 no newline at end of file
18 changes: 17 additions & 1 deletion src/api/endpoints/url/routes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from fastapi import APIRouter, Query, Depends
from fastapi import APIRouter, Query, Depends, Response

Check warning on line 1 in src/api/endpoints/url/routes.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/url/routes.py#L1 <100>

Missing docstring in public module
Raw output
./src/api/endpoints/url/routes.py:1:1: D100 Missing docstring in public module

from src.api.dependencies import get_async_core
from src.api.endpoints.url.by_id.screenshot.wrapper import get_url_screenshot_wrapper
from src.api.endpoints.url.get.dto import GetURLsResponseInfo
from src.core.core import AsyncCore
from src.security.manager import get_access_info
Expand All @@ -27,3 +28,18 @@
) -> GetURLsResponseInfo:
result = await async_core.get_urls(page=page, errors=errors)
return result

@url_router.get("/{url_id}/screenshot")
async def get_url_screenshot(

Check warning on line 33 in src/api/endpoints/url/routes.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/api/endpoints/url/routes.py#L33 <103>

Missing docstring in public function
Raw output
./src/api/endpoints/url/routes.py:33:1: D103 Missing docstring in public function
url_id: int,
async_core: AsyncCore = Depends(get_async_core),
) -> Response:

raw_result: bytes = await get_url_screenshot_wrapper(
url_id=url_id,
adb_client=async_core.adb_client
)
return Response(
content=raw_result,
media_type="image/webp"
)
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from src.db.dtos.url.mapping import URLMapping
from src.db.enums import TaskType
from src.db.models.impl.flag.checked_for_ia.pydantic import FlagURLCheckedForInternetArchivesPydantic
from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo
from src.db.models.impl.url.error_info.pydantic import URLErrorInfoPydantic
from src.db.models.impl.url.internet_archives.probe.pydantic import URLInternetArchiveMetadataPydantic
from src.external.internet_archives.client import InternetArchivesClient
from src.external.internet_archives.models.ia_url_mapping import InternetArchivesURLMapping
Expand Down Expand Up @@ -60,10 +60,10 @@ async def inner_task_logic(self) -> None:
await self._add_ia_metadata_to_db(mapper, ia_mappings=subsets.has_metadata)

async def _add_errors_to_db(self, mapper: URLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None:
url_error_info_list: list[URLErrorPydanticInfo] = []
url_error_info_list: list[URLErrorInfoPydantic] = []
for ia_mapping in ia_mappings:
url_id = mapper.get_id(ia_mapping.url)
url_error_info = URLErrorPydanticInfo(
url_error_info = URLErrorInfoPydantic(
url_id=url_id,
error=ia_mapping.error,
task_id=self.task_id
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase
from src.db.client.async_ import AsyncDatabaseClient
from src.db.enums import TaskType
from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo
from src.db.models.impl.url.error_info.pydantic import URLErrorInfoPydantic
from src.db.models.impl.url.internet_archives.save.pydantic import URLInternetArchiveSaveMetadataPydantic
from src.external.internet_archives.client import InternetArchivesClient
from src.external.internet_archives.models.save_response import InternetArchivesSaveResponseInfo
Expand Down Expand Up @@ -89,10 +89,10 @@ async def _add_errors_to_db(
mapper: URLToEntryMapper,
responses: list[InternetArchivesSaveResponseInfo]
) -> None:
error_info_list: list[URLErrorPydanticInfo] = []
error_info_list: list[URLErrorInfoPydantic] = []
for response in responses:
url_id = mapper.get_url_id(response.url)
url_error_info = URLErrorPydanticInfo(
url_error_info = URLErrorInfoPydantic(
url_id=url_id,
error=response.error,
task_id=self.task_id
Expand Down
19 changes: 11 additions & 8 deletions src/core/tasks/scheduled/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,55 +53,58 @@ async def load_entries(self) -> list[ScheduledTaskEntry]:
adb_client=self.adb_client,
ia_client=self.ia_client
),
interval=IntervalEnum.TEN_MINUTES,
interval_minutes=IntervalEnum.TEN_MINUTES.value,
enabled=self.env.bool("IA_PROBE_TASK_FLAG", default=True),
),
ScheduledTaskEntry(
operator=InternetArchivesSaveTaskOperator(
adb_client=self.adb_client,
ia_client=self.ia_client
),
interval=IntervalEnum.TEN_MINUTES,
interval_minutes=IntervalEnum.TEN_MINUTES.value,
enabled=self.env.bool("IA_SAVE_TASK_FLAG", default=True),
),
ScheduledTaskEntry(
operator=DeleteOldLogsTaskOperator(adb_client=self.adb_client),
interval=IntervalEnum.DAILY,
interval_minutes=IntervalEnum.DAILY.value,
enabled=self.env.bool("DELETE_OLD_LOGS_TASK_FLAG", default=True)
),
ScheduledTaskEntry(
operator=SyncDataSourcesTaskOperator(
adb_client=self.adb_client,
pdap_client=self.pdap_client
),
interval=IntervalEnum.DAILY,
interval_minutes=IntervalEnum.DAILY.value,
enabled=self.env.bool("SYNC_DATA_SOURCES_TASK_FLAG", default=True)
),
ScheduledTaskEntry(
operator=SyncAgenciesTaskOperator(
adb_client=self.async_core.adb_client,
pdap_client=self.pdap_client
),
interval=IntervalEnum.DAILY,
interval_minutes=IntervalEnum.DAILY.value,
enabled=self.env.bool("SYNC_AGENCIES_TASK_FLAG", default=True)
),
ScheduledTaskEntry(
operator=RunURLTasksTaskOperator(async_core=self.async_core),
interval=IntervalEnum.HOURLY,
interval_minutes=self.env.int(
"URL_TASKS_FREQUENCY_MINUTES",
default=IntervalEnum.HOURLY.value
),
enabled=self.env.bool("RUN_URL_TASKS_TASK_FLAG", default=True)

),
ScheduledTaskEntry(
operator=PopulateBacklogSnapshotTaskOperator(adb_client=self.async_core.adb_client),
interval=IntervalEnum.DAILY,
interval_minutes=IntervalEnum.DAILY.value,
enabled=self.env.bool("POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG", default=True)
),
ScheduledTaskEntry(
operator=PushToHuggingFaceTaskOperator(
adb_client=self.async_core.adb_client,
hf_client=self.hf_client
),
interval=IntervalEnum.DAILY,
interval_minutes=IntervalEnum.DAILY.value,
enabled=self.env.bool(
"PUSH_TO_HUGGING_FACE_TASK_FLAG",
default=True
Expand Down
2 changes: 1 addition & 1 deletion src/core/tasks/scheduled/models/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@ class Config:
arbitrary_types_allowed = True

operator: ScheduledTaskOperatorBase
interval: IntervalEnum
interval_minutes: int
enabled: bool
2 changes: 1 addition & 1 deletion src/core/tasks/scheduled/registry/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ async def add_job(
id=entry.operator.task_type.value,
func=func,
trigger=IntervalTrigger(
minutes=entry.interval.value,
minutes=entry.interval_minutes,
start_date=datetime.now() + timedelta(minutes=minute_lag)
),
misfire_grace_time=60,
Expand Down
Loading