diff --git a/.env.example b/.env.example index 48ec26f..14b2ec2 100644 --- a/.env.example +++ b/.env.example @@ -1,5 +1,5 @@ -PROJECT_ID=ons-sdx-bob -ENV=development -DATASET_BUCKET_NAME="ons-sdx-bob-datasets" +PROJECT_ID=ons-sds-sandbox +PROFILE=dev +DATASET_BUCKET_NAME="ons-sds-sandbox-datasets" FIRESTORE_DATABASE="test" PUBLISH_DATASET_TOPIC_ID="projects/ons-sds-sandbox/topics/publish-dataset" diff --git a/Makefile b/Makefile index 093f299..bb9e5b6 100644 --- a/Makefile +++ b/Makefile @@ -2,11 +2,6 @@ SHELL := bash .ONESHELL: -PHONY: install -install: ## Install dependencies - uv sync - - .PHONY: lint lint: @echo "Running Ruff linter..." @@ -25,10 +20,9 @@ test: uv run --dev pytest -v --disable-warnings tests/ -.PHONY: test-parallel -test-parallel: - @echo "Running Local Tests..." - uv run --dev pytest -n auto -v --disable-warnings tests/ +.PHONY: install +install: ## Install dependencies + uv sync .PHONY: dev diff --git a/README.md b/README.md index 1824386..1561d0d 100644 --- a/README.md +++ b/README.md @@ -59,4 +59,53 @@ make test docker build -t sds-loader . ``` +## Profiles + +sds-loader uses "profiles" to determine the concrete implementation of the abstracted services it uses. + +For example when running locally, you may just want to use fake repositories and test the business logic of the application, whereas in production you will want to use a Firestore database, GCP etc. + +Profiles are determined by the `PROFILE` environment variable. This will default to `prod` if not set. The following profiles are available... + +- `prod`: This profile will use the real implementations of all services. This is the default profile. +- `dev`: This profile will use fake repositories and services that do not connect to any real services +- `local_storage_firestore` This will use fake repositories for all services except the `DatasetStorageRepositoryInterface` which will use Firestore. To set up a local Firestore read the instructions below... + +## Firestore emulator + +In order to use Firestore locally, you will need to set up the Firestore emulator. You can do this using Docker. Run the following command to start the Firestore emulator: + +You will need to set the envronment variable FIRESTORE_EMULATOR_HOST to instruct the application to connect to the emulator instead of the real Firestore service... + +```bash +export FIRESTORE_EMULATOR_HOST=localhost:8080 +``` + +Ensure the profile for the application is set to `local_storage_firestore` to use the Firestore emulator... + +```bash +export PROFILE=local_storage_firestore + +# Or add it to the .env file + +PROFILE=local_storage_firestore +``` + +Then run the following command to start the Firestore emulator in Docker. Note it takes a few seconds to start up, so you may want to run this command before starting the application... + +``` +docker run \ + --rm \ + -p=9000:9000 \ + -p=8080:8080 \ + -p=4000:4000 \ + -p=9099:9099 \ + -p=8085:8085 \ + -p=5001:5001 \ + -p=9199:9199 \ + --env "GCP_PROJECT=${PROJECT_ID}" \ + --env "ENABLE_UI=true" \ + spine3/firebase-emulator & +``` + diff --git a/app/broadcasters/fake_broadcaster.py b/app/broadcasters/fake_broadcaster.py index 6788179..55110c6 100644 --- a/app/broadcasters/fake_broadcaster.py +++ b/app/broadcasters/fake_broadcaster.py @@ -6,6 +6,7 @@ class FakeBroadcaster(DatasetBroadcastInterface): """ A fake broadcaster that doesn't actually broadcast the data """ + def __init__(self): self.broadcasted = [] diff --git a/app/broadcasters/pubsub_broadcaster.py b/app/broadcasters/pubsub_broadcaster.py index 7f3f583..61068fa 100644 --- a/app/broadcasters/pubsub_broadcaster.py +++ b/app/broadcasters/pubsub_broadcaster.py @@ -16,6 +16,7 @@ class PubsubBroadcaster(DatasetBroadcastInterface): A broadcaster that will broadcast to pubsub """ + def __init__( self, settings: PubsubBroadcastSettings, @@ -24,9 +25,4 @@ def __init__( self.pubsub_client = PubsubService() def broadcast(self, dataset_metadata: DatasetMetadata) -> None: - - self.pubsub_client.publish_message( - self.settings.publish_dataset_topic_id, - json.dumps(dataset_metadata), - {} - ) + self.pubsub_client.publish_message(self.settings.publish_dataset_topic_id, json.dumps(dataset_metadata), {}) diff --git a/app/dependencies.py b/app/dependencies.py index 7ca6bf7..2b37da6 100644 --- a/app/dependencies.py +++ b/app/dependencies.py @@ -1,26 +1,13 @@ -from lagom import Singleton, dependency_definition +from lagom import Singleton from lagom.container import Container -from sds_common.publishers.gcs_schema_publisher import GcsSchemaPublisher -from sds_common.publishers.github_schema_publisher import GithubSchemaPublisher -from sdx_base.services.storage import StorageService - -from app.broadcasters.fake_broadcaster import FakeBroadcaster -from app.broadcasters.pubsub_broadcaster import PubsubBroadcaster -from app.interfaces.dataset_broadcast_interface import DatasetBroadcastInterface -from app.interfaces.dataset_deletion_repository_interface import DatasetDeletionRepositoryInterface -from app.interfaces.dataset_source_repository_interface import DatasetSourceRepositoryInterface -from app.interfaces.dataset_storage_repository_interface import DatasetStorageRepositoryInterface -from app.repositories.dataset_deletion.fake_dataset_deletion_repository import FakeDatasetDeletionRepository -from app.repositories.dataset_deletion.firestore_dataset_deletion_repository import FirestoreDatasetDeletionRepository -from app.repositories.dataset_source.bucket_dataset_source_repository import BucketDatasetSourceRepository -from app.repositories.dataset_source.fake_dataset_source_repository import FakeDatasetSourceRepository -from app.repositories.dataset_storage.fake_dataset_storage_repository import FakeDatasetStorageRepository -from app.repositories.dataset_storage.firestore_dataset_storage_repository import FirestoreDatasetStorageRepository -from app.services.dataset_service import DatasetService, DatasetSettings -from app.services.schema_service import SchemaService +from app import get_logger +from app.profiles import PROFILES +from app.services.dataset_service import DatasetService, DatasetSettings from app.settings import Settings, get_instance, QuickSettings +logger = get_logger() + class FakePublisher: def __init__(self, name: str): @@ -41,101 +28,30 @@ def build_container() -> Container: # Create the DI container container = Container() - # Determine environment - is_prod = QuickSettings().is_production() - # ----------------------------- # Core / shared dependencies # ----------------------------- container[Settings] = lambda: get_instance() # ----------------------------- - # DatasetSourceRepositoryInterface - # ----------------------------- - - if is_prod: - - @dependency_definition(container) - def build_bucket_dataset_source_repository() -> BucketDatasetSourceRepository: - return BucketDatasetSourceRepository( - bucket_reader=StorageService(), - settings=container[Settings] - ) - - container[DatasetSourceRepositoryInterface] = BucketDatasetSourceRepository - - else: - container[DatasetSourceRepositoryInterface] = FakeDatasetSourceRepository - - # ----------------------------- - # DatasetStorageRepositoryInterface + # Apply profile # ----------------------------- + profile = QuickSettings().get_profile() - if is_prod: + try: + profile_fn = PROFILES[profile] + except KeyError: + raise ValueError(f"Unknown profile '{profile}'. Available: {list(PROFILES.keys())}") + logger.info(f"Using profile {profile}") - @dependency_definition(container) - def build_firestore_dataset_storage_repository() -> FirestoreDatasetStorageRepository: - return FirestoreDatasetStorageRepository( - settings=container[Settings] - ) - - container[DatasetStorageRepositoryInterface] = FirestoreDatasetStorageRepository - else: - container[DatasetStorageRepositoryInterface] = FakeDatasetStorageRepository + # Apply profile + profile_fn(container) # ----------------------------- - # DatasetDeletionRepositoryInterface + # Static Services # ----------------------------- - if is_prod: - - @dependency_definition(container) - def build_firestore_dataset_deletion_repository() -> FirestoreDatasetDeletionRepository: - return FirestoreDatasetDeletionRepository( - settings=container[Settings] - ) - - container[DatasetDeletionRepositoryInterface] = FirestoreDatasetDeletionRepository - else: - container[DatasetDeletionRepositoryInterface] = FakeDatasetDeletionRepository - - # ----------------------------- - # DatasetBroadcastInterface - # ----------------------------- - - if is_prod: - - @dependency_definition(container) - def build_pubsub_broadcaster() -> PubsubBroadcaster: - return PubsubBroadcaster( - settings=container[Settings] - ) - - container[DatasetBroadcastInterface] = PubsubBroadcaster - else: - container[DatasetBroadcastInterface] = FakeBroadcaster - - # Settings container[DatasetSettings] = lambda: get_instance() - - # ----------------------------- - # Services - # ----------------------------- - - # Schema Service - - if is_prod: - container[SchemaService] = SchemaService( - bucket_publisher=GcsSchemaPublisher, - repository_publisher=GithubSchemaPublisher, - ) - else: - container[SchemaService] = SchemaService( - bucket_publisher=FakePublisher(name="Fake bucket publisher"), - repository_publisher=FakePublisher(name="Fake github publisher"), - ) - - # Dataset service container[DatasetService] = Singleton(DatasetService) return container diff --git a/app/exceptions/__init__.py b/app/exceptions/__init__.py index 938d408..908cc4c 100644 --- a/app/exceptions/__init__.py +++ b/app/exceptions/__init__.py @@ -1,6 +1,4 @@ - -class NonCriticalException(Exception): - ... +class NonCriticalException(Exception): ... class DatasetException(Exception): diff --git a/app/exceptions/dataset_deletion_empty_exception.py b/app/exceptions/dataset_deletion_empty_exception.py index 359e693..20088fa 100644 --- a/app/exceptions/dataset_deletion_empty_exception.py +++ b/app/exceptions/dataset_deletion_empty_exception.py @@ -1,5 +1,4 @@ from app.exceptions import NonCriticalException -class DatasetDeletionEmptyException(NonCriticalException): - ... +class DatasetDeletionEmptyException(NonCriticalException): ... diff --git a/app/exceptions/dataset_deletion_exception.py b/app/exceptions/dataset_deletion_exception.py index 086b2b9..75285f0 100644 --- a/app/exceptions/dataset_deletion_exception.py +++ b/app/exceptions/dataset_deletion_exception.py @@ -1,5 +1,4 @@ from app.exceptions import DatasetException -class DatasetDeletionException(DatasetException): - ... +class DatasetDeletionException(DatasetException): ... diff --git a/app/exceptions/dataset_deletion_mark_exception.py b/app/exceptions/dataset_deletion_mark_exception.py index d3c1a66..78aab29 100644 --- a/app/exceptions/dataset_deletion_mark_exception.py +++ b/app/exceptions/dataset_deletion_mark_exception.py @@ -1,5 +1,4 @@ from app.exceptions import DatasetException -class DatasetDeletionMarkException(DatasetException): - ... +class DatasetDeletionMarkException(DatasetException): ... diff --git a/app/exceptions/dataset_invalid_filename_exception.py b/app/exceptions/dataset_invalid_filename_exception.py index f612bb9..a0e959f 100644 --- a/app/exceptions/dataset_invalid_filename_exception.py +++ b/app/exceptions/dataset_invalid_filename_exception.py @@ -1,5 +1,4 @@ from app.exceptions import DatasetException -class DatasetInvalidFilenameException(DatasetException): - ... +class DatasetInvalidFilenameException(DatasetException): ... diff --git a/app/exceptions/dataset_metadata_retrival_exception.py b/app/exceptions/dataset_metadata_retrival_exception.py index b42daf0..3be4b09 100644 --- a/app/exceptions/dataset_metadata_retrival_exception.py +++ b/app/exceptions/dataset_metadata_retrival_exception.py @@ -1,5 +1,4 @@ from app.exceptions import DatasetException -class DatasetMetadataRetrivalException(DatasetException): - ... +class DatasetMetadataRetrivalException(DatasetException): ... diff --git a/app/exceptions/dataset_not_found_exception.py b/app/exceptions/dataset_not_found_exception.py index f7a6e68..96c5753 100644 --- a/app/exceptions/dataset_not_found_exception.py +++ b/app/exceptions/dataset_not_found_exception.py @@ -1,5 +1,4 @@ from app.exceptions import DatasetException -class DatasetNotFoundException(DatasetException): - ... +class DatasetNotFoundException(DatasetException): ... diff --git a/app/exceptions/dataset_source_empty_exception.py b/app/exceptions/dataset_source_empty_exception.py index 8c6aefb..b7bb7c2 100644 --- a/app/exceptions/dataset_source_empty_exception.py +++ b/app/exceptions/dataset_source_empty_exception.py @@ -1,5 +1,4 @@ from app.exceptions import NonCriticalException -class DatasetSourceEmptyException(NonCriticalException): - ... +class DatasetSourceEmptyException(NonCriticalException): ... diff --git a/app/exceptions/dataset_storing_exception.py b/app/exceptions/dataset_storing_exception.py index ee87181..ecbefd9 100644 --- a/app/exceptions/dataset_storing_exception.py +++ b/app/exceptions/dataset_storing_exception.py @@ -1,5 +1,4 @@ from app.exceptions import DatasetException -class DatasetStoringException(DatasetException): - ... +class DatasetStoringException(DatasetException): ... diff --git a/app/exceptions/dataset_validation_exception.py b/app/exceptions/dataset_validation_exception.py index d876030..60327d7 100644 --- a/app/exceptions/dataset_validation_exception.py +++ b/app/exceptions/dataset_validation_exception.py @@ -1,5 +1,4 @@ from app.exceptions import DatasetException -class DatasetValidationException(DatasetException): - ... +class DatasetValidationException(DatasetException): ... diff --git a/app/exceptions/schema_source_invalid_exception.py b/app/exceptions/schema_source_invalid_exception.py index 0cc94dd..392331b 100644 --- a/app/exceptions/schema_source_invalid_exception.py +++ b/app/exceptions/schema_source_invalid_exception.py @@ -1,6 +1,4 @@ - from app.exceptions import SchemaException -class SchemaSourceInvalidException(SchemaException): - ... +class SchemaSourceInvalidException(SchemaException): ... diff --git a/app/interfaces/dataset_storage_repository_interface.py b/app/interfaces/dataset_storage_repository_interface.py index 5eaeac3..cec14e7 100644 --- a/app/interfaces/dataset_storage_repository_interface.py +++ b/app/interfaces/dataset_storage_repository_interface.py @@ -13,11 +13,7 @@ class DatasetStorageRepositoryInterface(ABC): """ @abstractmethod - def get_latest_dataset_metadata( - self, - survey_id: str, - period_id: str - ) -> DatasetMetadataWithoutId | None: + def get_latest_dataset_metadata(self, survey_id: str, period_id: str) -> DatasetMetadataWithoutId | None: """ Gets the latest dataset for a given survey and period id @@ -49,12 +45,7 @@ def store_dataset( ... @abstractmethod - def delete_dataset_version( - self, - survey_id: str, - period_id: str, - version: int - ): + def delete_dataset_version(self, survey_id: str, period_id: str, version: int): """ Delete a specific version of the dataset from the repository diff --git a/app/middleware/timing.py b/app/middleware/timing.py index 4f29444..3650ce8 100644 --- a/app/middleware/timing.py +++ b/app/middleware/timing.py @@ -27,14 +27,8 @@ async def dispatch(self, request: Request, call_next): response = await call_next(request) - process_time = round( - time.perf_counter() - start_time, - 4 - ) - - logger.info( - f"{request.method} {path} " - f"took {process_time}s" - ) + process_time = round(time.perf_counter() - start_time, 4) + + logger.info(f"{request.method} {path} took {process_time}s") return response diff --git a/app/models/__init__.py b/app/models/__init__.py index 68be02c..6773454 100644 --- a/app/models/__init__.py +++ b/app/models/__init__.py @@ -2,8 +2,8 @@ class StrictBase(BaseModel): - model_config = ConfigDict(extra='forbid', use_enum_values=True) + model_config = ConfigDict(extra="forbid", use_enum_values=True) class AllowExtraBase(BaseModel): - model_config = ConfigDict(extra='allow', use_enum_values=True) + model_config = ConfigDict(extra="allow", use_enum_values=True) diff --git a/app/models/dataset.py b/app/models/dataset.py index 67f4486..2d50e3a 100644 --- a/app/models/dataset.py +++ b/app/models/dataset.py @@ -8,7 +8,8 @@ class RawDatasetDataItem(AllowExtraBase): """ Represents an item in the "data" block of the raw dataset """ - unit_data: list | dict + + unit_data: str identifier: str @@ -17,6 +18,7 @@ class RawDataset(AllowExtraBase): Represents a raw dataset JSON found in the source repository (i.e a Bucket) """ + survey_id: str period_id: str form_types: list[str] @@ -29,11 +31,12 @@ class UnitDataset(AllowExtraBase): Represents a unit from the datasets "data" block along with metadata from the parent dataset """ + dataset_id: Guid survey_id: str period_id: str form_types: list[str] - data: dict | list # The actual data + data: str class DatasetMetadataWithoutId(AllowExtraBase): @@ -41,6 +44,7 @@ class DatasetMetadataWithoutId(AllowExtraBase): Represents metadata about a dataset without the dataset_id (i.e. Guid) """ + survey_id: str period_id: str form_types: list[str] @@ -55,6 +59,7 @@ class DatasetMetadata(AllowExtraBase): """ Represents metadata about a dataset """ + dataset_id: Guid survey_id: str period_id: str diff --git a/app/models/guid.py b/app/models/guid.py index 293cef5..2a02b22 100644 --- a/app/models/guid.py +++ b/app/models/guid.py @@ -1,2 +1 @@ - Guid = str diff --git a/app/profiles.py b/app/profiles.py new file mode 100644 index 0000000..cd48ef4 --- /dev/null +++ b/app/profiles.py @@ -0,0 +1,134 @@ +from lagom import Container, dependency_definition + +from sds_common.publishers.gcs_schema_publisher import GcsSchemaPublisher +from sds_common.publishers.github_schema_publisher import GithubSchemaPublisher +from sdx_base.services.storage import StorageService + +from app.broadcasters.fake_broadcaster import FakeBroadcaster +from app.broadcasters.pubsub_broadcaster import PubsubBroadcaster + +from app.interfaces.dataset_broadcast_interface import DatasetBroadcastInterface +from app.interfaces.dataset_deletion_repository_interface import DatasetDeletionRepositoryInterface +from app.interfaces.dataset_source_repository_interface import DatasetSourceRepositoryInterface +from app.interfaces.dataset_storage_repository_interface import DatasetStorageRepositoryInterface + +from app.repositories.dataset_deletion.fake_dataset_deletion_repository import FakeDatasetDeletionRepository +from app.repositories.dataset_deletion.firestore_dataset_deletion_repository import FirestoreDatasetDeletionRepository + +from app.repositories.dataset_source.bucket_dataset_source_repository import BucketDatasetSourceRepository +from app.repositories.dataset_source.fake_dataset_source_repository import FakeDatasetSourceRepository + +from app.repositories.dataset_storage.fake_dataset_storage_repository import FakeDatasetStorageRepository +from app.repositories.dataset_storage.firestore_dataset_storage_repository import FirestoreDatasetStorageRepository + +from app.services.schema_service import SchemaService +from app.settings import Settings + + +class FakePublisher: + def __init__(self, name: str): + self._name = name + + def publish_schema(self, file_name: str): + print(f"Published: {file_name} to {self._name}") + + +# ------------------------- +# DEV PROFILE +# Everything fake +# ------------------------- + + +def dev(container: Container): + container[DatasetSourceRepositoryInterface] = FakeDatasetSourceRepository + + container[DatasetStorageRepositoryInterface] = FakeDatasetStorageRepository + + container[DatasetDeletionRepositoryInterface] = FakeDatasetDeletionRepository + + container[DatasetBroadcastInterface] = FakeBroadcaster + + container[SchemaService] = SchemaService( + bucket_publisher=FakePublisher(name="Fake bucket publisher"), + repository_publisher=FakePublisher(name="Fake github publisher"), + ) + + +# ------------------------- +# PRODUCTION PROFILE +# Everything real +# ------------------------- + + +def production(container: Container): + @dependency_definition(container) + def build_bucket_dataset_source_repository() -> BucketDatasetSourceRepository: + return BucketDatasetSourceRepository( + bucket_reader=StorageService(), + settings=container[Settings], + ) + + @dependency_definition(container) + def build_firestore_dataset_storage_repository() -> FirestoreDatasetStorageRepository: + return FirestoreDatasetStorageRepository( + settings=container[Settings], + ) + + @dependency_definition(container) + def build_firestore_dataset_deletion_repository() -> FirestoreDatasetDeletionRepository: + return FirestoreDatasetDeletionRepository( + settings=container[Settings], + ) + + @dependency_definition(container) + def build_pubsub_broadcaster() -> PubsubBroadcaster: + return PubsubBroadcaster( + settings=container[Settings], + ) + + container[DatasetSourceRepositoryInterface] = BucketDatasetSourceRepository + container[DatasetStorageRepositoryInterface] = FirestoreDatasetStorageRepository + container[DatasetDeletionRepositoryInterface] = FirestoreDatasetDeletionRepository + container[DatasetBroadcastInterface] = PubsubBroadcaster + container[SchemaService] = SchemaService( + bucket_publisher=GcsSchemaPublisher, + repository_publisher=GithubSchemaPublisher, + ) + + +# ------------------------- +# local_storage_firestore +# Everything fake except FirestoreDatasetStorageRepository +# ------------------------- + + +def local_storage_firestore(container: Container): + @dependency_definition(container) + def build_firestore_dataset_storage_repository() -> FirestoreDatasetStorageRepository: + return FirestoreDatasetStorageRepository( + settings=container[Settings], + ) + + container[DatasetSourceRepositoryInterface] = FakeDatasetSourceRepository + + container[DatasetStorageRepositoryInterface] = FirestoreDatasetStorageRepository + + container[DatasetDeletionRepositoryInterface] = FakeDatasetDeletionRepository + + container[DatasetBroadcastInterface] = FakeBroadcaster + + container[SchemaService] = SchemaService( + bucket_publisher=FakePublisher(name="Fake bucket publisher"), + repository_publisher=FakePublisher(name="Fake github publisher"), + ) + + +# ------------------------- +# Profile registry +# ------------------------- + +PROFILES = { + "prod": production, # Everything using production implementations + "dev": dev, # Everything using fake implementations + "local_storage_firestore": local_storage_firestore, # Everything fake except FirestoreDatasetStorageRepository +} diff --git a/app/repositories/dataset_deletion/fake_dataset_deletion_repository.py b/app/repositories/dataset_deletion/fake_dataset_deletion_repository.py index cde40e1..9f13584 100644 --- a/app/repositories/dataset_deletion/fake_dataset_deletion_repository.py +++ b/app/repositories/dataset_deletion/fake_dataset_deletion_repository.py @@ -9,7 +9,6 @@ class FakeDatasetDeletionRepository(DatasetDeletionRepositoryInterface): """ def __init__(self): - # Key = guid, value = status self.delete_records = {} diff --git a/app/repositories/dataset_deletion/firestore_dataset_deletion_repository.py b/app/repositories/dataset_deletion/firestore_dataset_deletion_repository.py index 4da766e..ccc4c66 100644 --- a/app/repositories/dataset_deletion/firestore_dataset_deletion_repository.py +++ b/app/repositories/dataset_deletion/firestore_dataset_deletion_repository.py @@ -31,10 +31,7 @@ def __init__( self.settings = settings # Create a firestore client - self.client = firestore.Client( - project=self.settings.project_id, - database=self.settings.firestore_database - ) + self.client = firestore.Client(project=self.settings.project_id, database=self.settings.firestore_database) # Initialize Firestore collections self.mark_deletion_collection = self.client.collection("marked_for_deletion") @@ -43,28 +40,23 @@ def __init__( self.document_references = {} def mark_record_status(self, guid: Guid, status: DeleteStatus) -> None: - # Look for the document reference in the cache doc_id = self.document_references.get(guid) # If for some reason the guid is not in the cache, query firestore for it if not doc_id: - logger.warning(f"Marking record with guid that is not in repository cache: {guid}, looking in firestore...") - results = ( - self.mark_deletion_collection - .where("dataset_guid", "==", guid) - .limit(1) - .stream() - ) + results = self.mark_deletion_collection.where("dataset_guid", "==", guid).limit(1).stream() results_list = list(results) if len(results_list) > 0: doc_id = results_list[0].id else: - raise DatasetDeletionMarkException(f"Could not mark record with guid {guid} as a record with this guid could not be found") + raise DatasetDeletionMarkException( + f"Could not mark record with guid {guid} as a record with this guid could not be found" + ) try: # Update the status of this record in firestore @@ -73,7 +65,6 @@ def mark_record_status(self, guid: Guid, status: DeleteStatus) -> None: # If the status is deleted, then mark a timestamp if status == DeleteStatus.DELETED: - utc_dt = datetime.now(timezone.utc) # UTC time dt = utc_dt.astimezone() # local time timestamp = dt.strftime("%Y-%m-%dT%H:%M:%SZ") @@ -82,21 +73,14 @@ def mark_record_status(self, guid: Guid, status: DeleteStatus) -> None: raise DatasetDeletionMarkException from e def get_dataset_to_delete(self) -> Guid | None: - # First, try to fetch a PROCESSING dataset - processing = ( - self.mark_deletion_collection - .where("status", "==", DeleteStatus.PROCESSING) - .limit(1) - .stream() - ) + processing = self.mark_deletion_collection.where("status", "==", DeleteStatus.PROCESSING).limit(1).stream() processing_list = list(processing) # If a result is found that is "Processing" if len(processing_list) > 0: - # Pick the first record doc = processing_list[0] @@ -104,7 +88,7 @@ def get_dataset_to_delete(self) -> Guid | None: dataset = doc.to_dict() # Extract the fields we need - guid = dataset.get('dataset_guid') + guid = dataset.get("dataset_guid") doc_id = doc.id # Store in cache @@ -114,12 +98,7 @@ def get_dataset_to_delete(self) -> Guid | None: return guid # If no "Processing" results found, fetch a PENDING dataset - pending = ( - self.mark_deletion_collection - .where("status", "==", DeleteStatus.PENDING) - .limit(1) - .stream() - ) + pending = self.mark_deletion_collection.where("status", "==", DeleteStatus.PENDING).limit(1).stream() pending_list = list(pending) @@ -130,7 +109,7 @@ def get_dataset_to_delete(self) -> Guid | None: dataset = doc.to_dict() # Extract the fields we need - guid = dataset.get('dataset_guid') + guid = dataset.get("dataset_guid") doc_id = doc.id # Store in cache diff --git a/app/repositories/dataset_source/bucket_dataset_source_repository.py b/app/repositories/dataset_source/bucket_dataset_source_repository.py index d324438..bc3cf98 100644 --- a/app/repositories/dataset_source/bucket_dataset_source_repository.py +++ b/app/repositories/dataset_source/bucket_dataset_source_repository.py @@ -8,23 +8,17 @@ class BucketReader(Protocol): - - def get_blobs(self, - bucket_name: str, - project_id: Optional[str] = None, - directory: Optional[str] = None) -> list[Blob]: - ... + def get_blobs( + self, bucket_name: str, project_id: Optional[str] = None, directory: Optional[str] = None + ) -> list[Blob]: ... def read( self, filename: str, bucket_name: str, sub_dir: Optional[str] = None, project_id: Optional[str] = None ) -> bytes: ... - def delete(self, - filename: str, - bucket_name: str, - sub_dir: Optional[str] = None, - project_id: Optional[str] = None) -> bool: - ... + def delete( + self, filename: str, bucket_name: str, sub_dir: Optional[str] = None, project_id: Optional[str] = None + ) -> bool: ... class BucketSettings(Protocol): @@ -37,22 +31,14 @@ class BucketDatasetSourceRepository(DatasetSourceRepositoryInterface): uses GCP buckets """ - def __init__( - self, - bucket_reader: BucketReader, - settings: BucketSettings - ) -> None: + def __init__(self, bucket_reader: BucketReader, settings: BucketSettings) -> None: self._bucket_reader = bucket_reader self._settings = settings def get_oldest_filename(self) -> str | None: blobs = self._bucket_reader.get_blobs(self._settings.dataset_bucket_name) - valid_blobs = ( - blob - for blob in blobs - if blob.updated is not None and blob.name is not None - ) + valid_blobs = (blob for blob in blobs if blob.updated is not None and blob.name is not None) oldest_blob = min( valid_blobs, @@ -63,12 +49,8 @@ def get_oldest_filename(self) -> str | None: return oldest_blob.name if oldest_blob else None def get_raw_data(self, file_name: str) -> RawDataset | None: - # Read the raw data from the bucket - data_bytes = self._bucket_reader.read( - file_name, - bucket_name=self._settings.dataset_bucket_name - ) + data_bytes = self._bucket_reader.read(file_name, bucket_name=self._settings.dataset_bucket_name) # If not found, return None if data_bytes is None: @@ -81,9 +63,5 @@ def get_raw_data(self, file_name: str) -> RawDataset | None: return RawDataset.model_validate(json_content) def delete_raw_data(self, file_name: str) -> None: - # Delete from Bucket - self._bucket_reader.delete( - file_name, - bucket_name=self._settings.dataset_bucket_name - ) + self._bucket_reader.delete(file_name, bucket_name=self._settings.dataset_bucket_name) diff --git a/app/repositories/dataset_source/fake_dataset_source_repository.py b/app/repositories/dataset_source/fake_dataset_source_repository.py index 406a88b..7c84609 100644 --- a/app/repositories/dataset_source/fake_dataset_source_repository.py +++ b/app/repositories/dataset_source/fake_dataset_source_repository.py @@ -1,4 +1,3 @@ - from app.factories.dataset_factories import RawDatasetFactory from app.interfaces.dataset_source_repository_interface import DatasetSourceRepositoryInterface from app.models.dataset import RawDataset @@ -11,12 +10,8 @@ class FakeDatasetSourceRepository(DatasetSourceRepositoryInterface): """ def __init__(self): - # Store fake datasets - self.datasets = { - f"dataset_{x}.json": RawDatasetFactory.build() - for x in range(3) - } + self.datasets = {f"dataset_{x}.json": RawDatasetFactory.build() for x in range(3)} def get_oldest_filename(self) -> str | None: if not self.datasets: @@ -26,7 +21,6 @@ def get_oldest_filename(self) -> str | None: return next(iter(self.datasets.keys())) def get_raw_data(self, file_name: str) -> RawDataset | None: - if file_name in self.datasets: return self.datasets[file_name] return None diff --git a/app/repositories/dataset_storage/fake_dataset_storage_repository.py b/app/repositories/dataset_storage/fake_dataset_storage_repository.py index f16e249..4e8b18a 100644 --- a/app/repositories/dataset_storage/fake_dataset_storage_repository.py +++ b/app/repositories/dataset_storage/fake_dataset_storage_repository.py @@ -10,17 +10,12 @@ class FakeDatasetStorageRepository(DatasetStorageRepositoryInterface): """ def __init__(self): - # Store datasets in a dictionary # key is a tuple of (survey_id, period_id) and value is a 2D list of... # [dataset_id, dataset_metadata, unit_data_collection_with_metadata, unit_data_identifiers, marked_for_deletion?] self.datasets = {} - def get_latest_dataset_metadata( - self, - survey_id: str, - period_id: str - ) -> DatasetMetadataWithoutId | None: + def get_latest_dataset_metadata(self, survey_id: str, period_id: str) -> DatasetMetadataWithoutId | None: dataset = self.datasets.get((survey_id, period_id)) if dataset is None: return None @@ -33,7 +28,7 @@ def store_dataset( dataset_id: Guid, dataset_metadata: DatasetMetadataWithoutId, unit_data_collection_with_metadata: list[UnitDataset], - unit_data_identifiers: list[str] + unit_data_identifiers: list[str], ): dataset_key = (dataset_metadata.survey_id, dataset_metadata.period_id) self.datasets[dataset_key] = [ @@ -41,15 +36,10 @@ def store_dataset( dataset_metadata, unit_data_collection_with_metadata, unit_data_identifiers, - False # marked for deletion? + False, # marked for deletion? ] - def delete_dataset_version( - self, - survey_id: str, - period_id: str, - version: int - ): + def delete_dataset_version(self, survey_id: str, period_id: str, version: int): dataset_key = (survey_id, period_id) datasets = self.datasets.get(dataset_key) diff --git a/app/repositories/dataset_storage/firestore_dataset_storage_repository.py b/app/repositories/dataset_storage/firestore_dataset_storage_repository.py index 8e75cb1..aebc40f 100644 --- a/app/repositories/dataset_storage/firestore_dataset_storage_repository.py +++ b/app/repositories/dataset_storage/firestore_dataset_storage_repository.py @@ -22,16 +22,17 @@ class FirestoreDatasetStorageRepository(DatasetStorageRepositoryInterface): that uses firestore """ - def __init__( - self, - settings: FirestoreSettings - ): + def __init__(self, settings: FirestoreSettings): self.settings = settings # Create a firestore client self.client = firestore.Client( project=self.settings.project_id, - database=self.settings.firestore_database + database=self.settings.firestore_database, + ) + + logger.info( + f"Connected to Firestore with project_id: {settings.project_id} and database: {settings.firestore_database}" ) # Max size in bytes we can upload in one batch @@ -41,15 +42,9 @@ def __init__( # Initialize Firestore collections self.dataset_collection = self.client.collection("datasets") - def get_latest_dataset_metadata( - self, - survey_id: str, - period_id: str - ) -> DatasetMetadataWithoutId | None: - + def get_latest_dataset_metadata(self, survey_id: str, period_id: str) -> DatasetMetadataWithoutId | None: latest_dataset = ( - self.dataset_collection - .where("survey_id", "==", survey_id) + self.dataset_collection.where("survey_id", "==", survey_id) .where("period_id", "==", period_id) .order_by("sds_dataset_version", direction=firestore.Query.DESCENDING) .limit(1) @@ -64,16 +59,9 @@ def get_latest_dataset_metadata( return DatasetMetadataWithoutId.model_validate(datasets[0]) - def _get_dataset_metadata( - self, - survey_id: str, - period_id: str, - version: int - ) -> DatasetMetadata | None: - + def _get_dataset_metadata(self, survey_id: str, period_id: str, version: int) -> DatasetMetadata | None: latest_dataset = ( - self.dataset_collection - .where("survey_id", "==", survey_id) + self.dataset_collection.where("survey_id", "==", survey_id) .where("period_id", "==", period_id) .where("version", "==", version) .limit(1) @@ -93,24 +81,25 @@ def store_dataset( dataset_id: Guid, dataset_metadata: DatasetMetadataWithoutId, unit_data_collection_with_metadata: list[UnitDataset], - unit_data_identifiers: list[str] + unit_data_identifiers: list[str], ): - # Write to firebase in batches or not depends on the settings if self.settings.should_batch: + logger.info("Writing to Firestore in BATCH mode") self._store_dataset_with_batching( dataset_id=dataset_id, dataset_metadata=dataset_metadata, unit_data_collection_with_metadata=unit_data_collection_with_metadata, - unit_data_identifiers=unit_data_identifiers + unit_data_identifiers=unit_data_identifiers, ) else: + logger.info("Writing to Firestore in NORMAL mode") self._store_dataset_without_batching( dataset_id=dataset_id, dataset_metadata=dataset_metadata, unit_data_collection_with_metadata=unit_data_collection_with_metadata, - unit_data_identifiers=unit_data_identifiers + unit_data_identifiers=unit_data_identifiers, ) def _store_dataset_without_batching( @@ -118,7 +107,7 @@ def _store_dataset_without_batching( dataset_id: Guid, dataset_metadata: DatasetMetadataWithoutId, unit_data_collection_with_metadata: list[UnitDataset], - unit_data_identifiers: list[str] + unit_data_identifiers: list[str], ): """ Write the data to firestore without batching @@ -133,7 +122,7 @@ def _store_dataset_without_batching( units_collection = new_dataset_document.collection("units") # Go through unit data - for (unit_data, unit_identifier) in zip(unit_data_collection_with_metadata, unit_data_identifiers): + for unit_data, unit_identifier in zip(unit_data_collection_with_metadata, unit_data_identifiers): # Create and save the unit data as a new sub document units_collection.document(unit_identifier).set(unit_data.model_dump()) @@ -142,7 +131,7 @@ def _store_dataset_with_batching( dataset_id: Guid, dataset_metadata: DatasetMetadataWithoutId, unit_data_collection_with_metadata: list[UnitDataset], - unit_data_identifiers: list[str] + unit_data_identifiers: list[str], ): """ Use batches to write the data to firestore @@ -163,7 +152,7 @@ def _store_dataset_with_batching( batch_num_records = 0 # Go through unit data - for (unit_data, unit_identifier) in zip(unit_data_collection_with_metadata, unit_data_identifiers): + for unit_data, unit_identifier in zip(unit_data_collection_with_metadata, unit_data_identifiers): """ Add this unit to the current batch if ... @@ -172,10 +161,12 @@ def _store_dataset_with_batching( """ # Work out the size of this unit - unit_size = len(unit_data.model_dump().encode('utf-8')) + unit_size = len(unit_data.model_dump_json().encode("utf-8")) # Work out if the current batch is too big already - if (batch_size_bytes + unit_size >= self.MAX_BATCH_SIZE_BYTES) or (batch_num_records + 1 >= self.MAX_NUMBER_OF_WRITES_PER_BATCH): + if (batch_size_bytes + unit_size >= self.MAX_BATCH_SIZE_BYTES) or ( + batch_num_records + 1 >= self.MAX_NUMBER_OF_WRITES_PER_BATCH + ): # Commit the current batch batch.commit() @@ -194,13 +185,7 @@ def _store_dataset_with_batching( if batch_size_bytes > 0: batch.commit() - def delete_dataset_version( - self, - survey_id: str, - period_id: str, - version: int - ): - + def delete_dataset_version(self, survey_id: str, period_id: str, version: int): dataset_metadata = self._get_dataset_metadata(survey_id, period_id, version) if not dataset_metadata: @@ -211,7 +196,6 @@ def delete_dataset_version( self.delete_dataset_by_guid(dataset_metadata.dataset_id) def delete_dataset_by_guid(self, guid: Guid): - # Get all the collections for this dataset collections = self.dataset_collection.document(guid).collections() diff --git a/app/routes.py b/app/routes.py index afa66a4..da55fc5 100644 --- a/app/routes.py +++ b/app/routes.py @@ -47,12 +47,9 @@ async def version(): async def publish_schemas( request: Request, source: Annotated[ - Literal["github", "bucket"], - Query( - description="The source of the files specified in this request." - ) + Literal["github", "bucket"], Query(description="The source of the files specified in this request.") ] = "github", - schema_service: SchemaService = DEPS.depends(SchemaService) + schema_service: SchemaService = DEPS.depends(SchemaService), ): """ This endpoint handles a publishing schemas from a given @@ -70,12 +67,8 @@ async def publish_schemas( try: # Publish the new schemas - schema_service.publish_new_schemas( - source=source, - file_list=get_data(message).split("\n") - ) + schema_service.publish_new_schemas(source=source, file_list=get_data(message).split("\n")) except NonCriticalException as e: - # Return a status 200 (non-critical exception) return JSONResponse( status_code=200, @@ -83,7 +76,6 @@ async def publish_schemas( ) except (SchemaException, Exception) as e: - return JSONResponse( status_code=500, content={"success": False, "message": "Exception publishing schema: " + str(e)}, @@ -96,9 +88,7 @@ async def publish_schemas( @router.get("/events/dataset/create") -async def create_dataset( - dataset_service: DatasetService = DEPS.depends(DatasetService) -): +async def create_dataset(dataset_service: DatasetService = DEPS.depends(DatasetService)): """ This endpoint handles creating a dataset @@ -112,7 +102,6 @@ async def create_dataset( try: dataset_service.create_dataset() except NonCriticalException as e: - # Return a status 200 (non-critical exception) return JSONResponse( status_code=200, @@ -120,6 +109,7 @@ async def create_dataset( ) except (DatasetException, Exception) as e: + logger.exception("Exception creating dataset") return JSONResponse( status_code=500, @@ -133,9 +123,7 @@ async def create_dataset( @router.get("/events/dataset/delete") -async def delete_dataset( - dataset_service: DatasetService = DEPS.depends(DatasetService) -): +async def delete_dataset(dataset_service: DatasetService = DEPS.depends(DatasetService)): """ This endpoint deletes a dataset @@ -149,7 +137,6 @@ async def delete_dataset( try: dataset_service.delete_dataset() except NonCriticalException as e: - # Return a status 200 (non critical exception) return JSONResponse( status_code=200, @@ -157,7 +144,6 @@ async def delete_dataset( ) except (DatasetException, Exception) as e: - return JSONResponse( status_code=500, content={"success": False, "message": "Exception deleting dataset: " + str(e)}, diff --git a/app/services/dataset_service.py b/app/services/dataset_service.py index 141bd73..dd46bc3 100644 --- a/app/services/dataset_service.py +++ b/app/services/dataset_service.py @@ -42,7 +42,7 @@ def __init__( dataset_storage_repo: DatasetStorageRepositoryInterface, dataset_deletion_repo: DatasetDeletionRepositoryInterface, broadcaster: DatasetBroadcastInterface, - settings: DatasetSettings + settings: DatasetSettings, ): self.dataset_source_repo = dataset_source_repo self.dataset_storage_repo = dataset_storage_repo @@ -160,11 +160,13 @@ def create_dataset(self): # Determine next dataset version based on the latest dataset version if current_dataset: logger.info( - f"Found previous dataset version: {current_dataset.sds_dataset_version} for survey {raw_dataset.survey_id}, period {raw_dataset.period_id}, incrementing version for new dataset") + f"Found previous dataset version: {current_dataset.sds_dataset_version} for survey {raw_dataset.survey_id}, period {raw_dataset.period_id}, incrementing version for new dataset" + ) new_version = current_dataset.sds_dataset_version + 1 else: logger.info( - f"Could not find a previous dataset version for survey {raw_dataset.survey_id}, period {raw_dataset.period_id}, setting version to 1") + f"Could not find a previous dataset version for survey {raw_dataset.survey_id}, period {raw_dataset.period_id}, setting version to 1" + ) new_version = 1 # Create a new dataset_metadata object to store @@ -194,14 +196,11 @@ def create_dataset(self): form_types=new_dataset_metadata.form_types, data=item.unit_data, ) - for item in raw_dataset.data ] # Fetch a list of the identifiers for the unit data in the dataset - unit_data_identifiers = [ - item.identifier for item in raw_dataset.data - ] + unit_data_identifiers = [item.identifier for item in raw_dataset.data] # Write the new dataset to storage (firestore) @@ -215,7 +214,6 @@ def create_dataset(self): unit_data_identifiers=unit_data_identifiers, ) except Exception as e: - logger.error(f"Failed to save new dataset to storage repository, cleaning up: {e}") # If an error occurs, ensure this is fully deleted self.dataset_storage_repo.delete_dataset_by_guid(dataset_id) @@ -225,10 +223,7 @@ def create_dataset(self): logger.info(f"Dataset saved to storage successfully: {dataset_id}") # Create a DatasetMetadata object - dataset_metadata = DatasetMetadata( - dataset_id=dataset_id, - **new_dataset_metadata.model_dump() - ) + dataset_metadata = DatasetMetadata(dataset_id=dataset_id, **new_dataset_metadata.model_dump()) # Broadcast the dataset has been created (pubsub) self.broadcaster.broadcast(dataset_metadata) @@ -248,13 +243,7 @@ def create_dataset(self): logger.info(f"Dataset creation process completed: {dataset_id}") - def _cleanup( - self, - survey_id: str, - period_id: str, - new_version: int, - older_version: int - ): + def _cleanup(self, survey_id: str, period_id: str, new_version: int, older_version: int): """ Determine if the other versions of the dataset should be deleted @@ -268,17 +257,16 @@ def _cleanup( # Delete the previous version if the retention flag is false and this is not v1 if not self.settings.retain_old_datasets and new_version > 1: - logger.info("Deleting previous version of dataset...") # Delete the old version self.dataset_storage_repo.delete_dataset_version( - survey_id=survey_id, - period_id=period_id, - version=older_version + survey_id=survey_id, period_id=period_id, version=older_version ) - logger.info(f"Older Dataset deleted successfully: survey: {survey_id}, period: {period_id}, version: {older_version}") + logger.info( + f"Older Dataset deleted successfully: survey: {survey_id}, period: {period_id}, version: {older_version}" + ) else: logger.info("Nothing to cleanup") @@ -304,9 +292,7 @@ def delete_dataset(self): if not dataset_guid_to_delete: logger.info("No datasets marked for deletion (skipping process)") - raise DatasetDeletionEmptyException( - "No datasets marked for deletion in the storage repository" - ) + raise DatasetDeletionEmptyException("No datasets marked for deletion in the storage repository") logger.info(f"Selected dataset to delete: {dataset_guid_to_delete}") @@ -321,7 +307,6 @@ def delete_dataset(self): self.dataset_storage_repo.delete_dataset_by_guid(dataset_guid_to_delete) except (DatasetDeletionException, Exception) as e: - logger.error("Error deleting dataset, updating delete record status to ERROR") # If an error occurred, update the status for the delete record diff --git a/app/services/schema_service.py b/app/services/schema_service.py index b92d1f3..18d45f1 100644 --- a/app/services/schema_service.py +++ b/app/services/schema_service.py @@ -13,8 +13,7 @@ class PublisherProtocol(Protocol): which is responsible for publishing schema files. """ - def publish_schema(self, file_name: str): - ... + def publish_schema(self, file_name: str): ... class SchemaService: @@ -30,7 +29,7 @@ def __init__( self.bucket_publisher = bucket_publisher self.repository_publisher = repository_publisher - def _publish_single_file(self, file_name: str, publisher: PublisherProtocol): # noqa + def _publish_single_file(self, file_name: str, publisher: PublisherProtocol): # noqa """ Publish a single file using a given publisher :param file_name: name of the file to be published @@ -42,18 +41,14 @@ def _publish_single_file(self, file_name: str, publisher: PublisherProtocol): # except Exception as e: logger.error(f"Failed to publish schema {file_name}: {e}") - def _filter_github_files(self, files: list[str]) -> list[str]: # noqa + def _filter_github_files(self, files: list[str]) -> list[str]: # noqa """ Filter a list of file names to publish by the regex """ pattern = re.compile(r"^schemas/[^/]+/[^/]+\.json$") return [f for f in files if pattern.match(f)] - def publish_new_schemas( - self, - source: str, - file_list: list[str] - ): + def publish_new_schemas(self, source: str, file_list: list[str]): """ Take a list of file names and publish each one using a SchemaPublisher that is identified using publisher If any schema fails to publish, it will print an error message but continue processing the remaining files. diff --git a/app/settings.py b/app/settings.py index 61eac3e..445f586 100644 --- a/app/settings.py +++ b/app/settings.py @@ -11,11 +11,12 @@ class QuickSettings(BaseSettings): Quick settings are settings that are needed before SDX Base populates the AppSettings class. """ - model_config = SettingsConfigDict(env_file='.env', extra='ignore') - env: str = 'production' - def is_production(self) -> bool: - return self.env.lower() in ('production', 'prod') + model_config = SettingsConfigDict(env_file=".env", extra="ignore") + profile: str = "prod" + + def get_profile(self) -> str: + return self.profile.lower() class Settings(AppSettings): @@ -27,8 +28,11 @@ class Settings(AppSettings): dataset_bucket_name: The bucket name to pick up datasets from firestore_database: The Firestore database to publish datasets to publish_dataset_topic_id: The Pub/Sub topic ID to publish dataset updates to + + # Note this will get overridden by any duplicate entries in bash profile """ - project_id: str + + project_id: str = "ons-sds-sandbox" autodelete_dataset: bool = True retain_old_dataset: bool = True should_batch: bool = True diff --git a/pyproject.toml b/pyproject.toml index 02cfd0f..58e2784 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "sds-loader" -version = "0.1.1" -description = "A service for loading new sds schemas and datasets" +version = "0.1.2" +description = "A service for loading new SDS schemas and datasets" requires-python = "==3.13.*" dependencies = [ "sdx-base>=0.2.9", diff --git a/run.py b/run.py index 2d405a3..b093785 100644 --- a/run.py +++ b/run.py @@ -45,18 +45,11 @@ async def smart_txid(request: Request) -> str: # Basic router configuration -router_1 = RouterConfig( - router, tx_id_getter=smart_txid -) +router_1 = RouterConfig(router, tx_id_getter=smart_txid) # Initialize the FastAPI app with the specified settings, routers, and project root. -app: FastAPI = initialise( - settings=Settings, - routers=[router_1], - middleware=[TimingMiddleware], - proj_root=ROOT -) +app: FastAPI = initialise(settings=Settings, routers=[router_1], middleware=[TimingMiddleware], proj_root=ROOT) # Fetch the populated settings settings = get_instance() @@ -68,7 +61,4 @@ async def smart_txid(request: Request) -> str: if __name__ == "__main__": print(load_startup_banner()) - default_server( - app, - port=settings.port - ) + default_server(app, port=settings.port) diff --git a/tests/conftest.py b/tests/conftest.py index 2abd3dd..964879f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -37,6 +37,7 @@ class MockPublisher: Mock Publisher class, allows tracking of published schemas and side effects for testing purposes. """ + def __init__(self, label: str): self.label = label self.side_effects = {} @@ -55,6 +56,7 @@ def publish_schema(self, file_name: str): self.published_schemas.append(file_name) + # ------------------------ # Testing classes for dataset_service # ------------------------ @@ -79,9 +81,7 @@ def broadcast(self, dataset_metadata: DatasetMetadata) -> None: @pytest.fixture def mock_repo_publisher() -> MockPublisher: - return MockPublisher( - label="repo publisher" - ) + return MockPublisher(label="repo publisher") @pytest.fixture @@ -90,6 +90,7 @@ def mock_bucket_publisher() -> MockPublisher: label="bucket publisher", ) + # ------------------------ # Fixtures for dataset_service # ------------------------ @@ -119,6 +120,7 @@ def mock_broadcaster() -> DatasetBroadcastInterface: # App fixture # ------------------------ + @pytest.fixture def test_app() -> FastAPI: """ @@ -138,11 +140,7 @@ def get_secret(self, project_id: str, secret_id: str) -> str: return initialise( settings=FakeSettings, - routers=[ - RouterConfig( - router, tx_id_getter=txid_not_applicable - ) - ], + routers=[RouterConfig(router, tx_id_getter=txid_not_applicable)], proj_root=ROOT, secret_reader=MockSecretReader, ) diff --git a/tests/integration/test_create_dataset_endpoint.py b/tests/integration/test_create_dataset_endpoint.py index c01e2b2..f00c66d 100644 --- a/tests/integration/test_create_dataset_endpoint.py +++ b/tests/integration/test_create_dataset_endpoint.py @@ -8,7 +8,6 @@ class TestCreateDatasetEndpoint: - def test_create_dataset_first_version( self, test_app: FastAPI, @@ -16,7 +15,7 @@ def test_create_dataset_first_version( mock_dataset_storage_repo, mock_dataset_deletion_repo, mock_broadcaster: MockBroadcaster, - raw_dataset_factory: RawDatasetFactory + raw_dataset_factory: RawDatasetFactory, ): """ Test a happy path for creating a dataset endpoint. @@ -50,7 +49,6 @@ def test_create_dataset_first_version( mock_dataset_storage_repo.get_latest_dataset_metadata.return_value = None with DEPS.override_for_test() as test_container: - # For this test we set autodelete to True class MockSettings: autodelete_dataset: bool = True @@ -62,7 +60,7 @@ class MockSettings: dataset_storage_repo=mock_dataset_storage_repo, dataset_deletion_repo=mock_dataset_deletion_repo, broadcaster=mock_broadcaster, - settings=MockSettings() + settings=MockSettings(), ) # Create a TestClient instance @@ -99,7 +97,6 @@ def test_create_dataset_returns_200_if_source_repository_empty( mock_dataset_source_repo.get_oldest_filename.return_value = None with DEPS.override_for_test() as test_container: - # For this test we set autodelete to True class MockSettings: autodelete_dataset: bool = True @@ -111,7 +108,7 @@ class MockSettings: dataset_storage_repo=mock_dataset_storage_repo, dataset_deletion_repo=mock_dataset_deletion_repo, broadcaster=mock_broadcaster, - settings=MockSettings() + settings=MockSettings(), ) # Create a TestClient instance @@ -148,7 +145,6 @@ def test_create_dataset_returns_500_if_the_filename_is_invalid( mock_dataset_source_repo.get_oldest_filename.return_value = "invalid" with DEPS.override_for_test() as test_container: - # For this test we set autodelete to True class MockSettings: autodelete_dataset: bool = True @@ -160,7 +156,7 @@ class MockSettings: dataset_storage_repo=mock_dataset_storage_repo, dataset_deletion_repo=mock_dataset_deletion_repo, broadcaster=mock_broadcaster, - settings=MockSettings() + settings=MockSettings(), ) # Create a TestClient instance diff --git a/tests/integration/test_delete_dataset_endpoint.py b/tests/integration/test_delete_dataset_endpoint.py index b4e87ff..9e40277 100644 --- a/tests/integration/test_delete_dataset_endpoint.py +++ b/tests/integration/test_delete_dataset_endpoint.py @@ -7,7 +7,6 @@ class TestDeleteDatasetEndpoint: - def test_returns_200_when_no_datasets_are_to_be_deleted( self, test_app: FastAPI, @@ -29,7 +28,6 @@ def test_returns_200_when_no_datasets_are_to_be_deleted( mock_dataset_deletion_repo.get_dataset_to_delete.return_value = None with DEPS.override_for_test() as test_container: - # For this test we set autodelete to True class MockSettings: autodelete_dataset: bool = False @@ -41,7 +39,7 @@ class MockSettings: dataset_storage_repo=mock_dataset_storage_repo, dataset_deletion_repo=mock_dataset_deletion_repo, broadcaster=mock_broadcaster, - settings=MockSettings() + settings=MockSettings(), ) # Create a TestClient instance @@ -86,7 +84,6 @@ def error(): mock_dataset_storage_repo.delete_dataset_by_guid.side_effect = error with DEPS.override_for_test() as test_container: - # For this test we set autodelete to True class MockSettings: autodelete_dataset: bool = False @@ -98,7 +95,7 @@ class MockSettings: dataset_storage_repo=mock_dataset_storage_repo, dataset_deletion_repo=mock_dataset_deletion_repo, broadcaster=mock_broadcaster, - settings=MockSettings() + settings=MockSettings(), ) # Create a TestClient instance diff --git a/tests/integration/test_publish_schemas_endpoint.py b/tests/integration/test_publish_schemas_endpoint.py index b48f14c..eb79314 100644 --- a/tests/integration/test_publish_schemas_endpoint.py +++ b/tests/integration/test_publish_schemas_endpoint.py @@ -10,16 +10,13 @@ class TestPublishSchemasEndpoint: - def _encode_filenames(self, filenames: str) -> Envelope: """ Helper method to encode a string of filenames into the format expected by our endpoint """ # Encode the message - encoded_data = base64.b64encode( - filenames.encode("utf-8") - ).decode("utf-8") + encoded_data = base64.b64encode(filenames.encode("utf-8")).decode("utf-8") # Create a fake Message object to simulate a pubsub object message: Message = { @@ -33,10 +30,7 @@ def _encode_filenames(self, filenames: str) -> Envelope: return {"message": message, "subscription": ""} def test_publish_schemas_to_github_with_all_valid_schemas( - self, - test_app: FastAPI, - mock_repo_publisher: MockPublisher, - mock_bucket_publisher: MockPublisher + self, test_app: FastAPI, mock_repo_publisher: MockPublisher, mock_bucket_publisher: MockPublisher ): """ Test our publish schemas endpoint (to GitHub) @@ -48,7 +42,6 @@ def test_publish_schemas_to_github_with_all_valid_schemas( """ with DEPS.override_for_test() as test_container: - # Create our own schema service to use in this app test_container[SchemaService] = SchemaService( repository_publisher=mock_repo_publisher, @@ -63,8 +56,7 @@ def test_publish_schemas_to_github_with_all_valid_schemas( # Make a POST request to the /events/schema/publish endpoint and specify "source" to be GitHub response = client.post( - "/events/schema/publish?source=github", - json=self._encode_filenames(received_filenames) + "/events/schema/publish?source=github", json=self._encode_filenames(received_filenames) ) # Assert a 200 status code @@ -79,10 +71,7 @@ def test_publish_schemas_to_github_with_all_valid_schemas( assert len(mock_bucket_publisher.published_schemas) == 0 def test_publish_schemas_to_bucket_with_all_valid_schemas( - self, - test_app: FastAPI, - mock_repo_publisher: MockPublisher, - mock_bucket_publisher: MockPublisher + self, test_app: FastAPI, mock_repo_publisher: MockPublisher, mock_bucket_publisher: MockPublisher ): """ Test our publish schemas endpoint with a single @@ -93,7 +82,6 @@ def test_publish_schemas_to_bucket_with_all_valid_schemas( """ with DEPS.override_for_test() as test_container: - # Create our own schema service to use in this app test_container[SchemaService] = SchemaService( repository_publisher=mock_repo_publisher, @@ -108,8 +96,7 @@ def test_publish_schemas_to_bucket_with_all_valid_schemas( # Make a POST request to the /events/schema/publish endpoint and specify "source" to be bucket response = client.post( - "/events/schema/publish?source=bucket", - json=self._encode_filenames(received_filenames) + "/events/schema/publish?source=bucket", json=self._encode_filenames(received_filenames) ) # Assert a 200 status code @@ -122,10 +109,7 @@ def test_publish_schemas_to_bucket_with_all_valid_schemas( assert len(mock_repo_publisher.published_schemas) == 0 def test_publish_schemas_to_github_with_some_invalid_filenames( - self, - test_app: FastAPI, - mock_repo_publisher: MockPublisher, - mock_bucket_publisher: MockPublisher + self, test_app: FastAPI, mock_repo_publisher: MockPublisher, mock_bucket_publisher: MockPublisher ): """ Test our publish schemas endpoint (to GitHub) @@ -141,7 +125,6 @@ def test_publish_schemas_to_github_with_some_invalid_filenames( """ with DEPS.override_for_test() as test_container: - # Create our own schema service to use in this app test_container[SchemaService] = SchemaService( repository_publisher=mock_repo_publisher, @@ -149,15 +132,16 @@ def test_publish_schemas_to_github_with_some_invalid_filenames( ) # Create fake files to simulate new added schemas sent to loader - received_filenames = "schemas/abc/v1.json\nother/foo/v2.json\nschemas/abc/v3.json\nother/foo/v2_template.json" + received_filenames = ( + "schemas/abc/v1.json\nother/foo/v2.json\nschemas/abc/v3.json\nother/foo/v2_template.json" + ) # Create a TestClient instance client = TestClient(test_app) # Make a POST request to the /events/schema/publish endpoint and specify "source" to be GitHub response = client.post( - "/events/schema/publish?source=github", - json=self._encode_filenames(received_filenames) + "/events/schema/publish?source=github", json=self._encode_filenames(received_filenames) ) # Assert a 200 status code diff --git a/tests/unit/services/test_dataset_service.py b/tests/unit/services/test_dataset_service.py index 9923701..6b8cc2a 100644 --- a/tests/unit/services/test_dataset_service.py +++ b/tests/unit/services/test_dataset_service.py @@ -1,4 +1,3 @@ - import pytest from app.enums.delete_status import DeleteStatus @@ -19,13 +18,12 @@ class TestCreateDataset: - def test_raises_exception_if_source_is_empty( self, mock_dataset_source_repo: DatasetSourceRepositoryInterface, mock_dataset_storage_repo: DatasetStorageRepositoryInterface, mock_dataset_deletion_repo: DatasetDeletionRepositoryInterface, - mock_broadcaster + mock_broadcaster, ): """ Test that when the dataset source repository where datasets are picked up from is empty @@ -51,7 +49,6 @@ class MockSettings: # Call create_dataset and assert that it raises the expected exception with pytest.raises(DatasetSourceEmptyException): - service.create_dataset() def test_raises_exception_if_filename_invalid( @@ -59,7 +56,7 @@ def test_raises_exception_if_filename_invalid( mock_dataset_source_repo: DatasetSourceRepositoryInterface, mock_dataset_storage_repo: DatasetStorageRepositoryInterface, mock_dataset_deletion_repo: DatasetDeletionRepositoryInterface, - mock_broadcaster + mock_broadcaster, ): """ Test that the filename in the dataset source repository is invalid @@ -95,7 +92,7 @@ def test_raises_exception_and_autodeletes_dataset_if_filename_invalid( mock_dataset_source_repo: DatasetSourceRepositoryInterface, mock_dataset_storage_repo: DatasetStorageRepositoryInterface, mock_dataset_deletion_repo: DatasetDeletionRepositoryInterface, - mock_broadcaster + mock_broadcaster, ): """ Test that the filename in the dataset source repository is invalid @@ -133,7 +130,7 @@ def test_dataset_is_fully_deleted_if_exception_occurs_during_creation( mock_dataset_deletion_repo: DatasetDeletionRepositoryInterface, mock_broadcaster, raw_dataset_factory: RawDatasetFactory, - dataset_metadata_without_id_factory: DatasetMetadataWithoutIdFactory + dataset_metadata_without_id_factory: DatasetMetadataWithoutIdFactory, ): """ Test that if an exception occurs when saving the dataset to the storage repository, @@ -186,7 +183,7 @@ def test_raises_exception_when_file_contents_are_invalid( mock_dataset_source_repo: DatasetSourceRepositoryInterface, mock_dataset_storage_repo: DatasetStorageRepositoryInterface, mock_dataset_deletion_repo: DatasetDeletionRepositoryInterface, - mock_broadcaster + mock_broadcaster, ): """ Test that when the dataset is read in from the source repository @@ -225,7 +222,7 @@ def test_raises_exception_and_autodeletes_when_file_contents_are_invalid( mock_dataset_source_repo: DatasetSourceRepositoryInterface, mock_dataset_storage_repo: DatasetStorageRepositoryInterface, mock_dataset_deletion_repo: DatasetDeletionRepositoryInterface, - mock_broadcaster + mock_broadcaster, ): """ Test that when the dataset is read in from the source repository @@ -264,7 +261,7 @@ def test_raises_exception_if_cannot_find_dataset_in_source_repo( mock_dataset_source_repo: DatasetSourceRepositoryInterface, mock_dataset_storage_repo: DatasetStorageRepositoryInterface, mock_dataset_deletion_repo: DatasetDeletionRepositoryInterface, - mock_broadcaster + mock_broadcaster, ): """ Test that if the specified filename for a dataset cannot be found in the source repository @@ -302,7 +299,7 @@ def test_autodelete_after_successfully_reading_dataset( mock_dataset_deletion_repo: DatasetDeletionRepositoryInterface, mock_broadcaster, raw_dataset_factory: RawDatasetFactory, - dataset_metadata_without_id_factory: DatasetMetadataWithoutIdFactory + dataset_metadata_without_id_factory: DatasetMetadataWithoutIdFactory, ): """ Test that when autodelete is set to True the dataset is automatically deleted @@ -354,7 +351,7 @@ def test_does_not_autodelete_after_successfully_reading_dataset_if_autodelete_fa mock_dataset_deletion_repo: DatasetDeletionRepositoryInterface, mock_broadcaster, raw_dataset_factory: RawDatasetFactory, - dataset_metadata_without_id_factory: DatasetMetadataWithoutIdFactory + dataset_metadata_without_id_factory: DatasetMetadataWithoutIdFactory, ): """ Test that when autodelete is set to False the dataset should not be autodeleted @@ -405,7 +402,7 @@ def test_increments_dataset_version( mock_dataset_deletion_repo: DatasetDeletionRepositoryInterface, mock_broadcaster, raw_dataset_factory: RawDatasetFactory, - dataset_metadata_without_id_factory: DatasetMetadataWithoutIdFactory + dataset_metadata_without_id_factory: DatasetMetadataWithoutIdFactory, ): """ Test that if a dataset exists for the current survey_id and period @@ -465,7 +462,7 @@ class MockSettings: args, kwargs = mock_dataset_storage_repo.store_dataset.call_args # Extract just the dataset_metadata argument - dataset_metadata: DatasetMetadataWithoutId = kwargs['dataset_metadata'] + dataset_metadata: DatasetMetadataWithoutId = kwargs["dataset_metadata"] # Assert the metadata contains correct data assert dataset_metadata.survey_id == survey_id @@ -534,7 +531,7 @@ class MockSettings: args, kwargs = mock_dataset_storage_repo.store_dataset.call_args # Extract just the dataset_metadata argument - dataset_metadata: DatasetMetadataWithoutId = kwargs['dataset_metadata'] + dataset_metadata: DatasetMetadataWithoutId = kwargs["dataset_metadata"] # Assert the metadata contains correct data assert dataset_metadata.survey_id == survey_id @@ -547,7 +544,7 @@ def test_stored_successfully_when_created( mock_dataset_storage_repo: DatasetStorageRepositoryInterface, mock_dataset_deletion_repo: DatasetDeletionRepositoryInterface, mock_broadcaster, - raw_dataset_factory: RawDatasetFactory + raw_dataset_factory: RawDatasetFactory, ): """ Test the dataset_storage_repo is called correctly when a valid @@ -573,17 +570,11 @@ def test_stored_successfully_when_created( data=[ { "identifier": "abc", - "unit_data": ["hello", "world"], + "unit_data": "hello", }, - { - "identifier": "def", - "unit_data": [] - }, - { - "identifier": "ghi", - "unit_data": ["test"] - } - ] + {"identifier": "def", "unit_data": "world"}, + {"identifier": "ghi", "unit_data": "test"}, + ], ) # ------------------------ @@ -619,11 +610,9 @@ class MockSettings: # Check unit_data_identifiers # ------------------------ - unit_data_identifiers = kwargs['unit_data_identifiers'] + unit_data_identifiers = kwargs["unit_data_identifiers"] - expected_identifiers = [ - "abc", "def", "ghi" - ] + expected_identifiers = ["abc", "def", "ghi"] assert unit_data_identifiers == expected_identifiers @@ -631,7 +620,7 @@ class MockSettings: # Check unit_data_collection_with_metadata # ------------------------ - unit_data_collection_with_metadata = kwargs['unit_data_collection_with_metadata'] + unit_data_collection_with_metadata = kwargs["unit_data_collection_with_metadata"] # There should be 3 sets of unit_data assert len(unit_data_collection_with_metadata) == 3 @@ -641,15 +630,15 @@ class MockSettings: assert unit_data_collection_with_metadata[0].survey_id == survey_id assert unit_data_collection_with_metadata[0].period_id == period_id - assert unit_data_collection_with_metadata[0].data == ["hello", "world"] + assert unit_data_collection_with_metadata[0].data == "hello" assert unit_data_collection_with_metadata[1].survey_id == survey_id assert unit_data_collection_with_metadata[1].period_id == period_id - assert unit_data_collection_with_metadata[1].data == [] + assert unit_data_collection_with_metadata[1].data == "world" assert unit_data_collection_with_metadata[2].survey_id == survey_id assert unit_data_collection_with_metadata[2].period_id == period_id - assert unit_data_collection_with_metadata[2].data == ["test"] + assert unit_data_collection_with_metadata[2].data == "test" def test_broadcasts_dataset_when_created_successfully( self, @@ -724,7 +713,7 @@ def test_cleanup_old_dataset_version_when_created_successfully( mock_dataset_deletion_repo: DatasetDeletionRepositoryInterface, mock_broadcaster, raw_dataset_factory: RawDatasetFactory, - dataset_metadata_without_id_factory: DatasetMetadataWithoutIdFactory + dataset_metadata_without_id_factory: DatasetMetadataWithoutIdFactory, ): """ Test that if a new version of the dataset is created successfully @@ -796,7 +785,7 @@ def test_cleanup_old_dataset_version_does_not_delete_when_retain_flag_is_true( mock_dataset_deletion_repo: DatasetDeletionRepositoryInterface, mock_broadcaster, raw_dataset_factory: RawDatasetFactory, - dataset_metadata_without_id_factory: DatasetMetadataWithoutIdFactory + dataset_metadata_without_id_factory: DatasetMetadataWithoutIdFactory, ): """ Test that if a new version of the dataset is created successfully @@ -854,7 +843,6 @@ class MockSettings: class TestDeleteDataset: - def test_raises_exception_if_no_datasets_to_delete( self, mock_dataset_source_repo: DatasetSourceRepositoryInterface, diff --git a/tests/unit/services/test_schema_service.py b/tests/unit/services/test_schema_service.py index 418a0fb..c59a75b 100644 --- a/tests/unit/services/test_schema_service.py +++ b/tests/unit/services/test_schema_service.py @@ -12,13 +12,11 @@ def raise_schema_error(): class TestPublishNewSchemas: - def test_publish_new_schemas_publishes_after_exception( self, mock_repo_publisher: MockPublisher, mock_bucket_publisher: MockPublisher, ): - # Define input filenames filenames = [ "schemas/abc/v1.json", @@ -37,10 +35,7 @@ def test_publish_new_schemas_publishes_after_exception( ) # Publish schemas from GitHub - service.publish_new_schemas( - "github", - filenames - ) + service.publish_new_schemas("github", filenames) should_be_published = [ "schemas/abc/v1.json", @@ -63,7 +58,6 @@ def test_publish_all_files_successfully( mock_repo_publisher: MockPublisher, mock_bucket_publisher: MockPublisher, ): - # Define input filenames repo_filenames = [ "schemas/abc/v1.json", @@ -85,16 +79,10 @@ def test_publish_all_files_successfully( ) # Publish the repository filenames - service.publish_new_schemas( - "github", - repo_filenames - ) + service.publish_new_schemas("github", repo_filenames) # Publish the bucket filenames - service.publish_new_schemas( - "bucket", - bucket_filenames - ) + service.publish_new_schemas("bucket", bucket_filenames) # Assert all the GitHub files are published assert len(mock_repo_publisher.published_schemas) == len(repo_filenames) @@ -118,13 +106,13 @@ def test_publish_github_filter( filenames = { "schemas/a/b.json": True, "schemas/abc/def.json": True, - "schemas/a/b.JSON": False, # case-sensitive - "schemas/a/b": False, # no extension + "schemas/a/b.JSON": False, # case-sensitive + "schemas/a/b": False, # no extension "schemas/a/b.json/extra": False, # too deep - "schemas/a//b.json": False, # empty segment => doesn't match [^/]+ - "schemas//b.json": False, # empty segment - "schemas/a/": False, # missing filename - "Schemas/a/b.json": False, # wrong case in prefix + "schemas/a//b.json": False, # empty segment => doesn't match [^/]+ + "schemas//b.json": False, # empty segment + "schemas/a/": False, # missing filename + "Schemas/a/b.json": False, # wrong case in prefix "other/schemas/a/b.json": False, # doesn't start with schemas/ } @@ -133,10 +121,7 @@ def test_publish_github_filter( bucket_publisher=mock_bucket_publisher, ) - service.publish_new_schemas( - "github", - list(filenames.keys()) - ) + service.publish_new_schemas("github", list(filenames.keys())) # Assert only the valid filenames are published assert len(mock_repo_publisher.published_schemas) == 2 diff --git a/uv.lock b/uv.lock index 5143665..dd4f85f 100644 --- a/uv.lock +++ b/uv.lock @@ -1328,7 +1328,7 @@ wheels = [ [[package]] name = "sds-loader" -version = "0.1.1" +version = "0.1.2" source = { virtual = "." } dependencies = [ { name = "fastapi" },