From c87dcffc3d83ea5199f71038341ea9cdf76c090e Mon Sep 17 00:00:00 2001 From: NogaNHS <127490765+NogaNHS@users.noreply.github.com> Date: Wed, 18 Mar 2026 11:09:58 +0000 Subject: [PATCH 1/3] [PRMP-1595] Refactor file key generation in review processor to use deterministic UUIDs --- .../document_review_processor_service.py | 5 +- .../test_document_review_processor_service.py | 64 +++++++++++++------ 2 files changed, 47 insertions(+), 22 deletions(-) diff --git a/lambdas/services/document_review_processor_service.py b/lambdas/services/document_review_processor_service.py index 9b3e02c61e..1e392567e5 100644 --- a/lambdas/services/document_review_processor_service.py +++ b/lambdas/services/document_review_processor_service.py @@ -25,6 +25,8 @@ logger = LoggingService(__name__) +S3_STORAGE_NAMESPACE = uuid.UUID('fb0e6109-abd7-4531-b0f2-c6e5be5861b0') + class ReviewProcessorService: @@ -114,9 +116,8 @@ def _move_files_to_review_bucket( review_record_id: str, ) -> list[DocumentReviewFileDetails]: new_file_keys: list[DocumentReviewFileDetails] = [] - for file in message_data.files: - object_key = uuid.uuid4() + object_key = uuid.uuid5(S3_STORAGE_NAMESPACE, f"{review_record_id}/{file.file_path}") new_file_key = f"{review_record_id}/{object_key}" logger.info( diff --git a/lambdas/tests/unit/services/test_document_review_processor_service.py b/lambdas/tests/unit/services/test_document_review_processor_service.py index c0e3a46262..07994a3e64 100644 --- a/lambdas/tests/unit/services/test_document_review_processor_service.py +++ b/lambdas/tests/unit/services/test_document_review_processor_service.py @@ -1,3 +1,5 @@ +import uuid + import pytest from botocore.exceptions import ClientError from enums.document_review_reason import DocumentReviewReason @@ -8,7 +10,7 @@ ) from models.pds_models import PatientDetails from models.sqs.review_message_body import ReviewMessageBody, ReviewMessageFile -from services.document_review_processor_service import ReviewProcessorService +from services.document_review_processor_service import ReviewProcessorService, S3_STORAGE_NAMESPACE from tests.unit.conftest import MOCK_DOCUMENT_REVIEW_BUCKET, S3_PREFIX, TEST_NHS_NUMBER from utils.exceptions import ( InvalidResourceIdException, @@ -280,16 +282,17 @@ def test_build_review_record_with_multiple_files(service_under_test): assert result.files[1].file_name == "document_2.pdf" -def test_move_files_success(service_under_test, sample_review_message, mocker): - mocker.patch("uuid.uuid4", return_value="123412342") +def test_move_files_success(service_under_test, sample_review_message): + review_id = "test-review-id-123" + source_file_path = f"staging/{TEST_NHS_NUMBER}/test_document.pdf" + expected_object_key = uuid.uuid5(S3_STORAGE_NAMESPACE, f"{review_id}/{source_file_path}") + expected_key = f"{review_id}/{expected_object_key}" files = service_under_test._move_files_to_review_bucket( sample_review_message, - "test-review-id-123", + review_id, ) - expected_key = "test-review-id-123/123412342" - assert len(files) == 1 assert files[0].file_name == "test_document.pdf" assert ( @@ -299,49 +302,70 @@ def test_move_files_success(service_under_test, sample_review_message, mocker): service_under_test.s3_service.copy_across_bucket.assert_called_once_with( source_bucket="test_staging_bulk_store", - source_file_key=f"staging/{TEST_NHS_NUMBER}/test_document.pdf", + source_file_key=source_file_path, dest_bucket="test_document_review_bucket", dest_file_key=expected_key, if_none_match=True, ) -def test_move_multiple_files_success(service_under_test, mocker): +def test_move_multiple_files_success(service_under_test): + review_id = "test-review-id" + file_path_1 = f"staging/{TEST_NHS_NUMBER}/document_1.pdf" + file_path_2 = f"staging/{TEST_NHS_NUMBER}/document_2.pdf" + expected_key_1 = f"{review_id}/{uuid.uuid5(S3_STORAGE_NAMESPACE, f'{review_id}/{file_path_1}')}" + expected_key_2 = f"{review_id}/{uuid.uuid5(S3_STORAGE_NAMESPACE, f'{review_id}/{file_path_2}')}" + message = ReviewMessageBody( upload_id="test-upload-id-999", files=[ ReviewMessageFile( file_name="document_1.pdf", - file_path=f"staging/{TEST_NHS_NUMBER}/document_1.pdf", + file_path=file_path_1, ), ReviewMessageFile( file_name="document_2.pdf", - file_path=f"staging/{TEST_NHS_NUMBER}/document_2.pdf", + file_path=file_path_2, ), ], nhs_number=TEST_NHS_NUMBER, failure_reason=DocumentReviewReason.UNSUCCESSFUL_UPLOAD, uploader_ods="Y12345", ) - mocker.patch("uuid.uuid4", side_effect=["123412342", "56785678"]) - files = service_under_test._move_files_to_review_bucket(message, "test-review-id") + files = service_under_test._move_files_to_review_bucket(message, review_id) assert len(files) == 2 assert files[0].file_name == "document_1.pdf" - assert ( - files[0].file_location - == f"{S3_PREFIX}{MOCK_DOCUMENT_REVIEW_BUCKET}/test-review-id/123412342" - ) + assert files[0].file_location == f"{S3_PREFIX}{MOCK_DOCUMENT_REVIEW_BUCKET}/{expected_key_1}" assert files[1].file_name == "document_2.pdf" - assert ( - files[1].file_location - == f"{S3_PREFIX}{MOCK_DOCUMENT_REVIEW_BUCKET}/test-review-id/56785678" - ) + assert files[1].file_location == f"{S3_PREFIX}{MOCK_DOCUMENT_REVIEW_BUCKET}/{expected_key_2}" assert service_under_test.s3_service.copy_across_bucket.call_count == 2 +def test_move_files_destination_key_is_deterministic_across_retries( + service_under_test, sample_review_message +): + + review_id = "test-review-id-123" + + files_first_invocation = service_under_test._move_files_to_review_bucket( + sample_review_message, + review_id, + ) + + # Reset the mock so the second call is tracked independently + service_under_test.s3_service.copy_across_bucket.reset_mock() + + files_second_invocation = service_under_test._move_files_to_review_bucket( + sample_review_message, + review_id, + ) + + assert files_first_invocation[0].file_location == files_second_invocation[0].file_location + + def test_move_files_copy_error(service_under_test, sample_review_message): service_under_test.s3_service.copy_across_bucket.side_effect = ClientError( {"Error": {"Code": "NoSuchKey", "Message": "Source not found"}}, From 495e64a235ef7e8635b972e88aec3b81c29b0b83 Mon Sep 17 00:00:00 2001 From: NogaNHS <127490765+NogaNHS@users.noreply.github.com> Date: Wed, 18 Mar 2026 14:05:09 +0000 Subject: [PATCH 2/3] [PRMP-1595] Deduplicate identical file paths in review processing and improve logging --- .../document_review_processor_service.py | 14 +++- .../test_document_review_processor_service.py | 73 +++++++++++++++++-- 2 files changed, 77 insertions(+), 10 deletions(-) diff --git a/lambdas/services/document_review_processor_service.py b/lambdas/services/document_review_processor_service.py index 1e392567e5..ca618103a6 100644 --- a/lambdas/services/document_review_processor_service.py +++ b/lambdas/services/document_review_processor_service.py @@ -25,7 +25,7 @@ logger = LoggingService(__name__) -S3_STORAGE_NAMESPACE = uuid.UUID('fb0e6109-abd7-4531-b0f2-c6e5be5861b0') +S3_STORAGE_NAMESPACE = uuid.UUID("fb0e6109-abd7-4531-b0f2-c6e5be5861b0") class ReviewProcessorService: @@ -115,9 +115,19 @@ def _move_files_to_review_bucket( message_data: ReviewMessageBody, review_record_id: str, ) -> list[DocumentReviewFileDetails]: + seen_source_paths: set[str] = set() new_file_keys: list[DocumentReviewFileDetails] = [] for file in message_data.files: - object_key = uuid.uuid5(S3_STORAGE_NAMESPACE, f"{review_record_id}/{file.file_path}") + if file.file_path in seen_source_paths: + logger.warning( + f"Duplicate source file path detected, skipping: {file.file_path}", + ) + continue + seen_source_paths.add(file.file_path) + + object_key = uuid.uuid5( + S3_STORAGE_NAMESPACE, f"{review_record_id}/{file.file_path}", + ) new_file_key = f"{review_record_id}/{object_key}" logger.info( diff --git a/lambdas/tests/unit/services/test_document_review_processor_service.py b/lambdas/tests/unit/services/test_document_review_processor_service.py index 07994a3e64..b4ced07943 100644 --- a/lambdas/tests/unit/services/test_document_review_processor_service.py +++ b/lambdas/tests/unit/services/test_document_review_processor_service.py @@ -2,6 +2,7 @@ import pytest from botocore.exceptions import ClientError + from enums.document_review_reason import DocumentReviewReason from enums.document_review_status import DocumentReviewStatus from models.document_review import ( @@ -10,7 +11,10 @@ ) from models.pds_models import PatientDetails from models.sqs.review_message_body import ReviewMessageBody, ReviewMessageFile -from services.document_review_processor_service import ReviewProcessorService, S3_STORAGE_NAMESPACE +from services.document_review_processor_service import ( + S3_STORAGE_NAMESPACE, + ReviewProcessorService, +) from tests.unit.conftest import MOCK_DOCUMENT_REVIEW_BUCKET, S3_PREFIX, TEST_NHS_NUMBER from utils.exceptions import ( InvalidResourceIdException, @@ -285,7 +289,9 @@ def test_build_review_record_with_multiple_files(service_under_test): def test_move_files_success(service_under_test, sample_review_message): review_id = "test-review-id-123" source_file_path = f"staging/{TEST_NHS_NUMBER}/test_document.pdf" - expected_object_key = uuid.uuid5(S3_STORAGE_NAMESPACE, f"{review_id}/{source_file_path}") + expected_object_key = uuid.uuid5( + S3_STORAGE_NAMESPACE, f"{review_id}/{source_file_path}", + ) expected_key = f"{review_id}/{expected_object_key}" files = service_under_test._move_files_to_review_bucket( @@ -313,8 +319,12 @@ def test_move_multiple_files_success(service_under_test): review_id = "test-review-id" file_path_1 = f"staging/{TEST_NHS_NUMBER}/document_1.pdf" file_path_2 = f"staging/{TEST_NHS_NUMBER}/document_2.pdf" - expected_key_1 = f"{review_id}/{uuid.uuid5(S3_STORAGE_NAMESPACE, f'{review_id}/{file_path_1}')}" - expected_key_2 = f"{review_id}/{uuid.uuid5(S3_STORAGE_NAMESPACE, f'{review_id}/{file_path_2}')}" + expected_key_1 = ( + f"{review_id}/{uuid.uuid5(S3_STORAGE_NAMESPACE, f'{review_id}/{file_path_1}')}" + ) + expected_key_2 = ( + f"{review_id}/{uuid.uuid5(S3_STORAGE_NAMESPACE, f'{review_id}/{file_path_2}')}" + ) message = ReviewMessageBody( upload_id="test-upload-id-999", @@ -337,15 +347,21 @@ def test_move_multiple_files_success(service_under_test): assert len(files) == 2 assert files[0].file_name == "document_1.pdf" - assert files[0].file_location == f"{S3_PREFIX}{MOCK_DOCUMENT_REVIEW_BUCKET}/{expected_key_1}" + assert ( + files[0].file_location + == f"{S3_PREFIX}{MOCK_DOCUMENT_REVIEW_BUCKET}/{expected_key_1}" + ) assert files[1].file_name == "document_2.pdf" - assert files[1].file_location == f"{S3_PREFIX}{MOCK_DOCUMENT_REVIEW_BUCKET}/{expected_key_2}" + assert ( + files[1].file_location + == f"{S3_PREFIX}{MOCK_DOCUMENT_REVIEW_BUCKET}/{expected_key_2}" + ) assert service_under_test.s3_service.copy_across_bucket.call_count == 2 def test_move_files_destination_key_is_deterministic_across_retries( - service_under_test, sample_review_message + service_under_test, sample_review_message, ): review_id = "test-review-id-123" @@ -363,7 +379,48 @@ def test_move_files_destination_key_is_deterministic_across_retries( review_id, ) - assert files_first_invocation[0].file_location == files_second_invocation[0].file_location + assert ( + files_first_invocation[0].file_location + == files_second_invocation[0].file_location + ) + + +def test_move_files_deduplicates_identical_file_paths(service_under_test): + review_id = "test-review-id-dup" + source_file_path = f"staging/{TEST_NHS_NUMBER}/document_1.pdf" + expected_key = f"{review_id}/{uuid.uuid5(S3_STORAGE_NAMESPACE, f'{review_id}/{source_file_path}')}" + + message_with_duplicate_files = ReviewMessageBody( + upload_id=review_id, + files=[ + ReviewMessageFile(file_name="document_1.pdf", file_path=source_file_path), + ReviewMessageFile( + file_name="document_1.pdf", file_path=source_file_path, + ), # duplicate row + ], + nhs_number=TEST_NHS_NUMBER, + failure_reason=DocumentReviewReason.UNSUCCESSFUL_UPLOAD, + uploader_ods="Y12345", + ) + + files = service_under_test._move_files_to_review_bucket( + message_with_duplicate_files, review_id, + ) + + assert len(files) == 1 + assert files[0].file_name == "document_1.pdf" + assert ( + files[0].file_location + == f"{S3_PREFIX}{MOCK_DOCUMENT_REVIEW_BUCKET}/{expected_key}" + ) + + service_under_test.s3_service.copy_across_bucket.assert_called_once_with( + source_bucket="test_staging_bulk_store", + source_file_key=source_file_path, + dest_bucket="test_document_review_bucket", + dest_file_key=expected_key, + if_none_match=True, + ) def test_move_files_copy_error(service_under_test, sample_review_message): From 2ebeecda92b44885933e2aeacd4c1dc4b258eb1c Mon Sep 17 00:00:00 2001 From: NogaNHS <127490765+NogaNHS@users.noreply.github.com> Date: Mon, 23 Mar 2026 10:32:18 +0000 Subject: [PATCH 3/3] format --- .../services/document_review_processor_service.py | 3 ++- .../handlers/test_bulk_upload_metadata_handler.py | 3 ++- .../test_document_review_processor_service.py | 12 ++++++++---- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/lambdas/services/document_review_processor_service.py b/lambdas/services/document_review_processor_service.py index ca618103a6..40deb3f5e3 100644 --- a/lambdas/services/document_review_processor_service.py +++ b/lambdas/services/document_review_processor_service.py @@ -126,7 +126,8 @@ def _move_files_to_review_bucket( seen_source_paths.add(file.file_path) object_key = uuid.uuid5( - S3_STORAGE_NAMESPACE, f"{review_record_id}/{file.file_path}", + S3_STORAGE_NAMESPACE, + f"{review_record_id}/{file.file_path}", ) new_file_key = f"{review_record_id}/{object_key}" diff --git a/lambdas/tests/unit/handlers/test_bulk_upload_metadata_handler.py b/lambdas/tests/unit/handlers/test_bulk_upload_metadata_handler.py index 988a86d8ad..bbee57a0f3 100644 --- a/lambdas/tests/unit/handlers/test_bulk_upload_metadata_handler.py +++ b/lambdas/tests/unit/handlers/test_bulk_upload_metadata_handler.py @@ -1,11 +1,12 @@ import pytest + from handlers.bulk_upload_metadata_handler import lambda_handler from models.staging_metadata import METADATA_FILENAME from services.bulk_upload_metadata_service import BulkUploadMetadataService def test_lambda_call_process_metadata_of_service_class( - set_env, event, context, mock_metadata_service + set_env, event, context, mock_metadata_service, ): lambda_handler(event, context) diff --git a/lambdas/tests/unit/services/test_document_review_processor_service.py b/lambdas/tests/unit/services/test_document_review_processor_service.py index b4ced07943..4ee3bb1d71 100644 --- a/lambdas/tests/unit/services/test_document_review_processor_service.py +++ b/lambdas/tests/unit/services/test_document_review_processor_service.py @@ -290,7 +290,8 @@ def test_move_files_success(service_under_test, sample_review_message): review_id = "test-review-id-123" source_file_path = f"staging/{TEST_NHS_NUMBER}/test_document.pdf" expected_object_key = uuid.uuid5( - S3_STORAGE_NAMESPACE, f"{review_id}/{source_file_path}", + S3_STORAGE_NAMESPACE, + f"{review_id}/{source_file_path}", ) expected_key = f"{review_id}/{expected_object_key}" @@ -361,7 +362,8 @@ def test_move_multiple_files_success(service_under_test): def test_move_files_destination_key_is_deterministic_across_retries( - service_under_test, sample_review_message, + service_under_test, + sample_review_message, ): review_id = "test-review-id-123" @@ -395,7 +397,8 @@ def test_move_files_deduplicates_identical_file_paths(service_under_test): files=[ ReviewMessageFile(file_name="document_1.pdf", file_path=source_file_path), ReviewMessageFile( - file_name="document_1.pdf", file_path=source_file_path, + file_name="document_1.pdf", + file_path=source_file_path, ), # duplicate row ], nhs_number=TEST_NHS_NUMBER, @@ -404,7 +407,8 @@ def test_move_files_deduplicates_identical_file_paths(service_under_test): ) files = service_under_test._move_files_to_review_bucket( - message_with_duplicate_files, review_id, + message_with_duplicate_files, + review_id, ) assert len(files) == 1