Skip to content

Commit 3a497d7

Browse files
komalg1 and tectonia authored
fix: Delete from storage when files deleted from Admin site (#970)
Co-authored-by: Martyna Marcinkowska <61530975+tectonia@users.noreply.github.com>
1 parent 5be92b5 commit 3a497d7

File tree

9 files changed

+104
-29
lines changed

9 files changed

+104
-29
lines changed

code/backend/batch/batch_push_results.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,4 +60,4 @@ def _process_document_deleted_event(message_body) -> None:
6060
search_handler = Search.get_search_handler(env_helper)
6161

6262
blob_url = message_body.get("data", {}).get("url", "")
63-
search_handler.delete_by_source(f"{blob_url}_SAS_TOKEN_PLACEHOLDER_")
63+
search_handler.delete_from_index(blob_url)

code/backend/batch/utilities/helpers/azure_blob_storage_client.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,23 @@ def delete_file(self, file_name):
156156
blob_client = self.blob_service_client.get_blob_client(
157157
container=self.container_name, blob=file_name
158158
)
159-
blob_client.delete_blob()
159+
if blob_client.exists():
160+
blob_client.delete_blob()
161+
162+
def delete_files(self, files, integrated_vectorization: bool):
    """
    Deletes files from the Azure Blob Storage container.

    Args:
        files (dict[str, Any] | Iterable[str]): The files to delete. Only
            the names are used: for a mapping these are its keys (the
            values are ignored); for any other iterable, its items.
        integrated_vectorization (bool): When False, each name is reduced
            to the text after its last "/" before it is deleted. When
            True, the name is passed to ``delete_file`` unchanged.

    Returns:
        None
    """
    # Iterating the container directly yields the keys of a mapping and the
    # items of a list, so both input shapes are accepted.
    for filename in files:
        if not integrated_vectorization:
            # Keep only the final path segment of the name.
            filename = filename.split("/")[-1]
        self.delete_file(filename)
160176

161177
def get_all_files(self):
162178
# Get all files in the container from Azure Blob Storage

code/backend/batch/utilities/search/azure_search_handler.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from typing import List
2+
23
from .search_handler_base import SearchHandlerBase
34
from ..helpers.llm_helper import LLMHelper
45
from ..helpers.azure_computer_vision_client import AzureComputerVisionClient
@@ -63,6 +64,14 @@ def delete_files(self, files):
6364

6465
return ", ".join(files_to_delete)
6566

67+
def search_by_blob_url(self, blob_url):
    """
    Search the index for documents whose ``source`` matches a blob URL.

    The value compared against ``source`` is the blob URL followed by the
    literal suffix ``_SAS_TOKEN_PLACEHOLDER_``.

    Args:
        blob_url: URL of the blob whose indexed documents are wanted.

    Returns:
        Whatever ``self.search_client.search`` returns for the query; only
        the ``id`` and ``title`` fields are selected.
    """
    # OData string literals escape a single quote by doubling it; without
    # this a quote in the URL would terminate the literal early.
    source = f"{blob_url}_SAS_TOKEN_PLACEHOLDER_".replace("'", "''")
    return self.search_client.search(
        "*",
        select="id, title",
        include_total_count=True,
        filter=f"source eq '{source}'",
    )
74+
6675
def query_search(self, question) -> List[SourceDocument]:
6776
encoding = tiktoken.get_encoding(self._ENCODER_NAME)
6877
tokenised_question = encoding.encode(question)

code/backend/batch/utilities/search/integrated_vectorization_search_handler.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,16 @@ def output_results(self, results):
5757

5858
return files
5959

60+
def search_by_blob_url(self, blob_url: str):
    """
    Search the index for the chunks that belong to a blob.

    The blob's title is taken to be everything in ``blob_url`` after the
    first ``/<container name>/`` segment, where the container name comes
    from ``self.env_helper.AZURE_BLOB_CONTAINER_NAME``.

    Args:
        blob_url: Full URL of the blob, including the container segment.

    Returns:
        Whatever ``self.search_client.search`` returns for the query
        (``id``, ``chunk_id`` and ``title`` are selected), or ``None`` when
        ``self._check_index_exists()`` is falsy.

    Raises:
        IndexError: If ``blob_url`` does not contain the container segment.
    """
    if not self._check_index_exists():
        return None

    # Anchor on "/<container>/" and split only once, so that a container
    # name recurring inside the blob path, or matching the tail of the host
    # name, does not truncate or misplace the title.
    container_segment = f"/{self.env_helper.AZURE_BLOB_CONTAINER_NAME}/"
    title = blob_url.split(container_segment, 1)[1]

    # OData string literals escape a single quote by doubling it.
    escaped_title = title.replace("'", "''")
    return self.search_client.search(
        "*",
        select="id, chunk_id, title",
        include_total_count=True,
        filter=f"title eq '{escaped_title}'",
    )
69+
6070
def delete_files(self, files):
6171
ids_to_delete = []
6272
files_to_delete = []

code/backend/batch/utilities/search/search_handler_base.py

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@ def get_unique_files(self, results, facet_key: str):
2222
return [facet["value"] for facet in results.get_facets()[facet_key]]
2323
return []
2424

25+
def delete_from_index(self, blob_url) -> None:
    """
    Delete from the search index every document that came from a blob.

    Looks the documents up with ``search_by_blob_url``, converts them with
    ``output_results`` and passes the result to ``delete_files``.

    Args:
        blob_url: URL of the blob whose indexed documents should be removed.

    Returns:
        None
    """
    documents = self.search_by_blob_url(blob_url)
    # A handler may return None instead of a result set (for example when
    # its index does not exist); there is nothing to delete in that case.
    if documents is None:
        return

    files_to_delete = self.output_results(documents)
    self.delete_files(files_to_delete)
29+
2530
@abstractmethod
2631
def create_search_client(self) -> SearchClient:
2732
pass
@@ -50,24 +55,6 @@ def delete_files(self, files):
5055
def query_search(self, question) -> list[SourceDocument]:
5156
pass
5257

53-
def delete_by_source(self, source) -> None:
54-
if source is None:
55-
return
56-
57-
documents = self._get_documents_by_source(source)
58-
if documents is None:
59-
return
60-
61-
files_to_delete = self.output_results(documents)
62-
self.delete_files(files_to_delete)
63-
64-
def _get_documents_by_source(self, source):
65-
if source is None:
66-
return None
67-
68-
return self.search_client.search(
69-
"*",
70-
select="id, title",
71-
include_total_count=True,
72-
filter=f"source eq '{source}'",
73-
)
58+
@abstractmethod
59+
def search_by_blob_url(self, blob_url):
60+
pass

code/backend/pages/03_Delete_Data.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import logging
66
from batch.utilities.helpers.env_helper import EnvHelper
77
from batch.utilities.search.search import Search
8+
from batch.utilities.helpers.azure_blob_storage_client import AzureBlobStorageClient
89

910
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
1011
env_helper: EnvHelper = EnvHelper()
@@ -61,11 +62,15 @@
6162
st.info("No files selected")
6263
st.stop()
6364
else:
64-
files_to_delete = search_handler.delete_files(
65-
selected_files,
65+
blob_client = AzureBlobStorageClient()
66+
blob_client.delete_files(
67+
selected_files, env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION
6668
)
67-
if len(files_to_delete) > 0:
68-
st.success("Deleted files: " + str(files_to_delete))
69+
if len(selected_files) > 0:
70+
st.success(
71+
"Deleted files from storage. Deleting from the index is an asynchronous process and may take a few minutes to complete."
72+
+ ", ".join([name for name, ids in selected_files.items()])
73+
)
6974

7075
except Exception:
7176
logger.error(traceback.format_exc())

code/tests/search_utilities/test_azure_search_handler.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,3 +363,26 @@ def test_semantic_search_with_advanced_image_processing(
363363
query_answer="extractive",
364364
top=handler.env_helper.AZURE_SEARCH_TOP_K,
365365
)
366+
367+
368+
def test_delete_from_index(handler, mock_search_client):
369+
# given
370+
blob_url = "https://example.com/blob"
371+
filter_value = f"source eq '{blob_url}_SAS_TOKEN_PLACEHOLDER_'"
372+
documents = [
373+
{"id": 1, "title": "file1"},
374+
{"id": 2, "title": "file2"},
375+
{"id": 3, "title": "file1"},
376+
{"id": 4, "title": "file3"},
377+
]
378+
handler.search_client.search.return_value = documents
379+
ids_to_delete = [{"id": 1}, {"id": 3}, {"id": 2}, {"id": 4}]
380+
381+
# when
382+
handler.delete_from_index(blob_url)
383+
384+
# then
385+
handler.search_client.search.assert_called_once_with(
386+
"*", select="id, title", include_total_count=True, filter=filter_value
387+
)
388+
handler.search_client.delete_documents.assert_called_once_with(ids_to_delete)

code/tests/search_utilities/test_integrated_vectorization_search_handler.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,3 +254,28 @@ def test_query_search_converts_results_to_source_documents(handler):
254254

255255
# then
256256
assert actual_results == expected_results
257+
258+
259+
def test_delete_from_index(env_helper_mock, handler, search_client_mock):
260+
# given
261+
env_helper_mock.AZURE_BLOB_CONTAINER_NAME = "documents"
262+
blob_url = "https://example.com/documents/file1.txt"
263+
title = "file1.txt"
264+
documents = [
265+
{"chunk_id": "123_chunk", "title": title},
266+
{"chunk_id": "789_chunk", "title": title},
267+
]
268+
search_client_mock.search.return_value = documents
269+
ids_to_delete = [{"chunk_id": "123_chunk"}, {"chunk_id": "789_chunk"}]
270+
271+
# when
272+
handler.delete_from_index(blob_url)
273+
274+
# then
275+
search_client_mock.search.assert_called_once_with(
276+
"*",
277+
select="id, chunk_id, title",
278+
include_total_count=True,
279+
filter=f"title eq '{title}'",
280+
)
281+
search_client_mock.delete_documents.assert_called_once_with(ids_to_delete)

code/tests/test_batch_push_results.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,6 @@ def test_batch_push_results_with_blob_deleted_event_uses_search_to_delete_with_s
123123
)
124124

125125
batch_push_results.build().get_user_function()(mock_queue_message)
126-
mock_get_search_handler.delete_by_source.assert_called_once_with(
127-
"https://test.test/test/test_filename.pdf_SAS_TOKEN_PLACEHOLDER_"
126+
mock_get_search_handler.delete_from_index.assert_called_once_with(
127+
"https://test.test/test/test_filename.pdf"
128128
)

0 commit comments

Comments
 (0)