Skip to content

Commit 8f5ea28

Browse files
authored
fix: delete url embeddings from index when deleting from Admin website (#977)
1 parent 5617f82 commit 8f5ea28

File tree

4 files changed

+37
-17
lines changed

4 files changed

+37
-17
lines changed

code/backend/batch/utilities/search/search_handler_base.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,8 @@ def get_unique_files(self, results, facet_key: str):
2424

2525
def delete_from_index(self, blob_url) -> None:
2626
documents = self.search_by_blob_url(blob_url)
27+
if documents is None or documents.get_count() == 0:
28+
return
2729
files_to_delete = self.output_results(documents)
2830
self.delete_files(files_to_delete)
2931

code/backend/pages/03_Delete_Data.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -62,15 +62,15 @@
6262
st.info("No files selected")
6363
st.stop()
6464
else:
65+
files_to_delete = search_handler.delete_files(
66+
selected_files,
67+
)
6568
blob_client = AzureBlobStorageClient()
6669
blob_client.delete_files(
6770
selected_files, env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION
6871
)
69-
if len(selected_files) > 0:
70-
st.success(
71-
"Deleted files from storage. Deleting from the index is an asynchronous process and may take a few minutes to complete."
72-
+ ", ".join([name for name, ids in selected_files.items()])
73-
)
72+
if len(files_to_delete) > 0:
73+
st.success("Deleted files: " + str(files_to_delete))
7474

7575
except Exception:
7676
logger.error(traceback.format_exc())

code/tests/search_utilities/test_azure_search_handler.py

Lines changed: 16 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
from backend.batch.utilities.search.azure_search_handler import AzureSearchHandler
44
import json
55
from azure.search.documents.models import VectorizedQuery
6-
6+
from azure.search.documents import SearchItemPaged
77
from backend.batch.utilities.common.source_document import SourceDocument
88

99

@@ -369,14 +369,23 @@ def test_delete_from_index(handler, mock_search_client):
369369
# given
370370
blob_url = "https://example.com/blob"
371371
filter_value = f"source eq '{blob_url}_SAS_TOKEN_PLACEHOLDER_'"
372-
documents = [
373-
{"id": 1, "title": "file1"},
374-
{"id": 2, "title": "file2"},
375-
{"id": 3, "title": "file1"},
376-
{"id": 4, "title": "file3"},
377-
]
372+
documents = Mock(
373+
SearchItemPaged(
374+
[
375+
{"id": 1, "title": "file1"},
376+
{"id": 2, "title": "file2"},
377+
{"id": 3, "title": "file1"},
378+
{"id": 4, "title": "file3"},
379+
]
380+
)
381+
)
382+
378383
handler.search_client.search.return_value = documents
384+
documents.get_count.return_value = 4
379385
ids_to_delete = [{"id": 1}, {"id": 3}, {"id": 2}, {"id": 4}]
386+
handler.output_results = MagicMock(
387+
return_value={"file1": [1, 3], "file2": [2], "file3": [4]}
388+
)
380389

381390
# when
382391
handler.delete_from_index(blob_url)

code/tests/search_utilities/test_integrated_vectorization_search_handler.py

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,10 @@
11
import pytest
2-
from unittest.mock import Mock, patch
2+
from unittest.mock import MagicMock, Mock, patch
33
from backend.batch.utilities.search.integrated_vectorization_search_handler import (
44
IntegratedVectorizationSearchHandler,
55
)
66
from azure.search.documents.models import VectorizableTextQuery
7+
from azure.search.documents import SearchItemPaged
78

89
from backend.batch.utilities.common.source_document import SourceDocument
910

@@ -261,12 +262,20 @@ def test_delete_from_index(env_helper_mock, handler, search_client_mock):
261262
env_helper_mock.AZURE_BLOB_CONTAINER_NAME = "documents"
262263
blob_url = "https://example.com/documents/file1.txt"
263264
title = "file1.txt"
264-
documents = [
265-
{"chunk_id": "123_chunk", "title": title},
266-
{"chunk_id": "789_chunk", "title": title},
267-
]
265+
documents = Mock(
266+
SearchItemPaged(
267+
[
268+
{"chunk_id": "123_chunk", "title": title},
269+
{"chunk_id": "789_chunk", "title": title},
270+
]
271+
)
272+
)
268273
search_client_mock.search.return_value = documents
274+
documents.get_count.return_value = 2
269275
ids_to_delete = [{"chunk_id": "123_chunk"}, {"chunk_id": "789_chunk"}]
276+
handler.output_results = MagicMock(
277+
return_value={"file1.txt": ["123_chunk", "789_chunk"]}
278+
)
270279

271280
# when
272281
handler.delete_from_index(blob_url)

0 commit comments

Comments
 (0)