Skip to content

Commit c0a997a

Browse files
fix: improve NLTK resource handling and logging in GCS document retrieval (#1395)
* fix: improve NLTK resource handling and logging in GCS document retrieval * fix: streamline NLTK resource downloading and path management in GCS document retrieval
1 parent 0f7161c commit c0a997a

File tree

1 file changed

+41
-39
lines changed

1 file changed

+41
-39
lines changed

backend/src/document_sources/gcs_bucket.py

Lines changed: 41 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -47,57 +47,59 @@ def gcs_loader_func(file_path):
4747

4848
def _ensure_nltk_resources():
    """Make sure the NLTK tokenizer/tagger data used by downstream loaders is findable.

    Registers the well-known data directories on ``nltk.data.path`` (without
    duplicating entries) and downloads any missing resource.
    """
    nltk_data_dirs = ["/usr/local/nltk_data", os.path.expanduser("~/.nltk_data")]
    for d in nltk_data_dirs:
        if d not in nltk.data.path:
            nltk.data.path.append(d)

    resources = [
        ("punkt", "tokenizers"),
        ("averaged_perceptron_tagger", "taggers"),
    ]
    for res, res_type in resources:
        try:
            nltk.data.find(f"{res_type}/{res}")
        except LookupError:
            # NOTE(review): this downloads into /usr/local/nltk_data, which may not be
            # writable by the runtime user — confirm permissions in the deploy image.
            logging.info(f"NLTK resource '{res}' not found; downloading to /usr/local/nltk_data")
            nltk.download(res, download_dir="/usr/local/nltk_data")


def _build_blob_name(gcs_bucket_folder, gcs_blob_filename):
    """Join an optional bucket folder and a filename into a blob name.

    A trailing '/' on the folder is honoured so the result never contains '//'.
    An empty/whitespace-only or ``None`` folder yields the bare filename.
    """
    if gcs_bucket_folder is not None and gcs_bucket_folder.strip() != "":
        if gcs_bucket_folder.endswith('/'):
            return gcs_bucket_folder + gcs_blob_filename
        return gcs_bucket_folder + '/' + gcs_blob_filename
    return gcs_blob_filename


def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token=None):
    """Load a document from a GCS bucket and return ``(filename, pages)``.

    Parameters:
        gcs_project_id: GCP project that owns the bucket.
        gcs_bucket_name: Name of the GCS bucket.
        gcs_bucket_folder: Optional folder prefix inside the bucket (may be None/empty).
        gcs_blob_filename: Name of the blob (file) to load.
        access_token: Optional OAuth2 access token. When ``None`` the default
            application credentials are used and the blob is parsed via
            ``GCSFileLoader``; when provided, the blob is downloaded as bytes
            and parsed as a PDF with ``PdfReader``.

    Returns:
        Tuple of the original blob filename and a list of ``Document`` pages.

    Raises:
        LLMGraphBuilderException: if the blob does not exist in the bucket.
    """
    _ensure_nltk_resources()

    blob_name = _build_blob_name(gcs_bucket_folder, gcs_blob_filename)

    logging.info(f"GCS project_id : {gcs_project_id}")

    if access_token is None:
        # Default credentials path: delegate parsing to the configured loader_func.
        storage_client = storage.Client(project=gcs_project_id)
        bucket = storage_client.bucket(gcs_bucket_name)
        blob = bucket.blob(blob_name)
        if blob.exists():
            loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=gcs_loader_func)
            pages = loader.load()
        else:
            raise LLMGraphBuilderException('File does not exist, Please re-upload the file and try again.')
    else:
        # Token path: download the raw bytes and extract text as a PDF.
        creds = Credentials(access_token)
        storage_client = storage.Client(project=gcs_project_id, credentials=creds)
        bucket = storage_client.bucket(gcs_bucket_name)
        blob = bucket.blob(blob_name)
        if blob.exists():
            content = blob.download_as_bytes()
            pdf_file = io.BytesIO(content)
            pdf_reader = PdfReader(pdf_file)
            # Extract text from all pages; join once to avoid quadratic += concatenation.
            # extract_text() can return None for empty/image-only pages, hence `or ""`.
            text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
            pages = [Document(page_content=text)]
        else:
            raise LLMGraphBuilderException(f'File Not Found in GCS bucket - {gcs_bucket_name}')
    return gcs_blob_filename, pages
101103

102104
def upload_file_to_gcs(file_chunk, chunk_number, original_file_name, bucket_name, folder_name_sha1_hashed):
103105
try:

0 commit comments

Comments
 (0)