@@ -47,57 +47,59 @@ def gcs_loader_func(file_path):
4747
def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token=None):
    """Load a document from a GCS bucket and return it as LangChain pages.

    Two access modes:
      * ``access_token is None`` — use application-default credentials and
        delegate parsing to ``GCSFileLoader`` (with ``gcs_loader_func``).
      * ``access_token`` given — authenticate with the OAuth token, download
        the blob bytes directly and parse them as a PDF with ``PdfReader``.

    Args:
        gcs_project_id: GCP project that owns the bucket.
        gcs_bucket_name: Bucket to read from.
        gcs_bucket_folder: Optional folder prefix inside the bucket; ``None``
            or blank means the blob lives at the bucket root.
        gcs_blob_filename: File name of the blob to load.
        access_token: Optional OAuth2 access token for user-delegated access.

    Returns:
        Tuple ``(gcs_blob_filename, pages)`` where ``pages`` is a list of
        ``Document`` objects.

    Raises:
        LLMGraphBuilderException: If the blob does not exist in the bucket.
    """
    _ensure_nltk_resources()

    blob_name = _resolve_blob_name(gcs_bucket_folder, gcs_blob_filename)
    logging.info(f"GCS project_id : {gcs_project_id}")

    if access_token is None:
        storage_client = storage.Client(project=gcs_project_id)
        bucket = storage_client.bucket(gcs_bucket_name)
        blob = bucket.blob(blob_name)
        if not blob.exists():
            raise LLMGraphBuilderException('File does not exist, Please re-upload the file and try again.')
        loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=gcs_loader_func)
        pages = loader.load()
    else:
        creds = Credentials(access_token)
        storage_client = storage.Client(project=gcs_project_id, credentials=creds)
        bucket = storage_client.bucket(gcs_bucket_name)
        blob = bucket.blob(blob_name)
        if not blob.exists():
            raise LLMGraphBuilderException(f'File Not Found in GCS bucket - {gcs_bucket_name}')
        pages = _load_pdf_pages(blob)

    return gcs_blob_filename, pages


def _ensure_nltk_resources():
    """Register NLTK data directories and download missing tokenizer/tagger data.

    Prefers the shared system directory but falls back to the user directory,
    since ``/usr/local`` is usually not writable for non-root processes.
    """
    system_dir = "/usr/local/nltk_data"
    user_dir = os.path.expanduser("~/.nltk_data")
    for data_dir in (system_dir, user_dir):
        if data_dir not in nltk.data.path:
            nltk.data.path.append(data_dir)

    for resource, resource_type in (
        ("punkt", "tokenizers"),
        ("averaged_perceptron_tagger", "taggers"),
    ):
        try:
            nltk.data.find(f"{resource_type}/{resource}")
        except LookupError:
            logging.info(f"NLTK resource '{resource}' not found; downloading to {system_dir}")
            try:
                nltk.download(resource, download_dir=system_dir)
            except (OSError, PermissionError):
                # System dir not writable (e.g. non-root container user):
                # retry in the per-user data directory already on nltk.data.path.
                logging.info(f"Could not write to {system_dir}; downloading '{resource}' to {user_dir}")
                nltk.download(resource, download_dir=user_dir)


def _resolve_blob_name(folder, filename):
    """Join an optional folder prefix and a file name with a single '/'.

    A ``None`` or blank folder means the blob sits at the bucket root.
    """
    if folder is None or folder.strip() == "":
        return filename
    if folder.endswith('/'):
        return folder + filename
    return folder + '/' + filename


def _load_pdf_pages(blob):
    """Download *blob* as bytes, parse it as a PDF and return a one-Document list.

    All page texts are concatenated into a single ``Document``.
    """
    content = blob.download_as_bytes()
    pdf_reader = PdfReader(io.BytesIO(content))
    # extract_text() may return None for image-only pages; join once instead of
    # building the string with quadratic `+=` concatenation.
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
    return [Document(page_content=text)]
101103
102104def upload_file_to_gcs (file_chunk , chunk_number , original_file_name , bucket_name , folder_name_sha1_hashed ):
103105 try :
0 commit comments