-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfilesDetect2.0.py
More file actions
295 lines (244 loc) · 11.6 KB
/
filesDetect2.0.py
File metadata and controls
295 lines (244 loc) · 11.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
# filesDetect2.0.py: Skyflow Files Detect API
# Author: Priyanta Dharmasena
# Modified: October 2024, November 2025
import requests # type: ignore
import json
import os
import sys
import base64
import time
import re
import docx
from datetime import datetime
from pathlib import Path
# --- Module-level configuration (executes at import time) ---
BEARER_TOKEN = None  # cached bearer token; populated lazily by get_bearer_token_global()
# get full file path to current script
script_path = Path(__file__).resolve()
script_directory = script_path.parent
script_name = script_path.name
# Strip the script name to leave just the directory, as a string.
# NOTE(review): str.replace removes *every* occurrence of the script name in the
# path, not only the trailing one — fragile if a parent folder shares the
# script's name. script_path.parent alone would suffice; confirm intent.
script_directory = str(script_directory).replace(script_name, '')
current_directory = script_directory
file_path = current_directory + '/' + 'detect_params.json' #get params file
with open(file_path, 'r') as file:
    params = json.load(file)
# Required keys in detect_params.json:
SKYFLOW_ACCOUNT_ID = params['AccountId']
VAULT_ID = params['Vault_ID']
VAULT_URL = params['Vault_URL']
DATA_TYPES = params['Data_Types'] #detect supported file types
MAX_ATTEMPTS = params['Max_Api_Attempts']  # polling retry cap for check_and_save_runs
FULL_REDACTION = params['fullRedaction'].lower() == 'true'  # JSON string flag -> bool
REDACTION_SET = params['redaction_set']  # file types eligible for post-processing redaction
common_directory = params['Common_Files_Directory']
# Add common utils directory to path
if common_directory not in sys.path:
    sys.path.append(common_directory)
# Project-local helpers resolved via the sys.path entry added above.
from fileFinder import file_selector # type: ignore
from debugOut import debug # type: ignore
from get_bearer_token import getSignedJWT_fromfile, getBearerToken #type: ignore
def get_bearer_token_global(fullCredsFile):
    """Return the module-wide bearer token, generating it only on first use.

    fullCredsFile: path to the Skyflow credentials file used to sign the JWT.
    """
    global BEARER_TOKEN
    if BEARER_TOKEN is not None:
        return BEARER_TOKEN  # already generated; reuse the cached token
    # Tuple targets assign left-to-right, so cred_info exists before
    # cred_info["clientID"] is set by the third element.
    jwt, cred_info, cred_info["clientID"] = getSignedJWT_fromfile(fullCredsFile)
    BEARER_TOKEN = getBearerToken(jwt, cred_info)
    return BEARER_TOKEN
def getFileSetEnv(params):
    """Prompt for the input and credentials files, validate the file type,
    pick the detect API payload options for that type, and fetch a token.

    Returns:
        (fullFile, current_directory, bearer_token, payloadOption,
         file_type, name, group) — exactly the tuple __main__ unpacks.

    Exits the process (sys.exit(1)) when the file type is unsupported.
    """
    fileName, fullFile = file_selector('input ', current_directory)
    print(f"Input File: \n {fileName} successfully uploaded...")
    credsFile, fullCredsFile = file_selector('Skyflow Credentials', current_directory)
    print(f"Credentials file: \n {credsFile} successfully uploaded...\n")
    #get type from the file name
    name, extension = os.path.splitext(fileName)
    file_type = extension[1:].lower() #get file_type
    if file_type not in DATA_TYPES: #not picking a supported data file
        print(f"ERROR: File type {file_type} not currently supported")
        print("Supported files are: doc, docx, pdf, txt, json, jpg, jpeg, tiff, png, bmp, xlsx, xls, pptx, ppt, mp3, mp4, wav, flac")
        sys.exit(1)  # was bare exit(): sys.exit is the importable, non-REPL form
    # Map the file type to its detect API options group.
    if file_type in params['ImageFiles']:
        payloadOption, group = params['image'], 'image'
    elif file_type in params['AudioFiles']:
        payloadOption, group = params['audio'], 'audio'
    elif file_type == 'pdf':
        payloadOption, group = params['pdf'], 'pdf'
    else:
        payloadOption, group = None, None  # detect api no options currently available
    # Use a local name: the original shadowed the module-global BEARER_TOKEN.
    token = get_bearer_token_global(fullCredsFile)
    return fullFile, current_directory, token, payloadOption, file_type, name, group
def convert_file_to_base64(file_path):
    """Read *file_path* as binary and return its base64 encoding as a str."""
    raw_bytes = Path(file_path).read_bytes()
    return base64.b64encode(raw_bytes).decode('utf-8')
def save_base64_to_file(base64_string, output_path):
    """Decode *base64_string* and write the raw bytes to *output_path*."""
    decoded = base64.b64decode(base64_string)
    Path(output_path).write_bytes(decoded)
def detect_file(file_path, payloadOption, file_type, fname, group):
    """POST the file to the Skyflow deidentify endpoint.

    Args:
        file_path: path of the file to send (base64-encoded into the payload).
        payloadOption, fname, group: accepted for interface compatibility;
            not used by the current GA payload shape.
        file_type: extension string used as the payload's data_format.

    Returns:
        (response_json_or_error_dict, audioOut) — always a 2-tuple, so the
        caller's `detect_response, audioOut = detect_file(...)` unpack is safe.
    """
    audioOut = None
    base64_file = convert_file_to_base64(file_path)
    url = VAULT_URL + '/' + 'v1/detect/deidentify/file'
    print(f"url: --> {url}")
    # SECURITY FIX: do not print the bearer token — it is a live credential.
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {BEARER_TOKEN}',
    }
    payload = {
        "file": {
            "base64": base64_file,
            "data_format": file_type
        },
        "entity_types": ["all"],
        "token_type": {
            "entity_only": ["all"]
        },
        "vault_id": VAULT_ID
    }
    # json= serializes and sets the header in one step (was data=json.dumps(...)).
    response = requests.post(url, headers=headers, json=payload)
    if response.status_code == 200:
        return response.json(), audioOut
    print(f"Error {response.status_code}: {response.text}")
    # BUG FIX: the error branch previously returned a bare dict, which the
    # caller mis-unpacked into two keys; return the same 2-tuple shape.
    return {"error": response.status_code, "message": response.text}, audioOut
def check_runs(runs_id):
    """GET the status of a detect run; return parsed JSON or an error dict."""
    endpoint = f'{VAULT_URL}/v1/detect/runs/{runs_id}?vault_id={VAULT_ID}'
    auth_headers = {'Authorization': f'Bearer {BEARER_TOKEN}'}
    resp = requests.get(endpoint, headers=auth_headers)
    if resp.status_code != 200:
        return {"error": resp.status_code, "message": resp.text}
    return resp.json()
def extract_base64_from_response(runs_response):
    """Return the first redacted processedFile payload, or None if absent.

    Scans runs_response["output"] for any entry whose processedFileType is one
    of the Detect-processed output kinds and returns its base64 content.
    """
    acceptable_types = {"redacted_transcription", "redacted_file", "redacted_image", "redacted_audio"}  # Detect processed file types
    # ROBUSTNESS FIX: .get avoids a KeyError on responses with no "output" key,
    # consistent with save_runs_response's handling of the same structure.
    for output in runs_response.get("output", []):
        if output.get("processedFileType") in acceptable_types:
            return output["processedFile"]
    return None
def getDateTime():
    """Return the current local time as an MMDDYYYYHHMMSS timestamp string."""
    # strftime already returns str; the extra str() wrap was redundant.
    return datetime.now().strftime("%m%d%Y%H%M%S")
def replace_text_in_docx(doc, output_path):
    """Mask every [bracketed] span in *doc* with a same-length run of '*',
    covering both paragraphs and table cells, then save to *output_path*.

    doc: a python-docx Document (or anything exposing .paragraphs, .tables,
    and .save); output_path: destination .docx path.
    """
    bracketed = re.compile(r'\[.*?\]')

    def mask(match):
        # Preserve length so the redacted layout matches the original text.
        return '*' * len(match.group())

    for paragraph in doc.paragraphs:
        if bracketed.search(paragraph.text):
            paragraph.text = bracketed.sub(mask, paragraph.text)
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                if bracketed.search(cell.text):
                    cell.text = bracketed.sub(mask, cell.text)
    doc.save(output_path)
    print(f"\nRedacted DOCX file saved to: \n{output_path}\n")
def replace_text_in_txt(file_path, output_path):
    """Mask every [bracketed] span in a text file with same-length '*' runs.

    Reads *file_path*, replaces each `[...]` match with asterisks of equal
    length, and writes the result to *output_path*.
    """
    # Read the text from the file
    with open(file_path, 'r') as file:
        content = file.read()
    # Define a regex pattern to match text within square brackets
    pattern = re.compile(r'\[.*?\]')
    # Replace matched patterns with equivalent length '*' characters
    replaced_content = pattern.sub(lambda match: '*' * len(match.group()), content)
    # Write modified content back to new file
    with open(output_path, 'w') as file:
        file.write(replaced_content)
    # BUG FIX: the message referenced the module-global `file_type`, which is
    # undefined in this scope when the module is imported rather than run as a
    # script — it would raise NameError after a successful write.
    print(f"\nSUCCESS: Redacted file saved to: \n{output_path}")
def redact_file(file_path, output_path, file_type):
    """Dispatch full redaction of the tokenized output by file type.

    docx goes through python-docx; txt/json/csv/xml are treated as plain text.
    Other types only print a notice — nothing is written.
    """
    ftype = file_type.lower()
    if ftype == 'docx':
        # Process the .docx file
        doc = docx.Document(file_path)
        replace_text_in_docx(doc, output_path)
    elif ftype in ['txt', 'json', 'csv', 'xml']:
        # Process the plain-text formats
        replace_text_in_txt(file_path, output_path)
    else:
        # TYPO FIX: "availble" -> "available"; punctuation normalized.
        print("Unsupported format. Redaction available for: .docx, .csv, .json, .xml or .txt files.")
def save_runs_response(runs_response, output_dir, file_type, fname):
    """Locate the 'entities' payload in the runs response, decode it, and
    write it to output_dir as pretty-printed JSON (for debugging/audit)."""
    entities_b64 = next(
        (item.get('processedFile')
         for item in runs_response.get('output', [])
         if item.get('processedFileType') == 'entities'),
        None,
    )
    if not entities_b64:
        print("Error: 'entities' processedFile not found in runs response.")
        return
    # Decode base64-encoded content & save to file
    decoded_text = base64.b64decode(entities_b64).decode('utf-8')
    try:
        decoded_json = json.loads(decoded_text)
    except json.JSONDecodeError:
        print("Error: Decoded text is not valid JSON.")
        return
    text_file_path = os.path.join(output_dir, f"entities_{fname}-{file_type}_{getDateTime()}.json")
    with open(text_file_path, "w") as text_file:
        json.dump(decoded_json, text_file, indent=4)
    print(f"Decoded and formatted JSON text saved to: {text_file_path}\n")
def check_and_save_runs(runs_url_full, runs_id, output_dir, file_type, fname, group, audioOut, max_attempts):
    # Check status of the file processing and save results.
    # Polls the detect run (2-second interval) until SUCCESS, FAILED, or
    # max_attempts is exhausted. runs_url_full is accepted but never used
    # here — kept for caller compatibility.
    attempt = 0
    while attempt < max_attempts:
        runs_response = check_runs(runs_id)
        # Choose the output extension. audioOut drives audio runs: False ->
        # transcript (txt), True -> redacted audio (mp3). Everything else
        # keeps the input file's extension.
        # NOTE(review): detect_file currently always returns audioOut=None,
        # so the first two branches never fire — confirm intended wiring.
        outExt = None
        if audioOut is False and group == 'audio':
            outExt = 'txt'
        elif audioOut is True and group == 'audio':
            outExt = 'mp3'
        else:
            outExt = file_type
        if runs_response.get("status") == "SUCCESS":
            processed_file_base64 = extract_base64_from_response(runs_response)
            if processed_file_base64:
                timestamp = getDateTime()
                output_file_path = os.path.join(output_dir, f"tokenized_{fname}-{file_type}-{timestamp}.{outExt}")
                save_base64_to_file(processed_file_base64, output_file_path)
                print(f"\nSUCCESS: Processed {file_type} saved to: \n{output_file_path}\n")
                # Save the runs response for debugging purposes
                save_runs_response(runs_response, output_dir, file_type, fname)
                # **Perform redaction if setting directive is set**
                if FULL_REDACTION == True and file_type in REDACTION_SET:
                    redacted_output_path = os.path.join(output_dir, f"redacted_{fname}-{file_type}-{getDateTime()}.{file_type}")
                    redact_file(output_file_path, redacted_output_path, file_type)
            else:
                print("Processed file not found in the response.")
            break
        elif runs_response.get("status") == "FAILED":
            print(f"Error in processing: {runs_response.get('message')}")
            break
        else:
            # Still in progress: count the attempt and wait before re-polling.
            attempt += 1
            print(f"Processing is still in progress (Attempt {attempt}/{max_attempts}), checking again in 2 seconds...")
            time.sleep(2)
    else:
        # while-else: reached only when the loop exhausts without a break.
        print("Maximum attempts reached. Process did not complete successfully.")
def run_files_detect(input_file_path, output_dir, payloadOption, file_type, fname, group, max_attempts=MAX_ATTEMPTS):
    """Submit the input file to the detect API, then poll and save results.

    Creates output_dir if needed; on any error response just prints and
    returns without raising.
    """
    os.makedirs(output_dir, exist_ok=True)
    # Detect file and get the run identifier
    detect_response, audioOut = detect_file(input_file_path, payloadOption, file_type, fname, group)
    # Guard clauses: bail out early on non-dict or error-shaped responses.
    if not isinstance(detect_response, dict):
        print(f"Error in initial request: {detect_response}")
        return
    if 'run_id' not in detect_response:
        print(f"Error in initial request: {detect_response.get('message')}")
        return
    run_id = detect_response['run_id']
    runs_url_full = VAULT_URL + run_id  # full URL retained for compatibility
    # Per detect GA the run id itself is the polling key.
    check_and_save_runs(runs_url_full, run_id, output_dir, file_type, fname, group, audioOut, max_attempts)
if __name__ == "__main__":
    # Script entry point: gather the input/credentials files and API options,
    # then run the full detect -> poll -> save flow.
    input_file_path, current_directory, BEARER_TOKEN, payloadOption, file_type, fname, group = getFileSetEnv(params)
    output_dir = current_directory + "/" + "output"
    run_files_detect(input_file_path, output_dir, payloadOption, file_type, fname, group, max_attempts=MAX_ATTEMPTS)