-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfilesDetect2.0.py
More file actions
295 lines (244 loc) · 11.6 KB
/
filesDetect2.0.py
File metadata and controls
295 lines (244 loc) · 11.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
# filesDetect2.0.py: Skyflow Files Detect API
# Author: Priyanta Dharmasena
# Modified: October 2024, November 2025
import requests # type: ignore
import json
import os
import sys
import base64
import time
import re
import docx
from datetime import datetime
from pathlib import Path
# --- Module-level configuration (executes at import time) ---
BEARER_TOKEN = None  # cached bearer token; populated lazily by get_bearer_token_global()
# get full file path to current script
script_path = Path(__file__).resolve()
script_directory = script_path.parent
script_name = script_path.name
# Strip the script name to leave just the directory, as a string.
# NOTE(review): str.replace removes *every* occurrence of the script name in the
# path, not only the trailing one — fragile if a parent folder shares the
# script's name. script_path.parent alone would suffice; confirm intent.
script_directory = str(script_directory).replace(script_name, '')
current_directory = script_directory
file_path = current_directory + '/' + 'detect_params.json' #get params file
with open(file_path, 'r') as file:
    params = json.load(file)
# Required keys in detect_params.json:
SKYFLOW_ACCOUNT_ID = params['AccountId']
VAULT_ID = params['Vault_ID']
VAULT_URL = params['Vault_URL']
DATA_TYPES = params['Data_Types'] #detect supported file types
MAX_ATTEMPTS = params['Max_Api_Attempts']  # polling retry cap for check_and_save_runs
FULL_REDACTION = params['fullRedaction'].lower() == 'true'  # JSON string flag -> bool
REDACTION_SET = params['redaction_set']  # file types eligible for post-processing redaction
common_directory = params['Common_Files_Directory']
# Add common utils directory to path
if common_directory not in sys.path:
    sys.path.append(common_directory)
# Project-local helpers resolved via the sys.path entry added above.
from fileFinder import file_selector # type: ignore
from debugOut import debug # type: ignore
from get_bearer_token import getSignedJWT_fromfile, getBearerToken #type: ignore
def get_bearer_token_global(fullCredsFile):
    """Return the module-wide bearer token, generating it only on first use.

    fullCredsFile: path to the Skyflow credentials file used to sign the JWT.
    """
    global BEARER_TOKEN
    if BEARER_TOKEN is not None:
        return BEARER_TOKEN  # already generated; reuse the cached token
    # Tuple targets assign left-to-right, so cred_info exists before
    # cred_info["clientID"] is set by the third element.
    jwt, cred_info, cred_info["clientID"] = getSignedJWT_fromfile(fullCredsFile)
    BEARER_TOKEN = getBearerToken(jwt, cred_info)
    return BEARER_TOKEN
def getFileSetEnv(params):
    """Prompt for the input and credentials files, validate the file type,
    pick the detect API payload options for that type, and fetch a token.

    Returns:
        (fullFile, current_directory, bearer_token, payloadOption,
         file_type, name, group) — exactly the tuple __main__ unpacks.

    Exits the process (sys.exit(1)) when the file type is unsupported.
    """
    fileName, fullFile = file_selector('input ', current_directory)
    print(f"Input File: \n {fileName} successfully uploaded...")
    credsFile, fullCredsFile = file_selector('Skyflow Credentials', current_directory)
    print(f"Credentials file: \n {credsFile} successfully uploaded...\n")
    #get type from the file name
    name, extension = os.path.splitext(fileName)
    file_type = extension[1:].lower() #get file_type
    if file_type not in DATA_TYPES: #not picking a supported data file
        print(f"ERROR: File type {file_type} not currently supported")
        print("Supported files are: doc, docx, pdf, txt, json, jpg, jpeg, tiff, png, bmp, xlsx, xls, pptx, ppt, mp3, mp4, wav, flac")
        sys.exit(1)  # was bare exit(): sys.exit is the importable, non-REPL form
    # Map the file type to its detect API options group.
    if file_type in params['ImageFiles']:
        payloadOption, group = params['image'], 'image'
    elif file_type in params['AudioFiles']:
        payloadOption, group = params['audio'], 'audio'
    elif file_type == 'pdf':
        payloadOption, group = params['pdf'], 'pdf'
    else:
        payloadOption, group = None, None  # detect api no options currently available
    # Use a local name: the original shadowed the module-global BEARER_TOKEN.
    token = get_bearer_token_global(fullCredsFile)
    return fullFile, current_directory, token, payloadOption, file_type, name, group
def convert_file_to_base64(file_path):
    """Read *file_path* as binary and return its base64 encoding as a str."""
    raw_bytes = Path(file_path).read_bytes()
    return base64.b64encode(raw_bytes).decode('utf-8')
def save_base64_to_file(base64_string, output_path):
    """Decode *base64_string* and write the raw bytes to *output_path*."""
    decoded = base64.b64decode(base64_string)
    Path(output_path).write_bytes(decoded)
def detect_file(file_path, payloadOption, file_type, fname, group):
    """POST the file to the Skyflow deidentify endpoint.

    Args:
        file_path: path of the file to send (base64-encoded into the payload).
        payloadOption, fname, group: accepted for interface compatibility;
            not used by the current GA payload shape.
        file_type: extension string used as the payload's data_format.

    Returns:
        (response_json_or_error_dict, audioOut) — always a 2-tuple, so the
        caller's `detect_response, audioOut = detect_file(...)` unpack is safe.
    """
    audioOut = None
    base64_file = convert_file_to_base64(file_path)
    url = VAULT_URL + '/' + 'v1/detect/deidentify/file'
    print(f"url: --> {url}")
    # SECURITY FIX: do not print the bearer token — it is a live credential.
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {BEARER_TOKEN}',
    }
    payload = {
        "file": {
            "base64": base64_file,
            "data_format": file_type
        },
        "entity_types": ["all"],
        "token_type": {
            "entity_only": ["all"]
        },
        "vault_id": VAULT_ID
    }
    # json= serializes and sets the header in one step (was data=json.dumps(...)).
    response = requests.post(url, headers=headers, json=payload)
    if response.status_code == 200:
        return response.json(), audioOut
    print(f"Error {response.status_code}: {response.text}")
    # BUG FIX: the error branch previously returned a bare dict, which the
    # caller mis-unpacked into two keys; return the same 2-tuple shape.
    return {"error": response.status_code, "message": response.text}, audioOut
def check_runs(runs_id):
    """GET the status of a detect run; return parsed JSON or an error dict."""
    endpoint = f'{VAULT_URL}/v1/detect/runs/{runs_id}?vault_id={VAULT_ID}'
    auth_headers = {'Authorization': f'Bearer {BEARER_TOKEN}'}
    resp = requests.get(endpoint, headers=auth_headers)
    if resp.status_code != 200:
        return {"error": resp.status_code, "message": resp.text}
    return resp.json()
def extract_base64_from_response(runs_response):
    """Return the first redacted processedFile payload, or None if absent.

    Scans runs_response["output"] for any entry whose processedFileType is one
    of the Detect-processed output kinds and returns its base64 content.
    """
    acceptable_types = {"redacted_transcription", "redacted_file", "redacted_image", "redacted_audio"}  # Detect processed file types
    # ROBUSTNESS FIX: .get avoids a KeyError on responses with no "output" key,
    # consistent with save_runs_response's handling of the same structure.
    for output in runs_response.get("output", []):
        if output.get("processedFileType") in acceptable_types:
            return output["processedFile"]
    return None
def getDateTime():
    """Return the current local time as an MMDDYYYYHHMMSS timestamp string."""
    # strftime already returns str; the extra str() wrap was redundant.
    return datetime.now().strftime("%m%d%Y%H%M%S")
def replace_text_in_docx(doc, output_path):
    """Mask every [bracketed] span in *doc* with a same-length run of '*',
    covering both paragraphs and table cells, then save to *output_path*.

    doc: a python-docx Document (or anything exposing .paragraphs, .tables,
    and .save); output_path: destination .docx path.
    """
    bracketed = re.compile(r'\[.*?\]')

    def mask(match):
        # Preserve length so the redacted layout matches the original text.
        return '*' * len(match.group())

    for paragraph in doc.paragraphs:
        if bracketed.search(paragraph.text):
            paragraph.text = bracketed.sub(mask, paragraph.text)
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                if bracketed.search(cell.text):
                    cell.text = bracketed.sub(mask, cell.text)
    doc.save(output_path)
    print(f"\nRedacted DOCX file saved to: \n{output_path}\n")
def replace_text_in_txt(file_path, output_path):
    """Mask every [bracketed] span in a text file with same-length '*' runs.

    Reads *file_path*, replaces each `[...]` match with asterisks of equal
    length, and writes the result to *output_path*.
    """
    # Read the text from the file
    with open(file_path, 'r') as file:
        content = file.read()
    # Define a regex pattern to match text within square brackets
    pattern = re.compile(r'\[.*?\]')
    # Replace matched patterns with equivalent length '*' characters
    replaced_content = pattern.sub(lambda match: '*' * len(match.group()), content)
    # Write modified content back to new file
    with open(output_path, 'w') as file:
        file.write(replaced_content)
    # BUG FIX: the message referenced the module-global `file_type`, which is
    # undefined in this scope when the module is imported rather than run as a
    # script — it would raise NameError after a successful write.
    print(f"\nSUCCESS: Redacted file saved to: \n{output_path}")
def redact_file(file_path, output_path, file_type):
    """Dispatch full redaction of the tokenized output by file type.

    docx goes through python-docx; txt/json/csv/xml are treated as plain text.
    Other types only print a notice — nothing is written.
    """
    ftype = file_type.lower()
    if ftype == 'docx':
        # Process the .docx file
        doc = docx.Document(file_path)
        replace_text_in_docx(doc, output_path)
    elif ftype in ['txt', 'json', 'csv', 'xml']:
        # Process the plain-text formats
        replace_text_in_txt(file_path, output_path)
    else:
        # TYPO FIX: "availble" -> "available"; punctuation normalized.
        print("Unsupported format. Redaction available for: .docx, .csv, .json, .xml or .txt files.")
def save_runs_response(runs_response, output_dir, file_type, fname):
    """Locate the 'entities' payload in the runs response, decode it, and
    write it to output_dir as pretty-printed JSON (for debugging/audit)."""
    entities_b64 = next(
        (item.get('processedFile')
         for item in runs_response.get('output', [])
         if item.get('processedFileType') == 'entities'),
        None,
    )
    if not entities_b64:
        print("Error: 'entities' processedFile not found in runs response.")
        return
    # Decode base64-encoded content & save to file
    decoded_text = base64.b64decode(entities_b64).decode('utf-8')
    try:
        decoded_json = json.loads(decoded_text)
    except json.JSONDecodeError:
        print("Error: Decoded text is not valid JSON.")
        return
    text_file_path = os.path.join(output_dir, f"entities_{fname}-{file_type}_{getDateTime()}.json")
    with open(text_file_path, "w") as text_file:
        json.dump(decoded_json, text_file, indent=4)
    print(f"Decoded and formatted JSON text saved to: {text_file_path}\n")
def check_and_save_runs(runs_url_full, runs_id, output_dir, file_type, fname, group, audioOut, max_attempts):
    # Check status of the file processing and save results.
    # Polls the detect run (2-second interval) until SUCCESS, FAILED, or
    # max_attempts is exhausted. runs_url_full is accepted but never used
    # here — kept for caller compatibility.
    attempt = 0
    while attempt < max_attempts:
        runs_response = check_runs(runs_id)
        # Choose the output extension. audioOut drives audio runs: False ->
        # transcript (txt), True -> redacted audio (mp3). Everything else
        # keeps the input file's extension.
        # NOTE(review): detect_file currently always returns audioOut=None,
        # so the first two branches never fire — confirm intended wiring.
        outExt = None
        if audioOut is False and group == 'audio':
            outExt = 'txt'
        elif audioOut is True and group == 'audio':
            outExt = 'mp3'
        else:
            outExt = file_type
        if runs_response.get("status") == "SUCCESS":
            processed_file_base64 = extract_base64_from_response(runs_response)
            if processed_file_base64:
                timestamp = getDateTime()
                output_file_path = os.path.join(output_dir, f"tokenized_{fname}-{file_type}-{timestamp}.{outExt}")
                save_base64_to_file(processed_file_base64, output_file_path)
                print(f"\nSUCCESS: Processed {file_type} saved to: \n{output_file_path}\n")
                # Save the runs response for debugging purposes
                save_runs_response(runs_response, output_dir, file_type, fname)
                # **Perform redaction if setting directive is set**
                if FULL_REDACTION == True and file_type in REDACTION_SET:
                    redacted_output_path = os.path.join(output_dir, f"redacted_{fname}-{file_type}-{getDateTime()}.{file_type}")
                    redact_file(output_file_path, redacted_output_path, file_type)
            else:
                print("Processed file not found in the response.")
            break
        elif runs_response.get("status") == "FAILED":
            print(f"Error in processing: {runs_response.get('message')}")
            break
        else:
            # Still in progress: count the attempt and wait before re-polling.
            attempt += 1
            print(f"Processing is still in progress (Attempt {attempt}/{max_attempts}), checking again in 2 seconds...")
            time.sleep(2)
    else:
        # while-else: reached only when the loop exhausts without a break.
        print("Maximum attempts reached. Process did not complete successfully.")
def run_files_detect(input_file_path, output_dir, payloadOption, file_type, fname, group, max_attempts=MAX_ATTEMPTS):
    """Submit the input file to the detect API, then poll and save results.

    Creates output_dir if needed; on any error response just prints and
    returns without raising.
    """
    os.makedirs(output_dir, exist_ok=True)
    # Detect file and get the run identifier
    detect_response, audioOut = detect_file(input_file_path, payloadOption, file_type, fname, group)
    # Guard clauses: bail out early on non-dict or error-shaped responses.
    if not isinstance(detect_response, dict):
        print(f"Error in initial request: {detect_response}")
        return
    if 'run_id' not in detect_response:
        print(f"Error in initial request: {detect_response.get('message')}")
        return
    run_id = detect_response['run_id']
    runs_url_full = VAULT_URL + run_id  # full URL retained for compatibility
    # Per detect GA the run id itself is the polling key.
    check_and_save_runs(runs_url_full, run_id, output_dir, file_type, fname, group, audioOut, max_attempts)
if __name__ == "__main__":
    # Script entry point: gather the input/credentials files and API options,
    # then run the full detect -> poll -> save flow.
    input_file_path, current_directory, BEARER_TOKEN, payloadOption, file_type, fname, group = getFileSetEnv(params)
    output_dir = current_directory + "/" + "output"
    run_files_detect(input_file_path, output_dir, payloadOption, file_type, fname, group, max_attempts=MAX_ATTEMPTS)