-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathutils.py
More file actions
163 lines (124 loc) · 5.53 KB
/
utils.py
File metadata and controls
163 lines (124 loc) · 5.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import json, os, requests
def change_dataset_name(name, file_path="draft/metadata.json"):
    """Overwrite the ``dataset_name`` field of a metadata JSON file.

    The file is loaded, its top-level ``dataset_name`` key is set to
    *name*, and the whole document is written back with 4-space
    indentation. Every other key is left as it was.

    Args:
        name: New value to store under ``dataset_name``.
        file_path: Path of the metadata file to rewrite. Defaults to
            ``draft/metadata.json``, the location this function used
            before the parameter existed.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
        json.JSONDecodeError: If the file does not contain valid JSON.
        TypeError: If *name* cannot be serialized to JSON; the file on
            disk is left unmodified in that case.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    data['dataset_name'] = name

    # Serialize BEFORE opening for writing: mode 'w' truncates the file
    # immediately, so a serialization error raised afterwards would
    # leave an empty or half-written metadata file behind.
    serialized = json.dumps(data, indent=4)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(serialized)
def read_dataset_name(file_path="draft/metadata.json"):
    """Return the ``dataset_name`` value stored in a metadata JSON file.

    Args:
        file_path: Path of the metadata file to read. Defaults to
            ``draft/metadata.json``, the location this function used
            before the parameter existed.

    Returns:
        Whatever value is stored under the top-level ``dataset_name`` key.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If the document has no ``dataset_name`` key.
    """
    # Decode as UTF-8 explicitly so non-ASCII names read the same on
    # every platform instead of depending on the locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)['dataset_name']
def LLMApi(input_text, max_length=8888, model="gpt-4o-mini", timeout=120):
    """Send one user prompt to the OpenAI chat-completions endpoint.

    This function reports every failure as a returned string rather
    than raising, so callers always receive text back.

    Args:
        input_text: Prompt sent as the ``user`` message. It is cut to
            the first *max_length* characters before sending.
        max_length: Maximum number of prompt characters to send.
        model: Name of the chat model to request.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely on an
            unresponsive connection.

    Returns:
        The stripped text of the first completion choice on success.
        Otherwise one of the diagnostic strings
        ``"API key not found in environment variables."``,
        ``"Error: <status> - <body>"`` for a non-200 response, or
        ``"An error occurred: <exception>"`` for any other failure.
    """
    # The key is read from the environment on each call.
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        return "API key not found in environment variables."

    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    # Clamp the prompt; slicing is a no-op when it is already short enough.
    input_text = input_text[:max_length]

    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": input_text},
        ],
    }

    # Deliberately broad: network errors, timeouts, and unexpected
    # response shapes are all converted into the error string below.
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        if response.status_code != 200:
            return f"Error: {response.status_code} - {response.text}"
        result = response.json()
        return result['choices'][0]['message']['content'].strip()
    except Exception as e:
        return f"An error occurred: {e}"
def fetch_html_from_link(link, timeout=30):
    """Fetch the response body of *link* as text.

    Args:
        link: URL to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up.
            Without a timeout ``requests`` may block indefinitely on an
            unresponsive host.

    Returns:
        The decoded response text, or ``None`` if the request fails for
        any reason ``requests`` reports (connection error, timeout,
        or a 4xx/5xx status).
    """
    try:
        response = requests.get(link, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx statuses into exceptions
        return response.text
    except requests.RequestException:
        return None
from bs4 import BeautifulSoup
import requests
def fetch_html_from_link_no_script(link, timeout=30):
    """Fetch *link* and return its HTML with all ``<script>`` tags removed.

    Args:
        link: URL to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up.
            Without a timeout ``requests`` may block indefinitely on an
            unresponsive host.

    Returns:
        The page markup re-serialized by BeautifulSoup without
        ``<script>`` elements. If stripping fails, the raw response
        text is returned instead. ``None`` is returned if the request
        itself fails (connection error, timeout, or a 4xx/5xx status).
    """
    try:
        response = requests.get(link, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx statuses into exceptions
        html_content = response.text
    except requests.RequestException:
        return None

    # Script removal is best-effort: any parser problem falls back to
    # the unmodified markup rather than losing the page.
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        for script in soup.find_all('script'):
            script.decompose()
        return str(soup)
    except Exception:
        return html_content
def clamp_prompt(long_string, char_limit=8888):
    """Cut *long_string* down to *char_limit* characters, marking the cut.

    Strings no longer than *char_limit* are returned unchanged. Longer
    strings are cut to their first *char_limit* characters and get
    ``'...'`` appended, so a clamped result is three characters longer
    than the limit.
    """
    fits = len(long_string) <= char_limit
    return long_string if fits else long_string[:char_limit] + '...'
def read_metadata(file_path='draft/metadata.json'):
    """Load a metadata JSON file and return its name and serialized info.

    Args:
        file_path: Path of the UTF-8 encoded metadata file.

    Returns:
        A ``(dataset_name, dataset_info)`` tuple, where ``dataset_info``
        is the document's ``info`` value re-encoded as a JSON string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        document = json.load(handle)

    name = document['dataset_name']
    # Callers receive the info section as JSON text, not as a dict.
    info_text = json.dumps(document['info'])
    return name, info_text
def read_metadata_dataset_websites(file_path='draft/metadata.json'):
    """Return the ``dataset_websites`` entry of a metadata JSON file.

    Any failure (missing file, invalid JSON, absent key, ...) is
    reported on stdout and an empty list is returned instead.

    Args:
        file_path: Path of the UTF-8 encoded metadata file.

    Returns:
        The value stored under ``dataset_websites``, or ``[]`` on error.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            document = json.load(handle)
            return document["dataset_websites"]
    except Exception as e:
        print(f"failed to read_metadata_dataset_websites, reason is : {e}")
    return []
# Example usage of read_metadata():
# dataset_name, dataset_info = read_metadata()
# print(f"Dataset Name: {dataset_name}")
# print(f"Dataset Info: {dataset_info}")
def clean_llm_json_res(res):
    """Parse an LLM reply as JSON, tolerating a Markdown code fence.

    Model replies often wrap JSON in a fenced block. The fence is
    removed whether or not it carries a ``json`` language tag (in any
    letter case), with surrounding whitespace and either LF or CRLF
    line endings, before the text is decoded.

    Args:
        res: Raw reply text.

    Returns:
        The decoded JSON value. If *res* is not a string or cannot be
        decoded, the problem is printed and *res* is returned unchanged.
    """
    if not isinstance(res, str):
        print(f"Error decoding JSON for item: {res} - expected str, got {type(res).__name__}")
        return res

    text = res.strip()
    if text.startswith('```'):
        body = text[3:]
        # Drop the optional language tag that follows the opening fence.
        if body[:4].lower() == 'json':
            body = body[4:]
        # Remove the closing fence and any padding around the payload.
        text = body.strip('` \n\r\t')

    try:
        return json.loads(text)
    except (ValueError, RecursionError) as e:
        # json.JSONDecodeError is a ValueError; keep the best-effort
        # contract by handing back the untouched input.
        print(f"Error decoding JSON for item: {text} - {e}")
        return res
def get_py_files_length(folder_path):
    """Count the lines of every ``.py`` file under *folder_path*.

    The directory tree is walked recursively; each file whose name ends
    in ``.py`` is opened as UTF-8 text and its lines are added to the
    running total.

    Args:
        folder_path: Root directory to scan.

    Returns:
        The combined number of lines across all matching files.
    """
    line_total = 0
    for current_dir, _subdirs, names in os.walk(folder_path):
        for name in names:
            if not name.endswith(".py"):
                continue  # Only Python sources contribute to the total
            source_path = os.path.join(current_dir, name)
            with open(source_path, 'r', encoding='utf-8') as source:
                line_total += sum(1 for _ in source)
    return line_total
if __name__ == "__main__":
    # When run as a script, report the line count of all Python files
    # in the directory that contains this file (symlinks resolved).
    this_script = os.path.realpath(__file__)
    total_lines = get_py_files_length(os.path.dirname(this_script))
    print(f"The total number of lines in all .py files (including this script) is: {total_lines}")