This repository was archived by the owner on Sep 22, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_processing.py
More file actions
117 lines (83 loc) · 3.78 KB
/
data_processing.py
File metadata and controls
117 lines (83 loc) · 3.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import json
import random
import re
import unicodedata
CLEANUP_REGEXES = [
# Rimuove tutto tranne lettere, accenti, ñ, ç, apostrofi, e spazi
(re.compile(r"[^A-Za-zÀ-ÿáéíóúÁÉÍÓÚñç'\s]+"), ""),
(re.compile(r"\s+"), " ") # normalizza spazi multipli
]
def preprocess_phrase(phrase: str) -> str | None:
phrase = phrase.lower().strip()
for regex, repl in CLEANUP_REGEXES:
phrase = regex.sub(repl, phrase)
phrase = phrase.strip()
return phrase if phrase else None
def train_val_split(data, ratio: float = 0.8, seed: int = 42):
    """Split the indices of *data* into shuffled train/validation lists.

    Uses a private random.Random(seed) instance instead of random.seed(seed)
    so the module-global RNG is not reseeded as a hidden side effect of
    calling this function. random.Random(seed).shuffle produces the exact
    same permutation as seeding the global RNG, so results are unchanged.

    Args:
        data: Any sized collection; only len(data) is used.
        ratio: Fraction of indices assigned to the training split.
        seed: Seed for the deterministic shuffle.

    Returns:
        (train_indices, val_indices): two lists of integer indices into data.
    """
    rng = random.Random(seed)  # local RNG: no global-state mutation
    indices = list(range(len(data)))
    rng.shuffle(indices)
    split_idx = int(len(indices) * ratio)
    train_idx = indices[:split_idx]
    val_idx = indices[split_idx:]
    return train_idx, val_idx
class PreProcessData:
class ResultObject:
def __init__(self, start, end, text, labels):
self.start = start
self.end = end
self.text = text
self.labels = labels
def __repr__(self):
return f"ResultObject(start={self.start}, end={self.end}, text={self.text}, labels={self.labels})"
def __init__(self, filepath: str = None):
self.text = [] # List of text strings
self.results = [] # List of list of ResultObject
self.neg_words, self.unc_words = [], []
if filepath:
print("Reading data from {}".format(filepath))
self.load_data(filepath)
self.neg_words = sorted(list(set(self.neg_words)))
self.unc_words = sorted(list(set(self.unc_words)))
@classmethod
def from_existing(cls, text, results, neg_words=None, unc_words=None):
instance = cls(filepath=None)
instance.text = text
instance.results = results
instance.neg_words = sorted(list(set(neg_words))) if neg_words else []
instance.unc_words = sorted(list(set(unc_words))) if unc_words else []
return instance
def load_data(self, filepath: str = None):
with open(filepath, "r", encoding="utf-8") as f:
data = json.load(f)
for entry in data:
current_text = entry.get("data", {}).get("text", "")
current_text = unicodedata.normalize("NFC", current_text) # Normalize text
self.text.append(current_text)
result_objs = []
predictions = entry.get("predictions", [])
for pred in predictions:
results = pred.get("result", [])
for res in results:
try:
start = res["value"]["start"]
end = res["value"]["end"]
labels = res["value"]["labels"]
text = current_text[start:end]
cleaned_text = preprocess_phrase(text)
# Save the object
result_obj = self.ResultObject(start, end, text, labels)
result_objs.append(result_obj)
if cleaned_text:
if labels == ["NEG"]:
self.neg_words.append(cleaned_text)
elif labels == ["UNC"]:
self.unc_words.append(cleaned_text)
except (KeyError, TypeError):
continue
self.results.append(result_objs)
def print_results_to_file(self):
with open("extracted_snippets.txt", "w", encoding="utf-8") as out_file:
out_file.write("\n".join(self.neg_words))
print(f"Saved {len(self.neg_words)} NEG snippets to 'extracted_snippets.txt'")
print(f"Loaded {len(self.text)} texts and results.")