This repository was archived by the owner on Sep 22, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_processing.py
More file actions
117 lines (83 loc) · 3.78 KB
/
data_processing.py
File metadata and controls
117 lines (83 loc) · 3.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import json
import random
import re
import unicodedata
CLEANUP_REGEXES = [
# Rimuove tutto tranne lettere, accenti, ñ, ç, apostrofi, e spazi
(re.compile(r"[^A-Za-zÀ-ÿáéíóúÁÉÍÓÚñç'\s]+"), ""),
(re.compile(r"\s+"), " ") # normalizza spazi multipli
]
def preprocess_phrase(phrase: str) -> str | None:
phrase = phrase.lower().strip()
for regex, repl in CLEANUP_REGEXES:
phrase = regex.sub(repl, phrase)
phrase = phrase.strip()
return phrase if phrase else None
def train_val_split(data, ratio: float = 0.8, seed: int = 42):
    """Split the indices of *data* into shuffled train/validation lists.

    Uses a private random.Random(seed) instance instead of random.seed(seed)
    so the module-global RNG is not reseeded as a hidden side effect of
    calling this function. random.Random(seed).shuffle produces the exact
    same permutation as seeding the global RNG, so results are unchanged.

    Args:
        data: Any sized collection; only len(data) is used.
        ratio: Fraction of indices assigned to the training split.
        seed: Seed for the deterministic shuffle.

    Returns:
        (train_indices, val_indices): two lists of integer indices into data.
    """
    rng = random.Random(seed)  # local RNG: no global-state mutation
    indices = list(range(len(data)))
    rng.shuffle(indices)
    split_idx = int(len(indices) * ratio)
    train_idx = indices[:split_idx]
    val_idx = indices[split_idx:]
    return train_idx, val_idx
class PreProcessData:
class ResultObject:
def __init__(self, start, end, text, labels):
self.start = start
self.end = end
self.text = text
self.labels = labels
def __repr__(self):
return f"ResultObject(start={self.start}, end={self.end}, text={self.text}, labels={self.labels})"
def __init__(self, filepath: str = None):
self.text = [] # List of text strings
self.results = [] # List of list of ResultObject
self.neg_words, self.unc_words = [], []
if filepath:
print("Reading data from {}".format(filepath))
self.load_data(filepath)
self.neg_words = sorted(list(set(self.neg_words)))
self.unc_words = sorted(list(set(self.unc_words)))
@classmethod
def from_existing(cls, text, results, neg_words=None, unc_words=None):
instance = cls(filepath=None)
instance.text = text
instance.results = results
instance.neg_words = sorted(list(set(neg_words))) if neg_words else []
instance.unc_words = sorted(list(set(unc_words))) if unc_words else []
return instance
def load_data(self, filepath: str = None):
with open(filepath, "r", encoding="utf-8") as f:
data = json.load(f)
for entry in data:
current_text = entry.get("data", {}).get("text", "")
current_text = unicodedata.normalize("NFC", current_text) # Normalize text
self.text.append(current_text)
result_objs = []
predictions = entry.get("predictions", [])
for pred in predictions:
results = pred.get("result", [])
for res in results:
try:
start = res["value"]["start"]
end = res["value"]["end"]
labels = res["value"]["labels"]
text = current_text[start:end]
cleaned_text = preprocess_phrase(text)
# Save the object
result_obj = self.ResultObject(start, end, text, labels)
result_objs.append(result_obj)
if cleaned_text:
if labels == ["NEG"]:
self.neg_words.append(cleaned_text)
elif labels == ["UNC"]:
self.unc_words.append(cleaned_text)
except (KeyError, TypeError):
continue
self.results.append(result_objs)
def print_results_to_file(self):
with open("extracted_snippets.txt", "w", encoding="utf-8") as out_file:
out_file.write("\n".join(self.neg_words))
print(f"Saved {len(self.neg_words)} NEG snippets to 'extracted_snippets.txt'")
print(f"Loaded {len(self.text)} texts and results.")