import spacy
import os
import json
import pandas as pd
from collections import defaultdict, Counter
from multiprocessing import Pool
import logging
import re

# === Constants ===
words = ['stupid', 'foolish', 'dumb', 'idiotic']
regex_pattern = re.compile(r'\b(?:' + '|'.join(words) + r')\b', flags=re.IGNORECASE)
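# A quick sketch of what the pattern matches (illustrative examples only):
#   regex_pattern.search("a stupid idea")     -> matches "stupid"
#   regex_pattern.search("stupidity abounds") -> None (the trailing \b rejects it)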
DATA_DIR = '/scratch/midway3/maxzhuyt/cleaned_coha_lower'
OUTPUT_JSON = "files_by_year.json"
OUTPUT_CSV = f"{'_'.join(words)}_targets_by_year_sbatch.csv"
LOG_PATH = f"{'_'.join(words)}_extraction.log"

# === Setup logging ===
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("multiword_logger")
file_handler = logging.FileHandler(LOG_PATH)
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)

# === Build file map by year ===
# Expects filenames whose second underscore-separated field is the year
# (e.g. COHA-style <genre>_<year>_<id>.txt).
files_by_year = defaultdict(list)
for fname in os.listdir(DATA_DIR):
    if fname.endswith(".txt"):
        parts = fname.split("_")
        if len(parts) >= 3 and parts[1].isdigit():
            year = int(parts[1])
            if year >= 1900:
                files_by_year[year].append(os.path.join(DATA_DIR, fname))

with open(OUTPUT_JSON, "w") as f:
    json.dump(files_by_year, f, indent=2)
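
# The dumped JSON maps each year (serialized as a string key) to its file
# paths, roughly like this (paths and filenames here are hypothetical):
#   {"1900": ["/scratch/.../fic_1900_001.txt", ...], "1901": [...], ...}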

# === Multiprocessing task ===
def process_year(year_files_tuple):
    year, files = year_files_tuple
    logger.info(f"Processing year {year} with {len(files)} files")

    # Re-initialize spaCy inside the subprocess
    nlp_sent = spacy.load("en_core_web_sm", disable=["ner", "tagger", "parser"])
    nlp_sent.add_pipe("sentencizer")
    nlp_sent.max_length = 2012100  # Set max length for large documents
    nlp_full = spacy.load("en_core_web_lg")
    nlp_full.max_length = 2012100  # Set max length for large documents
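
    # Both models loaded above are assumed to be installed beforehand, e.g.:
    #   python -m spacy download en_core_web_sm
    #   python -m spacy download en_core_web_lg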

    def extract_stupid_sentences(text):
        doc = nlp_sent(text)
        return [sent.text for sent in doc.sents if regex_pattern.search(sent.text)]

    def parse_targets(sentences):
        counter = Counter()
        for doc in nlp_full.pipe(sentences, batch_size=8):
            for token in doc:
                if token.text.lower() in words:
                    dep = token.dep_
                    head = token.head
                    # "something1 is a stupid something2"
                    if dep == "amod" and head.pos_ in {"NOUN", "PROPN"}:
                        counter[head.lemma_.lower()] += 1
                        copula_verb = head.head
                        if copula_verb.lemma_ == "be":
                            subj = [tok for tok in copula_verb.children if tok.dep_ == "nsubj"]
                            if subj:
                                counter[subj[0].lemma_.lower()] += 1
                    # "something looks/appears/seems stupid"
                    elif dep == "acomp":
                        subj = [tok for tok in head.children if tok.dep_ == "nsubj"]
                        if subj:
                            counter[subj[0].lemma_.lower()] += 1
                        # "somebody tried to be stupid": the subject hangs off the
                        # governing ROOT verb rather than off "be" itself. (This
                        # was previously a separate, unreachable elif branch.)
                        elif head.lemma_ == "be":
                            for ancestor in token.ancestors:
                                if ancestor.dep_ == "ROOT" and ancestor.pos_ == "VERB":
                                    subject = [child for child in ancestor.children if child.dep_ == "nsubj"]
                                    if subject:
                                        counter[subject[0].lemma_.lower()] += 1
                                    break
                    # "something1 and something2 are stupid"
                    elif dep == "conj" and head.text.lower() in words:
                        conj_targets = [tok for tok in token.subtree if tok.dep_ == "nsubj"]
                        for subj in conj_targets:
                            counter[subj.lemma_.lower()] += 1
        return counter
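
    # Illustrative expectations for parse_targets (not verified output; actual
    # parses depend on the model): "a stupid plan" increments counter["plan"]
    # via the amod branch; "the dog looked dumb" increments counter["dog"] via
    # the acomp branch.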

    year_counter = Counter()
    for path in files:
        try:
            with open(path, "r", encoding="utf-8") as f:
                text = f.read()
            sents = extract_stupid_sentences(text)
            count = parse_targets(sents)
            year_counter.update(count)
        except Exception as e:
            # A single failing file aborts the whole year and reports the error.
            logger.error(f"Error processing {path}: {e}")
            return {"year": int(year), "error": str(e)}
    logger.info(f"Finished processing year {year}, found {len(year_counter)} targets")
    return [{"year": int(year), "target": target, "count": count} for target, count in year_counter.items()]

# === Run with multiprocessing ===
# Guarding the entry point keeps child processes from re-running the pool setup
# on spawn-based platforms.
if __name__ == "__main__":
    year_file_pairs = sorted(files_by_year.items())
    with Pool(processes=46) as pool:
        all_results = pool.map(process_year, year_file_pairs)

    # === Flatten and save ===
    # Error dicts returned by failed years are skipped by the isinstance check.
    flattened = [item for sublist in all_results if isinstance(sublist, list) for item in sublist]
    df = pd.DataFrame(flattened)
    df.sort_values(["year", "count"], ascending=[True, False], inplace=True)
    df.to_csv(OUTPUT_CSV, index=False)
    logger.info(f"Finished writing to {OUTPUT_CSV}")
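
# A minimal sanity check for the dependency patterns, assuming en_core_web_lg
# is installed; run it interactively rather than as part of the batch job.
# The sample sentence is hypothetical.
#
#   import spacy
#   nlp = spacy.load("en_core_web_lg")
#   doc = nlp("That plan was stupid, and the old dog looked dumb.")
#   for tok in doc:
#       if tok.text.lower() in words:
#           print(tok.text, tok.dep_, tok.head.text, tok.head.lemma_)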