-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_program_metadata.py
More file actions
47 lines (36 loc) · 1.43 KB
/
extract_program_metadata.py
File metadata and controls
47 lines (36 loc) · 1.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import json
import re
def clean_html(raw_html):
    """Strip HTML tags from a string and trim surrounding whitespace.

    Args:
        raw_html: Text that may contain HTML markup. Falsy values
            (``None``, ``""``) are accepted.

    Returns:
        The text with every ``<...>`` tag removed and leading/trailing
        whitespace stripped, or ``""`` when ``raw_html`` is falsy.
    """
    if not raw_html:
        return ""
    # ``[^>]*`` (unlike ``.*?`` without re.DOTALL) also matches newlines, so
    # a tag whose attributes wrap onto another line is removed as well.
    without_tags = re.sub(r"<[^>]*>", "", raw_html)
    return without_tags.strip()
def _present(value):
    """Return ``value`` unless it is falsy or the ``"NULL"`` placeholder string."""
    if value and value != "NULL":
        return value
    return None


# Build program_metadata.json: a mapping of program code -> {name, description}
# extracted from the entries in nyc_benefits.json.
try:
    # Open with an explicit encoding so the result does not depend on the
    # platform's locale default.
    with open("nyc_benefits.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    metadata = {}
    print(f"Loaded {len(data)} entries from nyc_benefits.json")

    for entry in data:
        code = _present(entry.get("program_code"))
        if code is None:
            # Entries without a usable program code cannot be keyed; skip them.
            continue

        # A missing, empty, or "NULL" name falls back to "Unknown".
        name = _present(entry.get("program_name")) or "Unknown"
        # TODO: "plain_language_program_name" is also available on entries;
        # decide whether it should replace or be appended to the name.

        # Description: the brief excerpt is usually better for a list; fall
        # back to the full program description when the excerpt is absent,
        # empty, or "NULL".
        desc_html = (
            _present(entry.get("brief_excerpt"))
            or _present(entry.get("program_description"))
            or ""
        )

        metadata[code] = {
            "name": name,
            "description": clean_html(desc_html),
        }

    print(f"Extracted metadata for {len(metadata)} programs.")

    with open("program_metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=4)
    print("Wrote program_metadata.json")
except Exception as e:
    # Top-level boundary of the script: report any failure (missing file,
    # malformed JSON, unexpected entry shape) instead of a traceback.
    print(f"Error: {e}")