-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathentities.py
More file actions
177 lines (152 loc) · 8.47 KB
/
entities.py
File metadata and controls
177 lines (152 loc) · 8.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
"""Entity extraction engine for Knowledge Engine — no LLM needed."""
import re
from collections import Counter, defaultdict
import db
# Regular expressions used by extract_entities, grouped by entity type.
#
# Each key is an entity type label; each value is a list of regex source
# strings. Every pattern contains exactly one capturing group, which is the
# text reported as the entity name (all other groups are non-capturing).
# extract_entities applies re.IGNORECASE to the "plant", "mineral" and
# "concept" types only; the remaining types are matched case-sensitively.
PATTERNS = {
    "law": [
        # Capitalised words ending in a statute-like keyword (Act, Code, ...).
        r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Act|Statute|Code|Regulation|Ordinance|Rule|Law))\b',
        # "Section <number>" with optional dotted sub-numbers and "(letter)".
        r'\b(Section\s+\d+(?:\.\d+)*(?:\([a-z]\))?)\b',
        # Citations of the form "R.S.B.C. <4-digit year>" (dots optional).
        r'\b(R\.?S\.?B\.?C\.?\s*\d{4})\b',
        # Case names of the form "<Name> v. <Name>" (single word each side).
        r'\b([A-Z][a-z]+\s+v\.?\s+[A-Z][a-z]+)\b',
    ],
    "person": [
        # Honorific/title followed by one to three capitalised words.
        r'\b((?:Dr|Mr|Mrs|Ms|Prof|Judge|Justice|Hon)\.?\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2})\b',
    ],
    "place": [
        # Fixed list of place names (presumably British Columbia communities).
        r'\b(British Columbia|Vancouver|Victoria|Kamloops|Kelowna|Prince George|Nanaimo|Nelson|Revelstoke|Cranbrook|Trail|Rossland|Castlegar|Fernie|Golden|Invermere|Lillooet|Quesnel|Williams Lake|100 Mile House|Burns Lake|Smithers|Terrace|Prince Rupert|Kitimat|Fort Nelson|Fort St\.? John|Dawson Creek|Chetwynd|Tumbler Ridge|Hudson\'s Hope|Mackenzie|Valemount|McBride|Clearwater|Merritt|Princeton|Hope|Chilliwack|Abbotsford|Mission|Squamish|Whistler|Pemberton|Sechelt|Powell River|Campbell River|Courtenay|Comox|Port Alberni|Tofino|Ucluelet|Duncan|Ladysmith|Parksville|Qualicum Beach|Port Hardy|Alert Bay|Haida Gwaii|Salt Spring Island|Penticton|Vernon|Salmon Arm|Enderby|Armstrong|Sicamous|Summerland|Oliver|Osoyoos|Grand Forks)\b',
        # Fixed list of region names.
        # NOTE(review): "Haida Gwaii" also appears in the list above, so both
        # patterns match the same text.
        r'\b(Kootenay|Cariboo|Okanagan|Fraser Valley|Thompson|Skeena|Peace River|Vancouver Island|Haida Gwaii|Bulkley|Nechako|Columbia Valley)\b',
    ],
    "plant": [
        # Optional qualifier ("wild ", "common ", ...) followed by a plant name.
        # NOTE(review): "oregon grape" is listed twice in the alternation; the
        # second occurrence is redundant.
        r'\b((?:wild |common |western |eastern |northern |pacific )?(?:ginger|yarrow|chamomile|echinacea|elderberry|valerian|lavender|peppermint|turmeric|ginseng|calendula|comfrey|dandelion|nettle|plantain|rosemary|sage|thyme|oregano|basil|garlic|aloe vera|arnica|astragalus|ashwagandha|black cohosh|bloodroot|blue cohosh|boneset|burdock|cat\'s claw|cayenne|chaparral|cleavers|coltsfoot|dong quai|elecampane|evening primrose|eyebright|fennel|feverfew|gentian|ginkgo|goldenseal|gotu kola|hawthorn|hops|horsetail|hyssop|kava|lemon balm|licorice|lobelia|marshmallow|milk thistle|motherwort|mugwort|mullein|oat straw|oregon grape|passionflower|pau d\'arco|pennyroyal|red clover|rhodiola|saw palmetto|schisandra|skullcap|slippery elm|st\.? john\'s wort|uva ursi|white willow|witch hazel|wood betony|wormwood|yellow dock|devil\'s club|fireweed|Labrador tea|kinnikinnick|salal|thimbleberry|huckleberry|saskatoon berry|oregon grape|red cedar|cottonwood|birch|alder))\b',
    ],
    "mineral": [
        # Fixed list of mineral / metal / mining terms, matched as whole words.
        r'\b(gold|silver|copper|zinc|lead|molybdenum|coal|jade|placer|quartz|galena|pyrite|chalcopyrite|sphalerite|magnetite|hematite|chromite|platinum|palladium|tungsten|antimony|bismuth|cobalt|nickel|tin)\b',
    ],
    "org": [
        # Capitalised words ending in an organisation keyword (Inc, Ministry, ...).
        r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Corporation|Corp|Inc|Ltd|Society|Association|Ministry|Department|Commission|Board|Council|Authority|Agency|Institute|University|College|Foundation|Trust|Fund|Bureau|Service|Office|Centre|Center))\b',
        # Organisation names of the form "BC <name>" from a fixed list.
        r'\b(BC (?:Hydro|Ferries|Transit|Housing|Assessment|Securities|Lottery|Timber Sales|Wildfire))\b',
    ],
    "concept": [
        # Fixed list of legal and land/mineral-tenure terms.
        r'\b(tort|negligence|duty of care|standard of care|causation|damages|assault|battery|trespass|nuisance|defamation|libel|slander|fraud|breach of contract|fiduciary duty|limitation period|contributory negligence|vicarious liability|strict liability|informed consent|mineral rights|mining claim|free miner|placer claim|mineral tenure|crown land|water rights|riparian rights|aboriginal title|treaty rights|adverse possession|easement|right of way|encumbrance|strata|zoning|land title|fee simple|leasehold|freehold)\b',
    ],
}
def extract_entities(text: str) -> list[dict]:
    """Extract named entities from text using the regexes in ``PATTERNS``.

    Args:
        text: The text to scan. Empty or ``None`` text yields no entities.

    Returns:
        A list of dicts with the keys ``name`` (the first matched spelling,
        stripped), ``type`` (the ``PATTERNS`` key), ``normalized`` (the
        lower-cased name) and ``frequency`` (how many times the patterns of
        that type matched this normalized name). There is one dict per
        distinct ``(normalized, type)`` pair, in order of first appearance
        while iterating types, then patterns, then matches.
    """
    if not text:
        return []

    entities = []
    by_key = {}  # (normalized, type) -> the dict already placed in `entities`
    for entity_type, patterns in PATTERNS.items():
        # Fixed vocabularies are matched case-insensitively; the shape-based
        # patterns (law, person, place, org) depend on capitalisation.
        flags = re.IGNORECASE if entity_type in ("plant", "mineral", "concept") else 0
        # Two patterns of one type may match the very same stretch of text;
        # remember counted spans so such an occurrence is only counted once.
        counted_spans = set()
        for pattern in patterns:
            for found in re.finditer(pattern, text, flags):
                # Report the capturing group when the pattern has one,
                # otherwise the whole match.
                group = 1 if found.re.groups else 0
                span = found.span(group)
                if span in counted_spans:
                    continue
                counted_spans.add(span)

                name = found.group(group).strip()
                normalized = name.lower()
                if len(normalized) < 2:
                    continue

                key = (normalized, entity_type)
                entity = by_key.get(key)
                if entity is None:
                    entity = {
                        "name": name,
                        "type": entity_type,
                        "normalized": normalized,
                        "frequency": 0,
                    }
                    by_key[key] = entity
                    entities.append(entity)
                # Count real pattern matches rather than raw substring hits,
                # so "tin" is not counted inside "matting".
                entity["frequency"] += 1
    return entities
def store_entities(document_id: int, entities: list[dict]):
    """Store extracted entities and link them to a document.

    For every entity dict (keys ``name``, ``type``, ``normalized``,
    ``frequency``) this upserts a row in ``entities``, links it to
    ``document_id`` in ``document_entities`` and refreshes the entity's
    ``doc_count``. All changes are committed together at the end.

    Args:
        document_id: Id of the document the entities were extracted from.
        entities: Entity dicts as produced by ``extract_entities``.
    """
    conn = db.get_conn()
    try:
        for ent in entities:
            # Upsert the entity. The conflict target relies on a uniqueness
            # constraint over (normalized, type); on conflict the longer
            # spelling is kept as the display name.
            conn.execute("""
                INSERT INTO entities (name, type, normalized, doc_count)
                VALUES (?, ?, ?, 1)
                ON CONFLICT(normalized, type) DO UPDATE SET
                    name = CASE WHEN LENGTH(excluded.name) > LENGTH(entities.name)
                                THEN excluded.name ELSE entities.name END
            """, (ent["name"], ent["type"], ent["normalized"]))
            entity_id = conn.execute(
                "SELECT id FROM entities WHERE normalized = ? AND type = ?",
                (ent["normalized"], ent["type"])
            ).fetchone()[0]
            # Link to document (replaces an existing link for the same pair;
            # presumably keyed on (document_id, entity_id) - confirm schema).
            conn.execute("""
                INSERT OR REPLACE INTO document_entities (document_id, entity_id, frequency)
                VALUES (?, ?, ?)
            """, (document_id, entity_id, ent["frequency"]))
            # Derive doc_count from the links instead of incrementing it, so
            # re-processing a document does not inflate the count.
            conn.execute("""
                UPDATE entities
                SET doc_count = (
                    SELECT COUNT(DISTINCT document_id)
                    FROM document_entities
                    WHERE entity_id = ?
                )
                WHERE id = ?
            """, (entity_id, entity_id))
        conn.commit()
    finally:
        # Always release the connection, even if a statement failed before
        # the commit.
        conn.close()
def extract_and_store(document_id: int, text: str) -> int:
    """Run entity extraction on ``text`` and persist the result.

    Nothing is written when no entities are found.

    Returns:
        The number of entities extracted from ``text``.
    """
    found = extract_entities(text)
    if not found:
        return 0
    store_entities(document_id, found)
    return len(found)
def process_all_documents():
    """Run entity extraction on all existing documents.

    Reads every row of ``documents``, extracts entities from the title and
    content joined by a space, stores them, and prints one progress line per
    document that produced at least one entity.

    Returns:
        The total number of entities extracted across all documents.
    """
    conn = db.get_conn()
    try:
        docs = conn.execute("SELECT id, title, content FROM documents").fetchall()
    finally:
        # Release the read connection before the per-document work, which
        # opens its own connections.
        conn.close()

    total = 0
    for doc in docs:
        # Either column may be NULL; treat that as empty text so the title
        # slice below cannot fail and "None" is never scanned as content.
        title = doc["title"] or ""
        content = doc["content"] or ""
        count = extract_and_store(doc["id"], f"{title} {content}")
        total += count
        if count > 0:
            print(f" [{doc['id']}] {title[:50]}: {count} entities")
    return total
def find_related_documents(document_id: int, limit: int = 10) -> list:
    """Find documents that share the most entities with the given document.

    Args:
        document_id: Id of the document to find relatives for.
        limit: Maximum number of related documents to return.

    Returns:
        Rows with the columns ``id``, ``title``, ``shared_entities`` (number
        of distinct entities in common) and ``shared_entity_names`` (the
        names of those entities concatenated by GROUP_CONCAT), ordered by
        ``shared_entities`` descending. The given document itself is
        excluded.
    """
    conn = db.get_conn()
    try:
        # de1 = links of the given document, de2 = links of other documents
        # to the same entities.
        rows = conn.execute("""
            SELECT
                d.id,
                d.title,
                COUNT(DISTINCT de2.entity_id) AS shared_entities,
                GROUP_CONCAT(DISTINCT e.name) AS shared_entity_names
            FROM document_entities de1
            JOIN document_entities de2 ON de2.entity_id = de1.entity_id AND de2.document_id != de1.document_id
            JOIN documents d ON d.id = de2.document_id
            JOIN entities e ON e.id = de2.entity_id
            WHERE de1.document_id = ?
            GROUP BY d.id
            ORDER BY shared_entities DESC
            LIMIT ?
        """, (document_id, limit)).fetchall()
    finally:
        # Close the connection even when the query raises.
        conn.close()
    return rows
def auto_link_documents(min_shared: int = 3):
    """Automatically create document_links based on shared entities.

    Every pair of documents sharing at least ``min_shared`` distinct entities
    gets a ``'related'`` row in ``document_links`` (existing rows are left
    alone via INSERT OR IGNORE). The link strength is the number of shared
    entities divided by the larger of the two documents' entity-link counts,
    capped at 1.0.

    Args:
        min_shared: Minimum number of shared entities for a pair to be linked.

    Returns:
        The number of qualifying pairs processed. Pairs whose insert was
        ignored because a link already existed are included in this count.
    """
    conn = db.get_conn()
    try:
        # The join condition de2.document_id > de1.document_id yields each
        # unordered pair exactly once, with the lower id as doc1.
        pairs = conn.execute("""
            SELECT
                de1.document_id AS doc1,
                de2.document_id AS doc2,
                COUNT(DISTINCT de1.entity_id) AS shared,
                CAST(COUNT(DISTINCT de1.entity_id) AS REAL) /
                CAST(MAX(
                    (SELECT COUNT(*) FROM document_entities WHERE document_id = de1.document_id),
                    (SELECT COUNT(*) FROM document_entities WHERE document_id = de2.document_id)
                ) AS REAL) AS strength
            FROM document_entities de1
            JOIN document_entities de2 ON de2.entity_id = de1.entity_id
                AND de2.document_id > de1.document_id
            GROUP BY de1.document_id, de2.document_id
            HAVING shared >= ?
            ORDER BY shared DESC
        """, (min_shared,)).fetchall()
        count = 0
        for pair in pairs:
            conn.execute("""
                INSERT OR IGNORE INTO document_links (source_doc_id, target_doc_id, relation, strength)
                VALUES (?, ?, 'related', ?)
            """, (pair["doc1"], pair["doc2"], min(pair["strength"], 1.0)))
            count += 1
        conn.commit()
    finally:
        # Close the connection even if the query or an insert raises.
        conn.close()
    return count