-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathingest_knowledge_base.py
More file actions
124 lines (96 loc) · 4.03 KB
/
ingest_knowledge_base.py
File metadata and controls
124 lines (96 loc) · 4.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python3
"""Ingest markdown files from ~/knowledge-base/ into knowledge.db (source_id=26)."""
import os
import sys
import hashlib
import json
import re
import sqlite3
sys.path.insert(0, os.path.dirname(__file__))
from db import get_conn
# Directory whose top-level ``.md`` files are ingested by main().
KB_DIR = os.path.expanduser("~/knowledge-base")
# ``source_id`` value written on every inserted ``documents`` row and used to
# look up the hashes that were already ingested for this source.
SOURCE_ID = 26
# Lowercase words that extract_keywords() drops from filename-derived keywords.
STOP_WORDS = {
"the", "and", "for", "with", "from", "that", "this", "are", "was", "were",
"been", "being", "have", "has", "had", "does", "did", "will", "would",
"could", "should", "may", "might", "must", "shall", "can", "need", "not",
"but", "nor", "yet", "also", "just", "than", "then", "when", "where",
"how", "what", "which", "who", "whom", "why", "its", "their", "our",
"your", "his", "her", "all", "each", "every", "both", "few", "more",
"most", "other", "some", "such", "only", "own", "same", "into", "over",
"after", "before", "between", "under", "above", "below", "about", "out",
"off", "through", "during", "against", "among", "along", "across", "behind",
"beyond", "upon", "within", "without", "toward", "towards", "onto", "via",
}
def extract_title(content):
    """Return the text of the first level-1 markdown heading in *content*.

    A heading is a line that, after stripping surrounding whitespace, starts
    with ``"# "``.  Lines inside fenced code blocks (delimited by ``` or ~~~)
    are skipped so that a shell or Python comment such as ``# install deps``
    is not mistaken for the document title.

    Args:
        content: Full markdown text of a document.

    Returns:
        The heading text with surrounding whitespace removed, or ``None``
        when the document has no level-1 heading outside a code fence.
    """
    fence_char = None  # "`" or "~" while inside a fenced code block
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if fence_char is not None:
            # A closing fence consists solely of the fence character; an
            # unterminated fence therefore runs to the end of the document.
            # Simplification: the closing fence length is not compared with
            # the opening fence length.
            if len(line) >= 3 and set(line) == {fence_char}:
                fence_char = None
            continue
        if line.startswith(("```", "~~~")):
            fence_char = line[0]
            continue
        if line.startswith("# "):
            return line[2:].strip()
    return None
def extract_keywords(filename):
    """Derive keyword tokens from a file name.

    The extension is dropped and the remaining stem is split on underscores
    and hyphens.  Fragments of two characters or fewer are discarded, the
    rest are lowercased, and any that appear in ``STOP_WORDS`` are removed.
    Order and duplicates are preserved.

    Args:
        filename: File name (not a full path is required) such as
            ``"my_topic-notes.md"``.

    Returns:
        List of lowercase keyword strings, possibly empty.
    """
    stem, _extension = os.path.splitext(filename)
    keywords = []
    for fragment in re.split(r'[_\-]', stem):
        # Length is checked on the fragment as written, before lowercasing.
        if len(fragment) <= 2:
            continue
        token = fragment.lower()
        if token in STOP_WORDS:
            continue
        keywords.append(token)
    return keywords
def content_hash(content):
    """Return the SHA-256 hex digest of *content* encoded as UTF-8.

    Args:
        content: Text to fingerprint.

    Returns:
        64-character lowercase hexadecimal digest string.
    """
    hasher = hashlib.sha256()
    hasher.update(content.encode("utf-8"))
    return hasher.hexdigest()
def main():
    """Ingest every ``.md`` file in ``KB_DIR`` into the ``documents`` table.

    For each markdown file the content is read, hashed, and inserted with
    ``source_id = SOURCE_ID`` unless the file is empty or a document with the
    same hash already exists.  Failures on individual files are reported and
    counted but do not stop the run.  A summary with per-run counters and
    database totals is printed at the end.

    Exits with status 1 when ``KB_DIR`` does not exist.
    """
    if not os.path.isdir(KB_DIR):
        print(f"ERROR: {KB_DIR} not found")
        sys.exit(1)

    md_suffix = ".md"
    md_files = sorted(f for f in os.listdir(KB_DIR) if f.endswith(md_suffix))
    print(f"Found {len(md_files)} markdown files in {KB_DIR}")

    inserted = 0
    skipped = 0
    errors = 0

    conn = get_conn()
    # try/finally guarantees the connection is released even if something
    # unexpected escapes the loop (e.g. KeyboardInterrupt).
    try:
        # Hashes already stored for this source, used to skip duplicates
        # without touching the database for each file.
        existing_hashes = {
            row[0] for row in conn.execute(
                "SELECT hash FROM documents WHERE source_id = ? AND hash IS NOT NULL",
                (SOURCE_ID,),
            ).fetchall()
        }
        print(f"Existing documents for source_id={SOURCE_ID}: {len(existing_hashes)}")

        for fname in md_files:
            fpath = os.path.join(KB_DIR, fname)
            # Best-effort per file: one unreadable or rejected document must
            # not abort the whole ingest, so the failure is reported and
            # counted instead of propagated.
            try:
                with open(fpath, "r", encoding="utf-8") as f:
                    content = f.read()

                if not content.strip():
                    print(f" SKIP (empty): {fname}")
                    skipped += 1
                    continue

                # Fallback title: strip only the trailing extension (every
                # entry in md_files ends with it) rather than every ".md"
                # occurrence inside the name.
                stem = fname[:-len(md_suffix)]
                title = extract_title(content) or stem.replace("_", " ").title()
                keywords = extract_keywords(fname)
                h = content_hash(content)

                if h in existing_hashes:
                    skipped += 1
                    continue

                conn.execute(
                    """INSERT OR IGNORE INTO documents
                    (source_id, title, content, url, author, keywords, hash)
                    VALUES (?, ?, ?, ?, ?, ?, ?)""",
                    (SOURCE_ID, title, content, None, "opus-knowledge",
                     json.dumps(keywords), h),
                )
                # changes() is 0 when INSERT OR IGNORE dropped the row, e.g.
                # because the hash already exists under another source.
                if conn.execute("SELECT changes()").fetchone()[0] > 0:
                    inserted += 1
                    existing_hashes.add(h)
                    print(f" INSERT: {fname} -> \"{title}\" ({len(keywords)} keywords)")
                else:
                    skipped += 1
                    print(f" SKIP (dup hash): {fname}")
            except Exception as e:
                errors += 1
                print(f" ERROR: {fname} -> {e}")

        conn.commit()

        # Read the totals on the same connection, after the commit, instead
        # of opening additional connections that would never be closed.
        total_now = conn.execute("SELECT COUNT(*) FROM documents").fetchone()[0]
        source_now = conn.execute(
            "SELECT COUNT(*) FROM documents WHERE source_id = ?", (SOURCE_ID,)
        ).fetchone()[0]
    finally:
        conn.close()

    print(f"\n{'='*50}")
    print(f"Results: Inserted={inserted}, Skipped={skipped}, Errors={errors}")
    print(f"Total files processed: {len(md_files)}")
    print(f"Source '{SOURCE_ID}' documents: {source_now}")
    print(f"Total documents in DB: {total_now}")


if __name__ == "__main__":
    main()