-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex_docs.py
More file actions
executable file
·158 lines (132 loc) · 5.02 KB
/
index_docs.py
File metadata and controls
executable file
·158 lines (132 loc) · 5.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python3
"""Main script to index ServiceNow documentation into vector database"""
import click
from pathlib import Path
import json
from datetime import datetime
import sys
from document_processor import DocumentProcessor
from vector_store import VectorStoreManager
from config import settings
@click.command()
@click.option(
    '--docs-path',
    type=click.Path(exists=True),
    default='zurich-intelligent-experiences.md',
    help='Path to the markdown documentation file'
)
@click.option(
    '--chunk-size',
    type=int,
    default=None,
    help='Size of text chunks (overrides config)'
)
@click.option(
    '--chunk-overlap',
    type=int,
    default=None,
    help='Overlap between chunks (overrides config)'
)
@click.option(
    '--reset',
    is_flag=True,
    help='Reset the vector database before indexing'
)
@click.option(
    '--batch-size',
    type=int,
    default=100,
    help='Batch size for indexing'
)
# NOTE: Click renders the docstring as the command's --help text. Everything
# after the form-feed marker (\f) is hidden from --help, so the developer
# notes below do not change the user-facing output.
def index_documentation(docs_path, chunk_size, chunk_overlap, reset, batch_size):
    """Index ServiceNow documentation into vector database
    \f
    Runs the indexing pipeline end to end: chunk the markdown file, add the
    chunks to the vector store, write ``indexing_summary.json`` to the
    current working directory, and run one sample search as a smoke test.

    Args:
        docs_path: Path to the markdown file to index (must exist).
        chunk_size: Chunk size override; ``None`` means use
            ``settings.chunk_size``.
        chunk_overlap: Chunk overlap override; ``None`` means use
            ``settings.chunk_overlap``. An explicit ``0`` is honoured.
        reset: When true, reset the vector database before indexing.
        batch_size: Batch size forwarded to ``vector_store.add_documents``.

    Raises:
        SystemExit: With status 1 if the documentation file does not exist.
    """
    print("=" * 60)
    print("ServiceNow Documentation Vectorizer")
    print("=" * 60)

    # Convert path to Path object
    docs_file = Path(docs_path)
    # click.Path(exists=True) already validates this; the check is kept as a
    # guard in case the file disappears between validation and use.
    if not docs_file.exists():
        print(f"Error: Documentation file not found: {docs_file}")
        sys.exit(1)

    # Get file size
    file_size = docs_file.stat().st_size / (1024 * 1024)  # Convert to MB
    print(f"\nDocument: {docs_file.name}")
    print(f"Size: {file_size:.2f} MB")

    # Initialize components
    print("\n1. Initializing components...")

    # Use provided values or fall back to config. Compare against None rather
    # than relying on truthiness: `value or default` would silently discard an
    # explicit 0 (e.g. `--chunk-overlap 0`) and use the config value instead.
    if chunk_size is None:
        chunk_size = settings.chunk_size
    if chunk_overlap is None:
        chunk_overlap = settings.chunk_overlap

    processor = DocumentProcessor(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    vector_store = VectorStoreManager()

    # Reset database if requested
    if reset:
        print("\n2. Resetting vector database...")
        vector_store.reset_database()
    else:
        print("\n2. Using existing vector database...")
        info = vector_store.get_collection_info()
        print(f" Current documents: {info.get('count', 0)}")

    # Process the document
    print(f"\n3. Processing document with chunk_size={chunk_size}, overlap={chunk_overlap}...")
    documents = processor.chunk_markdown(docs_file)

    # Get and display statistics
    stats = processor.get_chunk_statistics(documents)
    print("\nChunking Statistics:")
    print(f" Total chunks: {stats['total_chunks']}")
    print(f" Average chunk size: {stats['avg_char_count']:.0f} characters")
    print(f" Min/Max chunk size: {stats['min_char_count']}/{stats['max_char_count']} characters")
    print(f" Total words: {stats['total_words']:,}")

    # Index documents
    print(f"\n4. Indexing {len(documents)} chunks into vector database...")
    print(f" Using embedding model: {settings.embedding_model_type}/{settings.embedding_model_name}")
    print(f" Batch size: {batch_size}")
    indexing_stats = vector_store.add_documents(documents, batch_size=batch_size)

    # Display results
    print("\n5. Indexing Complete!")
    print("=" * 60)
    print(f" Successfully indexed: {indexing_stats['successful']} chunks")
    if indexing_stats['failed'] > 0:
        print(f" Failed: {indexing_stats['failed']} chunks")
    print(f" Total documents in database: {indexing_stats['final_document_count']}")

    # Save summary
    summary_file = Path("indexing_summary.json")
    summary = {
        "timestamp": datetime.now().isoformat(),
        "document": str(docs_file),
        "file_size_mb": file_size,
        "chunk_config": {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap
        },
        "chunking_stats": stats,
        "indexing_stats": indexing_stats,
        "embedding_model": {
            "type": settings.embedding_model_type,
            "name": settings.embedding_model_name
        }
    }
    # Explicit encoding so the output does not depend on the platform locale.
    with open(summary_file, 'w', encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
    print(f"\nSummary saved to: {summary_file}")

    # Test search
    print("\n6. Testing search functionality...")
    test_query = "Now Assist AI agents"
    results = vector_store.search_with_relevance(test_query, k=3)
    if results:
        print(f"\nTest query: '{test_query}'")
        print(f"Found {len(results)} relevant chunks:")
        for i, result in enumerate(results, 1):
            print(f"\n Result {i} (score: {result['score']:.3f}):")
            # Collapse newlines so each preview stays on a single line.
            preview = result['content'][:200].replace('\n', ' ')
            print(f" {preview}...")
            if result.get('headers'):
                print(f" Headers: {result['headers']}")

    print("\n✅ Indexing complete! The vector database is ready for use.")
    print(f" Database location: {settings.chroma_persist_directory}")
    print(f" Collection name: {settings.collection_name}")
# Script entry point: invoke the Click command only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    index_documentation()