-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
144 lines (108 loc) · 3.99 KB
/
main.py
File metadata and controls
144 lines (108 loc) · 3.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import argparse
import sys
from pathlib import Path
from ingestion.scrape import download_all_pages
from ingestion.extract_text import extract_all_files
from rag.ingest import main as ingest_main
from rag.query import TaxRAG
# Append the directory containing this file to the module search path.
# NOTE(review): this statement runs after the project imports above
# (ingestion.*, rag.*), so it cannot help resolve them -- confirm whether it
# was meant to precede those imports or is only needed by later, lazy imports.
sys.path.append(str(Path(__file__).parent))
def _print_banner(title: str, trailing_blank: bool = True) -> None:
    """Print *title* framed by two 70-character ``=`` rules.

    A blank line always precedes the banner; ``trailing_blank`` controls
    whether a blank line follows the closing rule as well.
    """
    rule = "=" * 70
    print("\n" + rule)
    print(title)
    print(rule + "\n" if trailing_blank else rule)


def _print_result(result, preview_chars: int) -> None:
    """Print the answer and the numbered source excerpts of a query result.

    ``result`` is indexed with ``'answer'`` and ``'sources'``; each source is
    indexed with ``'section'``, ``'source_file'`` and ``'text'``. Only the
    first ``preview_chars`` characters of each source text are shown.
    """
    _print_banner("ANSWER:", trailing_blank=False)
    print(result['answer'])
    _print_banner("SOURCES:", trailing_blank=False)
    for i, source in enumerate(result['sources'], 1):
        print(f"\n[{i}] {source['section']} ({source['source_file']})")
        print(f" {source['text'][:preview_chars]}...")


def _run_interactive(rag) -> None:
    """Read questions from stdin and answer them with ``rag.query`` until exit.

    The loop ends on ``quit``/``exit``/``q``, on Ctrl-C, or on end of input.
    """
    print("Type your questions (or 'quit' to exit)\n")
    while True:
        try:
            question = input("\nYour question: ").strip()
            if question.lower() in ('quit', 'exit', 'q'):
                print("Goodbye!")
                break
            if not question:
                continue
            print("\nSearching...")
            _print_result(rag.query(question), preview_chars=150)
        except (KeyboardInterrupt, EOFError):
            # EOFError is raised by input() on Ctrl-D / exhausted piped stdin.
            # It must end the session: if it fell through to the generic
            # handler below, the loop would retry input() forever.
            print("\n\nGoodbye!")
            break
        except Exception as e:
            # Deliberately broad: one failed query should not kill the session.
            print(f"\nError: {e}")


def main() -> None:
    """Command-line entry point for the Income Tax Act RAG system.

    Parses the command-line flags and runs the selected stages in a fixed
    order: download, extract, ingest, single query, interactive mode. When no
    flag is given (or every flag value is falsy) the help text is printed and
    nothing else happens.
    """
    parser = argparse.ArgumentParser(
        description='Income Tax Act RAG System',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # Only flags that are actually registered below are advertised here;
        # a "--serve" example was dropped because no such option exists and
        # argparse rejects it as an unrecognized argument.
        epilog=(
            "\n"
            "Examples:\n"
            "# Download HTML files\n"
            "python main.py --download\n"
            "# Extract and clean text\n"
            "python main.py --extract\n"
            "# Build vector store\n"
            "python main.py --ingest\n"
            "# Query the system\n"
            "python main.py --query \"What is taxable income?\"\n"
            "# Interactive mode\n"
            "python main.py --interactive\n"
            "# Run full pipeline\n"
            "python main.py --download --extract --ingest\n"
        ),
    )
    parser.add_argument('--download', action='store_true',
                        help='Download HTML files from Justice Laws website')
    parser.add_argument('--extract', action='store_true',
                        help='Extract and clean text from HTML files')
    parser.add_argument('--ingest', action='store_true',
                        help='Build vector store from cleaned text')
    parser.add_argument('--query', type=str,
                        help='Query the RAG system')
    parser.add_argument('--interactive', action='store_true',
                        help='Start interactive query mode')
    args = parser.parse_args()

    # Nothing requested: show usage instead of silently doing nothing.
    if not any(vars(args).values()):
        parser.print_help()
        return

    if args.download:
        _print_banner("STEP 1: DOWNLOADING HTML FILES")
        download_all_pages()

    if args.extract:
        _print_banner("STEP 2: EXTRACTING TEXT")
        extract_all_files()

    if args.ingest:
        _print_banner("STEP 3: BUILDING VECTOR STORE")
        ingest_main()

    if args.query:
        _print_banner("QUERYING RAG SYSTEM")
        rag = TaxRAG()
        _print_result(rag.query(args.query), preview_chars=200)

    if args.interactive:
        _print_banner("INTERACTIVE MODE")
        # The TaxRAG instance is built before the prompt instructions are
        # printed, matching the original output order.
        _run_interactive(TaxRAG())


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()