// Information flow of a retrieval-augmented question-answering pipeline,
// drawn left-to-right. Nodes are grouped into seven numbered clusters
// (ingestion, embedding, storage, query, generation, interfaces, output);
// the edges below the clusters describe how data moves between them.
digraph Information_Flow_Enhanced {
    // Graph settings: left-to-right layout, filled boxes by default.
    rankdir=LR;
    node [shape=box, style=filled, fontname="Arial", fontsize=9];
    edge [fontname="Arial", fontsize=8];

    // ------------------------------------------------------------------
    // Process steps (one cluster per stage)
    // ------------------------------------------------------------------

    // Stage 1: read the HTML source, skip work when cached, parse and chunk.
    subgraph cluster_ingestion {
        label=" 1. Document Ingestion (Cached)";
        style=filled;
        color=lightblue;

        HTML_File     [label=" IPCC HTML File\n with paragraph IDs", shape=folder, fillcolor=lightcyan];
        Cache_Check   [label=" Cache Check\n (Skip if exists)", fillcolor=lightgreen];
        Parse_HTML    [label=" Parse HTML\n (lxml)", fillcolor=lightblue];
        Extract_Text  [label=" Extract Text\n from h1-h6, p tags", fillcolor=lightblue];
        Create_Chunks [label=" Create Chunks\n with metadata", fillcolor=lightblue];
    }

    // Stage 2: turn text chunks into vector embeddings.
    subgraph cluster_embedding {
        label=" 2. Vector Embedding";
        style=filled;
        color=lightgreen;

        Text_Chunks           [label=" Text Chunks\n + Paragraph IDs", fillcolor=lightgreen];
        Sentence_Transformers [label=" Sentence Transformers\n (all-MiniLM-L6-v2)", fillcolor=lightgreen];
        Vector_Embeddings     [label=" Vector Embeddings\n (384 dimensions)", fillcolor=lightgreen];
    }

    // Stage 3: persist vectors and metadata; cache manager lives here too.
    subgraph cluster_storage {
        label=" 3. Vector Storage (Persistent)";
        style=filled;
        color=lightyellow;

        ChromaDB        [label=" ChromaDB\n Vector Database", fillcolor=lightyellow];
        User_Collection [label=" User-Specific\n Collection", fillcolor=lightyellow];
        Metadata_Store  [label=" Store Metadata\n (paragraph IDs, types)", fillcolor=lightyellow];
        Cache_Manager   [label=" Cache Manager\n (Force re-ingest)", fillcolor=lightgreen];
    }

    // Stage 4: embed the user's question and retrieve the closest chunks.
    subgraph cluster_query {
        label=" 4. Query Processing";
        style=filled;
        color=lightcoral;

        User_Question      [label=" User Question", shape=note, fillcolor=lightcoral];
        Question_Embedding [label=" Question Embedding", fillcolor=lightcoral];
        Similarity_Search  [label=" Similarity Search\n (Top-k chunks)", fillcolor=lightcoral];
        Retrieved_Context  [label=" Retrieved Context\n + Source IDs", fillcolor=lightcoral];
    }

    // Stage 5: build the prompt, run the language model, validate the answer.
    subgraph cluster_generation {
        label=" 5. Answer Generation (Enhanced)";
        style=filled;
        color=lightpink;

        Scientific_Prompt [label=" Scientific Prompt\n (IPCC Expert Role)", fillcolor=lightpink];
        Language_Model    [label=" Language Model\n (gpt2-large)", fillcolor=lightpink];
        Generated_Answer  [label=" Generated Answer\n + Sources", shape=note, fillcolor=lightpink];
        Quality_Check     [label=" Quality Validation\n (Scientific accuracy)", fillcolor=lightpink];
    }

    // Stage 6: entry points used by people and tooling.
    subgraph cluster_interfaces {
        label=" 6. User Interfaces";
        style=filled;
        color=lightgoldenrod;

        Streamlit_UI      [label=" Streamlit Web UI\n (Chapter selection, chat)", fillcolor=lightgoldenrod];
        CLI_Interface     [label=" CLI Development\n (list, load, ask, test)", fillcolor=lightgoldenrod];
        Vector_Manager    [label=" Vector Store Manager\n (status, cleanup, delete)", fillcolor=lightgoldenrod];
        Performance_Tools [label=" Performance Tools\n (caching, quality, benchmark)", fillcolor=lightgoldenrod];
    }

    // Stage 7: what happens to the final answer.
    subgraph cluster_output {
        label=" 7. Response Delivery";
        style=filled;
        color=lightsteelblue;

        Final_Answer      [label=" Final Answer\n with Sources", shape=note, fillcolor=lightsteelblue];
        Chat_History      [label=" Store in\n Chat History", fillcolor=lightsteelblue];
        Export_Data       [label=" Export to JSON\n (CLI automation)", fillcolor=lightsteelblue];
        Progress_Tracking [label=" Progress Tracking\n (Loading, querying)", fillcolor=lightsteelblue];
    }

    // ------------------------------------------------------------------
    // Flow connections
    // ------------------------------------------------------------------

    // Ingestion
    HTML_File     -> Cache_Check   [label=" check if exists"];
    Cache_Check   -> Parse_HTML    [label=" process if new"];
    Parse_HTML    -> Extract_Text  [label=" DOM Tree"];
    Extract_Text  -> Create_Chunks [label=" Text Elements"];
    Create_Chunks -> Text_Chunks   [label=" Chunked Text"];

    // Embedding and storage
    Text_Chunks           -> Sentence_Transformers [label=" Text Input"];
    Sentence_Transformers -> Vector_Embeddings     [label=" Embeddings"];
    Vector_Embeddings     -> ChromaDB              [label=" Store Vectors"];
    ChromaDB              -> User_Collection       [label=" User Isolation"];
    User_Collection       -> Metadata_Store        [label=" Store Metadata"];

    // Query
    User_Question      -> Question_Embedding [label=" Embed Question"];
    Question_Embedding -> Similarity_Search  [label=" Search Vectors"];
    Similarity_Search  -> Retrieved_Context  [label=" Top-k Results"];
    Retrieved_Context  -> Scientific_Prompt  [label=" Context + Question"];

    // Generation
    Scientific_Prompt -> Language_Model   [label=" Generate Answer"];
    Language_Model    -> Generated_Answer [label=" AI Response"];
    Generated_Answer  -> Quality_Check    [label=" Validate Quality"];
    Quality_Check     -> Final_Answer     [label=" Quality Assured"];

    // Interfaces
    Streamlit_UI      -> User_Collection [label=" load chapter"];
    CLI_Interface     -> User_Collection [label=" load chapter"];
    Vector_Manager    -> ChromaDB        [label=" manage collections"];
    Performance_Tools -> Cache_Check     [label=" test caching"];
    Performance_Tools -> Quality_Check   [label=" test quality"];

    // Output
    Final_Answer      -> Chat_History   [label=" Store History"];
    Final_Answer      -> Export_Data    [label=" JSON Export"];
    Progress_Tracking -> Create_Chunks  [label=" track progress"];
    Progress_Tracking -> Language_Model [label=" track progress"];

    // Cache management
    Cache_Manager -> Cache_Check [label=" force re-ingest"];
    Cache_Manager -> ChromaDB    [label=" cleanup old"];

    // ------------------------------------------------------------------
    // Annotated edges
    //
    // Style, colour and label are set on each edge directly rather than via
    // `edge [...]` default statements, so that edges added later do not
    // silently inherit a stale style or label.
    // ------------------------------------------------------------------

    // Performance indicators
    // NOTE(review): "100% faster" is ambiguous wording — confirm the intended figure.
    Cache_Check       -> User_Collection  [style=dashed, color=green, label=" 100% faster"];
    Scientific_Prompt -> Generated_Answer [style=dashed, color=blue, label=" scientific quality"];

    // Cache bypass
    Cache_Manager -> Parse_HTML [style=dotted, color=red, label=" force re-ingest"];

    // Storage monitoring
    // NOTE(review): the size/collection figures are hard-coded in the label;
    // presumably a point-in-time snapshot — confirm they are still current.
    ChromaDB -> Vector_Manager [style=dashed, color=orange, label=" 191MB, 10 collections"];
}