-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgraph.py
More file actions
249 lines (209 loc) · 8.51 KB
/
graph.py
File metadata and controls
249 lines (209 loc) · 8.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
import os
from datetime import datetime
from langchain.schema import Document
from langchain_community.document_loaders import TextLoader
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.runnables import RunnableConfig
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph.state import StateGraph, CompiledStateGraph, END
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from chains.generate import GenerateChain
from chains.summary import SummaryChain
from graph.graph_state import GraphState
from utils.common import get_current_time
def route_question(state: GraphState) -> str:
    """
    Route to the appropriate processing node based on the operation type.

    Args:
        state (GraphState): Current graph state; ``state['type']`` selects
            the branch.

    Returns:
        str: Name of the next node to invoke.

    Raises:
        ValueError: If ``state['type']`` is not a known operation type.
    """
    print("🤖 正在根据类型选择分支")
    # Dispatch table keeps the routing in one place. An unknown type is now
    # an explicit error instead of an implicit `return None` (the original
    # mixed if/if/elif chain with no default), which would otherwise fail
    # opaquely inside langgraph's conditional-edge resolution.
    routes = {
        "websearch": "extract_keywords",
        "file": "file_process",
        "chat": "generate",
    }
    op_type = state["type"]
    if op_type not in routes:
        raise ValueError(f"Unknown operation type: {op_type!r}")
    return routes[op_type]
def generate(state: GraphState) -> GraphState:
    """
    Generate an answer from the conversation history and retrieved documents.

    Args:
        state (GraphState): Current graph state.

    Returns:
        GraphState: The state with the LLM-generated reply stored in
        ``messages``.
    """
    print("🤖 正在生成回答")
    history = state["messages"]
    # The last message is the user's current question; everything before it
    # is conversation history for the prompt.
    latest_question = history[-1].content
    answer_chain = GenerateChain(state["model_name"], state["temperature"])
    state["messages"] = answer_chain.invoke({
        "question": latest_question,
        "history": history[:-1],
        "documents": state["documents"],
        "current_date": get_current_time(),
    })
    return state
def file_process(state: GraphState, config: RunnableConfig) -> GraphState:
    """
    Load, split and index the files referenced by the state's documents.

    Args:
        state (GraphState): Current graph state; each document's
            ``page_content`` holds a file path to ingest.
        config (RunnableConfig): Runnable config; the vector store under
            ``config["configurable"]["vectorstore"]`` receives the chunks.

    Returns:
        GraphState: The (unmodified) state, after the files were indexed.
    """
    print("🤖 开始处理文件")
    vector_store = config["configurable"]["vectorstore"]
    for doc in state["documents"]:
        file_path: str = doc.page_content
        # Guard clause: skip missing files instead of nesting the whole body.
        if not os.path.exists(file_path):
            print(f"📄 文件路径不存在: {file_path}")
            continue
        print(f"📄 文件路径: {file_path}")
        # Fixed annotation: was `list[Document] = None`, a lying type with a
        # dead default — both branches below always assign.
        split_docs: list[Document]
        if file_path.endswith((".txt", ".md")):
            # Plain text / Markdown: load, then split recursively on common
            # separators (including CJK punctuation and zero-width space).
            docs = TextLoader(file_path, autodetect_encoding=True).load()
            splitter = RecursiveCharacterTextSplitter(
                separators=["\n\n", "\n", " ", ".", ",", "\u200B", "\uff0c", "\u3001", "\uff0e", "\u3002", ""],
                chunk_size=512,
                chunk_overlap=256,
                add_start_index=True
            )
            split_docs = splitter.split_documents(docs)
        else:
            # Any other extension is converted with marker-pdf to Markdown
            # text, then split by Markdown headers (headers kept in chunks).
            converter = PdfConverter(artifact_dict=create_model_dict())
            rendered = converter(file_path)
            docs, _, _ = text_from_rendered(rendered)
            splitter = MarkdownHeaderTextSplitter(
                [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")],
                strip_headers=False
            )
            split_docs = splitter.split_text(docs)
        # Index the chunks so extract_keywords can retrieve them later.
        vector_store.add_documents(split_docs)
    return state
def extract_keywords(state: GraphState, config: RunnableConfig) -> GraphState:
    """
    Extract a search query from the latest question via the summary chain,
    then either stash it for web search or use it for vector retrieval.

    Args:
        state (GraphState): Current graph state.
        config (RunnableConfig): Runnable config; provides the vector store
            and reranker under ``config["configurable"]``.

    Returns:
        GraphState: For ``type == "websearch"``, the generated query replaces
        ``messages``; for ``type == "file"``, reranked retrieved documents
        are stored in ``documents``.
    """
    print("🤖 正在提取关键词")
    chain = SummaryChain(state["model_name"], state["temperature"])
    messages = state["messages"]
    # query = chain.invoke({"question": messages[-1].content, "history": messages[:-1]})
    query = chain.invoke({"question": messages[-1].content, "current_time": get_current_time()})
    if state["type"] == "websearch":
        # Store the generated search query in the message list; the next
        # node (web_search) reads it from messages[-1].
        state["messages"] = query
    elif state["type"] == "file":
        # Use the generated query to retrieve candidates from the vector DB.
        # docs = config["configurable"]["vectorstore"].max_marginal_relevance_search(query.content, 5)
        docs_and_scores = config["configurable"]["vectorstore"].similarity_search_with_score(query.content, 20)
        print(f" 📄 召回共{len(docs_and_scores)}篇文档:")
        idx, docs, curr_time_str = 0, [], datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        for doc, score in docs_and_scores:
            docs.append(doc)
            idx += 1
            print(f"============= [Recall] [{curr_time_str}] [{idx}] Score:{score} Source:{doc.metadata['source']} =============")
            print(doc.page_content)
        # Rerank the 20 recalled candidates down to the top 3.
        docs_result = config["configurable"]["rerank"].rerank(docs, query.content, 3)
        idx = 0
        for doc in (docs_result if docs_result else []):
            idx += 1
            print(f"============= [Rerank] [{curr_time_str}] [{idx}] Source:{doc.metadata['source']} =============")
            print(doc.page_content)
        state["documents"] = docs_result
    return state
def decide_to_generate(state: GraphState) -> str:
    """
    Decide whether to perform a web search or generate the answer directly.

    Args:
        state (GraphState): Current graph state.

    Returns:
        str: Name of the next node to invoke ("websearch" or "generate").
    """
    operation = state["type"]
    if operation == "websearch":
        print("🌐 需要进行网络搜索")
        return "websearch"
    if operation == "file":
        print("⭐ 无需搜索,直接生成答案")
        return "generate"
def web_search(state: GraphState) -> GraphState:
    """
    Run a web search for the latest query and append the results.

    Args:
        state (GraphState): Current graph state; ``messages[-1]`` holds the
            search query and ``search_num`` the number of pages to fetch.

    Returns:
        GraphState: The state with one Document of concatenated search
        results appended to ``documents`` (unchanged if the search failed).
    """
    print(f"🌐 正在进行网络搜索,搜索网页数量:{state['search_num']}...")
    web_search_tool = TavilySearchResults(k=state["search_num"])
    documents = state["documents"]
    try:
        docs = web_search_tool.invoke({"query": state["messages"][-1].content})
        web_results = Document(page_content="\n".join(d["content"] for d in docs))
        documents.append(web_results)
        state["documents"] = documents
    except Exception as e:
        # Best-effort: continue without web results, but surface the error.
        # The original bare `except: pass` silently swallowed everything,
        # including KeyboardInterrupt/SystemExit.
        print(f"🌐 搜索失败:{e}")
    # Guard against IndexError: documents may be empty if the search failed
    # and no documents were retrieved earlier.
    if documents:
        print(f"🌐 搜索结果:\n{documents[0].page_content}")
    return state
def create_graph() -> CompiledStateGraph:
    """
    Build and compile the state-graph workflow.

    Returns:
        CompiledStateGraph: The compiled graph, checkpointed in memory.
    """
    workflow = StateGraph(GraphState)

    # Register all processing nodes.
    for node_name, node_fn in (
        ("websearch", web_search),
        ("extract_keywords", extract_keywords),
        ("file_process", file_process),
        ("generate", generate),
    ):
        workflow.add_node(node_name, node_fn)

    # Entry point: dispatch on the request type.
    workflow.set_conditional_entry_point(
        route_question,
        {
            "extract_keywords": "extract_keywords",
            "generate": "generate",
            "file_process": "file_process",
        },
    )
    # File ingestion always flows into keyword extraction.
    workflow.add_edge("file_process", "extract_keywords")
    # After keyword extraction, either search the web or answer directly.
    workflow.add_conditional_edges(
        "extract_keywords",
        decide_to_generate,
        {
            "websearch": "websearch",
            "generate": "generate",
        },
    )
    workflow.add_edge("websearch", "generate")
    workflow.add_edge("generate", END)

    # Compile with MemorySaver so state is checkpointed in memory.
    return workflow.compile(checkpointer=MemorySaver())
def stream_graph_updates(graph: CompiledStateGraph, user_input: GraphState, config: dict):
    """
    Stream the graph execution and yield message content incrementally.

    Args:
        graph (CompiledStateGraph): The compiled state graph to run.
        user_input (GraphState): Initial state fed into the graph.
        config (dict): Run configuration passed through to ``graph.stream``.

    Yields:
        Content of each streamed message chunk.
    """
    message_stream = graph.stream(user_input, config, stream_mode="messages")
    # "messages" stream mode yields (message_chunk, metadata) pairs; only
    # the chunk content is forwarded to the caller.
    for message_chunk, _metadata in message_stream:
        yield message_chunk.content