# ProfRCA/run_generate_description.py at master · IntelligentDDS/ProfRCA · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import os
import pickle
import networkx as nx
import time
import json
from typing import List, Dict, Any
import torch
from torch_geometric.loader import DataLoader
from langchain_ollama import ChatOllama
from langchain.prompts import PromptTemplate

import graphbuilder
from faults import FaultType
from profile_agent.prompts import graph_compare_summary, graph_compare_summary_syscall
from utils import clean_llm_output, timestamp_readable


# Chat model shared by every description request in this script:
# temperature 0 and num_predict=16384.
llm = ChatOllama(model="qwen3:30b-a3b-64k", temperature=0, num_predict=16384)

# Prompt built from the syscall graph-comparison template; it is filled with
# the comparison text, the service name and the fault type description.
prompt = PromptTemplate(
    input_variables=["input", "service", "fault_type"],
    template=graph_compare_summary_syscall,
)


def generate_description_with_retry(comparison, service, fault_type, max_retries=20, retry_delay=30):
    """
    Generate a description with the LLM, retrying while the cleaned output is empty.

    Each attempt formats the module-level ``prompt`` with the given values,
    invokes the module-level ``llm`` and passes the response content through
    ``clean_llm_output``. The first attempt whose cleaned text is non-empty
    is returned immediately.

    Args:
        comparison: Comparison result, inserted into the prompt as ``input``.
        service: Service name inserted into the prompt.
        fault_type: Fault type description inserted into the prompt.
        max_retries: Maximum number of attempts. A value <= 0 performs no
            attempt at all.
        retry_delay: Seconds to sleep between a failed attempt and the next
            one. No sleep happens after the final attempt.

    Returns:
        tuple: (raw_description, clean_description, elapsed_time).
        On success, the raw LLM text, its cleaned form and the duration in
        seconds of the successful LLM call. When every attempt fails,
        ``("", "", elapsed_time)`` where ``elapsed_time`` is the duration of
        the last attempt, or ``0.0`` if no attempt was made.
    """
    # Pre-initialised so the failure return is well-defined even when the
    # loop body never runs (max_retries <= 0).
    elapsed_time = 0.0

    for attempt in range(max_retries):
        # Monotonic clock: the measured duration cannot be skewed by
        # system clock adjustments.
        start_time = time.perf_counter()

        response = llm.invoke(
            prompt.format(input=comparison, service=service, fault_type=fault_type),
        )
        raw_description = response.content

        elapsed_time = time.perf_counter() - start_time

        # Clean the LLM output (per the original note, this strips the
        # <think></think> section).
        clean_description = clean_llm_output(raw_description)

        if clean_description:
            return raw_description, clean_description, elapsed_time

        if attempt + 1 >= max_retries:
            # Last attempt failed: report it, but do not announce a retry
            # or wait for one that will never happen.
            print(f"description生成失败: {raw_description}")
            break

        print(f"description生成失败: {raw_description}\n正在进行第{attempt+1}次重试...")
        time.sleep(retry_delay)

    return "", "", elapsed_time


# Services to process, in order.
services = [
    "adservice",
    "checkoutservice",
    "emailservice",
    "frontend",
    "recommendationservice",
]

# Fault types processed for every service.
fault_types = [
    FaultType.FUTEX_DELAY,
    FaultType.WRITE_DELAY,
    FaultType.READ_DELAY,
    FaultType.EPOLL_WAIT_DELAY,
]

# Running total of graphs described and saved during this run.
count = 0

# Main driver: for every service and fault type, compare each fault graph
# with the service's common graph, ask the LLM for a description, and pickle
# the annotated graph. Graphs whose output file already exists are skipped,
# so an interrupted run can be resumed.
for service in services:
    print(f"处理服务: {service}")

    # Load the common graph for this service.
    common_graph = graphbuilder.load_common_graph(service, generate=True)

    for fault_type in fault_types:
        print(f"{timestamp_readable()}处理故障类型: {fault_type}")

        # Load the fault graphs for this service / fault type.
        graph_dir = fault_type.path.format(service)
        fault_graphs, _ = graphbuilder.load_graph_from_glob_directory(graph_dir, fault_type.code, generate=False)

        # Output goes to a parallel tree under "data_fault_description".
        # The "nx" sub-directory is the same for every graph of this fault
        # type, so it is created once here (this also creates output_dir).
        output_dir = graph_dir.replace("data_fault", "data_fault_description")
        nx_dir = os.path.join(output_dir, 'nx')
        os.makedirs(nx_dir, exist_ok=True)

        for i, fault_graph in enumerate(fault_graphs):
            # Output file is named after the source graph file.
            output_filename = os.path.basename(fault_graph.filename) + '.gpickle'
            output_file_path = os.path.join(nx_dir, output_filename)

            # Resume support: skip graphs that were already written.
            if os.path.exists(output_file_path):
                print(f"跳过图{i+1}: {fault_graph.filename} - 输出文件已存在")
                continue

            print(f"处理图{i+1}: {fault_graph.filename}")

            # Compare the fault graph against the common graph.
            comparison = graphbuilder.compare_call_graph(fault_graph, common_graph)

            # Generate the description; the helper retries on empty output.
            raw_description, clean_description, elapsed_time = generate_description_with_retry(
                comparison, service, fault_type.description
            )
            if not clean_description:
                print(f"{timestamp_readable()}生成失败，进程退出")
                # SystemExit is always available; the exit() helper is only
                # injected by the site module.
                raise SystemExit(1)

            print(f"LLM生成耗时: {elapsed_time:.2f}秒")
            print(f"LLM description: {clean_description}")

            # Attach the results to the graph object before saving it.
            fault_graph.description = clean_description
            fault_graph.fault = fault_type.function_name
            fault_graph.fault_description = fault_type.description

            # Write to a temporary file and rename it into place, so an
            # interrupted run never leaves a truncated pickle that the
            # existence check above would treat as finished.
            tmp_file_path = output_file_path + '.tmp'
            with open(tmp_file_path, 'wb') as f:
                pickle.dump(fault_graph, f)
            os.replace(tmp_file_path, output_file_path)

            count += 1
            print(f"{timestamp_readable()}处理{count}个图，已保存结果到: {output_file_path}")