-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path2.responses_to_table.py
More file actions
110 lines (101 loc) · 4.23 KB
/
2.responses_to_table.py
File metadata and controls
110 lines (101 loc) · 4.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python3
"""
Iterative text-processing script for the ChatGPT API.

Extracts the brand information contained in the LLM responses, enriches it
with further research, and writes the result out in table format.
"""
import json
import os
import re
from datetime import datetime

import pandas as pd
import requests

# NOTE(review): star import supplies ChatGPT_key, OpenAI, gpt_chat, ... —
# confirm the exact names exported by ki_functions.
from ki_functions import *

########################################################################################################################
# Dependencies:
#   pip install openai openpyxl tabulate
########################################################################################################################

# The API key must be in the environment before the client is created.
os.environ["OPENAI_API_KEY"] = ChatGPT_key
# Initialise the OpenAI client (no arguments — the key is read from the env).
client = OpenAI()

llm_model = "gpt-5.4"
# llm_model = "sonar"

# Text file holding the synthesis prompt that is prepended to every response.
responses_synthesis_filename = "prompt_responses_synthesis" + ".txt"
# Working directory containing the prompt file and the ./responses folder.
file_path = r"C:\Users\andre\OneDrive\Desktop\KI-Performance Versicherungen 2026"
if __name__ == '__main__':
    os.chdir(file_path)

    # Synthesis prompt that is prepended to every single response before the
    # LLM call; collapsed to one line.
    with open(responses_synthesis_filename, "r", encoding="utf-8") as f:
        response_synthesis = f.read()
    if '\n' in response_synthesis:
        response_synthesis = response_synthesis.replace('\n', ' ')

    os.chdir('./responses')
    # BUGFIX: use endswith() so e.g. 'foo.txt.bak' is no longer picked up
    # (the original tested ".txt" as a substring anywhere in the name).
    file_list = sorted([f for f in os.listdir()
                        if f.endswith('.txt') and 'full_responses' in f])

    start_at = 0  # set > 0 to resume a previously interrupted run
    for n, source_file_filename in enumerate(file_list):
        print(source_file_filename)
        if n < start_at:
            continue

        # BUGFIX: reset per source file. Previously final_table was created
        # once before this loop, so every later model's Excel file also
        # contained all earlier models' rows.
        final_table = []

        model_name = source_file_filename.replace('full_responses', '').replace('.txt', '').replace('_', '')

        # Source file with the raw responses in text format.
        with open(source_file_filename, "r", encoding="utf-8") as f:
            source_file = f.read()

        # Responses are delimited by numbered headers "1:\n" ... "50:\n";
        # re.split yields a preamble plus the 50 response bodies (51 parts).
        patterns = re.findall(r'(?:[1-9]|[1-4]\d|50):\n', source_file)
        responses_list = re.split(r'(?:[1-9]|[1-4]\d|50):\n', source_file)
        if len(responses_list) != 51:
            print(f'Abweichende Anzahl: {len(responses_list)}')
            print(patterns)
            continue

        for ID, response in enumerate(responses_list):
            # Index 0 is the preamble before the first header; index 1 is also
            # skipped as in the original code.
            # NOTE(review): confirm the first response is intentionally excluded.
            if ID <= 1:
                continue
            if len(response) <= 3:
                continue
            # Sanity check: header at patterns[ID-1] must carry this ID.
            if str(ID) not in patterns[ID - 1]:
                break
            # Drop a very short trailing line (e.g. a stray numbering fragment).
            if response.count('\n') >= 1:
                response_s = response.rsplit('\n', 1)[0]
                if len(response) - len(response_s) < 4:
                    response = response_s

            full_prompt = response_synthesis + "\n" + response
            print(f"{ID}: {response}")
            table_format = gpt_chat(client, llm_model, full_prompt)
            # table_format = perplexity_chat(llm_model, full_prompt)

            # The model is expected to answer with semicolon-separated rows
            # that start with a rank digit; keep only those lines.
            for line in table_format.split('\n'):
                # BUGFIX: guard empty lines — line[0] raised IndexError.
                if not line or not line[0].isdigit():
                    continue
                row = line.split(';')
                if len(row) != 7:
                    print(f'Abweichende Spalten: {len(row)}')
                    print(len(row), row)
                if len(row) < 7:
                    # A URL at index 3 means the 'Produkt' column is missing.
                    if 'http' in row[3]:
                        row.insert(4, '')
                final_table.append([ID] + row)

        # Normalize every row to exactly 8 columns (ID + 7 table columns).
        new_table = []
        for row in final_table:
            if len(row) >= 8:
                # Fold any overhang columns into the last (description) field.
                overhang = [str(e).strip() for e in row[7:] if len(str(e).strip()) > 4]
                row = row[:7] + ['; '.join(overhang)]
            if len(row) < 8:
                if 'http' in row[3]:
                    row.insert(4, '')
                else:
                    row.insert(-1, '')
            new_table.append(row)

        header = ['Anfrage', 'Rang', 'Firma', 'Marke', 'Website', 'Produkt', 'Quellen',
                  'Wörtliche Beschreibung der Marke im Chat']
        df_responses = pd.DataFrame(new_table, columns=header)
        dt_str_now = datetime.now().strftime("%Y-%m-%d_%H_%M_%S")
        filename = model_name + '_responses_table_' + dt_str_now + '.xlsx'
        df_responses.to_excel(filename)