-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
92 lines (76 loc) · 2.79 KB
/
main.py
File metadata and controls
92 lines (76 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Copyright (c) 2025 Zichen Zhao
# Columbia University School of Social Work
# Licensed under the MIT Academic Research License
# See LICENSE file in the project root for details.
"""
Main execution script for the benchmark pipeline.
"""
from __future__ import annotations
import pandas as pd
from src.commonconst import *
from src.data.data_processing import (
extract_text_from_docx,
save_processed_files,
)
from src.utils.evaluation_algo import (
ensure_output_dirs,
generate_evaluation_scores,
generate_identity_dimension_scores,
generate_safety_dimension_scores,
save_evaluation_to_csv,
)
from src.utils.output_processing import (
process_all_outputs,
build_overall_summary_table,
save_overall_summary_table,
)
def main():
    """Run the full benchmark pipeline end to end.

    Stages:
      1. Extract raw text from the reference and chatbot .docx files.
      2. Persist the processed and integrated intermediate CSVs.
      3. Reload the integrated responses for scoring.
      4. Compute and save the primary continuous evaluation metrics.
      5. Compute the triangulated identity and safety dimension scores.
      6. Build and save the merged overall summary table.
      7. Generate plots and report every output location.
    """
    ensure_output_dirs()

    # Stage 1: pull raw text out of both source documents.
    ref_text = extract_text_from_docx(REFERENCE_DOCX_PATH)
    bot_text = extract_text_from_docx(CHATBOT_DOCX_PATH)

    # Stage 2: write per-source and integrated intermediate files.
    save_processed_files(
        chatbot_text=bot_text,
        reference_text=ref_text,
        chatbot_output_path=CHATBOT_PROCESSED_CSV_PATH,
        reference_output_path=REFERENCE_PROCESSED_CSV_PATH,
        integrated_output_path=INTEGRATED_OUTPUT_CSV_PATH,
    )

    # Stage 3: read the integrated responses back in for scoring.
    responses = pd.read_csv(INTEGRATED_OUTPUT_CSV_PATH)

    # Stage 4: primary continuous metrics, persisted immediately.
    eval_scores = generate_evaluation_scores(
        responses,
        include_overall_average=True,
    )
    save_evaluation_to_csv(OUTPUT_CSV_PATH, eval_scores)

    # Stage 5: triangulated dimension scores.
    # NOTE(review): the per-dimension CSVs reported below are presumably
    # written inside the generate_* helpers — confirm in evaluation_algo.
    identity_scores = generate_identity_dimension_scores(
        responses,
        include_overall_average=True,
    )
    safety_scores = generate_safety_dimension_scores(
        responses,
        include_overall_average=True,
    )

    # Stage 6: merge all score tables into one overall summary.
    summary = build_overall_summary_table(
        evaluation_df=eval_scores,
        identity_df=identity_scores,
        safety_df=safety_scores,
        include_overall_average=True,
    )
    save_overall_summary_table(summary, OVERALL_SUMMARY_CSV_PATH)

    # Stage 7: plots, then a short report of where everything landed.
    process_all_outputs(eval_scores, identity_scores, safety_scores)
    print("Benchmark evaluation complete.")
    print(f"Main results saved to: {OUTPUT_CSV_PATH}")
    print(f"Integrated responses saved to: {INTEGRATED_OUTPUT_CSV_PATH}")
    print(f"Identity dimension results saved to: {IDENTITY_DIMENSION_CSV_PATH}")
    print(f"Safety dimension results saved to: {SAFETY_DIMENSION_CSV_PATH}")
    print(f"Overall summary saved to: {OVERALL_SUMMARY_CSV_PATH}")
    print(f"Plots saved to: {PLOTS_DIR}")
    print(f"Dimension plots saved to: {SENSITIVITY_DIR}")


if __name__ == "__main__":
    main()