-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
92 lines (76 loc) · 2.79 KB
/
main.py
File metadata and controls
92 lines (76 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Copyright (c) 2025 Zichen Zhao
# Columbia University School of Social Work
# Licensed under the MIT Academic Research License
# See LICENSE file in the project root for details.
"""
Main execution script for the benchmark pipeline.
"""
from __future__ import annotations
import pandas as pd
from src.commonconst import *
from src.data.data_processing import (
extract_text_from_docx,
save_processed_files,
)
from src.utils.evaluation_algo import (
ensure_output_dirs,
generate_evaluation_scores,
generate_identity_dimension_scores,
generate_safety_dimension_scores,
save_evaluation_to_csv,
)
from src.utils.output_processing import (
process_all_outputs,
build_overall_summary_table,
save_overall_summary_table,
)
def main():
    """Run the full benchmark pipeline end to end.

    Stages:
      1. Extract raw text from the reference and chatbot .docx files.
      2. Persist the processed and integrated intermediate CSVs.
      3. Reload the integrated responses for scoring.
      4. Compute and save the primary continuous evaluation metrics.
      5. Compute the triangulated identity and safety dimension scores.
      6. Build and save the merged overall summary table.
      7. Generate plots and report every output location.
    """
    ensure_output_dirs()

    # Stage 1: pull raw text out of both source documents.
    ref_text = extract_text_from_docx(REFERENCE_DOCX_PATH)
    bot_text = extract_text_from_docx(CHATBOT_DOCX_PATH)

    # Stage 2: write per-source and integrated intermediate files.
    save_processed_files(
        chatbot_text=bot_text,
        reference_text=ref_text,
        chatbot_output_path=CHATBOT_PROCESSED_CSV_PATH,
        reference_output_path=REFERENCE_PROCESSED_CSV_PATH,
        integrated_output_path=INTEGRATED_OUTPUT_CSV_PATH,
    )

    # Stage 3: read the integrated responses back in for scoring.
    responses = pd.read_csv(INTEGRATED_OUTPUT_CSV_PATH)

    # Stage 4: primary continuous metrics, persisted immediately.
    eval_scores = generate_evaluation_scores(
        responses,
        include_overall_average=True,
    )
    save_evaluation_to_csv(OUTPUT_CSV_PATH, eval_scores)

    # Stage 5: triangulated dimension scores.
    # NOTE(review): the per-dimension CSVs reported below are presumably
    # written inside the generate_* helpers — confirm in evaluation_algo.
    identity_scores = generate_identity_dimension_scores(
        responses,
        include_overall_average=True,
    )
    safety_scores = generate_safety_dimension_scores(
        responses,
        include_overall_average=True,
    )

    # Stage 6: merge all score tables into one overall summary.
    summary = build_overall_summary_table(
        evaluation_df=eval_scores,
        identity_df=identity_scores,
        safety_df=safety_scores,
        include_overall_average=True,
    )
    save_overall_summary_table(summary, OVERALL_SUMMARY_CSV_PATH)

    # Stage 7: plots, then a short report of where everything landed.
    process_all_outputs(eval_scores, identity_scores, safety_scores)
    print("Benchmark evaluation complete.")
    print(f"Main results saved to: {OUTPUT_CSV_PATH}")
    print(f"Integrated responses saved to: {INTEGRATED_OUTPUT_CSV_PATH}")
    print(f"Identity dimension results saved to: {IDENTITY_DIMENSION_CSV_PATH}")
    print(f"Safety dimension results saved to: {SAFETY_DIMENSION_CSV_PATH}")
    print(f"Overall summary saved to: {OVERALL_SUMMARY_CSV_PATH}")
    print(f"Plots saved to: {PLOTS_DIR}")
    print(f"Dimension plots saved to: {SENSITIVITY_DIR}")


if __name__ == "__main__":
    main()