# Source: fife/evaluation_lib.py at main · gtfintechlab/fife · GitHub

# Copyright 2025 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This code has been adapted from the original IFEval implementation
# for the IFF (Instruction Following for Finance) project.
# Modifications include adaptation for finance-specific instruction evaluation.

import dataclasses
import difflib
import json

import instructions_registry as reg
import instructions_util as iu


@dataclasses.dataclass
class InputExample:
    key: int
    instruction_id_list: list[str]
    prompt: str
    kwargs: list[dict[str, str | int | None]]


@dataclasses.dataclass
class OutputExample:
    """Evaluation verdict for a single prompt/response pair."""

    # Instruction ids that were checked, in order.
    instruction_id_list: list[str]
    # The prompt the response was generated for.
    prompt: str
    # The response text that was evaluated.
    response: str
    # True only when every entry of follow_instruction_list is True.
    follow_all_instructions: bool
    # One verdict per instruction, aligned with instruction_id_list.
    follow_instruction_list: list[bool]

def read_prompt_list(path):
    """Load evaluation prompts from a JSON Lines file.

    Each non-blank line must be a JSON object providing the keys ``key``,
    ``instruction_id_list``, ``prompt`` and ``kwargs``.

    Args:
        path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list of ``InputExample`` objects in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    examples = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of letting json.loads fail on an empty document.
            if not line.strip():
                continue
            record = json.loads(line)
            examples.append(
                InputExample(
                    key=record["key"],
                    instruction_id_list=record["instruction_id_list"],
                    prompt=record["prompt"],
                    kwargs=record["kwargs"],
                )
            )
    return examples


def read_prompt_to_response_dict(path):
    """Load a prompt -> response mapping from a JSON Lines file.

    Each non-blank line must be a JSON object with ``prompt`` and
    ``response`` keys. If the same prompt appears more than once, the last
    occurrence wins.

    Args:
        path: Path of the UTF-8 encoded JSONL file. A falsy value (``None``
            or ``""``) yields an empty mapping without touching the disk.

    Returns:
        A dict mapping each prompt to its response.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``prompt`` or ``response``.
    """
    prompt_to_response = {}
    if not path:
        return prompt_to_response
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of letting json.loads fail on an empty document.
            if not line.strip():
                continue
            record = json.loads(line)
            prompt_to_response[record["prompt"]] = record["response"]
    return prompt_to_response


def write_outputs(path, outputs):
    """Write evaluation results to ``path`` as JSON Lines.

    Args:
        path: Destination file; it is created or overwritten.
        outputs: Iterable of dataclass instances. Each one is converted with
            ``dataclasses.asdict`` and written as one JSON object per line.
    """
    # Explicit UTF-8 keeps the writer consistent with the readers in this
    # module instead of depending on the platform's locale encoding.
    with open(path, "w", encoding="utf-8") as f:
        for o in outputs:
            f.write(json.dumps(dataclasses.asdict(o)) + "\n")


def _build_and_check(instruction_id, kwargs, prompt, responses: list[str]) -> bool:
    """Return True if any non-blank response satisfies the given instruction.

    Args:
        instruction_id: Instruction identifier; it is normalised with
            ``reg.canonical_id`` before the registry lookup.
        kwargs: Keyword arguments forwarded to the checker's
            ``build_description``. ``None`` or an empty mapping means the
            checker is built without arguments.
        prompt: The originating prompt. Not used by this function; kept so
            the signature stays stable for callers.
        responses: Candidate response strings, tried in order.
            Whitespace-only candidates are never passed to the checker.

    Returns:
        True as soon as one candidate passes ``check_following``, else False.

    Raises:
        KeyError: If the canonical id is not present in
            ``reg.INSTRUCTION_DICT``. The message lists the closest known ids.
    """
    canonical = reg.canonical_id(instruction_id)
    try:
        checker_cls = reg.INSTRUCTION_DICT[canonical]
    except KeyError:
        candidates = list(reg.INSTRUCTION_DICT.keys())
        near = difflib.get_close_matches(instruction_id, candidates, n=5, cutoff=0.6)
        # "from None" drops the bare lookup KeyError from the traceback; the
        # message below already carries everything useful about the failure.
        raise KeyError(
            f"Unknown instruction id: {instruction_id} (canonical: {canonical}). "
            f"Closest matches: {near}"
        ) from None
    checker = checker_cls(canonical)
    # Unpacking an empty mapping is the same as calling with no arguments.
    checker.build_description(**(kwargs or {}))
    return any(r.strip() and checker.check_following(r) for r in responses)


def test_instruction_following_strict(inp: InputExample, prompt_to_response: dict):
    """Check the raw response for ``inp.prompt`` against every instruction.

    The response is looked up in ``prompt_to_response``; a missing prompt is
    treated as an empty response. An instruction whose index has no entry in
    ``inp.kwargs`` is checked with empty keyword arguments.

    Returns:
        An ``OutputExample`` holding the per-instruction verdicts and their
        conjunction.
    """
    response = prompt_to_response.get(inp.prompt, "")
    kwargs_count = len(inp.kwargs)
    verdicts = [
        _build_and_check(
            instruction_id,
            inp.kwargs[position] if position < kwargs_count else {},
            inp.prompt,
            [response],
        )
        for position, instruction_id in enumerate(inp.instruction_id_list)
    ]
    return OutputExample(
        instruction_id_list=inp.instruction_id_list,
        prompt=inp.prompt,
        response=response,
        follow_all_instructions=all(verdicts),
        follow_instruction_list=verdicts,
    )


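# NOTE: Previous inline implementation of the loose check, retained for
# reference only. The active version below obtains its response variants from
# iu.make_loose_variants instead of building them here.
# def test_instruction_following_loose(inp: InputExample, prompt_to_response: dict):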
#     response = prompt_to_response.get(inp.prompt, "")
#     lines = response.split("\n")
#     variants = [
#         response,
#         "\n".join(lines[1:]).strip(),
#         "\n".join(lines[:-1]).strip(),
#         "\n".join(lines[1:-1]).strip(),
#         response.replace("*", ""),
#     ]
#     is_ok = []
#     for i, iid in enumerate(inp.instruction_id_list):
#         ok = _build_and_check(
#             iid, (inp.kwargs[i] if i < len(inp.kwargs) else {}), inp.prompt, variants
#         )
#         is_ok.append(ok)
#     return OutputExample(inp.instruction_id_list, inp.prompt, response, all(is_ok), is_ok)
def test_instruction_following_loose(inp: InputExample, prompt_to_response: dict):
    """Check relaxed variants of the response against every instruction.

    The response for ``inp.prompt`` (empty string when missing) is expanded
    by ``iu.make_loose_variants``; an instruction counts as followed when any
    variant passes. An instruction whose index has no entry in ``inp.kwargs``
    is checked with empty keyword arguments.

    Returns:
        An ``OutputExample`` holding the original response, the
        per-instruction verdicts and their conjunction.
    """
    response = prompt_to_response.get(inp.prompt, "")
    variants = iu.make_loose_variants(response)
    kwargs_count = len(inp.kwargs)
    verdicts = [
        _build_and_check(
            instruction_id,
            inp.kwargs[position] if position < kwargs_count else {},
            inp.prompt,
            variants,
        )
        for position, instruction_id in enumerate(inp.instruction_id_list)
    ]
    return OutputExample(
        instruction_id_list=inp.instruction_id_list,
        prompt=inp.prompt,
        response=response,
        follow_all_instructions=all(verdicts),
        follow_instruction_list=verdicts,
    )


def print_report(outputs: list[OutputExample]):
    """Print prompt-level and instruction-level accuracy and return the metrics.

    Prompt-level accuracy is the share of outputs whose
    ``follow_all_instructions`` is truthy; instruction-level accuracy is the
    share of individual instructions that were followed. Either accuracy is
    reported as 0 when its denominator is zero.

    Args:
        outputs: Evaluated examples to aggregate.

    Returns:
        A dict with both accuracies plus the raw totals and correct counts.
    """
    prompt_total = len(outputs)
    prompt_correct = 0
    instr_total = 0
    instr_correct = 0
    for result in outputs:
        if result.follow_all_instructions:
            prompt_correct += 1
        instr_total += len(result.instruction_id_list)
        instr_correct += sum(result.follow_instruction_list)

    if prompt_total:
        prompt_accuracy = prompt_correct / prompt_total
    else:
        prompt_accuracy = 0
    if instr_total:
        instr_accuracy = instr_correct / instr_total
    else:
        instr_accuracy = 0

    print(f"prompt-level: {prompt_accuracy:.3f}")
    print(f"instruction-level: {instr_accuracy:.3f}")

    metrics = {
        "prompt-level": prompt_accuracy,
        "instruction-level": instr_accuracy,
        "prompt_total": prompt_total,
        "prompt_correct": prompt_correct,
        "instruction_total": instr_total,
        "instruction_correct": instr_correct,
    }
    return metrics