# PolyEval/polyeval/evaluators/json.py at main · EvalsOne/PolyEval · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from polyeval.utils.helpers import is_valid_json, json_match
from typing import Any, Callable, Dict, List
from jsonschema import validate, ValidationError
from datasets import Dataset
import logging
import json
# Import-time side effect: configure the root logger at INFO level.
# basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.INFO)

class IsJson:
    """Evaluator that scores an answer by whether it is valid JSON.

    The validity check itself is delegated to ``is_valid_json`` from
    ``polyeval.utils.helpers``.
    """

    name = 'is_json'

    def __init__(self, lang='zh'):
        # Stored on the instance; ``eval`` in this class does not read it.
        self.language = lang

    def eval(self, dataset: "Dataset", **kwargs):
        """Score a single sample.

        Args:
            dataset: Object indexed with the keys ``"question"`` and
                ``"answer"``.
            **kwargs: Accepted but unused by this evaluator.

        Returns:
            The tuple ``(False, "No dataset provided")`` when ``dataset`` is
            ``None``; otherwise a dict with the keys ``question``,
            ``sampled``, ``score`` (1 on pass, 0 on fail) and ``pass_eval``.
        """
        if dataset is None:
            return False, "No dataset provided"

        question = dataset["question"]
        sampled = dataset["answer"]

        pass_eval = is_valid_json(sampled)

        # Building this dict cannot fail, so no exception handling is needed.
        return {
            'question': question,
            'sampled': sampled,
            "score": 1 if pass_eval else 0,
            "pass_eval": pass_eval,
        }

class JsonMatch:
    """Evaluator that compares an answer with reference JSON documents.

    The answer and every reference are parsed with ``json.loads`` and then
    compared with ``json_match`` from ``polyeval.utils.helpers``. The sample
    passes when at least one reference matches.
    """

    name = 'json_match'

    def __init__(self, lang='zh'):
        # Stored on the instance; ``eval`` in this class does not read it.
        self.language = lang

    def eval(self, dataset: "Dataset", **kwargs):
        """Score a single sample against its reference answers.

        Args:
            dataset: Object indexed with the keys ``"question"``,
                ``"answer"`` and ``"ideal"``. ``"ideal"`` is an iterable of
                JSON strings; a single string is treated as one reference.
            **kwargs: Accepted but unused by this evaluator.

        Returns:
            The tuple ``(False, "No dataset provided")`` when ``dataset`` is
            ``None``; otherwise a dict with the keys ``question``,
            ``sampled``, ``ideal``, ``picked`` (a list holding the answer
            once per matching reference), ``score`` (1 or 0) and
            ``pass_eval``.

        Raises:
            ValueError: If one of the reference answers is not valid JSON.
        """
        if dataset is None:
            return False, "No dataset provided"

        question = dataset["question"]
        sampled = dataset["answer"]
        correct_answers = dataset["ideal"]

        # A bare string would otherwise be iterated character by character.
        references = (
            [correct_answers]
            if isinstance(correct_answers, str)
            else correct_answers
        )

        sampled_json: Any
        try:
            sampled_json = json.loads(sampled)
        except (ValueError, TypeError):
            # An answer that is not valid JSON (or not a string at all) is
            # compared as None.
            sampled_json = None

        # Deliberately not guarded: the reference answers should always be
        # valid JSON, so a ValueError here signals bad evaluation data.
        correct_json = [json.loads(reference) for reference in references]

        matches = [json_match(sampled_json, cj) for cj in correct_json]
        # A plain list: the original trailing comma wrapped it in a tuple.
        picked = [sampled for matched in matches if matched]
        pass_eval = True in matches

        return {
            'question': question,
            'sampled': sampled,
            'ideal': correct_answers,
            'picked': picked,
            "score": 1 if pass_eval else 0,
            "pass_eval": pass_eval,
        }

class JsonSchemaMatch:
    """Evaluator that validates an answer against a JSON Schema.

    The answer is parsed with ``json.loads`` and checked with
    ``jsonschema.validate`` against the schema passed as the ``schema``
    keyword argument of ``eval``.
    """

    name = 'json_schema_match'

    def __init__(self, lang='zh'):
        # Stored on the instance; ``eval`` in this class does not read it.
        self.language = lang

    def eval(self, dataset: "Dataset", **kwargs):
        """Score a single sample by schema conformance.

        Args:
            dataset: Object indexed with the keys ``"question"`` and
                ``"answer"``.
            **kwargs: Must contain ``schema``, the JSON Schema to validate
                against.

        Returns:
            ``(False, "No dataset provided")`` when ``dataset`` is ``None``
            and ``(False, "Schema not provided")`` when no schema is given;
            otherwise a dict with the keys ``question``, ``score`` (1 or 0),
            ``pass_eval``, ``sampled``, ``is_valid`` and ``reasoning`` (the
            failure message, or ``""`` on success).
        """
        if dataset is None:
            return False, "No dataset provided"

        schema = kwargs.get('schema')
        # Any falsy schema (None, {} ...) is treated as missing, as before.
        if not schema:
            return False, "Schema not provided"

        question = dataset["question"]
        sampled = dataset["answer"]

        is_valid = False
        error_msg = ""

        try:
            sampled_json = json.loads(sampled)
        except (ValueError, TypeError) as exc:
            # Not parseable (or not a string): report why instead of leaving
            # the reasoning empty.
            error_msg = f"Answer is not valid JSON: {exc}"
        else:
            # Validate in the else-branch rather than testing for None, so a
            # JSON ``null`` answer is still checked against the schema.
            try:
                validate(instance=sampled_json, schema=schema)
                is_valid = True
            except ValidationError as exc:
                error_msg = str(exc)

        return {
            'question': question,
            'score': 1 if is_valid else 0,
            'pass_eval': is_valid,
            'sampled': sampled,
            'is_valid': is_valid,
            'reasoning': error_msg,
        }