-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathget_test_sample_for_annotations.py
More file actions
49 lines (35 loc) · 1.65 KB
/
get_test_sample_for_annotations.py
File metadata and controls
49 lines (35 loc) · 1.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from operator import itemgetter
import json
import random
import argparse
# Seed the module-level random generator at import time so that the
# random.sample call below is reproducible across runs (for the same input
# file and sample size).
random.seed(42)
def create_sample_from_testfile(test_file, sample_size=100):
    """Draw a random sample of instances from a JSON Lines test file.

    Every non-blank line of ``test_file`` is parsed as one JSON value. A
    sample of ``sample_size`` distinct instances is drawn with
    ``random.sample`` (so the result depends on the state of the global
    ``random`` generator) and returned sorted by each instance's ``'id'``.

    Args:
        test_file: Path to a UTF-8 encoded JSONL file, one instance per line.
        sample_size: Number of instances to draw. Defaults to 100.

    Returns:
        A list of ``sample_size`` parsed instances, ordered by ``'id'``.

    Raises:
        ValueError: If ``sample_size`` is negative or larger than the number
            of instances found in the file.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a sampled instance is a dict without an ``'id'`` key.
    """
    with open(test_file, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line) carry no instance and
        # would make json.loads fail, so they are skipped.
        data_list = [json.loads(line) for line in f if line.strip()]
    print(f"Total number of test instances: {len(data_list)}")

    # Fail with an explicit message instead of random.sample's generic one;
    # the exception type (ValueError) is the same either way.
    if not 0 <= sample_size <= len(data_list):
        raise ValueError(
            f"sample_size must be between 0 and {len(data_list)} "
            f"(the number of test instances), got {sample_size}"
        )

    sample_list = random.sample(data_list, sample_size)
    print(f"Number of sample instances: {len(sample_list)}")
    # Sort so the output order is stable and easy to follow for annotators.
    return sorted(sample_list, key=itemgetter('id'))
def write_the_sample(sample_list, outfile):
    """Write the sampled instances to ``outfile`` in JSON Lines format.

    Each instance is serialised as one JSON document per line. Non-ASCII
    characters are written as-is (``ensure_ascii=False``) in UTF-8.

    Args:
        sample_list: Iterable of JSON-serialisable instances.
        outfile: Path of the file to create or overwrite.
    """
    with open(outfile, 'w', encoding='utf-8') as f:
        # json.dumps returns the serialised string. json.dump writes to the
        # file itself and returns None, so formatting its return value into
        # the line appended a literal "None" after every record.
        f.writelines(
            f"{json.dumps(instance, ensure_ascii=False)}\n"
            for instance in sample_list
        )
def main(argv=None):
    """Parse command-line options, draw the sample and write it to disk.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) makes argparse read
            ``sys.argv[1:]``, which is the original behaviour.
    """
    print("Creating sample from test file...")
    parser = argparse.ArgumentParser(
        description="Draw a random sample from a JSONL test file for annotation."
    )
    parser.add_argument(
        "--testfile",
        type=str,
        default="/home/finapolat/GenIE/data/rebel/en_test.jsonl",
        help="Path to the JSONL test file to sample from.",
    )
    # NOTE: despite its name, this option holds the path of the output FILE,
    # not a directory; the name is kept so existing invocations keep working.
    parser.add_argument(
        "--outfolder",
        type=str,
        default="/home/finapolat/KGC-LLM/sample_data/sample_from_testdata_for_annotations.jsonl",
        help="Path of the JSONL file the sample is written to (a file, not a folder).",
    )
    parser.add_argument(
        "--sample_size",
        type=int,
        default=100,
        help="Number of instances to sample.",
    )
    args = parser.parse_args(argv)

    sample_list = create_sample_from_testfile(
        test_file=args.testfile, sample_size=args.sample_size
    )
    write_the_sample(sample_list, args.outfolder)
# Script entry point: announce the run, then hand over to main(). Nothing
# here executes when the module is imported.
if __name__ == "__main__":
    print("Running get_test_sample_for_annotations.py...")
    main()