forked from ChadiHelwe/FailBERT
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_utils.py
More file actions
101 lines (86 loc) · 2.87 KB
/
run_utils.py
File metadata and controls
101 lines (86 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__author__ = Chadi Helwe
__version__ = 1.0
__maintainer__ = Chadi Helwe
__email__ = chadi.helwe@telecom-paris.fr
__description__ = CLI to use methods from utils.py
"""
import click
from failBERT.utils import create_equally_distributed_dataset, split_dataset
@click.group()
def cli() -> None:
    # Root click group: it performs no work of its own and only acts as the
    # container to which the dataset sub-commands are attached.
    return None
@click.command()
@click.option("--path_dataset")
@click.option("--path_equally_distributed_dataset")
@click.option("--limit", default=True, type=bool)
@click.option("--nbr_instances", default=5000, type=int)
def create_equally_distributed_natural_dataset(
    path_dataset: str,
    path_equally_distributed_dataset: str,
    limit: bool,
    nbr_instances: int,
) -> None:
    """
    Command to create an equally distributed dataset from an unequally distributed dataset

    :param path_dataset: Path of the dataset
    :type path_dataset: str
    :param path_equally_distributed_dataset: Path to save the equally distributed dataset
    :type path_equally_distributed_dataset: str
    :param limit: If true the limitation is based on the next parameter. Otherwise, the limitation is based on the positive instances, defaults to True
    :type limit: bool, optional
    :param nbr_instances: Number of positive and negative instances, defaults to 5000
    :type nbr_instances: int, optional
    """
    # The option types are declared explicitly (they match what click infers
    # from the defaults), so ``nbr_instances`` already arrives as an ``int``
    # and needs no further conversion before being forwarded.
    create_equally_distributed_dataset(
        path_dataset, path_equally_distributed_dataset, limit, nbr_instances
    )
@click.command()
@click.option("--path_dataset")
@click.option("--path_train")
@click.option("--path_val")
@click.option("--path_test")
@click.option("--passages_column", default="modified_sentence")
@click.option("--labels_column", default="label")
@click.option("--upsample", default=False)
def split_natural_dataset(
    path_dataset: str,
    path_train: str,
    path_val: str,
    path_test: str,
    passages_column: str,
    labels_column: str,
    upsample: bool,
) -> None:
    """
    Command to split a dataset into training (60%), validation (20%) and testing (20%) sets

    :param path_dataset: Path of the dataset to split
    :type path_dataset: str
    :param path_train: Path where the training set is saved
    :type path_train: str
    :param path_val: Path where the validation set is saved
    :type path_val: str
    :param path_test: Path where the testing set is saved
    :type path_test: str
    :param passages_column: Name of the passages column
    :type passages_column: str
    :param labels_column: Name of the labels column
    :type labels_column: str
    :param upsample: Upsample the training set for data augmentation, defaults to False
    :type upsample: bool, optional
    """
    # Group the arguments by role; they are forwarded positionally in the
    # order: dataset path, train/val/test paths, column names, upsample flag.
    output_paths = (path_train, path_val, path_test)
    column_names = (passages_column, labels_column)
    split_dataset(path_dataset, *output_paths, *column_names, upsample)
# Attach each sub-command to the root ``cli`` group.
for _command in (
    create_equally_distributed_natural_dataset,
    split_natural_dataset,
):
    cli.add_command(_command)

# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    cli()