-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path temp.py
More file actions
56 lines (46 loc) · 3.42 KB
/
temp.py
File metadata and controls
56 lines (46 loc) · 3.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import torch
import pandas as pd
import argparse
from sklearn.model_selection import train_test_split

# CLI options: locations of the HASOC 2019/2020/2021, HOS and OLID
# hate-speech datasets.  Only --olid_traindata is consumed below; the
# other paths are retained so the script can be pointed at the other
# corpora without an interface change.
parser = argparse.ArgumentParser(description='Hate Speech Model')
parser.add_argument('--traindata', type=str, default='english_dataset/english_dataset.tsv', help='location of the training data')
parser.add_argument('--traindata2', type=str, default='English_2020/hasoc_2020_en_train_new.xlsx', help='location of the training data 2')
parser.add_argument('--traindata3', type=str, default='en_Hasoc2021_train.csv', help='location of the training data 3')
parser.add_argument('--testdata', type=str, default='english_dataset/hasoc2019_en_test-2919.tsv', help='location of the test data')
parser.add_argument('--testdata2', type=str, default='English_2020/hasoc_2020_en_test_new.xlsx', help='location of the test data 2')
parser.add_argument('--testdata3', type=str, default='en_Hasoc2021_test_task1.csv', help='location of the test data 3')
# HOS
parser.add_argument('--hos_traindata', type=str, default='/home/shared_data/h/HOS_labeled_data.csv', help='location of the training data')
# OLID
parser.add_argument('--olid_traindata', type=str, default='/home/shared_data/h/OLIDv1.0/olid-training-v1.0.tsv', help='location of the training data')
args = parser.parse_args()


def _load_olid(path):
    """Read the OLID training TSV and forward-fill missing values.

    ``.ffill()`` replaces the ``fillna(method="ffill")`` form, which is
    deprecated since pandas 2.1 and removed in later releases.
    """
    return pd.read_csv(path, sep='\t', header=0, encoding="latin1").ffill()


def main():
    """Split the OLID training data 90/10 into train/dev CSV files.

    Reads ``args.olid_traindata``, shuffles and splits it, and writes the
    two partitions next to the source file.  ``random_state`` pins the
    shuffle so the persisted split is reproducible across runs.
    """
    data1 = _load_olid(args.olid_traindata)
    traindata1, valdata1 = train_test_split(
        data1, test_size=0.1, shuffle=True, random_state=42)
    traindata1.to_csv('/home/shared_data/h/OLIDv1.0/olid-training.csv', index=False)
    valdata1.to_csv('/home/shared_data/h/OLIDv1.0/olid-dev.csv', index=False)


if __name__ == '__main__':
    main()