-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcrowdflower_sampling.py
More file actions
30 lines (22 loc) · 875 Bytes
/
crowdflower_sampling.py
File metadata and controls
30 lines (22 loc) · 875 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import os
import random
import csv
# Build crowdflower_data/samples.csv: for every dataset file in data/txt,
# draw a random sample of its lines and write one (dataset, line) row each.
# Paths are relative to the current working directory.
CROWDFLOWER_DIR = "crowdflower_data"
SAMPLES_CSV = CROWDFLOWER_DIR + "/samples.csv"
TXT_DIR = "data/txt"
# Directory entries that are not datasets and must never be sampled.
SKIPPED_FILES = {".keep", "coherent_sentences.txt"}
# Maximum number of lines drawn from each dataset file.
SAMPLE_SIZE = 51

# Mode 'w' truncates any previous output, so no separate "clear the file"
# open is needed. newline='' lets the csv module control line endings itself
# (otherwise rows end in "\r\r\n" on Windows).
with open(SAMPLES_CSV, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Dataset", "Sample"])

    for filename in os.listdir(TXT_DIR):
        if filename in SKIPPED_FILES:
            continue

        # Read the file once and close it deterministically.
        with open(os.path.join(TXT_DIR, filename), 'r') as txtfile:
            lines = txtfile.readlines()

        # Candidate indices start at 1, so the first line of each file is
        # never sampled (same as the original script).
        # NOTE(review): presumably line 0 is a header or this is an
        # off-by-one -- confirm against the data files.
        candidates = range(1, len(lines))

        # Cap the sample size so files with fewer than SAMPLE_SIZE candidate
        # lines contribute what they have instead of raising ValueError.
        sample_count = min(SAMPLE_SIZE, len(candidates))
        chosen = sorted(random.sample(candidates, sample_count))

        # Rows are emitted in file order; each sampled line is written
        # unmodified (including its trailing newline, as before).
        # NOTE(review): consider stripping the newline if the consumer of
        # samples.csv does not expect it -- left unchanged here.
        for index in chosen:
            writer.writerow([filename, lines[index]])