-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathIMW_Database_Extractor.py
More file actions
118 lines (96 loc) · 4.59 KB
/
IMW_Database_Extractor.py
File metadata and controls
118 lines (96 loc) · 4.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# take in file, pull out text wanted, output
# Need to fill in blanks or incomplete vouchers (columns 3-6, 8-9 and 11); column 7 is a marker.
# This program is designed to pull out the data needed for uploading to BOLD. The main issue is that the columns are hardcoded in.
# import re
import string
# def rewrite_cell(to_write, used_info):
# if to_write == "":
# to_write =
def taxonomy(list):
    """Return a taxonomy note when any of the given names looks irregular.

    A name is accepted as well-formed (and ignored) when it equals its own
    ``str.title()`` form; this includes the empty string.  Any other name is
    irregular when it is one of the placeholder words ``unknown``, ``family``,
    ``genus`` or ``species``, or when it contains an uppercase letter, a digit
    or an ASCII punctuation character.

    Args:
        list: Iterable of taxon name strings.  The name shadows the builtin
            ``list``; it is kept so existing callers keep working.

    Returns:
        ``""`` when no name is irregular; otherwise every name followed by a
        single space, concatenated in order (so the result ends with a space).
    """
    # Materialise once so the input can be scanned and then re-emitted even
    # when a one-shot iterable is passed.
    names = tuple(list)
    placeholders = ("unknown", "family", "genus", "species")

    def is_irregular(taxon):
        # Title-cased names (and empty strings) are treated as well-formed.
        if taxon.title() == taxon:
            return False
        if taxon in placeholders:
            return True
        return any(
            ch.isupper() or ch.isdigit() or ch in string.punctuation
            for ch in taxon
        )

    if any(is_irregular(taxon) for taxon in names):
        return "".join(f"{tax} " for tax in names)
    return ""
def _read_vouchers(filename):
    """Group the marked rows of a tab-separated file by voucher ID.

    A row is kept when column 7 (0-based) is ``y``/``Y`` and column 1, the
    voucher ID, is not empty.  Each kept row becomes a record
    ``[col2, col3, col4, col5, col6, tax_notes, col8]`` where ``tax_notes`` is
    ``taxonomy([col4, col5, col6])``; column 9 is appended when the row has at
    least 10 columns and column 11 when it has at least 12.

    Returns:
        dict mapping voucher ID to the list of its records, in file order.
    """
    vouchers = {}
    with open(filename) as reader:
        for line in reader:
            # rstrip() also drops trailing tabs, so rows whose last cells are
            # empty come back with fewer columns than the header.
            fields = line.rstrip().split("\t")
            # Rows too short to hold the marker column (blank lines included)
            # cannot be marked, so they are skipped instead of raising
            # IndexError.
            if len(fields) < 8:
                continue
            if fields[7].lower() != "y" or fields[1] == "":
                continue
            tax_notes = taxonomy(fields[4:7])
            # A marked row whose column 8 was stripped away gets an empty cell
            # there rather than aborting the whole run.
            col8 = fields[8] if len(fields) > 8 else ""
            record = fields[2:7] + [tax_notes, col8]
            if len(fields) >= 10:
                record.append(fields[9])
            if len(fields) >= 12:
                record.append(fields[11])
            vouchers.setdefault(fields[1], []).append(record)
    return vouchers


def _write_vouchers(vouchers, to_create):
    """Write one tab-terminated line per record, versioning duplicates.

    Vouchers with a single record are written as ``key`` followed by the raw
    record cells.  Vouchers with several records get ``_v1``, ``_v2``, ...
    appended to the key, cells are right-stripped, and a museum ID (record
    position 1) that occurs more than once among all multi-record vouchers
    gets its own ``_v<n>`` suffix, numbered per voucher.
    """
    # Tally museum IDs over multi-record vouchers only; single-record vouchers
    # never contributed to this tally.
    museum_counts = {}
    for records in vouchers.values():
        if len(records) > 1:
            for record in records:
                museum_counts[record[1]] = museum_counts.get(record[1], 0) + 1

    with open(to_create, "w") as writer:
        for key, records in vouchers.items():
            voucher_id = key.rstrip()
            if len(records) == 1:
                cells = [voucher_id] + records[0]
                writer.write("".join(f"{cell}\t" for cell in cells) + "\n")
                continue

            museum_version = 1
            for version, record in enumerate(records, start=1):
                cells = [f"{voucher_id}_v{version}"]
                for index, part in enumerate(record):
                    text = part.rstrip()
                    # Identify the museum ID by position, not by value, so a
                    # different cell holding the same text (e.g. two empty
                    # cells) is no longer suffixed or counted as a museum ID.
                    if index == 1 and museum_counts.get(part, 0) > 1:
                        text = f"{text}_v{museum_version}"
                        museum_version += 1
                    cells.append(text)
                writer.write("".join(f"{cell}\t" for cell in cells) + "\n")


def data_extraction(filename="database_all_01july2022.txt",
                    to_create="aug17_extraction.txt"):
    """Extract the marked voucher rows from ``filename`` into ``to_create``.

    Args:
        filename: Path of the tab-separated input file.  Defaults to the
            previously hard-coded name.
        to_create: Path of the text file to write (overwritten if present).
            Defaults to the previously hard-coded name.

    Returns:
        None.  The result is the file written at ``to_create``.

    Raises:
        OSError: If the input cannot be opened or the output cannot be
            written.
    """
    # The input is read and closed before the output is opened.
    _write_vouchers(_read_vouchers(filename), to_create)
# NOTE(review): an earlier revision reported unexplained duplication in the
# output.  Matching the museum ID by value was one way cells could be suffixed
# more than once per row; it is now matched by position.  Confirm against real
# data that this accounts for the duplication.
# Script entry point: the extraction runs with its built-in input/output file
# names as soon as this module is executed.
data_extraction()