-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgrabwikidata.py
More file actions
99 lines (89 loc) · 3.19 KB
/
grabwikidata.py
File metadata and controls
99 lines (89 loc) · 3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/env python
"""
Parses through mediawiki writeup of plants and returns a dictionary.
Format for each plant is:
Common_name: {plant_type:string, scientific_name:string, helps:[list], helped_by:[list],
attracts:[list], repels:[list], avoid:[list], comment:string}
Because the source format is not uniform, the returned data will have to be
skimmed through and edited a little bit.
All of the dictionaries are printed out in json format.
full_dict = {"veg":vegetables, "flo":flowers, "her":herbs}
"""
import json
def skip_header(filename):
    '''
    Returns the contents of a mediawiki file with the leading header removed.

    The header is taken to end four characters before the first "[" in the
    file; everything from that point to the end of the file is returned,
    which makes it easy to parse the main data.

    filename -- path of the mediawiki text file to read.

    If the first "[" sits within the first four characters, the text is
    returned from its very beginning.  If the file contains no "[" at all,
    the whole text is returned unchanged.
    '''
    # The context manager closes the file even if read() raises.
    with open(filename) as f:
        original_data = f.read()

    start_position = original_data.find("[")
    if start_position == -1:
        # No "[" found, so there is no recognisable header to strip.
        return original_data

    # Clamp at 0: a negative slice start would count from the end of the
    # string and silently return only its tail.
    return original_data[max(start_position - 4, 0):]
def parse_data(filename, plant_type):
    '''
    Parses a mediawiki plant table into a dictionary keyed by common name.

    filename   -- path of the mediawiki text file to read.
    plant_type -- value stored under "plant_type" for every plant found.

    Returns a dictionary of the form:
    Common_name: {plant_type:string, scientific_name:string,
                  helps:[list], helped_by:[list],
                  attracts:[list], repels:[list],
                  avoid:[list], comment:string}

    Rows with fewer than eight "||"-separated fields are padded with empty
    fields instead of raising IndexError.  List entries are stripped of
    surrounding whitespace and empty entries are dropped.  If two rows share
    a common name, the later one replaces the earlier one.
    '''
    mediawiki_format = skip_header(filename)

    # Drop the wiki-link brackets, then cut the text into one chunk per
    # table row ('|-' is the row separator).
    removed_brackets = mediawiki_format.replace("[[", "").replace("]]", "")
    split_plants = removed_brackets.split('|-')

    # Strip styling and markup from each chunk; keep only the chunks that
    # begin with a cell marker.
    clean_plants = []
    for string in split_plants:
        string = string.replace('style="background:#ffffff"', '')
        string = remove_tags(string, '<', '>')
        string = remove_tags(string, '{{', '}}')
        new_string = string.strip()
        # startswith() is safe on an empty chunk, unlike indexing [0].
        if new_string.startswith("|"):
            clean_plants.append(new_string)

    plant_dict = {}
    for plant in clean_plants:
        info_list = [field.strip() for field in plant.split("||")]
        # Pad short rows so every column index below exists.
        info_list += [""] * (8 - len(info_list))

        # [1:] drops the leading "|" cell marker; quotes are wiki bold/italic.
        common_name = info_list[0][1:].replace("'", "").strip()
        scientific_name = info_list[1].replace("'", "").strip()

        plant_dict[common_name] = {
            "plant_type": plant_type,
            "scientific_name": scientific_name,
            "helps": _split_list(info_list[2]),
            "helped_by": _split_list(info_list[3]),
            "attracts": _split_list(info_list[4]),
            "repels": _split_list(info_list[5]),
            "avoid": _split_list(info_list[6]),
            "comment": info_list[7],
        }
    return plant_dict


def _split_list(field):
    '''
    Splits a comma-separated cell into a list of stripped, non-empty items.
    '''
    return [item.strip() for item in field.split(",") if item.strip()]
def remove_tags(string, start_char, end_char):
    '''
    Removes tags and their entire contents within a given string.

    string     -- the text to clean.
    start_char -- marker that opens a tag, e.g. "<" or "{{".
    end_char   -- marker that closes a tag, e.g. ">" or "}}".

    Returns the text with every span from a start marker up to and
    including the next end marker removed.  Nesting is not tracked: a tag
    ends at the first end marker that follows its start marker.  A start
    marker with no end marker after it is left in place, together with the
    rest of the string.
    '''
    if not start_char or not end_char:
        # An empty marker matches everywhere; there is nothing sensible
        # to remove.
        return string

    kept = []
    cursor = 0
    while True:
        start = string.find(start_char, cursor)
        if start == -1:
            break
        # Look for the terminator only after the opener, so an earlier
        # stray end marker cannot be paired with this tag.
        end = string.find(end_char, start + len(start_char))
        if end == -1:
            break
        kept.append(string[cursor:start])
        # Skip the whole terminator, not just its first character.
        cursor = end + len(end_char)
    kept.append(string[cursor:])
    return "".join(kept)
def main():
    '''
    Parses the vegetable, flower and herb files and returns all of the
    resulting plant dictionaries as a single JSON string, stored under the
    keys "veg", "flo" and "her".
    '''
    # (output key, input file, plant_type label), in parsing order.
    sources = (
        ("veg", 'plantdata/veggies.txt', "veggie"),
        ("flo", 'plantdata/flowers.txt', "flower"),
        ("her", 'plantdata/herbs.txt', "herbs"),
    )
    combined = {}
    for key, path, label in sources:
        combined[key] = parse_data(path, label)
    return json.dumps(combined, sort_keys=True, indent=1)
if __name__ == '__main__':
    # main() only builds and returns the JSON string; print it so that
    # running the script actually emits the data.
    print(main())