-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_file_cache_from_db.py
More file actions
78 lines (61 loc) · 2.42 KB
/
generate_file_cache_from_db.py
File metadata and controls
78 lines (61 loc) · 2.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# This script takes chatgpt values out of the local database and writes
# them to the "transformed" directory. This lets us reuse everything that
# already lives in the database without querying chatgpt again.
import logging
import csv
import json
import os
import pathlib
import subprocess
from getpass import getpass
from io import StringIO
# Suppress INFO/DEBUG chatter from the root logger; only warnings and above.
logging.getLogger().setLevel(logging.WARN)
# Columns to pull from the `plants` table. Each field becomes its own
# subdirectory under base_dir (NOTE: "spread" is written to disk as "width" —
# see the rename in the main loop below).
fields_to_extract = [
    "pollinator_rating",
    "bird_rating",
    "spread_rating",
    "deer_resistance_rating",
    "height",
    "spread",
    "bloom",
]
# Root of the on-disk cache of per-plant, per-field JSON files.
base_dir = "data/transformed/chatgpt"
def query_database(password):
    """Run a SELECT over the plants table in the local MySQL database.

    Args:
        password: Password for the planting_life_user MySQL account.

    Returns:
        The query result as a string in mysql's default tab-separated
        batch format (header row of column names, then one row per plant).
        Any mysql error output is merged into the returned string, since
        stderr is redirected to stdout.
    """
    fields_str = ", ".join(fields_to_extract)
    query = f"SELECT scientific_name, {fields_str} FROM plants;"
    # Use an argument list with shell=False so nothing is interpolated into
    # a shell string (the old f-string + shell=True form was injection-prone
    # if the password contained shell metacharacters). Pass the password via
    # MYSQL_PWD instead of -p so it is not visible in the process list.
    cmd = [
        "mysql",
        "-h", "localhost",
        "-P", "3306",
        "-u", "planting_life_user",
        "--protocol=tcp",
        "planting_life",
        "-e", query,
    ]
    env = dict(os.environ, MYSQL_PWD=password)
    result = subprocess.run(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env
    )
    return result.stdout.decode()
if __name__ == "__main__":
    # Pull every cached chatgpt field for every plant out of the database,
    # then materialize each (plant, field) pair as its own JSON file.
    password = getpass("planting_life_user password:")
    db_result = query_database(password)
    reader = csv.DictReader(StringIO(db_result), delimiter="\t")
    for row in reader:
        scientific_name = row["scientific_name"].lower()
        for field in fields_to_extract:
            raw = row[field]
            # Skip database NULLs entirely — no file is written for them.
            if raw == "NULL":
                continue
            # Numeric ratings become ints; everything else stays a string.
            try:
                value = int(raw)
            except ValueError:
                value = raw
            # The db column is "spread" but the on-disk cache calls it
            # "width" — rename just before building the output path.
            out_field = "width" if field == "spread" else field
            dir_path = pathlib.Path(base_dir) / out_field
            dir_path.mkdir(parents=True, exist_ok=True)
            file_path = dir_path / f"{scientific_name}.json"
            # Existing files are authoritative; never overwrite them.
            if file_path.exists():
                continue
            print(f"Writing to {file_path}")
            file_path.write_text(json.dumps({out_field: value}, indent=4))