-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_file_cache_from_db.py
More file actions
78 lines (61 loc) · 2.42 KB
/
generate_file_cache_from_db.py
File metadata and controls
78 lines (61 loc) · 2.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# This script takes chatgpt values out of the local database and writes
# them to the "transformed" directory. This lets us reuse everything that
# already lives in the database without querying chatgpt again.
import logging
import csv
import json
import os
import pathlib
import subprocess
from getpass import getpass
from io import StringIO
# Suppress INFO/DEBUG chatter from the root logger; only warnings and above.
logging.getLogger().setLevel(logging.WARN)
# Columns to pull from the `plants` table. Each field becomes its own
# subdirectory under base_dir (NOTE: "spread" is written to disk as "width" —
# see the rename in the main loop below).
fields_to_extract = [
    "pollinator_rating",
    "bird_rating",
    "spread_rating",
    "deer_resistance_rating",
    "height",
    "spread",
    "bloom",
]
# Root of the on-disk cache of per-plant, per-field JSON files.
base_dir = "data/transformed/chatgpt"
def query_database(password):
    """Run a SELECT over the plants table in the local MySQL database.

    Args:
        password: Password for the planting_life_user MySQL account.

    Returns:
        The query result as a string in mysql's default tab-separated
        batch format (header row of column names, then one row per plant).
        Any mysql error output is merged into the returned string, since
        stderr is redirected to stdout.
    """
    fields_str = ", ".join(fields_to_extract)
    query = f"SELECT scientific_name, {fields_str} FROM plants;"
    # Use an argument list with shell=False so nothing is interpolated into
    # a shell string (the old f-string + shell=True form was injection-prone
    # if the password contained shell metacharacters). Pass the password via
    # MYSQL_PWD instead of -p so it is not visible in the process list.
    cmd = [
        "mysql",
        "-h", "localhost",
        "-P", "3306",
        "-u", "planting_life_user",
        "--protocol=tcp",
        "planting_life",
        "-e", query,
    ]
    env = dict(os.environ, MYSQL_PWD=password)
    result = subprocess.run(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env
    )
    return result.stdout.decode()
if __name__ == "__main__":
    # Pull every cached chatgpt field for every plant out of the database,
    # then materialize each (plant, field) pair as its own JSON file.
    password = getpass("planting_life_user password:")
    db_result = query_database(password)
    reader = csv.DictReader(StringIO(db_result), delimiter="\t")
    for row in reader:
        scientific_name = row["scientific_name"].lower()
        for field in fields_to_extract:
            raw = row[field]
            # Skip database NULLs entirely — no file is written for them.
            if raw == "NULL":
                continue
            # Numeric ratings become ints; everything else stays a string.
            try:
                value = int(raw)
            except ValueError:
                value = raw
            # The db column is "spread" but the on-disk cache calls it
            # "width" — rename just before building the output path.
            out_field = "width" if field == "spread" else field
            dir_path = pathlib.Path(base_dir) / out_field
            dir_path.mkdir(parents=True, exist_ok=True)
            file_path = dir_path / f"{scientific_name}.json"
            # Existing files are authoritative; never overwrite them.
            if file_path.exists():
                continue
            print(f"Writing to {file_path}")
            file_path.write_text(json.dumps({out_field: value}, indent=4))