-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfake_data.py
More file actions
261 lines (227 loc) · 10 KB
/
fake_data.py
File metadata and controls
261 lines (227 loc) · 10 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
import random
import sqlalchemy
import os
import dotenv
from faker import Faker
import numpy as np
def database_connection_url():
dotenv.load_dotenv()
DB_USER: str = os.environ.get("POSTGRES_USER")
DB_PASSWD = os.environ.get("POSTGRES_PASSWORD")
DB_SERVER: str = os.environ.get("POSTGRES_SERVER")
DB_PORT: str = os.environ.get("POSTGRES_PORT")
DB_NAME: str = os.environ.get("POSTGRES_DB")
return f"postgresql://{DB_USER}:{DB_PASSWD}@{DB_SERVER}:{DB_PORT}/{DB_NAME}"
print(database_connection_url())
#DATABASE_URL = "postgresql://carsonolander:your_password@localhost:5432/your_database"
# Create a new DB engine based on our connection string
engine = sqlalchemy.create_engine(database_connection_url(), use_insertmanyvalues=True)
# Define Enums
with engine.begin() as conn:
conn.execute(sqlalchemy.text("""
DROP TYPE IF EXISTS public.frequency_enum CASCADE;
DROP TYPE IF EXISTS public.status_enum CASCADE;
DROP TYPE IF EXISTS public.bill_type_enum CASCADE;
DROP TYPE IF EXISTS public.payment_status_enum CASCADE;
CREATE TYPE public.frequency_enum AS ENUM ('daily', 'weekly', 'biweekly', 'monthly', 'bimonthly', 'yearly');
CREATE TYPE public.status_enum AS ENUM ('pending', 'in_progress', 'completed');
CREATE TYPE public.bill_type_enum AS ENUM ('electricity', 'water', 'internet', 'rent', 'gas', 'trash', 'groceries');
CREATE TYPE public.payment_status_enum AS ENUM ('unpaid', 'paid', 'overdue');
"""))
# Create Tables
with engine.begin() as conn:
conn.execute(sqlalchemy.text("""
DROP TABLE IF EXISTS chore CASCADE;
DROP TABLE IF EXISTS roommate CASCADE;
DROP TABLE IF EXISTS chore_assignment CASCADE;
DROP TABLE IF EXISTS bill CASCADE;
DROP TABLE IF EXISTS bill_list CASCADE;
create table
public.roommate (
id bigint generated by default as identity not null,
created_at timestamptz DEFAULT NOW(),
first_name text null,
last_name text null,
email text null,
constraint roommate_pkey primary key (id)
) tablespace pg_default;
create table
public.chore (
id bigint generated by default as identity not null,
created_at timestamptz DEFAULT NOW(),
name text null,
location_in_house text null,
frequency public.frequency_enum null,
duration_mins bigint null default '0'::bigint,
priority integer null,
due_date date null,
constraint chore_pkey primary key (id),
constraint chore_priority_check check (
(
(priority >= 1)
and (priority <= 5)
)
)
) tablespace pg_default;
create table
public.chore_assignment (
id bigint generated by default as identity not null,
created_at timestamptz DEFAULT NOW(),
chore_id bigint null,
roommate_id bigint null,
status public.status_enum null,
constraint chore_assignment_pkey primary key (id),
constraint chore_assignment_chore_id_fkey foreign key (chore_id) references chore (id),
constraint chore_assignment_roommate_id_fkey foreign key (roommate_id) references roommate (id)
) tablespace pg_default;
create table
public.bill (
id bigint generated by default as identity not null,
created_at timestamptz DEFAULT NOW(),
cost double precision null,
due_date date null,
bill_type public.bill_type_enum null,
message text null,
constraint bill_pkey primary key (id)
) tablespace pg_default;
create table
public.bill_list (
id bigint generated by default as identity not null,
created_at timestamptz DEFAULT NOW(),
bill_id bigint null,
roommate_id bigint null,
status public.payment_status_enum null,
amount real not null default '0'::real,
constraint bill_list_pkey primary key (id),
constraint bill_list_bill_id_fkey foreign key (bill_id) references bill (id)
) tablespace pg_default;
"""))
# Data Generation
fake = Faker()
# num_roommates = 10
# num_chores_per_roommate = 5
# num_bills = 5
num_roommates = 10
num_chores_per_roommate = 355000
num_bills = 26365
# Sample Distributions
priorities = [1,2,3,4,5]
frequency = ['daily', 'weekly', 'biweekly', 'monthly', 'bimonthly', 'yearly']
house_location = ['living room', 'bedroom', 'kitchen', 'dining room']
bill_type = ['electricity', 'water', 'internet', 'rent', 'gas', 'trash', 'groceries']
chore_type = ['wash dishes', 'clean bathroom', 'mop floor', 'vacuum', 'clean living room', 'clean bedroom']
# List of custom domain options
domains = ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com", "example.com"]
total_chores = num_roommates * num_chores_per_roommate
chore_sample_distribution = np.random.choice(
chore_type,
total_chores,
p=[0.2, 0.2, 0.2, 0.2, 0.1, 0.1])
priorities_sample_distribution = np.random.choice(
priorities,
total_chores,
p=[0.2, 0.2, 0.2, 0.2, 0.2])
frequency_sample_distribution = np.random.choice(
frequency,
total_chores,
p=[0.3, 0.3, 0.2, 0.1, 0.05, 0.05])
house_location_sample_distribution = np.random.choice(
house_location,
total_chores,
p=[0.25, 0.25, 0.25, 0.25])
bill_type_sample_distribution = np.random.choice(
bill_type,
total_chores,
p=[0.2, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1])
with engine.begin() as conn:
print("creating fake roommates and chores...")
roommate_ids_list = []
global_index = 0
chores_data = []
assignments_data = []
#Create roommates
for roommate_index in range(num_roommates):
# Create roommate
first_name = fake.first_name()
last_name = fake.last_name()
# Generate email using the first and last name
email = f"{first_name.lower()}.{last_name.lower()}@{random.choice(domains)}"
result = conn.execute(sqlalchemy.text('''
INSERT INTO roommate (first_name, last_name, email)
VALUES (:first_name, :last_name, :email) RETURNING id;
'''),
{
"first_name": first_name,
"last_name": last_name,
"email": email
})
roommate_id = result.scalar()
roommate_ids_list.append(roommate_id)
print(f"roommate id list: {roommate_ids_list}")
# Create chores for this roommate
chore_ids_list = []
for chore_index in range(num_chores_per_roommate):
# Prepare chore data
result = conn.execute(sqlalchemy.text('''
INSERT INTO chore (name, location_in_house, frequency, duration_mins, priority, due_date)
VALUES (:name, :location_in_house, :frequency, :duration_mins, :priority, :due_date) RETURNING id
'''), {
"name": chore_sample_distribution[chore_index].item(),
"location_in_house": house_location_sample_distribution[chore_index].item(),
"frequency": frequency_sample_distribution[chore_index].item(),
"duration_mins": random.randint(1, 120),
"priority": priorities_sample_distribution[chore_index].item(),
"due_date": fake.date_between(start_date="-60d", end_date="+60d")
})
print(f"insert into chore {chore_index} for roommate id {roommate_id}")
#Fetch the ID
chore_id = result.scalar()
chore_ids_list.append(chore_id)
# Create assignments
for chore in chore_ids_list:
conn.execute(sqlalchemy.text('''
INSERT INTO chore_assignment (chore_id, roommate_id, status)
VALUES (:chore_id, :roommate_id, :status)
'''),
{
"chore_id": chore,
"roommate_id": roommate_id,
"status": fake.random_element(elements=('pending', 'in_progress', 'completed'))
})
print(f"insert into chore_assignment {chore} for roommate id {roommate_id}")
print("creating fake bills and bill lists...")
bill_ids_list = []
# Create bills
for bill_index in range(num_bills):
cost = round(random.uniform(50.0, 1000.0), 2)
due_date = fake.future_date(end_date='+60d')
bill_type_value = bill_type_sample_distribution[bill_index]
message = f"Pay for {bill_type_value}"
result = conn.execute(sqlalchemy.text('''
INSERT INTO bill (cost, due_date, bill_type, message)
VALUES (:cost, :due_date, :bill_type, :message) RETURNING ID '''),
{
"cost": cost,
"due_date": due_date,
"bill_type": bill_type_value,
"message": message
})
print(f"insert into bill {bill_index} for roommate id {roommate_id}")
bill_id = result.scalar()
bill_ids_list.append({"bill_id": bill_id, "cost":cost})
for bill in bill_ids_list:
bill_id = bill["bill_id"]
cost = bill["cost"]
amount_per_roommate = round(cost/ num_roommates, 2)
for roommate_id in roommate_ids_list:
conn.execute(sqlalchemy.text('''
INSERT INTO bill_list (bill_id, roommate_id, status, amount)
VALUES (:bill_id, :roommate_id, :status, :amount);
'''),{
"bill_id": bill_id,
"roommate_id": roommate_id,
"status": fake.random_element(elements=('unpaid', 'paid', 'overdue')),
"amount": amount_per_roommate
})
print(f"insert into bill list for each roommate of bill id {bill_id} for roommate id {roommate_id}")
print("Fake roommates, chores, chore assignments, bills, and bill list created!")