-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_faker.py
More file actions
448 lines (339 loc) · 15.8 KB
/
data_faker.py
File metadata and controls
448 lines (339 loc) · 15.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
from faker import Faker
from datetime import datetime
import pandas
import db
from random import choice, randrange, sample, shuffle
from unidecode import unidecode
from util.wiki import get_obs
import requests
import json
import math
def dist_lavoisier(coord1, coord2 = (48.838185294510346, 2.592986786694019)):
    """
    Return the great-circle distance, in kilometres, between two points.

    The distance is computed with the haversine formula
    (https://fr.wikipedia.org/wiki/Formule_de_haversine) on a sphere of
    radius 6371 km.

    Args
        coord1 : (latitude, longitude) in decimal degrees. Any iterable
                 yielding exactly two numbers is accepted (tuple, list,
                 ``map`` object, ...).
        coord2 : (latitude, longitude) in decimal degrees. Defaults to the
                 Lavoisier building of Universite Gustave Eiffel.

    Returns
        The distance in kilometres as a float.
    """
    lat1, lon1 = coord1
    lat2, lon2 = coord2
    lat1, lon1, lat2, lon2 = map(math.radians, (lat1, lon1, lat2, lon2))

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    )
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (nearly) antipodal points, which would make asin() raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.asin(math.sqrt(a))

    R = 6371  # radius of the sphere used for the Earth, in km
    return R * c
class DataGen:
    """
    Fill the project database with fake but plausible showcase data.

    The generators rely on three sources: CSV files under ``data/``, the
    ``get_obs`` wiki helper, and a remote text-generation endpoint. Every
    write goes through the project ``db`` module.

    Only the Faker instance is seeded here; the helpers imported from
    ``random`` (choice, randrange, sample, shuffle) are not seeded by this
    class.
    """

    # 24/10/2025 written as YYYYMMDD: a fixed seed keeps Faker output stable
    # from one run to the next.
    DEFAULT_SEED = 20251024

    # Remote text-generation endpoint and the maximum wait per request (s).
    LLM_URL = "https://text.pollinations.ai/openai"
    LLM_TIMEOUT = 60

    def __init__(self, seed=DEFAULT_SEED):
        """
        Create the French-locale Faker instance.

        Args
            seed : seed applied to the Faker instance; pass None to leave
                   the instance unseeded.
        """
        self.faker = Faker(["fr_FR"])
        if seed is not None:
            self.faker.seed_instance(seed)

    def gen_profile(self) -> dict:
        """
        Generate a profile with Faker and return it.
        """
        return self.faker.profile()

    # ------------------------------------------------------------------
    # Pure helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _reserve_username(base, counters):
        """
        Return a username derived from `base` that was never issued before.

        `counters` maps every issued username to the number of times it was
        requested as a base; it is updated in place. The first request for
        "x" yields "x", the following ones "x1", "x2", ...
        """
        candidate = base
        while candidate in counters:
            suffix = counters[base]
            counters[base] += 1
            candidate = f"{base}{suffix}"
        counters[candidate] = 1
        return candidate

    @staticmethod
    def _is_small(taille, threshold=50):
        """
        Tell whether `taille` is a number strictly below `threshold`.

        A missing or non-numeric size is treated as "not small".
        """
        if not taille:
            return False
        try:
            return float(taille) < threshold
        except (TypeError, ValueError):
            return False

    @staticmethod
    def _truncate(text, limit=200):
        """
        Return `text` as a string of at most `limit` characters.

        Longer texts are cut at the last space that leaves room for a
        trailing '...'.
        """
        text = str(text)
        if len(text) <= limit:
            return text
        ellipsis = '...'
        return text[:limit - len(ellipsis)].rsplit(' ', 1)[0] + ellipsis

    # ------------------------------------------------------------------
    # Habitats
    # ------------------------------------------------------------------

    @staticmethod
    def _insert_places(csv_path, delimiter, name_column, insert, m):
        """
        Read at most `m` rows of a CSV file and insert the named places.

        `insert` is called with (name, 'Geo Point' value) for every row
        whose name is a string of at most 50 characters.
        """
        df = pandas.read_csv(csv_path, delimiter=delimiter)
        limit = min(len(df), m)
        if limit <= 0:
            return
        rows = df.head(int(limit))
        for nom, geo in zip(rows[name_column], rows['Geo Point']):
            # Missing names are read as NaN (a float): skip them instead of
            # crashing on len().
            if isinstance(nom, str) and len(nom) <= 50:
                insert(nom, geo)

    def fill_in_nichoirs(self, m = 70):
        """
        Fill in the nest boxes (nichoirs).

        Args
            m : maximum number of CSV rows considered
        """
        print("Filling in nichoirs...")
        self._insert_places(
            'data/bor_abrisfaune.csv', ';', 'NOM_SITE_A', db.insert_nichoir, m
        )

    def fill_in_biomes(self, m = 150):
        """
        Fill in the biomes (habitats other than nest boxes).

        Args
            m : maximum number of CSV rows considered
        """
        print("Filling in biomes...")
        self._insert_places(
            'data/znieff-type2.csv', ',', 'NOM', db.insert_habitat, m
        )

    # ------------------------------------------------------------------
    # Users
    # ------------------------------------------------------------------

    def gen_users(self, lim = 100):
        """
        Create `lim` users; roughly 90% of them are also made members.

        Args
            lim : number of users to create
        """
        print("Filling in profiles...")
        issued_usernames = {}
        for i in range(lim):
            # prenom = first name, nom = family name.
            prenom, nom = self.faker.first_name(), self.faker.last_name()
            coords = [
                ('mail', f"{prenom.lower()}.{nom.lower()}@{self.faker.free_email_domain()}"),
                ('addr', self.faker.address()),
                ('tel', self.faker.phone_number())
            ]
            # Drop values longer than 150 characters to avoid insert errors.
            coords = [c for c in coords if len(str(c[1])) <= 150]
            pwd = self.faker.password()
            user_templates = [
                f"{prenom}_{nom}",
                f"{nom}.{prenom}",
                f"{prenom[0]}_{nom}",
                f"{prenom}{nom[0]}",
                f"{prenom}.{nom}{self.faker.random_int(1,99)}"
            ]
            base = unidecode(choice(user_templates)).lower().replace(' ', '')
            user = self._reserve_username(base, issued_usernames)
            print(f"{i}. {nom}, {prenom}, {user}")
            db.create_user(user, prenom, nom, pwd, coords)
            # ~ 90% of the users become members
            if randrange(10) < 9:
                statut = choice([1,2])
                db.make_adherent(user, statut)

    # ------------------------------------------------------------------
    # Species
    # ------------------------------------------------------------------

    def _import_species(self, entries, adherents, choose_habitat, lim):
        """
        Insert species, their image observations and their attributes.

        Args
            entries        : iterable of (common name, scientific name, group)
            adherents      : members picked at random as authors
            choose_habitat : callable taking the size and returning the
                             habitat id of one observation
            lim            : maximum number of species inserted
        """
        inserted = 0
        for nom, sci, groupe in entries:
            if inserted >= lim:
                break
            obs = get_obs(sci)
            # get_obs signals a wikipedia error with 0
            if obs == 0:
                continue
            taille = obs['taille'].replace(",", ".") if 'taille' in obs else None
            images = obs['imgs'] if 'imgs' in obs else []
            print(f"{inserted}. {nom} {sci} {taille}")
            db.insert_animal(sci, nom, taille, groupe)
            # Every image is stored as an "observation"
            for image in images:
                habitat = choose_habitat(taille)
                date = self.faker.date_this_year(before_today=True, after_today=False)
                db.add_observation(choice(adherents), sci, habitat, image, date)
            # Every remaining key of the wiki result is stored as an attribute
            for attr, elts in obs.items():
                if attr in ('imgs', 'taille'):
                    continue
                for elt in elts:
                    db.add_animal_info(attr, sci, choice(adherents), elt)
            inserted += 1

    def gen_oiseaux(self, lim = float('inf')):
        """
        Clear then regenerate the birds listed in data/oiseaux.csv.

        Observations of birds whose size is below 50 are placed in a nest
        box, the others in a biome.

        Args
            lim : maximum number of birds inserted
        """
        db.clear_oiseaux()
        df = pandas.read_csv('data/oiseaux.csv', delimiter=',')
        adherents = db.get_adherents()
        biomes = db.get_biomes_id()
        nichoirs = db.get_nichoirs_id()

        def choose_habitat(taille):
            return choice(nichoirs) if self._is_small(taille) else choice(biomes)

        entries = ((nom, sci, 'oiseaux') for nom, sci in zip(df['nom'], df['sci']))
        self._import_species(entries, adherents, choose_habitat, lim)

    def gen_flore(self, lim = float('inf')):
        """
        Clear then regenerate the species listed in data/autres.csv.

        Every observation is placed in a randomly chosen biome.

        Args
            lim : maximum number of species inserted
        """
        db.clear_flore()
        df = pandas.read_csv('data/autres.csv', delimiter=',')
        adherents = db.get_adherents()
        biomes = db.get_biomes_id()

        def choose_habitat(_taille):
            return choice(biomes)

        entries = zip(df['nom'], df['sci'], df['groupe'])
        self._import_species(entries, adherents, choose_habitat, lim)

    # ------------------------------------------------------------------
    # Text generation
    # ------------------------------------------------------------------

    def _ask_llm(self, prompt):
        """
        Send `prompt` to the text-generation endpoint and parse its answer.

        Returns
            The JSON document found in the first choice of the answer.

        Raises
            requests.RequestException : network error, timeout or HTTP error
            ValueError                : the body or the message is not JSON
            KeyError / IndexError     : the answer lacks the expected fields
        """
        payload = {
            "model": "openai-fast",
            "messages": [
                {"role": "system", "content": "You are a data generator."},
                {"role": "user", "content": prompt}
            ],
            "temperature": 1,
            "max_tokens": 500
        }
        # Without a timeout a stalled connection would block the run forever.
        response = requests.post(self.LLM_URL, json=payload, timeout=self.LLM_TIMEOUT)
        response.raise_for_status()
        content = response.json()
        print(content)
        content = content['choices'][0]['message']['content']
        return json.loads(content)

    def gen_sortie(self, habitat):
        """
        Ask the text generator for one outing at `habitat`.

        Args
            habitat : place description inserted at the end of the prompt

        Returns
            The parsed JSON answer; the prompt asks for the keys
            'desc', 'title' and 'theme'.
        """
        prompt = f"""
I am making a fake database for showcase purpose. In it there are people, animals and places and your purpose is to create "outings" given a place name.
The outings will be about one option and strictly one option. You will choose among the following:
- a walk in a given place,
- Jeu du snapshot : a snapshot game (try and take as much pictures of animals during the outing, if you see an animal that was never seen here you get some extra bonuses) / comparable to pokemon snap but in real life
- Cleaning operation : Clean nest boxes (nichoirs) for example if the place is a nichoir. PICK THIS OPTION IF AND ONLY IF YOU ARE TOLD THAT THE PLACE IS A NICHOIR
- Entretien nichoir (NICHOIR ONLY)
- Desherbage
- Cueillette
- Pic nic
- Feel free to create more themes if you want so they don't feel too limited. these are example you may and have to use at some point but you can create others as long as they are coherent.
For each request, answer returning one and only one outing as I will send other prompts if i want some others.
You should be very creative and create the following in a json:
desc : The outing description, IN FRENCH
title : The outing title, IN FRENCH. It should be compliant with VARCHAR(50)
theme : The outing theme, try to make this generic as themes must be shared among outings. Must be one word and quite self explanatory (eg. Balade / Pic-nic / Nettoyage...)
Let's get started. Make an outing for the place : {habitat}
"""
        return self._ask_llm(prompt)

    def gen_info_hab(self, habitat, attr):
        """
        Ask the text generator for the attribute(s) `attr` of `habitat`.

        Args
            habitat : habitat name inserted in the prompt
            attr    : attribute name(s) inserted in the prompt

        Returns
            The parsed JSON answer; the prompt asks for a mapping from
            attribute name to text.
        """
        prompt = f"""
I am making a fake database for showcase purpose. There are various attributes you can generate.
For exemple ; Humidité : 75
Description : Brief description given a place name
Attributes you can generate for the habitat include:
- Climat : Specify the climate type (e.g., "Tempéré", "Tropical", "Aride").
- Altitude : The altitude of the habitat in meters.
- Superficie : The area of the habitat in square kilometers.
- Accessibilité : Describe how accessible the habitat is (e.g., "Facile", "Difficile").
- Statut : The conservation status of the habitat (e.g., "Protégé", "Menacé").
- Activités : Activities that can be performed in the habitat (e.g., "Randonnée", "Observation des oiseaux").
- Pollution : Level of pollution in the habitat (e.g., "Faible", "Modérée", "Élevée").
- Température Moyenne : Average temperature in the habitat.
- Précipitations : Average annual rainfall in the habitat.
- Description : A description of the place
- Humidité : a coherent value
These are all meant to be fake but realistic
Each of these should be a sentence - two setnecnes but not too long to give an info, eg.
"Cette forêt est connue pour un climat très humide en raison des fortes pluies avoisinnant 70% d'humidité"
return json like {{info_type: data.. ,}} for each attribute i'll ask for and nothing else as it is meant to be parsed programatically.
Generate for {habitat} the attributes {attr}
"""
        return self._ask_llm(prompt)

    # ------------------------------------------------------------------
    # Outings
    # ------------------------------------------------------------------

    def gen_sorties(self):
        """
        Create between 0 and 3 outings per habitat and enrol members.

        An outing is skipped when the text generator fails, when its answer
        lacks a field, or when the title exceeds 50 characters.
        """
        # db.clear_sortie()
        biomes = db.get_biomes_dev()
        adherents = db.get_adherents()
        themes = ['Balade', 'Nettoyage', 'Desherbage', 'Cueillette', 'Pic-Nic']
        # A crash may interrupt the run: shuffle so that the first habitats
        # are not over-represented.
        shuffle(biomes)
        for biome in biomes:
            hab_id, hab_nom, position = biome['idhabitat'], biome['nomhabitat'], biome['coords']
            print(hab_id, hab_nom)
            for i in range(randrange(0, 4)):
                print('-'*100)
                print('#'*5, 'Numero',i, 'Biome',hab_nom, hab_id)
                effectif_max = randrange(1,10) * 5
                coord_sortie = tuple(map(float, position.split(',')))
                dist_km = dist_lavoisier(coord_sortie)
                date_rdv = self.faker.date_between(
                    start_date='-1y', end_date='+1y'
                )
                # By construction of the db, ids below 70 are nest boxes
                # (serial key + insertion order).
                if int(hab_id) < 70:
                    nichoir_hint = ". This habitat is a nichoir (nest box)"
                else:
                    nichoir_hint = ""
                requested_theme = choice(themes)
                try:
                    sortie_resp = self.gen_sortie(
                        hab_nom + " with the theme : " + requested_theme + nichoir_hint
                    )
                    nom = sortie_resp['title']
                    desc = sortie_resp['desc']
                    theme = sortie_resp['theme']
                except (requests.RequestException, ValueError, KeyError,
                        IndexError, TypeError) as err:
                    # Network failure, malformed JSON or missing field.
                    print(f"Sortie skipped ({type(err).__name__}: {err})")
                    continue
                if not isinstance(nom, str) or len(nom) > 50:
                    continue
                print(f"Sortie:\n Nom: {nom}\n Thème: {theme}\n Habitat ID: {hab_id}\n Date: {date_rdv}\n Distance (km): {dist_km:.2f}\n Effectif max: {effectif_max}\n Description: {desc}\n")
                sortie_id = db.insert_sortie(nom, theme, hab_id, date_rdv, dist_km, effectif_max, desc)
                # Enrol members: one outing out of five is full.
                if randrange(0,5) == 3:
                    head_count = effectif_max
                else:
                    head_count = randrange(0, effectif_max + 1)
                # sample() raises ValueError when asked for more elements
                # than the population holds.
                participants = sample(adherents, k=min(head_count, len(adherents)))
                for adherent in participants:
                    db.inscrire_sortie(sortie_id, adherent)

    def gen_animateurs(self):
        '''
        Give an animateur to every outing.

        The animateurs are drawn from a pool of at most 10 profiles.
        '''
        sorties = db.get_sorties_dev()
        profiles = db.get_profiles_dev()
        if not profiles:
            print("No profile available: no animateur added")
            return
        selection = sample(profiles, k=min(10, len(profiles)))
        for sortie_id in sorties:
            anim = choice(selection)
            db.add_animateur(anim, sortie_id)
            print(f"Added {sortie_id} with animateur {anim}")

    # ------------------------------------------------------------------
    # Habitat information
    # ------------------------------------------------------------------

    def gen_info_habitat(self):
        """
        Generate additional information for the habitats.

        For every habitat, 2 to 6 information types are drawn and one text
        per type is requested from the text generator, then stored with a
        randomly chosen author.
        """
        info_types = [
            'Climat',
            'Altitude',
            'Superficie',
            'Accessibilité',
            'Statut',
            'Activités',
            'Pollution',
            'Température Moyenne',
            'Humidité',
            'Précipitations',
            'Description'
        ]
        biomes = db.get_biomes_dev()[::-1]
        authors = db.get_profiles_dev()
        for biome in biomes:
            hab_id, hab_nom = biome['idhabitat'], biome['nomhabitat']
            print(f"Generating info for habitat {hab_nom} (ID: {hab_id})")
            sample_size = randrange(2, 7)
            sampled_info_types = sample(info_types, k=sample_size)
            print(sample_size, biome)
            for info_type in sampled_info_types:
                author = choice(authors)
                try:
                    info = self.gen_info_hab(hab_nom, info_type)
                except (requests.RequestException, ValueError, KeyError,
                        IndexError, TypeError) as err:
                    # Network failure, malformed JSON or missing field.
                    print(f"Info skipped ({type(err).__name__}: {err})")
                    continue
                print(info)
                if not isinstance(info, dict) or not info:
                    continue
                # NOTE(review): the original `break` is read as ending the
                # loop over the answer after its first entry, so only that
                # entry is stored - confirm against the upstream file.
                attr, content = next(iter(info.items()))
                formated = self._truncate(content, 200)
                db.add_info_habitat(hab_id, info_type[:200], formated, author)
                print(attr, content)
if __name__ == '__main__':
    # Script entry point. The generation stages below are enabled one at a
    # time by uncommenting them; only the habitat-information stage is
    # currently active.
    # DataGen().fill_in_nichoirs()
    # DataGen().fill_in_biomes()
    # DataGen().gen_users()
    # DataGen().gen_oiseaux()
    # DataGen().gen_flore()
    generator = DataGen()
    generator.gen_info_habitat()