-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path5fusion.py
More file actions
64 lines (50 loc) · 2.38 KB
/
5fusion.py
File metadata and controls
64 lines (50 loc) · 2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
import glob
import os
base_dir = os.path.dirname(os.path.abspath(__file__))
# Data location: one sub-directory per province under "Datos".
ruta = os.path.join(base_dir, "Datos")
provincias = ["VALLADOLID","BURGOS","SALAMANCA","AVILA","SEGOVIA","ZAMORA","SORIA","PALENCIA","LEON"]

# File-name suffixes of the cleaned inputs and of the merged outputs.
SUFIJO_LIMPIO = "_limpio.csv"
SUFIJO_DEFINITIVO = "_definitivo.csv"


def codigo_estacion(nombre):
    """Return the station code embedded in a clean file name, or None.

    The code is the second underscore-separated field of *nombre*. A name
    needs at least three fields (prefix, code, "limpio.csv"); with fewer, the
    second field would be the suffix itself rather than a station code, so
    such names are rejected. An empty code is rejected too, because it would
    turn the rescue pattern into a match-everything wildcard.
    """
    partes = nombre.split('_')
    if len(partes) < 3 or not partes[1]:
        return None
    return partes[1]


def leer_tabla(ruta_csv):
    """Read one CSV file and parse its 'fecha' column as datetimes."""
    df = pd.read_csv(ruta_csv)
    df['fecha'] = pd.to_datetime(df['fecha'])
    return df


def fusionar_tablas(tablas):
    """Merge tables so that, per 'fecha', the row from the latest table wins.

    *tablas* is a list of DataFrames ordered by increasing priority (base
    first, rescue files after it). The result is sorted by 'fecha' and holds
    one row per date.
    """
    df = pd.concat(tablas, ignore_index=True)
    # The sort must be stable: drop_duplicates(keep='last') only keeps the
    # rescue row if rows with equal dates stay in concatenation order, which
    # the default quicksort does not guarantee.
    df = df.sort_values(by='fecha', kind="mergesort")
    return df.drop_duplicates(subset=['fecha'], keep='last')


def ruta_definitiva(archivo):
    """Return the output path for *archivo*, swapping only the name suffix.

    Only the basename is rewritten, so a directory that happens to contain
    "_limpio.csv" is left alone. The suffix test ignores case so that a match
    from a case-insensitive file system never maps back onto the input file.
    """
    directorio, nombre = os.path.split(archivo)
    if nombre.lower().endswith(SUFIJO_LIMPIO):
        nombre = nombre[:-len(SUFIJO_LIMPIO)]
    else:
        nombre = os.path.splitext(nombre)[0]
    return os.path.join(directorio, nombre + SUFIJO_DEFINITIVO)


print("data fusion")
for provincia in provincias:
    print(f"\n ---PROVINCE: {provincia}")
    rutaprovincia = os.path.join(ruta, provincia)
    rescate = os.path.join(rutaprovincia, "rescates")
    # Take only the clean files. Directory parts are escaped so glob
    # metacharacters in the install path are matched literally, and the
    # result is sorted to make the processing order deterministic.
    base = sorted(glob.glob(os.path.join(glob.escape(rutaprovincia), "*" + SUFIJO_LIMPIO)))
    for archivo in base:
        nombre = os.path.basename(archivo)
        # Extract the station code; skip files with other naming formats.
        estacion = codigo_estacion(nombre)
        if estacion is None:
            print(f"Skipping file with unexpected name: {nombre}")
            continue
        print(f"Analyzing station: {estacion} from {provincia}")
        # Wildcards around the code catch any year for the selected station.
        # NOTE(review): this is a substring match, so a code that is contained
        # in another station's code would also pick up that station's files.
        rutarescate = os.path.join(
            glob.escape(rescate), f"*{glob.escape(estacion)}*{SUFIJO_LIMPIO}"
        )
        # Sorted so that, when rescue files overlap, the winner does not
        # depend on the order the file system lists them in.
        parche = sorted(glob.glob(rutarescate))
        # Open the base file without the "rescates".
        df_base = leer_tabla(archivo)
        if not parche:
            print("No rescue files found, saving directly as definitive")
            df_fusion = df_base
        else:
            print(f"Fusion started. Found rescue files: {len(parche)}")
            # Base first, rescue files after it, so rescue data takes priority.
            union = [df_base] + [leer_tabla(falta) for falta in parche]
            df_fusion = fusionar_tablas(union)
        # Final save as definitive.
        definitivo = ruta_definitiva(archivo)
        df_fusion.to_csv(definitivo, index=False)
        print(f"Saving: {os.path.basename(definitivo)}")
print("FUSION COMPLETE")