-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathscript.py
More file actions
49 lines (37 loc) · 2.13 KB
/
script.py
File metadata and controls
49 lines (37 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import pandas as pd
import os
ORIGINAL_DATA_FOLDER = "original_datasets/"

# Build the full path of each raw input CSV inside the source folder.
locations_file, surveys_file, observations_file = (
    os.path.join(ORIGINAL_DATA_FOLDER, csv_name)
    for csv_name in (
        "46_bird_survey_locations.csv",
        "46_bird_surveys.csv",
        "46_bird_observations.csv",
    )
)

# Read the three raw tables into DataFrames (locations, surveys, observations).
locations_df, surveys_df, observations_df = (
    pd.read_csv(csv_path)
    for csv_path in (locations_file, surveys_file, observations_file)
)
### DATA CLEANING ###
# Bird survey locations:
#   - coordinates are coerced to numbers and dates to datetimes; values that
#     cannot be parsed become NaN / NaT because of errors='coerce'
#   - rows left without a latitude or a longitude are then discarded
locations_df = locations_df.assign(
    lat=pd.to_numeric(locations_df['lat'], errors='coerce'),
    long=pd.to_numeric(locations_df['long'], errors='coerce'),
    begin_date=pd.to_datetime(locations_df['begin_date'], errors='coerce'),
    end_date=pd.to_datetime(locations_df['end_date'], errors='coerce'),
).dropna(subset=['lat', 'long'])
# Bird surveys, cleaned in one pipeline:
#   1. parse the survey date, and reduce start/end timestamps to time-of-day
#   2. fill gaps: missing wind speed becomes 0, missing air temperature
#      becomes the median of the recorded air temperatures
#   3. drop rows that are exact duplicates
surveys_df = (
    surveys_df
    .assign(
        survey_date=pd.to_datetime(surveys_df['survey_date'], errors='coerce'),
        time_start=pd.to_datetime(surveys_df['time_start'], errors='coerce').dt.time,
        time_end=pd.to_datetime(surveys_df['time_end'], errors='coerce').dt.time,
    )
    .fillna({"wind_speed": 0, "air_temp": surveys_df["air_temp"].median()})
    .drop_duplicates()
)
# Clean Bird Observations Data
observations_df['survey_date'] = pd.to_datetime(observations_df['survey_date'], errors='coerce')
# Counts that are missing or cannot be parsed as numbers default to 1.
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: that chained in-place
# form operates on a temporary object and does not update the DataFrame
# under pandas Copy-on-Write.
observations_df['bird_count'] = pd.to_numeric(
    observations_df['bird_count'], errors='coerce'
).fillna(1)
# An observation without a species name is dropped.
observations_df = observations_df.dropna(subset=['common_name'])
# Cleaned Data
CLEAN_DATA_FOLDER = "cleaned_datasets/"
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs(CLEAN_DATA_FOLDER, exist_ok=True)
# Write each cleaned table without the DataFrame index column.
locations_df.to_csv(os.path.join(CLEAN_DATA_FOLDER, "cleaned_bird_survey_locations.csv"), index=False)
surveys_df.to_csv(os.path.join(CLEAN_DATA_FOLDER, "cleaned_bird_surveys.csv"), index=False)
observations_df.to_csv(os.path.join(CLEAN_DATA_FOLDER, "cleaned_bird_observations.csv"), index=False)