Skip to content
Binary file added .DS_Store
Binary file not shown.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ venv/
ENV/
env.bak/
venv.bak/
airlife_env/

# Spyder project settings
.spyderproject
Expand Down Expand Up @@ -205,3 +206,6 @@ cython_debug/
marimo/_static/
marimo/_lsp/
__marimo__/

# DB configuration file
config.json
22 changes: 7 additions & 15 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,36 +23,28 @@ def main():
print("\n=== EXTRACTION ===")
print("📥 Extracting data from sources...")

# TODO: Call the extraction functions
# airports = extract_airports()
# flights = extract_flights()

# Uncomment the lines above once you've implemented the functions
print("⚠️ Extraction functions not yet implemented")
return
airports = extract_airports()
flights = extract_flights()

# Step 2: Transform data
print("\n=== TRANSFORMATION ===")
print("🔄 Cleaning and transforming data...")

# TODO: Call the transformation functions
# clean_airports_data = clean_airports(airports)
# clean_flights_data = clean_flights(flights)
# final_airports, final_flights = combine_data(clean_airports_data, clean_flights_data)
clean_airports_data = clean_airports(airports)
clean_flights_data = clean_flights(flights)
final_airports, final_flights = combine_data(clean_airports_data, clean_flights_data)

# Step 3: Load data
print("\n=== LOADING ===")
print("💾 Loading data to database...")

# TODO: Call the loading function
# load_to_database(final_airports, final_flights)
load_to_database(final_airports, final_flights)

# Step 4: Verify everything worked
print("\n=== VERIFICATION ===")
print("✅ Verifying data was loaded correctly...")

# TODO: Call the verification function
# verify_data()
verify_data()

print("\n🎉 ETL Pipeline completed!")
print("=" * 50)
Expand Down
25 changes: 16 additions & 9 deletions src/extract_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@ def extract_airports():
try:
# TODO: Read the airports.csv file using pandas
# The file is located at: data/airports.csv
# For now, return an empty DataFrame
df = pd.DataFrame()

# Hint: Use pd.read_csv()
df=pd.read_csv('data/airports.csv')

# TODO: Print how many airports were loaded
# Example: print(f"Loaded {len(df)} airports")

print("⚠️ Airport extraction not yet implemented")
count_invalid = df[df['name'].str.strip().str.contains("Invalid", case=False, na=False)].shape[0]
valid_airports = len(df)-count_invalid
print(f"Loaded {valid_airports} valid airports")
return df

except Exception as e:
Expand Down Expand Up @@ -61,22 +62,28 @@ def extract_flights():
print("Making API request... (this may take a few seconds)")

# TODO: Make the API request using requests.get()

response = requests.get(url, params=params, timeout=10)

# TODO: Check if the response is successful
if response.status_code != 200:
print(f"❌ API request failed: {response.status_code}")
return pd.DataFrame()

# TODO: Get the JSON data from the response
data = response.json()

# TODO: Extract the 'states' data from the JSON
# The API returns: {"time": 123456789, "states": [[aircraft_data], [aircraft_data], ...]}
states = data['states'] if data['states'] else []

# TODO: Convert to DataFrame
df = pd.DataFrame(states)

# TODO: Print how many flights were found
# Example: print(f"Found {len(df)} active flights")
print(f"Found {len(df)} active flights")

# For now, return empty DataFrame
print("⚠️ Flight extraction not yet implemented")
return pd.DataFrame()
return df

except requests.exceptions.RequestException as e:
print(f"❌ Network error fetching flight data: {e}")
Expand Down
94 changes: 41 additions & 53 deletions src/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,13 @@

import pandas as pd
from sqlalchemy import create_engine, text
import psycopg2
import json

# Database connection configuration
# TODO: Update these values with your actual database credentials
DATABASE_CONFIG = {
'username': 'your_username',
'password': 'your_password',
'host': 'localhost',
'port': '5432',
'database': 'airlife_db'
}

with open('config.json', 'r') as f:
DATABASE_CONFIG = json.load(f)


def get_connection_string():
"""Build PostgreSQL connection string"""
Expand All @@ -35,42 +31,33 @@ def load_to_database(airports_df, flights_df):
"""
print("💾 Loading data to PostgreSQL database...")

# TODO: Create connection string using the function above
# connection_string = get_connection_string()
connection_string = get_connection_string()

try:
# TODO: Create SQLAlchemy engine

print("⚠️ Database loading not yet implemented")
return

# TODO: Load airports data
# Use pandas to_sql method to insert data
#
# Parameters explanation:
# - 'airports': table name in database
# - engine: database connection
# - if_exists='replace': replace table if it exists (use 'append' to add to existing data)
# - index=False: don't include pandas row index as a column
engine = create_engine(connection_string)

# Load airports data
airports_df.to_sql('airports', engine, if_exists='replace', index=False)

# TODO: Load flights data (only if not empty)
# Check if flights_df is not empty before loading
# flights_df.to_sql('flights', engine, if_exists='replace', index=False)
if not flights_df.empty:
flights_df.to_sql('flights', engine, if_exists='replace', index=False)

# TODO: Print loading statistics
# print(f"✅ Loaded {len(airports_df)} airports to database")
# if not flights_df.empty:
# print(f"✅ Loaded {len(flights_df)} flights to database")
# else:
# print("ℹ️ No flight data to load")
# Print loading statistics
print(f"✅ Loaded {len(airports_df)} airports to database")
if not flights_df.empty:
print(f"✅ Loaded {len(flights_df)} flights to database")
else:
print("ℹ️ No flight data to load")

except Exception as e:
print(f"❌ Error loading data to database: {e}")
print("💡 Make sure:")
print(" - PostgreSQL is running")
print(" - Database 'airlife_db' exists")
print(" - Username and password are correct")
print(" - Tables are created (run database_setup.sql)")
print(" - Tables are created (if not, run database_setup.sql)")


def verify_data():
"""
Expand All @@ -81,27 +68,28 @@ def verify_data():
connection_string = get_connection_string()

try:
# TODO: Create SQLAlchemy engine
# engine = create_engine(connection_string)

print("⚠️ Data verification not yet implemented")
return

# TODO: Count airports in database
# print(f"📊 Airports in database: {airports_count.iloc[0]['count']}")

# TODO: Count flights in database
# print(f"📊 Flights in database: {flights_count.iloc[0]['count']}")

# TODO: Show sample airport data
# print("\n📋 Sample airports:")
# print(sample_airports.to_string(index=False))
engine = create_engine(connection_string)

# TODO: Show sample flight data (if any exists)
# sample_flights = pd.read_sql("SELECT callsign, origin_country, altitude FROM flights LIMIT 3", engine)
# if not sample_flights.empty:
# print("\n✈️ Sample flights:")
# print(sample_flights.to_string(index=False))
# Count airports in database
airports_count = pd.read_sql("SELECT COUNT(*) as count FROM airports", engine)
print(f"📊 Airports in database: {airports_count.iloc[0]['count']}")

# Count flights in database
flights_count = pd.read_sql("SELECT COUNT(*) as count FROM flights", engine)
print(f"📊 Flights in database: {flights_count.iloc[0]['count']}")

# Show sample airport data
sample_airports = pd.read_sql("SELECT name, city, country FROM airports LIMIT 3", engine)
print("\n📋 Sample airports:")
print('\tNOTE: This is a non-exhaustive view of the table; some columns have been omitted for clarity.')
print(sample_airports.to_string(index=False))

# Show sample flight data (if any exists)
sample_flights = pd.read_sql("SELECT callsign, origin_country, altitude FROM flights LIMIT 3", engine)
if not sample_flights.empty:
print("\n✈️ Sample flights:")
print('\tNOTE: This is a non-exhaustive view of the table; some columns have been omitted for clarity.')
print(sample_flights.to_string(index=False))

except Exception as e:
print(f"❌ Error verifying data: {e}")
Expand Down
26 changes: 21 additions & 5 deletions src/transform_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,23 @@ def clean_airports(airports_df):
df = airports_df.copy()

# TODO: Remove rows with missing latitude or longitude
# df = df.dropna(subset=['latitude', 'longitude'])
# Hint: Use .dropna(subset=['latitude', 'longitude'])
df = df.dropna(subset=['latitude', 'longitude'])

# TODO: Remove airports with invalid coordinates
# Latitude should be between -90 and 90
# Longitude should be between -180 and 180
df = df[(df['latitude'] >= -90) & (df['latitude'] <= 90)]
df = df[(df['longitude'] >= -180) & (df['longitude'] <= 180)]

# TODO: Handle missing IATA codes (replace empty strings, 'N', or '\N' with None)
df['iata_code'] = df['iata_code'].replace(['', 'N', '\\N'], None)

# TODO: Convert altitude to numeric (handle non-numeric values)
df['altitude'] = pd.to_numeric(df['altitude'], errors='coerce')

# TODO: Print how many airports remain after cleaning
# print(f"After cleaning: {len(df)} airports remain")
print(f"After cleaning: {len(df)} airports remain")

print("⚠️ Airport cleaning not yet implemented")
return df
Expand Down Expand Up @@ -78,26 +83,37 @@ def clean_flights(flights_df):
'on_ground', # Boolean: is aircraft on ground
'velocity', # Ground speed in m/s
'true_track', # Aircraft heading in degrees
'vertical_rate' # Vertical speed in m/s
'vertical_rate', # Vertical speed in m/s
"sensors", # Serial numbers of sensors
"geo_altitude", # Geometric altitude
"squawk", # Transponder code
"spi", # Special purpose indicator
"position_source" # Origin of position (0=ADS-B, 1=ASTERIX, 2=MLAT)
]

# Make a copy to avoid modifying the original
df = flights_df.copy()

# TODO: Assign column names to the DataFrame
df.columns = expected_columns

# TODO: Remove flights with missing coordinates
df = df.dropna(subset=['longitude', 'latitude'])

# TODO: Convert altitude from meters to feet (multiply by 3.28084)
# This makes it easier to understand for aviation
df['altitude'] = df['altitude'] * 3.28084

# TODO: Remove flights with invalid coordinates
# Same coordinate bounds as airports
df = df[(df['latitude'] >= -90) & (df['latitude'] <= 90)]
df = df[(df['longitude'] >= -180) & (df['longitude'] <= 180)]

# TODO: Clean callsign (remove extra whitespace)
df['callsign'] = df['callsign'].str.strip()

# TODO: Print how many flights remain after cleaning
# print(f"After cleaning: {len(df)} flights remain")
print(f"After cleaning: {len(df)} flights remain")

print("⚠️ Flight cleaning not yet implemented")
return df
Expand Down Expand Up @@ -160,7 +176,7 @@ def validate_data_quality(df, data_type):

# Check coordinate bounds if applicable
if 'latitude' in df.columns and 'longitude' in df.columns:
invalid_coords = (
invalid_coords = (
(df['latitude'] < -90) | (df['latitude'] > 90) |
(df['longitude'] < -180) | (df['longitude'] > 180)
).sum()
Expand Down