-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathimport_data.py
More file actions
47 lines (35 loc) · 1.58 KB
/
Copy pathimport_data.py
File metadata and controls
47 lines (35 loc) · 1.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
def preprocess_data(file_path: str) -> pd.DataFrame:
    """
    Preprocess the data from a CSV file for machine learning.

    Steps, in order:
        1. Load the CSV and keep only the columns used by the model.
        2. Drop rows whose target ('price') is missing.
        3. Impute missing 'distance' / 'surge_multiplier' values with a
           5-neighbour KNNImputer.
        4. Fill missing 'cab_type' values with the most frequent value.
        5. Scale 'distance' and 'surge_multiplier' with StandardScaler.
        6. Label-encode 'cab_type', and 'product_id' when it holds text.

    NOTE: the imputer, scaler and encoders are fitted on the whole file and
    are local to this function; they are not returned to the caller.

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        pd.DataFrame: Preprocessed dataset with the columns 'price',
        'distance', 'surge_multiplier', 'cab_type' and 'product_id'.

    Raises:
        KeyError: If any of the required columns is absent from the CSV.
    """
    numeric_features = ['distance', 'surge_multiplier']
    columns_to_keep = ['price', *numeric_features, 'cab_type', 'product_id']

    # Load the dataset
    data = pd.read_csv(file_path)

    # Keep only the relevant columns and drop rows where the target variable
    # 'price' is missing. The explicit copy makes the result an independent
    # frame, so the column assignments below cannot trigger
    # SettingWithCopyWarning or write into a view of the loaded frame.
    data = data[columns_to_keep].dropna(subset=['price']).copy()

    # Apply KNN imputation for missing numerical input features
    imputer = KNNImputer(n_neighbors=5)
    data[numeric_features] = imputer.fit_transform(data[numeric_features])

    # Handle missing 'cab_type' by filling with the most frequent value
    if data['cab_type'].isnull().any():
        most_frequent_cab_type = data['cab_type'].mode()[0]
        data['cab_type'] = data['cab_type'].fillna(most_frequent_cab_type)

    # Scale numerical features (distance, surge_multiplier)
    scaler = StandardScaler()
    data[numeric_features] = scaler.fit_transform(data[numeric_features])

    # Encode the categorical 'cab_type' feature. Each column gets its own
    # encoder so one fit never overwrites the classes learned by another.
    data['cab_type'] = LabelEncoder().fit_transform(data['cab_type'])

    # Encode 'product_id' only when it is textual. Checking for both object
    # and string dtypes covers pandas' dedicated string dtypes, which a plain
    # `dtype == 'object'` comparison would miss.
    product_id = data['product_id']
    if (pd.api.types.is_object_dtype(product_id)
            or pd.api.types.is_string_dtype(product_id)):
        data['product_id'] = LabelEncoder().fit_transform(product_id)

    return data