Machine-Learning-Project/import_data.py at main · rmunjewar/Machine-Learning-Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

def preprocess_data(file_path):
    """
    Preprocess the data from a CSV file for machine learning.

    The pipeline keeps five columns, drops rows without a 'price', imputes
    and standardizes the numeric features, and label-encodes the categorical
    ones.

    Args:
        file_path (str): Path to the CSV file. The file must contain the
            columns 'price', 'distance', 'surge_multiplier', 'cab_type' and
            'product_id'.

    Returns:
        pd.DataFrame: Preprocessed dataset with those five columns, in that
        order. 'distance' and 'surge_multiplier' are KNN-imputed and then
        standardized, 'cab_type' is integer-encoded, and 'product_id' is
        integer-encoded when it holds text.

    Raises:
        KeyError: If any of the required columns is missing from the file.
    """
    numeric_features = ['distance', 'surge_multiplier']
    columns_to_keep = ['price'] + numeric_features + ['cab_type', 'product_id']

    # Load the dataset
    data = pd.read_csv(file_path)

    # Keep only the relevant columns and drop rows where the target variable
    # 'price' is missing. The explicit copy guarantees that the column
    # assignments below write to a frame we own instead of a slice of the
    # loaded one (the situation SettingWithCopyWarning complains about).
    data = data[columns_to_keep].dropna(subset=['price']).copy()

    # Apply KNN imputation for missing input features, then scale the
    # imputed values (StandardScaler cannot be fitted before the gaps are
    # filled in a meaningful way, so the order matters).
    imputed = KNNImputer(n_neighbors=5).fit_transform(data[numeric_features])
    data[numeric_features] = StandardScaler().fit_transform(imputed)

    # Handle missing 'cab_type' by filling with the most frequent value
    if data['cab_type'].isnull().any():
        most_frequent_cab_type = data['cab_type'].mode()[0]
        data['cab_type'] = data['cab_type'].fillna(most_frequent_cab_type)

    # Encode the categorical 'cab_type' feature. Each column gets its own
    # encoder so that fitting one never overwrites the mapping of another.
    data['cab_type'] = LabelEncoder().fit_transform(data['cab_type'])

    # Encode 'product_id' only when it holds text. Checking for both the
    # legacy object dtype and pandas' dedicated string dtypes keeps this
    # working when read_csv infers a string dtype instead of 'object';
    # a plain `dtype == 'object'` test would silently skip the encoding.
    product_id = data['product_id']
    holds_text = (
        pd.api.types.is_object_dtype(product_id)
        or pd.api.types.is_string_dtype(product_id)
    )
    if holds_text:
        data['product_id'] = LabelEncoder().fit_transform(product_id)

    return data