-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcheckpoint_1.py
More file actions
172 lines (132 loc) · 6.31 KB
/
checkpoint_1.py
File metadata and controls
172 lines (132 loc) · 6.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# --- Imports and data loading ---
# (previously streamlit was imported three times and pandas twice; one
# import of each is sufficient for the whole script)
import io  # capture df.info() text output for display in the app

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st

# Load the dataset once at startup; the rest of the script mutates `df` in place.
# NOTE(review): assumes 'output.csv' sits in the working directory — confirm deploy path.
df = pd.read_csv('output.csv')

# Show the raw data so the user can inspect it before any processing.
st.dataframe(df)
# --- App title and Step 2: Data Exploration ---
# (dead commented-out Step 1 uploader code removed)
st.title("Machine Learning Workflow with Streamlit")

st.header("Step 2: Data Exploration")

# df.info() writes to a stream instead of returning a string, so capture
# its output in a StringIO buffer and render it as preformatted text.
st.write("Data Information:")
buffer = io.StringIO()
df.info(buf=buffer)
st.text(buffer.getvalue())

st.write("Summary Statistics:")
st.write(df.describe())
# --- Step 3: Data Cleaning ---
st.header("Step 3: Data Cleaning")

# Impute missing values: mode for categorical (object) columns, mean for
# numeric ones. Assign the result back rather than calling
# fillna(..., inplace=True) on a column selection — that is chained
# assignment, deprecated in pandas 2.x and a no-op under copy-on-write.
st.write("Handling Missing Values...")
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].fillna(df[column].mode()[0])
    else:
        df[column] = df[column].fillna(df[column].mean())
st.write("Missing values handled.")

# Drop exact duplicate rows in place.
st.write("Removing Duplicates...")
df.drop_duplicates(inplace=True)
st.write("Duplicates removed.")
# --- Encode categorical features ---
# (duplicate `import pandas as pd` removed — pandas is already in scope)
# LabelEncoder maps each distinct string to an integer code; keeping one
# fitted encoder per column allows inverse_transform later if needed.
from sklearn.preprocessing import LabelEncoder

st.write("Encoding Categorical Features...")
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])
st.write("Categorical features encoded.")
st.write(df.head())
# --- Step 4: Train the Model — sidebar variable selection ---
st.header("Step 4: Train the Model")

# Author credit in the sidebar.
st.sidebar.markdown('<h5 style="color: black;"> Author : Mactar Sarr </h5>', unsafe_allow_html=True)
st.sidebar.markdown('<h1 style="color: blue;">Select One output and at least one input Variable</h1>', unsafe_allow_html=True)

# Offer every dataframe column as a candidate variable.
column_names = df.columns.tolist()

# Target variable (a selectbox always yields a value, so in practice only
# the multiselect below can be empty).
output_variable_model = st.sidebar.selectbox('Select One output Variable', column_names)

# Default input features, filtered to those actually present in the data
# so a differently-shaped CSV does not crash the multiselect.
potential_defaults = [col for col in ['R_450', 'R_550', 'R_650', 'R_720', 'R_750', 'R_800'] if col in column_names]
input_variables_model = st.sidebar.multiselect('Select at least one input Variable', column_names, default=potential_defaults)

if not output_variable_model or not input_variables_model:
    st.warning('Select One output and at least one input Variable to start.')
    # Halt the script here instead of falling through — previously execution
    # continued and relied on repeated guards further down.
    st.stop()

# Percentage of rows held out for testing (converted to a fraction at split time).
test_data_rate = st.sidebar.slider('Select the rate of test data (%)', 0, 100, 20, 1)
# --- Train the model ---
# NOTE: this entire section previously appeared twice verbatim (imports,
# split, and fit), training the Random Forest a second time for no benefit;
# it has been deduplicated into a single pass.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

if input_variables_model:  # proceed only once at least one feature is selected
    # Features (X) and target (y) chosen in the sidebar.
    X_model = df[input_variables_model]
    y_model = df[output_variable_model]

    # Hold out the user-chosen percentage for testing; fixed random_state
    # keeps the split reproducible across reruns.
    X_train, X_test, y_train, y_test = train_test_split(
        X_model, y_model, test_size=test_data_rate / 100, random_state=42
    )

    # Fit the Random Forest regressor on the training split.
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
else:
    st.warning('Please select at least one input variable.')  # feedback to the user
# --- Use the trained model for prediction ---
if input_variables_model:  # `model` only exists when features were selected
    st.title('Use the Model for Prediction')
    st.markdown('<h4 style="color: black;"> Use Sidebar menu to select the values of input variables to predict the target variable. </h4>', unsafe_allow_html=True)

    st.sidebar.markdown('<h2 style="color: blue;"> Select the values of input variables to predict the target variable</h2>', unsafe_allow_html=True)

    # One slider per selected feature, bounded by the observed data range
    # and defaulting to the column mean.
    user_input_prediction = {}
    for column in input_variables_model:
        user_input_prediction[column] = st.sidebar.slider(
            f'Select {column}',
            float(df[column].min()),
            float(df[column].max()),
            float(df[column].mean()),
        )

    # A single-row DataFrame preserves the feature names/order the model
    # was fitted with.
    prediction = model.predict(pd.DataFrame([user_input_prediction]))
    st.subheader('Prediction')
    st.write(f'The predicted {output_variable_model} value is: {prediction[0]:.5f}')

    # Bar chart of the single predicted value. Draw explicitly on `ax`
    # (fix: the original relied on the implicit "current axes", which is
    # fragile when other figures exist) so the figure passed to st.pyplot
    # is guaranteed to contain the plot.
    st.subheader('Predicted Output Chart')
    prediction_data = pd.DataFrame({output_variable_model: [prediction[0]]})
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(data=prediction_data, palette=['orange'], ax=ax)
    ax.set_title(f'Predicted {output_variable_model} Value')
    ax.set_ylabel('Value')
    st.pyplot(fig)
else:
    st.warning("Please select input variables first.")  # prompt the user to pick features