From 05e16969607e6a1c8422e411c766e981b5e762d0 Mon Sep 17 00:00:00 2001
From: anushka1511 <anushka.clg15@gmail.com>
Date: Mon, 24 Feb 2025 13:16:28 +0530
Subject: [PATCH 1/2] add house price prediction model to machine learning

---
 .../house-price-prediction.md                 | 83 +++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 docs/machine-learning/house-price-prediction.md

diff --git a/docs/machine-learning/house-price-prediction.md b/docs/machine-learning/house-price-prediction.md
new file mode 100644
index 0000000..44eaaf8
--- /dev/null
+++ b/docs/machine-learning/house-price-prediction.md
@@ -0,0 +1,83 @@
+Aim: To predict housing prices using Linear Regression
+Dataset: https://www.kaggle.com/datasets/ashydv/housing-dataset/data
+
+#importing necessary libraries
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+%matplotlib inline
+#read csv file
+HouseDF = pd.read_csv("C:/Users/HP/Downloads/Housing.csv")
+
+#display first 5 entries of dataset
+HouseDF.head()
+
+HouseDF.info()
+
+HouseDF.describe()
+
+#display dataset columns
+HouseDF.columns
+
+#display data types of dataset columns
+print(HouseDF.dtypes)
+
+# EDA
+sns.pairplot(HouseDF)
+
+sns.distplot(HouseDF['price'])
+
+#heatmap if converting non numeric values
+HouseDF_encoded = HouseDF.copy()
+HouseDF_encoded = pd.get_dummies(HouseDF_encoded, drop_first=True)
+sns.heatmap(HouseDF_encoded.corr(), annot=True)
+
+#heatmap if dropping non numeric values
+drop non numeric columsn
+sns.heatmap(HouseDF_numeric.corr(), annot=True)
+
+#if dropping non numerical values
+# Selecting numerical features as independent variables (X)
+X = HouseDF[['area', 'bedrooms', 'bathrooms', 'stories', 'parking']]
+
+# Selecting the target variable (y)
+y = HouseDF['price']
+
+#if using the non numerical values
+# Encoding categorical variables
+HouseDF_encoded = pd.get_dummies(HouseDF, drop_first=True)
+
+# Selecting independent variables (X) - all numeric columns after encoding
+X = HouseDF_encoded.drop(columns=['price'])
+
+# Selecting the target variable (y)
+y = HouseDF_encoded['price']
+
+from sklearn.model_selection import train_test_split
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)
+
+from sklearn.linear_model import LinearRegression
+
+lm = LinearRegression()
+
+lm.fit(X_train,y_train)
+
+print(lm.intercept_)
+
+
+coeff_df = pd.DataFrame(lm.coef_,X.columns,columns=['Coefficient'])
+coeff_df
+predictions = lm.predict(X_test)
+plt.scatter(y_test,predictions)
+sns.distplot((y_test-predictions),bins=50);
+from sklearn import metrics
+
+print('MAE:', metrics.mean_absolute_error(y_test, predictions))
+print('MSE:', metrics.mean_squared_error(y_test, predictions))
+print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
+
+#lower the errors, better the prediction model
+

From 3f7194f43c328aa2ebad97d2c1974fd1da54624a Mon Sep 17 00:00:00 2001
From: anushka1511 <anushka.clg15@gmail.com>
Date: Fri, 28 Feb 2025 21:47:21 +0530
Subject: [PATCH 2/2] These changes are in response to PR comments

---
 .../house-price-prediction.md                 | 65 ++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

diff --git a/docs/machine-learning/house-price-prediction.md b/docs/machine-learning/house-price-prediction.md
index 44eaaf8..c93ad44 100644
--- a/docs/machine-learning/house-price-prediction.md
+++ b/docs/machine-learning/house-price-prediction.md
@@ -1,6 +1,31 @@
 Aim: To predict housing prices using Linear Regression
 Dataset: https://www.kaggle.com/datasets/ashydv/housing-dataset/data
 
+Tech Stack
+
+
+
+Language: Python
+
+Libraries/Frameworks: pandas, numpy, seaborn, matplotlib, sklearn
+
+Description: The project aims at predicting prices of houses based on features extracted from a real dataset set in Delhi, using Linear Regression. It performs basic Exploratory Data Analysis (EDA) in order to understand data trends.
+
+This project can be a helpful tool for real estate decision-making by estimating property values, and helping market participants understand trends.
+
+The approach of the project required selection and understanding the dataset foremost, visulaising the data and selecting releavnt and useful features, upon which a Linear Regression model was built and evaluated using metrics such as MAE, MSE, and RMSE.
+Project Workflow and Basic code explanation:
+Step 1 - Data Cleaning
+
+Step 2 - Feature Engineering
+
+Step 3 - Model Selection
+
+Step 4 - Model Training
+
+Step 5 - Evaluation
+
+Step 6 - Deployment
 #importing necessary libraries
 import pandas as pd
 import numpy as np
@@ -14,6 +39,7 @@ HouseDF = pd.read_csv("C:/Users/HP/Downloads/Housing.csv")
 #display first 5 entries of dataset
 HouseDF.head()
 
+#dataset overview and feature details
 HouseDF.info()
 
 HouseDF.describe()
@@ -27,6 +53,7 @@ print(HouseDF.dtypes)
 # EDA
 sns.pairplot(HouseDF)
 
+# pair plots to visualise features' relations
 sns.distplot(HouseDF['price'])
 
 #heatmap if converting non numeric values
@@ -55,29 +82,65 @@ X = HouseDF_encoded.drop(columns=['price'])
 # Selecting the target variable (y)
 y = HouseDF_encoded['price']
 
+
 from sklearn.model_selection import train_test_split
 
+# splitting dataset into training (60%) and test (40%) sets 
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)
 
+# import linear regression model form scikit learn
 from sklearn.linear_model import LinearRegression
 
+# initialising the model
 lm = LinearRegression()
 
+# training model using the training data
 lm.fit(X_train,y_train)
 
 print(lm.intercept_)
 
-
+# display feature coefficients
 coeff_df = pd.DataFrame(lm.coef_,X.columns,columns=['Coefficient'])
 coeff_df
 predictions = lm.predict(X_test)
+#visualisation of actual vs predicted values
 plt.scatter(y_test,predictions)
 sns.distplot((y_test-predictions),bins=50);
+# import evaluation metrics
 from sklearn import metrics
 
+# display and calculation of the performance metrics
 print('MAE:', metrics.mean_absolute_error(y_test, predictions))
 print('MSE:', metrics.mean_squared_error(y_test, predictions))
 print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
 
 #lower the errors, better the prediction model
+Project Tradeoffs
+
+Accuracy vs. Computation Time:
+More features could improve accuracy but increase computational cost, addressed by selecting only highly correlated features.
+
+Model Simplicity vs. Performance
+Linear Regression for better explainability.
+Screenshots of EDA
+
+![Screenshot 2025-02-28 203414.png](<attachment:Screenshot 2025-02-28 203414.png>)
+
+![Screenshot 2025-02-28 203431.png](<attachment:Screenshot 2025-02-28 203431.png>)
+
+![Screenshot 2025-02-28 203437.png](<attachment:Screenshot 2025-02-28 203437.png>)
+Evaluation Metrics for Linear Regression:
+
+MAE: 798540.2157757834
+
+MSE: 1233466174021.77
+
+RMSE: 1110615.2232081864
+Conclusion
+
+House price is strongly influenced by area, stories, and bathrooms.
+
+Features like airconditioning and parking also contribute significantly.
+Use Cases
 
+Real Estate Pricing, Market Trend Analysis