From d0b67c4dd0d9b738b05547c42561be7f73264366 Mon Sep 17 00:00:00 2001
From: mehtataral <75374656+mehtataral@users.noreply.github.com>
Date: Wed, 24 May 2023 19:23:53 +0530
Subject: [PATCH] Add files via upload

---
 5thRegressionMultiReg (1).py          |  70 ++++++++++++++++++
 5thRegressionMultiReg.py              |  77 ++++++++++++++++++++
 Mumbai_regression_housePredicvtion.py |  83 +++++++++++++++++++++
 house_pred_duplicate.csv              |  11 +++
 labelencoder_linearregression.py      |  99 +++++++++++++++++++++++++
 lassoReg.py                           |  67 +++++++++++++++++
 linearRegresssion.py                  |  65 +++++++++++++++++
 linear_reg_train.py                   |  99 +++++++++++++++++++++++++
 linear_regression_final.py            | 100 ++++++++++++++++++++++++++
 multi linear regression.py            |  63 ++++++++++++++++
 multi_house_para.csv                  |   6 ++
 11 files changed, 740 insertions(+)
 create mode 100644 5thRegressionMultiReg (1).py
 create mode 100644 5thRegressionMultiReg.py
 create mode 100644 Mumbai_regression_housePredicvtion.py
 create mode 100644 house_pred_duplicate.csv
 create mode 100644 labelencoder_linearregression.py
 create mode 100644 lassoReg.py
 create mode 100644 linearRegresssion.py
 create mode 100644 linear_reg_train.py
 create mode 100644 linear_regression_final.py
 create mode 100644 multi linear regression.py
 create mode 100644 multi_house_para.csv

diff --git a/5thRegressionMultiReg (1).py b/5thRegressionMultiReg (1).py
new file mode 100644
index 0000000..544a833
--- /dev/null
+++ b/5thRegressionMultiReg (1).py
@@ -0,0 +1,70 @@
+import pandas as pd
+df=pd.read_csv("D:/desktop/AI Training Content/multireg.csv") #pass the path: pandas then opens and closes the file itself
+#Script: fill missing values, min-max scale, then compare LinearRegression and RandomForestRegressor on multireg.csv
+x=df.iloc[:,:-1]
+y=df.iloc[:,-1]
+
+
+import numpy as np
+x=np.array(x)
+y=np.array(y)
+
+
+from sklearn.impute import SimpleImputer
+im=SimpleImputer()
+x=im.fit_transform(x) #rule: min 2D array
+
+
+from sklearn.preprocessing import MinMaxScaler
+sc=MinMaxScaler()
+x=sc.fit_transform(x)
+
+from sklearn.model_selection import train_test_split
+xtrain,xtest,ytrain,ytest=train_test_split(x,y, test_size=0.2)
+
+
+#now select the algo: as target is of continuous type use regression algo: Linear Regression, RandomForest
+# it works on equation of a line: y=bx+a
+
+#Types: Simple linear regression {1 input column: 1 output column} 2) Multi linear regression{more than 1 input and 1 output}.
+
+from sklearn.linear_model import LinearRegression
+lr=LinearRegression()
+lr.fit(xtrain,ytrain)
+pred=lr.predict(xtest)
+
+#for regression the evaluation method is completely different.
+#1) visualization
+
+xaxis=np.linspace(1,len(pred),len(pred))
+import matplotlib.pyplot as plt
+plt.plot(xaxis,pred,color='red')
+plt.plot(xaxis,ytest,color='blue')
+plt.show()
+
+#2) how good is it? RMSE expresses the error as one number
+from sklearn.metrics import mean_squared_error
+res=np.sqrt(mean_squared_error(ytest,pred)) #sklearn's argument order is (y_true, y_pred)
+
+
+
+from sklearn.ensemble import RandomForestRegressor
+rf=RandomForestRegressor()
+rf.fit(xtrain,ytrain)
+predrf=rf.predict(xtest)
+
+#for regression the evaluation method is completely different.
+#1) visualization
+
+xaxis=np.linspace(1,len(predrf),len(predrf))
+import matplotlib.pyplot as plt
+plt.plot(xaxis,predrf,color='red')
+plt.plot(xaxis,ytest,color='blue')
+plt.show()
+
+#2) how good is it? RMSE expresses the error as one number
+from sklearn.metrics import mean_squared_error
+resrf=np.sqrt(mean_squared_error(ytest,predrf)) #sklearn's argument order is (y_true, y_pred)
+
+
+
diff --git a/5thRegressionMultiReg.py b/5thRegressionMultiReg.py
new file mode 100644
index 0000000..9219379
--- /dev/null
+++ b/5thRegressionMultiReg.py
@@ -0,0 +1,77 @@
+import pandas as pd
+# df=pd.read_csv(open("D:/desktop/AI Training Content/multireg.csv","rb"))
+df=pd.read_csv("C:/Users/Admin/Downloads/Churn_Modelling.csv") #pass the path: pandas then opens and closes the file itself
+#Script: label-encode, impute and scale columns 3..-2 of Churn_Modelling.csv, then fit two regressors on the last column
+
+# x=df.iloc[:,:-1]
+# y=df.iloc[:,-1]
+x=df.iloc[:,3:-1]
+y=df.iloc[:,-1]
+
+import numpy as np
+x=np.array(x)
+y=np.array(y)
+
+from sklearn.preprocessing import LabelEncoder
+lbl=LabelEncoder()
+x[:,1]=lbl.fit_transform(x[:,1])
+x[:,2]=lbl.fit_transform(x[:,2])
+
+from sklearn.impute import SimpleImputer
+im=SimpleImputer()
+x=im.fit_transform(x) #rule: min 2D array
+
+
+from sklearn.preprocessing import MinMaxScaler
+sc=MinMaxScaler()
+x=sc.fit_transform(x)
+
+from sklearn.model_selection import train_test_split
+xtrain,xtest,ytrain,ytest=train_test_split(x,y, test_size=0.2)
+
+#NOTE(review): the comment below assumes a continuous target - confirm the last column of Churn_Modelling.csv is one
+#now select the algo: as target is of continuous type use regression algo: Linear Regression, RandomForest
+# it works on equation of a line: y=bx+a
+
+#Types: Simple linear regression {1 input column: 1 output column} 2) Multi linear regression{more than 1 input and 1 output}.
+
+from sklearn.linear_model import LinearRegression
+lr=LinearRegression()
+lr.fit(xtrain,ytrain)
+pred=lr.predict(xtest)
+
+#for regression the evaluation method is completely different.
+#1) visualization
+
+xaxis=np.linspace(1,len(pred),len(pred))
+import matplotlib.pyplot as plt
+plt.plot(xaxis,pred,color='red')
+plt.plot(xaxis,ytest,color='blue')
+plt.show()
+
+#2) how good is it? RMSE expresses the error as one number
+from sklearn.metrics import mean_squared_error
+res=np.sqrt(mean_squared_error(ytest,pred)) #sklearn's argument order is (y_true, y_pred)
+
+
+
+from sklearn.ensemble import RandomForestRegressor
+rf=RandomForestRegressor()
+rf.fit(xtrain,ytrain)
+predrf=rf.predict(xtest)
+
+#for regression the evaluation method is completely different.
+#1) visualization
+
+xaxis=np.linspace(1,len(predrf),len(predrf))
+import matplotlib.pyplot as plt
+plt.plot(xaxis,predrf,color='red')
+plt.plot(xaxis,ytest,color='blue')
+plt.show()
+
+#2) how good is it? RMSE expresses the error as one number
+from sklearn.metrics import mean_squared_error
+resrf=np.sqrt(mean_squared_error(ytest,predrf)) #sklearn's argument order is (y_true, y_pred)
+
+
+
diff --git a/Mumbai_regression_housePredicvtion.py b/Mumbai_regression_housePredicvtion.py
new file mode 100644
index 0000000..d76941d
--- /dev/null
+++ b/Mumbai_regression_housePredicvtion.py
@@ -0,0 +1,83 @@
+import numpy as np
+import pandas as pd
+#Script: label-encode Location, then compare Linear, Lasso and Ridge regression on Mumbai1.csv (first column is the target)
+
+df =pd.read_csv(r"C:\Users\user\Documents\Taral\DATA_SET\DATA_SET\Regression Data Set\Mumbai1.csv")
+print(df)
+df.info()
+df.drop("Unnamed: 0",axis ="columns",inplace =True)
+#Location is label-encoded only further down, so it is presumably still text here: restrict corr() to numeric columns
+x = df.iloc[:,1:].copy()#input; a copy, so that overwriting x.Location below does not touch df
+print(x)
+x.info()
+x.corr(numeric_only=True)
+y =df.iloc[:,0]#output
+print(y)
+
+corr = x.corr(numeric_only=True)
+#training and testing data
+
+from sklearn.preprocessing import LabelEncoder
+lb = LabelEncoder()
+x.Location = lb.fit_transform(x.Location)
+print(x)
+
+from sklearn.model_selection import train_test_split
+x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=500)
+
+
+print(len(x_train))
+print(len(x_test))
+print(len(y_train))
+print(len(y_test))
+
+
+print(x_train)
+print(x_test)
+print(y_train)
+print(y_test)
+
+
+from sklearn.linear_model import LinearRegression
+lbl =LinearRegression()
+lbl.fit(x_train,y_train)
+lbl.predict(x_test)
+lbl.score(x_test,y_test)
+
+from sklearn.linear_model import Lasso,Ridge
+las1 = Lasso()
+
+las1.fit(x_train,y_train)
+
+las1.predict(x_test)
+las1.score(x_test,y_test)
+
+
+las1 = Lasso(alpha =1000,selection="random")
+
+las1.fit(x_train,y_train)
+
+las1.predict(x_test)
+las1.score(x_test,y_test)
+
+
+rid1 = Ridge()
+
+rid1.fit(x_train,y_train)
+
+predict1=rid1.predict(x_test)
+rid1.score(x_test,y_test)
+
+# dff = pd.DataFrame(predict1)
+# dff
+# dff["actual_value"] =y_test
+# dff
+
+rid2 = Ridge(alpha =50)
+
+rid2.fit(x_train,y_train)
+
+rid2.predict(x_test)
+rid2.score(x_test,y_test)
+rid2.score(x_train,y_train)
+
diff --git a/house_pred_duplicate.csv b/house_pred_duplicate.csv
new file mode 100644
index 0000000..508cda9
--- /dev/null
+++ b/house_pred_duplicate.csv
@@ -0,0 +1,11 @@
+area,price
+3200,61000000
+2600,55000000
+2600,55000000
+3000,56500000
+3200,61000000
+3600,59500000
+4000,76000000
+3000,
+4011,
+2600,55000000
diff --git a/labelencoder_linearregression.py b/labelencoder_linearregression.py
new file mode 100644
index 0000000..5617c9f
--- /dev/null
+++ b/labelencoder_linearregression.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Feb 25 12:06:40 2022
+Label Encoder Multilinear Regression
+
+@author: user
+"""
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+df =pd.read_csv(r"C:\Users\user\Documents\Taral\DATA_SET\DATA_SET\Regression Data Set\Basic Regression Data\multi_house_dummy.csv")
+print(df)
+df = df.reindex(columns=["area","town","price"])
+print(df)
+
+x = df.iloc[:,:-1].copy() #a copy, so that overwriting x.town below does not touch df
+print(x)
+
+y =df.iloc[:,-1:]
+print(y)
+
+plt.scatter(x.area, y)
+plt.show()
+
+#LabelEncoder converts strings into numbers
+from sklearn.preprocessing import LabelEncoder
+lbl =LabelEncoder()
+x.town = lbl.fit_transform(x.town)
+print(x)
+
+from sklearn.linear_model import LinearRegression
+reg = LinearRegression()
+reg.fit(x,y)
+reg.predict(x)
+
+from sklearn.preprocessing import PolynomialFeatures
+from sklearn.linear_model import LinearRegression
+#the polynomial model gets its own name so that reg stays the plain [area, town] model used further down
+pr = PolynomialFeatures(degree =3)
+x_poly = pr.fit_transform(x)
+poly_reg = LinearRegression()
+poly_reg.fit(x_poly,y)
+
+#lasso regression
+#ridge regression
+#polynomial regression --> scatter
+
+print(poly_reg.coef_)
+print(poly_reg.intercept_)
+
+poly_reg.predict(x_poly)
+poly_reg.score(x_poly,y)
+#reg was fitted on two columns, so every sample is [area, town code]; town code 0 is a placeholder - TODO confirm the intended town
+reg.predict([[3600,0]])
+reg.predict([[2600,0]])
+reg.predict([[4600,0]])
+#same area with other town codes (values ordered to match the [area, town] column order)
+reg.predict([[2600,1]])
+reg.predict([[2600,2]])
+reg.score(x,y)
+
+plt.scatter(x.area, y)
+plt.plot(x.area,reg.predict(x),'rD:')
+plt.show()
+
+
+from sklearn.linear_model import Ridge,Lasso
+from sklearn.model_selection import GridSearchCV
+model1=Ridge(alpha=5)
+model1.fit(x,y)
+model1.score(x,y)
+print(model1.coef_)#coefficients
+print(model1.intercept_)
+model1.predict([[2600,0]]) #[area, town code]; town code 0 is a placeholder - TODO confirm
+model1.predict([[3600,0]])
+model1.predict([[4600,0]])
+model1.predict([[2000,0]])
+
+model2=Lasso(alpha=6)
+model2.fit(x,y)
+model2.score(x,y)
+print(model2.coef_)
+print(model2.intercept_)
+
+
+m1=Ridge()
+parameters={'alpha':[1e-15,1e-18,1e-8,1e-4,1e-3,1e-2,1,5,10,20]}
+#NOTE: despite its name, lasso_regressor grid-searches alpha for the Ridge model m1
+lasso_regressor=GridSearchCV(m1,parameters,cv=5)
+lasso_regressor.fit(x,y)
+lasso_regressor.score(x,y)
+
+print(model2.coef_)
+
+# from sklearn.metrics import mean_squared_error
+# mse = mean_squared_error(y,ypred)
+# rmse= np.sqrt(mse)
+#
\ No newline at end of file
diff --git a/lassoReg.py b/lassoReg.py
new file mode 100644
index 0000000..a8ba861
--- /dev/null
+++ b/lassoReg.py
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Feb 25 12:26:46 2022
+
+@author: user
+"""
+
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Feb 10 12:29:51 2020
+LASSO
+@author: Trainer 1
+"""
+x =(1,2,3) #scratch demo: a tuple has no append, so go through a list and convert back
+lst =list(x)
+print(lst)
+lst.append(55)
+lst
+x =tuple(lst)
+import pandas as pd
+import numpy as np
+df=pd.read_csv('Advertising.csv')
+df.head()
+df.drop('SR',axis=1,inplace=True)
+
+from sklearn.linear_model import LinearRegression
+x=df.drop('SALE',axis=1)
+y=pd.DataFrame(df.SALE)
+model1=LinearRegression()
+model1.fit(x,y)
+print(model1.coef_)
+
+
+from sklearn.linear_model import Ridge,Lasso
+from sklearn.model_selection import GridSearchCV
+model2=Ridge(alpha=700)
+model2.fit(x,y)
+print(model2.coef_)
+
+
+model3=Lasso(alpha=6)
+model3.fit(x,y)
+model3.coef_
+m1=Ridge()
+parameters={'alpha':[1e-15,1e-18,1e-8,1e-4,1e-3,1e-2,1,5,10,20]}
+#NOTE: despite its name, lasso_regressor grid-searches alpha for the Ridge model m1
+lasso_regressor=GridSearchCV(m1,parameters,cv=5)
+lasso_regressor.fit(x,y)
+print(model3.coef_)
+print(lasso_regressor.best_params_)
+print(lasso_regressor.best_score_)
+
+#Class notes (kept as comments so that the file stays valid Python):
+# 89,102,88,0 : Lasso and Ridge
+# y = 89*555676 + 0*23454 + ...
+#
+# A higher number of features and more data means more processing time and computation cost.
+#
+# When we say processing time increases: n calculations happen in the backend,
+# for example:
+#   y = m1*x1 + m2*x2 + m3*x3 + m4*x4 + c
+#   m1=9, m2=6, m3=4, m4=2, x1=100, x2=350, x3=55, x4=10
+# What if I have m1=220, m2=300, m4=267, m3=200? Processing time grows, so we need a method to decrease the values. How?
+#   Lasso and Ridge: decrease the values
+#
+#
+# Feature selection: chi-square test
diff --git a/linearRegresssion.py b/linearRegresssion.py
new file mode 100644
index 0000000..18fad53
--- /dev/null
+++ b/linearRegresssion.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Feb 24 17:10:59 2022
+Simple linear regression: predict house price from area.
+@author: user
+"""
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+x ={
+    "area":[2600,3000,3200,3600,4000],
+    "price":[55000000,56500000,61000000,59500000,76000000] #numbers, not strings, so matplotlib plots them on a numeric axis
+    }
+
+print(x)
+df =pd.DataFrame(x)
+print(df)
+# y ={
+# "area":[2600,3000,3200,3600,4000,2800,3400,3800,3900,4900]
+# }
+
+# print(y)
+# df =pd.DataFrame(y)
+# print(df)
+# df.to_csv(r"C:\Users\user\Desktop\DS\DATA_SET\house_pred.csv",index =False)
+# df.to_csv(r"C:\Users\user\Desktop\DS\DATA_SET\house_area.csv",index =False)
+
+
+# df =pd.read_csv(r"C:\Users\user\Desktop\DS\DATA_SET\house_pred.csv")
+# print(df)
+
+x = df.iloc[:,0]
+y = df.iloc[:,-1]
+plt.xlabel("Price")
+plt.ylabel("Area")
+plt.scatter(df.price,df.area,marker="+")
+plt.plot(df.price,df.area)
+plt.show()
+
+from sklearn.linear_model import LinearRegression
+reg =LinearRegression()
+# reg.fit([[x],[y]])
+reg.fit(df[['area']],df.price)
+# reg.fit(x[x['area']],y.price) #not valid ..used xtrain and ytrain
+reg.coef_
+reg.intercept_
+x =reg.predict([[3300]])
+reg.predict([[2600]])
+reg.predict([[4500]])
+
+
+x =reg.predict(df[['area']])#pass only the area column: the model was fitted on area alone, but df also holds price
+df['price'] = x #store the predicted price values in price
+print(df)
+#area on x and price on y for both artists, so that the points and the fitted line share the same axes
+plt.xlabel("Area")
+plt.ylabel("Price")
+plt.scatter(df.area,df.price,marker="+")
+plt.plot(df.area,reg.predict(df[['area']]),color ="green",marker = "o")
+plt.show()
+
+# from sklearn.metrics import accuracy_score
+# acc =accuracy_score((2600),(4500))#not valid ..used xtrain and ytrain
+# print(acc)
diff --git a/linear_reg_train.py b/linear_reg_train.py
new file mode 100644
index 0000000..b2994b2
--- /dev/null
+++ b/linear_reg_train.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Feb 25 12:38:01 2022
+train and test Multilinear Regression
+
+@author: user
+"""
+
+import numpy as np
+import pandas as pd
+
+
+df =pd.read_csv(r"C:\Users\user\Documents\Taral\DATA_SET\DATA_SET\Regression Data Set\Basic Regression Data\multi_house_dummy.csv")
+print(df)
+
+x = df.iloc[:,:-1]
+print(x)
+
+y =df.iloc[:,-1]
+print(y)
+#one-hot encode the town column; the dummy columns replace it in new_df below
+a = pd.get_dummies(x.town)
+print(a)
+
+new_df = pd.concat([x,a],axis="columns")
+print(new_df)
+
+new_df =new_df.drop("town",axis ="columns")
+print(new_df)
+
+#training and testing data
+
+from sklearn.model_selection import train_test_split
+x_train,x_test,y_train,y_test = train_test_split(new_df,y,test_size=0.3,random_state=5)
+
+
+print(len(x_train))
+print(len(x_test))
+print(len(y_train))
+print(len(y_test))
+
+
+print(x_train)
+print(x_test)
+print(y_train)
+print(y_test)
+
+
+from sklearn.linear_model import LinearRegression
+lbl =LinearRegression()
+lbl.fit(x_train,y_train)
+
+
+lbl.predict(x_test)
+y_test
+lbl.score(x_test,y_test)
+
+lbl.score(x_train,y_train)
+
+
+
+
+#the same split is reused to compare Lasso and Ridge at two alpha values each
+from sklearn.linear_model import Lasso,Ridge
+las1 = Lasso()
+
+las1.fit(x_train,y_train)
+
+las1.predict(x_test)
+las1.score(x_test,y_test)
+
+
+
+las1 = Lasso(alpha =100)
+
+las1.fit(x_train,y_train)
+
+las1.predict(x_test)
+las1.score(x_test,y_test)
+
+
+rid1 = Ridge()
+
+rid1.fit(x_train,y_train)
+
+rid1.predict(x_test)
+rid1.score(x_test,y_test)
+
+
+
+rid2 = Ridge(alpha =10)
+
+rid2.fit(x_train,y_train)
+
+rid2.predict(x_test)
+rid2.score(x_test,y_test)
+
+
+
diff --git a/linear_regression_final.py b/linear_regression_final.py
new file mode 100644
index 0000000..ec633d0
--- /dev/null
+++ b/linear_regression_final.py
@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Mar 22 19:52:36 2022
+Car selling_price regression (Linear, Lasso, Ridge) with one-hot encoded categorical columns.
+@author: user
+"""
+
+import numpy as np
+import pandas as pd
+
+
+df =pd.read_csv(r"C:\Users\user\Desktop\DS\DATA_SET\Regression Data Set\CAR DETAILS FROM CAR DEKHO.csv")
+print(df)
+df.info()
+
+df = df.reindex(columns = ["name","year","km_driven","fuel","seller_type","transmission","owner","selling_price"])
+
+x = df.iloc[:,:-1]
+print(x)
+x.info()
+x.corr(numeric_only=True) #x still holds name, fuel, ... (presumably text), which corr() cannot use
+y =df.iloc[:,-1]
+print(y)
+
+a = pd.get_dummies(x.fuel)
+print(a)
+
+b =pd.get_dummies(x.seller_type)
+print(b)
+
+c =pd.get_dummies(x.transmission)
+print(c)
+
+d =pd.get_dummies(x.owner)
+print(d)
+
+
+new_df = pd.concat([x,a,b,c,d],axis="columns")
+print(new_df)
+
+new_df =new_df.drop(["fuel","seller_type","transmission","owner","name"],axis ="columns")
+print(new_df)
+new_df.corr()
+#training and testing data
+
+from sklearn.model_selection import train_test_split
+x_train,x_test,y_train,y_test = train_test_split(new_df,y,test_size=0.3,random_state=500)
+
+
+print(len(x_train))
+print(len(x_test))
+print(len(y_train))
+print(len(y_test))
+
+
+print(x_train)
+print(x_test)
+print(y_train)
+print(y_test)
+
+
+from sklearn.linear_model import LinearRegression
+lbl =LinearRegression()
+lbl.fit(x_train,y_train)
+lbl.predict(x_test)
+lbl.score(x_test,y_test)
+
+from sklearn.linear_model import Lasso,Ridge
+las1 = Lasso()
+
+las1.fit(x_train,y_train)
+
+las1.predict(x_test)
+las1.score(x_test,y_test)
+
+
+
+las1 = Lasso(alpha =100)
+
+las1.fit(x_train,y_train)
+
+las1.predict(x_test)
+las1.score(x_test,y_test)
+
+
+rid1 = Ridge()
+
+rid1.fit(x_train,y_train)
+
+rid1.predict(x_test)
+rid1.score(x_test,y_test)
+
+
+
+rid2 = Ridge(alpha =50)
+
+rid2.fit(x_train,y_train)
+
+rid2.predict(x_test)
+rid2.score(x_test,y_test)
diff --git a/multi linear regression.py b/multi linear regression.py
new file mode 100644
index 0000000..c915aa7
--- /dev/null
+++ b/multi linear regression.py
@@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Feb 24 19:04:12 2022
+Multiple linear regression: predict price from area, bedroom and age_house.
+@author: user
+"""
+import pandas as pd
+import numpy as np
+# x ={
+# "area":[2600,3000,3200,3600,4000],
+# "bedroom":[3,4,np.nan,3,5],
+# "age_house":[20,15,np.nan,30,8],
+# "price":["55000000","56500000","61000000","59500000","76000000"]
+# }
+# print(x)
+
+# df =pd.DataFrame(x)
+# print(df)
+# df.to_csv(r"C:\Users\user\Desktop\DS\DATA_SET\multi_house_para.csv",index =False)
+
+df =pd.read_csv(r"C:\Users\user\Documents\Taral\DataScience\Regression\multi_house_para.csv")
+print(df)
+
+#fill the missing bedroom / age_house values with the column mean
+df["bedroom"] =df['bedroom'].fillna(df.bedroom.mean())
+print(df)
+df["age_house"] = df.age_house.fillna(df.age_house.mean())
+print(df)
+
+#astype(int) truncates the filled-in means to whole numbers
+df.bedroom = df.bedroom.astype(int)
+df.age_house = df.age_house.astype(int)
+print(df)
+
+x =df.iloc[:,:-1]
+y =df.iloc[:,-1]
+print(x)
+print(y)
+from sklearn.linear_model import LinearRegression
+reg = LinearRegression()
+reg.fit(x,y)
+reg.predict([[2600,3,20]])#area, bedroom , age_house
+reg.predict([[2600,3,10]])
+reg.predict([[2600,3,1]])
+reg.predict([[600,1,30]])
+
+print(reg.score(x,y))
+
+reg.fit(df[["area","bedroom","age_house"]],df.price)
+reg.predict([[2600,3,20]])#area, bedroom , age_house
+reg.predict([[2600,3,10]])
+reg.predict([[2600,3,1]])
+
+reg.predict([[3600,3,30]])
+
+
+
+
+
+
+
+
+
diff --git a/multi_house_para.csv b/multi_house_para.csv
new file mode 100644
index 0000000..8932140
--- /dev/null
+++ b/multi_house_para.csv
@@ -0,0 +1,6 @@
+area,bedroom,age_house,price
+2600,3,20,55000000
+3000,4,15,56500000
+3200,,,61000000
+3600,3,30,59500000
+4000,5,8,76000000