The Chapter 3 environment now builds and works for all the cases tested.

AndyMc629 · AndyMc629 · commit c536d89bef0b · 2025-02-07T20:20:06.000Z
diff --git a/Chapter03/pipelines/sklearn_pipeline.py b/Chapter03/pipelines/sklearn_pipeline.py
@@ -3,6 +3,8 @@
 from sklearn.impute import SimpleImputer
 from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+import pandas as pd 
 
 numeric_features = ['age', 'balance']
 numeric_transformer = Pipeline(steps=[
@@ -22,5 +24,10 @@
 clf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                         ('classifier', LogisticRegression())])
 
+df = pd.read_csv('../../Chapter01/classifying/bank_data/bank.csv', delimiter=';', decimal=',')
+X, y = df.drop('y', axis=1), df['y'].apply(lambda x: 1 if x == 'yes' else 0)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
+# You need to get the bank data (bank.csv, loaded above) before the pipeline can be fitted.
 clf_pipeline.fit(X_train, y_train)
 
+print(clf_pipeline.predict(X_test))
diff --git a/Chapter03/pipelines/sparkmllib_pipeline.py b/Chapter03/pipelines/sparkmllib_pipeline.py
@@ -68,3 +68,5 @@
     # Define the entire pipeline and fit on the train data and transform on the test data
     clfPipeline = Pipeline().setStages(stages).fit(trainingData)
     clfPipeline.transform(testData)
+    
+    print(clfPipeline.transform(testData).show())