From 7f6d31ef27d68f4b33803cc29e7619d3f96e441a Mon Sep 17 00:00:00 2001
From: Daniel <dcolinmorgan@gmail.com>
Date: Wed, 12 Jun 2024 12:41:02 -0600
Subject: [PATCH 1/6] initial fix, need more robust synth data

---
 graphistry/feature_utils.py         | 16 ++++++++++++++++
 graphistry/tests/test_umap_utils.py | 13 +++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py
index 26214f3a69..38a75a284d 100644
--- a/graphistry/feature_utils.py
+++ b/graphistry/feature_utils.py
@@ -912,6 +912,22 @@ def process_dirty_dataframes(
             nndf[object_columns] = nndf[object_columns].astype(str)
             X_enc = data_encoder.fit_transform(nndf, y)
             logger.info("obj columns: %s are being converted to str", object_columns)
+        except AssertionError:
+            nndf = ndf.copy()
+            object_columns = pd.DataFrame(nndf).select_dtypes(include=['object']).columns
+            for j in object_columns:
+                num_floats = sum(isinstance(x, float) for x in nndf[j].dropna())
+                if num_floats > len(nndf[j]) / 2:
+                    print(nndf[j].dropna())
+                    try:
+                        nndf[j] = [float(value) if not isinstance(value, float) else value for value in nndf[j]]
+                        logger.info("Coerced strings to floats")
+                    except:
+                        nndf[j] = nndf[j].apply(lambda x: str(x).split() if isinstance(x, str) and ' ' in x else x)
+                        nndf = nndf.explode(j)
+                        nndf[j] = nndf[j].astype(float)
+                        logger.info("Exploded rows with multiple values in single cell")
+            X_enc = data_encoder.fit_transform(nndf, y)
         X_enc = make_array(X_enc)
 
         import warnings
diff --git a/graphistry/tests/test_umap_utils.py b/graphistry/tests/test_umap_utils.py
index 3362e3405f..addf61f27c 100644
--- a/graphistry/tests/test_umap_utils.py
+++ b/graphistry/tests/test_umap_utils.py
@@ -400,6 +400,19 @@ def test_umap_edgecase(self):
 
         graphistry.nodes(df).umap()
         assert True
+        
+    @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
+    def test_type_edgecase(self):
+        values = pd.Series(np.random.rand(50))
+        num_to_convert = int(len(values) * 0.05)
+        indices_to_convert = np.random.choice(len(values), num_to_convert, replace=False)
+        for i in indices_to_convert:
+            values[i] = str(values[i])
+        values.loc[13] = '92.026 123.903'
+        values.loc[33] = '26.092 903.123'
+
+        graphistry.nodes(values).umap()
+        assert True
 
     @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
     def test_node_umap(self):

From 66a7bf2cb5691e3dbe001a5eb6a89bea8b8759de Mon Sep 17 00:00:00 2001
From: Daniel <dcolinmorgan@gmail.com>
Date: Wed, 12 Jun 2024 13:47:54 -0600
Subject: [PATCH 2/6] explode adds node names -- need smart fix

---
 graphistry/feature_utils.py            | 54 ++++++++++++++++++--------
 graphistry/tests/test_feature_utils.py | 12 ++++++
 graphistry/tests/test_umap_utils.py    |  2 +-
 3 files changed, 51 insertions(+), 17 deletions(-)

diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py
index 38a75a284d..a843e300b5 100644
--- a/graphistry/feature_utils.py
+++ b/graphistry/feature_utils.py
@@ -393,6 +393,27 @@ def convert_money_string_to_float(money: str):
     mask = where_is_currency_column(df, col)
     df[col, mask] = df[col, mask].apply(convert_money_string_to_float)
 
+def try_coerce_to_numeric(ndf: pd.DataFrame):
+    try:
+        nndf = ndf.copy()
+        object_columns = nndf.select_dtypes(include=['object']).columns
+        for j in object_columns:
+            num_floats = sum(isinstance(x, float) for x in nndf[j].dropna())
+            if num_floats > len(nndf[j]) / 2:  # most of column is float
+                try:
+                    nndf[j] = [float(value) if not isinstance(value, float) else value for value in nndf[j]]
+                    exploded = False
+                    logger.info("Coerced strings to floats")
+                except:
+                    nndf[j] = nndf[j].apply(lambda x: str(x).split() if isinstance(x, str) and ' ' in x else x)
+                    nndf = nndf.explode(j)
+                    nndf[j] = nndf[j].astype(float)
+                    nndf.reset_index(drop=True, inplace=True)
+                    exploded = True
+                    logger.info("Exploded rows with multiple values in single cell")
+    except:
+        pass
+    return nndf, exploded
 
 def is_dataframe_all_numeric(df: pd.DataFrame) -> bool:
     is_all_numeric = True
@@ -890,6 +911,7 @@ def process_dirty_dataframes(
     from sklearn.preprocessing import FunctionTransformer
     t = time()
 
+    ndf, explode = try_coerce_to_numeric(ndf)
     all_numeric = is_dataframe_all_numeric(ndf)
     if not all_numeric and has_dirty_cat:
         data_encoder = SuperVectorizer(
@@ -912,22 +934,22 @@ def process_dirty_dataframes(
             nndf[object_columns] = nndf[object_columns].astype(str)
             X_enc = data_encoder.fit_transform(nndf, y)
             logger.info("obj columns: %s are being converted to str", object_columns)
-        except AssertionError:
-            nndf = ndf.copy()
-            object_columns = pd.DataFrame(nndf).select_dtypes(include=['object']).columns
-            for j in object_columns:
-                num_floats = sum(isinstance(x, float) for x in nndf[j].dropna())
-                if num_floats > len(nndf[j]) / 2:
-                    print(nndf[j].dropna())
-                    try:
-                        nndf[j] = [float(value) if not isinstance(value, float) else value for value in nndf[j]]
-                        logger.info("Coerced strings to floats")
-                    except:
-                        nndf[j] = nndf[j].apply(lambda x: str(x).split() if isinstance(x, str) and ' ' in x else x)
-                        nndf = nndf.explode(j)
-                        nndf[j] = nndf[j].astype(float)
-                        logger.info("Exploded rows with multiple values in single cell")
-            X_enc = data_encoder.fit_transform(nndf, y)
+        # except AssertionError:  # is actually all_numeric
+            # nndf = pd.DataFrame(ndf.copy())
+            # object_columns = nndf.select_dtypes(include=['object']).columns
+            # for j in object_columns:
+            #     num_floats = sum(isinstance(x, float) for x in nndf[j].dropna())
+            #     if num_floats > len(nndf[j]) / 2:  # most of column is float
+            #         try:
+            #             nndf[j] = [float(value) if not isinstance(value, float) else value for value in nndf[j]]
+            #             logger.info("Coerced strings to floats")
+            #             X_enc = data_encoder.fit_transform(nndf, y)
+            #         except:
+            #             nndf[j] = nndf[j].apply(lambda x: str(x).split() if isinstance(x, str) and ' ' in x else x)
+            #             nndf = nndf.explode(j)
+            #             nndf[j] = nndf[j].astype(float)
+            #             logger.info("Exploded rows with multiple values in single cell")
+                        # X_enc, _, data_encoder, _ = get_numeric_transformers(nndf, None)
         X_enc = make_array(X_enc)
 
         import warnings
diff --git a/graphistry/tests/test_feature_utils.py b/graphistry/tests/test_feature_utils.py
index fa4333737a..79661a17d6 100644
--- a/graphistry/tests/test_feature_utils.py
+++ b/graphistry/tests/test_feature_utils.py
@@ -438,6 +438,18 @@ def test_edge_scaling(self):
                                   return_scalers=True)
 
 
+    @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies")
+    def test_type_edgecase(self):
+        values = pd.Series(np.random.rand(50))
+        num_to_convert = int(len(values) * 0.05)
+        indices_to_convert = np.random.choice(len(values), num_to_convert, replace=False)
+        for i in indices_to_convert:
+            values[i] = str(values[i])
+        values.loc[13] = '92.026 123.903 702.124'
+        values.loc[33] = '26.092 903.123'
+
+        graphistry.nodes(values).featurize()
+        assert True
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/graphistry/tests/test_umap_utils.py b/graphistry/tests/test_umap_utils.py
index addf61f27c..93dd5656a4 100644
--- a/graphistry/tests/test_umap_utils.py
+++ b/graphistry/tests/test_umap_utils.py
@@ -408,7 +408,7 @@ def test_type_edgecase(self):
         indices_to_convert = np.random.choice(len(values), num_to_convert, replace=False)
         for i in indices_to_convert:
             values[i] = str(values[i])
-        values.loc[13] = '92.026 123.903'
+        values.loc[13] = '92.026 123.903 702.124'
         values.loc[33] = '26.092 903.123'
 
         graphistry.nodes(values).umap()

From 08b36b2c3f0028827b97e20b80bf1d1dfb2558f0 Mon Sep 17 00:00:00 2001
From: Daniel <dcolinmorgan@gmail.com>
Date: Wed, 12 Jun 2024 15:28:11 -0600
Subject: [PATCH 3/6] test_feat_type_edgecase passing

---
 graphistry/feature_utils.py            |  6 ++----
 graphistry/tests/test_feature_utils.py | 23 ++++++++++++++---------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py
index a843e300b5..e8b044e254 100644
--- a/graphistry/feature_utils.py
+++ b/graphistry/feature_utils.py
@@ -402,18 +402,16 @@ def try_coerce_to_numeric(ndf: pd.DataFrame):
             if num_floats > len(nndf[j]) / 2:  # most of column is float
                 try:
                     nndf[j] = [float(value) if not isinstance(value, float) else value for value in nndf[j]]
-                    exploded = False
                     logger.info("Coerced strings to floats")
                 except:
                     nndf[j] = nndf[j].apply(lambda x: str(x).split() if isinstance(x, str) and ' ' in x else x)
                     nndf = nndf.explode(j)
                     nndf[j] = nndf[j].astype(float)
                     nndf.reset_index(drop=True, inplace=True)
-                    exploded = True
                     logger.info("Exploded rows with multiple values in single cell")
     except:
         pass
-    return nndf, exploded
+    return nndf
 
 def is_dataframe_all_numeric(df: pd.DataFrame) -> bool:
     is_all_numeric = True
@@ -911,7 +909,7 @@ def process_dirty_dataframes(
     from sklearn.preprocessing import FunctionTransformer
     t = time()
 
-    ndf, explode = try_coerce_to_numeric(ndf)
+    ndf = try_coerce_to_numeric(ndf)
     all_numeric = is_dataframe_all_numeric(ndf)
     if not all_numeric and has_dirty_cat:
         data_encoder = SuperVectorizer(
diff --git a/graphistry/tests/test_feature_utils.py b/graphistry/tests/test_feature_utils.py
index 79661a17d6..3e833c7dd7 100644
--- a/graphistry/tests/test_feature_utils.py
+++ b/graphistry/tests/test_feature_utils.py
@@ -440,15 +440,20 @@ def test_edge_scaling(self):
 
     @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies")
     def test_type_edgecase(self):
-        values = pd.Series(np.random.rand(50))
-        num_to_convert = int(len(values) * 0.05)
-        indices_to_convert = np.random.choice(len(values), num_to_convert, replace=False)
-        for i in indices_to_convert:
-            values[i] = str(values[i])
-        values.loc[13] = '92.026 123.903 702.124'
-        values.loc[33] = '26.092 903.123'
-
-        graphistry.nodes(values).featurize()
+        df = pd.DataFrame({
+            'A': np.random.rand(50),
+            'B': np.random.rand(50)
+        }, dtype=object)  # object dtype: string cells can be mixed in without a float->object upcast
+        num_to_convert = int(len(df) * 0.1)  # ~10% of each column becomes numeric strings
+        indices_to_convert = np.random.choice(len(df), num_to_convert, replace=False)
+        indices_to_convertB = np.random.choice(len(df), num_to_convert, replace=False)
+        for i, j in zip(indices_to_convert, indices_to_convertB):
+            df.loc[i, 'A'] = str(df.loc[i, 'A'])  # .loc writes into df itself; df.A[i] = ... is a chained assignment
+            df.loc[j, 'B'] = str(df.loc[j, 'B'])
+        df.loc[13, 'A'] = '92.026 123.903 702.124'  # several floats packed into one cell
+        df.loc[33, 'B'] = '26.092 903.123'
+
+        graphistry.nodes(df).featurize()
         assert True
 
 if __name__ == "__main__":

From 99a1982202ebbbcc8a32e66a9d6cc5d5f2de0c6b Mon Sep 17 00:00:00 2001
From: Daniel <dcolinmorgan@gmail.com>
Date: Wed, 12 Jun 2024 16:25:39 -0600
Subject: [PATCH 4/6] no explode, take first element in tuple as float

---
 graphistry/feature_utils.py | 25 ++++++-------------------
 graphistry/umap_utils.py    |  2 ++
 2 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py
index e8b044e254..79084dadcb 100644
--- a/graphistry/feature_utils.py
+++ b/graphistry/feature_utils.py
@@ -404,11 +404,14 @@ def try_coerce_to_numeric(ndf: pd.DataFrame):
                     nndf[j] = [float(value) if not isinstance(value, float) else value for value in nndf[j]]
                     logger.info("Coerced strings to floats")
                 except:
-                    nndf[j] = nndf[j].apply(lambda x: str(x).split() if isinstance(x, str) and ' ' in x else x)
-                    nndf = nndf.explode(j)
+                    # nndf[j] = nndf[j].apply(lambda x: str(x).split() if isinstance(x, str) and ' ' in x else x)
+                    # nndf = nndf.explode(j)
+                    # logger.info("Exploded rows with multiple values in single cell")
+                    nndf[j] = nndf[j].apply(lambda x: str(x).split()[0] if isinstance(x, str) and ' ' in x else x)
                     nndf[j] = nndf[j].astype(float)
                     nndf.reset_index(drop=True, inplace=True)
-                    logger.info("Exploded rows with multiple values in single cell")
+                    logger.info("took first float of tuple in single cell")
+
     except:
         pass
     return nndf
@@ -932,22 +935,6 @@ def process_dirty_dataframes(
             nndf[object_columns] = nndf[object_columns].astype(str)
             X_enc = data_encoder.fit_transform(nndf, y)
             logger.info("obj columns: %s are being converted to str", object_columns)
-        # except AssertionError:  # is actually all_numeric
-            # nndf = pd.DataFrame(ndf.copy())
-            # object_columns = nndf.select_dtypes(include=['object']).columns
-            # for j in object_columns:
-            #     num_floats = sum(isinstance(x, float) for x in nndf[j].dropna())
-            #     if num_floats > len(nndf[j]) / 2:  # most of column is float
-            #         try:
-            #             nndf[j] = [float(value) if not isinstance(value, float) else value for value in nndf[j]]
-            #             logger.info("Coerced strings to floats")
-            #             X_enc = data_encoder.fit_transform(nndf, y)
-            #         except:
-            #             nndf[j] = nndf[j].apply(lambda x: str(x).split() if isinstance(x, str) and ' ' in x else x)
-            #             nndf = nndf.explode(j)
-            #             nndf[j] = nndf[j].astype(float)
-            #             logger.info("Exploded rows with multiple values in single cell")
-                        # X_enc, _, data_encoder, _ = get_numeric_transformers(nndf, None)
         X_enc = make_array(X_enc)
 
         import warnings
diff --git a/graphistry/umap_utils.py b/graphistry/umap_utils.py
index d2561739df..02ce4a2953 100644
--- a/graphistry/umap_utils.py
+++ b/graphistry/umap_utils.py
@@ -721,6 +721,8 @@ def _bind_xy_from_umap(
             emb = res._edge_embedding
             
         if isinstance(df, type(emb)):
+            print(df)
+            print(emb)
             df[x_name] = emb.values.T[0]
             df[y_name] = emb.values.T[1]
         elif isinstance(df, pd.DataFrame) and 'cudf.core.dataframe' in str(getmodule(emb)):

From 05d0c8e455442daa723551fffe27903e37082ce0 Mon Sep 17 00:00:00 2001
From: Daniel <dcolinmorgan@gmail.com>
Date: Wed, 12 Jun 2024 16:29:37 -0600
Subject: [PATCH 5/6] revert print in umap_utils

---
 graphistry/umap_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/graphistry/umap_utils.py b/graphistry/umap_utils.py
index 02ce4a2953..d2561739df 100644
--- a/graphistry/umap_utils.py
+++ b/graphistry/umap_utils.py
@@ -721,8 +721,6 @@ def _bind_xy_from_umap(
             emb = res._edge_embedding
             
         if isinstance(df, type(emb)):
-            print(df)
-            print(emb)
             df[x_name] = emb.values.T[0]
             df[y_name] = emb.values.T[1]
         elif isinstance(df, pd.DataFrame) and 'cudf.core.dataframe' in str(getmodule(emb)):

From b21a0f0be7a8db88b079cff636075017755b30d7 Mon Sep 17 00:00:00 2001
From: Daniel <dcolinmorgan@gmail.com>
Date: Wed, 12 Jun 2024 16:31:25 -0600
Subject: [PATCH 6/6] lint

---
 graphistry/tests/test_feature_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/graphistry/tests/test_feature_utils.py b/graphistry/tests/test_feature_utils.py
index 3e833c7dd7..7072a16f77 100644
--- a/graphistry/tests/test_feature_utils.py
+++ b/graphistry/tests/test_feature_utils.py
@@ -456,5 +456,6 @@ def test_type_edgecase(self):
         graphistry.nodes(df).featurize()
         assert True
 
+
 if __name__ == "__main__":
     unittest.main()