From 7f6d31ef27d68f4b33803cc29e7619d3f96e441a Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 12 Jun 2024 12:41:02 -0600 Subject: [PATCH 1/6] initial fix, need more robust synth data --- graphistry/feature_utils.py | 16 ++++++++++++++++ graphistry/tests/test_umap_utils.py | 13 +++++++++++++ 2 files changed, 29 insertions(+) diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py index 26214f3a69..38a75a284d 100644 --- a/graphistry/feature_utils.py +++ b/graphistry/feature_utils.py @@ -912,6 +912,22 @@ def process_dirty_dataframes( nndf[object_columns] = nndf[object_columns].astype(str) X_enc = data_encoder.fit_transform(nndf, y) logger.info("obj columns: %s are being converted to str", object_columns) + except AssertionError: + nndf = ndf.copy() + object_columns = pd.DataFrame(nndf).select_dtypes(include=['object']).columns + for j in object_columns: + num_floats = sum(isinstance(x, float) for x in nndf[j].dropna()) + if num_floats > len(nndf[j]) / 2: + print(nndf[j].dropna()) + try: + nndf[j] = [float(value) if not isinstance(value, float) else value for value in nndf[j]] + logger.info("Coerced strings to floats") + except: + nndf[j] = nndf[j].apply(lambda x: str(x).split() if isinstance(x, str) and ' ' in x else x) + nndf = nndf.explode(j) + nndf[j] = nndf[j].astype(float) + logger.info("Exploded rows with multiple values in single cell") + X_enc = data_encoder.fit_transform(nndf, y) X_enc = make_array(X_enc) import warnings diff --git a/graphistry/tests/test_umap_utils.py b/graphistry/tests/test_umap_utils.py index 3362e3405f..addf61f27c 100644 --- a/graphistry/tests/test_umap_utils.py +++ b/graphistry/tests/test_umap_utils.py @@ -400,6 +400,19 @@ def test_umap_edgecase(self): graphistry.nodes(df).umap() assert True + + @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies") + def test_type_edgecase(self): + values = pd.Series(np.random.rand(50)) + num_to_convert = int(len(values) * 0.05) + indices_to_convert = np.random.choice(len(values), num_to_convert, replace=False) + for i in indices_to_convert: + values[i] = str(values[i]) + values.loc[13] = '92.026 123.903' + values.loc[33] = '26.092 903.123' + + graphistry.nodes(values).umap() + assert True @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies") def test_node_umap(self): From 66a7bf2cb5691e3dbe001a5eb6a89bea8b8759de Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 12 Jun 2024 13:47:54 -0600 Subject: [PATCH 2/6] explode adds node names -- need smart fix --- graphistry/feature_utils.py | 54 ++++++++++++++++++-------- graphistry/tests/test_feature_utils.py | 12 ++++++ graphistry/tests/test_umap_utils.py | 2 +- 3 files changed, 51 insertions(+), 17 deletions(-) diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py index 38a75a284d..a843e300b5 100644 --- a/graphistry/feature_utils.py +++ b/graphistry/feature_utils.py @@ -393,6 +393,27 @@ def convert_money_string_to_float(money: str): mask = where_is_currency_column(df, col) df[col, mask] = df[col, mask].apply(convert_money_string_to_float) +def try_coerce_to_numeric(ndf: pd.DataFrame): + try: + nndf = ndf.copy() + object_columns = nndf.select_dtypes(include=['object']).columns + for j in object_columns: + num_floats = sum(isinstance(x, float) for x in nndf[j].dropna()) + if num_floats > len(nndf[j]) / 2: # most of column is float + try: + nndf[j] = [float(value) if not isinstance(value, float) else value for value in nndf[j]] + exploded = False + logger.info("Coerced strings to floats") + except: + nndf[j] = nndf[j].apply(lambda x: str(x).split() if isinstance(x, str) and ' ' in x else x) + nndf = nndf.explode(j) + nndf[j] = nndf[j].astype(float) + nndf.reset_index(drop=True, inplace=True) + exploded = True + logger.info("Exploded rows with multiple values in single cell") + except: + pass + return nndf, exploded def is_dataframe_all_numeric(df: pd.DataFrame) -> bool: is_all_numeric = True @@ -890,6 +911,7 @@ def process_dirty_dataframes( from sklearn.preprocessing import FunctionTransformer t = time() + ndf, explode = try_coerce_to_numeric(ndf) all_numeric = is_dataframe_all_numeric(ndf) if not all_numeric and has_dirty_cat: data_encoder = SuperVectorizer( @@ -912,22 +934,22 @@ def process_dirty_dataframes( nndf[object_columns] = nndf[object_columns].astype(str) X_enc = data_encoder.fit_transform(nndf, y) logger.info("obj columns: %s are being converted to str", object_columns) - except AssertionError: - nndf = ndf.copy() - object_columns = pd.DataFrame(nndf).select_dtypes(include=['object']).columns - for j in object_columns: - num_floats = sum(isinstance(x, float) for x in nndf[j].dropna()) - if num_floats > len(nndf[j]) / 2: - print(nndf[j].dropna()) - try: - nndf[j] = [float(value) if not isinstance(value, float) else value for value in nndf[j]] - logger.info("Coerced strings to floats") - except: - nndf[j] = nndf[j].apply(lambda x: str(x).split() if isinstance(x, str) and ' ' in x else x) - nndf = nndf.explode(j) - nndf[j] = nndf[j].astype(float) - logger.info("Exploded rows with multiple values in single cell") - X_enc = data_encoder.fit_transform(nndf, y) + # except AssertionError: # is actually all_numeric + # nndf = pd.DataFrame(ndf.copy()) + # object_columns = nndf.select_dtypes(include=['object']).columns + # for j in object_columns: + # num_floats = sum(isinstance(x, float) for x in nndf[j].dropna()) + # if num_floats > len(nndf[j]) / 2: # most of column is float + # try: + # nndf[j] = [float(value) if not isinstance(value, float) else value for value in nndf[j]] + # logger.info("Coerced strings to floats") + # X_enc = data_encoder.fit_transform(nndf, y) + # except: + # nndf[j] = nndf[j].apply(lambda x: str(x).split() if isinstance(x, str) and ' ' in x else x) + # nndf = nndf.explode(j) + # nndf[j] = nndf[j].astype(float) + # logger.info("Exploded rows with multiple values in single cell") + # X_enc, _, data_encoder, _ = get_numeric_transformers(nndf, None) X_enc = make_array(X_enc) import warnings diff --git a/graphistry/tests/test_feature_utils.py b/graphistry/tests/test_feature_utils.py index fa4333737a..79661a17d6 100644 --- a/graphistry/tests/test_feature_utils.py +++ b/graphistry/tests/test_feature_utils.py @@ -438,6 +438,18 @@ def test_edge_scaling(self): return_scalers=True) + @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies") + def test_type_edgecase(self): + values = pd.Series(np.random.rand(50)) + num_to_convert = int(len(values) * 0.05) + indices_to_convert = np.random.choice(len(values), num_to_convert, replace=False) + for i in indices_to_convert: + values[i] = str(values[i]) + values.loc[13] = '92.026 123.903 702.124' + values.loc[33] = '26.092 903.123' + + graphistry.nodes(values).featurize() + assert True if __name__ == "__main__": unittest.main() diff --git a/graphistry/tests/test_umap_utils.py b/graphistry/tests/test_umap_utils.py index addf61f27c..93dd5656a4 100644 --- a/graphistry/tests/test_umap_utils.py +++ b/graphistry/tests/test_umap_utils.py @@ -408,7 +408,7 @@ def test_type_edgecase(self): indices_to_convert = np.random.choice(len(values), num_to_convert, replace=False) for i in indices_to_convert: values[i] = str(values[i]) - values.loc[13] = '92.026 123.903' + values.loc[13] = '92.026 123.903 702.124' values.loc[33] = '26.092 903.123' graphistry.nodes(values).umap() From 08b36b2c3f0028827b97e20b80bf1d1dfb2558f0 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 12 Jun 2024 15:28:11 -0600 Subject: [PATCH 3/6] test_feat_type_edgecase passing --- graphistry/feature_utils.py | 6 ++---- graphistry/tests/test_feature_utils.py | 23 ++++++++++++++--------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py index a843e300b5..e8b044e254 100644 --- a/graphistry/feature_utils.py +++ b/graphistry/feature_utils.py @@ -402,18 +402,16 @@ def try_coerce_to_numeric(ndf: pd.DataFrame): if num_floats > len(nndf[j]) / 2: # most of column is float try: nndf[j] = [float(value) if not isinstance(value, float) else value for value in nndf[j]] - exploded = False logger.info("Coerced strings to floats") except: nndf[j] = nndf[j].apply(lambda x: str(x).split() if isinstance(x, str) and ' ' in x else x) nndf = nndf.explode(j) nndf[j] = nndf[j].astype(float) nndf.reset_index(drop=True, inplace=True) - exploded = True logger.info("Exploded rows with multiple values in single cell") except: pass - return nndf, exploded + return nndf def is_dataframe_all_numeric(df: pd.DataFrame) -> bool: is_all_numeric = True @@ -911,7 +909,7 @@ def process_dirty_dataframes( from sklearn.preprocessing import FunctionTransformer t = time() - ndf, explode = try_coerce_to_numeric(ndf) + ndf = try_coerce_to_numeric(ndf) all_numeric = is_dataframe_all_numeric(ndf) if not all_numeric and has_dirty_cat: data_encoder = SuperVectorizer( diff --git a/graphistry/tests/test_feature_utils.py b/graphistry/tests/test_feature_utils.py index 79661a17d6..3e833c7dd7 100644 --- a/graphistry/tests/test_feature_utils.py +++ b/graphistry/tests/test_feature_utils.py @@ -440,15 +440,20 @@ def test_edge_scaling(self): @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies") def test_type_edgecase(self): - values = pd.Series(np.random.rand(50)) - num_to_convert = int(len(values) * 0.05) - indices_to_convert = np.random.choice(len(values), num_to_convert, replace=False) - for i in indices_to_convert: - values[i] = str(values[i]) - values.loc[13] = '92.026 123.903 702.124' - values.loc[33] = '26.092 903.123' - - graphistry.nodes(values).featurize() + df = pd.DataFrame({ + 'A': np.random.rand(50), + 'B': np.random.rand(50) + }) + num_to_convert = int(len(df.A.values) * 0.1) + indices_to_convert = np.random.choice(len(df.A.values), num_to_convert, replace=False) + indices_to_convertB = np.random.choice(len(df.A.values), num_to_convert, replace=False) + for i,j in zip(indices_to_convert, indices_to_convertB): + df.A[i] = str(df.A[i]) + df.B[j] = str(df.B[j]) + df.A.loc[13] = '92.026 123.903 702.124' + df.B.loc[33] = '26.092 903.123' + + graphistry.nodes(df).featurize() assert True if __name__ == "__main__": From 99a1982202ebbbcc8a32e66a9d6cc5d5f2de0c6b Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 12 Jun 2024 16:25:39 -0600 Subject: [PATCH 4/6] no explode, take first element in tuple as float --- graphistry/feature_utils.py | 25 ++++++------------------- graphistry/umap_utils.py | 2 ++ 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py index e8b044e254..79084dadcb 100644 --- a/graphistry/feature_utils.py +++ b/graphistry/feature_utils.py @@ -404,11 +404,14 @@ def try_coerce_to_numeric(ndf: pd.DataFrame): nndf[j] = [float(value) if not isinstance(value, float) else value for value in nndf[j]] logger.info("Coerced strings to floats") except: - nndf[j] = nndf[j].apply(lambda x: str(x).split() if isinstance(x, str) and ' ' in x else x) - nndf = nndf.explode(j) + # nndf[j] = nndf[j].apply(lambda x: str(x).split() if isinstance(x, str) and ' ' in x else x) + # nndf = nndf.explode(j) + # logger.info("Exploded rows with multiple values in single cell") + nndf[j] = nndf[j].apply(lambda x: str(x).split()[0] if isinstance(x, str) and ' ' in x else x) nndf[j] = nndf[j].astype(float) nndf.reset_index(drop=True, inplace=True) - logger.info("Exploded rows with multiple values in single cell") + logger.info("took first float of tuple in single cell") + except: pass return nndf @@ -932,22 +935,6 @@ def process_dirty_dataframes( nndf[object_columns] = nndf[object_columns].astype(str) X_enc = data_encoder.fit_transform(nndf, y) logger.info("obj columns: %s are being converted to str", object_columns) - # except AssertionError: # is actually all_numeric - # nndf = pd.DataFrame(ndf.copy()) - # object_columns = nndf.select_dtypes(include=['object']).columns - # for j in object_columns: - # num_floats = sum(isinstance(x, float) for x in nndf[j].dropna()) - # if num_floats > len(nndf[j]) / 2: # most of column is float - # try: - # nndf[j] = [float(value) if not isinstance(value, float) else value for value in nndf[j]] - # logger.info("Coerced strings to floats") - # X_enc = data_encoder.fit_transform(nndf, y) - # except: - # nndf[j] = nndf[j].apply(lambda x: str(x).split() if isinstance(x, str) and ' ' in x else x) - # nndf = nndf.explode(j) - # nndf[j] = nndf[j].astype(float) - # logger.info("Exploded rows with multiple values in single cell") - # X_enc, _, data_encoder, _ = get_numeric_transformers(nndf, None) X_enc = make_array(X_enc) import warnings diff --git a/graphistry/umap_utils.py b/graphistry/umap_utils.py index d2561739df..02ce4a2953 100644 --- a/graphistry/umap_utils.py +++ b/graphistry/umap_utils.py @@ -721,6 +721,8 @@ def _bind_xy_from_umap( emb = res._edge_embedding if isinstance(df, type(emb)): + print(df) + print(emb) df[x_name] = emb.values.T[0] df[y_name] = emb.values.T[1] elif isinstance(df, pd.DataFrame) and 'cudf.core.dataframe' in str(getmodule(emb)): From 05d0c8e455442daa723551fffe27903e37082ce0 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 12 Jun 2024 16:29:37 -0600 Subject: [PATCH 5/6] revert print in umap_utils --- graphistry/umap_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/graphistry/umap_utils.py b/graphistry/umap_utils.py index 02ce4a2953..d2561739df 100644 --- a/graphistry/umap_utils.py +++ b/graphistry/umap_utils.py @@ -721,8 +721,6 @@ def _bind_xy_from_umap( emb = res._edge_embedding if isinstance(df, type(emb)): - print(df) - print(emb) df[x_name] = emb.values.T[0] df[y_name] = emb.values.T[1] elif isinstance(df, pd.DataFrame) and 'cudf.core.dataframe' in str(getmodule(emb)): From b21a0f0be7a8db88b079cff636075017755b30d7 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 12 Jun 2024 16:31:25 -0600 Subject: [PATCH 6/6] lint --- graphistry/tests/test_feature_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/graphistry/tests/test_feature_utils.py b/graphistry/tests/test_feature_utils.py index 3e833c7dd7..7072a16f77 100644 --- a/graphistry/tests/test_feature_utils.py +++ b/graphistry/tests/test_feature_utils.py @@ -456,5 +456,6 @@ def test_type_edgecase(self): graphistry.nodes(df).featurize() assert True + if __name__ == "__main__": unittest.main()