Skip to content

Commit 24577a1

Browse files
authored
Merge pull request Reed-CompBio#191 from ntalluri/invalid_index_error
summarize_networks InvalidIndexError
2 parents c0d7fa5 + a09e8d9 commit 24577a1

29 files changed

Lines changed: 1103 additions & 63 deletions

CONTRIBUTING.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ Implement the `parse_output` function.
157157
The edges in the Local Neighborhood output have the same format as the input, `<vertex1>|<vertex2>`.
158158
Convert these to be tab-separated vertex pairs followed by a tab `1` and tab `U` at the end of every line, which indicates all edges have the same rank and are undirected.
159159
See the `add_rank_column` and `raw_pathway_df` functions in `spras.util.py` and the `reinsert_direction_col_undirected` function in `spras.interactome.py`.
160+
The `parse_output` function also ensures that there are no duplicate edges in the output pathway by calling the `duplicate_edges` function in `spras.util.py`.
160161
Make sure header = True with column names: ['Node1', 'Node2', 'Rank', 'Direction'] when the file is created.
161162
The output should have the format `<vertex1> <vertex2> 1 U`.
162163

spras/allpairs.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
reinsert_direction_col_undirected,
88
)
99
from spras.prm import PRM
10-
from spras.util import add_rank_column, raw_pathway_df
10+
from spras.util import add_rank_column, duplicate_edges, raw_pathway_df
1111

1212
__all__ = ['AllPairs']
1313

@@ -114,4 +114,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
114114
df = add_rank_column(df)
115115
df = reinsert_direction_col_undirected(df)
116116
df.columns = ['Node1', 'Node2', 'Rank', 'Direction']
117+
df, has_duplicates = duplicate_edges(df)
118+
if has_duplicates:
119+
print(f"Duplicate edges were removed from {raw_pathway_file}")
117120
df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')

spras/domino.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
reinsert_direction_col_undirected,
1010
)
1111
from spras.prm import PRM
12+
from spras.util import duplicate_edges
1213

1314
__all__ = ['DOMINO', 'pre_domino_id_transform', 'post_domino_id_transform']
1415

@@ -209,6 +210,10 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
209210
else:
210211
edges_df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction'])
211212

213+
edges_df, has_duplicates = duplicate_edges(edges_df)
214+
if has_duplicates:
215+
print(f"Duplicate edges were removed from {raw_pathway_file}")
216+
212217
edges_df.to_csv(standardized_pathway_file, sep='\t', header=True, index=False)
213218

214219

spras/meo.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
reinsert_direction_col_directed,
88
)
99
from spras.prm import PRM
10-
from spras.util import add_rank_column, raw_pathway_df
10+
from spras.util import add_rank_column, duplicate_edges, raw_pathway_df
1111

1212
__all__ = ['MEO', 'write_properties']
1313

@@ -214,4 +214,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
214214
df = reinsert_direction_col_directed(df)
215215
df.drop(columns=['Type', 'Oriented', 'Weight'], inplace=True)
216216
df.columns = ['Node1', 'Node2', 'Rank', "Direction"]
217+
df, has_duplicates = duplicate_edges(df)
218+
if has_duplicates:
219+
print(f"Duplicate edges were removed from {raw_pathway_file}")
217220
df.to_csv(standardized_pathway_file, index=False, sep='\t', header=True)

spras/mincostflow.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
reinsert_direction_col_undirected,
77
)
88
from spras.prm import PRM
9-
from spras.util import add_rank_column, raw_pathway_df
9+
from spras.util import add_rank_column, duplicate_edges, raw_pathway_df
1010

1111
__all__ = ['MinCostFlow']
1212

@@ -155,4 +155,8 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
155155
# Currently directed edges in the input will be converted to undirected edges in the output
156156
df = reinsert_direction_col_undirected(df)
157157
df.columns = ['Node1', 'Node2', 'Rank', "Direction"]
158+
df, has_duplicates = duplicate_edges(df)
159+
if has_duplicates:
160+
print(f"Duplicate edges were removed from {raw_pathway_file}")
161+
158162
df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')

spras/omicsintegrator1.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from spras.containers import prepare_volume, run_container
44
from spras.interactome import reinsert_direction_col_mixed
55
from spras.prm import PRM
6-
from spras.util import add_rank_column, raw_pathway_df
6+
from spras.util import add_rank_column, duplicate_edges, raw_pathway_df
77

88
__all__ = ['OmicsIntegrator1', 'write_conf']
99

@@ -227,5 +227,8 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
227227
df = reinsert_direction_col_mixed(df, "InteractionType", "pd", "pp")
228228
df.drop(columns=['InteractionType'], inplace=True)
229229
df.columns = ['Node1', 'Node2', 'Rank', 'Direction']
230+
df, has_duplicates = duplicate_edges(df)
231+
if has_duplicates:
232+
print(f"Duplicate edges were removed from {raw_pathway_file}")
230233

231234
df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')

spras/omicsintegrator2.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from spras.dataset import Dataset
77
from spras.interactome import reinsert_direction_col_undirected
88
from spras.prm import PRM
9-
from spras.util import add_rank_column
9+
from spras.util import add_rank_column, duplicate_edges
1010

1111
__all__ = ['OmicsIntegrator2']
1212

@@ -164,4 +164,8 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
164164
else: # corrupted data
165165
df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction'])
166166

167+
df, has_duplicates = duplicate_edges(df)
168+
if has_duplicates:
169+
print(f"Duplicate edges were removed from {raw_pathway_file}")
170+
167171
df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')

spras/pathlinker.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
reinsert_direction_col_directed,
88
)
99
from spras.prm import PRM
10-
from spras.util import raw_pathway_df
10+
from spras.util import duplicate_edges, raw_pathway_df
1111

1212
__all__ = ['PathLinker']
1313

@@ -141,4 +141,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
141141
df = df.take([0, 1, 2], axis=1)
142142
df = reinsert_direction_col_directed(df)
143143
df.columns = ['Node1', 'Node2', 'Rank', "Direction"]
144+
df, has_duplicates = duplicate_edges(df)
145+
if has_duplicates:
146+
print(f"Duplicate edges were removed from {raw_pathway_file}")
144147
df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')

spras/util.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,3 +76,32 @@ def raw_pathway_df(raw_pathway_file: str, sep: str = '\t', header: int = None) -
7676
df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction'])
7777

7878
return df
79+
80+
81+
def duplicate_edges(df: pd.DataFrame) -> tuple[pd.DataFrame, bool]:
    """
    Remove duplicate edges from a standardized pathway DataFrame.

    Run within every pathway reconstruction algorithm's parse_output.
    - Duplicates are identified by (Node1, Node2, Direction); among duplicates,
      the row with the smallest Rank is kept.
    - For undirected edges (Direction == "U"), the node pair is normalized so
      that Node1 is the lexicographically smaller node (e.g. "B-A" becomes
      "A-B") before deduplicating.

    @param df: a DataFrame from a raw pathway file with columns
        ['Node1', 'Node2', 'Rank', 'Direction']
    @return: a (DataFrame, bool) pair — the deduplicated DataFrame, and True
        if and only if one or more duplicate rows were dropped
    """
    # Sort by Rank first so that keep="first" below retains the smallest Rank,
    # then by the node columns to make the output order deterministic.
    df_sorted = df.sort_values(by=["Rank", "Node1", "Node2"], ascending=True, ignore_index=True)

    # For undirected edges, rewrite each node pair into sorted order so that
    # A-B and B-A compare equal during deduplication.
    undirected_mask = df_sorted["Direction"] == "U"

    # Row-wise min/max gives the sorted (lesser, greater) node pair for the
    # masked rows only.
    min_nodes = df_sorted.loc[undirected_mask, ["Node1", "Node2"]].min(axis=1)
    max_nodes = df_sorted.loc[undirected_mask, ["Node1", "Node2"]].max(axis=1)

    df_sorted.loc[undirected_mask, "Node1"] = min_nodes
    df_sorted.loc[undirected_mask, "Node2"] = max_nodes

    unique_edges_df = df_sorted.drop_duplicates(subset=["Node1", "Node2", "Direction"], keep="first", ignore_index=True)

    # Compare row counts rather than DataFrame equality: sorting and the
    # undirected node-pair normalization reorder/rewrite rows even when
    # nothing is dropped, so an equals()-based check would report phantom
    # duplicates. Rows only disappear when a duplicate was actually removed.
    return unique_edges_df, len(unique_edges_df) != len(df)
Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,20 @@
1-
,test-data-s1,test-data-s2,test-data-s3,test-data-longName,test-data-longName2,test-data-empty,test-data-spaces,test-data-mixed-direction
2-
A---B,1,1,0,0,0,0,0,0
3-
C---D,1,1,0,0,0,0,0,1
4-
E---F,1,1,0,0,0,0,0,1
5-
L---M,0,1,1,0,0,0,1,0
6-
M---N,0,0,1,0,0,0,0,0
7-
O---P,0,0,1,0,0,0,1,0
8-
P---Q,0,0,1,0,0,0,0,0
9-
node1---node2,0,0,0,1,0,0,0,0
10-
node1---node3,0,0,0,1,1,0,0,0
11-
node4---node5,0,0,0,1,1,0,0,0
12-
LONGERNAMES---TEST,0,0,0,1,1,0,0,0
13-
node2---node3,0,0,0,0,1,0,0,0
14-
nodes with---spaces in name,0,0,0,0,0,0,1,0
15-
A-->B,0,0,0,0,0,0,0,1
16-
B-->A,0,0,0,0,0,0,0,1
1+
,test-data-s1,test-data-s2,test-data-s3,test-data-longName,test-data-longName2,test-data-empty,test-data-spaces,test-data-mixed-direction,test-data-repeat-edges-directed
2+
A---B,1,1,0,0,0,0,0,0,0
3+
C---D,1,1,0,0,0,0,0,1,0
4+
E---F,1,1,0,0,0,0,0,1,0
5+
L---M,0,1,1,0,0,0,1,0,0
6+
M---N,0,0,1,0,0,0,0,0,0
7+
O---P,0,0,1,0,0,0,1,0,0
8+
P---Q,0,0,1,0,0,0,0,0,0
9+
node1---node2,0,0,0,1,0,0,0,0,0
10+
node1---node3,0,0,0,1,1,0,0,0,0
11+
node4---node5,0,0,0,1,1,0,0,0,0
12+
LONGERNAMES---TEST,0,0,0,1,1,0,0,0,0
13+
node2---node3,0,0,0,0,1,0,0,0,0
14+
nodes with---spaces in name,0,0,0,0,0,0,1,0,0
15+
A-->B,0,0,0,0,0,0,0,1,0
16+
B-->A,0,0,0,0,0,0,0,1,0
17+
L-->M,0,0,0,0,0,0,0,0,1
18+
M-->N,0,0,0,0,0,0,0,0,1
19+
O-->P,0,0,0,0,0,0,0,0,1
20+
P-->Q,0,0,0,0,0,0,0,0,1

0 commit comments

Comments
 (0)