Skip to content

Commit 24577a1

Browse files
authored
Merge pull request Reed-CompBio#191 from ntalluri/invalid_index_error
summarize_networks InvalidIndexError
2 parents c0d7fa5 + a09e8d9 commit 24577a1

29 files changed

Lines changed: 1103 additions & 63 deletions

CONTRIBUTING.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ Implement the `parse_output` function.
157157
The edges in the Local Neighborhood output have the same format as the input, `<vertex1>|<vertex2>`.
158158
Convert these to be tab-separated vertex pairs followed by a tab `1` and tab `U` at the end of every line, which indicates all edges have the same rank and are undirected.
159159
See the `add_rank_column` and `raw_pathway_df` functions in `spras.util.py` and the `reinsert_direction_col_undirected` function in `spras.interactome.py`.
160+
The `parse_output` function also ensures that there are no duplicate edges in the output pathway by calling the `duplicate_edges` function in `spras.util.py`.
160161
Make sure header = True with column names: ['Node1', 'Node2', 'Rank', 'Direction'] when the file is created.
161162
The output should have the format `<vertex1> <vertex2> 1 U`.
162163

spras/allpairs.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
reinsert_direction_col_undirected,
88
)
99
from spras.prm import PRM
10-
from spras.util import add_rank_column, raw_pathway_df
10+
from spras.util import add_rank_column, duplicate_edges, raw_pathway_df
1111

1212
__all__ = ['AllPairs']
1313

@@ -114,4 +114,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
114114
df = add_rank_column(df)
115115
df = reinsert_direction_col_undirected(df)
116116
df.columns = ['Node1', 'Node2', 'Rank', 'Direction']
117+
df, has_duplicates = duplicate_edges(df)
118+
if has_duplicates:
119+
print(f"Duplicate edges were removed from {raw_pathway_file}")
117120
df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')

spras/domino.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
reinsert_direction_col_undirected,
1010
)
1111
from spras.prm import PRM
12+
from spras.util import duplicate_edges
1213

1314
__all__ = ['DOMINO', 'pre_domino_id_transform', 'post_domino_id_transform']
1415

@@ -209,6 +210,10 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
209210
else:
210211
edges_df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction'])
211212

213+
edges_df, has_duplicates = duplicate_edges(edges_df)
214+
if has_duplicates:
215+
print(f"Duplicate edges were removed from {raw_pathway_file}")
216+
212217
edges_df.to_csv(standardized_pathway_file, sep='\t', header=True, index=False)
213218

214219

spras/meo.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
reinsert_direction_col_directed,
88
)
99
from spras.prm import PRM
10-
from spras.util import add_rank_column, raw_pathway_df
10+
from spras.util import add_rank_column, duplicate_edges, raw_pathway_df
1111

1212
__all__ = ['MEO', 'write_properties']
1313

@@ -214,4 +214,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
214214
df = reinsert_direction_col_directed(df)
215215
df.drop(columns=['Type', 'Oriented', 'Weight'], inplace=True)
216216
df.columns = ['Node1', 'Node2', 'Rank', "Direction"]
217+
df, has_duplicates = duplicate_edges(df)
218+
if has_duplicates:
219+
print(f"Duplicate edges were removed from {raw_pathway_file}")
217220
df.to_csv(standardized_pathway_file, index=False, sep='\t', header=True)

spras/mincostflow.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
reinsert_direction_col_undirected,
77
)
88
from spras.prm import PRM
9-
from spras.util import add_rank_column, raw_pathway_df
9+
from spras.util import add_rank_column, duplicate_edges, raw_pathway_df
1010

1111
__all__ = ['MinCostFlow']
1212

@@ -155,4 +155,8 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
155155
# Currently directed edges in the input will be converted to undirected edges in the output
156156
df = reinsert_direction_col_undirected(df)
157157
df.columns = ['Node1', 'Node2', 'Rank', "Direction"]
158+
df, has_duplicates = duplicate_edges(df)
159+
if has_duplicates:
160+
print(f"Duplicate edges were removed from {raw_pathway_file}")
161+
158162
df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')

spras/omicsintegrator1.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from spras.containers import prepare_volume, run_container
44
from spras.interactome import reinsert_direction_col_mixed
55
from spras.prm import PRM
6-
from spras.util import add_rank_column, raw_pathway_df
6+
from spras.util import add_rank_column, duplicate_edges, raw_pathway_df
77

88
__all__ = ['OmicsIntegrator1', 'write_conf']
99

@@ -227,5 +227,8 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
227227
df = reinsert_direction_col_mixed(df, "InteractionType", "pd", "pp")
228228
df.drop(columns=['InteractionType'], inplace=True)
229229
df.columns = ['Node1', 'Node2', 'Rank', 'Direction']
230+
df, has_duplicates = duplicate_edges(df)
231+
if has_duplicates:
232+
print(f"Duplicate edges were removed from {raw_pathway_file}")
230233

231234
df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')

spras/omicsintegrator2.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from spras.dataset import Dataset
77
from spras.interactome import reinsert_direction_col_undirected
88
from spras.prm import PRM
9-
from spras.util import add_rank_column
9+
from spras.util import add_rank_column, duplicate_edges
1010

1111
__all__ = ['OmicsIntegrator2']
1212

@@ -164,4 +164,8 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
164164
else: # corrupted data
165165
df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction'])
166166

167+
df, has_duplicates = duplicate_edges(df)
168+
if has_duplicates:
169+
print(f"Duplicate edges were removed from {raw_pathway_file}")
170+
167171
df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')

spras/pathlinker.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
reinsert_direction_col_directed,
88
)
99
from spras.prm import PRM
10-
from spras.util import raw_pathway_df
10+
from spras.util import duplicate_edges, raw_pathway_df
1111

1212
__all__ = ['PathLinker']
1313

@@ -141,4 +141,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
141141
df = df.take([0, 1, 2], axis=1)
142142
df = reinsert_direction_col_directed(df)
143143
df.columns = ['Node1', 'Node2', 'Rank', "Direction"]
144+
df, has_duplicates = duplicate_edges(df)
145+
if has_duplicates:
146+
print(f"Duplicate edges were removed from {raw_pathway_file}")
144147
df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')

spras/util.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,3 +76,32 @@ def raw_pathway_df(raw_pathway_file: str, sep: str = '\t', header: int = None) -
7676
df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction'])
7777

7878
return df
79+
80+
81+
def duplicate_edges(df: pd.DataFrame) -> tuple[pd.DataFrame, bool]:
    """
    Remove duplicate edges from a standardized pathway DataFrame.

    Run within every pathway reconstruction algorithm's parse_output.
    - Duplicates are identified by (Node1, Node2, Direction); among duplicates,
      the row with the smallest Rank is kept.
    - For undirected edges (Direction == "U"), the node pair is normalized so
      that Node1 is the lexicographically smaller node (e.g. "B-A" becomes
      "A-B") before deduplicating.

    @param df: a DataFrame from a raw pathway file with columns
        ['Node1', 'Node2', 'Rank', 'Direction']
    @return: a (DataFrame, bool) pair — the deduplicated DataFrame, and True
        if and only if one or more duplicate rows were dropped
    """
    # Sort by Rank first so that keep="first" below retains the smallest Rank,
    # then by the node columns to make the output order deterministic.
    df_sorted = df.sort_values(by=["Rank", "Node1", "Node2"], ascending=True, ignore_index=True)

    # For undirected edges, rewrite each node pair into sorted order so that
    # A-B and B-A compare equal during deduplication.
    undirected_mask = df_sorted["Direction"] == "U"

    # Row-wise min/max gives the sorted (lesser, greater) node pair for the
    # masked rows only.
    min_nodes = df_sorted.loc[undirected_mask, ["Node1", "Node2"]].min(axis=1)
    max_nodes = df_sorted.loc[undirected_mask, ["Node1", "Node2"]].max(axis=1)

    df_sorted.loc[undirected_mask, "Node1"] = min_nodes
    df_sorted.loc[undirected_mask, "Node2"] = max_nodes

    unique_edges_df = df_sorted.drop_duplicates(subset=["Node1", "Node2", "Direction"], keep="first", ignore_index=True)

    # Compare row counts rather than DataFrame equality: sorting and the
    # undirected node-pair normalization reorder/rewrite rows even when
    # nothing is dropped, so an equals()-based check would report phantom
    # duplicates. Rows only disappear when a duplicate was actually removed.
    return unique_edges_df, len(unique_edges_df) != len(df)
Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,20 @@
1-
,test-data-s1,test-data-s2,test-data-s3,test-data-longName,test-data-longName2,test-data-empty,test-data-spaces,test-data-mixed-direction
2-
A---B,1,1,0,0,0,0,0,0
3-
C---D,1,1,0,0,0,0,0,1
4-
E---F,1,1,0,0,0,0,0,1
5-
L---M,0,1,1,0,0,0,1,0
6-
M---N,0,0,1,0,0,0,0,0
7-
O---P,0,0,1,0,0,0,1,0
8-
P---Q,0,0,1,0,0,0,0,0
9-
node1---node2,0,0,0,1,0,0,0,0
10-
node1---node3,0,0,0,1,1,0,0,0
11-
node4---node5,0,0,0,1,1,0,0,0
12-
LONGERNAMES---TEST,0,0,0,1,1,0,0,0
13-
node2---node3,0,0,0,0,1,0,0,0
14-
nodes with---spaces in name,0,0,0,0,0,0,1,0
15-
A-->B,0,0,0,0,0,0,0,1
16-
B-->A,0,0,0,0,0,0,0,1
1+
,test-data-s1,test-data-s2,test-data-s3,test-data-longName,test-data-longName2,test-data-empty,test-data-spaces,test-data-mixed-direction,test-data-repeat-edges-directed
2+
A---B,1,1,0,0,0,0,0,0,0
3+
C---D,1,1,0,0,0,0,0,1,0
4+
E---F,1,1,0,0,0,0,0,1,0
5+
L---M,0,1,1,0,0,0,1,0,0
6+
M---N,0,0,1,0,0,0,0,0,0
7+
O---P,0,0,1,0,0,0,1,0,0
8+
P---Q,0,0,1,0,0,0,0,0,0
9+
node1---node2,0,0,0,1,0,0,0,0,0
10+
node1---node3,0,0,0,1,1,0,0,0,0
11+
node4---node5,0,0,0,1,1,0,0,0,0
12+
LONGERNAMES---TEST,0,0,0,1,1,0,0,0,0
13+
node2---node3,0,0,0,0,1,0,0,0,0
14+
nodes with---spaces in name,0,0,0,0,0,0,1,0,0
15+
A-->B,0,0,0,0,0,0,0,1,0
16+
B-->A,0,0,0,0,0,0,0,1,0
17+
L-->M,0,0,0,0,0,0,0,0,1
18+
M-->N,0,0,0,0,0,0,0,0,1
19+
O-->P,0,0,0,0,0,0,0,0,1
20+
P-->Q,0,0,0,0,0,0,0,0,1

0 commit comments

Comments
 (0)