diff --git a/Snakefile b/Snakefile index 3cd39ff8..166c8b2f 100644 --- a/Snakefile +++ b/Snakefile @@ -2,11 +2,12 @@ import os from spras import runner import shutil import yaml -from spras.dataset import Dataset -from spras.evaluation import Evaluation from spras.analysis import ml, summary, cytoscape from spras.config.revision import detach_spras_revision import spras.config.config as _config +from spras.dataset import Dataset +from spras.evaluation import Evaluation +from spras.statistics import from_output_pathway, statistics_computation, statistics_options # Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037 # and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them @@ -312,18 +313,36 @@ rule viz_cytoscape: run: cytoscape.run_cytoscape(input.pathways, output.session, container_settings) +# We generate new Snakemake rules for every statistic +# to allow parallel and lazy computation of individual statistics +for keys, values in statistics_computation.items(): + pythonic_name = 'generate_' + '_and_'.join([key.lower().replace(' ', '_') for key in keys]) + rule: + # (See https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#procedural-rule-definition) + name: pythonic_name + input: pathway_file = rules.parse_output.output.standardized_file + output: [SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'statistics', f'{key}.txt']) for key in keys] + run: + (Path(input.pathway_file).parent / 'statistics').mkdir(exist_ok=True) + graph = from_output_pathway(input.pathway_file) + for computed, stat_file in zip(values(graph), output): + Path(stat_file).write_text(str(computed)) # Write a single summary table for all pathways for each dataset rule summary_table: input: # Collect all pathways generated for the dataset pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params), - 
dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']) + dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']), + # Collect all possible options + statistics = expand( + '{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}statistics{sep}{statistic}.txt', + out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, statistic=statistics_options) output: summary_table = SEP.join([out_dir, '{dataset}-pathway-summary.txt']) run: # Load the node table from the pickled dataset file node_table = Dataset.from_file(input.dataset_file).node_table - summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params) + summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params, input.statistics) summary_df.to_csv(output.summary_table, sep='\t', index=False) # Cluster the output pathways for each dataset diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py index 91ba544a..237f7c5b 100644 --- a/spras/analysis/summary.py +++ b/spras/analysis/summary.py @@ -1,14 +1,15 @@ +import ast import json from pathlib import Path -from statistics import median from typing import Iterable -import networkx as nx import pandas as pd +from spras.statistics import from_output_pathway + def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict], - algo_with_params: list[str]) -> pd.DataFrame: + algo_with_params: list[str], statistics_files: list) -> pd.DataFrame: """ Generate a table that aggregates summary information about networks in file_paths, including which nodes are present in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the @@ -18,6 +19,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg @param algo_params: a nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations. 
@param algo_with_params: a list of -params- combinations + @param statistics_files: a list of statistic files with the computed statistics. @return: pandas DataFrame with summary information """ # Ensure that NODEID is the first column @@ -40,52 +42,18 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg # Iterate through each network file path for index, file_path in enumerate(sorted(file_paths)): - with open(file_path, 'r') as f: - lines = f.readlines()[1:] # skip the header line - # directed or mixed graphs are parsed and summarized as an undirected graph - nw = nx.read_edgelist(lines, data=(('weight', float), ('Direction', str))) + nw = from_output_pathway(file_path) # Save the network name, number of nodes, number edges, and number of connected components nw_name = str(file_path) - number_nodes = nw.number_of_nodes() - number_edges = nw.number_of_edges() - ncc = nx.number_connected_components(nw) - - # Save the max/median degree, average clustering coefficient, and density - if number_nodes == 0: - max_degree = 0 - median_degree = 0.0 - density = 0.0 - else: - degrees = [deg for _, deg in nw.degree()] - max_degree = max(degrees) - median_degree = median(degrees) - density = nx.density(nw) - - cc = list(nx.connected_components(nw)) - # Save the max diameter - # Use diameter only for components with ≥2 nodes (singleton components have diameter 0) - diameters = [ - nx.diameter(nw.subgraph(c).copy()) if len(c) > 1 else 0 - for c in cc - ] - max_diameter = max(diameters, default=0) - - # Save the average path lengths - # Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0) - avg_path_lengths = [ - nx.average_shortest_path_length(nw.subgraph(c).copy()) if len(c) > 1 else 0.0 - for c in cc - ] - - if len(avg_path_lengths) != 0: - avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths) - else: - avg_path_len = 0.0 + + # We use ast.literal_eval here to convert statistic file 
outputs to ints or floats depending on their string representation. + # (e.g. "5.0" -> float(5.0), while "5" -> int(5).) + graph_statistics = [ast.literal_eval(Path(file).read_text()) for file in statistics_files if Path(file).parent.parent == Path(file_path).parent] # Initialize list to store current network information - cur_nw_info = [nw_name, number_nodes, number_edges, ncc, density, max_degree, median_degree, max_diameter, avg_path_len] + cur_nw_info = [nw_name, *graph_statistics] # Iterate through each node property and save the intersection with the current network for node_list in nodes_by_col: @@ -107,8 +75,10 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg # Save the current network information to the network summary list nw_info.append(cur_nw_info) + # Get the deduplicated list of statistic names by their file names + statistics_options = list(dict.fromkeys(Path(file).stem for file in statistics_files)) # Prepare column names - col_names = ['Name', 'Number of nodes', 'Number of edges', 'Number of connected components', 'Density', 'Max degree', 'Median degree', 'Max diameter', 'Average path length'] + col_names = ['Name', *statistics_options] col_names.extend(nodes_by_col_labs) col_names.append('Parameter combination') @@ -122,65 +92,4 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg return nw_info -def degree(g): - return dict(g.degree) - -# TODO: redo .run code to work on mixed graphs -# stats is just a list of functions to apply to the graph. -# They should take as input a networkx graph or digraph but may have any output. 
-# stats = [degree, nx.clustering, nx.betweenness_centrality] - - -# def produce_statistics(g: nx.Graph, s=None) -> dict: -# global stats -# if s is not None: -# stats = s -# d = dict() -# for s in stats: -# sname = s.__name__ -# d[sname] = s(g) -# return d - - -# def load_graph(path: str) -> nx.Graph: -# g = nx.read_edgelist(path, data=(('weight', float), ('Direction',str))) -# return g - - -# def save(data, pth): -# fout = open(pth, 'w') -# fout.write('#node\t%s\n' % '\t'.join([s.__name__ for s in stats])) -# for node in data[stats[0].__name__]: -# row = [data[s.__name__][node] for s in stats] -# fout.write('%s\t%s\n' % (node, '\t'.join([str(d) for d in row]))) -# fout.close() - - -# def run(infile: str, outfile: str) -> None: -# """ -# run function that wraps above functions. -# """ -# # if output directory doesn't exist, make it. -# outdir = os.path.dirname(outfile) -# if not os.path.exists(outdir): -# os.makedirs(outdir) - -# # load graph, produce stats, and write to human-readable file. -# g = load_graph(infile) -# dat = produce_statistics(g) -# save(dat, outfile) - - -# def main(argv): -# """ -# for testing -# """ -# g = load_graph(argv[1]) -# print(g.nodes) -# dat = produce_statistics(g) -# print(dat) -# save(dat, argv[2]) - - -# if __name__ == '__main__': -# main(sys.argv) +# TODO: redo the above code to work on mixed graphs diff --git a/spras/statistics.py b/spras/statistics.py new file mode 100644 index 00000000..251fecca --- /dev/null +++ b/spras/statistics.py @@ -0,0 +1,76 @@ +""" +Graph statistics, used to power summary.py. + +We allow for arbitrary computation of any specific statistic on some graph, +computing more than necessary if we have dependencies. See the top level +`statistics_computation` dictionary for usage. + +To make the statistics allow directed graph input, they will always take +in a networkx.DiGraph, which contains even more information, even though +the underlying graph may be just as easily represented by networkx.Graph. 
+""" + +import itertools +from statistics import median +from typing import Callable + +import networkx as nx + + +def compute_degree(graph: nx.DiGraph) -> tuple[int, float]: + """ + Computes the (max, median) degree of a `graph`. + """ + # number_of_nodes is a cheap call + if graph.number_of_nodes() == 0: + return (0, 0.0) + else: + degrees = [deg for _, deg in graph.degree()] + return max(degrees), median(degrees) + +def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]: + # We convert our directed_graph to an undirected graph as networkx (reasonably) does + # not allow for computing the connected components of a directed graph, but the connected + # component count still is a useful statistic for us. + graph: nx.Graph = directed_graph.to_undirected() + cc = list(nx.connected_components(graph)) + # Save the max diameter + # Use diameter only for components with ≥2 nodes (singleton components have diameter 0) + diameters = [ + nx.diameter(graph.subgraph(c).copy()) if len(c) > 1 else 0 + for c in cc + ] + max_diameter = max(diameters, default=0) + + # Save the average path lengths + # Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0) + avg_path_lengths = [ + nx.average_shortest_path_length(graph.subgraph(c).copy()) if len(c) > 1 else 0.0 + for c in cc + ] + + if len(avg_path_lengths) != 0: + avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths) + else: + avg_path_len = 0.0 + + return max_diameter, avg_path_len + +# The type signature here is meant to be 'an n-tuple has n-outputs.' 
+statistics_computation: dict[tuple[str, ...], Callable[[nx.DiGraph], tuple[float | int, ...]]] = { + ('Number of nodes',): lambda graph: (graph.number_of_nodes(),), + ('Number of edges',): lambda graph: (graph.number_of_edges(),), + ('Number of connected components',): lambda graph: (nx.number_connected_components(graph.to_undirected()),), + ('Density',): lambda graph: (nx.density(graph),), + ('Max degree', 'Median degree'): compute_degree, + ('Max diameter', 'Average path length'): compute_on_cc, +} + +# All of the keys inside statistics_computation, flattened. +statistics_options: list[str] = list(itertools.chain.from_iterable(statistics_computation)) + +def from_output_pathway(pathway_file) -> nx.Graph: + with open(pathway_file, 'r') as f: + lines = f.readlines()[1:] # skip the header line + + return nx.read_edgelist(lines, data=(('Rank', int), ('Direction', str)))