diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..37ca25e --- /dev/null +++ b/.travis.yml @@ -0,0 +1,21 @@ +language: python +python: + - 2.7 + - 3.4 + - 3.5 + - 3.6 +install: + - sudo apt-get -y update + - sudo apt-get -y install r-base + - sudo apt-get -y install python-matplotlib + - pip install codecov + - pip install -r requirements.txt + - cd wext + - python setup.py install + - cd ../ + - pwd + - ls +script: + - nosetests +after_success: + - codecov diff --git a/README.md b/README.md index a77b0c5..952fbe8 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,11 @@ # Weighted Exclusivity Test (WExT) # -The Weighted Exclusivity Test (WExT) was developed by the [Raphael research group](http://compbio.cs.brown.edu/) at Brown University. - -### Requirements ### - -Latest tested version in parentheses. +[![Build Status](https://api.travis-ci.org/raphael-group/wext.svg?branch=master)](https://travis-ci.org/raphael-group/wext?branch=master) -1. Python (2.7.9) - a. NumPy (1.11.0) - - b. SciPy (0.17.0) +The Weighted Exclusivity Test (WExT) was developed by the [Raphael research group](http://compbio.cs.brown.edu/) at Brown University. -2. gcc (4.9.2) +### Requirements ### We recommend using [`virtualenv`](https://virtualenv.pypa.io/en/latest/) to install the Python requirements. After installing `virtualenv`, you can install the Python requirements for the weighted exclusivity test as follows: @@ -27,8 +20,7 @@ See the wiki for additional instructions on [Setup and installation](https://git The C and Fortran extensions must be compiled before running the weighted exclusivity test: cd wext - python setup.py build - f2py -c src/fortran/bipartite_edge_swap_module.f95 -m bipartite_edge_swap_module + python setup.py install ### Usage ### diff --git a/compute_mutation_probabilities.py b/compute_mutation_probabilities.py index da2a41d..5be6777 100755 --- a/compute_mutation_probabilities.py +++ b/compute_mutation_probabilities.py @@ -4,10 +4,12 @@ import sys, os, argparse, json, numpy as np, multiprocessing as mp, random from collections import defaultdict + # Load the weighted exclusivity test this_dir = os.path.dirname(os.path.realpath(__file__)) sys.path.append(this_dir) from wext import * +from past.builtins import xrange # Argument parser def get_parser(): @@ -20,12 +22,13 @@ def get_parser(): parser.add_argument('-q', '--swap_multiplier', type=int, required=False, default=100) parser.add_argument('-nc', '--num_cores', type=int, required=False, default=1) parser.add_argument('-s', '--seed', type=int, required=False, default=None) - parser.add_argument('-v', '--verbose', type=int, required=False, default=1, choices=range(5)) + parser.add_argument('-v', '--verbose', type=int, required=False, default=1, choices=list(range(5))) return parser -def permute_matrices_wrapper(args): return permute_matrices(*args) -def permute_matrices(edge_list, max_swaps, max_tries, seeds, verbose, - m, n, num_edges, indexToGene, indexToPatient): +def permute_matrices_wrapper(args): + return permute_matrices(*args) + +def permute_matrices(edge_list, max_swaps, max_tries, seeds, verbose, m, n, num_edges, indexToGene, indexToPatient): # Initialize our output observed = np.zeros((m, n)) permutations = [] @@ -43,8 +46,8 @@ def permute_matrices(edge_list, max_swaps, max_tries, seeds, verbose, indices.append( (edge[0]-1, edge[1]-1) ) # Record the permutation - observed[zip(*indices)] += 1. - geneToCases = dict( (g, list(cases)) for g, cases in geneToCases.iteritems() ) + observed[tuple(zip(*indices))] += 1. + geneToCases = dict( (g, list(cases)) for g, cases in geneToCases.items()) permutations.append( dict(geneToCases=geneToCases, permutation_number=seed) ) return observed/float(len(seeds)), permutations @@ -76,20 +79,20 @@ def run( args ): # Load mutation data if args.verbose > 0: - print '* Loading mutation data...' + print('* Loading mutation data...') mutation_data = load_mutation_data( args.mutation_file ) genes, all_genes, patients, geneToCases, patientToMutations, params, hypermutators = mutation_data - geneToObserved = dict( (g, len(cases)) for g, cases in geneToCases.iteritems() ) - patientToObserved = dict( (p, len(muts)) for p, muts in patientToMutations.iteritems() ) + geneToObserved = dict( (g, len(cases)) for g, cases in geneToCases.items()) + patientToObserved = dict( (p, len(muts)) for p, muts in patientToMutations.items()) geneToIndex = dict( (g, i+1) for i, g in enumerate(all_genes) ) indexToGene = dict( (i+1, g) for i, g in enumerate(all_genes) ) patientToIndex = dict( (p, j+1) for j, p in enumerate(patients) ) indexToPatient = dict( (j+1, p) for j, p in enumerate(patients) ) edges = set() - for gene, cases in geneToCases.iteritems(): + for gene, cases in geneToCases.items(): for patient in cases: edges.add( (geneToIndex[gene], patientToIndex[patient]) ) @@ -97,7 +100,7 @@ def run( args ): # Run the bipartite edge swaps if args.verbose > 0: - print '* Permuting matrices...' + print('* Permuting matrices...') m = len(all_genes) n = len(patients) @@ -127,7 +130,7 @@ def run( args ): # Create the weights file if args.weights_file: if args.verbose > 0: - print '* Saving weights file...' + print('* Saving weights file...') # Allow for small accumulated numerical errors tol = 1e3*max(m, n)*args.num_permutations*np.finfo(np.float64).eps @@ -137,10 +140,10 @@ def run( args ): P = np.add.reduce(observeds) / float(len(observeds)) # Verify the weights - for g, obs in geneToObserved.iteritems(): + for g, obs in geneToObserved.items(): assert( np.abs(P[geneToIndex[g]-1].sum() - obs) < tol) - for p, obs in patientToObserved.iteritems(): + for p, obs in patientToObserved.items(): assert( np.abs(P[:, patientToIndex[p]-1].sum() - obs) < tol) # Construct mutation matrix to compute marginals @@ -154,12 +157,12 @@ def run( args ): P = postprocess_weight_matrix(P, r, s) # Verify the weights again - for g, obs in geneToObserved.iteritems(): + for g, obs in geneToObserved.items(): assert( np.abs(P[geneToIndex[g]-1].sum() - obs) < tol) - for p, obs in patientToObserved.iteritems(): + for p, obs in patientToObserved.items(): assert( np.abs(P[:, patientToIndex[p]-1].sum() - obs) < tol) - + # Add pseudocounts to entries with no mutations observed; unlikely or impossible after post-processing step P[P == 0] = 1./(2. * args.num_permutations) @@ -171,7 +174,7 @@ def run( args ): if args.permutation_directory: output_prefix = args.permutation_directory + '/permuted-mutations-{}.json' if args.verbose > 0: - print '* Saving permuted mutation data...' + print('* Saving permuted mutation data...') for _, permutation_list in results: for permutation in permutation_list: @@ -180,4 +183,5 @@ def run( args ): permutation['params'] = params json.dump( permutation, OUT ) -if __name__ == '__main__': run( get_parser().parse_args(sys.argv[1:]) ) +if __name__ == '__main__': + run( get_parser().parse_args(sys.argv[1:]) ) diff --git a/examples/generate_data.py b/examples/generate_data.py index e5d5606..5a6c4c9 100644 --- a/examples/generate_data.py +++ b/examples/generate_data.py @@ -81,4 +81,5 @@ def run(args): raise NotImplementedError('Data generation mode "%s" is not implemented.' % args.mode) return -if __name__ == '__main__': run( get_parser().parse_args(sys.argv[1:]) ) \ No newline at end of file +if __name__ == '__main__': + run( get_parser().parse_args(sys.argv[1:]) ) \ No newline at end of file diff --git a/experiments/eccb2016/scripts/helper.py b/experiments/eccb2016/scripts/helper.py index 597270b..8131c6a 100644 --- a/experiments/eccb2016/scripts/helper.py +++ b/experiments/eccb2016/scripts/helper.py @@ -1,5 +1,7 @@ #!/usr/bin/env python + import numpy as np +from past.builtins import xrange # Add a y=x line to the given matplotlib axis def add_y_equals_x(ax, c='k', line_style='--', alpha=0.75): @@ -15,6 +17,7 @@ def add_y_equals_x(ax, c='k', line_style='--', alpha=0.75): ax.set_xlim(lims) ax.set_ylim(lims) + def aligned_plaintext_table(table, sep='\t', spaces=2): """ Create and return an aligned plaintext table. @@ -41,6 +44,7 @@ def aligned_plaintext_table(table, sep='\t', spaces=2): # Return results. return '\n'.join([''.join([entries[i][j].rjust(sizes[j]+spaces) for j in range(n)]).rstrip() for i in range(m)]) + def rank(a, reverse=False, ties=2): """ Find the ranks of the elements of a. diff --git a/experiments/eccb2016/scripts/pairs_summary.py b/experiments/eccb2016/scripts/pairs_summary.py index 8275746..2920d18 100755 --- a/experiments/eccb2016/scripts/pairs_summary.py +++ b/experiments/eccb2016/scripts/pairs_summary.py @@ -86,7 +86,7 @@ "Cancer": cancer}) df = pd.DataFrame(items) -print 'Testing {} pairs...'.format(len(weighted_exact_pvals)) +print('Testing {} pairs...'.format(len(weighted_exact_pvals))) # Set up the figure fig, ((ax1, ax2, ax3, ax4)) = plt.subplots(1, 4) @@ -138,15 +138,15 @@ # Output the correlation between all_correlation = spearmanr(weighted_exact_pvals, weighted_saddlepoint_pvals) tail_correlation = spearmanr(weighted_exact_tail_pvals, weighted_saddlepoint_tail_pvals) -print '-' * 14, 'Correlation: WRE (Saddlepoint) and WRE (Recursive)', '-' * 14 -print 'All: \\rho={:.5}, P={:.5}'.format(*all_correlation) -print '\Phi_WR < 10^-4: \\rho={:.5}, P={:.5}'.format(*tail_correlation) +print('-' * 14, 'Correlation: WRE (Saddlepoint) and WRE (Recursive)', '-' * 14) +print('All: \\rho={:.5}, P={:.5}'.format(*all_correlation)) +print('\Phi_WR < 10^-4: \\rho={:.5}, P={:.5}'.format(*tail_correlation)) # Output a table summarizing the runtimes (Table 3) -print '-' * 35, 'Runtimes', '-' * 35 +print('-' * 35, 'Runtimes', '-' * 35) tbl = ['#Method\tMinimum\tMedian\tMaximum\tTotal'] for method in ["WRE (Exact)", "WRE (Saddlepoint)"]: - print method, sum(list(df.loc[df['Method'] == method]['Runtime (seconds)'])) + print(method, sum(list(df.loc[df['Method'] == method]['Runtime (seconds)']))) # Output to file plt.tight_layout() diff --git a/experiments/eccb2016/scripts/permutation_test_helper.py b/experiments/eccb2016/scripts/permutation_test_helper.py index fc9b90a..56d8d57 100644 --- a/experiments/eccb2016/scripts/permutation_test_helper.py +++ b/experiments/eccb2016/scripts/permutation_test_helper.py @@ -16,7 +16,7 @@ parser.add_argument('-o', '--output_prefix', type=str, required=True) parser.add_argument('-w', '--wext_directory', type=str, required=True) parser.add_argument('-j', '--job_id', type=int, required=job_id is None, default=job_id) -parser.add_argument('-v', '--verbose', type=int, required=False, default=0, choices=range(5)) +parser.add_argument('-v', '--verbose', type=int, required=False, default=0, choices=list(range(5))) args = parser.parse_args( sys.argv[1:] ) # Load weighted exclusivity test @@ -25,19 +25,23 @@ from wext import rce_permutation_test, load_mutation_data, output_enumeration_table # Load the mutation data -if args.verbose > 0: print '* Loading mutation data..' +if args.verbose > 0: + print('* Loading mutation data..') mutation_data = load_mutation_data( args.mutation_file, args.min_freq ) genes, all_genes, patients, geneToCases, _, params, _ = mutation_data num_patients = len(patients) sets = list( frozenset(t) for t in combinations(genes, args.gene_set_size) ) -if args.verbose > 0: print '\t- Testing {} sets of size k={}'.format(len(sets), args.gene_set_size) +if args.verbose > 0: + print('\t- Testing {} sets of size k={}'.format(len(sets), args.gene_set_size)) # Run the permutational test -if args.verbose > 0: print '* Running permutation test...' +if args.verbose > 0: + print('* Running permutation test...') start_index = (args.job_id-1) * args.batch_size permuted_files = get_permuted_files([args.input_directory], args.num_permutations)[start_index:start_index + args.batch_size] -if args.verbose > 0: print '\t- Testing {} files'.format(len(permuted_files)) +if args.verbose > 0: + print('\t- Testing {} files'.format(len(permuted_files))) setToPval, setToRuntime, setToFDR, setToObs = rce_permutation_test( sets, geneToCases, num_patients, permuted_files, 1, 0 ) diff --git a/experiments/eccb2016/scripts/permute_single_matrix.py b/experiments/eccb2016/scripts/permute_single_matrix.py index 1efa4f0..c25eff2 100755 --- a/experiments/eccb2016/scripts/permute_single_matrix.py +++ b/experiments/eccb2016/scripts/permute_single_matrix.py @@ -17,6 +17,7 @@ def get_parser(): default=os.environ.get('SGE_TASK_ID', 0)) return parser + def run( args ): # Load WExT sys.path.append(args.wext_dir) @@ -33,7 +34,7 @@ def run( args ): indexToPatient = dict( (j+1, p) for j, p in enumerate(patients) ) edges = set() - for gene, cases in geneToCases.iteritems(): + for gene, cases in geneToCases.items(): for patient in cases: edges.add( (geneToIndex[gene], patientToIndex[patient]) ) @@ -57,16 +58,16 @@ def run( args ): permutedPatientToMutations[patient].add(gene) # Verify the number of mutations per gene/patient is preserved - for g, cases in geneToCases.iteritems(): + for g, cases in geneToCases.items(): assert( len(cases) == len(permutedGeneToCases[g]) ) - for p, muts in patientToMutations.iteritems(): + for p, muts in patientToMutations.items(): assert( len(muts) == len(permutedPatientToMutations[p]) ) # Save edge list. output_file = '{}-{}.json'.format(args.output_prefix, args.job_id) permutation = dict(params=params, permutation_number=args.job_id, - geneToCases=dict( (g, list(cases)) for g, cases in permutedGeneToCases.iteritems())) + geneToCases=dict( (g, list(cases)) for g, cases in permutedGeneToCases.items())) with open(output_file, 'w') as OUT: json.dump( permutation, OUT ) if __name__ == '__main__': diff --git a/experiments/eccb2016/scripts/pval_correlations.py b/experiments/eccb2016/scripts/pval_correlations.py index a124476..75e3c86 100755 --- a/experiments/eccb2016/scripts/pval_correlations.py +++ b/experiments/eccb2016/scripts/pval_correlations.py @@ -31,7 +31,7 @@ # Compute the correlations with permutational # permutational_pvals_with_zeros = list(df.loc[df['Method'] == 'Permutational']['Raw P-value']) # all_indices = -tests = ["Permutational", "Fisher's exact test", "Weighted (exact test)", "Weighted (saddlepoint)"] +tests = ["Permutational", "Fisher's exact test", "Weighted (exact test)", "Weighted (saddlepoint)"] for val, indices in [("All", []), (0, 1./args.num_permutations), (1./args.num_permutations, 2)]: tbl = [list(tests)] for t1 in tests: @@ -46,33 +46,33 @@ row.append(rho) tbl.append(row) - print '-' * 80 - print 'CORRELATIONS ({})'.format(val) - print aligned_plaintext_table('\n'.join([ '\t'.join(map(str, row)) for row in tbl ]) ) + print('-' * 80) + print('CORRELATIONS ({})'.format(val)) + print(aligned_plaintext_table('\n'.join([ '\t'.join(map(str, row)) for row in tbl ]))) permutational_pvals_no_zeros = [ p for p in permutational_pvals_with_zeros if p > 0 ] for method in ["Fisher's exact test", "Weighted (exact test)", "Weighted (saddlepoint)"]: pvals = list(df.loc[df['Method'] == method]['P-value']) - print 'Correlation:', method, 'with Permutational' + print('Correlation:', method, 'with Permutational') rho, pval = spearmanr(permutational_pvals, pvals) - print '\tIncluding P < {}: N={}, \\rho={}, P={}'.format(1./args.num_permutations, len(pvals), rho, pval) + print('\tIncluding P < {}: N={}, \\rho={}, P={}'.format(1./args.num_permutations, len(pvals), rho, pval)) pvals_no_zeros = [ p for i, p in enumerate(pvals) if permutational_pvals_with_zeros[i] > 0 ] rho, pval = spearmanr(permutational_pvals_no_zeros, pvals_no_zeros) - print '\tWithout P < {}: N={}, \\rho={}, P={}'.format(1./args.num_permutations, len(pvals_no_zeros), rho, pval) -print + print('\tWithout P < {}: N={}, \\rho={}, P={}'.format(1./args.num_permutations, len(pvals_no_zeros), rho, pval)) + # Compute the correlations of weighted saddlepoint and exact test weighted_exact_pvals = list(df.loc[df['Method'] == 'Weighted (exact test)']['P-value']) weighted_saddlepoint_pvals = list(df.loc[df['Method'] == 'Weighted (saddlepoint)']['P-value']) rho, pval = spearmanr(weighted_exact_pvals, weighted_saddlepoint_pvals) -print 'Correlation of weighted exact test and saddlepoint (all P-values)' -print '\tN={}, \\rho: {}, P={}'.format(len(weighted_exact_pvals), rho, pval) +print('Correlation of weighted exact test and saddlepoint (all P-values)') +print('\tN={}, \\rho: {}, P={}'.format(len(weighted_exact_pvals), rho, pval)) tail_weighted_exact_pvals = [ p for p in weighted_exact_pvals if p < 1e-4 ] rho, pval = spearmanr(tail_weighted_exact_pvals, [ p for i, p in enumerate(weighted_saddlepoint_pvals) if weighted_exact_pvals[i] < 1e-4]) -print 'Correlation of weighted exact test and saddlepoint (P < 0.0001)' -print '\tN={}, \\rho: {}, P={}'.format(len(tail_weighted_exact_pvals), rho, pval) +print('Correlation of weighted exact test and saddlepoint (P < 0.0001)') +print('\tN={}, \\rho: {}, P={}'.format(len(tail_weighted_exact_pvals), rho, pval)) rho, pval = spearmanr(tail_weighted_exact_pvals, [ p for i, p in enumerate(permutational_pvals) if weighted_exact_pvals[i] < 1e-4]) -print 'Correlation of weighted exact test and permutational (P < 0.0001)' -print '\tN={}, \\rho: {}, P={}'.format(len(tail_weighted_exact_pvals), rho, pval) +print('Correlation of weighted exact test and permutational (P < 0.0001)') +print('\tN={}, \\rho: {}, P={}'.format(len(tail_weighted_exact_pvals), rho, pval)) diff --git a/experiments/eccb2016/scripts/reconcile_grid_permutation_test.py b/experiments/eccb2016/scripts/reconcile_grid_permutation_test.py index 3b123a2..c60a68c 100644 --- a/experiments/eccb2016/scripts/reconcile_grid_permutation_test.py +++ b/experiments/eccb2016/scripts/reconcile_grid_permutation_test.py @@ -19,6 +19,7 @@ # Load and merge the JSON files def load_json_files(( json_files )): + setToCount = defaultdict( int ) setToRuntime = defaultdict( float ) setToObs = dict() @@ -27,7 +28,7 @@ def load_json_files(( json_files )): # Parse the JSON file with open(json_file, 'r') as IN: obj = json.load(IN) - for M, pval in obj['setToPval'].iteritems(): + for M, pval in obj['setToPval'].items(): frozen_M = frozenset(M.split('\t')) setToCount[frozen_M] += int(round(pval * args.batch_size)) setToRuntime[frozen_M] += obj['setToRuntime'][M] @@ -42,7 +43,7 @@ def load_json_files(( json_files )): json_files = [ '{}/{}'.format(root, f) for f in files if f.lower().endswith('.json') ] # Set up the multiprocessing and run -print '* Loading {} JSON files...'.format(len(json_files)) +print('* Loading {} JSON files...'.format(len(json_files))) num_cores = args.num_cores if args.num_cores != -1 else mp.cpu_count() if num_cores != 1: pool = mp.Pool(num_cores) @@ -58,7 +59,7 @@ def load_json_files(( json_files )): pool.join() # Merge the results -print '\t- Merging results...' +print('\t- Merging results...') setToCount = defaultdict( int ) setToRuntime = defaultdict( float ) setToObs = dict() @@ -72,16 +73,16 @@ def load_json_files(( json_files )): setToPval = dict( (M, count/num_permutations) for M, count in setToCount.iteritems() ) -print '\t- Loaded {} sets with {} permutations'.format(len(setToPval), int(num_permutations)) +print('\t- Loaded {} sets with {} permutations'.format(len(setToPval), int(num_permutations))) # Compute FDR -print '* Computing FDRs...' +print('* Computing FDRs...') tested_sets = setToPval.keys() pvals = [ setToPval[M] for M in tested_sets ] setToFDR = dict(zip(tested_sets, multiple_hypothesis_correction(pvals, method="BY"))) # Output the merged file -print '* Outputting to file...' +print('* Outputting to file...') k = len(tested_sets[0]) args.json_format = True args.test = 'RCE' diff --git a/experiments/eccb2016/scripts/remove_genes_with_no_length.py b/experiments/eccb2016/scripts/remove_genes_with_no_length.py index 5441499..f0897e7 100644 --- a/experiments/eccb2016/scripts/remove_genes_with_no_length.py +++ b/experiments/eccb2016/scripts/remove_genes_with_no_length.py @@ -23,13 +23,13 @@ # Remove genes without a length original_genes = set(obj['genes']) remaining_genes = original_genes & set(geneToLength.keys()) -obj['geneToCases'] = dict( (g, cases) for g, cases in obj['geneToCases'].iteritems() if g in geneToLength ) +obj['geneToCases'] = dict( (g, cases) for g, cases in obj['geneToCases'].items() if g in geneToLength ) obj['genes'] = sorted(obj['geneToCases'].keys()) obj['num_genes'] = len(obj['genes']) obj['params']['lengths_file'] = os.path.abspath(args.lengths_file) obj['genes_with_no_length_removed'] = sorted(original_genes - set(obj['genes'])) -obj['patientToMutations'] = dict( (p, sorted(set(muts) & remaining_genes)) for p, muts in obj['patientToMutations'].iteritems() ) -print 'Removed {} genes with no length'.format(len(obj['genes_with_no_length_removed'])) +obj['patientToMutations'] = dict((p, sorted(set(muts) & remaining_genes)) for p, muts in obj['patientToMutations'].items()) +print('Removed {} genes with no length'.format(len(obj['genes_with_no_length_removed']))) # Output the new file with open(args.output_file, 'w') as OUT: diff --git a/experiments/eccb2016/scripts/results_table.py b/experiments/eccb2016/scripts/results_table.py index a8514db..7953f09 100755 --- a/experiments/eccb2016/scripts/results_table.py +++ b/experiments/eccb2016/scripts/results_table.py @@ -22,47 +22,49 @@ with open(args.lengths_file, 'r') as IN: arrs = [ l.rstrip().split('\t') for l in IN if not l.startswith('#') ] geneToLength = dict( (arr[0], float(arr[1])) for arr in arrs ) - lengths = geneToLength.values() + lengths = list(geneToLength.values()) length_ranks = rank(lengths, reverse=True) geneToLengthRank = defaultdict( lambda : args.length_threshold + 1 ) - geneToLengthRank.update(zip(geneToLength.keys(), length_ranks)) - threshold_gene = sorted(geneToLength.keys(), key=lambda g: geneToLengthRank[g])[args.length_threshold] - print 'Length of {} longest gene: {}'.format(args.length_threshold, geneToLength[threshold_gene]) + geneToLengthRank.update(list(zip(list(geneToLength.keys()), length_ranks))) + threshold_gene = sorted(list(geneToLength.keys()), key=lambda g: geneToLengthRank[g])[args.length_threshold] + print('Length of {} longest gene: {}'.format(args.length_threshold, geneToLength[threshold_gene])) # Load the mutations with open(args.mutation_file, 'r') as IN: obj = json.load(IN) genes, patients = obj['genes'], obj['patients'] hypermutators = set(obj['hypermutators']) - geneToCases = dict( (g, set(cases)) for g, cases in obj['geneToCases'].iteritems() ) + geneToCases = dict((g, set(cases)) for g, cases in obj['geneToCases'].items()) # Load the triples with open(args.unweighted_exact_file, 'r') as IN: obj = json.load(IN) - unweightedPval = dict( (frozenset(t.split('\t')), pval) for t, pval in obj['setToPval'].iteritems() ) + unweightedPval = dict((frozenset(t.split('\t')), pval) for t, pval in obj['setToPval'].items()) assert( all( not(isnan(pval)) for pval in unweightedPval.values() )) - unweightedFDR = dict( (frozenset(t.split('\t')), fdr) for t, fdr in obj['setToFDR'].iteritems() ) + unweightedFDR = dict((frozenset(t.split('\t')), fdr) for t, fdr in obj['setToFDR'].items()) with open(args.weighted_saddlepoint_file, 'r') as IN: obj = json.load(IN) - weightedPval = dict( (frozenset(t.split('\t')), pval) for t, pval in obj['setToPval'].iteritems() ) + weightedPval = dict((frozenset(t.split('\t')), pval) for t, pval in obj['setToPval'].items()) assert( all( not(isnan(pval)) for pval in weightedPval.values() )) - weightedFDR = dict( (frozenset(t.split('\t')), fdr) for t, fdr in obj['setToFDR'].iteritems() ) + weightedFDR = dict((frozenset(t.split('\t')), fdr) for t, fdr in obj['setToFDR'].items()) -print 'Triples with weighted FDR < {}: {}/{}'.format(args.fdr_cutoff, sum(1 for t, fdr in weightedFDR.iteritems() if fdr < args.fdr_cutoff), len(weightedFDR)) -print 'Triples with unweighted FDR < {}: {}/{}'.format(args.fdr_cutoff, sum(1 for t, fdr in unweightedFDR.iteritems() if fdr < args.fdr_cutoff), len(unweightedFDR)) +print('Triples with weighted FDR < {}: {}/{}'.format(args.fdr_cutoff, sum(1 for t, fdr in weightedFDR.items() if fdr < args.fdr_cutoff), len(weightedFDR))) +print('Triples with unweighted FDR < {}: {}/{}'.format(args.fdr_cutoff, sum(1 for t, fdr in unweightedFDR.items() if fdr < args.fdr_cutoff), len(unweightedFDR))) # Rank triples by P-value triples = sorted(set(weightedPval.keys()) & set(unweightedPval.keys())) top_weighted_triples = sorted(triples, key=lambda t: weightedPval[t]) -weightedRank = dict(zip(triples, rank([ weightedPval[t] for t in triples ]))) +weightedRank = dict(list(zip(triples, rank([ weightedPval[t] for t in triples ])))) top_unweighted_triples = sorted(triples, key=lambda t: unweightedPval[t]) -unweightedRank = dict(zip(triples, rank([ unweightedPval[t] for t in triples ]))) +unweightedRank = dict(list(zip(triples, rank([ unweightedPval[t] for t in triples ])))) # Create tables def length_indicate(g): - if geneToLengthRank[g] > args.length_threshold: return g - else: return '\\textbf{%s}' % g + if geneToLengthRank[g] > args.length_threshold: + return g + else: + return '\\textbf{%s}' % g header = ['CoMEt rank', 'Weighted rank', 'Triple', 'Phi(M)', 'Psi(M)', 'Hypermutator mutations'] tbl = [ header ] diff --git a/experiments/eccb2016/scripts/sample_mutation_frequency_plot.py b/experiments/eccb2016/scripts/sample_mutation_frequency_plot.py index 3435794..909bcba 100755 --- a/experiments/eccb2016/scripts/sample_mutation_frequency_plot.py +++ b/experiments/eccb2016/scripts/sample_mutation_frequency_plot.py @@ -26,12 +26,12 @@ # Make a map of patients to their mutated genes patientToMutations = dict( (p, set()) for p in patients ) - for g, cases in obj['geneToCases'].iteritems(): + for g, cases in obj['geneToCases'].items(): for p in cases: patientToMutations[p].add( g ) # Assemble the data into dictionaries for Pandas - for p, mutations in patientToMutations.iteritems(): + for p, mutations in patientToMutations.items(): ty = "Hypermutator" if p in hypermutators else "Non-hypermutator" items.append({ "Sample": p, "Mutated genes per sample": len(mutations), "Type": ty, "Cancer": cancer }) @@ -51,4 +51,4 @@ non_hyper_rates = list(df.loc[(df['Cancer'] == c) & (df['Type'] == "Non-hypermutator")]['Mutated genes per sample']) tbl.append([ c, np.median(all_rates), np.median(hyper_rates) if len(hyper_rates) > 0 else '--', np.median(non_hyper_rates)]) -print aligned_plaintext_table('\n'.join([ '\t'.join(map(str, row)) for row in tbl ])) +print(aligned_plaintext_table('\n'.join([ '\t'.join(map(str, row)) for row in tbl ]))) diff --git a/experiments/eccb2016/scripts/triple_pval_scatter.py b/experiments/eccb2016/scripts/triple_pval_scatter.py index 59a6eee..127c9d5 100755 --- a/experiments/eccb2016/scripts/triple_pval_scatter.py +++ b/experiments/eccb2016/scripts/triple_pval_scatter.py @@ -36,16 +36,16 @@ with open(permuted_file, 'r') as IN: setToPermuted.update( json.load(IN)['setToPval'] ) -for M, pval in setToPermuted.iteritems(): +for M, pval in setToPermuted.items(): if pval == 0: setToPermuted[M] = 1./args.num_permutations sets = set(setToWeighted.keys()) & set(setToUnweighted.keys()) & set(setToPermuted.keys()) -print '* Loaded weighted/unweighted P-values in {} triples...'.format(len(setToWeighted)) -print '\t- Weighted range: [{}, {}]'.format(np.min(setToWeighted.values()), np.max(setToWeighted.values())) -print '\t- Unweighted range: [{}, {}]'.format(np.min(setToUnweighted.values()), np.max(setToUnweighted.values())) -print '* Loaded permuted P-values for {} sets ({} intersection)...'.format(len(setToPermuted), len(sets)) +print('* Loaded weighted/unweighted P-values in {} triples...'.format(len(setToWeighted))) +print('\t- Weighted range: [{}, {}]'.format(np.min(setToWeighted.values()), np.max(setToWeighted.values()))) +print('\t- Unweighted range: [{}, {}]'.format(np.min(setToUnweighted.values()), np.max(setToUnweighted.values()))) +print('* Loaded permuted P-values for {} sets ({} intersection)...'.format(len(setToPermuted), len(sets))) # Create two scatter plots fig, (ax1, ax2) = plt.subplots(1, 2) @@ -77,17 +77,17 @@ ax2.plot(ax2.get_xlim(), ax2.get_xlim(), ls="--", c=".3") # Output maximum deviation and correlations -print 'Max deviation permutational vs. weighted (1E-3 to 1E-5):', +print('Max deviation permutational vs. weighted (1E-3 to 1E-5):') deviations = [ (x, y, np.abs(y/x)) for x, y in zip(xs, ys) if 1e-3 > x > 1e-5 ] if deviations: - print max(deviations, key=lambda (x, y, z): z) + print(max(deviations, key=lambda (x, y, z): z)) else: - print 'None in p-value interval' + print('None in p-value interval') -print 'Unweighted correlation (all): \\rho={}'.format(unweighted_rho) -print 'Unweighted correlation (P<0.001): \\rho={}'.format(unweighted_tail_rho) -print 'Weighted correlation (all): \\rho={}'.format(weighted_rho) -print 'Weighted correlation (P<0.001): \\rho={}'.format(weighted_tail_rho) +print('Unweighted correlation (all): \\rho={}'.format(unweighted_rho)) +print('Unweighted correlation (P<0.001): \\rho={}'.format(unweighted_tail_rho)) +print('Weighted correlation (all): \\rho={}'.format(weighted_rho)) +print('Weighted correlation (P<0.001): \\rho={}'.format(weighted_tail_rho)) # Output to file plt.tight_layout() diff --git a/experiments/eccb2016/scripts/unweighted_comparison.py b/experiments/eccb2016/scripts/unweighted_comparison.py index 6367890..e2c3913 100755 --- a/experiments/eccb2016/scripts/unweighted_comparison.py +++ b/experiments/eccb2016/scripts/unweighted_comparison.py @@ -27,7 +27,7 @@ exactPval[cancer] = obj['setToPval'] exactRuntime[cancer] = obj['setToRuntime'] -num_exact = sum( 1 for c in args.cancers for M in exactPval[c].keys() ) +num_exact = sum(1 for c in args.cancers for M in exactPval[c].keys()) for cancer, saddlepoint_file in zip(args.cancers, args.saddlepoint_files): with open(saddlepoint_file, 'r') as IN: @@ -35,8 +35,8 @@ saddlepointPval[cancer] = obj['setToPval'] saddlepointRuntime[cancer] = obj['setToRuntime'] -num_saddlepoint = sum( 1 for c in args.cancers for M in saddlepointPval[c].keys() ) -print '* Loaded {} exact sets and {} saddlepoint sets...'.format(num_exact, num_saddlepoint) +num_saddlepoint = sum(1 for c in args.cancers for M in saddlepointPval[c].keys()) +print('* Loaded {} exact sets and {} saddlepoint sets...'.format(num_exact, num_saddlepoint)) # Construct the arrays of data saddlepoint_pvals, exact_pvals, items = [], [], [] @@ -52,13 +52,13 @@ df = pd.DataFrame(items) -print '* Testing {} triples in the intersection (ignoring sets with invalid P-values)...'.format(len(saddlepoint_pvals)) +print('* Testing {} triples in the intersection (ignoring sets with invalid P-values)...'.format(len(saddlepoint_pvals))) # Output spearman correlations between the saddlepoint and exact rho, pval = spearmanr(exact_pvals, saddlepoint_pvals) -print '-' * 80 -print 'CORRELATION' -print "Spearman's Rho: {}\nSpearman's P-value: {}\n".format(rho, pval) +print('-' * 80) +print('CORRELATION') +print("Spearman's Rho: {}\nSpearman's P-value: {}\n".format(rho, pval)) # Set up the figure fig, (ax1, ax2) = plt.subplots(1, 2) diff --git a/experiments/eccb2016/scripts/weights_matrix.py b/experiments/eccb2016/scripts/weights_matrix.py index e7d2c54..6d958ba 100755 --- a/experiments/eccb2016/scripts/weights_matrix.py +++ b/experiments/eccb2016/scripts/weights_matrix.py @@ -17,7 +17,7 @@ assert( len(args.cancers) == len(args.weights_files) == len(args.mutation_files) ) # Load the mutation file -print '* Loading mutation files...' +print('* Loading mutation files...') cancerToWeights, cancerToPatients, cancerToGenes, cancerToHypermutators, patientToMutations, geneToCases = dict(), dict(), dict(), dict(), dict(), dict() for cancer, mutation_file, weights_file in zip(args.cancers, args.mutation_files, args.weights_files): with open(mutation_file, 'r') as IN: @@ -28,24 +28,25 @@ cancerToHypermutators[cancer] = set(obj['hypermutators']) geneToCases[cancer] = obj['geneToCases'] patientToMutations[cancer] = dict( (p, set()) for p in obj['patients'] ) - for g, cases in geneToCases[cancer].iteritems(): + for g, cases in (geneToCases[cancer].items(): for p in cases: patientToMutations[cancer][p].add( g ) cancerToWeights[cancer] = np.load(weights_file) - print '\t{}\n\t\t- Genes: {}\n\t\t- Patients: {}'.format(cancer, num_genes, num_patients) + print('\t{}\n\t\t- Genes: {}\n\t\t- Patients: {}'.format(cancer, num_genes, num_patients)) # Set up the figure fig, axes = plt.subplots( 1, len(args.cancers)) fig.set_size_inches( len(args.cancers) * 5, 5) min_weight = min([ np.min(W) for W in cancerToWeights.values() ]) -print 'Min weight:', min_weight +print('Min weight:', min_weight) + for ax, cancer in zip(axes, args.cancers): # Sort the weights so that hypermutators are all on one side patients = cancerToPatients[cancer] genes = cancerToGenes[cancer] hypermutators = cancerToHypermutators[cancer] num_non_hypermutators = len(set(patients) - hypermutators) - patient_indices = sorted(range(len(patients)), key=lambda p: (patients[p] in hypermutators, len(patientToMutations[cancer][patients[p]]))) + patient_indices = sorted(list(range(len(patients))), key=lambda p: (patients[p] in hypermutators, len(patientToMutations[cancer][patients[p]]))) gene_indices = sorted([ i for i, g in enumerate(genes) if g in geneToCases[cancer]], key=lambda g: len(geneToCases[cancer].get(genes[g], [])), reverse=True) weights = [ row[patient_indices] for row in cancerToWeights[cancer][gene_indices] ] diff --git a/find_exclusive_sets.py b/find_exclusive_sets.py index d15ae05..f6046c4 100755 --- a/find_exclusive_sets.py +++ b/find_exclusive_sets.py @@ -18,7 +18,7 @@ def get_parser(): parser.add_argument('-o', '--output_prefix', type=str, required=True) parser.add_argument('-f', '--min_frequency', type=int, default=1, required=False) parser.add_argument('-c', '--num_cores', type=int, required=False, default=1) - parser.add_argument('-v', '--verbose', type=int, required=False, default=1, choices=range(5)) + parser.add_argument('-v', '--verbose', type=int, required=False, default=1, choices=list(range(5))) parser.add_argument('-r', '--report_invalids', action='store_true', default=False, required=False) parser.add_argument('--json_format', action='store_true', default=False, required=False) @@ -105,11 +105,12 @@ def load_mutation_files(mutation_files): genes |= set(type_genes) # Record the mutations in each gene - for g, cases in typeGeneToCases.iteritems(): geneToCases[g] |= cases + for g, cases in typeGeneToCases.items(): + geneToCases[g] |= cases # Record the genes, patients, and their indices for later - typeToGeneIndex.append(dict(zip(type_genes, range(len(type_genes))))) - typeToPatientIndex.append(dict(zip(type_patients, range(len(type_patients))))) + typeToGeneIndex.append(dict(zip(type_genes, list(range(len(type_genes)))))) + typeToPatientIndex.append(dict(zip(type_patients, list(range(len(type_patients)))))) return genes, patients, geneToCases, typeToGeneIndex, typeToPatientIndex @@ -123,12 +124,12 @@ def run( args ): # Load the mutation data if args.verbose > 0: - print ('-' * 30), 'Input Mutation Data', ('-' * 29) + print(('-' * 30), 'Input Mutation Data', ('-' * 29)) genes, patients, geneToCases, typeToGeneIndex, typeToPatientIndex = load_mutation_files( args.mutation_files ) num_all_genes, num_patients = len(genes), len(patients) # Restrict to genes mutated in a minimum number of samples - geneToCases = dict( (g, cases) for g, cases in geneToCases.iteritems() if g in genes and len(cases) >= args.min_frequency ) + geneToCases = dict( (g, cases) for g, cases in geneToCases.items() if g in genes and len(cases) >= args.min_frequency ) genes = set(geneToCases.keys()) num_genes = len(genes) @@ -141,7 +142,7 @@ def run( args ): # Since we are looking for co-occurrence between exclusive sets with # an annotation A, we add events for each patient NOT annotated by # the given annotation - for annotation, cases in annotationToPatients.iteritems(): + for annotation, cases in annotationToPatients.items(): not_cases = patients - cases if len(not_cases) > 0: geneToCases[annotation] = not_cases @@ -149,18 +150,18 @@ def run( args ): annotations = set() if args.verbose > 0: - print '- Genes:', num_all_genes - print '- Patients:', num_patients - print '- Genes mutated in >={} patients: {}'.format(args.min_frequency, num_genes) + print('- Genes:', num_all_genes) + print('- Patients:', num_patients) + print('- Genes mutated in >={} patients: {}'.format(args.min_frequency, num_genes)) if args.patient_annotation_file: - print '- Patient annotations:', len(annotations) + print('- Patient annotations:', len(annotations)) # Load the weights (if necessary) test = nameToTest[args.test] if test == WRE: # Create master versions of the indices - masterGeneToIndex = dict(zip(sorted(genes), range(num_genes))) - masterPatientToIndex = dict( zip(sorted(patients), range(num_patients)) ) + masterGeneToIndex = dict(zip(sorted(genes), list(range(num_genes)))) + masterPatientToIndex = dict(zip(sorted(patients), list(range(num_patients)))) geneToP = load_weight_files(args.weights_files, genes, patients, typeToGeneIndex, typeToPatientIndex, masterGeneToIndex, masterPatientToIndex) else: geneToP = None @@ -169,17 +170,19 @@ def run( args ): if test == RCE: permuted_files = get_permuted_files(args.permuted_matrix_directories, args.num_permutations) if args.verbose > 0: - print '* Using {} permuted matrix files'.format(len(permuted_files)) + print('* Using {} permuted matrix files'.format(len(permuted_files))) #Enumeration if args.search_strategy == 'Enumerate': - if args.verbose > 0: print ('-' * 31), 'Enumerating Sets', ('-' * 31) + if args.verbose > 0: + print(('-' * 31), 'Enumerating Sets', ('-' * 31)) for k in set( args.gene_set_sizes ): # we don't need to enumerate the same size more than once # Create a list of sets to test sets = list( frozenset(t) for t in combinations(genes, k) ) num_sets = len(sets) - if args.verbose > 0: print 'k={}: {} sets...'.format(k, num_sets) + if args.verbose > 0: + print('k={}: {} sets...'.format(k, num_sets)) if test == RCE: # Run the permutational setToPval, setToRuntime, setToFDR, setToObs = rce_permutation_test( sets, geneToCases, num_patients, permuted_files, args.num_cores, args.verbose ) @@ -199,4 +202,5 @@ def run( args ): else: raise NotImplementedError("Strategy '{}' not implemented.".format(args.strategy)) -if __name__ == '__main__': run( get_parser().parse_args(sys.argv[1:]) ) +if __name__ == '__main__': + run( get_parser().parse_args(sys.argv[1:]) ) diff --git a/find_sets.py b/find_sets.py index 903f3c3..aa1f984 100755 --- a/find_sets.py +++ b/find_sets.py @@ -24,7 +24,7 @@ def get_parser(): parser.add_argument('-t', '--test', type=str, required=False, default='WRE', choices=['WRE']) parser.add_argument('-m', '--method', type=str, required=False, default='Saddlepoint', choices=['Saddlepoint']) parser.add_argument('-s', '--statistic', type=str, required=True, choices=['exclusivity', 'any-co-occurrence', 'all-co-occurrence']) - parser.add_argument('-v', '--verbose', type=int, required=False, default=1, choices=range(5)) + parser.add_argument('-v', '--verbose', type=int, required=False, default=1, choices=list(range(5)) ) parser.add_argument('-r', '--report_invalids', action='store_true', default=False, required=False) parser.add_argument('--json_format', action='store_true', default=False, required=False) return parser @@ -87,11 +87,12 @@ def load_mutation_files(mutation_files): genes |= set(type_genes) # Record the mutations in each gene - for g, cases in typeGeneToCases.iteritems(): geneToCases[g] |= cases + for g, cases in typeGeneToCases.items(): + geneToCases[g] |= cases # Record the genes, patients, and their indices for later - typeToGeneIndex.append(dict(zip(type_genes, range(len(type_genes))))) - typeToPatientIndex.append(dict(zip(type_patients, range(len(type_patients))))) + typeToGeneIndex.append(dict(zip(type_genes, list(range(len(type_genes)))))) + typeToPatientIndex.append(dict(zip(type_patients, list(range(len(type_patients)))))) return genes, patients, geneToCases, typeToGeneIndex, typeToPatientIndex @@ -101,12 +102,12 @@ def run( args ): # Load the mutation data if args.verbose > 0: - print ('-' * 30), 'Input Mutation Data', ('-' * 29) + print(('-' * 30), 'Input Mutation Data', ('-' * 29)) genes, patients, geneToCases, typeToGeneIndex, typeToPatientIndex = load_mutation_files( args.mutation_files ) num_all_genes, num_patients = len(genes), len(patients) # Restrict to genes mutated in a minimum number of samples - geneToCases = dict( (g, cases) for g, cases in geneToCases.iteritems() if g in genes and len(cases) >= args.min_frequency ) + geneToCases = dict( (g, cases) for g, cases in geneToCases.items() if g in genes and len(cases) >= args.min_frequency ) genes = set(geneToCases.keys()) num_genes = len(genes) @@ -119,7 +120,7 @@ def run( args ): # Since we are looking for co-occurrence between exclusive sets with # an annotation A, we add events for each patient NOT annotated by # the given annotation - for annotation, cases in annotationToPatients.iteritems(): + for annotation, cases in annotationToPatients.items(): not_cases = patients - cases if len(not_cases) > 0: geneToCases[annotation] = not_cases @@ -127,26 +128,28 @@ def run( args ): annotations = set() if args.verbose > 0: - print '- Genes:', num_all_genes - print '- Patients:', num_patients - print '- Genes mutated in >={} patients: {}'.format(args.min_frequency, num_genes) + print('- Genes:', num_all_genes) + print('- Patients:', num_patients) + print('- Genes mutated in >={} patients: {}'.format(args.min_frequency, num_genes)) if args.patient_annotation_file: - print '- Patient annotations:', len(annotations) + print('- Patient annotations:', len(annotations)) # Load the weights (if necessary) # Create master versions of the indices - masterGeneToIndex = dict(zip(sorted(genes), range(num_genes))) - masterPatientToIndex = dict( zip(sorted(patients), range(num_patients)) ) + masterGeneToIndex = dict(zip(sorted(genes), list(range(num_genes)))) + masterPatientToIndex = dict(zip(sorted(patients), list(range(num_patients)))) geneToP = load_weight_files(args.weights_files, genes, patients, typeToGeneIndex, typeToPatientIndex, masterGeneToIndex, masterPatientToIndex) - if args.verbose > 0: print ('-' * 31), 'Enumerating Sets', ('-' * 31) + if args.verbose > 0: + print(('-' * 31), 'Enumerating Sets', ('-' * 31)) k = args.gene_set_size # Create a list of sets to test sets = list( frozenset(t) for t in combinations(genes, k) ) num_sets = len(sets) - if args.verbose > 0: print 'k={}: {} sets...'.format(k, num_sets) + if args.verbose > 0: + print('k={}: {} sets...'.format(k, num_sets)) # Run the test method = nameToMethod['Saddlepoint'] test = nameToTest['WRE'] @@ -155,4 +158,5 @@ def run( args ): verbose=args.verbose, report_invalids=args.report_invalids) output_enumeration_table( args, k, setToPval, setToRuntime, setToFDR, setToObs, args.fdr_threshold ) -if __name__ == '__main__': run( get_parser().parse_args(sys.argv[1:]) ) +if __name__ == '__main__': + run( get_parser().parse_args(sys.argv[1:]) ) diff --git a/process_mutations.py b/process_mutations.py index 4b39585..99e7b09 100755 --- a/process_mutations.py +++ b/process_mutations.py @@ -19,11 +19,12 @@ def get_parser(): parser.add_argument('-ivs', '--ignored_validation_statuses', type=str, required=False, nargs='*', default=['Wildtype', 'Invalid']) parser.add_argument('-o', '--output_file', type=str, required=True) - parser.add_argument('-v', '--verbose', type=int, default=1, required=False, choices=range(5)) + parser.add_argument('-v', '--verbose', type=int, default=1, required=False, choices=list(range(5))) return parser def process_maf( maf_file, patientWhitelist, geneToCases, patientToMutations, vc, vt, vs, ivc, ivt, ivs, verbose ): - if verbose > 1: print '\tLoading MAF:', maf_file + if verbose > 1: + print('\tLoading MAF: ', maf_file) genes, patients = set(), set() with open(maf_file, 'r') as IN: seenHeader = False @@ -44,7 +45,8 @@ def process_maf( maf_file, patientWhitelist, geneToCases, patientToMutations, vc # Record the patients and genes, even if we ignore their mutations patient, gene = '-'.join(arr[patient_index].split('-')[:3]), arr[gene_index] - if not patientWhitelist[patient]: continue + if not patientWhitelist[patient]: + continue patients.add(patient) genes.add(gene) @@ -83,7 +85,8 @@ def process_maf( maf_file, patientWhitelist, geneToCases, patientToMutations, vc return genes, patients def process_events_file( events_file, patientWhitelist, geneToCases, patientToMutations, verbose ): - if verbose > 1: print '\tProcessing events file:', events_file + if verbose > 1: + print('\tProcessing events file: ', events_file) # Parse the events file events, patients = set(), set() @@ -92,7 +95,8 @@ def process_events_file( events_file, patientWhitelist, geneToCases, patientToMu for arr in arrs: # Skip patients that aren't whitelisted patient, mutations = arr[0], set(arr[1:]) - if not patientWhitelist[patient]: continue + if not patientWhitelist[patient]: + continue # Record the events and mutations patients.add(patient) @@ -112,16 +116,19 @@ def run( args ): # Load the patient whitelist (if supplied) if args.patient_whitelist: - if args.verbose > 0: print '* Loading patient whitelist...' + if args.verbose > 0: + print('* Loading patient whitelist...') patientWhitelist = defaultdict( lambda : False ) with open(args.patient_whitelist, 'r') as IN: patientWhitelist.update( (l.rstrip('\n').split()[0], True) for l in IN if not l.startswith('#') ) else: - if args.verbose > 0: print '* No patient whitelist provided, including all patients...' + if args.verbose > 0: + print('* No patient whitelist provided, including all patients...') patientWhitelist = defaultdict( lambda : True ) # Load the mutations from each MAF - if args.verbose > 0: print '* Loading and combining {} datasets...'.format(len(args.cancer_types)) + if args.verbose > 0: + print('* Loading and combining {} datasets...'.format(len(args.cancer_types))) geneToCases, patientToMutations = defaultdict( set ), defaultdict( set ) genes, patients = set(), set() vc, vt, vs = set(), set(), set() # variant classes/types and validation statuses @@ -154,12 +161,12 @@ def run( args ): # Summarize the data if args.verbose > 0: - print '* Summary of mutation data...' - print '\tGenes: {}'.format(num_genes) - print '\tPatients: {} ({} hypermutators)'.format(num_patients, len(hypermutators)) - print '\tUsed variant classes:', ', '.join(sorted(vc)) - print '\tUsed variant types:', ', '.join(sorted(vt)) - print '\tUsed validation statuses:', ', '.join(sorted(vs)) + print("* Summary of mutation data...") + print("\tGenes: {}".format(num_genes)) + print("\tPatients: {} ({} hypermutators)".format(num_patients, len(hypermutators))) + print("\tUsed variant classes: " + ", ".join(sorted(vc))) + print("\tUsed variant types: " + ", ".join(sorted(vt))) + print("\tUsed validation statuses: " + ", ".join(sorted(vs))) # Output to file with open(args.output_file, 'w') as OUT: @@ -177,4 +184,5 @@ def run( args ): num_genes=num_genes, num_patients=num_patients) json.dump( output, OUT ) -if __name__ == '__main__': run( get_parser().parse_args( sys.argv[1:]) ) +if __name__ == '__main__': + run( get_parser().parse_args( sys.argv[1:]) ) diff --git a/requirements.txt b/requirements.txt index 0e48cd0..eca8f8a 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ -numpy -scipy -networkx +numpy >=1.11.0 +scipy >=0.17.0 +networkx >= 1.11 +future >= 0.16.0 diff --git a/viz/generate_viz_data.py b/viz/generate_viz_data.py index a675096..4a4608b 100755 --- a/viz/generate_viz_data.py +++ b/viz/generate_viz_data.py @@ -35,42 +35,42 @@ def run( args ): method_paren = '' if is_rce else ' ({})'.format(params['method']) run_name = '{}{}'.format(params['test'], method_paren) methods.add( run_name ) - setToPval[run_name].update( obj['setToPval'].items() ) - setToRuntime[run_name].update( obj['setToRuntime'].items() ) - setToFDR[run_name].update( obj['setToFDR'].items() ) - setToObs[run_name].update(obj['setToObs'].items() ) + setToPval[run_name].update( list(obj['setToPval'].items()) ) + setToRuntime[run_name].update( list(obj['setToRuntime'].items()) ) + setToFDR[run_name].update( list(obj['setToFDR'].items()) ) + setToObs[run_name].update(list(obj['setToObs'].items()) ) sets |= set(obj['setToPval'].keys()) # Load the mutation data mutation_data = load_mutation_data( args.mutation_file, min_frequency ) genes, _, patients, geneToCases, patientToMutations, params, hypermutators = mutation_data num_genes, num_patients = len(genes), len(patients) - geneToIndex = dict(zip(genes, range(num_genes))) + geneToIndex = dict(list(zip(genes, list(range(num_genes))))) patientToType = dict( (p, "Hypermutator" if p in hypermutators else "Non-hypermutator") for p in patients ) # Load the weights P = np.load(args.weights_file) - P = dict( (g, dict(zip(patients, P[geneToIndex[g]]))) for g in genes ) + P = dict( (g, dict(list(zip(patients, P[geneToIndex[g]])))) for g in genes ) # Restrict the sets (if necessary) if args.num_sets: new_sets = set() for run_name in methods: - new_sets |= set(sorted( setToPval[run_name].keys(), key=lambda M: setToPval[run_name][M] )[:args.num_sets]) + new_sets |= set(sorted( list(setToPval[run_name].keys()), key=lambda M: setToPval[run_name][M] )[:args.num_sets]) sets = new_sets - setToPval = dict( (run_name, dict( (M, pval) for M, pval in setToPval[run_name].iteritems() if M in new_sets)) for run_name in methods ) - setToRuntime = dict( (run_name, dict( (M, pval) for M, pval in setToRuntime[run_name].iteritems() if M in new_sets)) for run_name in methods ) - setToObs = dict( (run_name, dict( (M, pval) for M, pval in setToObs[run_name].iteritems() if M in new_sets)) for run_name in methods ) - setToFDR = dict( (run_name, dict( (M, pval) for M, pval in setToFDR[run_name].iteritems() if M in new_sets)) for run_name in methods ) + setToPval = dict( (run_name, dict( (M, pval) for M, pval in setToPval[run_name].items() if M in new_sets)) for run_name in methods ) + setToRuntime = dict( (run_name, dict( (M, pval) for M, pval in setToRuntime[run_name].items() if M in new_sets)) for run_name in methods ) + setToObs = dict( (run_name, dict( (M, pval) for M, pval in setToObs[run_name].items() if M in new_sets)) for run_name in methods ) + setToFDR = dict( (run_name, dict( (M, pval) for M, pval in setToFDR[run_name].items() if M in new_sets)) for run_name in methods ) # Restrict the weights genes_in_sets = set( g for M in sets for g in M.split('\t') ) P = dict( (g, P[g]) for g in genes_in_sets ) - geneToCases = dict( (g, cases) for g, cases in geneToCases.iteritems() if g in genes_in_sets ) + geneToCases = dict( (g, cases) for g, cases in geneToCases.items() if g in genes_in_sets ) - print '* Considering {} sets...'.format(len(new_sets)) + print('* Considering {} sets...'.format(len(new_sets))) # Output the JSON file with open(args.output_file, 'w') as OUT: @@ -81,11 +81,12 @@ def run( args ): params['weights_file'] = os.path.abspath(args.weights_file) # Output - output = dict(params=params, geneToCases=dict( (g, list(cases)) for g, cases in geneToCases.iteritems() ), + output = dict(params=params, geneToCases=dict( (g, list(cases)) for g, cases in geneToCases.items()), setToPval=setToPval, methods=sorted(methods), patientToType=patientToType, setToFDR=setToFDR, setToRuntime=setToRuntime, setToObs=setToObs, sets=list(sets), genes=list(genes), patients=patients, P=P) json.dump( output, OUT ) -if __name__ == '__main__': run( get_parser().parse_args(sys.argv[1:]) ) +if __name__ == '__main__': + run( get_parser().parse_args(sys.argv[1:]) ) diff --git a/viz/server.py b/viz/server.py index 568c36d..38a21ab 100644 --- a/viz/server.py +++ b/viz/server.py @@ -47,7 +47,8 @@ def run( args ): # Start server app = tornado.web.Application(routes) app.listen(args.port) - print 'Listening on port {}'.format(args.port) + print('Listening on port {}'.format(args.port)) tornado.ioloop.IOLoop.current().start() -if __name__ == '__main__': run( get_parser().parse_args(sys.argv[1:]) ) +if __name__ == '__main__': + run( get_parser().parse_args(sys.argv[1:]) ) diff --git a/wext/__init__.py b/wext/__init__.py index 08c6b5e..de8e09d 100755 --- a/wext/__init__.py +++ b/wext/__init__.py @@ -1,14 +1,15 @@ #!/usr/bin/env python -# Import modules. -from constants import * -from statistics import * -from i_o import * -from enumerate_sets import * -from mcmc import mcmc -from exact import exact_test +# Import modules +from .constants import * +from .statistics import * +from .i_o import * +from .enumerate_sets import * +from .mcmc import mcmc +from .exact import exact_test import cpoibin -from saddlepoint import saddlepoint -from comet_exact_test import comet_exact_test -from exclusivity_tests import re_test, wre_test -from bipartite_edge_swap_module import bipartite_edge_swap +import wext_exact_test +from .saddlepoint import saddlepoint +import comet_exact_tests +from .exclusivity_tests import re_test, wre_test +from bipartite_edge_swap_module import bipartite_edge_swap \ No newline at end of file diff --git a/wext/enumerate_sets.py b/wext/enumerate_sets.py index 780deb6..5d8cce0 100755 --- a/wext/enumerate_sets.py +++ b/wext/enumerate_sets.py @@ -7,9 +7,9 @@ from math import ceil, isnan # Load local modules -from exclusivity_tests import wre_test, re_test, general_wre_test -from constants import * -from statistics import multiple_hypothesis_correction +from .exclusivity_tests import wre_test, re_test, general_wre_test +from .constants import * +from .statistics import multiple_hypothesis_correction ################################################################################ # Permutational test @@ -17,7 +17,7 @@ # Compute the mutual exclusivity T for the given gene set def T(M, geneToCases): sampleToCount = Counter( s for g in M for s in geneToCases.get(g, []) ) - return sum( 1 for sample, count in sampleToCount.iteritems() if count == 1 ) + return sum( 1 for sample, count in sampleToCount.items() if count == 1 ) # Compute the permutational def permutational_dist_wrapper( args ): return permutational_dist( *args ) @@ -29,7 +29,7 @@ def permutational_dist( sets, permuted_files ): permutedGeneToCases = defaultdict(set) for pf in pf_group: with open(pf, 'r') as IN: - for g, cases in json.load(IN)['geneToCases'].iteritems(): + for g, cases in json.load(IN)['geneToCases'].items(): permutedGeneToCases[g] |= set(cases) reading_time = time() - reading_start @@ -55,7 +55,7 @@ def rce_permutation_test(sets, geneToCases, num_patients, permuted_files, num_co # Filter the sets based on the observed values k = len(next(iter(sets))) setToObs = dict( (M, observed_values(M, num_patients, geneToCases)) for M in sets ) - sets = set( M for M, (X, T, Z, tbl) in setToObs.iteritems() if testable_set(k, T, Z, tbl) ) + sets = set( M for M, (X, T, Z, tbl) in setToObs.items() if testable_set(k, T, Z, tbl) ) # Compute the distribution of exclusivity for each pair across the permuted files np = float(len(permuted_files)) @@ -69,14 +69,14 @@ def rce_permutation_test(sets, geneToCases, num_patients, permuted_files, num_co # Merge the different distributions setToDist, setToTime = defaultdict(list), dict() for dist, times in empirical_distributions: - setToTime.update(times.items()) - for k, v in dist.iteritems(): + setToTime.update(list(times.items())) + for k, v in dist.tems(): setToDist[k].extend(v) # Compute the observed values and then the P-values setToObs = dict( (M, setToObs[M]) for M in sets ) setToPval = dict() - for M, (X, T, Z, tbl) in setToObs.iteritems(): + for M, (X, T, Z, tbl) in setToObs.items(): # Compute the P-value. count = sum( 1. for d in setToDist[M] if d >= T ) setToPval[M] = count / np @@ -84,7 +84,7 @@ def rce_permutation_test(sets, geneToCases, num_patients, permuted_files, num_co # Compute FDRs tested_sets = setToPval.keys() pvals = [ setToPval[M] for M in tested_sets ] - setToFDR = dict(zip(tested_sets, multiple_hypothesis_correction(pvals, method="BY"))) + setToFDR = dict(list(zip(tested_sets, multiple_hypothesis_correction(pvals, method="BY")))) return setToPval, setToTime, setToFDR, setToObs @@ -151,8 +151,6 @@ def test_set_group( sets, geneToCases, num_patients, method, test, P=None, verbo setToTime[M] = time() - start - if verbose > 1: print - return setToPval, setToTime, setToObs def test_sets( sets, geneToCases, num_patients, method, test, P=None, num_cores=1, verbose=0, @@ -177,13 +175,13 @@ def test_sets( sets, geneToCases, num_patients, method, test, P=None, num_cores= # Combine the dictionaries setToPval, setToTime, setToObs = dict(), dict(), dict() for pval, time, obs in results: - setToPval.update(pval.items()) - setToTime.update(time.items()) - setToObs.update(obs.items()) + setToPval.update(list(pval.items())) + setToTime.update(list(time.items())) + setToObs.update(list(obs.items())) # Make sure all P-values are numbers tested_sets = len(setToPval) - invalid_sets = set( M for M, pval in setToPval.iteritems() if isnan(pval) or -PTOL > pval or pval > 1+PTOL ) + invalid_sets = set( M for M, pval in setToPval.items() if isnan(pval) or -PTOL > pval or pval > 1+PTOL ) # Report invalid sets if verbose > 0 and report_invalids: @@ -194,19 +192,19 @@ def test_sets( sets, geneToCases, num_patients, method, test, P=None, num_cores= invalid_rows.append([ ','.join(sorted(M)), T, Z, tbl, setToPval[M] ]) sys.stderr.write( '\t' + '\n\t '.join([ '\t'.join(map(str, row)) for row in invalid_rows ]) + '\n' ) - setToPval = dict( (M, pval) for M, pval in setToPval.iteritems() if not M in invalid_sets ) - setToTime = dict( (M, runtime) for M, runtime in setToTime.iteritems() if not M in invalid_sets ) - setToObs = dict( (M, obs) for M, obs in setToObs.iteritems() if not M in invalid_sets ) + setToPval = dict( (M, pval) for M, pval in setToPval.items() if not M in invalid_sets ) + setToTime = dict( (M, runtime) for M, runtime in setToTime.items() if not M in invalid_sets ) + setToObs = dict( (M, obs) for M, obs in setToObs.items() if not M in invalid_sets ) if verbose > 0: - print '- Output {} sets'.format(len(setToPval)) - print '\tRemoved {} sets with NaN or invalid P-values'.format(len(invalid_sets)) - print '\tIgnored {} sets with Z >= T or a gene with no exclusive mutations'.format(len(sets)-tested_sets) + print('- Output {} sets'.format(len(setToPval))) + print('\tRemoved {} sets with NaN or invalid P-values'.format(len(invalid_sets))) + print('\tIgnored {} sets with Z >= T or a gene with no exclusive mutations'.format(len(sets)-tested_sets)) # Compute the FDRs - tested_sets = setToPval.keys() + tested_sets = list(setToPval.keys()) pvals = [ setToPval[M] for M in tested_sets ] - setToFDR = dict(zip(tested_sets, multiple_hypothesis_correction(pvals, method="BY"))) + setToFDR = dict(list(zip(tested_sets, multiple_hypothesis_correction(pvals, method="BY")))) return setToPval, setToTime, setToFDR, setToObs @@ -232,8 +230,6 @@ def general_test_set_group( sets, geneToCases, num_patients, method, test, stati setToPval[M] = general_wre_test( sorted_M, geneToCases, [ P[g] for g in sorted_M ], statistic ) setToTime[M] = time() - start - if verbose > 1: print - return setToPval, setToTime, setToObs def general_test_sets( sets, geneToCases, num_patients, method, test, statistic, P=None, num_cores=1, verbose=0, @@ -258,13 +254,13 @@ def general_test_sets( sets, geneToCases, num_patients, method, test, statistic, # Combine the dictionaries setToPval, setToTime, setToObs = dict(), dict(), dict() for pval, time, obs in results: - setToPval.update(pval.items()) - setToTime.update(time.items()) - setToObs.update(obs.items()) + setToPval.update(list(pval.items())) + setToTime.update(list(time.items())) + setToObs.update(list(obs.items())) # Make sure all P-values are numbers tested_sets = len(setToPval) - invalid_sets = set( M for M, pval in setToPval.iteritems() if isnan(pval) or -PTOL > pval or pval > 1+PTOL ) + invalid_sets = set( M for M, pval in setToPval.items() if isnan(pval) or -PTOL > pval or pval > 1+PTOL ) # Report invalid sets if verbose > 0 and report_invalids: @@ -275,19 +271,19 @@ def general_test_sets( sets, geneToCases, num_patients, method, test, statistic, invalid_rows.append([ ','.join(sorted(M)), T, Z, tbl, setToPval[M] ]) sys.stderr.write( '\t' + '\n\t '.join([ '\t'.join(map(str, row)) for row in invalid_rows ]) + '\n' ) - setToPval = dict( (M, pval) for M, pval in setToPval.iteritems() if not M in invalid_sets ) - setToTime = dict( (M, runtime) for M, runtime in setToTime.iteritems() if not M in invalid_sets ) - setToObs = dict( (M, obs) for M, obs in setToObs.iteritems() if not M in invalid_sets ) + setToPval = dict( (M, pval) for M, pval in setToPval.items() if not M in invalid_sets ) + setToTime = dict( (M, runtime) for M, runtime in setToTime.items() if not M in invalid_sets ) + setToObs = dict( (M, obs) for M, obs in setToObs.items() if not M in invalid_sets ) if verbose > 0: - print '- Output {} sets'.format(len(setToPval)) - print '\tRemoved {} sets with NaN or invalid P-values'.format(len(invalid_sets)) - print '\tIgnored {} sets with Z >= T or a gene with no exclusive mutations'.format(len(sets)-tested_sets) + print('- Output {} sets'.format(len(setToPval))) + print('\tRemoved {} sets with NaN or invalid P-values'.format(len(invalid_sets))) + print('\tIgnored {} sets with Z >= T or a gene with no exclusive mutations'.format(len(sets)-tested_sets)) # Compute the FDRs - tested_sets = setToPval.keys() + tested_sets = list(setToPval.keys()) pvals = [ min(max(0.0, setToPval[M]), 1.0) for M in tested_sets ] - setToFDR = dict(zip(tested_sets, multiple_hypothesis_correction(pvals, method="BY"))) + setToFDR = dict(list(zip(tested_sets, multiple_hypothesis_correction(pvals, method="BY")))) return setToPval, setToTime, setToFDR, setToObs diff --git a/wext/exact.py b/wext/exact.py index 052a578..345450f 100644 --- a/wext/exact.py +++ b/wext/exact.py @@ -1,10 +1,10 @@ #!/usr/bin/env python import numpy as np -import wext_exact_test -from constants import * +import wext_exact_test +from .constants import * -def exact_test( t, x, p, verbose=False ): +def exact_test(t, x, p, verbose=False): k = len(x) if k == 2: return exact_test_k2( t, x, p, verbose ) @@ -19,11 +19,13 @@ def exact_test_k3(t, x, p, verbose): return wext_exact_test.triple_exact_test( N, t, x[0], x[1], x[2], p ) # Wrapper for k=2 exact test C function -def exact_test_k2(t, (x, y), (p_x, p_y), verbose): +def exact_test_k2(t, xy, pxpy, verbose): # Two-sided test + (x, y) = xy + (p_x, p_y) = pxpy N = len(p_x) z = (x + y - t)/2 # count number of co-occurrences - tail_masses = wext_exact_test.conditional(N, range(z+1), x, y, p_x, p_y) + tail_masses = wext_exact_test.conditional(N, list(range(z+1)), x, y, p_x, p_y) obs_mass = tail_masses[-1] pval = sum(tail_masses) return pval diff --git a/wext/exclusivity_tests.py b/wext/exclusivity_tests.py index 985ce2e..936e77a 100755 --- a/wext/exclusivity_tests.py +++ b/wext/exclusivity_tests.py @@ -2,11 +2,11 @@ # Load required modules import numpy as np -from constants import * -from exact import exact_test +from .constants import * +from .exact import exact_test import cpoibin -from saddlepoint import saddlepoint, check_condition -from comet_exact_test import comet_exact_test +from .saddlepoint import saddlepoint, check_condition +from comet_exact_tests import comet_exact_test import warnings # Perform the weighted-row exclusivity test (WR-test) using the given method. diff --git a/wext/i_o.py b/wext/i_o.py index 679ca30..e533718 100755 --- a/wext/i_o.py +++ b/wext/i_o.py @@ -3,7 +3,7 @@ # Load required modules import sys, os, json, numpy as np from collections import defaultdict -from constants import * +from .constants import * # Load mutation data from one of our processed JSON file def load_mutation_data( mutation_file, min_freq=1 ): @@ -11,13 +11,13 @@ def load_mutation_data( mutation_file, min_freq=1 ): obj = json.load(IN) all_genes = obj['genes'] patients = obj['patients'] - geneToCases = dict( (g, set(cases)) for g, cases in obj['geneToCases'].iteritems() ) - patientToMutations = dict( (p, set(muts)) for p, muts in obj['patientToMutations'].iteritems() ) + geneToCases = dict( (g, set(cases)) for g, cases in obj['geneToCases'].items() ) + patientToMutations = dict( (p, set(muts)) for p, muts in obj['patientToMutations'].items() ) hypermutators = set(obj['hypermutators']) params = obj['params'] # Restrict the genes based on the minimum frequency - genes = set( g for g, cases in geneToCases.iteritems() if len(cases) >= min_freq ) + genes = set( g for g, cases in geneToCases.items() if len(cases) >= min_freq ) return genes, all_genes, patients, geneToCases, patientToMutations, params, hypermutators @@ -34,11 +34,11 @@ def load_patient_annotation_file(patient_annotation_file): # Converts keys from an iterable to tab-separated, so the dictionary can be # output as JSON def convert_dict_for_json( setToVal, sep='\t' ): - return dict( (sep.join(sorted(M)), val) for M, val in setToVal.iteritems() ) + return dict( (sep.join(sorted(M)), val) for M, val in setToVal.items() ) # Converts tab-separated keys back to frozensets def convert_dict_from_json( setToVal, sep='\t', iterable=frozenset ): - return dict( (iterable(M.split(sep)), val) for M, val in setToVal.iteritems() ) + return dict( (iterable(M.split(sep)), val) for M, val in setToVal.items() ) # Create the header strings for a contingency table def create_tbl_header( k ): @@ -54,7 +54,7 @@ def output_enumeration_table(args, k, setToPval, setToRuntime, setToFDR, setToOb if not args.json_format: # Construct the rows rows = [] - for M, pval in setToPval.iteritems(): + for M, pval in setToPval.items(): if setToFDR[M]<=fdr_threshold: X, T, Z, tbl = setToObs[M] row = [ ', '.join(sorted(M)), pval, setToFDR[M], setToRuntime[M], T, Z ] + tbl @@ -90,14 +90,14 @@ def output_mcmc(args, setsToFreq, setToPval, setToObs): params = vars(args) output = dict(params=params, setToPval=convert_dict_for_json(setToPval), setToObs=convert_dict_for_json(setToObs), - setsToFreq=dict( (' '.join([ ','.join(sorted(M)) for M in sets ]), freq) for sets, freq in setsToFreq.iteritems() )) + setsToFreq=dict( (' '.join([ ','.join(sorted(M)) for M in sets ]), freq) for sets, freq in setsToFreq.items()) ) with open(args.output_prefix + '.json', 'w') as OUT: json.dump( output, OUT ) else: # Output a gene set file with open(args.output_prefix + '-sampled-collections.tsv', 'w') as OUT: rows = [] - for sets, freq in setsToFreq.iteritems(): + for sets, freq in setsToFreq.items(): row = [ ' '.join([ ','.join(M) for M in sets ]), freq ] row.append( sum( -np.log10(setToPval[M] ** args.alpha) for M in sets )) rows.append(row) @@ -109,7 +109,7 @@ def output_mcmc(args, setsToFreq, setToPval, setToObs): # Output each of the sample gene sets with open(args.output_prefix + '-sampled-sets.tsv', 'w') as OUT: rows = [] - for M, pval in setToPval.iteritems(): + for M, pval in setToPval.items(): X, T, Z, tbl = setToObs[M] rows.append([ ','.join(sorted(M)), pval, T, Z] + tbl ) rows.sort(key=lambda r: r[1]) diff --git a/wext/mcmc.py b/wext/mcmc.py index 06ca876..1269378 100755 --- a/wext/mcmc.py +++ b/wext/mcmc.py @@ -1,16 +1,18 @@ -#!/usr/bin/python +#!/usr/bin/env python import sys, os, numpy as np from collections import defaultdict from time import time from random import random, sample, choice, seed as random_seed +from past.builtins import xrange -from constants import * -from enumerate_sets import observed_values -from exclusivity_tests import re_test, wre_test +from .constants import * +from .enumerate_sets import observed_values +from .exclusivity_tests import re_test, wre_test def mcmc(ks, geneToCases, num_patients, method, test, geneToP, seed, annotations=set(), verbose=0, step_len=100, nchains=1, niters=1000, alpha=1): - if verbose > 0: print '-' * 33, 'Running MCMC', '-' * 33 + if verbose > 0: + print('-' * 33, 'Running MCMC', '-' * 33) # Set up a local version of the weight function if test == WRE: @@ -45,19 +47,21 @@ def _collection_weight(collection): return sum( _weight(M) for M in collection ) def _to_collection(solution): - return frozenset( frozenset(M) for M in solution.values() ) + return frozenset( frozenset(M) for M in solution.values() ) # Compute the acceptance ratio - def _log_accept_ratio( W_current, W_next ): return W_next - W_current + def _log_accept_ratio( W_current, W_next ): + return W_next - W_current # Set up PRNG, sample space, and output random_seed(seed) t = len(ks) - genespace = geneToCases.keys() + genespace = list(geneToCases.keys()) setsToFreq = [ defaultdict(int) for _ in xrange(nchains) ] setToPval, setToObs = dict(), dict() for c in xrange(nchains): - if verbose > 0: print '- Experiment', c+1 + if verbose > 0: + print('- Experiment', c+1) # Seed Markov chain soln, assigned = choose_random_set(ks, genespace) @@ -75,8 +79,8 @@ def _log_accept_ratio( W_current, W_next ): return W_next - W_current sys.stdout.flush() # Sample the next gene to swap in/around the set - next_soln = dict( (index, set(M)) for index, M in soln.iteritems() ) - next_assigned = dict(assigned.items()) + next_soln = dict( (index, set(M)) for index, M in soln.items() ) + next_assigned = dict(list(assigned.items())) next_gene = choice(genespace) # There are two possibilities for the next gene @@ -101,14 +105,15 @@ def _log_accept_ratio( W_current, W_next ): return W_next - W_current # 2) The gene is not in the current solution. In this case, we choose # a random gene in the solution to remove, and add the next gene. else: - swap_gene = choice(next_assigned.keys()) + swap_gene = choice(list(next_assigned.keys())) j = next_assigned[swap_gene] del next_assigned[swap_gene] next_assigned[next_gene] = j next_soln[j].remove(swap_gene) next_soln[j].add(next_gene) - if not _valid_set(next_soln[j]): continue + if not _valid_set(next_soln[j]): + continue # Compare the current soln to the next soln next_weight = _collection_weight(_to_collection(next_soln)) @@ -121,12 +126,12 @@ def _log_accept_ratio( W_current, W_next ): return W_next - W_current setsToFreq[c][_to_collection(soln)] += 1 if verbose > 0: - print '\r[' + ('='*71) + '>] 100%' + print('\r[' + ('='*71) + '>] 100%') # Merge the various chains setsToTotalFreq = defaultdict(int) for counter in setsToFreq: - for sets, freq in counter.iteritems(): + for sets, freq in counter.items(): setsToTotalFreq[sets] += freq return setsToTotalFreq, setToPval, setToObs diff --git a/wext/saddlepoint.py b/wext/saddlepoint.py index 02334c2..49df27b 100644 --- a/wext/saddlepoint.py +++ b/wext/saddlepoint.py @@ -1,9 +1,11 @@ +#!/usr/bin/env python + import numpy as np from numpy.linalg import det from scipy.optimize import fsolve from scipy.stats import norm import itertools -from constants import * +from .constants import * def check_condition(state, condition): @@ -82,7 +84,7 @@ def saddlepoint(observed_t, observed_y, probabilities, condition='exclusivity'): w = np.zeros((2**k, n)) for i, state in enumerate(states): - w[i, :] = np.product(p[state, range(k), :], axis=0) + w[i, :] = np.product(p[state, list(range(k)), :], axis=0) # Define the moment generating functions and cumulant generating functions. These functions # use the above constants. diff --git a/wext/setup.cfg b/wext/setup.cfg deleted file mode 100755 index 8f69613..0000000 --- a/wext/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[build_ext] -inplace=1 diff --git a/wext/setup.py b/wext/setup.py index 072ae0f..38de6ac 100755 --- a/wext/setup.py +++ b/wext/setup.py @@ -3,7 +3,7 @@ """Compiles the C modules used by the weighted exclusivity test.""" # Load required modules -from distutils.core import setup, Extension +from numpy.distutils.core import setup, Extension import numpy, os thisDir = os.path.dirname(os.path.realpath(__file__)) @@ -13,7 +13,7 @@ module = Extension('cpoibin', include_dirs=[numpy.get_include()], sources = [ thisDir + s for s in srcs ], extra_compile_args = ['-g', '-O0']) -setup(name='poibin', version='0.0.1', ext_modules=[module], +setup(name='cpoibin', version='0.0.1', ext_modules=[module], description='Module for analyzing the Poisson-Binomial distribution.') # Compile the weighted enrichment module @@ -26,8 +26,15 @@ # Compile the CoMEt exact test module srcs = ['/src/c/comet_exact_test.c'] -module = Extension('comet_exact_test', include_dirs=[numpy.get_include()], +module = Extension('comet_exact_tests', include_dirs=[numpy.get_include()], sources = [ thisDir + s for s in srcs ], extra_compile_args = ['-g', '-O0']) -setup(name='comet_exact_test', version='0.0.1', ext_modules=[module], +setup(name='comet_exact_tests', version='0.0.1', ext_modules=[module], description='CoMEt exact test implementation.') + +## Compile the FORTRAN extension, bipartite_edge_swap_module +srcs = ['/src/fortran/bipartite_edge_swap_module.f95'] +module = Extension('bipartite_edge_swap_module', include_dirs=[numpy.get_include()], + sources = [ thisDir + s for s in srcs ]) +setup(name='bipartite_edge_swap_module', version='0.0.1', ext_modules=[module], + description='FORTRAN code description') diff --git a/wext/src/c/comet_exact_test.c b/wext/src/c/comet_exact_test.c index 903d991..b6097d6 100755 --- a/wext/src/c/comet_exact_test.c +++ b/wext/src/c/comet_exact_test.c @@ -281,12 +281,15 @@ struct Pvalues comet_exact_test(int k, int N, int *ctbl, double pvalthresh){ } + + + //////////////////////////////////////////////////////////////////////////////// -// Python registration +// Python wrapper functions //////////////////////////////////////////////////////////////////////////////// // The CoMEt exact test, callable from Python -PyObject *py_comet_exact_test(PyObject *self, PyObject *args){ +static PyObject *py_comet_exact_test(PyObject *self, PyObject *args){ // Parameters int k, N; // k: gene set size; N: number of samples PyObject *py_tbl; // FLAT Python contingency table @@ -307,7 +310,7 @@ PyObject *py_comet_exact_test(PyObject *self, PyObject *args){ tbl[i] = (int) PyLong_AsLong (PyList_GetItem(py_tbl, i)); // Compute the P-values - pval = comet_exact_test(k, N, tbl, pvalthresh); + pval = comet_exact_test(k, N, tbl, pvalthresh); // Free memory free(tbl); @@ -316,17 +319,48 @@ PyObject *py_comet_exact_test(PyObject *self, PyObject *args){ } +// methods definition: cometExactTest +// name of module: comet_exact_test ... which is also the name of the function in Python + +// try renaming this to 'comet_exact_tests' + + // Register the functions we want to be accessible from Python -PyMethodDef cometExactTest[] = { - {"comet_exact_test", py_comet_exact_test, METH_VARARGS, "CoMEt exact test"} +static PyMethodDef cometExactTest[] = { + {"comet_exact_test", py_comet_exact_test, METH_VARARGS, "CoMEt exact test"}, + {NULL, NULL, 0, NULL} +}; + + + +#if PY_MAJOR_VERSION >= 3 + +// define structure for module +static struct PyModuleDef comet_exact_tests = { + PyModuleDef_HEAD_INIT, // required + "comet_exact_tests", // name of module + "documentation detailed here", // documentation + -1, + cometExactTest // method definitions }; -// Note that the suffix of init has to match the name of the module, -// both here and in the setup.py file -PyMODINIT_FUNC initcomet_exact_test(void) { - PyObject *m = Py_InitModule("comet_exact_test", cometExactTest); - if (m == NULL) { - return; +// finally, write the initalizer function + +PyMODINIT_FUNC PyInit_comet_exact_tests(void) +{ + return PyModule_Create(&comet_exact_tests); +} + +#else + +PyMODINIT_FUNC initcomet_exact_tests(void) { + PyObject *m = Py_InitModule("comet_exact_tests", cometExactTest); + if (m == NULL) { + return; } } + +#endif + + diff --git a/wext/src/c/poibinmodule.c b/wext/src/c/poibinmodule.c index 8d5b3a3..6c3e71a 100755 --- a/wext/src/c/poibinmodule.c +++ b/wext/src/c/poibinmodule.c @@ -45,7 +45,7 @@ double pmf(int k, int N, double *ps){ return mass; } -PyObject *py_pmf(PyObject *self, PyObject *args){ +static PyObject *py_pmf(PyObject *self, PyObject *args){ // Parameters int i, k, N; double result, *ps; @@ -71,16 +71,43 @@ PyObject *py_pmf(PyObject *self, PyObject *args){ return Py_BuildValue("d", result); } + +// methods definition: poibinMethods +// name of module: cpoibin + // Register the functions we want to be accessible from Python -PyMethodDef poibinMethods[] = { - {"pmf", py_pmf, METH_VARARGS, "Poisson-Binomial PMF"} +static PyMethodDef poibinMethods[] = { + {"pmf", py_pmf, METH_VARARGS, "Poisson-Binomial PMF"}, + {NULL, NULL, 0, NULL} +}; + + +// define the module structure + +#if PY_MAJOR_VERSION >= 3 + +static struct PyModuleDef cpoibin = { + PyModuleDef_HEAD_INIT, // required + "cpoibin", // name of module + "ocumentation detailed here", // documentation + -1, + poibinMethods // method definitions }; -// Note that the suffix of init has to match the name of the module, -// both here and in the setup.py file +// finally, write the initalizer function + +PyMODINIT_FUNC PyInit_cpoibin(void) +{ + return PyModule_Create(&cpoibin); +} + +#else + PyMODINIT_FUNC initcpoibin(void) { PyObject *m = Py_InitModule("cpoibin", poibinMethods); if (m == NULL) { return; } } + +#endif diff --git a/wext/src/c/poibinmodule.h b/wext/src/c/poibinmodule.h index 2d851e4..59b3511 100755 --- a/wext/src/c/poibinmodule.h +++ b/wext/src/c/poibinmodule.h @@ -8,4 +8,4 @@ // Function declarations double pmf_recursion(int k, int j, double *ps, double **cache); double pmf(int k, int N, double *ps); -PyObject *py_pmf(PyObject *self, PyObject *args); +static PyObject *py_pmf(PyObject *self, PyObject *args); diff --git a/wext/src/c/wext_exact_test.c b/wext/src/c/wext_exact_test.c index ce49264..3ab4209 100755 --- a/wext/src/c/wext_exact_test.c +++ b/wext/src/c/wext_exact_test.c @@ -39,7 +39,9 @@ double joint_mass(int n, int z, int x, int y, double *p_x, double *p_y, double * return cache[n][z][x][y]; } -PyObject *py_conditional(PyObject *self, PyObject *args){ +// python wrapper + +static PyObject *py_conditional(PyObject *self, PyObject *args){ // Parameters int i, j, i2, j2, N, x, y, *zs, num_zs; double *p_x, *p_y, joint_marginal, mass, ****cache; @@ -136,7 +138,7 @@ double P(int n, int t, int w, int x, int y, double **p, double *****cache){ return cache[n][t][w][x][y]; } -PyObject *triple_exact_test(PyObject *self, PyObject *args){ +static PyObject *triple_exact_test(PyObject *self, PyObject *args){ // Parameters int i, j, i2, j2, i3, N, w, x, y, t, T; double **p, marginals, joint, result, *****cache; @@ -202,18 +204,48 @@ PyObject *triple_exact_test(PyObject *self, PyObject *args){ return Py_BuildValue("f", result); } + +// methods definition: weightedEnrichmentMethods +// name of module: wext_exact_test + //////////////////////////////////////////////////////////////////////////////// // Register the functions we want to be accessible from Python -PyMethodDef weightedEnrichmentMethods[] = { +static PyMethodDef weightedEnrichmentMethods[] = { {"conditional", py_conditional, METH_VARARGS, "Weighted enrichment test conditional PMF for pairs"}, - {"triple_exact_test", triple_exact_test, METH_VARARGS, "Weighted enrichment test for triples"} + {"triple_exact_test", triple_exact_test, METH_VARARGS, "Weighted enrichment test for triples"}, + {NULL, NULL, 0, NULL} +}; + + + +#if PY_MAJOR_VERSION >= 3 + +// define module structure + +static struct PyModuleDef wext_exact_test = { + PyModuleDef_HEAD_INIT, // required + "wext_exact_test", // name of module + "documentation detailed here", // documentation + -1, + weightedEnrichmentMethods // method definitions }; -// Note that the suffix of init has to match the name of the module, -// both here and in the setup.py file + +// finally, write the initalizer function + +PyMODINIT_FUNC PyInit_wext_exact_test(void) +{ + return PyModule_Create(&wext_exact_test); +} + +#else + PyMODINIT_FUNC initwext_exact_test(void) { PyObject *m = Py_InitModule("wext_exact_test", weightedEnrichmentMethods); if (m == NULL) { return; } } + + +#endif \ No newline at end of file