diff --git a/EMT_data_analysis/analysis_scripts/Analysis_tools.py b/EMT_data_analysis/analysis_scripts/Analysis_tools.py index ffbd77e..f433f72 100644 --- a/EMT_data_analysis/analysis_scripts/Analysis_tools.py +++ b/EMT_data_analysis/analysis_scripts/Analysis_tools.py @@ -20,6 +20,37 @@ warnings.filterwarnings("ignore") +rng = np.random.default_rng(42) + +def bootstrap_corr(x, y, method="Pearson", confidence_interval=0.95, n_bootstraps=2000, seed=42, verbose=True): + x = np.asarray(x) + y = np.asarray(y) + + corr_func = pearsonr + if method == "Spearman": + corr_func = spearmanr + + r_observed, p_value = corr_func(x, y) + n_samples = len(x) + + inds = np.arange(n_samples) + boot_corrs = [] + for _ in range(n_bootstraps): + resampled_inds = rng.choice(inds, size=n_samples, replace=True) + rx = x[resampled_inds] + ry = y[resampled_inds] + boot_r, _ = corr_func(rx, ry) + boot_corrs.append(boot_r) + alpha = (1.0-confidence_interval)/2.0 + ci_low = np.nanpercentile(boot_corrs, 100*alpha) + ci_high = np.nanpercentile(boot_corrs, 100*(1.0-alpha)) + + if verbose: + print(f'Number of samples: {n_samples}') + print(f'{method} Correlation: {r_observed:.3g} | 95% CI: [{ci_low:.2f}, {ci_high:.2f}] | p-value: {p_value:.3g}') + + return r_observed, p_value, ci_low, ci_high + def run_all_analyses(): """ Run all analysis functions @@ -29,6 +60,8 @@ def run_all_analyses(): df = io.load_image_analysis_extracted_features() + print("Loaded image analysis extracted features and started running analyses...") + plot_area_at_glass_all_data(df, FIGS_DIR, OUT_TYPE) plot_area_at_glass_h2b(df, FIGS_DIR, OUT_TYPE) plot_migration_timing_all_data(df, FIGS_DIR, OUT_TYPE) @@ -611,10 +644,8 @@ def plot_gene_expression_experiments(df, figs_dir, out_type): migration = df_g['Migration Onset Time (Footprint Area Based)'][[cond in val for val in df_g['Experimental Condition'].values]].dropna() metric = df_g['gene_metric'][[cond in val for val in df_g['Experimental Condition'].values]].dropna() - pearson, p_pvalue = pearsonr(migration, metric) - spearman, s_pvalue = spearmanr(migration, metric) - print(f'Pearson Correlation: {pearson:.3g} | p-value: {p_pvalue:.3g}') - print(f'Spearman Correlation: {spearman:.3g} | p-value: {s_pvalue:.3g}') + pearson, p_pvalue, _, _ = bootstrap_corr(migration, metric, "Pearson") + spearman, s_pvalue, _, _ = bootstrap_corr(migration, metric, "Spearman") X = df_g['Migration Onset Time (Footprint Area Based)'][[cond in val for val in df_g['Experimental Condition'].values]].dropna() Y = df_g['gene_metric'][[cond in val for val in df_g['Experimental Condition'].values]].dropna() @@ -638,11 +669,9 @@ def plot_gene_expression_experiments(df, figs_dir, out_type): migration = df_g['Migration Onset Time (Footprint Area Based)'] metric = df_g['gene_metric'] - pearson, p_pvalue = pearsonr(migration, metric) - spearman, s_pvalue = spearmanr(migration, metric) - print(f'Pearson Correlation: {pearson:.3g} | p-value: {p_pvalue:.3g}') - print(f'Spearman Correlation: {spearman:.3g} | p-value: {s_pvalue:.3g}') - + pearson, p_pvalue, _, _ = bootstrap_corr(migration, metric, "Pearson") + spearman, s_pvalue, _, _ = bootstrap_corr(migration, metric, "Spearman") + X = df_g['Migration Onset Time (Footprint Area Based)'] Y = df_g['gene_metric'] @@ -719,6 +748,8 @@ def plot_collagenase_analysis(df, figs_dir, out_type): X = df_gene['Collagenease concentration (ug/mL)'] Y = df_gene['Migration Onset Time (Footprint Area Based)'] + _ = bootstrap_corr(X, Y, "Spearman") + # It's important to add a constant (intercept) to the model X = sm.add_constant(X) @@ -747,7 +778,6 @@ def plot_collagenase_analysis(df, figs_dir, out_type): else: print("\nConclusion: The p-value for the slope is not less than 0.05, so we cannot conclude there is a significant linear relationship.") - def plot_mmp_inhibitor_migration(df, figs_dir, out_type): """ Parameters @@ -1015,8 +1045,8 @@ def plot_inside_outside_migration_timing(df, figs_dir, out_type): X = dfio_scatter['Migration Onset Time (Footprint Area Based)'] Y = dfio_scatter['Migration Onset Time (Inside/Outside Basement Membrane Based)'] - p_results = pearsonr(X.values, Y.values) - r_results = spearmanr(X.values, Y.values) + p_results = bootstrap_corr(X.values, Y.values, "Pearson") + r_results = bootstrap_corr(X.values, Y.values, "Spearman") print('n: {0:d}'.format(n_movies_io)) print('Pearson Correlation: {0:.3g} | p-Value: {1:.3g}'.format(p_results.statistic, p_results.pvalue)) print('Spearman Correlation: {0:.3g} | p-Value: {1:.3g}'.format(r_results.statistic, r_results.pvalue)) diff --git a/EMT_data_analysis/tools/io.py b/EMT_data_analysis/tools/io.py index 26875df..0ee65a8 100644 --- a/EMT_data_analysis/tools/io.py +++ b/EMT_data_analysis/tools/io.py @@ -12,12 +12,15 @@ def load_imaging_and_segmentation_dataset(): return df def load_image_analysis_extracted_features(load_from_aws: bool = True): - path = "https://allencell.s3.amazonaws.com/aics/emt_timelapse_dataset/manifests/Image_analysis_extracted_features.csv?versionId=ehxRXxC0FpidcpgXU_z.51T.nkWB0Yuj" - if not load_from_aws: - # Or read from local if the user decides to run Metric_computation.py - metric_comp_results_dir = get_results_directory_name() / "metric_computation" - path = metric_comp_results_dir / "Image_analysis_extracted_features.csv" - df = pd.read_csv(path) + metric_comp_results_dir = get_results_directory_name() / "metric_computation" + path = metric_comp_results_dir / "Image_analysis_extracted_features.csv" + try: + print('Trying to load features from local path.') + df = pd.read_csv(path) + except Exception: + print(f'Features not found at {path}. Loading from AWS instead. This may take a while...') + path = "https://allencell.s3.amazonaws.com/aics/emt_timelapse_dataset/manifests/Image_analysis_extracted_features.csv?versionId=ehxRXxC0FpidcpgXU_z.51T.nkWB0Yuj" + df = pd.read_csv(path) return df def load_inside_outside_classification(load_from_aws: bool = True):