From 72143af9bf7b8032067fe51f65926930fa2b25ab Mon Sep 17 00:00:00 2001 From: Matheus Viana Date: Thu, 22 Jan 2026 20:11:19 -0800 Subject: [PATCH 1/5] Add logging for dataframe loading and analysis execution - Add print statement in io.load_image_analysis_extracted_features to verify source path. - Add status update in Analysis_tools.run_all_analyses to indicate when analysis begins. --- EMT_data_analysis/analysis_scripts/Analysis_tools.py | 2 ++ EMT_data_analysis/tools/io.py | 1 + 2 files changed, 3 insertions(+) diff --git a/EMT_data_analysis/analysis_scripts/Analysis_tools.py b/EMT_data_analysis/analysis_scripts/Analysis_tools.py index ffbd77e..f005d1f 100644 --- a/EMT_data_analysis/analysis_scripts/Analysis_tools.py +++ b/EMT_data_analysis/analysis_scripts/Analysis_tools.py @@ -29,6 +29,8 @@ def run_all_analyses(): df = io.load_image_analysis_extracted_features() + print("Loaded image analysis extracted features and started running analyses...") + plot_area_at_glass_all_data(df, FIGS_DIR, OUT_TYPE) plot_area_at_glass_h2b(df, FIGS_DIR, OUT_TYPE) plot_migration_timing_all_data(df, FIGS_DIR, OUT_TYPE) diff --git a/EMT_data_analysis/tools/io.py b/EMT_data_analysis/tools/io.py index 26875df..3bbf8fb 100644 --- a/EMT_data_analysis/tools/io.py +++ b/EMT_data_analysis/tools/io.py @@ -17,6 +17,7 @@ def load_image_analysis_extracted_features(load_from_aws: bool = True): # Or read from local if the user decides to run Metric_computation.py metric_comp_results_dir = get_results_directory_name() / "metric_computation" path = metric_comp_results_dir / "Image_analysis_extracted_features.csv" + print(f'Loading image analysis extracted features from {path}. This may take a while...') df = pd.read_csv(path) return df From 3e40e6608150d54c12f4e0023988b92cff0071f2 Mon Sep 17 00:00:00 2001 From: Matheus Viana Date: Thu, 22 Jan 2026 20:34:46 -0800 Subject: [PATCH 2/5] feat: Prioritize loading image analysis features from local path, falling back to AWS if local load fails. --- EMT_data_analysis/tools/io.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/EMT_data_analysis/tools/io.py b/EMT_data_analysis/tools/io.py index 3bbf8fb..0ee65a8 100644 --- a/EMT_data_analysis/tools/io.py +++ b/EMT_data_analysis/tools/io.py @@ -12,13 +12,15 @@ def load_imaging_and_segmentation_dataset(): return df def load_image_analysis_extracted_features(load_from_aws: bool = True): - path = "https://allencell.s3.amazonaws.com/aics/emt_timelapse_dataset/manifests/Image_analysis_extracted_features.csv?versionId=ehxRXxC0FpidcpgXU_z.51T.nkWB0Yuj" - if not load_from_aws: - # Or read from local if the user decides to run Metric_computation.py - metric_comp_results_dir = get_results_directory_name() / "metric_computation" - path = metric_comp_results_dir / "Image_analysis_extracted_features.csv" - print(f'Loading image analysis extracted features from {path}. This may take a while...') - df = pd.read_csv(path) + metric_comp_results_dir = get_results_directory_name() / "metric_computation" + path = metric_comp_results_dir / "Image_analysis_extracted_features.csv" + try: + print('Trying to load features from local path.') + df = pd.read_csv(path) + except Exception: + print(f'Features not found at {path}. Loading from AWS instead. This may take a while...') + path = "https://allencell.s3.amazonaws.com/aics/emt_timelapse_dataset/manifests/Image_analysis_extracted_features.csv?versionId=ehxRXxC0FpidcpgXU_z.51T.nkWB0Yuj" + df = pd.read_csv(path) return df def load_inside_outside_classification(load_from_aws: bool = True): From 5a1094eda707de68ed0f0ae832288a5a7f1c151b Mon Sep 17 00:00:00 2001 From: Matheus Viana Date: Fri, 23 Jan 2026 17:32:41 -0800 Subject: [PATCH 3/5] Use nanpercentile to prevent nan bootstraped correlation --- .../analysis_scripts/Analysis_tools.py | 42 +++++++++++++++---- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/EMT_data_analysis/analysis_scripts/Analysis_tools.py b/EMT_data_analysis/analysis_scripts/Analysis_tools.py index f005d1f..630ebc1 100644 --- a/EMT_data_analysis/analysis_scripts/Analysis_tools.py +++ b/EMT_data_analysis/analysis_scripts/Analysis_tools.py @@ -20,6 +20,34 @@ warnings.filterwarnings("ignore") +def bootstrap_corr(x, y, method="Pearson", confidence_interval=0.95, n_bootstraps=2000, seed=42, verbose=True): + x = np.asarray(x) + y = np.asarray(y) + + corr_func = pearsonr + if method == "Spearman": + corr_func = spearmanr + + r_observed, p_value = corr_func(x, y) + n_samples = len(x) + + inds = np.arange(n_samples) + boot_corrs = [] + for _ in range(n_bootstraps): + resampled_inds = np.random.choice(inds, size=n_samples, replace=True) + rx = x[resampled_inds] + ry = y[resampled_inds] + boot_r, _ = corr_func(rx, ry) + boot_corrs.append(boot_r) + alpha = (1.0-confidence_interval)/2.0 + ci_low = np.nanpercentile(boot_corrs, 100*alpha) + ci_high = np.nanpercentile(boot_corrs, 100*(1.0-alpha)) + + if verbose: + print(f'{method} Correlation: {r_observed:.3g} | 95% CI: [{ci_low:.2f}, {ci_high:.2f}] | p-value: {p_value:.3g}') + + return r_observed, p_value, ci_low, ci_high + def run_all_analyses(): """ Run all analysis functions @@ -613,10 +641,8 @@ def plot_gene_expression_experiments(df, figs_dir, out_type): migration = df_g['Migration Onset Time (Footprint Area Based)'][[cond in val for val in df_g['Experimental Condition'].values]].dropna() metric = df_g['gene_metric'][[cond in val for val in df_g['Experimental Condition'].values]].dropna() - pearson, p_pvalue = pearsonr(migration, metric) - spearman, s_pvalue = spearmanr(migration, metric) - print(f'Pearson Correlation: {pearson:.3g} | p-value: {p_pvalue:.3g}') - print(f'Spearman Correlation: {spearman:.3g} | p-value: {s_pvalue:.3g}') + pearson, p_pvalue, _, _ = bootstrap_corr(migration, metric, "Pearson") + spearman, s_pvalue, _, _ = bootstrap_corr(migration, metric, "Spearman") X = df_g['Migration Onset Time (Footprint Area Based)'][[cond in val for val in df_g['Experimental Condition'].values]].dropna() Y = df_g['gene_metric'][[cond in val for val in df_g['Experimental Condition'].values]].dropna() @@ -640,11 +666,9 @@ def plot_gene_expression_experiments(df, figs_dir, out_type): migration = df_g['Migration Onset Time (Footprint Area Based)'] metric = df_g['gene_metric'] - pearson, p_pvalue = pearsonr(migration, metric) - spearman, s_pvalue = spearmanr(migration, metric) - print(f'Pearson Correlation: {pearson:.3g} | p-value: {p_pvalue:.3g}') - print(f'Spearman Correlation: {spearman:.3g} | p-value: {s_pvalue:.3g}') - + pearson, p_pvalue, _, _ = bootstrap_corr(migration, metric, "Pearson") + spearman, s_pvalue, _, _ = bootstrap_corr(migration, metric, "Spearman") + X = df_g['Migration Onset Time (Footprint Area Based)'] Y = df_g['gene_metric'] From 3f1ce3a688319332c3dda3dc394e94891480c016 Mon Sep 17 00:00:00 2001 From: Matheus Viana Date: Fri, 23 Jan 2026 19:15:08 -0800 Subject: [PATCH 4/5] Add random seed for reproducibility --- EMT_data_analysis/analysis_scripts/Analysis_tools.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/EMT_data_analysis/analysis_scripts/Analysis_tools.py b/EMT_data_analysis/analysis_scripts/Analysis_tools.py index 630ebc1..ccf1736 100644 --- a/EMT_data_analysis/analysis_scripts/Analysis_tools.py +++ b/EMT_data_analysis/analysis_scripts/Analysis_tools.py @@ -20,6 +20,8 @@ warnings.filterwarnings("ignore") +rng = np.random.default_rng(42) + def bootstrap_corr(x, y, method="Pearson", confidence_interval=0.95, n_bootstraps=2000, seed=42, verbose=True): x = np.asarray(x) y = np.asarray(y) @@ -34,7 +36,7 @@ def bootstrap_corr(x, y, method="Pearson", confidence_interval=0.95, n_bootstrap inds = np.arange(n_samples) boot_corrs = [] for _ in range(n_bootstraps): - resampled_inds = np.random.choice(inds, size=n_samples, replace=True) + resampled_inds = rng.choice(inds, size=n_samples, replace=True) rx = x[resampled_inds] ry = y[resampled_inds] boot_r, _ = corr_func(rx, ry) @@ -44,6 +46,7 @@ def bootstrap_corr(x, y, method="Pearson", confidence_interval=0.95, n_bootstrap ci_high = np.nanpercentile(boot_corrs, 100*(1.0-alpha)) if verbose: + print(f'Number of samples: {n_samples}') print(f'{method} Correlation: {r_observed:.3g} | 95% CI: [{ci_low:.2f}, {ci_high:.2f}] | p-value: {p_value:.3g}') return r_observed, p_value, ci_low, ci_high From 148c486cd38eb994a3fb67eb67551210644a3481 Mon Sep 17 00:00:00 2001 From: Matheus Viana Date: Sat, 24 Jan 2026 14:47:42 -0800 Subject: [PATCH 5/5] Adding 95% CI for other correlation tests --- EMT_data_analysis/analysis_scripts/Analysis_tools.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/EMT_data_analysis/analysis_scripts/Analysis_tools.py b/EMT_data_analysis/analysis_scripts/Analysis_tools.py index ccf1736..f433f72 100644 --- a/EMT_data_analysis/analysis_scripts/Analysis_tools.py +++ b/EMT_data_analysis/analysis_scripts/Analysis_tools.py @@ -748,6 +748,8 @@ def plot_collagenase_analysis(df, figs_dir, out_type): X = df_gene['Collagenease concentration (ug/mL)'] Y = df_gene['Migration Onset Time (Footprint Area Based)'] + _ = bootstrap_corr(X, Y, "Spearman") + # It's important to add a constant (intercept) to the model X = sm.add_constant(X) @@ -776,7 +778,6 @@ def plot_collagenase_analysis(df, figs_dir, out_type): else: print("\nConclusion: The p-value for the slope is not less than 0.05, so we cannot conclude there is a significant linear relationship.") - def plot_mmp_inhibitor_migration(df, figs_dir, out_type): """ Parameters @@ -1044,8 +1045,8 @@ def plot_inside_outside_migration_timing(df, figs_dir, out_type): X = dfio_scatter['Migration Onset Time (Footprint Area Based)'] Y = dfio_scatter['Migration Onset Time (Inside/Outside Basement Membrane Based)'] - p_results = pearsonr(X.values, Y.values) - r_results = spearmanr(X.values, Y.values) + p_results = bootstrap_corr(X.values, Y.values, "Pearson") + r_results = bootstrap_corr(X.values, Y.values, "Spearman") print('n: {0:d}'.format(n_movies_io)) print('Pearson Correlation: {0:.3g} | p-Value: {1:.3g}'.format(p_results.statistic, p_results.pvalue)) print('Spearman Correlation: {0:.3g} | p-Value: {1:.3g}'.format(r_results.statistic, r_results.pvalue))