From 25094c0f0290e980f016b2cb6b5403ed6da1c19b Mon Sep 17 00:00:00 2001 From: Nicole Bedanova Date: Wed, 6 Aug 2025 14:19:09 -0700 Subject: [PATCH 1/2] Adding functions to drop a percentage of counts and plot FMS --- pf2rnaseq/factorization.py | 133 ++++++++++++++++++- pf2rnaseq/figures/commonFuncs/plotGeneral.py | 23 +++- pf2rnaseq/figures/figureCountFMS.py | 26 ++++ 3 files changed, 180 insertions(+), 2 deletions(-) create mode 100644 pf2rnaseq/figures/figureCountFMS.py diff --git a/pf2rnaseq/factorization.py b/pf2rnaseq/factorization.py index 5c44f5c..afca4d1 100644 --- a/pf2rnaseq/factorization.py +++ b/pf2rnaseq/factorization.py @@ -5,6 +5,7 @@ import scanpy as sc import scipy.sparse as sps from pacmap import PaCMAP +from parafac2.normalize import prepare_dataset from parafac2.parafac2 import parafac2_nd, store_pf2 from scipy.stats import gmean from sklearn.decomposition import PCA @@ -43,7 +44,7 @@ def pf2( tolerance=1e-9, r2x=False, ): - cupy.cuda.Device(1).use() + cupy.cuda.Device(0).use() pf_out, R2X = parafac2_nd( X, rank=rank, @@ -190,3 +191,133 @@ def fms_diff_ranks( ) return df + + +def downsample_counts_multinomial( + X: anndata.AnnData, + percent_drop: float, + random_state: int = 0, +) -> anndata.AnnData: + """ + Create a downsampled counts copy of AnnData using multinomial sampling. + + Parameters: + ----------- + X : anndata.AnnData + Input dataset + percent_drop : float + Percentage of counts to drop (0-100) + random_state : int + Random seed for reproducibility + + Returns: + -------- + anndata.AnnData + Downsampled copy of the input data + """ + import scipy.sparse as sp + + # Handle 0% drop case + if percent_drop == 0: + return X.copy() + + # Set random seed + np.random.seed(random_state) + + # Convert to CSR and extract structure + original_csr = X.X.tocsr() + data = original_csr.data.copy() + indices = original_csr.indices + indptr = original_csr.indptr + + # Process each cell + for cell_idx in range(X.n_obs): + start_idx = indptr[cell_idx] + end_idx = indptr[cell_idx + 1] + + if start_idx == end_idx: + continue + + cell_data = data[start_idx:end_idx] + total_counts = int(np.sum(cell_data)) + + if total_counts == 0: + continue + + new_total = int(total_counts * (1 - percent_drop / 100)) + if new_total == 0: + data[start_idx:end_idx] = 0 + continue + + # Convert to probabilities and normalize + probs = cell_data / total_counts + probs = probs / np.sum(probs) # Ensure sum = 1.0 + + # Multinomial sampling + new_counts = np.random.multinomial(new_total, probs) + data[start_idx:end_idx] = new_counts.astype(cell_data.dtype) + + # Create new sparse matrix + sampled_csr = sp.csr_matrix((data, indices, indptr), shape=original_csr.shape) + + # Create new AnnData object + sampled_data = X.copy() + sampled_data.X = sampled_csr + + return sampled_data + + +def fms_percent_drop_counts_multinomial( + X: anndata.AnnData, + percentList: np.ndarray, + runs: int, + rank: int = 30, + deviance: bool = False, + condition: str = "Condition", + geneThreshold: float = 0.0, +): + """ + Multinomial sampling to reduce counts of each cell and calculate FMS. + """ + + # Get reference factorization + X_prepared = prepare_dataset( + X, condition, geneThreshold=geneThreshold, deviance=deviance + ) + dataX = pf2(X_prepared, rank, doEmbedding=False) + + # Pre-allocate results + results = np.zeros((runs, len(percentList))) + + # Main loop + for j in range(runs): + for i, percent_drop in enumerate(percentList): + # Handle 0% drop case separately (no sampling needed) + if percent_drop == 0: + results[j, i] = 1.0 # FMS = 1.0 for identical data + continue + + # Create downsampled data + sampled_data = downsample_counts_multinomial( + X, percent_drop, random_state=j + ) + + # Apply same processing as reference + sampled_data = prepare_dataset( + sampled_data, condition, geneThreshold=geneThreshold, deviance=deviance + ) + + # Factorization + sampledX = pf2(sampled_data, rank, random_state=j + 2, doEmbedding=False) + results[j, i] = calculateFMS(dataX, sampledX) + + # Prepare DataFrame for results + df = pd.DataFrame( + { + "Run": np.repeat(np.arange(runs), len(percentList)), + "Percentage of Counts Dropped": np.tile(percentList, runs), + "FMS": results.flatten(), + } + ) + + return df diff --git a/pf2rnaseq/figures/commonFuncs/plotGeneral.py b/pf2rnaseq/figures/commonFuncs/plotGeneral.py index db0f833..a5c0153 100644 --- a/pf2rnaseq/figures/commonFuncs/plotGeneral.py +++ b/pf2rnaseq/figures/commonFuncs/plotGeneral.py @@ -6,7 +6,12 @@ import seaborn as sns from matplotlib.axes import Axes -from ...factorization import fms_percent_drop, pf2_pca_r2x, fms_diff_ranks +from ...factorization import ( + fms_diff_ranks, + fms_percent_drop, + fms_percent_drop_counts_multinomial, + pf2_pca_r2x, +) def plot_r2x(data, rank_vec, ax: Axes): @@ -460,3 +465,19 @@ def plot_fms_percent_drop( df = fms_percent_drop(X, percentList, runs, rank) sns.lineplot(data=df, x="Percentage of Data Dropped", y="FMS", ax=ax) ax.set_ylim(0, 1) + + +def plot_fms_percent_drop_counts( + X: anndata.AnnData, + ax: Axes, + percentList: np.ndarray, + runs=3, + rank: int = 30, + deviance: bool = False, +): + """Plots FMS when dropping different percentages of data""" + df = fms_percent_drop_counts_multinomial( + X, percentList, runs, rank, deviance=deviance + ) + sns.lineplot(data=df, x="Percentage of Counts Dropped", y="FMS", ax=ax) + ax.set_ylim(0, 1) diff --git a/pf2rnaseq/figures/figureCountFMS.py b/pf2rnaseq/figures/figureCountFMS.py new file mode 100644 index 0000000..f233db1 --- /dev/null +++ b/pf2rnaseq/figures/figureCountFMS.py @@ -0,0 +1,26 @@ +""" +factorization score + +""" + +from anndata import read_h5ad + +from .common import getSetup, subplotLabel +from .commonFuncs.plotGeneral import plot_fms_percent_drop_counts + + +def makeFigure(): + ax, f = getSetup((6, 3), (1, 2)) + subplotLabel(ax) + # Using our cytokine dataset + X = read_h5ad("/opt/extra-storage/Treg_h5ads/Treg_raw.h5ad") + + # Remove multiplexing identifiers + X = X[:, ~X.var_names.str.match("^CMO3[0-9]{2}$")] # type: ignore + # Remove genes with too few reads now + X = X[X.X.sum(axis=1) > 10, X.X.mean(axis=0) > 0.1] + X = X.copy() + percentList = [0.0, 25.0, 50.0, 75.0, 95.0, 99.0] + plot_fms_percent_drop_counts(X, ax[0], percentList, runs=2, rank=20, deviance=True) + + return f From 2301740fc432f744cdee75589fd129850c89f904 Mon Sep 17 00:00:00 2001 From: Nicole Bedanova Date: Wed, 13 Aug 2025 12:12:10 -0700 Subject: [PATCH 2/2] Small fixes --- pf2rnaseq/factorization.py | 139 +++++++++++++------ pf2rnaseq/figures/commonFuncs/plotGeneral.py | 11 +- pf2rnaseq/figures/figureCountFMS.py | 11 +- 3 files changed, 110 insertions(+), 51 deletions(-) diff --git a/pf2rnaseq/factorization.py b/pf2rnaseq/factorization.py index afca4d1..d7fc8e0 100644 --- a/pf2rnaseq/factorization.py +++ b/pf2rnaseq/factorization.py @@ -44,7 +44,7 @@ def pf2( tolerance=1e-9, r2x=False, ): - cupy.cuda.Device(0).use() + cupy.cuda.Device(1).use() pf_out, R2X = parafac2_nd( X, rank=rank, @@ -267,57 +267,112 @@ def downsample_counts_multinomial( return sampled_data -def fms_percent_drop_counts_multinomial( +def calculate_fms_downsample( + X: anndata.AnnData, + X_pf2: anndata.AnnData, + percent_drop: float, + rank: int = 30, + deviance: bool = False, + condition: str = "Condition", + random_state: int = 0, +) -> float: + """ + Calculate FMS for a single downsampling scenario. + + Parameters: + ----------- + X : anndata.AnnData + Original dataset for reference + X_pf2 : anndata.AnnData + Full factorized dataset + percent_drop : float + Percentage of counts to drop (0-100) + rank : int + Factorization rank + deviance : bool + Whether to use deviance normalization + condition : str + Condition column name + random_state : int + Random seed + + Returns: + -------- + float + FMS score + """ + + # Handle 0% drop case + if percent_drop == 0: + return 1.0 + + # Create downsampled data + sampled_data = downsample_counts_multinomial( + X, percent_drop, random_state=random_state + ) + + # Apply same processing as reference + sampled_data = prepare_dataset( + sampled_data, condition, geneThreshold=0.0, deviance=deviance + ) + + # Factorization + sampledX = pf2(sampled_data, rank, random_state=random_state + 2, doEmbedding=False) + + return calculateFMS(X_pf2, sampledX) + + +def fms_percent_drop_counts( X: anndata.AnnData, percentList: np.ndarray, - runs: int, rank: int = 30, deviance: bool = False, condition: str = "Condition", geneThreshold: float = 0.0, -): - """ - Multinomial sampling to reduce counts of each cell and calculate FMS. + random_state: int = 0, +) -> pd.DataFrame: """ + Calculate FMS for multiple downsampling percentages (single run). + + Parameters: + ----------- + X : anndata.AnnData + Input dataset + percentList : np.ndarray + Array of dropout percentages to test + rank : int + Factorization rank + deviance : bool + Whether to use deviance normalization + condition : str + Condition column name + geneThreshold : float + Gene threshold for preparation + random_state : int + Random seed - # Get reference factorization + Returns: + -------- + pd.DataFrame + DataFrame with columns: Percentage of Counts Dropped, FMS + """ + results = [] X_prepared = prepare_dataset( X, condition, geneThreshold=geneThreshold, deviance=deviance ) - dataX = pf2(X_prepared, rank, doEmbedding=False) - - # Pre-allocate results - results = np.zeros((runs, len(percentList))) - - # Main loop - for j in range(runs): - for i, percent_drop in enumerate(percentList): - # Handle 0% drop case separately (no sampling needed) - if percent_drop == 0: - results[j, i] = 1.0 # FMS = 1.0 for identical data - continue - - # Create downsampled data - sampled_data = downsample_counts_multinomial( - X, percent_drop, random_state=j - ) - - # Apply same processing as reference - sampled_data = prepare_dataset( - sampled_data, condition, geneThreshold=geneThreshold, deviance=deviance - ) - - # Factorization - sampledX = pf2(sampled_data, rank, random_state=j + 2, doEmbedding=False) - results[j, i] = calculateFMS(dataX, sampledX) + X_pf2 = pf2(X_prepared, rank, doEmbedding=False) + + for percent_drop in percentList: + fms_score = calculate_fms_downsample( + X=X, + X_pf2=X_pf2, + percent_drop=percent_drop, + rank=rank, + deviance=deviance, + condition=condition, + random_state=random_state, + ) - # Prepare DataFrame for results - df = pd.DataFrame( - { - "Run": np.repeat(np.arange(runs), len(percentList)), - "Percentage of Counts Dropped": np.tile(percentList, runs), - "FMS": results.flatten(), - } - ) + results.append({"Percentage of Counts Dropped": percent_drop, "FMS": fms_score}) - return df + return pd.DataFrame(results) diff --git a/pf2rnaseq/figures/commonFuncs/plotGeneral.py b/pf2rnaseq/figures/commonFuncs/plotGeneral.py index a5c0153..028328e 100644 --- a/pf2rnaseq/figures/commonFuncs/plotGeneral.py +++ b/pf2rnaseq/figures/commonFuncs/plotGeneral.py @@ -9,7 +9,7 @@ from ...factorization import ( fms_diff_ranks, fms_percent_drop, - fms_percent_drop_counts_multinomial, + fms_percent_drop_counts, pf2_pca_r2x, ) @@ -471,13 +471,12 @@ def plot_fms_percent_drop_counts( X: anndata.AnnData, ax: Axes, percentList: np.ndarray, - runs=3, rank: int = 30, deviance: bool = False, + label: str = None, ): """Plots FMS when dropping different percentages of data""" - df = fms_percent_drop_counts_multinomial( - X, percentList, runs, rank, deviance=deviance - ) - sns.lineplot(data=df, x="Percentage of Counts Dropped", y="FMS", ax=ax) + df = fms_percent_drop_counts(X, percentList, rank, deviance=deviance) + sns.lineplot(data=df, x="Percentage of Counts Dropped", y="FMS", ax=ax, label=label) ax.set_ylim(0, 1) + diff --git a/pf2rnaseq/figures/figureCountFMS.py b/pf2rnaseq/figures/figureCountFMS.py index f233db1..e69d415 100644 --- a/pf2rnaseq/figures/figureCountFMS.py +++ b/pf2rnaseq/figures/figureCountFMS.py @@ -10,7 +10,7 @@ def makeFigure(): - ax, f = getSetup((6, 3), (1, 2)) + ax, f = getSetup((6, 3), (1, 1)) subplotLabel(ax) # Using our cytokine dataset X = read_h5ad("/opt/extra-storage/Treg_h5ads/Treg_raw.h5ad") @@ -20,7 +20,12 @@ def makeFigure(): # Remove genes with too few reads now X = X[X.X.sum(axis=1) > 10, X.X.mean(axis=0) > 0.1] X = X.copy() - percentList = [0.0, 25.0, 50.0, 75.0, 95.0, 99.0] - plot_fms_percent_drop_counts(X, ax[0], percentList, runs=2, rank=20, deviance=True) + percentList = [0.0, 30.0, 50.0] + plot_fms_percent_drop_counts( + X, ax[0], percentList, rank=15, deviance=True, label="Deviance" + ) + plot_fms_percent_drop_counts( + X, ax[0], percentList, rank=15, deviance=False, label="CPM" + ) return f