From 25094c0f0290e980f016b2cb6b5403ed6da1c19b Mon Sep 17 00:00:00 2001
From: Nicole Bedanova <nbedanova@g.ucla.edu>
Date: Wed, 6 Aug 2025 14:19:09 -0700
Subject: [PATCH 1/2] Adding functions to drop a percentage of counts and plot
 FMS

---
 pf2rnaseq/factorization.py                   | 133 ++++++++++++++++++-
 pf2rnaseq/figures/commonFuncs/plotGeneral.py |  23 +++-
 pf2rnaseq/figures/figureCountFMS.py          |  26 ++++
 3 files changed, 180 insertions(+), 2 deletions(-)
 create mode 100644 pf2rnaseq/figures/figureCountFMS.py

diff --git a/pf2rnaseq/factorization.py b/pf2rnaseq/factorization.py
index 5c44f5c..afca4d1 100644
--- a/pf2rnaseq/factorization.py
+++ b/pf2rnaseq/factorization.py
@@ -5,6 +5,7 @@
 import scanpy as sc
 import scipy.sparse as sps
 from pacmap import PaCMAP
+from parafac2.normalize import prepare_dataset
 from parafac2.parafac2 import parafac2_nd, store_pf2
 from scipy.stats import gmean
 from sklearn.decomposition import PCA
@@ -43,7 +44,7 @@ def pf2(
     tolerance=1e-9,
     r2x=False,
 ):
-    cupy.cuda.Device(1).use()
+    cupy.cuda.Device(0).use()
     pf_out, R2X = parafac2_nd(
         X,
         rank=rank,
@@ -190,3 +191,133 @@ def fms_diff_ranks(
     )
 
     return df
+
+
+def downsample_counts_multinomial(
+    X: anndata.AnnData,
+    percent_drop: float,
+    random_state: int = 0,
+) -> anndata.AnnData:
+    """
+    Create a downsampled counts copy of AnnData using multinomial sampling.
+
+    Parameters:
+    -----------
+    X : anndata.AnnData
+        Input dataset
+    percent_drop : float
+        Percentage of counts to drop (0-100)
+    random_state : int
+        Random seed for reproducibility
+
+    Returns:
+    --------
+    anndata.AnnData
+        Downsampled copy of the input data
+    """
+    import scipy.sparse as sp
+
+    # Handle 0% drop case
+    if percent_drop == 0:
+        return X.copy()
+
+    # Set random seed
+    np.random.seed(random_state)
+
+    # Convert to CSR and extract structure
+    original_csr = X.X.tocsr()
+    data = original_csr.data.copy()
+    indices = original_csr.indices
+    indptr = original_csr.indptr
+
+    # Process each cell
+    for cell_idx in range(X.n_obs):
+        start_idx = indptr[cell_idx]
+        end_idx = indptr[cell_idx + 1]
+
+        if start_idx == end_idx:
+            continue
+
+        cell_data = data[start_idx:end_idx]
+        total_counts = int(np.sum(cell_data))
+
+        if total_counts == 0:
+            continue
+
+        new_total = int(total_counts * (1 - percent_drop / 100))
+        if new_total == 0:
+            data[start_idx:end_idx] = 0
+            continue
+
+        # Convert to probabilities and normalize
+        probs = cell_data / total_counts
+        probs = probs / np.sum(probs)  # Ensure sum = 1.0
+
+        # Multinomial sampling
+        new_counts = np.random.multinomial(new_total, probs)
+        data[start_idx:end_idx] = new_counts.astype(cell_data.dtype)
+
+    # Create new sparse matrix
+    sampled_csr = sp.csr_matrix((data, indices, indptr), shape=original_csr.shape)
+
+    # Create new AnnData object
+    sampled_data = X.copy()
+    sampled_data.X = sampled_csr
+
+    return sampled_data
+
+
+def fms_percent_drop_counts_multinomial(
+    X: anndata.AnnData,
+    percentList: np.ndarray,
+    runs: int,
+    rank: int = 30,
+    deviance: bool = False,
+    condition: str = "Condition",
+    geneThreshold: float = 0.0,
+):
+    """
+    Multinomial sampling to reduce counts of each cell and calculate FMS.
+    """
+
+    # Get reference factorization
+    X_prepared = prepare_dataset(
+        X, condition, geneThreshold=geneThreshold, deviance=deviance
+    )
+    dataX = pf2(X_prepared, rank, doEmbedding=False)
+
+    # Pre-allocate results
+    results = np.zeros((runs, len(percentList)))
+
+    # Main loop
+    for j in range(runs):
+        for i, percent_drop in enumerate(percentList):
+            # Handle 0% drop case separately (no sampling needed)
+            if percent_drop == 0:
+                results[j, i] = 1.0  # FMS = 1.0 for identical data
+                continue
+
+            # Create downsampled data
+            sampled_data = downsample_counts_multinomial(
+                X, percent_drop, random_state=j
+            )
+
+            # Apply same processing as reference
+            sampled_data = prepare_dataset(
+                sampled_data, condition, geneThreshold=geneThreshold, deviance=deviance
+            )
+
+            # Factorization
+            sampledX = pf2(sampled_data, rank, random_state=j + 2, doEmbedding=False)
+            results[j, i] = calculateFMS(dataX, sampledX)
+
+    # Prepare DataFrame for results
+    df = pd.DataFrame(
+        {
+            "Run": np.repeat(np.arange(runs), len(percentList)),
+            "Percentage of Counts Dropped": np.tile(percentList, runs),
+            "FMS": results.flatten(),
+        }
+    )
+
+    return df
diff --git a/pf2rnaseq/figures/commonFuncs/plotGeneral.py b/pf2rnaseq/figures/commonFuncs/plotGeneral.py
index db0f833..a5c0153 100644
--- a/pf2rnaseq/figures/commonFuncs/plotGeneral.py
+++ b/pf2rnaseq/figures/commonFuncs/plotGeneral.py
@@ -6,7 +6,12 @@
 import seaborn as sns
 from matplotlib.axes import Axes
 
-from ...factorization import fms_percent_drop, pf2_pca_r2x, fms_diff_ranks
+from ...factorization import (
+    fms_diff_ranks,
+    fms_percent_drop,
+    fms_percent_drop_counts_multinomial,
+    pf2_pca_r2x,
+)
 
 
 def plot_r2x(data, rank_vec, ax: Axes):
@@ -460,3 +465,19 @@ def plot_fms_percent_drop(
     df = fms_percent_drop(X, percentList, runs, rank)
     sns.lineplot(data=df, x="Percentage of Data Dropped", y="FMS", ax=ax)
     ax.set_ylim(0, 1)
+
+
+def plot_fms_percent_drop_counts(
+    X: anndata.AnnData,
+    ax: Axes,
+    percentList: np.ndarray,
+    runs=3,
+    rank: int = 30,
+    deviance: bool = False,
+):
+    """Plots FMS when dropping different percentages of data"""
+    df = fms_percent_drop_counts_multinomial(
+        X, percentList, runs, rank, deviance=deviance
+    )
+    sns.lineplot(data=df, x="Percentage of Counts Dropped", y="FMS", ax=ax)
+    ax.set_ylim(0, 1)
diff --git a/pf2rnaseq/figures/figureCountFMS.py b/pf2rnaseq/figures/figureCountFMS.py
new file mode 100644
index 0000000..f233db1
--- /dev/null
+++ b/pf2rnaseq/figures/figureCountFMS.py
@@ -0,0 +1,26 @@
+"""
+factorization score
+
+"""
+
+from anndata import read_h5ad
+
+from .common import getSetup, subplotLabel
+from .commonFuncs.plotGeneral import plot_fms_percent_drop_counts
+
+
+def makeFigure():
+    ax, f = getSetup((6, 3), (1, 2))
+    subplotLabel(ax)
+    # Using our cytokine dataset
+    X = read_h5ad("/opt/extra-storage/Treg_h5ads/Treg_raw.h5ad")
+
+    # Remove multiplexing identifiers
+    X = X[:, ~X.var_names.str.match("^CMO3[0-9]{2}$")]  # type: ignore
+    # Remove genes with too few reads now
+    X = X[X.X.sum(axis=1) > 10, X.X.mean(axis=0) > 0.1]
+    X = X.copy()
+    percentList = [0.0, 25.0, 50.0, 75.0, 95.0, 99.0]
+    plot_fms_percent_drop_counts(X, ax[0], percentList, runs=2, rank=20, deviance=True)
+
+    return f

From 2301740fc432f744cdee75589fd129850c89f904 Mon Sep 17 00:00:00 2001
From: Nicole Bedanova <nbedanova@g.ucla.edu>
Date: Wed, 13 Aug 2025 12:12:10 -0700
Subject: [PATCH 2/2] Small fixes

---
 pf2rnaseq/factorization.py                   | 139 +++++++++++++------
 pf2rnaseq/figures/commonFuncs/plotGeneral.py |  11 +-
 pf2rnaseq/figures/figureCountFMS.py          |  11 +-
 3 files changed, 110 insertions(+), 51 deletions(-)

diff --git a/pf2rnaseq/factorization.py b/pf2rnaseq/factorization.py
index afca4d1..d7fc8e0 100644
--- a/pf2rnaseq/factorization.py
+++ b/pf2rnaseq/factorization.py
@@ -44,7 +44,7 @@ def pf2(
     tolerance=1e-9,
     r2x=False,
 ):
-    cupy.cuda.Device(0).use()
+    cupy.cuda.Device(1).use()
     pf_out, R2X = parafac2_nd(
         X,
         rank=rank,
@@ -267,57 +267,112 @@ def downsample_counts_multinomial(
     return sampled_data
 
 
-def fms_percent_drop_counts_multinomial(
+def calculate_fms_downsample(
+    X: anndata.AnnData,
+    X_pf2: anndata.AnnData,
+    percent_drop: float,
+    rank: int = 30,
+    deviance: bool = False,
+    condition: str = "Condition",
+    random_state: int = 0,
+) -> float:
+    """
+    Calculate FMS for a single downsampling scenario.
+
+    Parameters:
+    -----------
+    X : anndata.AnnData
+        Original dataset for reference
+    X_pf2 : anndata.AnnData
+        Full factorized dataset
+    percent_drop : float
+        Percentage of counts to drop (0-100)
+    rank : int
+        Factorization rank
+    deviance : bool
+        Whether to use deviance normalization
+    condition : str
+        Condition column name
+    random_state : int
+        Random seed
+
+    Returns:
+    --------
+    float
+        FMS score
+    """
+
+    # Handle 0% drop case
+    if percent_drop == 0:
+        return 1.0
+
+    # Create downsampled data
+    sampled_data = downsample_counts_multinomial(
+        X, percent_drop, random_state=random_state
+    )
+
+    # Apply same processing as reference
+    sampled_data = prepare_dataset(
+        sampled_data, condition, geneThreshold=0.0, deviance=deviance
+    )
+
+    # Factorization
+    sampledX = pf2(sampled_data, rank, random_state=random_state + 2, doEmbedding=False)
+
+    return calculateFMS(X_pf2, sampledX)
+
+
+def fms_percent_drop_counts(
     X: anndata.AnnData,
     percentList: np.ndarray,
-    runs: int,
     rank: int = 30,
     deviance: bool = False,
     condition: str = "Condition",
     geneThreshold: float = 0.0,
-):
-    """
-    Multinomial sampling to reduce counts of each cell and calculate FMS.
+    random_state: int = 0,
+) -> pd.DataFrame:
     """
+    Calculate FMS for multiple downsampling percentages (single run).
+
+    Parameters:
+    -----------
+    X : anndata.AnnData
+        Input dataset
+    percentList : np.ndarray
+        Array of dropout percentages to test
+    rank : int
+        Factorization rank
+    deviance : bool
+        Whether to use deviance normalization
+    condition : str
+        Condition column name
+    geneThreshold : float
+        Gene threshold for preparation
+    random_state : int
+        Random seed
 
-    # Get reference factorization
+    Returns:
+    --------
+    pd.DataFrame
+        DataFrame with columns: Percentage of Counts Dropped, FMS
+    """
+    results = []
     X_prepared = prepare_dataset(
         X, condition, geneThreshold=geneThreshold, deviance=deviance
     )
-    dataX = pf2(X_prepared, rank, doEmbedding=False)
-
-    # Pre-allocate results
-    results = np.zeros((runs, len(percentList)))
-
-    # Main loop
-    for j in range(runs):
-        for i, percent_drop in enumerate(percentList):
-            # Handle 0% drop case separately (no sampling needed)
-            if percent_drop == 0:
-                results[j, i] = 1.0  # FMS = 1.0 for identical data
-                continue
-
-            # Create downsampled data
-            sampled_data = downsample_counts_multinomial(
-                X, percent_drop, random_state=j
-            )
-
-            # Apply same processing as reference
-            sampled_data = prepare_dataset(
-                sampled_data, condition, geneThreshold=geneThreshold, deviance=deviance
-            )
-
-            # Factorization
-            sampledX = pf2(sampled_data, rank, random_state=j + 2, doEmbedding=False)
-            results[j, i] = calculateFMS(dataX, sampledX)
+    X_pf2 = pf2(X_prepared, rank, doEmbedding=False)
+
+    for percent_drop in percentList:
+        fms_score = calculate_fms_downsample(
+            X=X,
+            X_pf2=X_pf2,
+            percent_drop=percent_drop,
+            rank=rank,
+            deviance=deviance,
+            condition=condition,
+            random_state=random_state,
+        )
 
-    # Prepare DataFrame for results
-    df = pd.DataFrame(
-        {
-            "Run": np.repeat(np.arange(runs), len(percentList)),
-            "Percentage of Counts Dropped": np.tile(percentList, runs),
-            "FMS": results.flatten(),
-        }
-    )
+        results.append({"Percentage of Counts Dropped": percent_drop, "FMS": fms_score})
 
-    return df
+    return pd.DataFrame(results)
diff --git a/pf2rnaseq/figures/commonFuncs/plotGeneral.py b/pf2rnaseq/figures/commonFuncs/plotGeneral.py
index a5c0153..028328e 100644
--- a/pf2rnaseq/figures/commonFuncs/plotGeneral.py
+++ b/pf2rnaseq/figures/commonFuncs/plotGeneral.py
@@ -9,7 +9,7 @@
 from ...factorization import (
     fms_diff_ranks,
     fms_percent_drop,
-    fms_percent_drop_counts_multinomial,
+    fms_percent_drop_counts,
     pf2_pca_r2x,
 )
 
@@ -471,13 +471,12 @@ def plot_fms_percent_drop_counts(
     X: anndata.AnnData,
     ax: Axes,
     percentList: np.ndarray,
-    runs=3,
     rank: int = 30,
     deviance: bool = False,
+    label: str = None,
 ):
     """Plots FMS when dropping different percentages of data"""
-    df = fms_percent_drop_counts_multinomial(
-        X, percentList, runs, rank, deviance=deviance
-    )
-    sns.lineplot(data=df, x="Percentage of Counts Dropped", y="FMS", ax=ax)
+    df = fms_percent_drop_counts(X, percentList, rank, deviance=deviance)
+    sns.lineplot(data=df, x="Percentage of Counts Dropped", y="FMS", ax=ax, label=label)
     ax.set_ylim(0, 1)
+
diff --git a/pf2rnaseq/figures/figureCountFMS.py b/pf2rnaseq/figures/figureCountFMS.py
index f233db1..e69d415 100644
--- a/pf2rnaseq/figures/figureCountFMS.py
+++ b/pf2rnaseq/figures/figureCountFMS.py
@@ -10,7 +10,7 @@
 
 
 def makeFigure():
-    ax, f = getSetup((6, 3), (1, 2))
+    ax, f = getSetup((6, 3), (1, 1))
     subplotLabel(ax)
     # Using our cytokine dataset
     X = read_h5ad("/opt/extra-storage/Treg_h5ads/Treg_raw.h5ad")
@@ -20,7 +20,12 @@ def makeFigure():
     # Remove genes with too few reads now
     X = X[X.X.sum(axis=1) > 10, X.X.mean(axis=0) > 0.1]
     X = X.copy()
-    percentList = [0.0, 25.0, 50.0, 75.0, 95.0, 99.0]
-    plot_fms_percent_drop_counts(X, ax[0], percentList, runs=2, rank=20, deviance=True)
+    percentList = [0.0, 30.0, 50.0]
+    plot_fms_percent_drop_counts(
+        X, ax[0], percentList, rank=15, deviance=True, label="Deviance"
+    )
+    plot_fms_percent_drop_counts(
+        X, ax[0], percentList, rank=15, deviance=False, label="CPM"
+    )
 
     return f