From a42e2e5aab709caadbadbf72f61c79eae6e27951 Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Fri, 5 Jun 2026 14:19:38 -0400
Subject: [PATCH 1/2] Fix GH issue 195

---
 nvmolkit/tests/test_fingerprints.py | 28 ++++++++++++++++++++++++++++
 src/morgan_fingerprint_gpu.cpp      | 16 ++++++++++++----
 2 files changed, 40 insertions(+), 4 deletions(-)
diff --git a/nvmolkit/tests/test_fingerprints.py b/nvmolkit/tests/test_fingerprints.py
index a1ddad40..4deea42a 100644
--- a/nvmolkit/tests/test_fingerprints.py
+++ b/nvmolkit/tests/test_fingerprints.py
@@ -145,3 +145,31 @@ def test_gh_issue_84():
         gen = MorganFingerprintGenerator(radius=radius, fpSize=fp_size)
         bits = unpack_fingerprint(gen.GetFingerprints([mol]).torch()).sum().item()
         assert bits > 0, f"Got empty fingerprint for BINAP on attempt {i}"
+
+
+def test_gh_issue_195():
+    """Regression test for https://github.com/NVIDIA-BioNeMo/nvMolKit/issues/195.
+
+    A batch containing only molecules larger than the 128 atom/bond GPU buckets
+    used to produce empty fingerprints.
+    """
+    radius = 3
+    fp_size = 2048
+
+    large_mol = Chem.MolFromSmiles("NCC(=O)" * 40)
+    assert large_mol is not None
+    assert large_mol.GetNumAtoms() >= 128 or large_mol.GetNumBonds() >= 128
+
+    rdkit_gen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=fp_size)
+    ref_bits = rdkit_gen.GetFingerprint(large_mol).ToList()
+    assert sum(ref_bits) > 0
+
+    nvmolkit_gen = MorganFingerprintGenerator(radius=radius, fpSize=fp_size)
+
+    for batch in ([large_mol], [large_mol, large_mol]):
+        unpacked = unpack_fingerprint(nvmolkit_gen.GetFingerprints(batch).torch())
+        torch.cuda.synchronize()
+        assert unpacked.shape == (len(batch), fp_size)
+        for row in range(len(batch)):
+            assert unpacked[row].sum().item() > 0, "Large-only batch produced an empty fingerprint"
+            torch.testing.assert_close(ref_bits, unpacked[row].to(int).tolist())
diff --git a/src/morgan_fingerprint_gpu.cpp b/src/morgan_fingerprint_gpu.cpp
index b37349bd..8ef48e37 100644
--- a/src/morgan_fingerprint_gpu.cpp
+++ b/src/morgan_fingerprint_gpu.cpp
@@ -266,10 +266,18 @@ AsyncDeviceVector<FlatBitVect<fpSize>> computeFingerprintsCuImpl(const std::vect
       workLarge.push_back(i);
     }
   }
-  const size_t                    numThreads32    = (work32.size() + dispatchChunkSize - 1) / dispatchChunkSize;
-  const size_t                    numThreads64    = (work64.size() + dispatchChunkSize - 1) / dispatchChunkSize;
-  const size_t                    numThreads128   = (work128.size() + dispatchChunkSize - 1) / dispatchChunkSize;
-  const size_t                    numThreadsTotal = numThreads32 + numThreads64 + numThreads128;
+  const size_t numThreads32    = (work32.size() + dispatchChunkSize - 1) / dispatchChunkSize;
+  const size_t numThreads64    = (work64.size() + dispatchChunkSize - 1) / dispatchChunkSize;
+  const size_t numThreads128   = (work128.size() + dispatchChunkSize - 1) / dispatchChunkSize;
+  size_t       numThreadsTotal = numThreads32 + numThreads64 + numThreads128;
+  // Large molecules are drained from the shared workLarge queue inside the worker
+  // loop below, which only runs numThreadsTotal iterations. When every molecule is
+  // large there is no small/medium work, so without dedicated iterations the queue
+  // would never be drained and those molecules would get empty fingerprints. Spread
+  // the drain across the available threads in that case.
+  if (numThreadsTotal == 0) {
+    numThreadsTotal = std::min(workLarge.size(), static_cast<size_t>(nThreadsActual));
+  }
   detail::OpenMPExceptionRegistry exceptionRegistry;
 
 #pragma omp parallel for num_threads(nThreadsActual) default(none) shared(numThreadsTotal,     \

From fa3f784b931edcf5c57a70a41d935700b9b49fea Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Sat, 6 Jun 2026 07:55:22 -0400
Subject: [PATCH 2/2] Fix morgan fingerprint stale cache bug

---
 src/morgan_fingerprint_gpu.cpp    | 12 ++++++++---
 tests/test_morgan_fingerprint.cpp | 36 +++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/src/morgan_fingerprint_gpu.cpp b/src/morgan_fingerprint_gpu.cpp
index 8ef48e37..8e43a568 100644
--- a/src/morgan_fingerprint_gpu.cpp
+++ b/src/morgan_fingerprint_gpu.cpp
@@ -122,15 +122,12 @@ void allocateGpuBatch(MorganGPUBuffersBatch& buffers,
   switch (maxAtoms) {
     case 32:
       buffers.allSeenNeighborhoods32 = AsyncDeviceVector<FlatBitVect<32>>(numMols * 32 * (radius + 1), stream);
-      buffers.allSeenNeighborhoods32.zero();
       break;
     case 64:
       buffers.allSeenNeighborhoods64 = AsyncDeviceVector<FlatBitVect<64>>(numMols * 64 * (radius + 1), stream);
-      buffers.allSeenNeighborhoods64.zero();
       break;
     case 128:
       buffers.allSeenNeighborhoods128 = AsyncDeviceVector<FlatBitVect<128>>(numMols * 128 * (radius + 1), stream);
-      buffers.allSeenNeighborhoods128.zero();
       break;
     default:
       throw std::runtime_error("Unsupported max atoms for Morgan fingerprint GPU: " + std::to_string(maxAtoms));
@@ -424,6 +421,15 @@ AsyncDeviceVector<FlatBitVect<fpSize>> computeFingerprintsCuImpl(const std::vect
         buffersToUse->outputIndices.copyFromHost(threadCpuBuffers.h_outputIndices.data(), scopedChunkSize);
         cudaCheckError(cudaEventRecord(threadCpuBuffers.prevMemcpyDoneEvent.event(), stream));
         rangeMemcpy.pop();
+        // The kernel uses allSeenNeighborhoods as scratch that must be zeroed on entry. The buffers are
+        // reused across dispatch rounds, so reset the scratch for this round before launching.
+        if (thisRoundNumAtoms == 32) {
+          buffersToUse->allSeenNeighborhoods32.zero();
+        } else if (thisRoundNumAtoms == 64) {
+          buffersToUse->allSeenNeighborhoods64.zero();
+        } else {
+          buffersToUse->allSeenNeighborhoods128.zero();
+        }
         solveOnGPUBatch<fpSize>(*buffersToUse,
                                 outputAccumulator,
                                 maxRadius,
diff --git a/tests/test_morgan_fingerprint.cpp b/tests/test_morgan_fingerprint.cpp
index 431abefb..2ec6b26b 100644
--- a/tests/test_morgan_fingerprint.cpp
+++ b/tests/test_morgan_fingerprint.cpp
@@ -238,6 +238,42 @@ TEST(MorganFingerprintTest, GpuWithLargeMolecules) {
   }
 }
 
+TEST(MorganFingerprintTest, GpuConsistentAcrossDispatchRounds) {
+  // Regression: the per-thread GPU scratch (allSeenNeighborhoods) is reused across dispatch
+  // rounds. When it was only zeroed at allocation, molecules processed in rounds after the
+  // first saw stale neighborhood data from a prior round's molecule in the same slot, producing
+  // a false duplicate-environment match that dropped a bit. A small batch size over many
+  // molecules forces dozens of rounds and exposes this.
+  const unsigned int radius       = 3;
+  const unsigned int fpSize       = 2048;
+  auto               refGenerator = std::unique_ptr<RDKit::FingerprintGenerator<std::uint32_t>>(
+    RDKit::MorganFingerprint::getMorganGenerator<
+      std::uint32_t>(radius, false, false, true, false, nullptr, nullptr, fpSize, {1, 2, 4, 8}, false, false));
+
+  auto [mols, smiles] = loadNChemblMolecules(100, 128);
+  auto molsView       = makeMolsView(mols);
+
+  std::vector<std::unique_ptr<ExplicitBitVect>> refResults;
+  refResults.reserve(mols.size());
+  for (const auto& mol : mols) {
+    auto refFingerprint = std::unique_ptr<ExplicitBitVect>(refGenerator->getFingerprint(*mol));
+    ASSERT_NE(refFingerprint, nullptr);
+    refResults.push_back(std::move(refFingerprint));
+  }
+
+  auto                                generator = nvMolKit::MorganFingerprintGenerator(radius, fpSize);
+  nvMolKit::FingerprintComputeOptions options;
+  options.backend      = nvMolKit::FingerprintComputeBackend::GPU;
+  options.gpuBatchSize = 8;  // Small batch forces many dispatch rounds that reuse the scratch buffers.
+  auto newResults      = generator.GetFingerprints(molsView, options);
+
+  ASSERT_EQ(newResults.size(), mols.size());
+  for (size_t i = 0; i < mols.size(); i++) {
+    ASSERT_NE(newResults[i], nullptr);
+    ASSERT_EQ(*newResults[i], *refResults[i]) << "on element " << i << " with smiles " << smiles[i];
+  }
+}
+
 TEST(MorganFingerprintGpuTest, ThrowsRequestingCpuBackendGpuBuffer) {
   const unsigned int                  radius    = 3;
   const unsigned int                  fpSize    = 1024;