NVIDIA-BioNeMo · scal444 · Jun 8, 2026 · Jun 5, 2026 · Jun 6, 2026 · Jun 6, 2026
diff --git a/src/morgan_fingerprint_gpu.cpp b/src/morgan_fingerprint_gpu.cpp
@@ -122,15 +122,12 @@ void allocateGpuBatch(MorganGPUBuffersBatch& buffers,
   switch (maxAtoms) {
     case 32:
       buffers.allSeenNeighborhoods32 = AsyncDeviceVector<FlatBitVect<32>>(numMols * 32 * (radius + 1), stream);
-      buffers.allSeenNeighborhoods32.zero();
       break;
     case 64:
       buffers.allSeenNeighborhoods64 = AsyncDeviceVector<FlatBitVect<64>>(numMols * 64 * (radius + 1), stream);
-      buffers.allSeenNeighborhoods64.zero();
       break;
     case 128:
       buffers.allSeenNeighborhoods128 = AsyncDeviceVector<FlatBitVect<128>>(numMols * 128 * (radius + 1), stream);
-      buffers.allSeenNeighborhoods128.zero();
       break;
     default:
       throw std::runtime_error("Unsupported max atoms for Morgan fingerprint GPU: " + std::to_string(maxAtoms));
@@ -424,6 +421,15 @@ AsyncDeviceVector<FlatBitVect<fpSize>> computeFingerprintsCuImpl(const std::vect
         buffersToUse->outputIndices.copyFromHost(threadCpuBuffers.h_outputIndices.data(), scopedChunkSize);
         cudaCheckError(cudaEventRecord(threadCpuBuffers.prevMemcpyDoneEvent.event(), stream));
         rangeMemcpy.pop();
+        // The kernel uses allSeenNeighborhoods as scratch that must be zeroed on entry. The buffers are
+        // reused across dispatch rounds, so reset the scratch for this round before launching.
+        if (thisRoundNumAtoms == 32) {
+          buffersToUse->allSeenNeighborhoods32.zero();
+        } else if (thisRoundNumAtoms == 64) {
+          buffersToUse->allSeenNeighborhoods64.zero();
+        } else {
+          buffersToUse->allSeenNeighborhoods128.zero();
+        }
         solveOnGPUBatch<fpSize>(*buffersToUse,
                                 outputAccumulator,
                                 maxRadius,

diff --git a/tests/test_morgan_fingerprint.cpp b/tests/test_morgan_fingerprint.cpp
@@ -238,6 +238,42 @@ TEST(MorganFingerprintTest, GpuWithLargeMolecules) {
   }
 }
 
+TEST(MorganFingerprintTest, GpuConsistentAcrossDispatchRounds) {
+  // Regression: the per-thread GPU scratch (allSeenNeighborhoods) is reused across dispatch
+  // rounds. When it was only zeroed at allocation, molecules processed in rounds after the
+  // first saw stale neighborhood data from a prior round's molecule in the same slot, producing
+  // a false duplicate-environment match that dropped a bit. A small batch size over many
+  // molecules forces dozens of rounds and exposes this.
+  const unsigned int radius       = 3;
+  const unsigned int fpSize       = 2048;
+  auto               refGenerator = std::unique_ptr<RDKit::FingerprintGenerator<std::uint32_t>>(
+    RDKit::MorganFingerprint::getMorganGenerator<
+      std::uint32_t>(radius, false, false, true, false, nullptr, nullptr, fpSize, {1, 2, 4, 8}, false, false));
+
+  auto [mols, smiles] = loadNChemblMolecules(100, 128);
+  auto molsView       = makeMolsView(mols);
+
+  std::vector<std::unique_ptr<ExplicitBitVect>> refResults;
+  refResults.reserve(mols.size());
+  for (const auto& mol : mols) {
+    auto refFingerprint = std::unique_ptr<ExplicitBitVect>(refGenerator->getFingerprint(*mol));
+    ASSERT_NE(refFingerprint, nullptr);
+    refResults.push_back(std::move(refFingerprint));
+  }
+
+  auto                                generator = nvMolKit::MorganFingerprintGenerator(radius, fpSize);
+  nvMolKit::FingerprintComputeOptions options;
+  options.backend      = nvMolKit::FingerprintComputeBackend::GPU;
+  options.gpuBatchSize = 8;  // Small batch forces many dispatch rounds that reuse the scratch buffers.
+  auto newResults      = generator.GetFingerprints(molsView, options);
+
+  ASSERT_EQ(newResults.size(), mols.size());
+  for (size_t i = 0; i < mols.size(); i++) {
+    ASSERT_NE(newResults[i], nullptr);
+    ASSERT_EQ(*newResults[i], *refResults[i]) << "on element " << i << " with smiles " << smiles[i];
+  }
+}
+
 TEST(MorganFingerprintGpuTest, ThrowsRequestingCpuBackendGpuBuffer) {
   const unsigned int                  radius    = 3;
   const unsigned int                  fpSize    = 1024;