From a42e2e5aab709caadbadbf72f61c79eae6e27951 Mon Sep 17 00:00:00 2001 From: Kevin Boyd Date: Fri, 5 Jun 2026 14:19:38 -0400 Subject: [PATCH 1/2] Fix GH issue 195 --- nvmolkit/tests/test_fingerprints.py | 28 ++++++++++++++++++++++++++++ src/morgan_fingerprint_gpu.cpp | 16 ++++++++++++---- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/nvmolkit/tests/test_fingerprints.py b/nvmolkit/tests/test_fingerprints.py index a1ddad40..4deea42a 100644 --- a/nvmolkit/tests/test_fingerprints.py +++ b/nvmolkit/tests/test_fingerprints.py @@ -145,3 +145,31 @@ def test_gh_issue_84(): gen = MorganFingerprintGenerator(radius=radius, fpSize=fp_size) bits = unpack_fingerprint(gen.GetFingerprints([mol]).torch()).sum().item() assert bits > 0, f"Got empty fingerprint for BINAP on attempt {i}" + + +def test_gh_issue_195(): + """Regression test for https://github.com/NVIDIA-BioNeMo/nvMolKit/issues/195. + + A batch containing only molecules larger than the 128 atom/bond GPU buckets + used to produce empty fingerprints. + """ + radius = 3 + fp_size = 2048 + + large_mol = Chem.MolFromSmiles("NCC(=O)" * 40) + assert large_mol is not None + assert large_mol.GetNumAtoms() >= 128 or large_mol.GetNumBonds() >= 128 + + rdkit_gen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=fp_size) + ref_bits = rdkit_gen.GetFingerprint(large_mol).ToList() + assert sum(ref_bits) > 0 + + nvmolkit_gen = MorganFingerprintGenerator(radius=radius, fpSize=fp_size) + + for batch in ([large_mol], [large_mol, large_mol]): + unpacked = unpack_fingerprint(nvmolkit_gen.GetFingerprints(batch).torch()) + torch.cuda.synchronize() + assert unpacked.shape == (len(batch), fp_size) + for row in range(len(batch)): + assert unpacked[row].sum().item() > 0, "Large-only batch produced an empty fingerprint" + torch.testing.assert_close(ref_bits, unpacked[row].to(int).tolist()) diff --git a/src/morgan_fingerprint_gpu.cpp b/src/morgan_fingerprint_gpu.cpp index b37349bd..8ef48e37 100644 --- a/src/morgan_fingerprint_gpu.cpp +++ b/src/morgan_fingerprint_gpu.cpp @@ -266,10 +266,18 @@ AsyncDeviceVector> computeFingerprintsCuImpl(const std::vect workLarge.push_back(i); } } - const size_t numThreads32 = (work32.size() + dispatchChunkSize - 1) / dispatchChunkSize; - const size_t numThreads64 = (work64.size() + dispatchChunkSize - 1) / dispatchChunkSize; - const size_t numThreads128 = (work128.size() + dispatchChunkSize - 1) / dispatchChunkSize; - const size_t numThreadsTotal = numThreads32 + numThreads64 + numThreads128; + const size_t numThreads32 = (work32.size() + dispatchChunkSize - 1) / dispatchChunkSize; + const size_t numThreads64 = (work64.size() + dispatchChunkSize - 1) / dispatchChunkSize; + const size_t numThreads128 = (work128.size() + dispatchChunkSize - 1) / dispatchChunkSize; + size_t numThreadsTotal = numThreads32 + numThreads64 + numThreads128; + // Large molecules are drained from the shared workLarge queue inside the worker + // loop below, which only runs numThreadsTotal iterations. When every molecule is + // large there is no small/medium work, so without dedicated iterations the queue + // would never be drained and those molecules would get empty fingerprints. Spread + // the drain across the available threads in that case. + if (numThreadsTotal == 0) { + numThreadsTotal = std::min(workLarge.size(), static_cast(nThreadsActual)); + } detail::OpenMPExceptionRegistry exceptionRegistry; #pragma omp parallel for num_threads(nThreadsActual) default(none) shared(numThreadsTotal, \ From fa3f784b931edcf5c57a70a41d935700b9b49fea Mon Sep 17 00:00:00 2001 From: Kevin Boyd Date: Sat, 6 Jun 2026 07:55:22 -0400 Subject: [PATCH 2/2] Fix morgan fingerprint stale cache bug --- src/morgan_fingerprint_gpu.cpp | 12 ++++++++--- tests/test_morgan_fingerprint.cpp | 36 +++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/src/morgan_fingerprint_gpu.cpp b/src/morgan_fingerprint_gpu.cpp index 8ef48e37..8e43a568 100644 --- a/src/morgan_fingerprint_gpu.cpp +++ b/src/morgan_fingerprint_gpu.cpp @@ -122,15 +122,12 @@ void allocateGpuBatch(MorganGPUBuffersBatch& buffers, switch (maxAtoms) { case 32: buffers.allSeenNeighborhoods32 = AsyncDeviceVector>(numMols * 32 * (radius + 1), stream); - buffers.allSeenNeighborhoods32.zero(); break; case 64: buffers.allSeenNeighborhoods64 = AsyncDeviceVector>(numMols * 64 * (radius + 1), stream); - buffers.allSeenNeighborhoods64.zero(); break; case 128: buffers.allSeenNeighborhoods128 = AsyncDeviceVector>(numMols * 128 * (radius + 1), stream); - buffers.allSeenNeighborhoods128.zero(); break; default: throw std::runtime_error("Unsupported max atoms for Morgan fingerprint GPU: " + std::to_string(maxAtoms)); @@ -424,6 +421,15 @@ AsyncDeviceVector> computeFingerprintsCuImpl(const std::vect buffersToUse->outputIndices.copyFromHost(threadCpuBuffers.h_outputIndices.data(), scopedChunkSize); cudaCheckError(cudaEventRecord(threadCpuBuffers.prevMemcpyDoneEvent.event(), stream)); rangeMemcpy.pop(); + // The kernel uses allSeenNeighborhoods as scratch that must be zeroed on entry. The buffers are + // reused across dispatch rounds, so reset the scratch for this round before launching. + if (thisRoundNumAtoms == 32) { + buffersToUse->allSeenNeighborhoods32.zero(); + } else if (thisRoundNumAtoms == 64) { + buffersToUse->allSeenNeighborhoods64.zero(); + } else { + buffersToUse->allSeenNeighborhoods128.zero(); + } solveOnGPUBatch(*buffersToUse, outputAccumulator, maxRadius, diff --git a/tests/test_morgan_fingerprint.cpp b/tests/test_morgan_fingerprint.cpp index 431abefb..2ec6b26b 100644 --- a/tests/test_morgan_fingerprint.cpp +++ b/tests/test_morgan_fingerprint.cpp @@ -238,6 +238,42 @@ TEST(MorganFingerprintTest, GpuWithLargeMolecules) { } } +TEST(MorganFingerprintTest, GpuConsistentAcrossDispatchRounds) { + // Regression: the per-thread GPU scratch (allSeenNeighborhoods) is reused across dispatch + // rounds. When it was only zeroed at allocation, molecules processed in rounds after the + // first saw stale neighborhood data from a prior round's molecule in the same slot, producing + // a false duplicate-environment match that dropped a bit. A small batch size over many + // molecules forces dozens of rounds and exposes this. + const unsigned int radius = 3; + const unsigned int fpSize = 2048; + auto refGenerator = std::unique_ptr>( + RDKit::MorganFingerprint::getMorganGenerator< + std::uint32_t>(radius, false, false, true, false, nullptr, nullptr, fpSize, {1, 2, 4, 8}, false, false)); + + auto [mols, smiles] = loadNChemblMolecules(100, 128); + auto molsView = makeMolsView(mols); + + std::vector> refResults; + refResults.reserve(mols.size()); + for (const auto& mol : mols) { + auto refFingerprint = std::unique_ptr(refGenerator->getFingerprint(*mol)); + ASSERT_NE(refFingerprint, nullptr); + refResults.push_back(std::move(refFingerprint)); + } + + auto generator = nvMolKit::MorganFingerprintGenerator(radius, fpSize); + nvMolKit::FingerprintComputeOptions options; + options.backend = nvMolKit::FingerprintComputeBackend::GPU; + options.gpuBatchSize = 8; // Small batch forces many dispatch rounds that reuse the scratch buffers. + auto newResults = generator.GetFingerprints(molsView, options); + + ASSERT_EQ(newResults.size(), mols.size()); + for (size_t i = 0; i < mols.size(); i++) { + ASSERT_NE(newResults[i], nullptr); + ASSERT_EQ(*newResults[i], *refResults[i]) << "on element " << i << " with smiles " << smiles[i]; + } +} + TEST(MorganFingerprintGpuTest, ThrowsRequestingCpuBackendGpuBuffer) { const unsigned int radius = 3; const unsigned int fpSize = 1024;