diff --git a/src/morgan_fingerprint_gpu.cpp b/src/morgan_fingerprint_gpu.cpp index 8ef48e37..8e43a568 100644 --- a/src/morgan_fingerprint_gpu.cpp +++ b/src/morgan_fingerprint_gpu.cpp @@ -122,15 +122,12 @@ void allocateGpuBatch(MorganGPUBuffersBatch& buffers, switch (maxAtoms) { case 32: buffers.allSeenNeighborhoods32 = AsyncDeviceVector>(numMols * 32 * (radius + 1), stream); - buffers.allSeenNeighborhoods32.zero(); break; case 64: buffers.allSeenNeighborhoods64 = AsyncDeviceVector>(numMols * 64 * (radius + 1), stream); - buffers.allSeenNeighborhoods64.zero(); break; case 128: buffers.allSeenNeighborhoods128 = AsyncDeviceVector>(numMols * 128 * (radius + 1), stream); - buffers.allSeenNeighborhoods128.zero(); break; default: throw std::runtime_error("Unsupported max atoms for Morgan fingerprint GPU: " + std::to_string(maxAtoms)); @@ -424,6 +421,15 @@ AsyncDeviceVector> computeFingerprintsCuImpl(const std::vect buffersToUse->outputIndices.copyFromHost(threadCpuBuffers.h_outputIndices.data(), scopedChunkSize); cudaCheckError(cudaEventRecord(threadCpuBuffers.prevMemcpyDoneEvent.event(), stream)); rangeMemcpy.pop(); + // The kernel uses allSeenNeighborhoods as scratch that must be zeroed on entry. The buffers are + // reused across dispatch rounds, so reset the scratch for this round before launching. + if (thisRoundNumAtoms == 32) { + buffersToUse->allSeenNeighborhoods32.zero(); + } else if (thisRoundNumAtoms == 64) { + buffersToUse->allSeenNeighborhoods64.zero(); + } else { + buffersToUse->allSeenNeighborhoods128.zero(); + } solveOnGPUBatch(*buffersToUse, outputAccumulator, maxRadius, diff --git a/tests/test_morgan_fingerprint.cpp b/tests/test_morgan_fingerprint.cpp index 431abefb..2ec6b26b 100644 --- a/tests/test_morgan_fingerprint.cpp +++ b/tests/test_morgan_fingerprint.cpp @@ -238,6 +238,42 @@ TEST(MorganFingerprintTest, GpuWithLargeMolecules) { } } +TEST(MorganFingerprintTest, GpuConsistentAcrossDispatchRounds) { + // Regression: the per-thread GPU scratch (allSeenNeighborhoods) is reused across dispatch + // rounds. When it was only zeroed at allocation, molecules processed in rounds after the + // first saw stale neighborhood data from a prior round's molecule in the same slot, producing + // a false duplicate-environment match that dropped a bit. A small batch size over many + // molecules forces dozens of rounds and exposes this. + const unsigned int radius = 3; + const unsigned int fpSize = 2048; + auto refGenerator = std::unique_ptr>( + RDKit::MorganFingerprint::getMorganGenerator< + std::uint32_t>(radius, false, false, true, false, nullptr, nullptr, fpSize, {1, 2, 4, 8}, false, false)); + + auto [mols, smiles] = loadNChemblMolecules(100, 128); + auto molsView = makeMolsView(mols); + + std::vector> refResults; + refResults.reserve(mols.size()); + for (const auto& mol : mols) { + auto refFingerprint = std::unique_ptr(refGenerator->getFingerprint(*mol)); + ASSERT_NE(refFingerprint, nullptr); + refResults.push_back(std::move(refFingerprint)); + } + + auto generator = nvMolKit::MorganFingerprintGenerator(radius, fpSize); + nvMolKit::FingerprintComputeOptions options; + options.backend = nvMolKit::FingerprintComputeBackend::GPU; + options.gpuBatchSize = 8; // Small batch forces many dispatch rounds that reuse the scratch buffers. + auto newResults = generator.GetFingerprints(molsView, options); + + ASSERT_EQ(newResults.size(), mols.size()); + for (size_t i = 0; i < mols.size(); i++) { + ASSERT_NE(newResults[i], nullptr); + ASSERT_EQ(*newResults[i], *refResults[i]) << "on element " << i << " with smiles " << smiles[i]; + } +} + TEST(MorganFingerprintGpuTest, ThrowsRequestingCpuBackendGpuBuffer) { const unsigned int radius = 3; const unsigned int fpSize = 1024;