Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions src/morgan_fingerprint_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,15 +122,12 @@ void allocateGpuBatch(MorganGPUBuffersBatch& buffers,
switch (maxAtoms) {
case 32:
buffers.allSeenNeighborhoods32 = AsyncDeviceVector<FlatBitVect<32>>(numMols * 32 * (radius + 1), stream);
buffers.allSeenNeighborhoods32.zero();
break;
case 64:
buffers.allSeenNeighborhoods64 = AsyncDeviceVector<FlatBitVect<64>>(numMols * 64 * (radius + 1), stream);
buffers.allSeenNeighborhoods64.zero();
break;
case 128:
buffers.allSeenNeighborhoods128 = AsyncDeviceVector<FlatBitVect<128>>(numMols * 128 * (radius + 1), stream);
buffers.allSeenNeighborhoods128.zero();
break;
default:
throw std::runtime_error("Unsupported max atoms for Morgan fingerprint GPU: " + std::to_string(maxAtoms));
Expand Down Expand Up @@ -424,6 +421,15 @@ AsyncDeviceVector<FlatBitVect<fpSize>> computeFingerprintsCuImpl(const std::vect
buffersToUse->outputIndices.copyFromHost(threadCpuBuffers.h_outputIndices.data(), scopedChunkSize);
cudaCheckError(cudaEventRecord(threadCpuBuffers.prevMemcpyDoneEvent.event(), stream));
rangeMemcpy.pop();
// The kernel uses allSeenNeighborhoods as scratch that must be zeroed on entry. The buffers are
// reused across dispatch rounds, so reset the scratch for this round before launching.
if (thisRoundNumAtoms == 32) {
buffersToUse->allSeenNeighborhoods32.zero();
} else if (thisRoundNumAtoms == 64) {
buffersToUse->allSeenNeighborhoods64.zero();
} else {
buffersToUse->allSeenNeighborhoods128.zero();
}
solveOnGPUBatch<fpSize>(*buffersToUse,
outputAccumulator,
maxRadius,
Expand Down
36 changes: 36 additions & 0 deletions tests/test_morgan_fingerprint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,42 @@ TEST(MorganFingerprintTest, GpuWithLargeMolecules) {
}
}

TEST(MorganFingerprintTest, GpuConsistentAcrossDispatchRounds) {
// Regression: the per-thread GPU scratch (allSeenNeighborhoods) is reused across dispatch
// rounds. When it was only zeroed at allocation, molecules processed in rounds after the
// first saw stale neighborhood data from a prior round's molecule in the same slot, producing
// a false duplicate-environment match that dropped a bit. A small batch size over many
// molecules forces dozens of rounds and exposes this.
const unsigned int radius = 3;
const unsigned int fpSize = 2048;
auto refGenerator = std::unique_ptr<RDKit::FingerprintGenerator<std::uint32_t>>(
RDKit::MorganFingerprint::getMorganGenerator<
std::uint32_t>(radius, false, false, true, false, nullptr, nullptr, fpSize, {1, 2, 4, 8}, false, false));

auto [mols, smiles] = loadNChemblMolecules(100, 128);
auto molsView = makeMolsView(mols);

std::vector<std::unique_ptr<ExplicitBitVect>> refResults;
refResults.reserve(mols.size());
for (const auto& mol : mols) {
auto refFingerprint = std::unique_ptr<ExplicitBitVect>(refGenerator->getFingerprint(*mol));
ASSERT_NE(refFingerprint, nullptr);
refResults.push_back(std::move(refFingerprint));
}

auto generator = nvMolKit::MorganFingerprintGenerator(radius, fpSize);
nvMolKit::FingerprintComputeOptions options;
options.backend = nvMolKit::FingerprintComputeBackend::GPU;
options.gpuBatchSize = 8; // Small batch forces many dispatch rounds that reuse the scratch buffers.
auto newResults = generator.GetFingerprints(molsView, options);

ASSERT_EQ(newResults.size(), mols.size());
for (size_t i = 0; i < mols.size(); i++) {
ASSERT_NE(newResults[i], nullptr);
ASSERT_EQ(*newResults[i], *refResults[i]) << "on element " << i << " with smiles " << smiles[i];
}
}

TEST(MorganFingerprintGpuTest, ThrowsRequestingCpuBackendGpuBuffer) {
const unsigned int radius = 3;
const unsigned int fpSize = 1024;
Expand Down
Loading