Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.ConcurrentModificationException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
Expand Down Expand Up @@ -558,7 +559,7 @@ public void onTrigger(final ProcessContext context, final ProcessSession session
// Update signal if needed.
try {
if (waitCompleted) {
protocol.complete(signalId);
protocol.complete(signal);
if (logger.isDebugEnabled()) {
logger.debug("Completed wait for signalId='{}' and removed signal from cache", signalId);
}
Expand All @@ -570,6 +571,10 @@ public void onTrigger(final ProcessContext context, final ProcessSession session
}
}

} catch (final ConcurrentModificationException e) {
logger.warn("Concurrent modification detected for signal [{}], rolling back session to retry: {}", signalId, e.getMessage());
session.rollback();
throw new ProcessException(String.format("Concurrent modification detected while updating signal %s", signalId), e);
} catch (final IOException e) {
session.rollback();
throw new ProcessException(String.format("Unable to communicate with cache while updating %s due to %s", signalId, e), e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,11 @@ public Signal notify(final String signalId, final Map<String, Integer> deltas, f
signal.counts.put(counterName, count);
});

if (replace(signal)) {
try {
replace(signal);
return signal;
} catch (final ConcurrentModificationException ignored) {
// CAS failed; retry with fresh signal state after backoff.
}

long waitMillis = REPLACE_RETRY_WAIT_MILLIS * (i + 1);
Expand Down Expand Up @@ -238,7 +241,7 @@ public Signal notify(final String signalId, final String counterName, final int

/**
* Retrieve a stored Signal in the cache engine.
* If a caller gets satisfied with the returned Signal state and finish waiting, it should call {@link #complete(String)}
* If a caller is satisfied with the returned Signal state and finishes waiting, it should call {@link #complete(Signal)}
* to complete the Wait Notify protocol.
* @param signalId a key in the underlying cache engine
* @return A Signal instance
Expand Down Expand Up @@ -281,22 +284,55 @@ public Signal getSignal(final String signalId) throws IOException, Deserializati

/**
* Finish protocol and remove the cache entry.
* @param signalId a key in the underlying cache engine
*
* <p>This method performs a best-effort version check before removing the entry. If the signal
* was concurrently modified by a Notify processor after the caller last read it, a
* {@link ConcurrentModificationException} is thrown so the caller can roll back and retry
* rather than silently discarding the concurrent notification.</p>
*
* <p>Note: there is a small inherent TOCTOU (time-of-check to time-of-use) window between the
* version re-fetch and the remove call. An {@link AtomicDistributedMapCacheClient} API extension for
* atomic compare-and-delete would eliminate this entirely, but this approach covers the common case.</p>
*
* @param signal the Signal obtained from the most recent {@link #getSignal(String)} call;
* its cached revision is used to detect concurrent modifications
* @throws IOException thrown when it failed interacting with the cache engine
* @throws ConcurrentModificationException thrown if the signal was concurrently modified
* or removed since the caller last read it
*/
public void complete(final String signalId) throws IOException {
public void complete(final Signal signal) throws IOException, ConcurrentModificationException {
final String signalId = signal.identifier;

// Re-fetch to detect concurrent updates since the signal was last read.
final Signal current = getSignal(signalId);
if (current == null) {
// getSignal() found nothing under this id; report it as a concurrent removal instead of removing again.
throw new ConcurrentModificationException(String.format(
"Failed to complete signal [%s]: signal was concurrently removed.", signalId));
}

// expectedRevision is null when the caller's signal has no cached entry or that entry carries no
// revision; in that case the comparison below is skipped and the entry is removed unconditionally.
final Object expectedRevision = signal.cachedEntry != null ? signal.cachedEntry.getRevision().orElse(null) : null;
// NOTE(review): current.cachedEntry is dereferenced without a null check — presumably getSignal()
// always attaches the fetched cache entry to the Signal it returns; confirm.
final Object actualRevision = current.cachedEntry.getRevision().orElse(null);
if (expectedRevision != null && !expectedRevision.equals(actualRevision)) {
// Revisions differ: leave the entry in place and let the caller decide how to retry.
throw new ConcurrentModificationException(String.format(
"Failed to complete signal [%s]: signal was concurrently modified (expected revision %s, found %s).",
signalId, expectedRevision, actualRevision));
}

// The remove is a separate call from the re-fetch above, so the check and the removal are not one atomic step.
cache.remove(signalId, stringSerializer);
}

/**
* Serializes the given Signal to JSON and hands it to the cache's replace operation.
*
* @param signal the Signal to store; its {@code cachedEntry} is created here when absent and
*               otherwise reused with the new JSON as its value
* @throws IOException declared for the JSON serialization and the cache call made here
* @throws ConcurrentModificationException thrown when the cache's replace call returns {@code false}
*/
public boolean replace(final Signal signal) throws IOException {
public void replace(final Signal signal) throws IOException, ConcurrentModificationException {

// The whole Signal object is written as one JSON string; that string is the cache value.
final String signalJson = objectMapper.writeValueAsString(signal);
if (signal.cachedEntry == null) {
// No entry is attached to this signal yet: build one keyed by the signal identifier with a null revision.
signal.cachedEntry = new AtomicCacheEntry<>(signal.identifier, signalJson, null);
} else {
// Reuse the attached entry (and whatever revision it holds); only the value changes.
signal.cachedEntry.setValue(signalJson);
}
return cache.replace(signal.cachedEntry, stringSerializer, stringSerializer);
if (!cache.replace(signal.cachedEntry, stringSerializer, stringSerializer)) {
// A false result from the cache is surfaced to the caller as an exception.
throw new ConcurrentModificationException(String.format(
"Failed to update signal [%s] in cache due to concurrent modification.", signal.identifier));
}

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
*/
package org.apache.nifi.processors.standard;

import org.apache.nifi.distributed.cache.client.AtomicCacheEntry;
import org.apache.nifi.distributed.cache.client.Deserializer;
import org.apache.nifi.distributed.cache.client.Serializer;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processors.standard.TestNotify.MockCacheClient;
import org.apache.nifi.reporting.InitializationException;
Expand Down Expand Up @@ -785,6 +788,115 @@ public void testOpenGate() throws IOException {
assertEquals(0, signal.getReleasableCount());
}

/**
 * A {@link MockCacheClient} extension that can simulate concurrent-modification scenarios:
 * it can make the next {@link #replace} call report failure, and it can return an entry whose
 * revision has been artificially advanced from one chosen {@link #fetch} call.
 *
 * <p>The one-shot failure flag and the fetch counter are atomic types, so consuming the flag and
 * numbering fetch calls are single atomic steps rather than separate read and write operations
 * on a {@code volatile} field.</p>
 */
private static class ControllableCacheClient extends MockCacheClient {
    /** One-shot flag: when set, the next replace() reports failure and clears the flag. */
    private final java.util.concurrent.atomic.AtomicBoolean failNextReplace =
            new java.util.concurrent.atomic.AtomicBoolean(false);
    /** Number of fetch() calls seen since the last setBumpRevisionOnFetchCall(). */
    private final java.util.concurrent.atomic.AtomicInteger fetchCallCount =
            new java.util.concurrent.atomic.AtomicInteger();
    /** 1-based fetch call number whose result gets a bumped revision; -1 never matches a call number. */
    private volatile int bumpRevisionOnFetchCall = -1;

    /** Arms the one-shot failure: the next replace() call returns {@code false}. */
    void setFailNextReplace() {
        failNextReplace.set(true);
    }

    /**
     * Restarts fetch-call numbering and selects which upcoming fetch() call returns a bumped revision.
     *
     * @param callNumber 1-based index of the fetch() call, counted from this invocation
     */
    void setBumpRevisionOnFetchCall(final int callNumber) {
        // Reset the counter before publishing the target so numbering starts from this point.
        fetchCallCount.set(0);
        bumpRevisionOnFetchCall = callNumber;
    }

    /**
     * Returns {@code false} once if a failure was armed via {@link #setFailNextReplace()};
     * otherwise delegates to the mock's normal replace.
     */
    @Override
    public <K, V> boolean replace(final AtomicCacheEntry<K, V, Long> entry,
            final Serializer<K> keySerializer, final Serializer<V> valueSerializer) throws IOException {
        // compareAndSet consumes the flag atomically, so exactly one call observes the armed failure.
        if (failNextReplace.compareAndSet(true, false)) {
            return false;
        }
        return super.replace(entry, keySerializer, valueSerializer);
    }

    /**
     * Delegates to the mock's fetch; on the selected call number, and only when an entry exists,
     * returns a copy of that entry with its revision increased by 999 (a missing revision counts as 0).
     */
    @Override
    @SuppressWarnings("unchecked")
    public <K, V> AtomicCacheEntry<K, V, Long> fetch(final K key,
            final Serializer<K> keySerializer, final Deserializer<V> valueDeserializer) throws IOException {
        final AtomicCacheEntry<K, V, Long> entry = super.fetch(key, keySerializer, valueDeserializer);
        // Every call is counted, including ones that find no entry.
        final int call = fetchCallCount.incrementAndGet();
        if (bumpRevisionOnFetchCall == call && entry != null) {
            return new AtomicCacheEntry<>(entry.getKey(), entry.getValue(),
                    entry.getRevision().orElse(0L) + 999L);
        }
        return entry;
    }
}

/**
 * Covers the case where Wait tries to write a partially consumed signal back to the cache and
 * the cache rejects the write. The run is expected to fail with a {@link ProcessException}
 * whose cause is a {@link ConcurrentModificationException}, so that FlowFiles are retried
 * rather than silently lost.
 */
@Test
public void testWaitProgressedRollsBackOnConcurrentReplaceFailure() throws Exception {
    final ControllableCacheClient cacheClient = new ControllableCacheClient();
    runner.addControllerService("controllable-service", cacheClient);
    runner.enableControllerService(cacheClient);

    runner.setProperty(Wait.DISTRIBUTED_CACHE_SERVICE, "controllable-service");
    runner.setProperty(Wait.RELEASE_SIGNAL_IDENTIFIER, "${releaseSignalAttribute}");
    runner.setProperty(Wait.TARGET_SIGNAL_COUNT, "1");

    // Seed the signal with a count of 2 against a target of 1. The intent is that releasing the
    // single FlowFile leaves a remainder on the signal, so Wait updates the signal through
    // protocol.replace(signal) instead of completing it.
    new WaitNotifyProtocol(cacheClient).notify("key", WaitNotifyProtocol.DEFAULT_COUNT_NAME, 2, null);

    final Map<String, String> flowFileAttributes = new HashMap<>();
    flowFileAttributes.put("releaseSignalAttribute", "key");
    runner.enqueue(new byte[]{}, flowFileAttributes);

    // Armed only after the seeding notify above: the next replace() reports false, as if a
    // concurrent Notify had won the CAS slot.
    cacheClient.setFailNextReplace();

    final AssertionError runFailure = assertThrows(AssertionError.class, () -> runner.run());
    final ProcessException processFailure = assertInstanceOf(ProcessException.class, runFailure.getCause());
    assertInstanceOf(ConcurrentModificationException.class, processFailure.getCause());
}

/**
 * Covers the case where {@code protocol.complete(signal)} sees a different revision on its
 * re-fetch than the one Wait originally read. The run is expected to fail with a
 * {@link ProcessException} whose cause is a {@link ConcurrentModificationException}, so the
 * concurrent Notify update is not silently discarded.
 */
@Test
public void testWaitCompletedRollsBackOnConcurrentSignalModification() throws Exception {
    final ControllableCacheClient cacheClient = new ControllableCacheClient();
    runner.addControllerService("controllable-service", cacheClient);
    runner.enableControllerService(cacheClient);

    runner.setProperty(Wait.DISTRIBUTED_CACHE_SERVICE, "controllable-service");
    runner.setProperty(Wait.RELEASE_SIGNAL_IDENTIFIER, "${releaseSignalAttribute}");
    runner.setProperty(Wait.TARGET_SIGNAL_COUNT, "1");

    // Seed exactly one count for a target of one.
    new WaitNotifyProtocol(cacheClient).notify("key", WaitNotifyProtocol.DEFAULT_COUNT_NAME, 1, null);

    final Map<String, String> flowFileAttributes = new HashMap<>();
    flowFileAttributes.put("releaseSignalAttribute", "key");
    runner.enqueue(new byte[]{}, flowFileAttributes);

    // Fetch numbering restarts here, after the seeding notify. The test relies on this ordering
    // during runner.run():
    //   fetch #1 - Wait's own getSignal(), answered normally
    //   fetch #2 - the re-fetch inside complete(), answered with a bumped revision
    //              (standing in for a concurrent Notify)
    cacheClient.setBumpRevisionOnFetchCall(2);

    final AssertionError runFailure = assertThrows(AssertionError.class, () -> runner.run());
    final ProcessException processFailure = assertInstanceOf(ProcessException.class, runFailure.getCause());
    assertInstanceOf(ConcurrentModificationException.class, processFailure.getCause());
}

@Test
void testMigrateProperties() {
final Map<String, String> expectedRenamed = Map.ofEntries(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,66 @@ public void testReleaseCandidateTotal() throws Exception {

}

/**
 * Happy path for {@code complete}: a signal that was notified, read back and then completed
 * without interference is no longer present in the backing map.
 */
@Test
public void testCompleteRemovesSignalFromCache() throws Exception {
    // Let remove() on the mock actually drop the key from the backing map.
    doAnswer(invocation -> {
        cacheEntries.remove(invocation.getArguments()[0]);
        return true;
    }).when(cache).remove(any(), any());
    doAnswer(successfulReplace).when(cache).replace(any(), any(), any());

    final String signalId = "signal-id";
    final WaitNotifyProtocol waitNotify = new WaitNotifyProtocol(cache);

    // Create the entry and make sure it really landed in the backing map.
    waitNotify.notify(signalId, "a", 1, null);
    assertTrue(cacheEntries.containsKey(signalId));

    final Signal fetched = waitNotify.getSignal(signalId);
    assertNotNull(fetched);

    // Completing with the freshly read signal must remove the entry.
    waitNotify.complete(fetched);
    assertFalse(cacheEntries.containsKey(signalId));
}

/**
 * Verifies that {@code complete} refuses to remove an entry that was updated after the caller
 * read it: it must throw {@link ConcurrentModificationException} and must not call
 * {@code remove} on the cache.
 */
@Test
public void testCompleteThrowsOnConcurrentModification() throws Exception {
    doAnswer(successfulReplace).when(cache).replace(any(), any(), any());

    final WaitNotifyProtocol protocol = new WaitNotifyProtocol(cache);
    final String signalId = "signal-id";

    // First notify creates the signal; this read is the state the "waiting" side holds on to.
    protocol.notify(signalId, "a", 1, null);
    final Signal signalBeforeRace = protocol.getSignal(signalId);
    assertNotNull(signalBeforeRace);

    // Simulate a concurrent Notify that updates the signal after it was read.
    protocol.notify(signalId, "a", 1, null);

    // complete() with the stale signal should detect the revision mismatch and throw.
    assertThrows(ConcurrentModificationException.class, () -> protocol.complete(signalBeforeRace));

    // The entry must still be present.
    assertTrue(cacheEntries.containsKey(signalId));
    // remove() is not stubbed in this test, so the map check above cannot by itself detect an
    // erroneous removal; assert directly that complete() never reached cache.remove().
    org.mockito.Mockito.verify(cache, org.mockito.Mockito.never()).remove(any(), any());
}

/**
 * Verifies that {@code complete} throws {@link ConcurrentModificationException} when the entry
 * has disappeared from the cache between the caller's read and the call to {@code complete}.
 */
@Test
public void testCompleteThrowsWhenAlreadyRemoved() throws Exception {
    doAnswer(successfulReplace).when(cache).replace(any(), any(), any());

    final WaitNotifyProtocol protocol = new WaitNotifyProtocol(cache);
    final String signalId = "signal-id";

    protocol.notify(signalId, "a", 1, null);
    // Preconditions, matching the sibling tests: if setup fails, the test should fail here
    // rather than with an unrelated exception inside assertThrows below.
    assertTrue(cacheEntries.containsKey(signalId));
    final Signal signal = protocol.getSignal(signalId);
    assertNotNull(signal);

    // Remove the entry from the cache directly, simulating concurrent removal by another process.
    cacheEntries.remove(signalId);

    // complete() on a signal that was concurrently removed should throw ConcurrentModificationException.
    assertThrows(ConcurrentModificationException.class, () -> protocol.complete(signal));
}

/**
 * Asserts that two strings are equal once each has been parsed into a tree by {@code mapper},
 * i.e. the comparison is made on the parsed trees rather than on the raw text.
 *
 * @param expected the expected document text
 * @param value the actual document text
 * @throws Exception if either string cannot be parsed
 */
public void assertValueEquals(String expected, String value) throws Exception {
    final Object expectedTree = mapper.readTree(expected);
    final Object actualTree = mapper.readTree(value);
    assertEquals(expectedTree, actualTree);
}
Expand Down
Loading