Optimize modulo and division used in clock-sweep algorithm

gburd · gburd · commit 06ffe4ded648 · 2025-08-12T15:13:00.000-04:00
Improve the performance of the buffer manager by replacing the modulo and division operations with the technique described in the paper "Division by Invariant Integers using Multiplication" [1]. Our implementation is inspired by the MIT-licensed "fastdiv" [2]. This algorithm provides accurate division and modulo in constant time, is pipeline- and ALU-friendly, and is estimated to take about 12-18 cycles (vs. 26-90 for hardware division). Because our divisor (NBuffers) is fixed at startup, we need to calculate the constants it uses only once. [1] https://gmplib.org/~tege/divcnst-pldi94.pdf [2] https://github.com/jmtilli/fastdiv
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
@@ -31,12 +31,28 @@ typedef struct
 {
 	/*
 	 * The clock-sweep counter is atomically updated by 1 at every tick.  Use
-	 * the macro CLOCKSWEEP_HAND() to find the location of the hand on the
-	 * clock. Use CLOCKSWEEP_PASSES() to calculate the number of times the
+	 * the function ClockSweepHand() to find the location of the hand on the
+	 * clock. Use ClockSweepPasses() to calculate the number of times the
 	 * clock-sweep hand has made a complete pass around the clock.
 	 */
 	pg_atomic_uint64 clockSweepCounter;
 
+	/*
+	 * Division and modulo can be expensive to calculate repeatedly.  Given
+	 * that the buffer manager is a very hot code path, we implement a more
+	 * efficient method based on "Division by Invariant Integers using
+	 * Multiplication" (https://gmplib.org/~tege/divcnst-pldi94.pdf) by
+	 * Granlund and Montgomery.  Our implementation below was inspired by the
+	 * MIT-licensed "fastdiv" (https://github.com/jmtilli/fastdiv).
+	 */
+	struct
+	{
+		uint32		mul;		/* low 32 bits of 2^(32+shift1+shift2) / NBuffers + 1 */
+		uint32		mod;		/* the divisor itself, i.e. NBuffers */
+		uint8		shift1:1;	/* 0 if NBuffers is a power of 2, else 1 */
+		uint8		shift2:7;	/* floor(log2(NBuffers)) */
+	}			md;
+
 	/*
 	 * Statistics.  These counters should be wide enough that they can't
 	 * overflow during a single bgwriter cycle.
@@ -86,10 +102,67 @@ static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
 static void AddBufferToRing(BufferAccessStrategy strategy,
 							BufferDesc *buf);
 
-#define CLOCKSWEEP_HAND(counter) \
-	((counter) & 0xFFFFFFFF) % NBuffers
-#define CLOCKSWEEP_PASSES(counter) \
-	(uint32) ((counter) / NBuffers)
+/*
+ * InvariantDivision - n / NBuffers via the precomputed md constants.
+ * The multiply-and-shift steps are exact only for 32-bit dividends, so a
+ * larger n (the 64-bit clock-sweep counter can get there) is divided
+ * directly, keeping the result equal to (uint32) (n / NBuffers).
+ */
+static inline uint32
+InvariantDivision(uint64 n)
+{
+	uint32		quotient;
+
+	if ((n >> 32) != 0)
+		return (uint32) (n / StrategyControl->md.mod);
+
+	quotient = (uint32) ((n * StrategyControl->md.mul) >> 32);
+	n = ((n - quotient) >> StrategyControl->md.shift1) + quotient;
+	return (uint32) (n >> StrategyControl->md.shift2);
+}
+
+/*
+ * InvariantModulo - (uint32) n % NBuffers via the precomputed md constants.
+ * Only the low 32 bits of n are used, as in the CLOCKSWEEP_HAND() macro this
+ * replaces: the multiply-and-shift steps are exact only for 32-bit
+ * dividends, and can yield an out-of-range hand once n reaches 2^32.
+ */
+static inline uint32
+InvariantModulo(uint64 n)
+{
+	uint32		on = (uint32) n;
+	uint32		quotient;
+
+	/* From here on n is the truncated dividend, then the exact quotient */
+	n = on;
+	quotient = (uint32) ((n * StrategyControl->md.mul) >> 32);
+	n = ((n - quotient) >> StrategyControl->md.shift1) + quotient;
+	n >>= StrategyControl->md.shift2;
+
+	return on - (uint32) (StrategyControl->md.mod * n);
+}
+
+/* Clock hand position for "counter"; assert builds recheck it with %. */
+static inline uint32
+ClockSweepHand(uint64 counter)
+{
+	uint32		result = InvariantModulo(counter);
+
+	Assert(result < NBuffers);
+	Assert(result == (uint32) counter % NBuffers);
+	return result;
+}
+
+/* Completed clock passes for "counter"; assert builds recheck it with /. */
+static inline uint32
+ClockSweepPasses(uint64 counter)
+{
+	uint32		result = InvariantDivision(counter);
+
+	Assert(result == (uint32) (counter / NBuffers));
+
+	return result;
+}
 
 /*
  * ClockSweepTick - Helper routine for StrategyGetBuffer()
@@ -110,7 +183,7 @@ ClockSweepTick(void)
 	 */
 	counter = pg_atomic_fetch_add_u64(&StrategyControl->clockSweepCounter, 1);
 
-	hand = CLOCKSWEEP_HAND(counter);
+	hand = ClockSweepHand(counter);
 	Assert(hand < NBuffers);
 
 	return hand;
@@ -244,10 +317,10 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
 	uint32		result;
 
 	counter = pg_atomic_read_u64(&StrategyControl->clockSweepCounter);
-	result = CLOCKSWEEP_HAND(counter);
+	result = ClockSweepHand(counter);
 
 	if (complete_passes)
-		*complete_passes = CLOCKSWEEP_PASSES(counter);
+		*complete_passes = ClockSweepPasses(counter);
 
 	if (num_buf_alloc)
 		*num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
@@ -326,11 +399,27 @@ StrategyInitialize(bool init)
 
 	if (!found)
 	{
+		uint8		shift2 = 0;
+		uint32		divisor = NBuffers;
+		uint8		is_pow2 = (divisor & (divisor - 1)) == 0 ? 0 : 1;
+
 		/*
 		 * Only done once, usually in postmaster
 		 */
 		Assert(init);
 
+		/* Calculate the constants used for speeding up division and modulo */
+		Assert(NBuffers > 0 && NBuffers < (1U << 31));
+
+		/* shift2 = floor(log2(NBuffers)) */
+		for (uint32 n = divisor; n >>= 1;)
+			shift2++;
+
+		StrategyControl->md.shift1 = is_pow2;
+		StrategyControl->md.shift2 = shift2;
+		StrategyControl->md.mod = NBuffers;
+		StrategyControl->md.mul = (1ULL << (32 + is_pow2 + shift2)) / NBuffers + 1;
+
 		/* Initialize combined clock-sweep pointer/complete passes counter */
 		pg_atomic_init_u64(&StrategyControl->clockSweepCounter, 0);