Skip to content

Commit ca79be4

Browse files
committed
Optimize modulo in clock-sweep algorithm
Improve performance by replacing the modulo operator, which compiles into a division instruction that can be slow on certain architectures. When the size of the clock (NBuffers) is a power of two, the modulo reduces to a bitwise AND with (size - 1) (4 instructions, ~3-4 cycles). When it isn't, we can replace the modulo with a 64-bit multiplication by a precomputed approximation of the inverse of the clock size, followed by a right shift, as described in the paper "Division by Invariant Integers using Multiplication" [1] (4 instructions, ~8-12 cycles). In both cases the branch should be predicted nearly 100% of the time, given that NBuffers never changes at runtime. In comparison, a modulo operation translates into a hardware division instruction (e.g. IDIV on x86), and the code would require ~26-90 cycles. Switching to this invariant-multiplication method uses common ALU operations that don't block the pipeline and allow better instruction-level parallelism. [1] https://gmplib.org/~tege/divcnst-pldi94.pdf
1 parent e9649ad commit ca79be4

1 file changed

Lines changed: 55 additions & 2 deletions

File tree

src/backend/storage/buffer/freelist.c

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ typedef struct ClockSweep
2727
{
2828
pg_atomic_uint64 counter; /* Only incremented by one */
2929
uint32_t size; /* Size of the clock */
30+
uint32_t m; /* Mask if power-of-2, or magic constant */
31+
bool pow2; /* True if power-of-2, else use fast_mod() */
3032
} ClockSweep;
3133

3234
/*
@@ -94,6 +96,33 @@ ClockSweepInit(ClockSweep *sweep, uint32 size)
9496
{
9597
pg_atomic_init_u64(&sweep->counter, 0);
9698
sweep->size = size;
99+
100+
if ((size & (size - 1)) == 0)
101+
{
102+
/* Power of 2: use simple mask */
103+
sweep->m = size - 1;
104+
sweep->pow2 = true;
105+
}
106+
else
107+
{
108+
/* Non-power of 2: calculate magic constant */
109+
sweep->m = ((1ULL << 32) + size - 1) / size; /* ceil() div */
110+
sweep->pow2 = false;
111+
}
112+
}
113+
114+
/*
 * fast_mod
 *		Compute n % divisor without a division instruction.
 *
 * 'magic' must be the pre-computed constant ceil(2^32 / divisor), i.e.
 * ((1ULL << 32) + divisor - 1) / divisor, and 'divisor' must be non-zero.
 *
 * Multiplying by 'magic' and shifting right by 32 yields an estimate of
 * n / divisor.  Because 'magic' is rounded *up*, the estimate is either
 * exact or one too large; it is never too small.  The error term is
 * n * (magic - 2^32/divisor) / 2^32, which is always < 1 for 32-bit n.
 *
 * Returns a value in the range [0, divisor).
 */
static inline uint32
fast_mod(uint32 n, uint32 divisor, uint64 magic)
{
	/* Estimated quotient: floor(n * magic / 2^32); may overshoot by one */
	uint32		quotient = (uint32) (((uint64) n * magic) >> 32);

	/*
	 * Candidate remainder, computed modulo 2^32.  If the quotient overshot,
	 * this has wrapped around and equals (true remainder - divisor) mod 2^32,
	 * which is necessarily >= divisor.
	 */
	uint32		remainder = n - quotient * divisor;

	/*
	 * Undo the overshoot by adding the divisor back; the unsigned addition
	 * wraps around to the true remainder.  (Subtracting here would be wrong:
	 * the quotient estimate can only be too large, never too small.)
	 */
	if (remainder >= divisor)
		remainder += divisor;

	return remainder;
}
98127

99128
/* Extract the number of complete cycles from the clock hand */
@@ -110,8 +139,20 @@ static inline uint32
110139
ClockSweepPosition(ClockSweep *sweep)
111140
{
112141
uint64 counter = pg_atomic_read_u64(&sweep->counter);
142+
uint32 current = (uint32) counter & 0xFFFFFFFF;
143+
uint32 result;
144+
145+
if (sweep->pow2)
146+
/* Power of 2: use mask */
147+
result = current & sweep->m;
148+
else
149+
/* Non-power of 2: use magic modulo */
150+
result = fast_mod(current, sweep->size, sweep->m);
151+
152+
Assert(result < sweep->size);
153+
Assert(result == (counter & 0xFFFFFFFF) % sweep->size);
113154

114-
return ((counter) & 0xFFFFFFFF) % sweep->size;
155+
return result;
115156
}
116157

117158
/*
@@ -121,8 +162,20 @@ static inline uint32
121162
ClockSweepTick(ClockSweep *sweep)
122163
{
123164
uint64 counter = pg_atomic_fetch_add_u64(&sweep->counter, 1);
165+
uint32 current = (uint32) counter & 0xFFFFFFFF;
166+
uint32 result;
167+
168+
if (sweep->pow2)
169+
/* Power of 2: use mask */
170+
result = current & sweep->m;
171+
else
172+
/* Non-power of 2: use magic modulo */
173+
result = fast_mod(current, sweep->size, sweep->m);
124174

125-
return ((counter) & 0xFFFFFFFF) % sweep->size;
175+
Assert(result < sweep->size);
176+
Assert(result == (counter & 0xFFFFFFFF) % sweep->size);
177+
178+
return result;
126179
}
127180

128181
/*

0 commit comments

Comments
 (0)