@@ -62,6 +62,8 @@ struct ClusterNative {
// Fixed-point scale of the packed pad coordinate:
// ~60 is needed for 0.1mm precision, but power of two avoids rounding.
static constexpr int scalePadPacked = 64;
// 1/32nd of pad/timebin precision for cluster size.
static constexpr int scaleSigmaTimePacked = 32;
static constexpr int scaleSigmaPadPacked = 32;
// Granularity of the total charge stored by setSaturatedQtot(): the setter divides the
// charge by this factor and getSaturatedQtot() multiplies it back.
static constexpr int scaleSaturatedQTot = 4;
// Largest total charge setSaturatedQtot() keeps without clamping, i.e. UINT16_MAX stored
// units of scaleSaturatedQTot each.
static constexpr int maxSaturatedQTot = scaleSaturatedQTot * UINT16_MAX;
6567
6668 uint32_t timeFlagsPacked; // < Contains the time in the lower 24 bits in a packed format, contains the flags in the
6769 // upper 8 bits
@@ -138,6 +140,31 @@ struct ClusterNative {
138140 sigmaPadPacked = tmp;
139141 }
140142
143+ GPUd () bool isSaturated () const { return qMax >= 1023 ; }
144+
145+ GPUd () void setSaturatedQtot (uint32_t qtot)
146+ {
147+ if (qtot > maxSaturatedQTot) {
148+ qtot = maxSaturatedQTot;
149+ }
150+ this ->qTot = qtot / scaleSaturatedQTot;
151+ }
152+
153+ GPUd () uint32_t getSaturatedQtot () const
154+ {
155+ return uint32_t (qTot) * scaleSaturatedQTot;
156+ }
157+
158+ GPUd () void setSaturatedTailLength (uint32_t tail)
159+ {
160+ sigmaTimePacked = encodeTailLength (tail);
161+ }
162+
163+ GPUd () uint32_t getSaturatedTailLength () const
164+ {
165+ return decodeTailLength (sigmaTimePacked);
166+ }
167+
141168 GPUd () bool operator <(const ClusterNative& rhs) const
142169 {
143170 if (this ->getTimePacked () != rhs.getTimePacked ()) {
@@ -167,6 +194,93 @@ struct ClusterNative {
167194 this ->qTot == rhs.qTot &&
168195 this ->getFlags () == rhs.getFlags ();
169196 }
197+
198+ private:
// Expands an 8-bit tail-length code into the tail length it represents.
//
// The tail length is quantized into 8 bits with a piecewise-linear code.
// Max expected length is 1500 tbs, but outliers up to 8000 tbs are allowed.
//
// Each segment decodes as  base + step * (code - first code of the segment):
//
// | Code range | Decoded values | Step  | Codes |
// | ---------: | -------------: | ----: | ----: |
// | `0..63`    | `0..63`        | `1`   | `64`  |
// | `64..95`   | `64..126`      | `2`   | `32`  |
// | `96..127`  | `128..252`     | `4`   | `32`  |
// | `128..159` | `256..504`     | `8`   | `32`  |
// | `160..223` | `512..1520`    | `16`  | `64`  |
// | `224..239` | `1552..2032`   | `32`  | `16`  |
// | `240..255` | `2048..8048`   | `400` | `16`  |
//
// The decoded value grows strictly with the code, which encodeTailLength() relies on
// for its binary search.
static constexpr GPUd() uint32_t decodeTailLength(uint8_t code)
{
  const uint32_t c = code;

  // Walk the segments from the coarsest down to the exact one.
  if (c >= 240u) {
    return 2048u + 400u * (c - 240u);
  }
  if (c >= 224u) {
    return 1552u + 32u * (c - 224u);
  }
  if (c >= 160u) {
    return 512u + 16u * (c - 160u);
  }
  if (c >= 128u) {
    return 256u + 8u * (c - 128u);
  }
  if (c >= 96u) {
    return 128u + 4u * (c - 96u);
  }
  if (c >= 64u) {
    return 64u + 2u * (c - 64u);
  }

  // Codes 0..63 are stored exactly.
  return c;
}
240+
241+ static constexpr GPUd () uint8_t encodeTailLength(uint32_t value)
242+ {
243+ // Saturate above representable range.
244+ if (value >= decodeTailLength (255 )) [[unlikely]] {
245+ return 255 ;
246+ }
247+
248+ // Binary search for the first code whose decoded value >= value.
249+ uint8_t lo = 0 ;
250+ uint8_t hi = 255 ;
251+
252+ while (lo < hi) {
253+ uint8_t mid = lo + ((hi - lo) >> 1 );
254+ uint32_t decoded = decodeTailLength (mid);
255+
256+ if (decoded < value) {
257+ lo = mid + 1 ;
258+ } else {
259+ hi = mid;
260+ }
261+ }
262+
263+ // lo is now the first code with decoded >= value.
264+ if (lo == 0 ) [[unlikely]] {
265+ return 0 ;
266+ }
267+
268+ uint8_t above_code = lo;
269+ uint8_t below_code = lo - 1 ;
270+
271+ uint32_t above_value = decodeTailLength (above_code);
272+ uint32_t below_value = decodeTailLength (below_code);
273+
274+ uint32_t above_error = above_value - value;
275+ uint32_t below_error = value - below_value;
276+
277+ // Tie-break downward.
278+ if (below_error <= above_error) {
279+ return below_code;
280+ } else {
281+ return above_code;
282+ }
283+ }
170284};
171285
172286// This is an index struct to access TPC clusters inside sectors and rows. It shall not own the data, but just point to
0 commit comments