From 66e5e3812f14b3f86b518f14ac2522ed4d303917 Mon Sep 17 00:00:00 2001 From: Philippe Canal Date: Sat, 29 Nov 2025 18:36:40 +0100 Subject: [PATCH 1/7] io: Add data structure for 64 bit byte count. In order to support 64 bits byte count, which does not fit in the space reverse for them in the stream (4 bytes minus 3 control bits) and the position and some byte count do not fit in the variable used in existing code (arguments to WriteVersion, SetByteCount, ReadVersion and CheckByteCount), we need to pass them indirectly. Since the streaming is inherently serial, we can leverage the sequence of calls and cache the 64 bits values in a queue. The bytecount that can not be stored in place in the stream will be held in a collection (fByteCounts) that need to be stored externally to the buffer. --- io/io/inc/TBufferFile.h | 13 +++++++++++++ io/io/src/TBufferFile.cxx | 16 ++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/io/io/inc/TBufferFile.h b/io/io/inc/TBufferFile.h index 3d2e37141fcc4..26070da4043c0 100644 --- a/io/io/inc/TBufferFile.h +++ b/io/io/inc/TBufferFile.h @@ -52,6 +52,16 @@ class TBufferFile : public TBufferIO { TStreamerInfo *fInfo{nullptr}; ///< Pointer to TStreamerInfo object writing/reading the buffer InfoList_t fInfoStack; ///< Stack of pointers to the TStreamerInfos + using ByteCountLocator_t = std::size_t; // This might become a pair if we implement chunked keys + using ByteCountStack_t = std::vector; + ByteCountStack_t fByteCountStack; ///; + // fByteCounts will be stored either in the header/summary tkey or at the end + // of the last segment/chunk for a large TKey. + ByteCountFinder_t fByteCounts; ///< Map to find the byte count value for a given position + // Default ctor TBufferFile() {} // NOLINT: not allowed to use = default because of TObject::kIsOnHeap detection, see ROOT-10300 @@ -62,6 +72,7 @@ class TBufferFile : public TBufferIO { Long64_t CheckByteCount(ULong64_t startpos, ULong64_t bcnt, const TClass *clss, const char* classname); void CheckCount(UInt_t offset) override; UInt_t CheckObject(UInt_t offset, const TClass *cl, Bool_t readClass = kFALSE); + UInt_t ReserveByteCount(); void WriteObjectClass(const void *actualObjStart, const TClass *actualClass, Bool_t cacheReuse) override; @@ -106,6 +117,8 @@ class TBufferFile : public TBufferIO { TObject *ReadObject(const TClass *cl) override; + ByteCountFinder_t GetByteCounts() const { return fByteCounts; } + using TBufferIO::CheckObject; // basic types and arrays of basic types diff --git a/io/io/src/TBufferFile.cxx b/io/io/src/TBufferFile.cxx index d169d2348b347..bf9296b029855 100644 --- a/io/io/src/TBufferFile.cxx +++ b/io/io/src/TBufferFile.cxx @@ -3327,6 +3327,22 @@ UInt_t TBufferFile::CheckObject(UInt_t offset, const TClass *cl, Bool_t readClas return offset; } +//////////////////////////////////////////////////////////////////////////////// +/// Reserve space for a leading byte count and return the position where to +/// store the byte count value. +/// +/// \param[in] mask The mask to apply to the placeholder value (default kByteCountMask) +/// \return The position (cntpos) where the byte count should be stored later, +/// or 0 if the position exceeds kMaxInt + +UInt_t TBufferFile::ReserveByteCount() +{ + // reserve space for leading byte count + auto full_cntpos = fBufCur - fBuffer; + fByteCountStack.push_back(full_cntpos); + *this << (UInt_t)kByteCountMask; // placeholder for byte count + return full_cntpos < kMaxInt ? full_cntpos : kMaxInt; +} //////////////////////////////////////////////////////////////////////////////// /// Read max bytes from the I/O buffer into buf. The function returns From 2ef1e0a460d2255824e9c82414b4385deb6ac4c4 Mon Sep 17 00:00:00 2001 From: Philippe Canal Date: Sat, 29 Nov 2025 18:55:15 +0100 Subject: [PATCH 2/7] io: Support 64 bits position/byte-count in reading --- io/io/src/TBufferFile.cxx | 75 ++++++++++++++++++++++++++++++++++----- 1 file changed, 67 insertions(+), 8 deletions(-) diff --git a/io/io/src/TBufferFile.cxx b/io/io/src/TBufferFile.cxx index bf9296b029855..1598185be5d2e 100644 --- a/io/io/src/TBufferFile.cxx +++ b/io/io/src/TBufferFile.cxx @@ -360,10 +360,25 @@ void TBufferFile::SetByteCount(ULong64_t cntpos, Bool_t packInVersion) Long64_t TBufferFile::CheckByteCount(ULong64_t startpos, ULong64_t bcnt, const TClass *clss, const char *classname) { - if (!bcnt) return 0; - R__ASSERT(startpos <= kMaxUInt && bcnt <= kMaxUInt); - Long64_t offset = 0; + R__ASSERT(!fByteCountStack.empty()); + if (startpos == kMaxInt) { + // The position is above 1GB and was cached using a 32 bit variable. + startpos = fByteCountStack.back(); + } + if (bcnt == kMaxInt) { + // in case we are checking a byte count for which we postponed + // the writing because it was too large, we retrieve it now + auto it = fByteCounts.find(startpos); + if (it != fByteCounts.end()) { + bcnt = it->second; + } + } + fByteCountStack.pop_back(); + if (!bcnt || startpos == kMaxInt) + return 0; + + Long64_t offset = 0; Longptr_t endpos = Longptr_t(fBuffer) + startpos + bcnt + sizeof(UInt_t); if (Longptr_t(fBufCur) != endpos) { @@ -390,7 +405,7 @@ Long64_t TBufferFile::CheckByteCount(ULong64_t startpos, ULong64_t bcnt, const T if ( ((char *)endpos) > fBufMax ) { offset = fBufMax-fBufCur; Error("CheckByteCount", - "Byte count probably corrupted around buffer position %llu:\n\t%llu for a possible maximum of %lld", + "Byte count probably corrupted around buffer position %llu:\n\tByte count is %llu while the buffer size is %lld", startpos, bcnt, offset); fBufCur = fBufMax; @@ -2749,6 +2764,8 @@ TClass *TBufferFile::ReadClass(const TClass *clReq, UInt_t *objTag) bcnt = 0; } else { fVersion = 1; + if (objTag) + fByteCountStack.push_back(fBufCur - fBuffer); startpos = UInt_t(fBufCur-fBuffer); *this >> tag; } @@ -2932,7 +2949,10 @@ Version_t TBufferFile::ReadVersion(UInt_t *startpos, UInt_t *bcnt, const TClass if (startpos) { // before reading object save start position - *startpos = UInt_t(fBufCur-fBuffer); + auto full_startpos = fBufCur - fBuffer; + *startpos = full_startpos < kMaxInt ? UInt_t(full_startpos) : kMaxInt; + if (bcnt) + fByteCountStack.push_back(full_startpos); } // read byte count (older files don't have byte count) @@ -2956,9 +2976,29 @@ Version_t TBufferFile::ReadVersion(UInt_t *startpos, UInt_t *bcnt, const TClass fBufCur -= sizeof(UInt_t); v.cnt = 0; } - if (bcnt) *bcnt = (v.cnt & ~kByteCountMask); + if (bcnt) { + if (!v.cnt) { + // no byte count stored + *bcnt = 0; + fByteCountStack.pop_back(); + } else { + *bcnt = (v.cnt & ~kByteCountMask); + if (*bcnt == 0) { + // The byte count was stored but is zero, this means the data + // did not fit and thus we stored it in 'fByteCounts' instead. + // Mark this case by setting startpos to kMaxInt. + *bcnt = kMaxInt; + if (startpos) + *startpos = kMaxInt; + } + } + } frombuf(this->fBufCur,&version); + // NOTE: The code above is not straightforward to refactor by a call + // to ReadVersionNoCheckSum because of the following code need the + // 'raw' value in `v`. + if (version<=1) { if (version <= 0) { if (cl) { @@ -3038,7 +3078,10 @@ Version_t TBufferFile::ReadVersionNoCheckSum(UInt_t *startpos, UInt_t *bcnt) if (startpos) { // before reading object save start position - *startpos = UInt_t(fBufCur-fBuffer); + auto full_startpos = fBufCur - fBuffer; + *startpos = full_startpos < kMaxInt ? UInt_t(full_startpos) : kMaxInt; + if (bcnt) + fByteCountStack.push_back(full_startpos); } // read byte count (older files don't have byte count) @@ -3062,7 +3105,23 @@ Version_t TBufferFile::ReadVersionNoCheckSum(UInt_t *startpos, UInt_t *bcnt) fBufCur -= sizeof(UInt_t); v.cnt = 0; } - if (bcnt) *bcnt = (v.cnt & ~kByteCountMask); + if (bcnt) { + if (!v.cnt) { + // no byte count stored + *bcnt = 0; + fByteCountStack.pop_back(); + } else { + *bcnt = (v.cnt & ~kByteCountMask); + if (*bcnt == 0) { + // The byte count was stored but is zero, this means the data + // did not fit and thus we stored it in 'fByteCounts' instead. + // Mark this case by setting startpos to kMaxInt. + *bcnt = kMaxInt; + if (startpos) + *startpos = kMaxInt; + } + } + } frombuf(this->fBufCur,&version); return version; From ad33fd816ea0b0200dd95f376002fcc6ff2bd99f Mon Sep 17 00:00:00 2001 From: Philippe Canal Date: Sat, 29 Nov 2025 18:54:43 +0100 Subject: [PATCH 3/7] io: Support 64 bits position/byte-count in writing --- io/io/src/TBufferFile.cxx | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/io/io/src/TBufferFile.cxx b/io/io/src/TBufferFile.cxx index 1598185be5d2e..4cbf1f321097d 100644 --- a/io/io/src/TBufferFile.cxx +++ b/io/io/src/TBufferFile.cxx @@ -323,7 +323,26 @@ void TBufferFile::SetByteCount(ULong64_t cntpos, Bool_t packInVersion) && (fBufCur >= fBuffer) && static_cast(fBufCur - fBuffer) <= std::numeric_limits::max() && "Byte count position is after the end of the buffer"); - const UInt_t cnt = UInt_t(fBufCur - fBuffer) - UInt_t(cntpos) - sizeof(UInt_t); + + // We can either make this unconditional or we could split the routine + // in two, one with a new signature and guarantee to get the 64bit position + // (which may be chunk number + local offset) and one with the old signature + // which uses the stack to get the position and call the new one. + // This (of course) also requires that we do the 'same' to the WriteVersion + // routines. + R__ASSERT( !fByteCountStack.empty() ); + if (cntpos == kMaxInt) { + cntpos = fByteCountStack.back(); + } + fByteCountStack.pop_back(); + // if we are not in the same TKey chunk or if the cntpos is too large to fit in UInt_t + // let's postpone the writing of the byte count + const ULong64_t full_cnt = ULong64_t(fBufCur - fBuffer) - cntpos - sizeof(UInt_t); + if (full_cnt >= kMaxMapCount) { + fByteCounts[cntpos] = full_cnt; + return; + } + UInt_t cnt = static_cast(full_cnt); char *buf = (char *)(fBuffer + cntpos); // if true, pack byte count in two consecutive shorts, so it can @@ -2709,8 +2728,7 @@ void TBufferFile::WriteObjectClass(const void *actualObjectStart, const TClass * } // reserve space for leading byte count - UInt_t cntpos = UInt_t(fBufCur-fBuffer); - fBufCur += sizeof(UInt_t); + UInt_t cntpos = ReserveByteCount(); // write class of object first Int_t mapsize = fMap->Capacity(); // The slot depends on the capacity and WriteClass might induce an increase. @@ -3207,8 +3225,7 @@ UInt_t TBufferFile::WriteVersion(const TClass *cl, Bool_t useBcnt) UInt_t cntpos = 0; if (useBcnt) { // reserve space for leading byte count - cntpos = UInt_t(fBufCur-fBuffer); - fBufCur += sizeof(UInt_t); + cntpos = ReserveByteCount(); } Version_t version = cl->GetClassVersion(); @@ -3237,8 +3254,7 @@ UInt_t TBufferFile::WriteVersionMemberWise(const TClass *cl, Bool_t useBcnt) UInt_t cntpos = 0; if (useBcnt) { // reserve space for leading byte count - cntpos = UInt_t(fBufCur-fBuffer); - fBufCur += sizeof(UInt_t); + cntpos = ReserveByteCount(); } Version_t version = cl->GetClassVersion(); From 912624905ab436dc605be767ff2ba6e0b4cd79f3 Mon Sep 17 00:00:00 2001 From: Philippe Canal Date: Sat, 29 Nov 2025 18:58:00 +0100 Subject: [PATCH 4/7] io/nfc: add comment --- tree/tree/src/TBranch.cxx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tree/tree/src/TBranch.cxx b/tree/tree/src/TBranch.cxx index 6b9d1102625a9..768434017ea86 100644 --- a/tree/tree/src/TBranch.cxx +++ b/tree/tree/src/TBranch.cxx @@ -979,6 +979,7 @@ Int_t TBranch::FillEntryBuffer(TBasket* basket, TBuffer* buf, Int_t& lnew) fEntryBuffer->SetBufferOffset(objectStart); *fEntryBuffer >> tag; if (tag & kByteCountMask) { + // Ignore byte count. *fEntryBuffer >> tag; } if (tag == kNewClassTag) { @@ -1457,7 +1458,7 @@ bool TBranch::SupportsBulkRead() const { /// the number of elements corresponding to each entries. /// /// For each entry the number of elements is the multiplication of -/// +/// /// ~~~{.cpp} /// TLeaf *leaf = static_cast(branch->GetListOfLeaves()->At(0)); /// auto len = leaf->GetLen(); From e94bdea6f52d251f36ad5dc289b29681f215175d Mon Sep 17 00:00:00 2001 From: Philippe Canal Date: Tue, 2 Dec 2025 11:09:44 -0600 Subject: [PATCH 5/7] io: Add Set/GetByteCounts to TBufferFile Note to store and restore the larger than 1GB byte count use: // Copy the content of the const reference. auto bytecounts{b.GetByteCounts()}; ... b.SetByteCounts(std::move(bytecounts)); --- io/io/inc/TBufferFile.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io/io/inc/TBufferFile.h b/io/io/inc/TBufferFile.h index 26070da4043c0..2c36d15b6335e 100644 --- a/io/io/inc/TBufferFile.h +++ b/io/io/inc/TBufferFile.h @@ -117,7 +117,8 @@ class TBufferFile : public TBufferIO { TObject *ReadObject(const TClass *cl) override; - ByteCountFinder_t GetByteCounts() const { return fByteCounts; } + const ByteCountFinder_t& GetByteCounts() const { return fByteCounts; } + void SetByteCounts(ByteCountFinder_t &&byteCounts) { fByteCounts = std::move(byteCounts); } using TBufferIO::CheckObject; From 53ce59974c47789a1f504c763219ad0e97087bf7 Mon Sep 17 00:00:00 2001 From: Philippe Canal Date: Mon, 8 Dec 2025 14:24:29 -0600 Subject: [PATCH 6/7] io: Add reset of new bytecount infrastructure --- io/io/inc/TBufferFile.h | 2 ++ io/io/src/TBufferFile.cxx | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/io/io/inc/TBufferFile.h b/io/io/inc/TBufferFile.h index 2c36d15b6335e..26c82531fdf3f 100644 --- a/io/io/inc/TBufferFile.h +++ b/io/io/inc/TBufferFile.h @@ -120,6 +120,8 @@ class TBufferFile : public TBufferIO { const ByteCountFinder_t& GetByteCounts() const { return fByteCounts; } void SetByteCounts(ByteCountFinder_t &&byteCounts) { fByteCounts = std::move(byteCounts); } + void ResetMap() override; + using TBufferIO::CheckObject; // basic types and arrays of basic types diff --git a/io/io/src/TBufferFile.cxx b/io/io/src/TBufferFile.cxx index 4cbf1f321097d..08e2a6cab87b0 100644 --- a/io/io/src/TBufferFile.cxx +++ b/io/io/src/TBufferFile.cxx @@ -114,6 +114,16 @@ TBufferFile::~TBufferFile() { } +//////////////////////////////////////////////////////////////////////////////// +/// Reset buffer maps and clear byte-count tracking structures. + +void TBufferFile::ResetMap() +{ + TBufferIO::ResetMap(); + fByteCountStack.clear(); + fByteCounts.clear(); +} + //////////////////////////////////////////////////////////////////////////////// /// Increment level. From 0d9bcc29ead2fba8b880c319eb65154485433179 Mon Sep 17 00:00:00 2001 From: Philippe Canal Date: Mon, 8 Dec 2025 14:25:17 -0600 Subject: [PATCH 7/7] io: Add new bytecount test --- io/io/test/CMakeLists.txt | 4 ++ io/io/test/testByteCount.C | 119 +++++++++++++++++++++++++++++++++++ io/io/test/testByteCount.ref | 3 + 3 files changed, 126 insertions(+) create mode 100644 io/io/test/testByteCount.C create mode 100644 io/io/test/testByteCount.ref diff --git a/io/io/test/CMakeLists.txt b/io/io/test/CMakeLists.txt index 8e1031f01a1e9..ed271dccf724e 100644 --- a/io/io/test/CMakeLists.txt +++ b/io/io/test/CMakeLists.txt @@ -28,3 +28,7 @@ endif() # ROOT_EXECUTABLE(TMapFileTest TMapFileTest.cxx LIBRARIES RIO Hist New) # ROOT_ADD_TEST(io-io-test-TMapFileTest COMMAND TMapFileTest complete) #endif() + +ROOTTEST_ADD_TEST(testLargeByteCounts + MACRO testByteCount.C + OUTREF testByteCount.ref) diff --git a/io/io/test/testByteCount.C b/io/io/test/testByteCount.C new file mode 100644 index 0000000000000..4c9af3f9707df --- /dev/null +++ b/io/io/test/testByteCount.C @@ -0,0 +1,119 @@ +{ + int errors = 0; + int expectedByteCounts = 1; + + // TBufferFile currently reject size larger than 2GB. + // SetBufferOffset does not check against the size, + // so we can provide and use a larger buffer. + std::vector databuffer{}; + databuffer.reserve( 4*1024*1024*1024ll ); + TBufferFile b(TBuffer::kWrite, 2*1024*1024*1024ll-100, databuffer.data(), false /* don't adopt */); + { + // Regular object at offset 0 + UInt_t R__c = b.WriteVersion(TExMap::Class(), kTRUE); + b.SetBufferOffset(1000); + b.SetByteCount(R__c, kTRUE); + } + { + // Regular object + UInt_t R__c = b.WriteVersion(TExMap::Class(), kTRUE); + b.SetBufferOffset(2000); + b.SetByteCount(R__c, kTRUE); + } + { + // Object larger than 1GB + UInt_t R__c = b.WriteVersion(TExMap::Class(), kTRUE); + b.SetBufferOffset(4000+1*1024*1024*1024ll); + b.SetByteCount(R__c, kTRUE); + } + { + // Regular object located past 1GB + UInt_t R__c = b.WriteVersion(TExMap::Class(), kTRUE); + b.SetBufferOffset(8000+1*1024*1024*1024ll); + b.SetByteCount(R__c, kTRUE); + } + { + ++expectedByteCounts; + // Object larger than 1GB start after 1GB + // NOTE: this does not yet fit, we are writing past the end. + // Need to lift the 2GB limit for TBuffer first. + // However the lifting might be temporary, so this might need to be + // moved to a test that stored objects in a TFile. + UInt_t R__c = b.WriteVersion(TExMap::Class(), kTRUE); + b.SetBufferOffset(12000+2*1024*1024*1024ll); + b.SetByteCount(R__c, kTRUE); + } + + + // To make a copy instead of using the const references: + auto bytecounts{ b.GetByteCounts() }; + if (bytecounts.size() != expectedByteCounts) { + ++errors; + std::cerr << "The number of bytecount is not as expected (1), it is " << bytecounts.size() << '\n'; + std::cerr << "The full list is:\n"; + for(auto bc : bytecounts) + std::cerr << "values: " << bc.first << " , " << bc.second << '\n'; + } + + // Rewind. Other code use Reset instead of SetBufferOffset + b.SetReadMode(); + b.Reset(); + b.SetByteCounts(std::move(bytecounts)); + + UInt_t R__s = 0; + UInt_t R__c = 0; + { + // Regular object at offset 0 + auto version = b.ReadVersion(&R__s, &R__c, TExMap::Class()); + b.SetBufferOffset(1000); + auto res = b.CheckByteCount(R__s, R__c, TExMap::Class()); + if (res != 0) { + ++errors; + // We can assume there as already an error message in CheckByCount + } + } + { + // Regular object + auto version = b.ReadVersion(&R__s, &R__c, TExMap::Class()); + b.SetBufferOffset(2000); + auto res = b.CheckByteCount(R__s, R__c, TExMap::Class()); + if (res != 0) { + ++errors; + // We can assume there as already an error message in CheckByCount + } + } + { + // Object larger than 1GB + auto version = b.ReadVersion(&R__s, &R__c, TExMap::Class()); + b.SetBufferOffset(4000+1*1024*1024*1024ll); + auto res = b.CheckByteCount(R__s, R__c, TExMap::Class()); + if (res != 0) { + ++errors; + // We can assume there as already an error message in CheckByCount + } + } + { + // Regular object located past 1GB + auto version = b.ReadVersion(&R__s, &R__c, TExMap::Class()); + b.SetBufferOffset(8000+1*1024*1024*1024ll); + auto res = b.CheckByteCount(R__s, R__c, TExMap::Class()); + if (res != 0) { + ++errors; + // We can assume there as already an error message in CheckByCount + } + } + { + // Object larger than 1GB start after 1GB + // NOTE: this does not yet fit. + auto version = b.ReadVersion(&R__s, &R__c, TExMap::Class()); + b.SetBufferOffset(12000+2*1024*1024*1024ll); + auto res = b.CheckByteCount(R__s, R__c, TExMap::Class()); + if (res != 0) { + ++errors; + // We can assume there as already an error message in CheckByCount + } + } + + std::cerr << "The end.\n"; + return errors; +} diff --git a/io/io/test/testByteCount.ref b/io/io/test/testByteCount.ref new file mode 100644 index 0000000000000..85ceff04b5e34 --- /dev/null +++ b/io/io/test/testByteCount.ref @@ -0,0 +1,3 @@ + +Processing testByteCount.C... +The end.