@@ -55,6 +55,15 @@ def bucket(self) -> int:
5555 """Return the bucket of this split."""
5656 pass
5757
def merged_row_count(self) -> Optional[int]:
    """
    Report how many rows the data files hold once merging is taken into account.

    Two examples of what "merged" means here: for a primary key table with the
    delete vector enabled, the deleted rows are subtracted from the result; for
    an Append table in Data Evolution mode, the result is the actual number of
    rows.

    This implementation always returns None.
    """
    return None
66+
5867
5968class DataSplit (Split ):
6069 """
@@ -106,3 +115,88 @@ def file_size(self) -> int:
@property
def file_paths(self) -> List[str]:
    """Return the file paths stored on this split (the ``_file_paths`` attribute)."""
    return self._file_paths
118+
def set_row_count(self, row_count: int) -> None:
    """Store ``row_count`` on this split as its ``_row_count`` attribute."""
    self._row_count = row_count
121+
def merged_row_count(self) -> Optional[int]:
    """
    Report how many rows the data files hold once merging is taken into account.

    For a primary key table with the delete vector enabled, the deleted rows are
    subtracted from the result; for an Append table in Data Evolution mode, the
    result is the actual number of rows.

    The raw (deletion-file based) count is preferred. The data-evolution count
    is only consulted when the raw one is unavailable, and None is returned
    when neither can be computed.
    """
    if self._raw_merged_row_count_available():
        return self._raw_merged_row_count()

    return (
        self._data_evolution_merged_row_count()
        if self._data_evolution_row_count_available()
        else None
    )
134+
135+ def _raw_merged_row_count_available (self ) -> bool :
136+ return self .raw_convertible and (
137+ self .data_deletion_files is None
138+ or all (f is None or f .cardinality is not None for f in self .data_deletion_files )
139+ )
140+
141+ def _raw_merged_row_count (self ) -> int :
142+ sum_rows = 0
143+ for i , file in enumerate (self ._files ):
144+ deletion_file = None
145+ if self .data_deletion_files is not None and i < len (self .data_deletion_files ):
146+ deletion_file = self .data_deletion_files [i ]
147+
148+ if deletion_file is None :
149+ sum_rows += file .row_count
150+ elif deletion_file .cardinality is not None :
151+ sum_rows += file .row_count - deletion_file .cardinality
152+
153+ return sum_rows
154+
155+ def _data_evolution_row_count_available (self ) -> bool :
156+ for file in self ._files :
157+ if file .first_row_id is None :
158+ return False
159+ return True
160+
161+ def _data_evolution_merged_row_count (self ) -> int :
162+ if not self ._files :
163+ return 0
164+
165+ file_ranges = []
166+ for file in self ._files :
167+ if file .first_row_id is not None and file .row_count > 0 :
168+ start = file .first_row_id
169+ end = file .first_row_id + file .row_count - 1
170+ file_ranges .append ((file , start , end ))
171+
172+ if not file_ranges :
173+ return 0
174+
175+ file_ranges .sort (key = lambda x : (x [1 ], x [2 ]))
176+
177+ groups = []
178+ current_group = [file_ranges [0 ]]
179+ current_end = file_ranges [0 ][2 ]
180+
181+ for file_range in file_ranges [1 :]:
182+ file , start , end = file_range
183+ if start <= current_end :
184+ current_group .append (file_range )
185+ if end > current_end :
186+ current_end = end
187+ else :
188+ groups .append (current_group )
189+ current_group = [file_range ]
190+ current_end = end
191+
192+ if current_group :
193+ groups .append (current_group )
194+
195+ sum_rows = 0
196+ for group in groups :
197+ max_count = 0
198+ for file , _ , _ in group :
199+ max_count = max (max_count , file .row_count )
200+ sum_rows += max_count
201+
202+ return sum_rows
0 commit comments