Commit 4f8a811

[python] fix data-evolution double-counting issue by using mergedRowCount in ray datasource (#7087)
1 parent 88332de commit 4f8a811

7 files changed

Lines changed: 424 additions & 6 deletions


paimon-python/pypaimon/globalindex/indexed_split.py

Lines changed: 3 additions & 0 deletions
@@ -76,6 +76,9 @@ def row_count(self) -> int:
         """
         return sum(r.count() for r in self._row_ranges)
 
+    def merged_row_count(self):
+        return self.row_count
+
     # Delegate other properties to data_split
 
     @property
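
For an indexed split the retained row ranges are already exact, so the merged count can simply reuse row_count. A minimal sketch of that identity, using plain (from, to) tuples as hypothetical stand-ins for pypaimon's Range objects:

# Hypothetical stand-in for IndexedSplit's row ranges; the real class
# holds Range objects with a count() method.
ranges = [(0, 99), (200, 249)]  # inclusive row-id ranges kept by the index

def row_count(row_ranges):
    # Each range contributes (to - from + 1) rows and ranges do not overlap,
    # so the sum is already the merged (deduplicated) row count.
    return sum(to - frm + 1 for frm, to in row_ranges)

assert row_count(ranges) == 150  # merged_row_count() returns the same value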

paimon-python/pypaimon/read/datasource/ray_datasource.py

Lines changed: 9 additions & 2 deletions
@@ -171,8 +171,15 @@ def _get_read_task(
         for split in chunk_splits:
             if predicate is None:
                 # Only estimate rows if no predicate (predicate filtering changes row count)
-                if hasattr(split, 'row_count') and split.row_count > 0:
-                    total_rows += split.row_count
+                row_count = None
+                if hasattr(split, 'merged_row_count'):
+                    merged_count = split.merged_row_count()
+                    if merged_count is not None:
+                        row_count = merged_count
+                if row_count is None and hasattr(split, 'row_count') and split.row_count > 0:
+                    row_count = split.row_count
+                if row_count is not None and row_count > 0:
+                    total_rows += row_count
             if hasattr(split, 'file_size') and split.file_size > 0:
                 total_size += split.file_size
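
The datasource now asks a split for its merged row count first and only falls back to the raw row_count when no merged figure is available. A minimal sketch of that fallback order, with a hypothetical stub in place of a real pypaimon split:

class StubSplit:
    # Hypothetical numbers: 100 raw rows, 80 after merging/deletes.
    row_count = 100

    def merged_row_count(self):
        return 80

def estimate_rows(split):
    # Prefer the merged count; fall back to the raw count if it is unknown.
    row_count = None
    if hasattr(split, 'merged_row_count'):
        merged = split.merged_row_count()
        if merged is not None:
            row_count = merged
    if row_count is None and getattr(split, 'row_count', 0) > 0:
        row_count = split.row_count
    return row_count

assert estimate_rows(StubSplit()) == 80  # no longer over-reports 100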

paimon-python/pypaimon/read/scanner/data_evolution_split_generator.py

Lines changed: 57 additions & 3 deletions
@@ -23,7 +23,7 @@
 from pypaimon.manifest.schema.data_file_meta import DataFileMeta
 from pypaimon.manifest.schema.manifest_entry import ManifestEntry
 from pypaimon.read.scanner.split_generator import AbstractSplitGenerator
-from pypaimon.read.split import Split
+from pypaimon.read.split import DataSplit, Split
 from pypaimon.read.sliced_split import SlicedSplit

@@ -104,8 +104,8 @@ def weight_func(file_list: List[DataFileMeta]) -> int:
             for pack in packed_files
         ]
 
-        splits += self._build_split_from_pack(
-            flatten_packed_files, sorted_entries_list, False
+        splits += self._build_split_from_pack_for_data_evolution(
+            flatten_packed_files, packed_files, sorted_entries_list
         )
 
         if self.start_pos_of_this_subtask is not None or self.idx_of_this_subtask is not None:

@@ -117,6 +117,60 @@ def weight_func(file_list: List[DataFileMeta]) -> int:
 
         return splits
 
+    def _build_split_from_pack_for_data_evolution(
+        self,
+        flatten_packed_files: List[List[DataFileMeta]],
+        packed_files: List[List[List[DataFileMeta]]],
+        file_entries: List[ManifestEntry]
+    ) -> List[Split]:
+        """
+        Build splits from packed files for data evolution tables.
+        raw_convertible is True only when each range (pack) contains exactly one file.
+        """
+        splits = []
+        for i, file_group in enumerate(flatten_packed_files):
+            # In Java: rawConvertible = f.stream().allMatch(file -> file.size() == 1)
+            # This means raw_convertible is True only when each range contains exactly one file
+            pack = packed_files[i] if i < len(packed_files) else []
+            raw_convertible = all(len(sub_pack) == 1 for sub_pack in pack)
+
+            file_paths = []
+            total_file_size = 0
+            total_record_count = 0
+
+            for data_file in file_group:
+                data_file.set_file_path(
+                    self.table.table_path,
+                    file_entries[0].partition,
+                    file_entries[0].bucket
+                )
+                file_paths.append(data_file.file_path)
+                total_file_size += data_file.file_size
+                total_record_count += data_file.row_count
+
+            if file_paths:
+                # Get deletion files for this split
+                data_deletion_files = None
+                if self.deletion_files_map:
+                    data_deletion_files = self._get_deletion_files_for_split(
+                        file_group,
+                        file_entries[0].partition,
+                        file_entries[0].bucket
+                    )
+
+                split = DataSplit(
+                    files=file_group,
+                    partition=file_entries[0].partition,
+                    bucket=file_entries[0].bucket,
+                    file_paths=file_paths,
+                    row_count=total_record_count,
+                    file_size=total_file_size,
+                    raw_convertible=raw_convertible,
+                    data_deletion_files=data_deletion_files
+                )
+                splits.append(split)
+        return splits
+
     def _wrap_to_sliced_splits(self, splits: List[Split], plan_start_pos: int, plan_end_pos: int) -> List[Split]:
         """
         Wrap splits with SlicedSplit to add file-level slicing information.
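
The raw_convertible flag mirrors the Java check rawConvertible = f.stream().allMatch(file -> file.size() == 1): a pack is raw-convertible only when every range inside it holds exactly one file. A tiny sketch of the rule, with hypothetical pack shapes in place of real DataFileMeta lists:

def is_raw_convertible(pack):
    # True only when each range (sub-pack) contains exactly one file.
    return all(len(sub_pack) == 1 for sub_pack in pack)

assert is_raw_convertible([['f0'], ['f1']])            # one file per range
assert not is_raw_convertible([['f0', 'f1'], ['f2']])  # a range holds two files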

paimon-python/pypaimon/read/sliced_split.py

Lines changed: 78 additions & 0 deletions
@@ -96,6 +96,84 @@ def raw_convertible(self):
     def data_deletion_files(self):
         return self._data_split.data_deletion_files
 
+    def _get_sliced_file_row_count(self, file: 'DataFileMeta') -> int:
+        if file.file_name in self._shard_file_idx_map:
+            start, end = self._shard_file_idx_map[file.file_name]
+            return (end - start) if start != -1 and end != -1 else 0
+        return file.row_count
+
+    def merged_row_count(self):
+        if not self._shard_file_idx_map:
+            return self._data_split.merged_row_count()
+
+        underlying_merged = self._data_split.merged_row_count()
+        if underlying_merged is not None:
+            original_row_count = self._data_split.row_count
+            return int(underlying_merged * self.row_count / original_row_count) if original_row_count > 0 else 0
+
+        from pypaimon.read.split import DataSplit
+        from pypaimon.globalindex.range import Range
+
+        if not isinstance(self._data_split, DataSplit):
+            return None
+
+        if not all(f.first_row_id is not None for f in self._data_split.files):
+            return None
+
+        file_ranges = []
+        for file in self._data_split.files:
+            if file.first_row_id is not None:
+                sliced_count = self._get_sliced_file_row_count(file)
+                if sliced_count > 0:
+                    file_ranges.append((file, Range(file.first_row_id, file.first_row_id + sliced_count - 1)))
+
+        if not file_ranges:
+            return 0
+
+        file_ranges.sort(key=lambda x: x[1].from_)
+
+        groups = []
+        current_group = [file_ranges[0]]
+        current_range = file_ranges[0][1]
+
+        for file, file_range in file_ranges[1:]:
+            if file_range.from_ <= current_range.to + 1:
+                current_group.append((file, file_range))
+                current_range = Range(current_range.from_, max(current_range.to, file_range.to))
+            else:
+                groups.append(current_group)
+                current_group = [(file, file_range)]
+                current_range = file_range
+
+        if current_group:
+            groups.append(current_group)
+
+        sum_rows = 0
+        for group in groups:
+            max_count = 0
+            for file, _ in group:
+                max_count = max(max_count, self._get_sliced_file_row_count(file))
+            sum_rows += max_count
+
+        if self._data_split.data_deletion_files is not None:
+            if not all(f is None or f.cardinality is not None for f in self._data_split.data_deletion_files):
+                return None
+
+            for i, deletion_file in enumerate(self._data_split.data_deletion_files):
+                if (deletion_file is not None and deletion_file.cardinality is not None
+                        and i < len(self._data_split.files)):
+                    file = self._data_split.files[i]
+                    if file.first_row_id is not None:
+                        file_original_count = file.row_count
+                        file_sliced_count = self._get_sliced_file_row_count(file)
+                        if file_original_count > 0:
+                            deletion_ratio = deletion_file.cardinality / file_original_count
+                            sum_rows -= int(file_sliced_count * deletion_ratio)
+                        else:
+                            sum_rows -= deletion_file.cardinality
+
+        return sum_rows
+
     def __eq__(self, other):
         if not isinstance(other, SlicedSplit):
             return False
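
When the underlying split already knows its merged count, the sliced split scales it by the fraction of raw rows that fall into the slice. A worked example of that proportional estimate (all numbers illustrative):

underlying_merged = 900    # rows surviving merges/deletes in the whole split
original_row_count = 1000  # raw rows in the whole split
slice_row_count = 250      # raw rows assigned to this shard's slice

# The slice is assumed to inherit the split-wide survival ratio (90%).
estimate = int(underlying_merged * slice_row_count / original_row_count)
assert estimate == 225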

paimon-python/pypaimon/read/split.py

Lines changed: 94 additions & 0 deletions
@@ -55,6 +55,15 @@ def bucket(self) -> int:
         """Return the bucket of this split."""
         pass
 
+    def merged_row_count(self) -> Optional[int]:
+        """
+        Return the merged row count of data files. For example, when the delete vector is enabled in
+        the primary key table, the number of rows that have been deleted will be subtracted from the
+        returned result. In the Data Evolution mode of the Append table, the actual number of rows
+        will be returned.
+        """
+        return None
+
 
 class DataSplit(Split):
     """

@@ -106,3 +106,88 @@ def file_size(self) -> int:
     @property
     def file_paths(self) -> List[str]:
         return self._file_paths
+
+    def set_row_count(self, row_count: int) -> None:
+        self._row_count = row_count
+
+    def merged_row_count(self) -> Optional[int]:
+        """
+        Return the merged row count of data files. For example, when the delete vector is enabled in
+        the primary key table, the number of rows that have been deleted will be subtracted from the
+        returned result. In the Data Evolution mode of the Append table, the actual number of rows
+        will be returned.
+        """
+        if self._raw_merged_row_count_available():
+            return self._raw_merged_row_count()
+        if self._data_evolution_row_count_available():
+            return self._data_evolution_merged_row_count()
+        return None
+
+    def _raw_merged_row_count_available(self) -> bool:
+        return self.raw_convertible and (
+            self.data_deletion_files is None
+            or all(f is None or f.cardinality is not None for f in self.data_deletion_files)
+        )
+
+    def _raw_merged_row_count(self) -> int:
+        sum_rows = 0
+        for i, file in enumerate(self._files):
+            deletion_file = None
+            if self.data_deletion_files is not None and i < len(self.data_deletion_files):
+                deletion_file = self.data_deletion_files[i]
+
+            if deletion_file is None:
+                sum_rows += file.row_count
+            elif deletion_file.cardinality is not None:
+                sum_rows += file.row_count - deletion_file.cardinality
+
+        return sum_rows
+
+    def _data_evolution_row_count_available(self) -> bool:
+        for file in self._files:
+            if file.first_row_id is None:
+                return False
+        return True
+
+    def _data_evolution_merged_row_count(self) -> int:
+        if not self._files:
+            return 0
+
+        file_ranges = []
+        for file in self._files:
+            if file.first_row_id is not None and file.row_count > 0:
+                start = file.first_row_id
+                end = file.first_row_id + file.row_count - 1
+                file_ranges.append((file, start, end))
+
+        if not file_ranges:
+            return 0
+
+        file_ranges.sort(key=lambda x: (x[1], x[2]))
+
+        groups = []
+        current_group = [file_ranges[0]]
+        current_end = file_ranges[0][2]
+
+        for file_range in file_ranges[1:]:
+            file, start, end = file_range
+            if start <= current_end:
+                current_group.append(file_range)
+                if end > current_end:
+                    current_end = end
+            else:
+                groups.append(current_group)
+                current_group = [file_range]
+                current_end = end
+
+        if current_group:
+            groups.append(current_group)
+
+        sum_rows = 0
+        for group in groups:
+            max_count = 0
+            for file, _, _ in group:
+                max_count = max(max_count, file.row_count)
+            sum_rows += max_count
+
+        return sum_rows
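
This range grouping is what removes the double counting: in data evolution mode, files sharing a first_row_id range describe the same logical rows (new columns are written as parallel files), so a group of overlapping files contributes max(row_count) rather than the sum. A minimal sketch with (first_row_id, row_count) tuples as hypothetical stand-ins for DataFileMeta:

files = [
    (0, 100),   # base file covering rows 0..99
    (0, 100),   # evolved-column file over the same rows 0..99
    (100, 50),  # next range, rows 100..149
]

# Sort by range start, merge overlapping ranges into groups,
# and count each group once at its widest file.
ranges = sorted((start, start + count - 1, count) for start, count in files)
total, cur_end, cur_max = 0, ranges[0][1], ranges[0][2]
for start, end, count in ranges[1:]:
    if start <= cur_end:  # overlap: the same logical rows
        cur_end, cur_max = max(cur_end, end), max(cur_max, count)
    else:                 # gap: close the current group
        total += cur_max
        cur_end, cur_max = end, count
total += cur_max

assert total == 150  # a naive sum of row counts would report 250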
