Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions paimon-python/pypaimon/read/scanner/file_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,14 +227,19 @@ def __init__(
# bucket set per ``total_buckets`` value.
self._bucket_selector = self._init_bucket_selector()

def schema_fields_func(schema_id: int):
return self.table.schema_manager.get_schema(schema_id).fields

self.simple_stats_evolutions = SimpleStatsEvolutions(
schema_fields_func,
self._schema_fields,
self.table.table_schema.id
)

def _schema_fields(self, schema_id: int):
"""Resolve schema fields, short-circuiting current table schema id to avoid
filesystem access (REST catalog would get 403).
"""
if schema_id == self.table.table_schema.id:
return self.table.table_schema.fields
return self.table.schema_manager.get_schema(schema_id).fields

def _deletion_files_map(self, entries: List[ManifestEntry]) -> Dict[tuple, Dict[str, DeletionFile]]:
if not self.deletion_vectors_enabled:
return {}
Expand Down
15 changes: 11 additions & 4 deletions paimon-python/pypaimon/read/split_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,14 @@ def _compute_nested_path_by_name(self) -> Optional[Dict[str, List[str]]]:
def _nested_path_by_name(self) -> Optional[Dict[str, List[str]]]:
return self._cached_nested_path_by_name

def _resolve_schema(self, schema_id: int):
"""Resolve schema, short-circuiting current table schema id to avoid
filesystem access (REST catalog would get 403).
"""
if schema_id == self.table.table_schema.id:
return self.table.table_schema
return self.table.schema_manager.get_schema(schema_id)

def _push_down_predicate(self) -> Optional[Predicate]:
if self.predicate is None:
return None
Expand Down Expand Up @@ -304,8 +312,7 @@ def file_reader_supplier(self, file: DataFileMeta, for_merge_read: bool,
if has_nested:
raise NotImplementedError(
"Nested-field projection is not supported on ROW files")
file_schema = self.table.schema_manager.get_schema(
file.schema_id)
file_schema = self._resolve_schema(file.schema_id)
if file.write_cols:
field_map = {f.name: f for f in file_schema.fields}
row_full_fields = [field_map[n] for n in file.write_cols
Expand Down Expand Up @@ -389,7 +396,7 @@ def _get_fields_and_predicate(self, schema_id: int, read_fields):
key = (schema_id, tuple(read_fields))
if key not in self.schema_id_2_fields:
nested_path_by_name = self._nested_path_by_name()
schema = self.table.schema_manager.get_schema(schema_id)
schema = self._resolve_schema(schema_id)
schema_fields = (
SpecialFields.row_type_with_row_tracking(schema.fields)
if self.row_tracking_enabled else schema.fields
Expand Down Expand Up @@ -461,7 +468,7 @@ def _file_read_fields(self, file: DataFileMeta) -> Optional[List[DataField]]:
nested-projection reads."""
if self._nested_path_by_name() is not None:
return None
file_schema = self.table.schema_manager.get_schema(file.schema_id)
file_schema = self._resolve_schema(file.schema_id)
if file_schema is None:
return None
return self._final_data_fields_from(
Expand Down
2 changes: 1 addition & 1 deletion paimon-python/pypaimon/tests/partition_predicate_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _mock_scanner_table():
table.options.data_evolution_enabled.return_value = False
table.options.deletion_vectors_enabled.return_value = False
table.options.scan_manifest_parallelism.return_value = 1
table.table_schema = Mock(id=0)
table.table_schema = Mock(id=0, fields=TABLE_FIELDS)
table.schema_manager = Mock()
table.schema_manager.get_schema.return_value = Mock(fields=TABLE_FIELDS)
return table
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import unittest
from unittest.mock import MagicMock

from pypaimon.read.scanner.file_scanner import FileScanner
from pypaimon.schema.data_types import AtomicType, DataField


class FileScannerSchemaFieldsShortCircuitTest(unittest.TestCase):
"""Test _schema_fields short-circuits current schema id (REST catalog 403 fix)."""

def _make_scanner(self, current_schema_id, current_fields,
historical_fields_map=None):
"""Build a FileScanner bypassing __init__ to isolate _schema_fields."""
scanner = FileScanner.__new__(FileScanner)
scanner.table = MagicMock()
scanner.table.table_schema.id = current_schema_id
scanner.table.table_schema.fields = current_fields
scanner.table.schema_manager = MagicMock()

def get_schema(schema_id):
if historical_fields_map is None or schema_id not in historical_fields_map:
raise AssertionError(
f"schema_manager.get_schema({schema_id}) was called "
"but no historical schema was registered for it"
)
historical = MagicMock()
historical.fields = historical_fields_map[schema_id]
return historical

scanner.table.schema_manager.get_schema.side_effect = get_schema
return scanner

def test_short_circuits_current_schema_id(self):
"""Current schema id returns in-memory fields without filesystem access."""
current_fields = [DataField(0, "a", AtomicType("INT"))]
scanner = self._make_scanner(
current_schema_id=5, current_fields=current_fields
)

result = scanner._schema_fields(5)

self.assertIs(result, current_fields)
scanner.table.schema_manager.get_schema.assert_not_called()

def test_delegates_for_historical_schema(self):
"""Historical schema id delegates to schema_manager.get_schema()."""
current_fields = [DataField(0, "a", AtomicType("INT"))]
historical_fields = [
DataField(0, "a", AtomicType("INT")),
DataField(1, "b", AtomicType("STRING")),
]
scanner = self._make_scanner(
current_schema_id=5,
current_fields=current_fields,
historical_fields_map={3: historical_fields},
)

result = scanner._schema_fields(3)

self.assertEqual(result, historical_fields)
scanner.table.schema_manager.get_schema.assert_called_once_with(3)

def test_short_circuit_works_for_zero_schema_id(self):
"""Schema id == 0 still short-circuits (guards against truthiness bugs)."""
current_fields = [DataField(0, "x", AtomicType("INT"))]
scanner = self._make_scanner(
current_schema_id=0, current_fields=current_fields
)

result = scanner._schema_fields(0)

self.assertIs(result, current_fields)
scanner.table.schema_manager.get_schema.assert_not_called()


if __name__ == "__main__":
unittest.main()
Loading