From 23a82a037394528fb3d050f3c56876f827f586a3 Mon Sep 17 00:00:00 2001 From: xuba Date: Fri, 6 Mar 2026 00:34:50 +0800 Subject: [PATCH 1/7] Upgrade Iceberg versions with per-engine compatibility modules --- .../IcebergRewriteExecutorTest.java | 10 +- .../amoro-mixed-flink-common-1.17/pom.xml | 453 +++++ .../apache/amoro/flink/FlinkSchemaUtil.java | 438 ++++ .../amoro/flink/InternalCatalogBuilder.java | 189 ++ .../flink/catalog/FlinkUnifiedCatalog.java | 550 +++++ .../amoro/flink/catalog/MixedCatalog.java | 792 ++++++++ .../factories/CatalogFactoryOptions.java | 45 + .../factories/FlinkUnifiedCatalogFactory.java | 125 ++ .../iceberg/IcebergFlinkCatalogFactory.java | 39 + .../mixed/MixedHiveCatalogFactory.java | 34 + .../mixed/MixedIcebergCatalogFactory.java | 74 + .../paimon/PaimonFlinkCatalogFactory.java | 55 + ...FlinkTablePropertiesInvocationHandler.java | 86 + .../interceptor/KerberosInterceptor.java | 57 + .../KerberosInvocationHandler.java | 70 + .../amoro/flink/interceptor/ProxyFactory.java | 48 + .../flink/lookup/BasicLookupFunction.java | 263 +++ .../BinaryRowDataSerializerWrapper.java | 81 + .../flink/lookup/ByteArraySetSerializer.java | 89 + .../amoro/flink/lookup/ByteArrayWrapper.java | 246 +++ .../apache/amoro/flink/lookup/KVTable.java | 80 + .../amoro/flink/lookup/KVTableFactory.java | 83 + .../apache/amoro/flink/lookup/KeyRowData.java | 133 ++ .../amoro/flink/lookup/LookupMetrics.java | 27 + .../amoro/flink/lookup/LookupOptions.java | 133 ++ .../amoro/flink/lookup/LookupRecord.java | 53 + .../MixedFormatRowDataLookupFunction.java | 81 + .../amoro/flink/lookup/RocksDBCacheState.java | 342 ++++ .../flink/lookup/RocksDBRecordState.java | 156 ++ .../flink/lookup/RocksDBSetSpilledState.java | 230 +++ .../amoro/flink/lookup/RocksDBSetState.java | 137 ++ .../flink/lookup/RowDataStateFactory.java | 98 + .../flink/lookup/SecondaryIndexTable.java | 170 ++ .../amoro/flink/lookup/TableFactory.java | 36 + .../amoro/flink/lookup/UniqueIndexTable.java | 153 ++ 
.../flink/lookup/filter/RowDataPredicate.java | 307 +++ .../RowDataPredicateExpressionVisitor.java | 287 +++ .../amoro/flink/metric/MetricConstant.java | 36 + .../amoro/flink/metric/MetricsGenerator.java | 128 ++ .../planner/calcite/FlinkTypeSystem.java | 215 ++ .../read/AdaptHiveFlinkParquetReaders.java | 873 ++++++++ .../amoro/flink/read/FlinkSplitPlanner.java | 288 +++ .../amoro/flink/read/MixedFormatSource.java | 132 ++ .../flink/read/MixedIncrementalLoader.java | 119 ++ .../flink/read/PartitionAndNodeGroup.java | 119 ++ .../hybrid/assigner/ShuffleSplitAssigner.java | 342 ++++ .../flink/read/hybrid/assigner/Split.java | 83 + .../read/hybrid/assigner/SplitAssigner.java | 66 + .../hybrid/assigner/StaticSplitAssigner.java | 134 ++ .../AbstractMixedFormatEnumerator.java | 183 ++ .../ContinuousEnumerationResult.java | 82 + .../enumerator/ContinuousSplitPlanner.java | 43 + .../ContinuousSplitPlannerImpl.java | 127 ++ .../InitializationFinishedEvent.java | 29 + .../MergeOnReadIncrementalPlanner.java | 67 + .../enumerator/MergeOnReadPlannerImpl.java | 83 + .../MixedFormatEnumeratorOffset.java | 94 + ...MixedFormatEnumeratorOffsetSerializer.java | 91 + .../MixedFormatSourceEnumState.java | 67 + .../MixedFormatSourceEnumStateSerializer.java | 160 ++ .../MixedFormatSourceEnumerator.java | 264 +++ .../StaticMixedFormatSourceEnumerator.java | 94 + .../read/hybrid/reader/ArrayBatchRecords.java | 206 ++ .../reader/ArrayPoolDataIteratorBatcher.java | 138 ++ .../hybrid/reader/DataIteratorBatcher.java | 37 + .../reader/DataIteratorReaderFunction.java | 59 + .../read/hybrid/reader/HybridSplitReader.java | 132 ++ .../reader/MixedFormatRecordEmitter.java | 76 + .../reader/MixedFormatRecordWithOffset.java | 66 + .../reader/MixedFormatSourceReader.java | 193 ++ .../read/hybrid/reader/ReaderFunction.java | 37 + .../hybrid/reader/ReaderStartedEvent.java | 28 + .../read/hybrid/reader/RecordFactory.java | 34 + .../read/hybrid/reader/RecordPosition.java | 61 + 
.../hybrid/reader/RowDataReaderFunction.java | 217 ++ .../hybrid/reader/RowDataRecordFactory.java | 73 + .../read/hybrid/split/ChangelogSplit.java | 141 ++ .../read/hybrid/split/MergeOnReadSplit.java | 96 + .../read/hybrid/split/MixedFormatSplit.java | 85 + .../split/MixedFormatSplitSerializer.java | 98 + .../hybrid/split/MixedFormatSplitState.java | 75 + .../read/hybrid/split/SnapshotSplit.java | 113 ++ .../read/hybrid/split/SplitRequestEvent.java | 55 + .../read/hybrid/split/TemporalJoinSplits.java | 154 ++ .../internals/KafkaPartitionSplitReader.java | 499 +++++ .../flink/read/internals/KafkaSource.java | 214 ++ .../internals/KafkaSourceFetcherManager.java | 107 + .../read/internals/KafkaSourceReader.java | 181 ++ .../metrics/KafkaConsumerMetricConstants.java | 33 + .../read/source/ChangeLogDataIterator.java | 235 +++ .../amoro/flink/read/source/DataIterator.java | 199 ++ .../flink/read/source/FileScanTaskReader.java | 35 + .../read/source/FlinkKeyedMORDataReader.java | 84 + .../read/source/FlinkUnkyedDataReader.java | 128 ++ .../read/source/MergeOnReadDataIterator.java | 132 ++ .../read/source/MixedFormatScanContext.java | 378 ++++ .../read/source/log/LogSourceHelper.java | 250 +++ .../log/kafka/LogKafkaPartitionSplit.java | 85 + .../kafka/LogKafkaPartitionSplitReader.java | 443 ++++ .../kafka/LogKafkaPartitionSplitState.java | 118 ++ .../log/kafka/LogKafkaRecordEmitter.java | 44 + .../read/source/log/kafka/LogKafkaSource.java | 161 ++ .../log/kafka/LogKafkaSourceBuilder.java | 578 ++++++ .../log/kafka/LogKafkaSourceReader.java | 77 + .../log/kafka/LogRecordWithRetractInfo.java | 115 ++ .../flink/shuffle/ReadShuffleRulePolicy.java | 120 ++ .../shuffle/RoundRobinShuffleRulePolicy.java | 235 +++ .../amoro/flink/shuffle/ShuffleHelper.java | 155 ++ .../amoro/flink/shuffle/ShuffleKey.java | 33 + .../flink/shuffle/ShuffleRulePolicy.java | 62 + .../apache/amoro/flink/table/FlinkSource.java | 316 +++ .../amoro/flink/table/LogDynamicSource.java | 230 +++ 
.../flink/table/MixedDynamicTableFactory.java | 265 +++ .../flink/table/MixedFormatDynamicSink.java | 113 ++ .../flink/table/MixedFormatDynamicSource.java | 384 ++++ .../flink/table/MixedFormatFileSource.java | 244 +++ .../flink/table/MixedFormatTableLoader.java | 152 ++ .../apache/amoro/flink/table/OptionsUtil.java | 64 + .../table/UnifiedDynamicTableFactory.java | 124 ++ .../UnkeyedInputFormatOperatorFactory.java | 67 + .../UnkeyedInputFormatSourceFunction.java | 191 ++ .../descriptors/MixedFormatValidator.java | 349 ++++ .../util/CompatibleFlinkPropertyUtil.java | 158 ++ .../amoro/flink/util/DateTimeUtils.java | 1797 +++++++++++++++++ .../apache/amoro/flink/util/FilterUtil.java | 45 + .../flink/util/FlinkClassReflectionUtil.java | 65 + .../flink/util/IcebergAndFlinkFilters.java | 49 + .../amoro/flink/util/IcebergClassUtil.java | 214 ++ .../apache/amoro/flink/util/LookupUtil.java | 36 + .../amoro/flink/util/MixedFormatUtils.java | 276 +++ .../apache/amoro/flink/util/Projection.java | 430 ++++ .../apache/amoro/flink/util/ProxyUtil.java | 67 + .../amoro/flink/util/ReflectionUtil.java | 56 + .../amoro/flink/util/ThreadLocalCache.java | 90 + .../write/AdaptHiveFlinkAppenderFactory.java | 276 +++ .../write/AutomaticDoubleWriteStatus.java | 96 + .../amoro/flink/write/AutomaticLogWriter.java | 142 ++ .../write/AutomaticWriteSpecification.java | 76 + .../flink/write/FlinkBaseTaskWriter.java | 72 + .../flink/write/FlinkChangeTaskWriter.java | 136 ++ .../apache/amoro/flink/write/FlinkSink.java | 444 ++++ .../flink/write/FlinkTaskWriterBuilder.java | 289 +++ .../flink/write/MixedFormatFileWriter.java | 231 +++ .../flink/write/MixedFormatLogWriter.java | 28 + .../MixedFormatRowDataTaskWriterFactory.java | 77 + .../amoro/flink/write/MixedFormatWriter.java | 218 ++ .../write/hidden/AbstractHiddenLogWriter.java | 240 +++ .../write/hidden/GlobalFlipCommitter.java | 272 +++ .../flink/write/hidden/HiddenLogWriter.java | 70 + .../flink/write/hidden/LogMsgFactory.java | 61 + 
.../hidden/MixedFormatLogPartitioner.java | 63 + .../hidden/kafka/HiddenKafkaFactory.java | 49 + .../hidden/kafka/HiddenKafkaProducer.java | 194 ++ .../flink/DynamicTableSourceTestBase.java | 96 + .../amoro/flink/FlinkTableTestBase.java | 108 + .../org/apache/amoro/flink/FlinkTestBase.java | 324 +++ .../amoro/flink/TestFlinkSchemaUtil.java | 60 + .../catalog/FlinkAmoroCatalogITCase.java | 154 ++ .../flink/catalog/FlinkCatalogContext.java | 133 ++ .../catalog/FlinkUnifiedCatalogITCase.java | 138 ++ .../catalog/TestFlinkUnifiedCatalogs.java | 169 ++ .../amoro/flink/catalog/TestMixedCatalog.java | 589 ++++++ .../TestMixedCatalogTablePartitions.java | 223 ++ .../kafka/testutils/KafkaConfigGenerate.java | 81 + .../kafka/testutils/KafkaContainerTest.java | 137 ++ .../flink/kafka/testutils/KafkaUtil.java | 186 ++ .../kafka/testutils/SuccessException.java | 24 + .../lookup/ByteArraySetSerializerTest.java | 87 + .../amoro/flink/lookup/TestKVTable.java | 584 ++++++ .../TestRowDataPredicateAllFieldTypes.java | 258 +++ .../filter/TestRowDataPredicateBase.java | 111 + ...TestRowDataPredicateExpressionVisitor.java | 163 ++ .../amoro/flink/read/TestFlinkSource.java | 304 +++ .../flink/read/TestFlinkSplitPlanner.java | 72 + .../flink/read/TestMixedFormatSource.java | 1128 +++++++++++ .../read/hidden/kafka/TestKafkaConsumer.java | 150 ++ .../hidden/kafka/TestKafkaSourceReader.java | 266 +++ .../TestLogKafkaPartitionSplitReader.java | 306 +++ .../assigner/TestShuffleSplitAssigner.java | 257 +++ .../assigner/TestSplitAssignerAwaiting.java | 126 ++ .../assigner/TestStaticSplitAssigner.java | 87 + .../TestContinuousSplitPlannerImpl.java | 173 ++ ...tMixedFormatSourceEnumStateSerializer.java | 95 + .../TestMixedFormatSourceEnumerator.java | 295 +++ .../TestTemporalJoinSplitsThreadSafe.java | 107 + .../reader/MixedIncrementalLoaderTest.java | 172 ++ .../reader/TestRowDataReaderFunction.java | 391 ++++ .../split/TestMixedFormatSplitSerializer.java | 89 + 
.../amoro/flink/shuffle/TestLogRecordV1.java | 143 ++ .../TestRoundRobinShuffleRulePolicy.java | 173 ++ .../flink/table/AmoroCatalogITCaseBase.java | 124 ++ .../amoro/flink/table/CatalogITCaseBase.java | 105 + .../amoro/flink/table/LookupITCase.java | 189 ++ .../apache/amoro/flink/table/TestJoin.java | 367 ++++ .../apache/amoro/flink/table/TestKeyed.java | 1164 +++++++++++ .../flink/table/TestLookupSecondary.java | 191 ++ .../amoro/flink/table/TestTableRefresh.java | 88 + .../apache/amoro/flink/table/TestUnkeyed.java | 1052 ++++++++++ .../flink/table/TestUnkeyedOverwrite.java | 208 ++ .../amoro/flink/table/TestWatermark.java | 259 +++ .../amoro/flink/util/ClassLoaderUtils.java | 293 +++ .../org/apache/amoro/flink/util/DataUtil.java | 155 ++ .../util/MixedFormatMockEnvironment.java | 80 + .../flink/util/MockEnvironmentBuilder.java | 209 ++ .../util/TestCompatibleFlinkPropertyUtil.java | 56 + .../util/TestGlobalAggregateManager.java | 50 + .../TestOneInputStreamOperatorIntern.java | 105 + .../amoro/flink/util/TestProjection.java | 148 ++ .../org/apache/amoro/flink/util/TestUtil.java | 71 + .../flink/write/FlinkTaskWriterBaseTest.java | 167 ++ .../write/MixedFormatFileWriterITCase.java | 311 +++ .../flink/write/TestAdaptHiveWriter.java | 330 +++ .../write/TestAutomaticDoubleWriteStatus.java | 70 + .../flink/write/TestAutomaticLogWriter.java | 429 ++++ .../amoro/flink/write/TestFlinkSink.java | 246 +++ .../write/TestMixedFormatFileCommitter.java | 151 ++ .../write/TestMixedFormatFileWriter.java | 327 +++ .../flink/write/hidden/kafka/TestBaseLog.java | 197 ++ .../hidden/kafka/TestHiddenKafkaProducer.java | 195 ++ .../hidden/kafka/TestHiddenLogOperators.java | 475 +++++ .../iceberg/flink/MiniClusterResource.java | 46 + .../pom.xml | 349 ++++ .../data/AdaptHiveFlinkParquetReaders.java | 873 ++++++++ .../data/AdaptHiveFlinkParquetWriters.java | 599 ++++++ ...daptHiveParquetWithFlinkSchemaVisitor.java | 231 +++ .../source/RowDataFileScanTaskReader.java | 247 +++ 
.../iceberg/flink/source/ScanContext.java | 707 +++++++ .../data/AdaptHiveFlinkParquetReaders.java | 6 +- .../source/RowDataFileScanTaskReader.java | 4 +- .../read/AdaptHiveFlinkParquetReaders.java | 6 +- .../hybrid/reader/RowDataRecordFactory.java | 13 +- .../iceberg/flink/MiniClusterResource.java | 46 + amoro-format-mixed/amoro-mixed-flink/pom.xml | 3 + .../v1.17/amoro-mixed-flink-1.17/pom.xml | 5 +- .../amoro-mixed-flink-runtime-1.17/pom.xml | 1 + .../v1.18/amoro-mixed-flink-1.18/pom.xml | 1 + .../amoro-mixed-flink-runtime-1.18/pom.xml | 1 + .../AdaptHiveGenericParquetReaders.java | 6 +- .../parquet/AdaptHiveParquetReader.java | 6 +- .../parquet/AdaptHiveParquetSchemaUtil.java | 4 +- .../spark/reader/SparkParquetReaders.java | 8 +- .../spark/reader/SparkParquetReaders.java | 8 +- .../spark/reader/SparkParquetReaders.java | 8 +- pom.xml | 11 +- 244 files changed, 44783 insertions(+), 37 deletions(-) create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/pom.xml create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/FlinkSchemaUtil.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/InternalCatalogBuilder.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/FlinkUnifiedCatalog.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/MixedCatalog.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/CatalogFactoryOptions.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/FlinkUnifiedCatalogFactory.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/iceberg/IcebergFlinkCatalogFactory.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/mixed/MixedHiveCatalogFactory.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/mixed/MixedIcebergCatalogFactory.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/paimon/PaimonFlinkCatalogFactory.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/FlinkTablePropertiesInvocationHandler.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/KerberosInterceptor.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/KerberosInvocationHandler.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/ProxyFactory.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/BasicLookupFunction.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/BinaryRowDataSerializerWrapper.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/ByteArraySetSerializer.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/ByteArrayWrapper.java create mode 
100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KVTable.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KVTableFactory.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KeyRowData.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupMetrics.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupOptions.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupRecord.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/MixedFormatRowDataLookupFunction.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBCacheState.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBRecordState.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBSetSpilledState.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBSetState.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RowDataStateFactory.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/SecondaryIndexTable.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/TableFactory.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/UniqueIndexTable.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/filter/RowDataPredicate.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/filter/RowDataPredicateExpressionVisitor.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/metric/MetricConstant.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/metric/MetricsGenerator.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/planner/calcite/FlinkTypeSystem.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/AdaptHiveFlinkParquetReaders.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/FlinkSplitPlanner.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/MixedFormatSource.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/MixedIncrementalLoader.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/PartitionAndNodeGroup.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/ShuffleSplitAssigner.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/Split.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/SplitAssigner.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/StaticSplitAssigner.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/AbstractMixedFormatEnumerator.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousEnumerationResult.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousSplitPlanner.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousSplitPlannerImpl.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/InitializationFinishedEvent.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MergeOnReadIncrementalPlanner.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MergeOnReadPlannerImpl.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatEnumeratorOffset.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatEnumeratorOffsetSerializer.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumState.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumStateSerializer.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumerator.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/StaticMixedFormatSourceEnumerator.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ArrayBatchRecords.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ArrayPoolDataIteratorBatcher.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/DataIteratorBatcher.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/DataIteratorReaderFunction.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/HybridSplitReader.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatRecordEmitter.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatRecordWithOffset.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatSourceReader.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ReaderFunction.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ReaderStartedEvent.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RecordFactory.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RecordPosition.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataReaderFunction.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataRecordFactory.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/ChangelogSplit.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MergeOnReadSplit.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplit.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplitSerializer.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplitState.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/SnapshotSplit.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/SplitRequestEvent.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/TemporalJoinSplits.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaPartitionSplitReader.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSource.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSourceFetcherManager.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSourceReader.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/metrics/KafkaConsumerMetricConstants.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/ChangeLogDataIterator.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/DataIterator.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/FileScanTaskReader.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/FlinkKeyedMORDataReader.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/FlinkUnkyedDataReader.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/MergeOnReadDataIterator.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/MixedFormatScanContext.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/LogSourceHelper.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplit.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplitReader.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplitState.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaRecordEmitter.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSource.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSourceBuilder.java create 
mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSourceReader.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogRecordWithRetractInfo.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ReadShuffleRulePolicy.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/RoundRobinShuffleRulePolicy.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleHelper.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleKey.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleRulePolicy.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/FlinkSource.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/LogDynamicSource.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedDynamicTableFactory.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatDynamicSink.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatDynamicSource.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatFileSource.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatTableLoader.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/OptionsUtil.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnifiedDynamicTableFactory.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnkeyedInputFormatOperatorFactory.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnkeyedInputFormatSourceFunction.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/descriptors/MixedFormatValidator.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/CompatibleFlinkPropertyUtil.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/DateTimeUtils.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/FilterUtil.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/FlinkClassReflectionUtil.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/IcebergAndFlinkFilters.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/IcebergClassUtil.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/LookupUtil.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/MixedFormatUtils.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/Projection.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ProxyUtil.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ReflectionUtil.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ThreadLocalCache.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AdaptHiveFlinkAppenderFactory.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticDoubleWriteStatus.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticLogWriter.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticWriteSpecification.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkBaseTaskWriter.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkChangeTaskWriter.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkSink.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkTaskWriterBuilder.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatFileWriter.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatLogWriter.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatRowDataTaskWriterFactory.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatWriter.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/AbstractHiddenLogWriter.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/GlobalFlipCommitter.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/HiddenLogWriter.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/LogMsgFactory.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/MixedFormatLogPartitioner.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/kafka/HiddenKafkaFactory.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/kafka/HiddenKafkaProducer.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/DynamicTableSourceTestBase.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/FlinkTableTestBase.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/FlinkTestBase.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/TestFlinkSchemaUtil.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkAmoroCatalogITCase.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkCatalogContext.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkUnifiedCatalogITCase.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestFlinkUnifiedCatalogs.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestMixedCatalog.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestMixedCatalogTablePartitions.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaConfigGenerate.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaContainerTest.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaUtil.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/SuccessException.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/ByteArraySetSerializerTest.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/TestKVTable.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateAllFieldTypes.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateBase.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateExpressionVisitor.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestFlinkSource.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestFlinkSplitPlanner.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestMixedFormatSource.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestKafkaConsumer.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestKafkaSourceReader.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestLogKafkaPartitionSplitReader.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestShuffleSplitAssigner.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestSplitAssignerAwaiting.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestStaticSplitAssigner.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestContinuousSplitPlannerImpl.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestMixedFormatSourceEnumStateSerializer.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestMixedFormatSourceEnumerator.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestTemporalJoinSplitsThreadSafe.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/reader/MixedIncrementalLoaderTest.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/reader/TestRowDataReaderFunction.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/split/TestMixedFormatSplitSerializer.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/shuffle/TestLogRecordV1.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/shuffle/TestRoundRobinShuffleRulePolicy.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/AmoroCatalogITCaseBase.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/CatalogITCaseBase.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/LookupITCase.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestJoin.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestKeyed.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestLookupSecondary.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestTableRefresh.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestUnkeyed.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestUnkeyedOverwrite.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestWatermark.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/ClassLoaderUtils.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/DataUtil.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/MixedFormatMockEnvironment.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/MockEnvironmentBuilder.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestCompatibleFlinkPropertyUtil.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestGlobalAggregateManager.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestOneInputStreamOperatorIntern.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestProjection.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestUtil.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/FlinkTaskWriterBaseTest.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/MixedFormatFileWriterITCase.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAdaptHiveWriter.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAutomaticDoubleWriteStatus.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAutomaticLogWriter.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestFlinkSink.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestMixedFormatFileCommitter.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestMixedFormatFileWriter.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestBaseLog.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestHiddenKafkaProducer.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestHiddenLogOperators.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/pom.xml create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetReaders.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetWriters.java create mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveParquetWithFlinkSchemaVisitor.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/source/ScanContext.java create mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java diff --git a/amoro-format-iceberg/src/test/java/org/apache/amoro/optimizing/IcebergRewriteExecutorTest.java b/amoro-format-iceberg/src/test/java/org/apache/amoro/optimizing/IcebergRewriteExecutorTest.java index 37b79036cb..e854990c30 100644 --- a/amoro-format-iceberg/src/test/java/org/apache/amoro/optimizing/IcebergRewriteExecutorTest.java +++ b/amoro-format-iceberg/src/test/java/org/apache/amoro/optimizing/IcebergRewriteExecutorTest.java @@ -65,6 +65,8 @@ @RunWith(Parameterized.class) public class IcebergRewriteExecutorTest extends TableTestBase { + private static final int POSITION_DELETE_FILE_PATH_ID = 2147483546; + private static final int POSITION_DELETE_POS_ID = 2147483545; private final FileFormat fileFormat; @@ -72,8 +74,11 @@ public class IcebergRewriteExecutorTest extends TableTestBase { private RewriteFilesInput dataScanTask; + // When GenericParquetReaders.buildReader() is called with a schema containing _file / _pos, + // the TypeWithSchemaVisitor in Iceberg does strict schema matching against the Parquet file + // schema private final Schema posSchema = - new Schema(MetadataColumns.FILE_PATH, MetadataColumns.ROW_POSITION); + new Schema(MetadataColumns.DELETE_FILE_PATH, MetadataColumns.DELETE_FILE_POS); public IcebergRewriteExecutorTest(boolean hasPartition, FileFormat fileFormat) { super( @@ -109,6 +114,9 
@@ private StructLike getPartitionData() { @Before public void initDataAndReader() throws IOException { + Assert.assertEquals(POSITION_DELETE_FILE_PATH_ID, MetadataColumns.DELETE_FILE_PATH.fieldId()); + Assert.assertEquals(POSITION_DELETE_POS_ID, MetadataColumns.DELETE_FILE_POS.fieldId()); + StructLike partitionData = getPartitionData(); OutputFileFactory outputFileFactory = OutputFileFactory.builderFor(getMixedTable().asUnkeyedTable(), 0, 1) diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/pom.xml b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/pom.xml new file mode 100644 index 0000000000..7e2dc9859b --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/pom.xml @@ -0,0 +1,453 @@ + + + + 4.0.0 + + org.apache.amoro + amoro-mixed-flink + 0.9-SNAPSHOT + ../pom.xml + + + amoro-mixed-flink-common-1.17 + + jar + Amoro Project Mixed Format Flink Common + https://amoro.apache.org + + + 3.21.0 + 1.17.2 + 1.17.2 + 3.0.2-1.17 + 1.6.1 + + + + + + org.apache.amoro + amoro-format-iceberg + + + org.ow2.asm + asm + + + + + + org.apache.amoro + amoro-mixed-hive + + + + org.apache.amoro + amoro-format-mixed-flink-common-iceberg-bridge-1.17 + ${project.version} + + + + org.apache.iceberg + iceberg-flink-1.17 + ${iceberg.version} + provided + + + org.slf4j + slf4j-api + + + org.apache.parquet + parquet-column + + + org.apache.parquet + parquet-avro + + + + + + org.apache.paimon + paimon-flink-1.17 + ${paimon.version} + provided + + + + org.apache.amoro + amoro-format-mixed-flink-common-format + ${project.parent.version} + + + + cglib + cglib + + + + com.google.code.gson + gson + ${gson.version} + + + + + org.apache.flink + flink-connector-files + ${flink.version} + provided + + + + org.apache.flink + flink-connector-kafka + ${flink-kafka.version} + provided + + + + org.apache.flink + flink-json + ${flink.version} + provided + + + + org.apache.flink + 
flink-hadoop-compatibility_${flink.scala.binary.version} + ${flink.version} + provided + + + + org.apache.flink + flink-table-api-java-bridge + ${flink.version} + provided + + + org.slf4j + slf4j-api + + + + + org.apache.flink + flink-metrics-dropwizard + ${flink.version} + provided + + + + + org.apache.flink + flink-orc + ${flink.version} + provided + + + org.slf4j + slf4j-api + + + + + + org.apache.flink + flink-parquet + ${flink.version} + provided + + + org.apache.parquet + parquet-hadoop + + + + + + org.apache.flink + flink-table-runtime + ${flink.version} + provided + + + org.slf4j + slf4j-api + + + + + + org.apache.flink + flink-table-planner_${flink.scala.binary.version} + ${flink.version} + provided + + + org.slf4j + slf4j-api + + + + + + + com.fasterxml.jackson.core + jackson-databind + ${jackson.vesion} + provided + + + + + + org.apache.iceberg + iceberg-flink-1.17 + ${iceberg.version} + tests + test + + + org.slf4j + slf4j-api + + + + + + org.apache.flink + flink-runtime + ${flink.version} + tests + test + + + org.slf4j + slf4j-api + + + + + + org.apache.flink + flink-streaming-java + ${flink.version} + tests + test + + + org.slf4j + slf4j-api + + + + + + org.apache.flink + flink-clients + ${flink.version} + test + + + org.slf4j + slf4j-api + + + + + + org.apache.flink + flink-test-utils + ${flink.version} + test + + + org.apache.logging.log4j + log4j-slf4j-impl + + + org.slf4j + slf4j-api + + + com.google.guava + guava + + + + + + org.apache.flink + flink-connector-test-utils + ${flink.version} + test + + + + org.apache.iceberg + iceberg-hive-metastore + ${iceberg.version} + tests + test + + + + org.apache.amoro + amoro-common + ${project.version} + tests + test + + + + org.apache.amoro + amoro-format-iceberg + ${project.version} + test-jar + test + + + + org.apache.amoro + amoro-mixed-hive + ${project.version} + tests + test + + + + org.apache.amoro + amoro-format-paimon + ${project.version} + tests + test + + + + org.apache.amoro + 
amoro-format-paimon + ${project.version} + test + + + + org.apache.flink + flink-metrics-jmx + ${flink.version} + test + + + + org.apache.flink + flink-runtime-web + ${flink.version} + test + + + + + org.apache.flink + flink-table-planner_${flink.scala.binary.version} + ${flink.version} + test-jar + test + + + org.slf4j + slf4j-api + + + + + + + org.apache.curator + curator-test + 2.12.0 + test + + + com.google.guava + guava + + + + + + org.testcontainers + kafka + ${testcontainers.version} + test + + + + org.testcontainers + junit-jupiter + ${testcontainers.version} + test + + + + org.assertj + assertj-core + ${assertj.version} + test + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + + + listener + org.apache.amoro.listener.AmoroRunListener + + + -verbose:class + + + + org.apache.maven.plugins + maven-jar-plugin + + + + test-jar + + + + + + org.jacoco + jacoco-maven-plugin + + ${jacoco.flink.skip} + + + + + diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/FlinkSchemaUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/FlinkSchemaUtil.java new file mode 100644 index 0000000000..bb5bd93111 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/FlinkSchemaUtil.java @@ -0,0 +1,438 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink; + +import static org.apache.flink.table.descriptors.DescriptorProperties.DATA_TYPE; +import static org.apache.flink.table.descriptors.DescriptorProperties.EXPR; +import static org.apache.flink.table.descriptors.DescriptorProperties.NAME; +import static org.apache.flink.table.descriptors.DescriptorProperties.WATERMARK_ROWTIME; +import static org.apache.flink.table.descriptors.DescriptorProperties.WATERMARK_STRATEGY_DATA_TYPE; +import static org.apache.flink.table.descriptors.DescriptorProperties.WATERMARK_STRATEGY_EXPR; +import static org.apache.flink.table.descriptors.Schema.SCHEMA_PROCTIME; + +import org.apache.amoro.flink.table.FlinkSource; +import org.apache.amoro.flink.table.MixedFormatDynamicSource; +import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; +import org.apache.amoro.table.MixedTable; +import org.apache.amoro.table.PrimaryKeySpec; +import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.math.NumberUtils; +import org.apache.flink.table.api.TableColumn; +import org.apache.flink.table.api.TableColumn.ComputedColumn; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.api.ValidationException; +import org.apache.flink.table.api.WatermarkSpec; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.utils.LogicalTypeParser; +import 
org.apache.flink.table.types.utils.TypeConversions; +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Types; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.function.Function; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** An util that converts flink table schema. */ +public class FlinkSchemaUtil { + + private static final Logger LOG = LoggerFactory.getLogger(FlinkSchemaUtil.class); + public static final String FLINK_PREFIX = "flink"; + + public static final String COMPUTED_COLUMNS = "computed-column"; + + public static final String WATERMARK = "watermark"; + public static final String PROCTIME_FUNCTION = SCHEMA_PROCTIME + "()"; + public static final Pattern COMPUTE_PATTERN = + Pattern.compile("flink\\.computed-column\\.(\\d+)\\.name"); + + /** + * Convert iceberg Schema to flink TableSchema. + * + * @param icebergSchema + * @param tableProperties + * @return Flink TableSchema + */ + public static TableSchema toSchema( + Schema icebergSchema, List primaryKeys, Map tableProperties) { + TableSchema.Builder builder = TableSchema.builder(); + RowType rowType = org.apache.iceberg.flink.FlinkSchemaUtil.convert(icebergSchema); + + // add physical columns. 
+ for (RowType.RowField field : rowType.getFields()) { + builder.field(field.getName(), TypeConversions.fromLogicalToDataType(field.getType())); + } + + // add primary key + if (CollectionUtils.isNotEmpty(primaryKeys)) { + builder.primaryKey(primaryKeys.toArray(new String[0])); + } + + Set computeIndex = getComputeIndex(tableProperties); + List fieldNames = rowType.getFieldNames(); + + // add computed columns + for (int index : computeIndex) { + builder.add(deserializeComputeColumn(tableProperties, index, fieldNames)); + fieldNames.add(tableProperties.get(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, index, NAME))); + } + + // add watermark + if (isWatermarkValid(tableProperties)) { + builder.watermark(deserializeWatermarkSpec(tableProperties, fieldNames)); + } + return builder.build(); + } + + /** + * Add watermark info to help {@link FlinkSource} and {@link MixedFormatDynamicSource} distinguish + * the watermark field. For now, it only be used in the case of mixed-format table as dim-table. + */ + public static TableSchema getPhysicalSchemaForDimTable(TableSchema tableSchema) { + TableSchema.Builder builder = filter(tableSchema, TableColumn::isPhysical); + tableSchema.getWatermarkSpecs().forEach(builder::watermark); + return builder.build(); + } + + /** + * filter watermark due to watermark is a virtual field for now, not in mixed-format physical + * table. + */ + public static TableSchema filterWatermark(TableSchema tableSchema) { + List watermarkSpecs = tableSchema.getWatermarkSpecs(); + if (watermarkSpecs.isEmpty()) { + return tableSchema; + } + + Function filter = + (tableColumn) -> { + boolean isWatermark = false; + for (WatermarkSpec spec : watermarkSpecs) { + if (spec.getRowtimeAttribute().equals(tableColumn.getName())) { + isWatermark = true; + break; + } + } + return !isWatermark; + }; + return filter(tableSchema, filter).build(); + } + + /** If filter result is true, keep the column; otherwise, remove the column. 
*/ + public static TableSchema.Builder filter( + TableSchema tableSchema, Function filter) { + TableSchema.Builder builder = TableSchema.builder(); + + tableSchema + .getTableColumns() + .forEach( + tableColumn -> { + if (!filter.apply(tableColumn)) { + return; + } + builder.field(tableColumn.getName(), tableColumn.getType()); + }); + tableSchema + .getPrimaryKey() + .ifPresent( + uniqueConstraint -> + builder.primaryKey( + uniqueConstraint.getName(), + uniqueConstraint.getColumns().toArray(new String[0]))); + return builder; + } + + public static RowType toRowType(TableSchema tableSchema) { + LogicalType[] fields = new LogicalType[tableSchema.getFieldCount()]; + + for (int i = 0; i < fields.length; i++) { + TableColumn tableColumn = tableSchema.getTableColumn(i).get(); + fields[i] = tableColumn.getType().getLogicalType(); + } + return RowType.of(fields); + } + + /** + * Primary keys are the required fields to guarantee that readers can read keyed table in right + * order, due to the automatic scaling in/out of nodes. The required fields should be added even + * though projection push down + */ + @Deprecated + public static List addPrimaryKey( + List projectedColumns, MixedTable table) { + List primaryKeys = + table.isUnkeyedTable() + ? Collections.EMPTY_LIST + : table.asKeyedTable().primaryKeySpec().fields().stream() + .map(PrimaryKeySpec.PrimaryKeyField::fieldName) + .collect(Collectors.toList()); + + List columns = new ArrayList<>(projectedColumns); + Set projectedNames = new HashSet<>(); + + projectedColumns.forEach(c -> projectedNames.add(c.name())); + + primaryKeys.forEach( + pk -> { + if (!projectedNames.contains(pk)) { + columns.add(table.schema().findField(pk)); + } + }); + + LOG.info("Projected Columns after addPrimaryKey, columns:{}", columns); + return columns; + } + + /** + * Primary keys are the required fields to guarantee that readers can read keyed table in right + * order, due to the automatic scaling in/out of nodes. 
The required fields should be added even + * though projection push down + */ + @Deprecated + public static void addPrimaryKey( + TableSchema.Builder builder, + MixedTable table, + TableSchema tableSchema, + String[] projectedColumns) { + Set projectedNames = Arrays.stream(projectedColumns).collect(Collectors.toSet()); + + if (!table.isKeyedTable()) { + return; + } + + List pks = table.asKeyedTable().primaryKeySpec().fieldNames(); + pks.forEach( + pk -> { + if (projectedNames.contains(pk)) { + return; + } + builder.field( + pk, + tableSchema + .getFieldDataType(pk) + .orElseThrow( + () -> + new ValidationException( + "Mixed-format table primary key should be declared in table"))); + }); + } + + /** + * Generate table properties for watermark and computed columns from flink TableSchema. + * + * @param schema Flink TableSchema. + * @return tableProperties. + */ + public static Map generateExtraOptionsFrom(TableSchema schema) { + Map properties = Maps.newHashMap(); + + // add properties for computeColumns + Map computeColumnProperties = serializeComputeColumn(schema); + properties.putAll(computeColumnProperties); + + // add properties for watermark,only support one watermark now + List watermarkSpecs = schema.getWatermarkSpecs(); + if (!watermarkSpecs.isEmpty()) { + if (watermarkSpecs.size() > 1) { + throw new IllegalStateException("Multiple watermark definition is not supported yet."); + } + properties.putAll(serializeWatermarkSpec(watermarkSpecs.get(0))); + } + + return properties; + } + + /** Serialize compute columns into properties. 
*/ + private static Map serializeComputeColumn(TableSchema schema) { + Map serialized = new HashMap<>(); + List tableColumns = schema.getTableColumns(); + // index in compute Column, starting from 1 + int computeIndex = 1; + for (TableColumn column : tableColumns) { + if (column instanceof ComputedColumn) { + serialized.put( + compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, computeIndex, NAME), column.getName()); + serialized.put( + compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, computeIndex, DATA_TYPE), + column.getType().getLogicalType().asSerializableString()); + serialized.put( + compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, computeIndex, EXPR), + ((TableColumn.ComputedColumn) column).getExpression()); + computeIndex++; + } + } + return serialized; + } + + /** Deserialize compute columns from properties. */ + private static TableColumn deserializeComputeColumn( + Map tableProperties, int index, List fieldNames) { + String expr = tableProperties.get(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, index, EXPR)); + if (!isExprContainField(expr, fieldNames)) { + throw new IllegalStateException( + "expression " + expr + " does not match any columns in the table. 
"); + } + DataType dataType = + TypeConversions.fromLogicalToDataType( + LogicalTypeParser.parse( + tableProperties.get( + compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, index, DATA_TYPE)))); + TableColumn column = + TableColumn.computed( + tableProperties.get(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, index, NAME)), + dataType, + expr); + return column; + } + + private static boolean isExprContainField(String expr, List fieldNames) { + if (expr.equalsIgnoreCase(PROCTIME_FUNCTION)) { + return true; + } + for (String fieldName : fieldNames) { + if (expr.contains("`" + fieldName + "`")) { + return true; + } + } + return false; + } + + private static boolean isComputeValid(Map tableProperties, int index) { + // check if properties for computeColumn is valid and complete + if (StringUtils.isNotBlank( + tableProperties.get(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, index, NAME))) + && StringUtils.isNotBlank( + tableProperties.get(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, index, DATA_TYPE))) + && StringUtils.isNotBlank( + tableProperties.get(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, index, EXPR)))) { + return true; + } + LOG.warn( + "properties for computeColumn {} is incomplete, It should contain {}, {}, {}. skip to convert it into computeColumn ", + index, + NAME, + DATA_TYPE, + EXPR); + return false; + } + + private static Set getComputeIndex(Map tableProperties) { + Set computedIndex = new TreeSet<>(); + tableProperties + .keySet() + .forEach( + k -> { + Matcher matcher = COMPUTE_PATTERN.matcher(k); + if (matcher.find()) { + int indexId = NumberUtils.toInt(matcher.group(1)); + if (indexId > 0 && isComputeValid(tableProperties, indexId)) { + computedIndex.add(indexId); + } + } + }); + return computedIndex; + } + + /** Serialize watermarkSpec into properties. 
*/ + private static Map serializeWatermarkSpec(WatermarkSpec watermarkSpec) { + Map serializedWatermarkSpec = new HashMap<>(); + serializedWatermarkSpec.put( + compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_ROWTIME), + watermarkSpec.getRowtimeAttribute()); + serializedWatermarkSpec.put( + compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_STRATEGY_EXPR), + watermarkSpec.getWatermarkExpr()); + serializedWatermarkSpec.put( + compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_STRATEGY_DATA_TYPE), + watermarkSpec.getWatermarkExprOutputType().getLogicalType().asSerializableString()); + + return serializedWatermarkSpec; + } + + /** Deserialize watermarkSpec from properties. */ + private static WatermarkSpec deserializeWatermarkSpec( + Map tableProperties, List fieldNames) { + String rowtimeAttribute = + tableProperties.get(compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_ROWTIME)); + if (!fieldNames.contains(rowtimeAttribute)) { + throw new IllegalStateException( + "Watermark rowtime attribute '" + + rowtimeAttribute + + " does not match any columns in the table. 
"); + } + DataType watermarkExprOutputType = + TypeConversions.fromLogicalToDataType( + LogicalTypeParser.parse( + tableProperties.get( + compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_STRATEGY_DATA_TYPE)))); + return new WatermarkSpec( + rowtimeAttribute, + tableProperties.get(compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_STRATEGY_EXPR)), + watermarkExprOutputType); + } + + private static boolean isWatermarkValid(Map tableProperties) { + // check if properties for watermark is valid and complete + if (StringUtils.isNotBlank( + tableProperties.get(compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_ROWTIME))) + && StringUtils.isNotBlank( + tableProperties.get(compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_STRATEGY_EXPR))) + && StringUtils.isNotBlank( + tableProperties.get( + compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_STRATEGY_DATA_TYPE)))) { + return true; + } + LOG.warn( + "properties for watermark is incomplete, It should contain {}, {}, {}. skip to convert it into watermark strategy ", + WATERMARK_ROWTIME, + WATERMARK_STRATEGY_EXPR, + WATERMARK_STRATEGY_DATA_TYPE); + return false; + } + + private static String compoundKey(Object... 
components) { + return Stream.of(components).map(Object::toString).collect(Collectors.joining(".")); + } + + /** + * get physical tableSchema + * + * @param tableSchema Flink TableSchema + * @return Flink tableSchema + */ + public static TableSchema getPhysicalSchema(TableSchema tableSchema) { + TableSchema.Builder builder = filter(tableSchema, TableColumn::isPhysical); + return builder.build(); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/InternalCatalogBuilder.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/InternalCatalogBuilder.java new file mode 100644 index 0000000000..e31bc6605f --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/InternalCatalogBuilder.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink; + +import static org.apache.iceberg.CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE; +import static org.apache.iceberg.flink.FlinkCatalogFactory.HADOOP_CONF_DIR; +import static org.apache.iceberg.flink.FlinkCatalogFactory.HIVE_CONF_DIR; + +import org.apache.amoro.mixed.CatalogLoader; +import org.apache.amoro.mixed.MixedFormatCatalog; +import org.apache.amoro.properties.CatalogMetaProperties; +import org.apache.amoro.shade.guava32.com.google.common.base.Strings; +import org.apache.amoro.table.TableMetaStore; +import org.apache.amoro.utils.ConfigurationFileUtil; +import org.apache.flink.configuration.GlobalConfiguration; +import org.apache.flink.runtime.util.HadoopUtils; +import org.apache.flink.table.catalog.exceptions.CatalogException; +import org.apache.flink.util.Preconditions; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.flink.FlinkCatalogFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.Serializable; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.HashMap; +import java.util.Map; + +/** Build {@link MixedFormatCatalog}. 
*/ +public class InternalCatalogBuilder implements Serializable { + private static final Logger LOG = LoggerFactory.getLogger(InternalCatalogBuilder.class); + + private String amsUri; + private Map properties = new HashMap<>(0); + private String catalogName; + + private MixedFormatCatalog createMixedFormatCatalog() { + if (amsUri != null) { + return CatalogLoader.load(amsUri, properties); + } else { + Preconditions.checkArgument(catalogName != null, "Catalog name cannot be empty"); + String metastoreType = properties.get(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE); + Preconditions.checkArgument(metastoreType != null, "Catalog type cannot be empty"); + TableMetaStore tableMetaStore = + TableMetaStore.builder() + .withConfiguration(clusterHadoopConf(metastoreType, properties)) + .build(); + return CatalogLoader.createCatalog(catalogName, metastoreType, properties, tableMetaStore); + } + } + + public static Configuration clusterHadoopConf( + String metastoreType, Map properties) { + Configuration configuration = + HadoopUtils.getHadoopConfiguration(GlobalConfiguration.loadConfiguration()); + if (ICEBERG_CATALOG_TYPE_HIVE.equals(metastoreType)) { + String hiveConfDir = properties.get(HIVE_CONF_DIR); + String hadoopConfDir = properties.get(HADOOP_CONF_DIR); + configuration = mergeHiveConf(configuration, hiveConfDir, hadoopConfDir); + } + return configuration; + } + + private static Configuration mergeHiveConf( + Configuration hadoopConf, String hiveConfDir, String hadoopConfDir) { + Configuration newConf = new Configuration(hadoopConf); + if (!Strings.isNullOrEmpty(hiveConfDir)) { + Preconditions.checkState( + Files.exists(Paths.get(hiveConfDir, "hive-site.xml")), + "There should be a hive-site.xml file under the directory %s", + hiveConfDir); + newConf.addResource(new Path(hiveConfDir, "hive-site.xml")); + } else { + // If don't provide the hive-site.xml path explicitly, it will try to load resource from + // classpath. 
If still + // couldn't load the configuration file, then it will throw exception in HiveCatalog. + URL configFile = InternalCatalogBuilder.class.getClassLoader().getResource("hive-site.xml"); + if (configFile != null) { + newConf.addResource(configFile); + } + } + + if (!Strings.isNullOrEmpty(hadoopConfDir)) { + java.nio.file.Path hdfsSiteFile = Paths.get(hadoopConfDir, "hdfs-site.xml"); + Preconditions.checkState( + Files.exists(hdfsSiteFile), + "Failed to load Hadoop configuration: missing %s", + hdfsSiteFile); + newConf.addResource(new Path(hadoopConfDir, "hdfs-site.xml")); + java.nio.file.Path coreSiteFile = Paths.get(hadoopConfDir, "core-site.xml"); + Preconditions.checkState( + Files.exists(coreSiteFile), + "Failed to load Hadoop configuration: missing %s", + coreSiteFile); + newConf.addResource(new Path(hadoopConfDir, "core-site.xml")); + } + + return newConf; + } + + public String getAmsUri() { + return amsUri; + } + + public Map getProperties() { + return properties; + } + + public InternalCatalogBuilder() {} + + public static InternalCatalogBuilder builder() { + return new InternalCatalogBuilder(); + } + + public MixedFormatCatalog build() { + return createMixedFormatCatalog(); + } + + public InternalCatalogBuilder amsUri(String amsUri) { + this.amsUri = amsUri; + return this; + } + + public InternalCatalogBuilder properties(Map properties) { + Map finalProperties = new HashMap<>(); + for (Map.Entry property : properties.entrySet()) { + String key = property.getKey(); + String value = property.getValue(); + switch (key) { + case CatalogMetaProperties.AUTH_CONFIGS_KEY_KEYTAB_PATH: + try { + finalProperties.put( + CatalogMetaProperties.AUTH_CONFIGS_KEY_KEYTAB, + ConfigurationFileUtil.encodeConfigurationFileWithBase64(value)); + } catch (IOException e) { + LOG.error("encode keytab file failed", e); + throw new CatalogException("encode keytab file failed", e); + } + break; + case CatalogMetaProperties.AUTH_CONFIGS_KEY_KEYTAB_ENCODE: + 
finalProperties.put(CatalogMetaProperties.AUTH_CONFIGS_KEY_KEYTAB, value); + break; + case CatalogMetaProperties.AUTH_CONFIGS_KEY_KRB_PATH: + try { + finalProperties.put( + CatalogMetaProperties.AUTH_CONFIGS_KEY_KRB5, + ConfigurationFileUtil.encodeConfigurationFileWithBase64(value)); + } catch (IOException e) { + LOG.error("encode krb5 file failed", e); + throw new CatalogException("encode krb5 file failed", e); + } + break; + case CatalogMetaProperties.AUTH_CONFIGS_KEY_KRB_ENCODE: + finalProperties.put(CatalogMetaProperties.AUTH_CONFIGS_KEY_KRB5, value); + break; + default: + finalProperties.put(key, value); + break; + } + } + this.properties = finalProperties; + return this; + } + + public InternalCatalogBuilder catalogName(String catalogName) { + this.catalogName = catalogName; + return this; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/FlinkUnifiedCatalog.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/FlinkUnifiedCatalog.java new file mode 100644 index 0000000000..c56bb6c94b --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/FlinkUnifiedCatalog.java @@ -0,0 +1,550 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.catalog; + +import static org.apache.amoro.Constants.THRIFT_TABLE_SERVICE_NAME; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.TABLE_FORMAT; + +import org.apache.amoro.AlreadyExistsException; +import org.apache.amoro.AmoroTable; +import org.apache.amoro.NoSuchDatabaseException; +import org.apache.amoro.NoSuchTableException; +import org.apache.amoro.TableFormat; +import org.apache.amoro.UnifiedCatalog; +import org.apache.amoro.client.AmsThriftUrl; +import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions; +import org.apache.amoro.flink.catalog.factories.FlinkUnifiedCatalogFactory; +import org.apache.amoro.flink.catalog.factories.iceberg.IcebergFlinkCatalogFactory; +import org.apache.amoro.flink.catalog.factories.mixed.MixedHiveCatalogFactory; +import org.apache.amoro.flink.catalog.factories.mixed.MixedIcebergCatalogFactory; +import org.apache.amoro.flink.catalog.factories.paimon.PaimonFlinkCatalogFactory; +import org.apache.amoro.flink.table.UnifiedDynamicTableFactory; +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; +import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; +import org.apache.amoro.table.TableIdentifier; +import org.apache.amoro.table.TableMetaStore; +import org.apache.amoro.utils.CatalogUtil; +import org.apache.amoro.utils.MixedFormatCatalogUtil; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.catalog.AbstractCatalog; +import org.apache.flink.table.catalog.CatalogBaseTable; 
+import org.apache.flink.table.catalog.CatalogDatabase; +import org.apache.flink.table.catalog.CatalogFunction; +import org.apache.flink.table.catalog.CatalogPartition; +import org.apache.flink.table.catalog.CatalogPartitionSpec; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.exceptions.CatalogException; +import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; +import org.apache.flink.table.catalog.exceptions.FunctionNotExistException; +import org.apache.flink.table.catalog.exceptions.PartitionAlreadyExistsException; +import org.apache.flink.table.catalog.exceptions.PartitionNotExistException; +import org.apache.flink.table.catalog.exceptions.PartitionSpecInvalidException; +import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; +import org.apache.flink.table.catalog.exceptions.TablePartitionedException; +import org.apache.flink.table.catalog.stats.CatalogColumnStatistics; +import org.apache.flink.table.catalog.stats.CatalogTableStatistics; +import org.apache.flink.table.expressions.Expression; +import org.apache.flink.table.factories.CatalogFactory; +import org.apache.flink.table.factories.Factory; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; + +/** This is a Flink catalog wrap a unified catalog. */ +public class FlinkUnifiedCatalog extends AbstractCatalog { + + private final UnifiedCatalog unifiedCatalog; + private final String amsUri; + private final String amoroCatalogName; + /** + * Available Flink catalogs for Unified Catalog. + * + *

May include: Iceberg, Mixed and Paimon Catalogs, etc. + */ + private Map availableCatalogs; + + private final CatalogFactory.Context context; + private final org.apache.hadoop.conf.Configuration hadoopConf; + + public FlinkUnifiedCatalog( + String amsUri, + String defaultDatabase, + UnifiedCatalog unifiedCatalog, + CatalogFactory.Context context, + org.apache.hadoop.conf.Configuration hadoopConf) { + super(context.getName(), defaultDatabase); + this.amsUri = amsUri; + this.amoroCatalogName = AmsThriftUrl.parse(amsUri, THRIFT_TABLE_SERVICE_NAME).catalogName(); + this.unifiedCatalog = unifiedCatalog; + this.context = context; + this.hadoopConf = hadoopConf; + } + + @Override + public void open() throws CatalogException { + availableCatalogs = Maps.newHashMap(); + } + + @Override + public void close() throws CatalogException { + if (availableCatalogs != null) { + availableCatalogs.forEach((tableFormat, catalog) -> catalog.close()); + } + } + + @Override + public List listDatabases() { + return unifiedCatalog.listDatabases(); + } + + @Override + public CatalogDatabase getDatabase(String databaseName) { + throw new UnsupportedOperationException("Unsupported operation: get database."); + } + + @Override + public boolean databaseExists(String databaseName) { + return unifiedCatalog.databaseExists(databaseName); + } + + @Override + public void createDatabase(String name, CatalogDatabase database, boolean ignoreIfExists) + throws DatabaseAlreadyExistException { + try { + unifiedCatalog.createDatabase(name); + } catch (AlreadyExistsException e) { + if (!ignoreIfExists) { + throw new DatabaseAlreadyExistException(getName(), name); + } + } + } + + @Override + public void dropDatabase(String name, boolean ignoreIfNotExists, boolean cascade) + throws DatabaseNotExistException { + try { + unifiedCatalog.dropDatabase(name); + } catch (NoSuchDatabaseException e) { + if (!ignoreIfNotExists) { + throw new DatabaseNotExistException(getName(), name); + } + } + } + + @Override + public 
void alterDatabase(String name, CatalogDatabase newDatabase, boolean ignoreIfNotExists) { + throw new UnsupportedOperationException("Unsupported operation: alter database."); + } + + @Override + public List listTables(String databaseName) { + return unifiedCatalog.listTables(databaseName).stream() + .map(table -> table.getIdentifier().getTableName()) + .collect(java.util.stream.Collectors.toList()); + } + + @Override + public List listViews(String databaseName) { + return Collections.emptyList(); + } + + @Override + public CatalogBaseTable getTable(ObjectPath tablePath) + throws TableNotExistException, CatalogException { + TableIdentifier tableIdentifier = + TableIdentifier.of( + this.amoroCatalogName, tablePath.getDatabaseName(), tablePath.getObjectName()); + Set formats = + CatalogUtil.tableFormats(unifiedCatalog.metastoreType(), unifiedCatalog.properties()); + + TableMetaStore tableMetaStore = unifiedCatalog.authenticationContext(); + return formats.stream() + .map( + f -> { + try { + AbstractCatalog catalog = + getOriginalCatalog(f) + .orElseGet(() -> createOriginalCatalog(tableIdentifier, f)); + CatalogTable catalogTable = + (CatalogTable) tableMetaStore.doAs(() -> catalog.getTable(tablePath)); + final Map flinkProperties = + Maps.newHashMap(catalogTable.getOptions()); + flinkProperties.put(TABLE_FORMAT.key(), f.toString()); + return CatalogTable.of( + catalogTable.getUnresolvedSchema(), + catalogTable.getComment(), + catalogTable.getPartitionKeys(), + flinkProperties); + } catch (RuntimeException e) { + // only handle no such table case + if (e.getCause() instanceof TableNotExistException + || e.getCause() instanceof NoSuchTableException) { + return null; + } else { + throw e; + } + } + }) + .filter(Objects::nonNull) + .findFirst() + .orElseThrow(() -> new TableNotExistException(getName(), tablePath)); + } + + @Override + public boolean tableExists(ObjectPath tablePath) { + try { + return unifiedCatalog.tableExists(tablePath.getDatabaseName(), 
tablePath.getObjectName()); + } catch (NoSuchDatabaseException | NoSuchTableException e) { + return false; + } + } + + @Override + public void dropTable(ObjectPath tablePath, boolean ignoreIfNotExists) + throws TableNotExistException { + try { + unifiedCatalog.dropTable(tablePath.getDatabaseName(), tablePath.getObjectName(), true); + } catch (NoSuchTableException e) { + if (!ignoreIfNotExists) { + throw new TableNotExistException(getName(), tablePath); + } + } + } + + @Override + public void renameTable(ObjectPath tablePath, String newTableName, boolean ignoreIfNotExists) + throws TableNotExistException, TableAlreadyExistException, CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + catalog.renameTable(tablePath, newTableName, ignoreIfNotExists); + } + + @Override + public void createTable(ObjectPath tablePath, CatalogBaseTable table, boolean ignoreIfExists) + throws TableAlreadyExistException, DatabaseNotExistException, CatalogException { + Configuration configuration = new Configuration(); + table.getOptions().forEach(configuration::setString); + unifiedCatalog.refresh(); + table + .getOptions() + .putAll( + MixedFormatCatalogUtil.mergePersistedCatalogPropertiesToTable( + table.getOptions(), unifiedCatalog.properties())); + TableFormat format = TableFormat.valueOf(configuration.get(TABLE_FORMAT)); + TableIdentifier tableIdentifier = + TableIdentifier.of( + unifiedCatalog.name(), tablePath.getDatabaseName(), tablePath.getObjectName()); + String errorMessage = + String.format( + "Can't decide table format of table %s, Please specify 'table.format' " + + "in table properties", + tableIdentifier); + + Preconditions.checkNotNull(format, errorMessage); + try { + unifiedCatalog.loadTable(tableIdentifier.getDatabase(), tableIdentifier.getTableName()); + if (!ignoreIfExists) { + throw new TableAlreadyExistException(getName(), tablePath); + } + return; + } catch (NoSuchTableException e) { + // do nothing + } + + final TableFormat catalogFormat = 
format; + AbstractCatalog catalog = + getOriginalCatalog(format) + .orElseGet(() -> createOriginalCatalog(tableIdentifier, catalogFormat)); + catalog.createTable(tablePath, table, ignoreIfExists); + } + + @Override + public void alterTable(ObjectPath tablePath, CatalogBaseTable newTable, boolean ignoreIfNotExists) + throws TableNotExistException, CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + catalog.alterTable(tablePath, newTable, ignoreIfNotExists); + } + + @Override + public List listPartitions(ObjectPath tablePath) + throws TableNotExistException, TableNotPartitionedException, CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + return catalog.listPartitions(tablePath); + } + + @Override + public List listPartitions( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws TableNotExistException, TableNotPartitionedException, PartitionSpecInvalidException, + CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + return catalog.listPartitions(tablePath, partitionSpec); + } + + @Override + public List listPartitionsByFilter( + ObjectPath tablePath, List filters) + throws TableNotExistException, TableNotPartitionedException, CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + return catalog.listPartitionsByFilter(tablePath, filters); + } + + @Override + public CatalogPartition getPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws PartitionNotExistException, CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + return catalog.getPartition(tablePath, partitionSpec); + } + + @Override + public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + return catalog.partitionExists(tablePath, partitionSpec); + } + + @Override + public void createPartition( + ObjectPath tablePath, + 
CatalogPartitionSpec partitionSpec, + CatalogPartition partition, + boolean ignoreIfExists) + throws TableNotExistException, TableNotPartitionedException, PartitionSpecInvalidException, + PartitionAlreadyExistsException, CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + catalog.createPartition(tablePath, partitionSpec, partition, ignoreIfExists); + } + + @Override + public void dropPartition( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) + throws PartitionNotExistException, CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + catalog.dropPartition(tablePath, partitionSpec, ignoreIfNotExists); + } + + @Override + public void alterPartition( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogPartition newPartition, + boolean ignoreIfNotExists) + throws PartitionNotExistException, CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + catalog.alterPartition(tablePath, partitionSpec, newPartition, ignoreIfNotExists); + } + + @Override + public Optional getFactory() { + return Optional.of(new UnifiedDynamicTableFactory(availableCatalogs)); + } + + @Override + public List listFunctions(String dbName) { + return Collections.emptyList(); + } + + @Override + public CatalogFunction getFunction(ObjectPath functionPath) throws FunctionNotExistException { + throw new FunctionNotExistException(getName(), functionPath); + } + + @Override + public boolean functionExists(ObjectPath functionPath) { + return false; + } + + @Override + public void createFunction( + ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) { + throw new UnsupportedOperationException("Unsupported operation: create function."); + } + + @Override + public void alterFunction( + ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) { + throw new UnsupportedOperationException("Unsupported operation: alter function."); + } + + 
@Override + public void dropFunction(ObjectPath functionPath, boolean ignoreIfNotExists) { + throw new UnsupportedOperationException("Unsupported operation: drop function."); + } + + @Override + public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) + throws TableNotExistException, CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + return catalog.getTableStatistics(tablePath); + } + + @Override + public CatalogColumnStatistics getTableColumnStatistics(ObjectPath tablePath) + throws TableNotExistException, CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + return catalog.getTableColumnStatistics(tablePath); + } + + @Override + public CatalogTableStatistics getPartitionStatistics( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws PartitionNotExistException, CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + return catalog.getPartitionStatistics(tablePath, partitionSpec); + } + + @Override + public CatalogColumnStatistics getPartitionColumnStatistics( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws PartitionNotExistException, CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + return catalog.getPartitionColumnStatistics(tablePath, partitionSpec); + } + + @Override + public void alterTableStatistics( + ObjectPath tablePath, CatalogTableStatistics tableStatistics, boolean ignoreIfNotExists) + throws TableNotExistException, CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + catalog.alterTableStatistics(tablePath, tableStatistics, ignoreIfNotExists); + } + + @Override + public void alterTableColumnStatistics( + ObjectPath tablePath, CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) + throws TableNotExistException, CatalogException, TablePartitionedException { + AbstractCatalog catalog = originalCatalog(tablePath); + catalog.alterTableColumnStatistics(tablePath, 
columnStatistics, ignoreIfNotExists); + } + + @Override + public void alterPartitionStatistics( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogTableStatistics partitionStatistics, + boolean ignoreIfNotExists) + throws PartitionNotExistException, CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + catalog.alterPartitionStatistics( + tablePath, partitionSpec, partitionStatistics, ignoreIfNotExists); + } + + @Override + public void alterPartitionColumnStatistics( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogColumnStatistics columnStatistics, + boolean ignoreIfNotExists) + throws PartitionNotExistException, CatalogException { + AbstractCatalog catalog = originalCatalog(tablePath); + catalog.alterPartitionColumnStatistics( + tablePath, partitionSpec, columnStatistics, ignoreIfNotExists); + } + + /** + * Get the original flink catalog for the given table, if the flink catalog is not exists in the + * cache, would create a new original flink catalog for this table format. 
+ * + * @param amoroTable amoroTable + * @return original Flink catalog + */ + private AbstractCatalog originalCatalog(AmoroTable amoroTable) { + TableFormat format = amoroTable.format(); + TableIdentifier tableIdentifier = amoroTable.id(); + return getOriginalCatalog(format) + .orElseGet(() -> createOriginalCatalog(tableIdentifier, format)); + } + + private AbstractCatalog originalCatalog(ObjectPath tablePath) { + AmoroTable amoroTable = loadAmoroTable(tablePath); + return originalCatalog(amoroTable); + } + + private Optional getOriginalCatalog(TableFormat format) { + return Optional.ofNullable(availableCatalogs.get(format)); + } + + private AmoroTable loadAmoroTable(ObjectPath tablePath) { + return unifiedCatalog.loadTable(tablePath.getDatabaseName(), tablePath.getObjectName()); + } + + private AbstractCatalog createOriginalCatalog( + TableIdentifier tableIdentifier, TableFormat tableFormat) { + CatalogFactory catalogFactory; + if (tableFormat.equals(TableFormat.MIXED_ICEBERG)) { + catalogFactory = new MixedIcebergCatalogFactory(); + } else if (tableFormat.equals(TableFormat.MIXED_HIVE)) { + catalogFactory = new MixedHiveCatalogFactory(); + } else if (tableFormat.equals(TableFormat.ICEBERG)) { + catalogFactory = new IcebergFlinkCatalogFactory(hadoopConf); + } else if (tableFormat.equals(TableFormat.PAIMON)) { + catalogFactory = + new PaimonFlinkCatalogFactory( + unifiedCatalog.properties(), unifiedCatalog.metastoreType()); + } else { + throw new UnsupportedOperationException( + String.format( + "Unsupported table format: [%s] in the unified catalog, table identifier is [%s], the supported table formats are [%s].", + tableFormat, tableIdentifier, FlinkUnifiedCatalogFactory.SUPPORTED_FORMATS)); + } + + AbstractCatalog originalCatalog; + try { + context.getOptions().put(CatalogFactoryOptions.FLINK_TABLE_FORMATS.key(), tableFormat.name()); + originalCatalog = (AbstractCatalog) catalogFactory.createCatalog(context); + } catch (CatalogException e) { + if 
(e.getMessage().contains("must implement createCatalog(Context)")) { + originalCatalog = + (AbstractCatalog) catalogFactory.createCatalog(context.getName(), context.getOptions()); + } else { + throw e; + } + } + originalCatalog.open(); + availableCatalogs.put(tableFormat, originalCatalog); + return originalCatalog; + } + + @Override + public String toString() { + return "FlinkUnifiedCatalog{" + + "name='" + + getName() + + '\'' + + ", defaultDatabase='" + + getDefaultDatabase() + + '\'' + + ", amsUri='" + + amsUri + + '\'' + + ", amoroCatalogName='" + + amoroCatalogName + + '\'' + + ", availableCatalogs size=" + + availableCatalogs.size() + + "}"; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/MixedCatalog.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/MixedCatalog.java new file mode 100644 index 0000000000..3026f608bd --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/MixedCatalog.java @@ -0,0 +1,792 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.catalog; + +import static org.apache.amoro.flink.FlinkSchemaUtil.generateExtraOptionsFrom; +import static org.apache.amoro.flink.FlinkSchemaUtil.getPhysicalSchema; +import static org.apache.amoro.flink.FlinkSchemaUtil.toSchema; +import static org.apache.flink.table.factories.FactoryUtil.CONNECTOR; +import static org.apache.flink.util.Preconditions.checkNotNull; + +import org.apache.amoro.NoSuchDatabaseException; +import org.apache.amoro.flink.InternalCatalogBuilder; +import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions; +import org.apache.amoro.flink.table.MixedDynamicTableFactory; +import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; +import org.apache.amoro.flink.util.MixedFormatUtils; +import org.apache.amoro.mixed.MixedFormatCatalog; +import org.apache.amoro.scan.CombinedScanTask; +import org.apache.amoro.scan.KeyedTableScanTask; +import org.apache.amoro.scan.MixedFileScanTask; +import org.apache.amoro.shade.guava32.com.google.common.base.Objects; +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; +import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; +import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; +import org.apache.amoro.shade.guava32.com.google.common.collect.Sets; +import org.apache.amoro.table.KeyedTable; +import org.apache.amoro.table.MixedTable; +import org.apache.amoro.table.PrimaryKeySpec; +import org.apache.amoro.table.TableBuilder; +import org.apache.amoro.table.TableIdentifier; +import org.apache.amoro.table.TableProperties; +import org.apache.amoro.table.UnkeyedTable; +import org.apache.amoro.utils.CompatiblePropertyUtil; +import org.apache.flink.table.api.TableColumn; +import org.apache.flink.table.api.TableColumn.ComputedColumn; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.catalog.AbstractCatalog; +import org.apache.flink.table.catalog.CatalogBaseTable; +import 
org.apache.flink.table.catalog.CatalogDatabase; +import org.apache.flink.table.catalog.CatalogFunction; +import org.apache.flink.table.catalog.CatalogPartition; +import org.apache.flink.table.catalog.CatalogPartitionSpec; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.CatalogTableImpl; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.exceptions.CatalogException; +import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; +import org.apache.flink.table.catalog.exceptions.FunctionNotExistException; +import org.apache.flink.table.catalog.exceptions.PartitionSpecInvalidException; +import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; +import org.apache.flink.table.catalog.stats.CatalogColumnStatistics; +import org.apache.flink.table.catalog.stats.CatalogTableStatistics; +import org.apache.flink.table.expressions.Expression; +import org.apache.flink.table.factories.Factory; +import org.apache.flink.table.factories.FactoryUtil; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.UpdateProperties; +import org.apache.iceberg.exceptions.AlreadyExistsException; +import org.apache.iceberg.exceptions.NoSuchTableException; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.FlinkFilters; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.util.FlinkAlterTableUtil; +import org.apache.iceberg.io.CloseableIterable; + +import java.io.IOException; +import 
java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** Catalogs for mixed table format(include mixed-iceberg and mixed-hive). */ +public class MixedCatalog extends AbstractCatalog { + public static final String DEFAULT_DB = "default"; + + /** + * To distinguish 'CREATE TABLE LIKE' by checking stack + * org.apache.flink.table.planner.operations.SqlCreateTableConverter#lookupLikeSourceTable + */ + public static final String SQL_LIKE_METHOD = "lookupLikeSourceTable"; + + public static final String LOCATION = "location"; + + public static final String CHERRY_PICK_SNAPSHOT_ID = "cherry-pick-snapshot-id"; + + public static final String CURRENT_SNAPSHOT_ID = "current-snapshot-id"; + + private final InternalCatalogBuilder catalogBuilder; + + private MixedFormatCatalog internalCatalog; + + public MixedCatalog(String name, String defaultDatabase, InternalCatalogBuilder catalogBuilder) { + super(name, defaultDatabase); + this.catalogBuilder = catalogBuilder; + } + + public MixedCatalog(MixedCatalog copy) { + this(copy.getName(), copy.getDefaultDatabase(), copy.catalogBuilder); + } + + @Override + public void open() throws CatalogException { + internalCatalog = catalogBuilder.build(); + } + + @Override + public void close() throws CatalogException {} + + @Override + public List listDatabases() throws CatalogException { + return internalCatalog.listDatabases(); + } + + @Override + public CatalogDatabase getDatabase(String databaseName) throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean databaseExists(String databaseName) throws CatalogException { + return listDatabases().stream().anyMatch(db -> db.equalsIgnoreCase(databaseName)); + } + + @Override + public void createDatabase(String name, CatalogDatabase database, boolean ignoreIfExists) + throws 
CatalogException, DatabaseAlreadyExistException { + try { + internalCatalog.createDatabase(name); + } catch (AlreadyExistsException e) { + if (!ignoreIfExists) { + throw new DatabaseAlreadyExistException(getName(), name, e); + } + } + } + + @Override + public void dropDatabase(String name, boolean ignoreIfNotExists, boolean cascade) + throws CatalogException, DatabaseNotExistException { + try { + internalCatalog.dropDatabase(name); + } catch (NoSuchDatabaseException e) { + if (!ignoreIfNotExists) { + throw new DatabaseNotExistException(getName(), name); + } + } + } + + @Override + public void alterDatabase(String name, CatalogDatabase newDatabase, boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public List listTables(String databaseName) throws CatalogException { + return internalCatalog.listTables(databaseName).stream() + .map(TableIdentifier::getTableName) + .collect(Collectors.toList()); + } + + @Override + public List listViews(String databaseName) throws CatalogException { + return Collections.emptyList(); + } + + @Override + public CatalogBaseTable getTable(ObjectPath tablePath) + throws TableNotExistException, CatalogException { + TableIdentifier tableIdentifier = getTableIdentifier(tablePath); + try { + MixedTable table = internalCatalog.loadTable(tableIdentifier); + Schema mixedTableSchema = table.schema(); + + Map mixedTableProperties = Maps.newHashMap(table.properties()); + fillTableProperties(mixedTableProperties); + fillTableMetaPropertiesIfLookupLike(mixedTableProperties, tableIdentifier); + + List partitionKeys = toPartitionKeys(table.spec(), table.schema()); + return CatalogTable.of( + toSchema(mixedTableSchema, MixedFormatUtils.getPrimaryKeys(table), mixedTableProperties) + .toSchema(), + null, + partitionKeys, + mixedTableProperties); + } catch (NoSuchTableException e) { + throw new TableNotExistException(this.getName(), tablePath); + } + } + + /** + * For now, 'CREATE TABLE 
LIKE' would be treated as the case which users want to add watermark in + * temporal join, as an alternative of lookup join, and use mixed-format table as build table, + * i.e. right table. So the properties those required in temporal join will be put automatically. + * + *

If you don't want the properties, 'EXCLUDING ALL' is what you need. More details @see LIKE + */ + private void fillTableMetaPropertiesIfLookupLike( + Map properties, TableIdentifier tableIdentifier) { + StackTraceElement[] stackTraceElements = Thread.currentThread().getStackTrace(); + boolean isLookupLike = false; + for (StackTraceElement stackTraceElement : stackTraceElements) { + if (Objects.equal(SQL_LIKE_METHOD, stackTraceElement.getMethodName())) { + isLookupLike = true; + break; + } + } + + if (!isLookupLike) { + return; + } + + properties.put(CONNECTOR.key(), MixedDynamicTableFactory.IDENTIFIER); + properties.put(MixedFormatValidator.MIXED_FORMAT_CATALOG.key(), tableIdentifier.getCatalog()); + properties.put(MixedFormatValidator.MIXED_FORMAT_TABLE.key(), tableIdentifier.getTableName()); + properties.put(MixedFormatValidator.MIXED_FORMAT_DATABASE.key(), tableIdentifier.getDatabase()); + properties.put(CatalogFactoryOptions.AMS_URI.key(), catalogBuilder.getAmsUri()); + } + + private static List toPartitionKeys(PartitionSpec spec, Schema icebergSchema) { + List partitionKeys = Lists.newArrayList(); + for (PartitionField field : spec.fields()) { + if (field.transform().isIdentity()) { + partitionKeys.add(icebergSchema.findColumnName(field.sourceId())); + } else { + // Not created by Flink SQL. + // For compatibility with iceberg tables, return empty. + // TODO modify this after Flink support partition transform. 
+ return Collections.emptyList(); + } + } + return partitionKeys; + } + + private void fillTableProperties(Map tableProperties) { + boolean enableStream = + CompatiblePropertyUtil.propertyAsBoolean( + tableProperties, + TableProperties.ENABLE_LOG_STORE, + TableProperties.ENABLE_LOG_STORE_DEFAULT); + if (enableStream) { + tableProperties.putIfAbsent( + FactoryUtil.FORMAT.key(), + tableProperties.getOrDefault( + TableProperties.LOG_STORE_DATA_FORMAT, + TableProperties.LOG_STORE_DATA_FORMAT_DEFAULT)); + } + } + + private TableIdentifier getTableIdentifier(ObjectPath tablePath) { + return TableIdentifier.of( + internalCatalog.name(), tablePath.getDatabaseName(), tablePath.getObjectName()); + } + + @Override + public boolean tableExists(ObjectPath tablePath) throws CatalogException { + return internalCatalog.tableExists(getTableIdentifier(tablePath)); + } + + @Override + public void dropTable(ObjectPath tablePath, boolean ignoreIfNotExists) throws CatalogException { + internalCatalog.dropTable(getTableIdentifier(tablePath), true); + } + + @Override + public void renameTable(ObjectPath tablePath, String newTableName, boolean ignoreIfNotExists) + throws CatalogException { + internalCatalog.renameTable(getTableIdentifier(tablePath), newTableName); + } + + @Override + public void createTable(ObjectPath tablePath, CatalogBaseTable table, boolean ignoreIfExists) + throws CatalogException, TableAlreadyExistException { + validateFlinkTable(table); + validateColumnOrder(table); + createAmoroTable(tablePath, table, ignoreIfExists); + } + + private void createAmoroTable( + ObjectPath tablePath, CatalogBaseTable table, boolean ignoreIfExists) + throws CatalogException, TableAlreadyExistException { + TableSchema tableSchema = table.getSchema(); + // get PhysicalColumn for TableSchema + TableSchema physicalSchema = getPhysicalSchema(tableSchema); + Schema icebergSchema = FlinkSchemaUtil.convert(physicalSchema); + TableBuilder tableBuilder = + 
internalCatalog.newTableBuilder(getTableIdentifier(tablePath), icebergSchema); + + tableSchema + .getPrimaryKey() + .ifPresent( + k -> { + PrimaryKeySpec.Builder builder = PrimaryKeySpec.builderFor(icebergSchema); + k.getColumns().forEach(builder::addColumn); + tableBuilder.withPrimaryKeySpec(builder.build()); + }); + + PartitionSpec spec = toPartitionSpec(((CatalogTable) table).getPartitionKeys(), icebergSchema); + tableBuilder.withPartitionSpec(spec); + + Map properties = table.getOptions(); + // update computed columns and watermark to properties + Map extraOptions = generateExtraOptionsFrom(tableSchema); + properties.putAll(extraOptions); + + tableBuilder.withProperties(properties); + + try { + tableBuilder.create(); + } catch (AlreadyExistsException e) { + if (!ignoreIfExists) { + throw new TableAlreadyExistException(getName(), tablePath, e); + } + } + } + + private static PartitionSpec toPartitionSpec(List partitionKeys, Schema icebergSchema) { + PartitionSpec.Builder builder = PartitionSpec.builderFor(icebergSchema); + partitionKeys.forEach(builder::identity); + return builder.build(); + } + + private static void validateFlinkTable(CatalogBaseTable table) { + Preconditions.checkArgument( + table instanceof CatalogTable, "The Table should be a CatalogTable."); + } + + @Override + public void alterTable(ObjectPath tablePath, CatalogBaseTable newTable, boolean ignoreIfNotExists) + throws CatalogException, TableNotExistException { + validateFlinkTable(newTable); + + TableIdentifier tableIdentifier = getTableIdentifier(tablePath); + MixedTable mixedTable; + try { + mixedTable = internalCatalog.loadTable(tableIdentifier); + } catch (NoSuchTableException e) { + if (!ignoreIfNotExists) { + throw new TableNotExistException(internalCatalog.name(), tablePath, e); + } else { + return; + } + } + + // Currently, Flink SQL only support altering table properties. 
+ validateTableSchemaAndPartition( + toCatalogTable(mixedTable, tableIdentifier), (CatalogTable) newTable); + + if (mixedTable.isUnkeyedTable()) { + alterUnKeyedTable(mixedTable.asUnkeyedTable(), newTable); + } else if (mixedTable.isKeyedTable()) { + alterKeyedTable(mixedTable.asKeyedTable(), newTable); + } else { + throw new UnsupportedOperationException("Unsupported alter table"); + } + } + + @Override + public List listPartitions(ObjectPath tablePath) + throws CatalogException, TableNotPartitionedException { + return listPartitionsByFilter(tablePath, Collections.emptyList()); + } + + @Override + public List listPartitions( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws CatalogException, TableNotPartitionedException, PartitionSpecInvalidException { + checkNotNull(tablePath, "Table path cannot be null"); + checkNotNull(partitionSpec, "CatalogPartitionSpec cannot be null"); + TableIdentifier tableIdentifier = getTableIdentifier(tablePath); + checkValidPartitionSpec( + partitionSpec, internalCatalog.loadTable(tableIdentifier).spec(), tablePath); + List catalogPartitionSpecs = listPartitions(tablePath); + return catalogPartitionSpecs.stream() + .filter(spec -> spec.equals(partitionSpec)) + .collect(Collectors.toList()); + } + + @Override + public List listPartitionsByFilter( + ObjectPath tablePath, List filters) + throws CatalogException, TableNotPartitionedException { + TableIdentifier tableIdentifier = getTableIdentifier(tablePath); + MixedTable mixedTable = internalCatalog.loadTable(tableIdentifier); + + org.apache.iceberg.expressions.Expression filter; + List expressions = + filters.stream() + .map(FlinkFilters::convert) + .filter(Optional::isPresent) + .map(Optional::get) + .collect(Collectors.toList()); + + filter = + expressions.isEmpty() + ? 
Expressions.alwaysTrue() + : expressions.stream().reduce(Expressions::and).orElse(Expressions.alwaysTrue()); + + if (mixedTable.spec().isUnpartitioned()) { + throw new TableNotPartitionedException(internalCatalog.name(), tablePath); + } + Set set = Sets.newHashSet(); + if (mixedTable.isKeyedTable()) { + KeyedTable table = mixedTable.asKeyedTable(); + try (CloseableIterable combinedScanTasks = + table.newScan().filter(filter).planTasks()) { + for (CombinedScanTask combinedScanTask : combinedScanTasks) { + combinedScanTask.tasks().stream() + .flatMap( + (Function>) + keyedTableScanTask -> + Stream.of( + keyedTableScanTask.dataTasks(), + keyedTableScanTask.mixedEquityDeletes()) + .flatMap(List::stream)) + .forEach( + mixedFileScanTask -> { + Map map = Maps.newHashMap(); + StructLike structLike = mixedFileScanTask.partition(); + PartitionSpec spec = table.spec(); + for (int i = 0; i < structLike.size(); i++) { + map.put( + spec.fields().get(i).name(), + String.valueOf(structLike.get(i, Object.class))); + } + set.add(new CatalogPartitionSpec(map)); + }); + } + } catch (IOException e) { + throw new CatalogException( + String.format("Failed to list partitions of table %s", tablePath), e); + } + } else { + UnkeyedTable table = mixedTable.asUnkeyedTable(); + try (CloseableIterable tasks = table.newScan().filter(filter).planFiles()) { + for (DataFile dataFile : CloseableIterable.transform(tasks, FileScanTask::file)) { + Map map = Maps.newHashMap(); + StructLike structLike = dataFile.partition(); + PartitionSpec spec = table.specs().get(dataFile.specId()); + for (int i = 0; i < structLike.size(); i++) { + map.put(spec.fields().get(i).name(), String.valueOf(structLike.get(i, Object.class))); + } + set.add(new CatalogPartitionSpec(map)); + } + } catch (IOException e) { + throw new CatalogException( + String.format("Failed to list partitions of table %s", tablePath), e); + } + } + return Lists.newArrayList(set); + } + + @Override + public CatalogPartition getPartition(ObjectPath 
tablePath, CatalogPartitionSpec partitionSpec) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void createPartition( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogPartition partition, + boolean ignoreIfExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void dropPartition( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void alterPartition( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogPartition newPartition, + boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public List listFunctions(String dbName) throws CatalogException { + return Collections.emptyList(); + } + + @Override + public CatalogFunction getFunction(ObjectPath functionPath) + throws FunctionNotExistException, CatalogException { + throw new FunctionNotExistException(getName(), functionPath); + } + + @Override + public boolean functionExists(ObjectPath functionPath) throws CatalogException { + return false; + } + + @Override + public void createFunction( + ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void alterFunction( + ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void dropFunction(ObjectPath functionPath, boolean ignoreIfNotExists) + throws CatalogException { + throw new 
UnsupportedOperationException(); + } + + @Override + public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) throws CatalogException { + return CatalogTableStatistics.UNKNOWN; + } + + @Override + public CatalogColumnStatistics getTableColumnStatistics(ObjectPath tablePath) + throws CatalogException { + return CatalogColumnStatistics.UNKNOWN; + } + + @Override + public CatalogTableStatistics getPartitionStatistics( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { + return CatalogTableStatistics.UNKNOWN; + } + + @Override + public CatalogColumnStatistics getPartitionColumnStatistics( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { + return CatalogColumnStatistics.UNKNOWN; + } + + @Override + public void alterTableStatistics( + ObjectPath tablePath, CatalogTableStatistics tableStatistics, boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void alterTableColumnStatistics( + ObjectPath tablePath, CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void alterPartitionStatistics( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogTableStatistics partitionStatistics, + boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public void alterPartitionColumnStatistics( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogColumnStatistics columnStatistics, + boolean ignoreIfNotExists) + throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public Optional getFactory() { + return Optional.of(new MixedDynamicTableFactory(this)); + } + + public InternalCatalogBuilder catalogBuilder() { + return catalogBuilder; + } + + public String amsCatalogName() { + return 
internalCatalog.name(); + } + + /** + * Check whether a list of partition values are valid based on the given list of partition keys. + * + * @param partitionSpec a partition spec. + * @param mixedTablePartitionSpec mixedTablePartitionSpec + * @param tablePath tablePath + * @throws PartitionSpecInvalidException thrown if any key in partitionSpec doesn't exist in + * partitionKeys. + */ + private void checkValidPartitionSpec( + CatalogPartitionSpec partitionSpec, + PartitionSpec mixedTablePartitionSpec, + ObjectPath tablePath) + throws PartitionSpecInvalidException { + List partitionKeys = + mixedTablePartitionSpec.fields().stream() + .map(PartitionField::name) + .collect(Collectors.toList()); + for (String key : partitionSpec.getPartitionSpec().keySet()) { + if (!partitionKeys.contains(key)) { + throw new PartitionSpecInvalidException(getName(), partitionKeys, tablePath, partitionSpec); + } + } + } + + private void validateColumnOrder(CatalogBaseTable table) { + TableSchema schema = table.getSchema(); + List tableColumns = schema.getTableColumns(); + + boolean foundComputeColumn = false; + for (TableColumn tableColumn : tableColumns) { + if (tableColumn instanceof ComputedColumn) { + foundComputeColumn = true; + } else if (foundComputeColumn) { + throw new IllegalStateException( + "compute column must be listed after all physical columns. 
"); + } + } + } + + /** + * copy from + * https://github.com/apache/iceberg/blob/main/flink/v1.16/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java#L425C23-L425C54 + * + * @param ct1 CatalogTable before + * @param ct2 CatalogTable after + */ + private static void validateTableSchemaAndPartition(CatalogTable ct1, CatalogTable ct2) { + TableSchema ts1 = ct1.getSchema(); + TableSchema ts2 = ct2.getSchema(); + boolean equalsPrimary = false; + + if (ts1.getPrimaryKey().isPresent() && ts2.getPrimaryKey().isPresent()) { + equalsPrimary = + Objects.equal(ts1.getPrimaryKey().get().getType(), ts2.getPrimaryKey().get().getType()) + && Objects.equal( + ts1.getPrimaryKey().get().getColumns(), ts2.getPrimaryKey().get().getColumns()); + } else if (!ts1.getPrimaryKey().isPresent() && !ts2.getPrimaryKey().isPresent()) { + equalsPrimary = true; + } + + if (!(Objects.equal(ts1.getTableColumns(), ts2.getTableColumns()) + && Objects.equal(ts1.getWatermarkSpecs(), ts2.getWatermarkSpecs()) + && equalsPrimary)) { + throw new UnsupportedOperationException("Altering schema is not supported yet."); + } + + if (!ct1.getPartitionKeys().equals(ct2.getPartitionKeys())) { + throw new UnsupportedOperationException("Altering partition keys is not supported yet."); + } + } + + private void alterUnKeyedTable(UnkeyedTable table, CatalogBaseTable newTable) { + Map oldProperties = table.properties(); + Map setProperties = Maps.newHashMap(); + + String setLocation = null; + String setSnapshotId = null; + String pickSnapshotId = null; + + for (Map.Entry entry : newTable.getOptions().entrySet()) { + String key = entry.getKey(); + String value = entry.getValue(); + + if (Objects.equal(value, oldProperties.get(key))) { + continue; + } + + if (LOCATION.equalsIgnoreCase(key)) { + setLocation = value; + } else if (CURRENT_SNAPSHOT_ID.equalsIgnoreCase(key)) { + setSnapshotId = value; + } else if (CHERRY_PICK_SNAPSHOT_ID.equalsIgnoreCase(key)) { + pickSnapshotId = value; + } else { + 
setProperties.put(key, value); + } + } + + oldProperties + .keySet() + .forEach( + k -> { + if (!newTable.getOptions().containsKey(k)) { + setProperties.put(k, null); + } + }); + + FlinkAlterTableUtil.commitChanges( + table, setLocation, setSnapshotId, pickSnapshotId, setProperties); + } + + private CatalogTable toCatalogTable(MixedTable table, TableIdentifier tableIdentifier) { + Schema mixedTableSchema = table.schema(); + + Map mixedTableProperties = Maps.newHashMap(table.properties()); + fillTableProperties(mixedTableProperties); + fillTableMetaPropertiesIfLookupLike(mixedTableProperties, tableIdentifier); + + List partitionKeys = toPartitionKeys(table.spec(), table.schema()); + return new CatalogTableImpl( + toSchema(mixedTableSchema, MixedFormatUtils.getPrimaryKeys(table), mixedTableProperties), + partitionKeys, + mixedTableProperties, + null); + } + + private void alterKeyedTable(KeyedTable table, CatalogBaseTable newTable) { + Map oldProperties = table.properties(); + Map setProperties = Maps.newHashMap(); + for (Map.Entry entry : newTable.getOptions().entrySet()) { + String key = entry.getKey(); + String value = entry.getValue(); + if (!Objects.equal(value, oldProperties.get(key))) { + setProperties.put(key, value); + } + } + oldProperties + .keySet() + .forEach( + k -> { + if (!newTable.getOptions().containsKey(k)) { + setProperties.put(k, null); + } + }); + commitKeyedChanges(table, setProperties); + } + + private void commitKeyedChanges(KeyedTable table, Map setProperties) { + if (!setProperties.isEmpty()) { + updateTransactionKey(table.updateProperties(), setProperties); + } + } + + private void updateTransactionKey( + UpdateProperties updateProperties, Map setProperties) { + setProperties.forEach( + (k, v) -> { + if (v == null) { + updateProperties.remove(k); + } else { + updateProperties.set(k, v); + } + }); + updateProperties.commit(); + } +} diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/CatalogFactoryOptions.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/CatalogFactoryOptions.java new file mode 100644 index 0000000000..95e5888e79 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/CatalogFactoryOptions.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.catalog.factories; + +import static org.apache.amoro.properties.CatalogMetaProperties.TABLE_FORMATS; + +import org.apache.amoro.flink.catalog.FlinkUnifiedCatalog; +import org.apache.amoro.flink.catalog.MixedCatalog; +import org.apache.amoro.properties.CatalogMetaProperties; +import org.apache.flink.annotation.Internal; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; + +/** {@link ConfigOption}s for {@link MixedCatalog} and {@link FlinkUnifiedCatalog}. 
*/ +@Internal +public class CatalogFactoryOptions { + public static final String MIXED_ICEBERG_IDENTIFIER = "mixed_iceberg"; + public static final String MIXED_HIVE_IDENTIFIER = "mixed_hive"; + public static final String UNIFIED_IDENTIFIER = "unified"; + + public static final ConfigOption AMS_URI = + ConfigOptions.key(CatalogMetaProperties.AMS_URI).stringType().noDefaultValue(); + + public static final ConfigOption FLINK_TABLE_FORMATS = + ConfigOptions.key(TABLE_FORMATS) + .stringType() + .noDefaultValue() + .withDescription("This illustrates the table format contained in the catalog."); +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/FlinkUnifiedCatalogFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/FlinkUnifiedCatalogFactory.java new file mode 100644 index 0000000000..063666b17d --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/FlinkUnifiedCatalogFactory.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.catalog.factories; + +import static org.apache.amoro.Constants.THRIFT_TABLE_SERVICE_NAME; +import static org.apache.amoro.flink.table.OptionsUtil.getCatalogProperties; +import static org.apache.amoro.properties.CatalogMetaProperties.TABLE_FORMATS; + +import org.apache.amoro.CommonUnifiedCatalog; +import org.apache.amoro.TableFormat; +import org.apache.amoro.UnifiedCatalog; +import org.apache.amoro.UnifiedCatalogLoader; +import org.apache.amoro.client.AmsThriftUrl; +import org.apache.amoro.flink.InternalCatalogBuilder; +import org.apache.amoro.flink.catalog.FlinkUnifiedCatalog; +import org.apache.amoro.flink.catalog.MixedCatalog; +import org.apache.amoro.shade.guava32.com.google.common.collect.Sets; +import org.apache.amoro.table.TableMetaStore; +import org.apache.amoro.utils.CatalogUtil; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.table.catalog.Catalog; +import org.apache.flink.table.catalog.CommonCatalogOptions; +import org.apache.flink.table.factories.CatalogFactory; +import org.apache.flink.util.Preconditions; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.flink.FlinkCatalogFactory; + +import java.util.Collections; +import java.util.Map; +import java.util.Set; + +/** Factory for {@link FlinkUnifiedCatalog}. 
*/ +public class FlinkUnifiedCatalogFactory implements CatalogFactory { + + public static final Set SUPPORTED_FORMATS = + Sets.newHashSet( + TableFormat.MIXED_ICEBERG, + TableFormat.MIXED_HIVE, + TableFormat.ICEBERG, + TableFormat.PAIMON); + + @Override + public String factoryIdentifier() { + return CatalogFactoryOptions.UNIFIED_IDENTIFIER; + } + + @Override + public Set> requiredOptions() { + return Collections.emptySet(); + } + + @Override + public Set> optionalOptions() { + return Collections.emptySet(); + } + + @Override + public Catalog createCatalog(Context context) { + + final String defaultDatabase = + context + .getOptions() + .getOrDefault(CommonCatalogOptions.DEFAULT_DATABASE_KEY, MixedCatalog.DEFAULT_DB); + final String metastoreUri = context.getOptions().get(CatalogFactoryOptions.AMS_URI.key()); + final Map catalogProperties = getCatalogProperties(context.getOptions()); + + UnifiedCatalog unifiedCatalog; + if (metastoreUri != null) { + String amoroCatalogName = + AmsThriftUrl.parse(metastoreUri, THRIFT_TABLE_SERVICE_NAME).catalogName(); + unifiedCatalog = + UnifiedCatalogLoader.loadUnifiedCatalog( + metastoreUri, amoroCatalogName, catalogProperties); + } else { + String metastoreType = catalogProperties.get(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE); + Preconditions.checkArgument(metastoreType != null, "Catalog type cannot be empty"); + TableMetaStore tableMetaStore = + TableMetaStore.builder() + .withConfiguration( + InternalCatalogBuilder.clusterHadoopConf(metastoreType, catalogProperties)) + .build(); + unifiedCatalog = + new CommonUnifiedCatalog( + context.getName(), metastoreType, catalogProperties, tableMetaStore); + } + Configuration hadoopConf = unifiedCatalog.authenticationContext().getConfiguration(); + Set tableFormats = + CatalogUtil.tableFormats(unifiedCatalog.metastoreType(), unifiedCatalog.properties()); + validate(tableFormats); + + return new FlinkUnifiedCatalog( + metastoreUri, defaultDatabase, unifiedCatalog, context, hadoopConf); + } 
+ + private void validate(Set expectedFormats) { + if (expectedFormats.isEmpty()) { + throw new IllegalArgumentException( + String.format( + "The table formats must be specified in the catalog properties: [%s]", + TABLE_FORMATS)); + } + if (!SUPPORTED_FORMATS.containsAll(expectedFormats)) { + throw new IllegalArgumentException( + String.format( + "The table formats [%s] are not supported in the unified catalog, the supported table formats are [%s].", + expectedFormats, SUPPORTED_FORMATS)); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/iceberg/IcebergFlinkCatalogFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/iceberg/IcebergFlinkCatalogFactory.java new file mode 100644 index 0000000000..44530bf1d1 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/iceberg/IcebergFlinkCatalogFactory.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.catalog.factories.iceberg; + +import org.apache.flink.table.catalog.Catalog; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.flink.FlinkCatalogFactory; + +import java.util.Map; + +/** Creating Iceberg Catalog by the hadoop configuration which stored in the AMS. */ +public class IcebergFlinkCatalogFactory extends FlinkCatalogFactory { + private final Configuration hadoopConf; + + public IcebergFlinkCatalogFactory(Configuration hadoopConf) { + this.hadoopConf = hadoopConf; + } + + @Override + public Catalog createCatalog(String name, Map properties) { + return super.createCatalog(name, properties, hadoopConf); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/mixed/MixedHiveCatalogFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/mixed/MixedHiveCatalogFactory.java new file mode 100644 index 0000000000..d55eff1a21 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/mixed/MixedHiveCatalogFactory.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.catalog.factories.mixed; + +import org.apache.amoro.flink.catalog.MixedCatalog; +import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions; + +/** + * The factory to create {@link MixedCatalog} with {@link + * CatalogFactoryOptions#MIXED_HIVE_IDENTIFIER} identifier. + */ +public class MixedHiveCatalogFactory extends MixedIcebergCatalogFactory { + + @Override + public String factoryIdentifier() { + return CatalogFactoryOptions.MIXED_HIVE_IDENTIFIER; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/mixed/MixedIcebergCatalogFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/mixed/MixedIcebergCatalogFactory.java new file mode 100644 index 0000000000..b394e1eaa4 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/mixed/MixedIcebergCatalogFactory.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.catalog.factories.mixed; + +import static org.apache.amoro.flink.table.OptionsUtil.getCatalogProperties; + +import org.apache.amoro.flink.InternalCatalogBuilder; +import org.apache.amoro.flink.catalog.MixedCatalog; +import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.table.catalog.Catalog; +import org.apache.flink.table.catalog.CommonCatalogOptions; +import org.apache.flink.table.factories.CatalogFactory; + +import java.util.Collections; +import java.util.Map; +import java.util.Set; + +/** + * The factory to create {@link MixedCatalog} with {@link + * CatalogFactoryOptions#MIXED_ICEBERG_IDENTIFIER} identifier. + */ +public class MixedIcebergCatalogFactory implements CatalogFactory { + + @Override + public String factoryIdentifier() { + return CatalogFactoryOptions.MIXED_ICEBERG_IDENTIFIER; + } + + @Override + public Catalog createCatalog(Context context) { + + final String defaultDatabase = + context + .getOptions() + .getOrDefault(CommonCatalogOptions.DEFAULT_DATABASE_KEY, MixedCatalog.DEFAULT_DB); + final String amsUri = context.getOptions().get(CatalogFactoryOptions.AMS_URI.key()); + final Map catalogProperties = getCatalogProperties(context.getOptions()); + + final InternalCatalogBuilder catalogBuilder = + InternalCatalogBuilder.builder() + .amsUri(amsUri) + .catalogName(context.getName()) + .properties(catalogProperties); + + return new MixedCatalog(context.getName(), defaultDatabase, catalogBuilder); + } + + @Override + public Set> requiredOptions() { + return Collections.emptySet(); + } + + @Override + public Set> optionalOptions() { + return Collections.emptySet(); + } +} diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/paimon/PaimonFlinkCatalogFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/paimon/PaimonFlinkCatalogFactory.java new file mode 100644 index 0000000000..fd0b6ae937 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/paimon/PaimonFlinkCatalogFactory.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.catalog.factories.paimon; + +import org.apache.amoro.properties.CatalogMetaProperties; +import org.apache.paimon.catalog.FileSystemCatalogFactory; +import org.apache.paimon.flink.FlinkCatalog; +import org.apache.paimon.flink.FlinkCatalogFactory; +import org.apache.paimon.options.CatalogOptions; + +import java.util.Map; + +/** Creating Paimon FlinkCatalogFactory with properties which stored in the AMS */ +public class PaimonFlinkCatalogFactory extends FlinkCatalogFactory { + private final Map options; + private final String metastoreType; + + public PaimonFlinkCatalogFactory(Map options, String metastoreType) { + this.options = options; + this.metastoreType = metastoreType; + } + + @Override + public FlinkCatalog createCatalog(Context context) { + context.getOptions().putAll(options); + addMetastoreType(context); + return super.createCatalog(context); + } + + private void addMetastoreType(Context context) { + String type; + if (CatalogMetaProperties.CATALOG_TYPE_HADOOP.equalsIgnoreCase(metastoreType)) { + type = FileSystemCatalogFactory.IDENTIFIER; + } else { + type = metastoreType; + } + context.getOptions().put(CatalogOptions.METASTORE.key(), type); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/FlinkTablePropertiesInvocationHandler.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/FlinkTablePropertiesInvocationHandler.java new file mode 100644 index 0000000000..e0d1ec1b7d --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/FlinkTablePropertiesInvocationHandler.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.interceptor; + +import org.apache.amoro.flink.util.ReflectionUtil; +import org.apache.amoro.table.MixedTable; + +import java.io.Serializable; +import java.lang.reflect.InvocationHandler; +import java.lang.reflect.Method; +import java.lang.reflect.Proxy; +import java.util.HashMap; +import java.util.Map; + +/** Integrate flinkTable properties */ +public class FlinkTablePropertiesInvocationHandler implements InvocationHandler, Serializable { + + private final MixedTable mixedTable; + private final Map flinkTableProperties = new HashMap<>(); + protected Map tablePropertiesCombined = new HashMap<>(); + + public FlinkTablePropertiesInvocationHandler( + Map flinkTableProperties, MixedTable mixedTable) { + this.tablePropertiesCombined.putAll(mixedTable.properties()); + this.mixedTable = mixedTable; + if (flinkTableProperties == null) { + return; + } + this.flinkTableProperties.putAll(flinkTableProperties); + this.tablePropertiesCombined.putAll(flinkTableProperties); + } + + public Object getProxy() { + return Proxy.newProxyInstance( + mixedTable.getClass().getClassLoader(), + ReflectionUtil.getAllInterface(mixedTable.getClass()), + this); + } + + @Override + public Object invoke(Object proxy, Method method, Object[] args) throws Throwable { + if 
("properties".equals(method.getName())) { + return tablePropertiesCombined; + } else if ("asKeyedTable".equals(method.getName())) { + return proxy; + } + Object result = method.invoke(mixedTable, args); + // rewrite the properties as of the mixed-format table properties may be updated. + if ("refresh".equals(method.getName())) { + rewriteProperties(); + } + return result; + } + + private void rewriteProperties() { + Map refreshedProperties = mixedTable.properties(); + // iterate through the properties of the mixed-format table and update the properties of the + // tablePropertiesCombined. + for (Map.Entry entry : refreshedProperties.entrySet()) { + if (flinkTableProperties.containsKey(entry.getKey())) { + // Don't update the properties of the tablePropertiesCombined + continue; + } + if (!tablePropertiesCombined.containsKey(entry.getKey()) + || !tablePropertiesCombined.get(entry.getKey()).equals(entry.getValue())) { + tablePropertiesCombined.put(entry.getKey(), entry.getValue()); + } + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/KerberosInterceptor.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/KerberosInterceptor.java new file mode 100644 index 0000000000..14de1dabd6 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/KerberosInterceptor.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.interceptor; + +import net.sf.cglib.proxy.MethodInterceptor; +import net.sf.cglib.proxy.MethodProxy; +import org.apache.amoro.io.AuthenticatedFileIO; + +import java.io.Serializable; +import java.lang.reflect.Method; + +/** Using cglib proxy to avoid proxy object having different class */ +public class KerberosInterceptor implements MethodInterceptor, Serializable { + + private static final long serialVersionUID = 1L; + private final AuthenticatedFileIO authenticatedFileIO; + + public KerberosInterceptor(AuthenticatedFileIO authenticatedFileIO) { + this.authenticatedFileIO = authenticatedFileIO; + } + + @Override + public Object intercept(Object o, Method method, Object[] args, MethodProxy proxy) + throws Throwable { + Object res; + try { + res = + authenticatedFileIO.doAs( + () -> { + try { + return proxy.invokeSuper(o, args); + } catch (Throwable e) { + throw new RuntimeException(e); + } + }); + } catch (RuntimeException e) { + throw e.getCause(); + } + return res; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/KerberosInvocationHandler.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/KerberosInvocationHandler.java new file mode 100644 index 0000000000..25dce7fb0d --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/KerberosInvocationHandler.java @@ -0,0 +1,70 @@ +/* + * 
Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.interceptor; + +import org.apache.amoro.flink.util.ReflectionUtil; +import org.apache.amoro.io.AuthenticatedFileIO; + +import java.io.Serializable; +import java.lang.reflect.InvocationHandler; +import java.lang.reflect.Method; +import java.lang.reflect.Proxy; + +/** + * Proxy for iceberg-flink class. To support kerberos. Using jdk proxy can surrogate an instance + * which already exists. 
+ * + * @param proxy class type + */ +public class KerberosInvocationHandler implements InvocationHandler, Serializable { + + private static final long serialVersionUID = 1L; + private final AuthenticatedFileIO authenticatedFileIO; + private T obj; + + public KerberosInvocationHandler(AuthenticatedFileIO authenticatedFileIO) { + this.authenticatedFileIO = authenticatedFileIO; + } + + public Object getProxy(T obj) { + this.obj = obj; + return Proxy.newProxyInstance( + obj.getClass().getClassLoader(), ReflectionUtil.getAllInterface(obj.getClass()), this); + } + + @Override + public Object invoke(Object proxy, Method method, Object[] args) throws Throwable { + Object res; + try { + res = + authenticatedFileIO.doAs( + () -> { + try { + method.setAccessible(true); + return method.invoke(obj, args); + } catch (Throwable e) { + throw new RuntimeException(e); + } + }); + } catch (RuntimeException e) { + throw e.getCause(); + } + return res; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/ProxyFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/ProxyFactory.java new file mode 100644 index 0000000000..0e0341bac6 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/ProxyFactory.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.interceptor; + +import org.apache.amoro.flink.util.ProxyUtil; + +import java.io.Serializable; + +/** + * Create proxy in runtime to avoid 'ClassNotFoundException: $$EnhancerByCglib' + * + * @param + */ +public class ProxyFactory implements Serializable { + private static final long serialVersionUID = 1L; + private final Class clazz; + private final KerberosInterceptor interceptor; + private final Class[] argumentTypes; + private final Object[] arguments; + + public ProxyFactory( + Class clazz, KerberosInterceptor interceptor, Class[] argumentTypes, Object[] arguments) { + this.clazz = clazz; + this.interceptor = interceptor; + this.argumentTypes = argumentTypes; + this.arguments = arguments; + } + + public T getInstance() { + return ProxyUtil.getProxy(clazz, interceptor, argumentTypes, arguments); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/BasicLookupFunction.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/BasicLookupFunction.java new file mode 100644 index 0000000000..114245de93 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/BasicLookupFunction.java @@ -0,0 +1,263 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.lookup; + +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.LOOKUP_RELOADING_INTERVAL; +import static org.apache.amoro.flink.util.MixedFormatUtils.loadMixedTable; +import static org.apache.flink.util.Preconditions.checkArgument; + +import org.apache.amoro.flink.read.MixedIncrementalLoader; +import org.apache.amoro.flink.read.hybrid.enumerator.MergeOnReadIncrementalPlanner; +import org.apache.amoro.flink.read.hybrid.reader.DataIteratorReaderFunction; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.hive.io.reader.AbstractAdaptHiveKeyedDataReader; +import org.apache.amoro.table.MixedTable; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.functions.FunctionContext; +import org.apache.flink.util.FlinkRuntimeException; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.concurrent.ExecutorThreadFactory; +import org.apache.iceberg.Schema; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.io.CloseableIterator; +import org.slf4j.Logger; 
+import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.lang.reflect.Field; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Predicate; + +/** This is a basic lookup function for an mixed-format table. */ +public class BasicLookupFunction implements Serializable { + private static final Logger LOG = LoggerFactory.getLogger(BasicLookupFunction.class); + private static final long serialVersionUID = 1671720424494168710L; + private MixedTable mixedTable; + private KVTable kvTable; + private final List joinKeys; + private final Schema projectSchema; + private final List filters; + private final MixedFormatTableLoader loader; + private long nextLoadTime = Long.MIN_VALUE; + private final long reloadIntervalSeconds; + private MixedIncrementalLoader incrementalLoader; + private final Configuration config; + private transient AtomicLong lookupLoadingTimeMs; + private final Predicate predicate; + private final TableFactory kvTableFactory; + private final AbstractAdaptHiveKeyedDataReader flinkMORDataReader; + private final DataIteratorReaderFunction readerFunction; + + private transient ScheduledExecutorService executor; + private final AtomicReference failureThrowable = new AtomicReference<>(); + + public BasicLookupFunction( + TableFactory tableFactory, + MixedTable mixedTable, + List joinKeys, + Schema projectSchema, + List filters, + MixedFormatTableLoader tableLoader, + Configuration config, + Predicate predicate, + AbstractAdaptHiveKeyedDataReader adaptHiveKeyedDataReader, + DataIteratorReaderFunction readerFunction) { + checkArgument( + mixedTable.isKeyedTable(), + String.format( + "Only keyed 
mixed-format table support lookup join, this table [%s] is an unkeyed table.", + mixedTable.name())); + Preconditions.checkNotNull(tableFactory, "kvTableFactory cannot be null"); + this.kvTableFactory = tableFactory; + this.joinKeys = joinKeys; + this.projectSchema = projectSchema; + this.filters = filters; + this.loader = tableLoader; + this.config = config; + this.reloadIntervalSeconds = config.get(LOOKUP_RELOADING_INTERVAL).getSeconds(); + this.predicate = predicate; + this.flinkMORDataReader = adaptHiveKeyedDataReader; + this.readerFunction = readerFunction; + } + + /** + * Open the lookup function, e.g.: create {@link KVTable} kvTable, and load data. + * + * @throws IOException If serialize or deserialize failed + */ + public void open(FunctionContext context) throws IOException { + init(context); + start(); + } + + /** + * Initialize the mixed-format table, kvTable and incrementalLoader. + * + * @param context + */ + public void init(FunctionContext context) { + LOG.info("lookup function row data predicate: {}.", predicate); + MetricGroup metricGroup = context.getMetricGroup().addGroup(LookupMetrics.GROUP_NAME_LOOKUP); + if (mixedTable == null) { + mixedTable = loadMixedTable(loader).asKeyedTable(); + } + mixedTable.refresh(); + + lookupLoadingTimeMs = new AtomicLong(); + metricGroup.gauge(LookupMetrics.LOADING_TIME_MS, () -> lookupLoadingTimeMs.get()); + + LOG.info("projected schema {}.\n table schema {}.", projectSchema, mixedTable.schema()); + kvTable = + kvTableFactory.create( + new RowDataStateFactory(generateRocksDBPath(context, mixedTable.name()), metricGroup), + mixedTable.asKeyedTable().primaryKeySpec().fieldNames(), + joinKeys, + projectSchema, + config, + predicate); + kvTable.open(); + + this.incrementalLoader = + new MixedIncrementalLoader<>( + new MergeOnReadIncrementalPlanner(loader), flinkMORDataReader, readerFunction, filters); + } + + public void start() { + // Keep the first-time synchronized loading to avoid a mass of null-match records 
during + // initialization + checkAndLoad(); + + this.executor = + Executors.newScheduledThreadPool( + 1, new ExecutorThreadFactory("Mixed-format-lookup-scheduled-loader")); + this.executor.scheduleWithFixedDelay( + () -> { + try { + checkAndLoad(); + } catch (Exception e) { + // fail the lookup and skip the rest of the items + // if the failure handler decides to throw an exception + failureThrowable.compareAndSet(null, e); + } + }, + 0, + reloadIntervalSeconds, + TimeUnit.MILLISECONDS); + } + + public List lookup(RowData lookupKey) { + checkErrorAndRethrow(); + try { + return kvTable.get(lookupKey); + } catch (Exception e) { + throw new FlinkRuntimeException(e); + } + } + + /** + * Check whether it is time to periodically load data to kvTable. Support to use {@link + * Expression} filters to filter the data. + */ + private synchronized void checkAndLoad() { + if (nextLoadTime > System.currentTimeMillis()) { + return; + } + nextLoadTime = System.currentTimeMillis() + 1000 * reloadIntervalSeconds; + + long batchStart = System.currentTimeMillis(); + while (incrementalLoader.hasNext()) { + long start = System.currentTimeMillis(); + mixedTable + .io() + .doAs( + () -> { + try (CloseableIterator iterator = incrementalLoader.next()) { + if (kvTable.initialized()) { + kvTable.upsert(iterator); + } else { + LOG.info( + "This table {} is still under initialization progress.", mixedTable.name()); + kvTable.initialize(iterator); + } + } + return null; + }); + LOG.info("Split task fetched, cost {}ms.", System.currentTimeMillis() - start); + } + if (!kvTable.initialized()) { + kvTable.waitInitializationCompleted(); + } + lookupLoadingTimeMs.set(System.currentTimeMillis() - batchStart); + + LOG.info( + "{} table lookup loading, these batch tasks completed, cost {}ms.", + mixedTable.name(), + lookupLoadingTimeMs.get()); + } + + public KVTable getKVTable() { + return kvTable; + } + + public void close() throws Exception { + if (kvTable != null) { + kvTable.close(); + } + if 
(executor != null) { + executor.shutdownNow(); + } + } + + private void checkErrorAndRethrow() { + Throwable cause = failureThrowable.get(); + if (cause != null) { + throw new RuntimeException("An error occurred in MixedFormatLookupFunction.", cause); + } + } + + private String generateRocksDBPath(FunctionContext context, String tableName) { + String tmpPath = getTmpDirectoryFromTMContainer(context); + File db = new File(tmpPath, tableName + "-lookup-" + UUID.randomUUID()); + return db.toString(); + } + + private static String getTmpDirectoryFromTMContainer(FunctionContext context) { + try { + Field field = context.getClass().getDeclaredField("context"); + field.setAccessible(true); + StreamingRuntimeContext runtimeContext = (StreamingRuntimeContext) field.get(context); + String[] tmpDirectories = runtimeContext.getTaskManagerRuntimeInfo().getTmpDirectories(); + return tmpDirectories[ThreadLocalRandom.current().nextInt(tmpDirectories.length)]; + } catch (NoSuchFieldException | IllegalAccessException e) { + throw new RuntimeException(e); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/BinaryRowDataSerializerWrapper.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/BinaryRowDataSerializerWrapper.java new file mode 100644 index 0000000000..e6bae12c04 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/BinaryRowDataSerializerWrapper.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.lookup; + +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.binary.BinaryRowData; +import org.apache.flink.table.runtime.typeutils.BinaryRowDataSerializer; +import org.apache.flink.table.runtime.typeutils.RowDataSerializer; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; + +import java.io.IOException; +import java.io.Serializable; + +/** + * This is a wrapper for {@link BinaryRowDataSerializer}. It is used to serialize and deserialize + * RowData. And serialize and deserialize operations are not thread-safe. 
+ */ +public class BinaryRowDataSerializerWrapper implements Serializable, Cloneable { + + private static final long serialVersionUID = 1L; + protected BinaryRowDataSerializer serializer; + private RowDataSerializer rowDataSerializer; + private DataOutputSerializer outputView; + private DataInputDeserializer inputView; + private final Schema schema; + + public BinaryRowDataSerializerWrapper(Schema schema) { + this.serializer = new BinaryRowDataSerializer(schema.asStruct().fields().size()); + this.schema = schema; + } + + public byte[] serialize(RowData rowData) throws IOException { + if (rowDataSerializer == null) { + RowType rowType = FlinkSchemaUtil.convert(schema); + rowDataSerializer = new RowDataSerializer(rowType); + } + BinaryRowData binaryRowData = rowDataSerializer.toBinaryRow(rowData); + if (outputView == null) { + outputView = new DataOutputSerializer(32); + } + outputView.clear(); + serializer.serialize(binaryRowData, outputView); + return outputView.getCopyOfBuffer(); + } + + public RowData deserialize(byte[] recordBytes) throws IOException { + if (recordBytes == null) { + return null; + } + if (inputView == null) { + inputView = new DataInputDeserializer(); + } + inputView.setBuffer(recordBytes); + return serializer.deserialize(inputView); + } + + @Override + public BinaryRowDataSerializerWrapper clone() { + return new BinaryRowDataSerializerWrapper(schema); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/ByteArraySetSerializer.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/ByteArraySetSerializer.java new file mode 100644 index 0000000000..2d54d65944 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/ByteArraySetSerializer.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.lookup; + +import org.apache.commons.collections.CollectionUtils; + +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +/** Utility class for serializing and deserializing a set of ByteArrayWrapper objects. */ +public class ByteArraySetSerializer { + + /** + * Deserializes a byte array into a set of ByteArrayWrapper objects. + * + * @param byteArray the byte array to deserialize + * @return the deserialized set of ByteArrayWrapper objects + */ + public static Set deserialize(byte[] byteArray) { + if (byteArray == null) { + return Collections.emptySet(); + } + + Set set = new HashSet<>(); + + ByteBuffer buffer = ByteBuffer.wrap(byteArray); + int setSize = buffer.getInt(); // Read the size of the set + + for (int i = 0; i < setSize; i++) { + int elementSize = buffer.getInt(); // Read the size of the element + byte[] element = new byte[elementSize]; + buffer.get(element); // Read the element bytes + ByteArrayWrapper baw = new ByteArrayWrapper(element, elementSize); + set.add(baw); + } + + return set; + } + + /** + * Serializes a set of ByteArrayWrapper objects into a byte array. 
+ * + * @param set the set of ByteArrayWrapper objects to serialize + * @return the serialized byte array + */ + public static byte[] serialize(Set set) { + if (CollectionUtils.isEmpty(set)) { + return null; + } + + // Calculate the total size of the resulting byte array + // The first 4 bytes represent the size of the set + int totalSize = 4; + for (ByteArrayWrapper record : set) { + // Each element consists of 4 bytes representing the size of the element + totalSize += 4; + totalSize += record.size; + } + + // Create a new byte array with the total size + ByteBuffer buffer = ByteBuffer.allocate(totalSize); + buffer.putInt(set.size()); // Write the size of the set + + for (ByteArrayWrapper record : set) { + buffer.putInt(record.size); // Write the size of the element + buffer.put(record.bytes); // Write the element bytes + } + + return buffer.array(); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/ByteArrayWrapper.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/ByteArrayWrapper.java new file mode 100644 index 0000000000..264a853a41 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/ByteArrayWrapper.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.lookup; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.util.Locale; + +/** This byte array wrapper utility class. copied from com.ibm.icu.util.ByteArrayWrapper. */ +public class ByteArrayWrapper implements Comparable, Serializable { + private static final long serialVersionUID = -6697944376117365645L; + public byte[] bytes; + + /** + * Size of the internal byte array used. Different from bytes.length, size will be <= + * bytes.length. Semantics of size is similar to java.util.Vector.size(). + */ + public int size; + + /** + * Construct a new ByteArrayWrapper from a byte array and size. + * + * @param bytesToAdopt the byte array to adopt + * @param size the length of valid data in the byte array + * @throws IndexOutOfBoundsException if bytesToAdopt == null and size != 0, or size < 0, or + * size > bytesToAdopt.length. + */ + public ByteArrayWrapper(byte[] bytesToAdopt, int size) { + if ((bytesToAdopt == null && size != 0) + || size < 0 + || (bytesToAdopt != null && size > bytesToAdopt.length)) { + throw new IndexOutOfBoundsException("illegal size: " + size); + } + this.bytes = bytesToAdopt; + this.size = size; + } + + /** + * Construct a new ByteArrayWrapper from the contents of a ByteBuffer. + * + * @param source the ByteBuffer from which to get the data. + */ + public ByteArrayWrapper(ByteBuffer source) { + size = source.limit(); + bytes = new byte[size]; + source.get(bytes, 0, size); + } + + /** + * Ensure that the internal byte array is at least of length capacity. 
If the byte array is null + * or its length is less than capacity, a new byte array of length capacity will be allocated. The + * contents of the array (between 0 and size) remain unchanged. + * + * @param capacity minimum length of internal byte array. + * @return this ByteArrayWrapper + */ + public ByteArrayWrapper ensureCapacity(int capacity) { + if (bytes == null || bytes.length < capacity) { + byte[] newBytes = new byte[capacity]; + if (bytes != null) { + copyBytes(bytes, 0, newBytes, 0, size); + } + bytes = newBytes; + } + return this; + } + + /** + * Set the internal byte array from offset 0 to (limit - start) with the contents of src from + * offset start to limit. If the byte array is null or its length is less than capacity, a new + * byte array of length (limit - start) will be allocated. This resets the size of the internal + * byte array to (limit - start). + * + * @param src source byte array to copy from + * @param start start offset of src to copy from + * @param limit end + 1 offset of src to copy from + * @return this ByteArrayWrapper + */ + public final ByteArrayWrapper set(byte[] src, int start, int limit) { + size = 0; + append(src, start, limit); + return this; + } + + /** + * Appends the internal byte array from offset size with the contents of src from offset start to + * limit. This increases the size of the internal byte array to (size + limit - start). + * + * @param src source byte array to copy from + * @param start start offset of src to copy from + * @param limit end + 1 offset of src to copy from + * @return this ByteArrayWrapper + */ + public final ByteArrayWrapper append(byte[] src, int start, int limit) { + int len = limit - start; + ensureCapacity(size + len); + copyBytes(src, start, bytes, size, len); + size += len; + return this; + } + + /** + * Releases the internal byte array to the caller, resets the internal byte array to null and its + * size to 0. + * + * @return internal byte array. 
+ */ + public final byte[] releaseBytes() { + byte[] result = bytes; + bytes = null; + size = 0; + return result; + } + + /** Returns string value for debugging. */ + @Override + public String toString() { + StringBuilder result = new StringBuilder(); + for (int i = 0; i < size; ++i) { + if (i != 0) { + result.append(" "); + } + result.append(hex(bytes[i] & 0xFF)); + } + return result.toString(); + } + + private static String hex(long i) { + if (i == Long.MIN_VALUE) { + return "-8000000000000000"; + } else { + boolean negative = i < 0L; + if (negative) { + i = -i; + } + + String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH); + if (result.length() < 2) { + result = "0000000000000000".substring(result.length(), 2) + result; + } + + return negative ? '-' + result : result; + } + } + + /** + * Return true if the bytes in each wrapper are equal. + * + * @param other the object to compare to. + * @return true if the two objects are equal. + */ + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + if (other == null) { + return false; + } + if (!(other instanceof ByteArrayWrapper)) { + return false; + } + + ByteArrayWrapper that = (ByteArrayWrapper) other; + if (size != that.size) { + return false; + } + for (int i = 0; i < size; ++i) { + if (bytes[i] != that.bytes[i]) { + return false; + } + } + return true; + } + + /** + * Return the hashcode. + * + * @return the hashcode. + */ + @Override + public int hashCode() { + int result = size; + for (int i = 0; i < size; ++i) { + result = 37 * result + bytes[i]; + } + return result; + } + + /** + * Compare this object to another ByteArrayWrapper, which must not be null. + * + * @param other the object to compare to. + * @return a value <0, 0, or >0 as this compares less than, equal to, or greater than other. 
+ * @throws ClassCastException if the other object is not a ByteArrayWrapper + */ + @Override + public int compareTo(ByteArrayWrapper other) { + if (this == other) { + return 0; + } + int minSize = Math.min(size, other.size); + for (int i = 0; i < minSize; ++i) { + if (bytes[i] != other.bytes[i]) { + return (bytes[i] & 0xFF) - (other.bytes[i] & 0xFF); + } + } + return size - other.size; + } + + /** + * Copies the contents of src byte array from offset srcOff to the target of target byte array at + * the offset targetOff. + * + * @param src source byte array to copy from + * @param srcOff start offset of src to copy from + * @param target target byte array to copy to + * @param targetOff start offset of target to copy to + * @param length size of contents to copy + */ + private static void copyBytes(byte[] src, int srcOff, byte[] target, int targetOff, int length) { + if (length < 64) { + for (int i = srcOff, n = targetOff; --length >= 0; ++i, ++n) { + target[n] = src[i]; + } + } else { + System.arraycopy(src, srcOff, target, targetOff, length); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KVTable.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KVTable.java new file mode 100644 index 0000000000..596647eafa --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KVTable.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.lookup; + +import org.apache.flink.table.data.RowData; + +import java.io.Closeable; +import java.io.IOException; +import java.io.Serializable; +import java.util.Iterator; +import java.util.List; + +/** + * The KVTable interface is used for lookup join in mixed-format table on Flink. It includes methods + * for initializing and updating the lookup table, as well as getting results by key and cleaning up + * the cache. + */ +public interface KVTable extends Serializable, Closeable { + /** Initialize the lookup table */ + void open(); + + /** + * Get the result by the key. + * + * @throws IOException Serialize the rowData failed. + */ + List get(RowData key) throws IOException; + + /** + * Upsert the {@link KVTable} by the Change table dataStream. + * + * @throws IOException Serialize the rowData failed. + */ + void upsert(Iterator dataStream) throws IOException; + + /** + * Initial the {@link KVTable} by the MoR dataStream. + * + * @param dataStream the data stream for loading into the {@link KVTable}. + * @throws IOException Serialize the rowData failed. + */ + void initialize(Iterator dataStream) throws IOException; + + /** @return if the rowData is filtered, return true. */ + boolean filter(T value); + + /** @return if initialization is completed, return true. */ + boolean initialized(); + + /** + * Waiting for the initialization completed, and enable auto compaction at the end of the + * initialization. 
+ */ + void waitInitializationCompleted(); + + /** + * Try to clean up the cache manually, due to the lookup_cache.ttl-after-write configuration. + * + *

lookup_cache.ttl-after-writ Only works in SecondaryIndexTable. + */ + default void cleanUp() {} + + void close(); +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KVTableFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KVTableFactory.java new file mode 100644 index 0000000000..746e7eed37 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KVTableFactory.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.lookup; + +import static org.apache.amoro.flink.util.LookupUtil.convertLookupOptions; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Types; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +public class KVTableFactory implements TableFactory, Serializable { + private static final Logger LOG = LoggerFactory.getLogger(KVTableFactory.class); + private static final long serialVersionUID = 8090117643055858494L; + public static final KVTableFactory INSTANCE = new KVTableFactory(); + + public KVTable create( + RowDataStateFactory rowDataStateFactory, + List primaryKeys, + List joinKeys, + Schema projectSchema, + Configuration config, + Predicate rowDataPredicate) { + Set joinKeySet = new HashSet<>(joinKeys); + Set primaryKeySet = new HashSet<>(primaryKeys); + // keep the primary keys order with projected schema fields. 
+ primaryKeys = + projectSchema.asStruct().fields().stream() + .map(Types.NestedField::name) + .filter(primaryKeySet::contains) + .collect(Collectors.toList()); + + if (primaryKeySet.equals(joinKeySet)) { + LOG.info( + "create unique index table, unique keys are {}, lookup keys are {}.", + primaryKeys.toArray(), + joinKeys.toArray()); + return new UniqueIndexTable( + rowDataStateFactory, + primaryKeys, + projectSchema, + convertLookupOptions(config), + rowDataPredicate); + } else { + LOG.info( + "create secondary index table, unique keys are {}, lookup keys are {}.", + primaryKeys.toArray(), + joinKeys.toArray()); + return new SecondaryIndexTable( + rowDataStateFactory, + primaryKeys, + joinKeys, + projectSchema, + convertLookupOptions(config), + rowDataPredicate); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KeyRowData.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KeyRowData.java new file mode 100644 index 0000000000..121fbbd080 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KeyRowData.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.lookup; + +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RawValueData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.types.RowKind; + +public class KeyRowData implements RowData { + private final int[] keyIndexMapping; + private final RowData rowData; + + public KeyRowData(int[] keyIndexMapping, RowData rowData) { + this.keyIndexMapping = keyIndexMapping; + this.rowData = rowData; + } + + @Override + public int getArity() { + return keyIndexMapping.length; + } + + @Override + public RowKind getRowKind() { + return rowData.getRowKind(); + } + + @Override + public void setRowKind(RowKind kind) { + rowData.setRowKind(kind); + } + + @Override + public boolean isNullAt(int pos) { + return rowData.isNullAt(keyIndexMapping[pos]); + } + + @Override + public boolean getBoolean(int pos) { + return rowData.getBoolean(keyIndexMapping[pos]); + } + + @Override + public byte getByte(int pos) { + return rowData.getByte(keyIndexMapping[pos]); + } + + @Override + public short getShort(int pos) { + return rowData.getShort(keyIndexMapping[pos]); + } + + @Override + public int getInt(int pos) { + return rowData.getInt(keyIndexMapping[pos]); + } + + @Override + public long getLong(int pos) { + return rowData.getLong(keyIndexMapping[pos]); + } + + @Override + public float getFloat(int pos) { + return rowData.getFloat(keyIndexMapping[pos]); + } + + @Override + public double getDouble(int pos) { + return rowData.getDouble(keyIndexMapping[pos]); + } + + @Override + public StringData getString(int pos) { + return rowData.getString(keyIndexMapping[pos]); + } + + @Override + public DecimalData 
getDecimal(int pos, int precision, int scale) { + return rowData.getDecimal(keyIndexMapping[pos], precision, scale); + } + + @Override + public TimestampData getTimestamp(int pos, int precision) { + return rowData.getTimestamp(keyIndexMapping[pos], precision); + } + + @Override + public RawValueData getRawValue(int pos) { + return rowData.getRawValue(keyIndexMapping[pos]); + } + + @Override + public byte[] getBinary(int pos) { + return rowData.getBinary(keyIndexMapping[pos]); + } + + @Override + public ArrayData getArray(int pos) { + return rowData.getArray(keyIndexMapping[pos]); + } + + @Override + public MapData getMap(int pos) { + return rowData.getMap(keyIndexMapping[pos]); + } + + @Override + public RowData getRow(int pos, int numFields) { + return rowData.getRow(keyIndexMapping[pos], numFields); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupMetrics.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupMetrics.java new file mode 100644 index 0000000000..44f7b0eeb2 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupMetrics.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
/** Metric group and gauge names registered by the mixed-format lookup functions. */
public final class LookupMetrics {

  /** Metric group under which all lookup metrics are registered. */
  public static final String GROUP_NAME_LOOKUP = "mixed_format_lookup";

  /** Gauge name: wall-clock time in milliseconds spent loading the latest lookup batch. */
  public static final String LOADING_TIME_MS = "lookup_loading_cost_ms";

  /** Gauge name for the unique-index cache size. */
  public static final String UNIQUE_CACHE_SIZE = "lookup_unique_index_cache_size";

  /** Gauge name for the secondary-index cache size. */
  public static final String SECONDARY_CACHE_SIZE = "lookup_secondary_index_cache_size";

  private LookupMetrics() {
    // constants holder; not instantiable
  }
}
/**
 * Immutable configuration for the lookup-join state (Guava LRU cache and RocksDB block cache).
 *
 * <p>Instances are created through {@link Builder}; every setter validates its argument.
 */
public class LookupOptions implements Serializable {
  private static final long serialVersionUID = -1L;

  private final long lruMaximumSize;
  private final int writeRecordThreadNum;
  private final Duration ttlAfterWrite;
  private final long blockCacheCapacity;
  private final int blockCacheNumShardBits;

  private LookupOptions(Builder builder) {
    this.lruMaximumSize = builder.lruMaximumSize;
    this.writeRecordThreadNum = builder.writeRecordThreadNum;
    this.ttlAfterWrite = builder.ttlAfterWrite;
    this.blockCacheCapacity = builder.blockCacheCapacity;
    this.blockCacheNumShardBits = builder.blockCacheNumShardBits;
  }

  /** Maximum number of entries held by the LRU cache. */
  public long lruMaximumSize() {
    return lruMaximumSize;
  }

  /** Number of background threads writing records into RocksDB. */
  public int writeRecordThreadNum() {
    return writeRecordThreadNum;
  }

  /** TTL applied after a write; {@link Duration#ZERO} means no expiration. */
  public Duration ttlAfterWrite() {
    return ttlAfterWrite;
  }

  /** Returns true only when a strictly positive TTL has been configured. */
  public boolean isTTLAfterWriteValidated() {
    return ttlAfterWrite.compareTo(Duration.ZERO) > 0;
  }

  /** Capacity of the RocksDB block cache (bytes — TODO confirm unit against RocksDB config). */
  public long blockCacheCapacity() {
    return blockCacheCapacity;
  }

  /** Shard bits of the RocksDB block cache; -1 lets RocksDB choose. */
  public int numShardBits() {
    return blockCacheNumShardBits;
  }

  @Override
  public String toString() {
    return "LookupOptions{"
        + "lruMaximumSize="
        + lruMaximumSize
        + ", writeRecordThreadNum="
        + writeRecordThreadNum
        + ", ttlAfterWrite="
        + ttlAfterWrite
        + ", blockCacheCapacity="
        + blockCacheCapacity
        + ", blockCacheNumShardBits="
        + blockCacheNumShardBits
        + "}";
  }

  /** Builder for {@link LookupOptions}. */
  public static class Builder {
    private long lruMaximumSize;
    private int writeRecordThreadNum;
    // BUGFIX: default to ZERO (no expiration) so an unset TTL can no longer cause a
    // NullPointerException in isTTLAfterWriteValidated().
    private Duration ttlAfterWrite = Duration.ZERO;
    private long blockCacheCapacity;
    private int blockCacheNumShardBits;

    /** LRU cache max size; must not be negative. */
    public Builder lruMaximumSize(long lruMaximumSize) {
      if (lruMaximumSize < 0) {
        throw new IllegalArgumentException("lruMaximumSize must not be negative");
      }
      this.lruMaximumSize = lruMaximumSize;
      return this;
    }

    /** Write record thread num; must be greater than 0. */
    public Builder writeRecordThreadNum(int writeRecordThreadNum) {
      if (writeRecordThreadNum <= 0) {
        throw new IllegalArgumentException("writeRecordThreadNum must be greater than 0");
      }
      this.writeRecordThreadNum = writeRecordThreadNum;
      return this;
    }

    /** Clean expired records after write; must be non-null and non-negative. */
    public Builder ttlAfterWrite(Duration ttlAfterWrite) {
      // BUGFIX: reject null explicitly instead of failing later with an opaque NPE.
      if (ttlAfterWrite == null) {
        throw new NullPointerException("ttlAfterWrite must not be null");
      }
      if (ttlAfterWrite.isNegative()) {
        throw new IllegalArgumentException("ttlAfterWrite must not be negative");
      }
      this.ttlAfterWrite = ttlAfterWrite;
      return this;
    }

    /** Block cache capacity; must be greater than 0. */
    public Builder blockCacheCapacity(long blockCacheCapacity) {
      if (blockCacheCapacity <= 0) {
        throw new IllegalArgumentException("blockCacheCapacity must be greater than 0");
      }
      this.blockCacheCapacity = blockCacheCapacity;
      return this;
    }

    /** Block cache shard bits; must be >= -1 (-1 means automatic). */
    public Builder blockCacheNumShardBits(int blockCacheNumShardBits) {
      if (blockCacheNumShardBits < -1) {
        throw new IllegalArgumentException(
            "blockCacheNumShardBits must be greater than or equal to -1");
      }
      this.blockCacheNumShardBits = blockCacheNumShardBits;
      return this;
    }

    public LookupOptions build() {
      return new LookupOptions(this);
    }
  }
}
/**
 * A single immutable key/value operation destined for the RocksDB write queue.
 *
 * <p>Instances are created through {@link #of(OpType, byte[], byte[])}.
 */
public class LookupRecord {

  /** Whether the bytes should be written to or removed from the store. */
  private final OpType opType;

  private final byte[] keyBytes;
  private final byte[] valueBytes;

  private LookupRecord(OpType opType, byte[] keyBytes, byte[] valueBytes) {
    this.opType = opType;
    this.keyBytes = keyBytes;
    this.valueBytes = valueBytes;
  }

  /** Factory method; the byte arrays are held by reference, not copied. */
  public static LookupRecord of(OpType opType, byte[] keyBytes, byte[] valueBytes) {
    return new LookupRecord(opType, keyBytes, valueBytes);
  }

  public byte[] keyBytes() {
    return keyBytes;
  }

  public byte[] valueBytes() {
    return valueBytes;
  }

  public OpType opType() {
    return opType;
  }

  /** Operation kinds understood by the RocksDB writers. */
  enum OpType {
    PUT_BYTES,
    DELETE_BYTES
  }
}
100644 index 0000000000..ee50800585 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/MixedFormatRowDataLookupFunction.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.lookup; + +import org.apache.amoro.flink.read.hybrid.reader.DataIteratorReaderFunction; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.hive.io.reader.AbstractAdaptHiveKeyedDataReader; +import org.apache.amoro.table.MixedTable; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.functions.FunctionContext; +import org.apache.flink.table.functions.LookupFunction; +import org.apache.iceberg.Schema; +import org.apache.iceberg.expressions.Expression; + +import java.io.IOException; +import java.util.Collection; +import java.util.List; +import java.util.function.Predicate; + +/** A lookup function for {@link RowData} type. 
*/ +public class MixedFormatRowDataLookupFunction extends LookupFunction { + private static final long serialVersionUID = -7694050999266540499L; + private final BasicLookupFunction basicLookupFunction; + + public MixedFormatRowDataLookupFunction( + TableFactory tableFactory, + MixedTable mixedTable, + List joinKeys, + Schema projectSchema, + List filters, + MixedFormatTableLoader tableLoader, + Configuration config, + Predicate predicate, + AbstractAdaptHiveKeyedDataReader flinkMORDataReader, + DataIteratorReaderFunction readerFunction) { + this.basicLookupFunction = + new BasicLookupFunction<>( + tableFactory, + mixedTable, + joinKeys, + projectSchema, + filters, + tableLoader, + config, + predicate, + flinkMORDataReader, + readerFunction); + } + + @Override + public void open(FunctionContext context) throws IOException { + basicLookupFunction.open(context); + } + + @Override + public Collection lookup(RowData keyRow) throws IOException { + return basicLookupFunction.lookup(keyRow); + } + + @Override + public void close() throws Exception { + basicLookupFunction.close(); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBCacheState.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBCacheState.java new file mode 100644 index 0000000000..a82bff54db --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBCacheState.java @@ -0,0 +1,342 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.lookup; + +import org.apache.amoro.AmoroIOException; +import org.apache.amoro.utils.map.RocksDBBackend; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.metrics.Gauge; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.shaded.guava30.com.google.common.cache.Cache; +import org.apache.flink.shaded.guava30.com.google.common.cache.CacheBuilder; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.FlinkRuntimeException; +import org.apache.flink.util.Preconditions; +import org.rocksdb.ColumnFamilyHandle; +import org.rocksdb.MutableColumnFamilyOptions; +import org.rocksdb.RocksDBException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Queue; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * This is an abstract state backed by RocksDB and Guava cache for storing and retrieving key-value + * pairs of byte arrays. 
+ * + * @param the type of the cache's values, which are not permitted to be null + */ +public abstract class RocksDBCacheState { + private static final Logger LOG = LoggerFactory.getLogger(RocksDBCacheState.class); + protected RocksDBBackend rocksDB; + protected final boolean secondaryIndexMemoryMapEnabled; + + protected Cache guavaCache; + + protected final String columnFamilyName; + protected final ColumnFamilyHandle columnFamilyHandle; + protected ThreadLocal keySerializerThreadLocal = + new ThreadLocal<>(); + + protected ThreadLocal valueSerializerThreadLocal = + new ThreadLocal<>(); + + protected final BinaryRowDataSerializerWrapper keySerializer; + + protected final BinaryRowDataSerializerWrapper valueSerializer; + private ExecutorService writeRocksDBService; + private final AtomicBoolean initialized = new AtomicBoolean(false); + private final AtomicBoolean closed = new AtomicBoolean(false); + protected Queue lookupRecordsQueue; + + private final int writeRocksDBThreadNum; + private List> writeRocksDBThreadFutures; + private final AtomicReference writingThreadException = new AtomicReference<>(); + protected final MetricGroup metricGroup; + private final LookupOptions lookupOptions; + + public RocksDBCacheState( + RocksDBBackend rocksDB, + String columnFamilyName, + BinaryRowDataSerializerWrapper keySerializer, + BinaryRowDataSerializerWrapper valueSerializer, + MetricGroup metricGroup, + LookupOptions lookupOptions, + boolean secondaryIndexMemoryMapEnabled) { + this.rocksDB = rocksDB; + this.columnFamilyName = columnFamilyName; + this.keySerializer = keySerializer; + this.valueSerializer = valueSerializer; + this.columnFamilyHandle = rocksDB.getColumnFamilyHandle(columnFamilyName); + this.writeRocksDBThreadNum = lookupOptions.writeRecordThreadNum(); + this.secondaryIndexMemoryMapEnabled = secondaryIndexMemoryMapEnabled; + this.metricGroup = metricGroup; + this.lookupOptions = lookupOptions; + } + + public void open() { + writeRocksDBService = 
Executors.newFixedThreadPool(writeRocksDBThreadNum); + + if (secondaryIndexMemoryMapEnabled) { + CacheBuilder cacheBuilder = CacheBuilder.newBuilder(); + if (lookupOptions.isTTLAfterWriteValidated()) { + cacheBuilder.expireAfterWrite(lookupOptions.ttlAfterWrite()); + } + } + guavaCache = CacheBuilder.newBuilder().maximumSize(lookupOptions.lruMaximumSize()).build(); + + addGauge(columnFamilyName + "_queue_size", () -> lookupRecordsQueue.size()); + + lookupRecordsQueue = new ConcurrentLinkedQueue<>(); + writeRocksDBThreadFutures = + IntStream.range(0, writeRocksDBThreadNum) + .mapToObj( + value -> + writeRocksDBService.submit( + new WriteRocksDBTask( + String.format( + "writing-rocksDB-cf_%s-thread-%d", columnFamilyName, value), + secondaryIndexMemoryMapEnabled))) + .collect(Collectors.toList()); + } + + @VisibleForTesting + public byte[] serializeKey(RowData key) throws IOException { + if (keySerializerThreadLocal.get() == null) { + keySerializerThreadLocal.set(keySerializer.clone()); + } + return serializeKey(keySerializerThreadLocal.get(), key); + } + + @VisibleForTesting + public byte[] serializeKey(BinaryRowDataSerializerWrapper keySerializer, RowData key) + throws IOException { + // key has a different RowKind would serialize different byte[], so unify the RowKind as INSERT. + byte[] result; + if (key.getRowKind() != RowKind.INSERT) { + RowKind rowKind = key.getRowKind(); + key.setRowKind(RowKind.INSERT); + result = keySerializer.serialize(key); + key.setRowKind(rowKind); + return result; + } + key.setRowKind(RowKind.INSERT); + return keySerializer.serialize(key); + } + + protected ByteArrayWrapper wrap(byte[] bytes) { + return new ByteArrayWrapper(bytes, bytes.length); + } + + protected void putIntoQueue(LookupRecord lookupRecord) { + Preconditions.checkNotNull(lookupRecord); + lookupRecordsQueue.add(lookupRecord); + } + + /** Waiting for the writing threads completed. 
*/ + public void waitWriteRocksDBDone() { + long every5SecondsPrint = Long.MIN_VALUE; + + while (true) { + if (lookupRecordsQueue.isEmpty()) { + initialized.set(true); + break; + } else if (every5SecondsPrint < System.currentTimeMillis()) { + LOG.info("Currently rocksDB queue size is {}.", lookupRecordsQueue.size()); + every5SecondsPrint = System.currentTimeMillis() + 5000; + } + } + // Wait for all threads to finish + for (Future future : writeRocksDBThreadFutures) { + try { + // wait for the task to complete, with a timeout of 5 seconds + future.get(5, TimeUnit.SECONDS); + } catch (TimeoutException e) { + // task took too long, interrupt the thread and terminate the task + future.cancel(true); + } catch (InterruptedException | ExecutionException e) { + // handle other exceptions + throw new FlinkRuntimeException(e); + } + } + } + + public boolean initialized() { + return initialized.get(); + } + + protected LookupRecord.OpType convertToOpType(RowKind rowKind) { + switch (rowKind) { + case INSERT: + case UPDATE_AFTER: + return LookupRecord.OpType.PUT_BYTES; + case DELETE: + case UPDATE_BEFORE: + return LookupRecord.OpType.DELETE_BYTES; + default: + throw new IllegalArgumentException(String.format("Not support this rowKind %s", rowKind)); + } + } + + /** + * Closes the RocksDB instance and cleans up the Guava cache. + * + *

Additionally, it shuts down the write-service and clears the RocksDB record queue if they + * exist. + */ + public void close() { + rocksDB.close(); + guavaCache.cleanUp(); + if (writeRocksDBService != null) { + writeRocksDBService.shutdown(); + writeRocksDBService = null; + } + closed.set(true); + if (lookupRecordsQueue != null) { + lookupRecordsQueue.clear(); + lookupRecordsQueue = null; + } + } + + public void initializationCompleted() { + try { + rocksDB.getDB().enableAutoCompaction(Collections.singletonList(columnFamilyHandle)); + MutableColumnFamilyOptions mutableColumnFamilyOptions = + MutableColumnFamilyOptions.builder().setDisableAutoCompactions(false).build(); + rocksDB.setOptions(columnFamilyHandle, mutableColumnFamilyOptions); + } catch (RocksDBException e) { + throw new AmoroIOException(e); + } + + LOG.info("set db options[disable_auto_compactions={}]", false); + } + + public void addGauge(String metricName, Gauge gauge) { + metricGroup.gauge(metricName, gauge); + } + + protected void checkConcurrentFailed() { + if (writingThreadException.get() != null) { + LOG.error("Check concurrent writing threads.", writingThreadException.get()); + throw new FlinkRuntimeException(writingThreadException.get()); + } + } + + /** + * This task is running during the initialization phase to write data{@link LookupRecord} to + * RocksDB. + * + *

During the initialization phase, the Merge-on-Read approach is used to retrieve data, which + * will only return INSERT data. When there are multiple entries with the same primary key, only + * one entry will be returned. + * + *

During the initialization phase, the incremental pull approach is also used to retrieve data + * that include four {@link RowKind} rowKinds, -D, +I, -U, and +U. + */ + class WriteRocksDBTask implements Runnable { + + private final String name; + private final boolean secondaryIndexMemoryMapEnabled; + + public WriteRocksDBTask(String name, boolean secondaryIndexMemoryMapEnabled) { + this.name = name; + this.secondaryIndexMemoryMapEnabled = secondaryIndexMemoryMapEnabled; + } + + @Override + public void run() { + LOG.info("{} starting.", name); + try { + while (!closed.get() && !initialized.get()) { + LookupRecord record = lookupRecordsQueue.poll(); + if (record != null) { + switch (record.opType()) { + case PUT_BYTES: + put(record); + break; + case DELETE_BYTES: + delete(record); + break; + default: + throw new IllegalArgumentException( + String.format("Not support this OpType %s", record.opType())); + } + } + } + } catch (Throwable e) { + LOG.error("writing failed:", e); + writingThreadException.set(e); + } + LOG.info("{} stopping.", name); + } + + private void delete(LookupRecord record) { + if (secondaryIndexMemoryMapEnabled) { + deleteSecondaryCache(record.keyBytes(), record.valueBytes()); + } else { + rocksDB.delete(columnFamilyName, record.keyBytes()); + // manually clear the record + record = null; + } + } + + private void put(LookupRecord record) { + if (secondaryIndexMemoryMapEnabled) { + putSecondaryCache(record.keyBytes(), record.valueBytes()); + } else { + rocksDB.put(columnFamilyHandle, record.keyBytes(), record.valueBytes()); + // manually clear the record + record = null; + } + } + } + + void putSecondaryCache(byte[] key, byte[] value) { + ByteArrayWrapper keyWrap = wrap(key); + ByteArrayWrapper valueWrap = wrap(value); + putCacheValue(guavaCache, keyWrap, valueWrap); + } + + void deleteSecondaryCache(byte[] key, byte[] value) { + ByteArrayWrapper keyWrap = wrap(key); + ByteArrayWrapper valueWrap = wrap(value); + removeValue(guavaCache, keyWrap, 
valueWrap); + } + + void putCacheValue( + Cache cache, ByteArrayWrapper keyWrap, ByteArrayWrapper valueWrap) {} + + void removeValue( + Cache cache, ByteArrayWrapper keyWrap, ByteArrayWrapper valueWrap) {} +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBRecordState.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBRecordState.java new file mode 100644 index 0000000000..51bce5ab78 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBRecordState.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.lookup; + +import org.apache.amoro.utils.map.RocksDBBackend; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Optional; + +/** A class used to store the state of a lookup record. 
For {@link UniqueIndexTable}. */ +public class RocksDBRecordState extends RocksDBCacheState { + private static final Logger LOG = LoggerFactory.getLogger(RocksDBRecordState.class); + + public RocksDBRecordState( + RocksDBBackend rocksDB, + String columnFamilyName, + BinaryRowDataSerializerWrapper keySerializer, + BinaryRowDataSerializerWrapper valueSerializer, + MetricGroup metricGroup, + LookupOptions lookupOptions) { + super( + rocksDB, + columnFamilyName, + keySerializer, + valueSerializer, + metricGroup, + lookupOptions, + false); + } + + /** + * Writes a key-value pair to the sst file. + * + * @param key The key of the pair. + * @param value The value of the pair. + */ + public void asyncWrite(RowData key, RowData value) throws IOException { + byte[] keyBytes = serializeKey(key); + asyncWrite(key.getRowKind(), keyBytes, value); + } + + public void asyncWrite(RowKind rowKind, byte[] keyBytes, RowData value) throws IOException { + byte[] valueBytes = serializeValue(value); + LookupRecord.OpType opType = convertToOpType(rowKind); + putIntoQueue(LookupRecord.of(opType, keyBytes, valueBytes)); + } + + /** + * Retrieve the RowData from guava cache firstly, if value is null, fetch the value from the + * rocksDB. + * + * @param key try to find the record via this key. + * @throws IOException if serialize the RowData variable key failed. + */ + public Optional get(RowData key) throws IOException { + byte[] keyBytes = serializeKey(key); + return get(keyBytes); + } + + public Optional get(byte[] keyBytes) throws IOException { + ByteArrayWrapper key = wrap(keyBytes); + byte[] recordBytes = guavaCache.getIfPresent(key); + if (recordBytes == null) { + recordBytes = rocksDB.get(columnFamilyHandle, key.bytes); + if (recordBytes != null) { + guavaCache.put(key, recordBytes); + } + } + return Optional.ofNullable(deserializeValue(recordBytes)); + } + + /** + * Putting the serialized RowData key and value into the rocksDB and cache. 
+ * + * @throws IOException if serialize the RowData variable key and value failed. + */ + public void put(RowData key, RowData value) throws IOException { + byte[] keyBytes = serializeKey(key); + put(keyBytes, value); + } + + public void put(byte[] keyBytes, RowData value) throws IOException { + Preconditions.checkNotNull(value); + + byte[] valueBytes = serializeValue(value); + rocksDB.put(columnFamilyHandle, keyBytes, valueBytes); + + // Speed up the initialization process of Lookup Join Function + ByteArrayWrapper key = wrap(keyBytes); + if (guavaCache.getIfPresent(wrap(keyBytes)) != null) { + guavaCache.put(key, valueBytes); + } + } + + /** + * Deleting the record in the rocksDB and cache if it exists. + * + * @throws IOException if serialize the RowData variable key failed. + */ + public void delete(RowData key) throws IOException { + byte[] keyBytes = serializeKey(key); + delete(keyBytes); + } + + public void delete(byte[] keyBytes) { + if (contain(wrap(keyBytes))) { + rocksDB.delete(columnFamilyName, keyBytes); + guavaCache.invalidate(wrap(keyBytes)); + } + } + + private boolean contain(ByteArrayWrapper byteArrayWrapper) { + byte[] recordBytes = guavaCache.getIfPresent(byteArrayWrapper); + if (recordBytes == null) { + recordBytes = rocksDB.get(columnFamilyName, byteArrayWrapper.bytes); + } + return recordBytes != null; + } + + private byte[] serializeValue(RowData value) throws IOException { + return valueSerializer().serialize(value); + } + + private RowData deserializeValue(byte[] recordBytes) throws IOException { + return valueSerializer().deserialize(recordBytes); + } + + private BinaryRowDataSerializerWrapper valueSerializer() { + if (valueSerializerThreadLocal.get() == null) { + valueSerializerThreadLocal.set(valueSerializer.clone()); + } + return valueSerializerThreadLocal.get(); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBSetSpilledState.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBSetSpilledState.java new file mode 100644 index 0000000000..5215ed812c --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBSetSpilledState.java @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.lookup; + +import org.apache.amoro.shade.guava32.com.google.common.collect.Sets; +import org.apache.amoro.utils.map.RocksDBBackend; +import org.apache.commons.collections.CollectionUtils; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.shaded.guava30.com.google.common.cache.Cache; +import org.apache.flink.table.data.RowData; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +/** + * A class that stores the secondary index in the cache. For {@link SecondaryIndexTable}. + * + *

Support update the secondary index in the cache. + */ +public class RocksDBSetSpilledState extends RocksDBCacheState> { + private static final Logger LOG = LoggerFactory.getLogger(RocksDBSetSpilledState.class); + protected ThreadLocal joinKeySerializerThreadLocal = + new ThreadLocal<>(); + private final BinaryRowDataSerializerWrapper joinKeySerializer; + /** Multi-threads would put and delete the joinKeys and Set in the rocksdb. */ + private final Object rocksDBLock = new Object(); + + private final Map> tmpInitializationMap = + new ConcurrentHashMap<>(); + + public RocksDBSetSpilledState( + RocksDBBackend rocksDB, + String columnFamilyName, + BinaryRowDataSerializerWrapper joinKeySerializer, + BinaryRowDataSerializerWrapper uniqueKeySerialization, + BinaryRowDataSerializerWrapper valueSerializer, + MetricGroup metricGroup, + LookupOptions lookupOptions) { + super( + rocksDB, + columnFamilyName, + uniqueKeySerialization, + valueSerializer, + metricGroup, + lookupOptions, + true); + this.joinKeySerializer = joinKeySerializer; + } + + public void asyncWrite(RowData joinKey, byte[] uniqueKeyBytes) throws IOException { + byte[] joinKeyBytes = serializeKey(joinKey); + LookupRecord.OpType opType = convertToOpType(joinKey.getRowKind()); + putIntoQueue(LookupRecord.of(opType, joinKeyBytes, uniqueKeyBytes)); + } + + @Override + public byte[] serializeKey(RowData key) throws IOException { + if (joinKeySerializerThreadLocal.get() == null) { + joinKeySerializerThreadLocal.set(joinKeySerializer.clone()); + } + return serializeKey(joinKeySerializerThreadLocal.get(), key); + } + + /** + * Serialize join key to bytes and put the join key bytes and unique key bytes in the cache. + * + * @param joinKey the join key + * @param uniqueKeyBytes the unique key bytes + * @throws IOException if serialize the RowData variable failed. 
+ */ + public void put(RowData joinKey, byte[] uniqueKeyBytes) throws IOException { + byte[] joinKeyBytes = serializeKey(joinKey); + putSecondaryCache(joinKeyBytes, uniqueKeyBytes); + } + + /** + * Delete the secondary index in the cache. + * + * @param joinKey the join key + * @param uniqueKeyBytes the unique key bytes + * @throws IOException if serialize the RowData variable failed. + */ + public void delete(RowData joinKey, byte[] uniqueKeyBytes) throws IOException { + final byte[] joinKeyBytes = serializeKey(joinKey); + deleteSecondaryCache(joinKeyBytes, uniqueKeyBytes); + } + + /** + * Retrieve the elements of the key. + * + *

Fetch the Collection from guava cache, if not present, fetch the result from RocksDB. if + * present, just return the result. + * + * @return not null, but may be empty. + */ + public Collection get(RowData key) throws IOException { + final byte[] joinKeyBytes = serializeKey(key); + ByteArrayWrapper joinKeyWrap = wrap(joinKeyBytes); + Set result = guavaCache.getIfPresent(joinKeyWrap); + if (result == null) { + byte[] uniqueKeysDeserialized = rocksDB.get(columnFamilyHandle, joinKeyBytes); + if (uniqueKeysDeserialized != null) { + result = ByteArraySetSerializer.deserialize(uniqueKeysDeserialized); + } + + if (CollectionUtils.isNotEmpty(result)) { + guavaCache.put(joinKeyWrap, result); + return result; + } + return Collections.emptyList(); + } + return result; + } + + @Override + public void putCacheValue( + Cache> cache, + ByteArrayWrapper keyWrap, + ByteArrayWrapper valueWrap) { + if (initialized()) { + byte[] joinKeyBytes = keyWrap.bytes; + synchronized (rocksDBLock) { + byte[] uniqueKeysDeserialized = rocksDB.get(columnFamilyHandle, joinKeyBytes); + if (uniqueKeysDeserialized != null) { + Set set = ByteArraySetSerializer.deserialize(uniqueKeysDeserialized); + if (!set.contains(valueWrap)) { + set.add(valueWrap); + uniqueKeysDeserialized = ByteArraySetSerializer.serialize(set); + rocksDB.put(columnFamilyHandle, joinKeyBytes, uniqueKeysDeserialized); + } + } else { + Set set = new HashSet<>(); + set.add(valueWrap); + uniqueKeysDeserialized = ByteArraySetSerializer.serialize(set); + rocksDB.put(columnFamilyHandle, joinKeyBytes, uniqueKeysDeserialized); + } + } + return; + } + tmpInitializationMap.compute( + keyWrap, + (keyWrapper, oldSet) -> { + if (oldSet == null) { + oldSet = Sets.newHashSet(); + } + oldSet.add(valueWrap); + return oldSet; + }); + } + + @Override + public void removeValue( + Cache> cache, + ByteArrayWrapper keyWrap, + ByteArrayWrapper valueWrap) { + if (initialized()) { + byte[] joinKeyBytes = keyWrap.bytes; + synchronized (rocksDBLock) { + byte[] 
uniqueKeysDeserialized = rocksDB.get(columnFamilyHandle, joinKeyBytes); + if (uniqueKeysDeserialized == null) { + return; + } + Set set = ByteArraySetSerializer.deserialize(uniqueKeysDeserialized); + if (set.contains(valueWrap)) { + set.remove(valueWrap); + if (!set.isEmpty()) { + uniqueKeysDeserialized = ByteArraySetSerializer.serialize(set); + rocksDB.put(columnFamilyHandle, joinKeyBytes, uniqueKeysDeserialized); + } + } + } + return; + } + tmpInitializationMap.compute( + keyWrap, + (keyWrapper, oldSet) -> { + if (oldSet == null) { + return null; + } + oldSet.remove(valueWrap); + if (oldSet.isEmpty()) { + return null; + } + return oldSet; + }); + } + + public void bulkIntoRocksDB() { + LOG.info("Total size={} in the tmp map, try to bulk into rocksdb", tmpInitializationMap.size()); + int[] count = {0}; + long start = System.currentTimeMillis(); + + tmpInitializationMap.forEach( + (byteArrayWrapper, set) -> { + rocksDB.put( + columnFamilyHandle, byteArrayWrapper.bytes, ByteArraySetSerializer.serialize(set)); + set = null; + count[0] = count[0] + 1; + if (count[0] % 100000 == 0) { + LOG.info("Ingested {} into rocksdb.", count[0]); + } + }); + tmpInitializationMap.clear(); + + LOG.info("Ingested {} completely, cost:{} ms.", count, System.currentTimeMillis() - start); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBSetState.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBSetState.java new file mode 100644 index 0000000000..e1f63ab4aa --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBSetState.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.amoro.flink.lookup;

import org.apache.amoro.log.Bytes;
import org.apache.amoro.shade.guava32.com.google.common.collect.Lists;
import org.apache.amoro.utils.map.RocksDBBackend;
import org.apache.flink.metrics.MetricGroup;
import org.apache.flink.table.data.RowData;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

/**
 * Guava cache structure: key -> list, the elements of this list are rocksdb keys. RocksDB
 * structure: element -> empty (the set membership is encoded in the key itself as joinKey ++
 * element).
 */
public class RocksDBSetState extends RocksDBCacheState<List<byte[]>> {

  protected BinaryRowDataSerializerWrapper joinKeySerializer;

  private static final byte[] EMPTY = new byte[0];

  public RocksDBSetState(
      RocksDBBackend rocksDB,
      String columnFamilyName,
      BinaryRowDataSerializerWrapper keySerialization,
      BinaryRowDataSerializerWrapper elementSerialization,
      BinaryRowDataSerializerWrapper valueSerializer,
      MetricGroup metricGroup,
      LookupOptions lookupOptions) {
    super(
        rocksDB,
        columnFamilyName,
        elementSerialization,
        valueSerializer,
        metricGroup,
        lookupOptions,
        false);
    this.joinKeySerializer = keySerialization;
  }

  /**
   * Retrieve the elements of the key.
   *
   * <p>Fetch the List from the Guava cache; if not present, fetch from RocksDB via a prefix scan;
   * if present, just return the cached result.
   *
   * @param key the join-key row
   * @return not null, but may be empty
   * @throws IOException if the key cannot be serialized
   */
  public List<byte[]> get(RowData key) throws IOException {
    final byte[] keyBytes = serializeKey(key);
    ByteArrayWrapper keyWrap = wrap(keyBytes);
    List<byte[]> result = guavaCache.getIfPresent(keyWrap);
    if (result == null) {
      try (RocksDBBackend.ValueIterator iterator =
          (RocksDBBackend.ValueIterator) rocksDB.values(columnFamilyName, keyBytes)) {
        result = Lists.newArrayList();
        while (iterator.hasNext()) {
          byte[] targetKeyBytes = iterator.key();
          if (isPrefixKey(targetKeyBytes, keyBytes)) {
            // Strip the join-key prefix; the remainder is the stored element.
            byte[] value =
                Arrays.copyOfRange(targetKeyBytes, keyBytes.length, targetKeyBytes.length);
            result.add(value);
          }
          iterator.next();
        }
        if (!result.isEmpty()) {
          guavaCache.put(keyWrap, result);
        }
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    }
    return result;
  }

  /** Returns true iff {@code keyBytes} is a byte-wise prefix of {@code targetKeyBytes}. */
  private boolean isPrefixKey(byte[] targetKeyBytes, byte[] keyBytes) {
    // BUGFIX: guard against scanned keys shorter than the prefix, which previously caused an
    // ArrayIndexOutOfBoundsException (a shorter key can never match the prefix anyway).
    if (targetKeyBytes.length < keyBytes.length) {
      return false;
    }
    for (int i = 0; i < keyBytes.length; i++) {
      if (targetKeyBytes[i] != keyBytes[i]) {
        return false;
      }
    }
    return true;
  }

  /**
   * Merge key and element into RocksDB, invalidating any cached list for the key so the next read
   * sees the new element.
   */
  public void merge(RowData joinKey, byte[] uniqueKeyBytes) throws IOException {
    byte[] joinKeyBytes = serializeKey(joinKey);
    byte[] joinKeyAndPrimaryKeyBytes = Bytes.mergeByte(joinKeyBytes, uniqueKeyBytes);
    ByteArrayWrapper keyWrap = wrap(joinKeyBytes);
    if (guavaCache.getIfPresent(keyWrap) != null) {
      guavaCache.invalidate(keyWrap);
    }
    rocksDB.put(columnFamilyName, joinKeyAndPrimaryKeyBytes, EMPTY);
  }

  /** Removes the element from the key's set and invalidates the cached list, if any. */
  public void delete(RowData joinKey, byte[] elementBytes) throws IOException {
    final byte[] joinKeyBytes = serializeKey(joinKey);
    ByteArrayWrapper keyWrap = wrap(joinKeyBytes);
    if (guavaCache.getIfPresent(keyWrap) != null) {
      guavaCache.invalidate(keyWrap);
    }
    byte[] joinKeyAndPrimaryKeyBytes = Bytes.mergeByte(joinKeyBytes, elementBytes);
    if (rocksDB.get(columnFamilyName, joinKeyAndPrimaryKeyBytes) != null) {
      rocksDB.delete(columnFamilyName, joinKeyAndPrimaryKeyBytes);
    }
  }

  /** Enqueues an upsert/retract for asynchronous batch writing, keyed by the row kind. */
  public void batchWrite(RowData joinKey, byte[] uniqueKeyBytes) throws IOException {
    byte[] joinKeyBytes = serializeKey(joinKey);
    byte[] joinKeyAndPrimaryKeyBytes = Bytes.mergeByte(joinKeyBytes, uniqueKeyBytes);
    LookupRecord.OpType opType = convertToOpType(joinKey.getRowKind());
    lookupRecordsQueue.add(LookupRecord.of(opType, joinKeyAndPrimaryKeyBytes, EMPTY));
  }

  /** Serializes a join-key row with the dedicated join-key serializer. */
  public byte[] serializeKey(RowData key) throws IOException {
    return serializeKey(joinKeySerializer, key);
  }
}
or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.amoro.flink.lookup;

import org.apache.amoro.utils.map.RocksDBBackend;
import org.apache.flink.metrics.MetricGroup;
import org.apache.flink.util.Preconditions;
import org.rocksdb.BlockBasedTableConfig;
import org.rocksdb.ColumnFamilyOptions;
import org.rocksdb.LRUCache;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Factory that creates RocksDB-backed lookup states ({@link RocksDBRecordState} and {@link
 * RocksDBSetSpilledState}) sharing one {@link RocksDBBackend} instance located at {@code dbPath}.
 */
public class RowDataStateFactory {
  private static final Logger LOG = LoggerFactory.getLogger(RowDataStateFactory.class);

  private final String dbPath;
  // Shared backend; (re-)assigned by createDB for each state created by this factory.
  private RocksDBBackend db;
  private final MetricGroup metricGroup;

  public RowDataStateFactory(String dbPath, MetricGroup metricGroup) {
    Preconditions.checkNotNull(metricGroup);
    this.dbPath = dbPath;
    this.metricGroup = metricGroup;
  }

  /**
   * Creates a record (key -> row) state backed by its own column family.
   *
   * @param columnFamilyName column family to create for this state
   * @param keySerializer serializer for the lookup key
   * @param valueSerializer serializer for the stored row
   * @param lookupOptions cache/TTL configuration
   */
  public RocksDBRecordState createRecordState(
      String columnFamilyName,
      BinaryRowDataSerializerWrapper keySerializer,
      BinaryRowDataSerializerWrapper valueSerializer,
      LookupOptions lookupOptions) {
    // Cleanup: createDB already stores the backend in the field; use its return value instead of
    // redundantly re-assigning `db` here.
    RocksDBBackend backend = createDB(lookupOptions, columnFamilyName);
    return new RocksDBRecordState(
        backend, columnFamilyName, keySerializer, valueSerializer, metricGroup, lookupOptions);
  }

  /**
   * Creates a set (join key -> set of unique keys) state backed by its own column family.
   *
   * @param columnFamilyName column family to create for this state
   * @param keySerialization serializer for the join key
   * @param elementSerialization serializer for the set element (unique key)
   * @param valueSerializer serializer for the stored row
   * @param lookupOptions cache/TTL configuration
   */
  public RocksDBSetSpilledState createSetState(
      String columnFamilyName,
      BinaryRowDataSerializerWrapper keySerialization,
      BinaryRowDataSerializerWrapper elementSerialization,
      BinaryRowDataSerializerWrapper valueSerializer,
      LookupOptions lookupOptions) {
    RocksDBBackend backend = createDB(lookupOptions, columnFamilyName);
    return new RocksDBSetSpilledState(
        backend,
        columnFamilyName,
        keySerialization,
        elementSerialization,
        valueSerializer,
        metricGroup,
        lookupOptions);
  }

  /**
   * Gets or creates the shared backend (with TTL when configured) and registers the given column
   * family on it.
   */
  RocksDBBackend createDB(final LookupOptions lookupOptions, final String columnFamilyName) {
    if (lookupOptions.isTTLAfterWriteValidated()) {
      db =
          RocksDBBackend.getOrCreateInstance(
              dbPath, (int) lookupOptions.ttlAfterWrite().getSeconds());
    } else {
      db = RocksDBBackend.getOrCreateInstance(dbPath);
    }
    ColumnFamilyOptions columnFamilyOptions = new ColumnFamilyOptions();
    configColumnFamilyOption(columnFamilyOptions, lookupOptions);
    db.addColumnFamily(columnFamilyName, columnFamilyOptions);
    return db;
  }

  /** Applies the lookup options to a column family: no auto-compaction, LRU block cache. */
  private void configColumnFamilyOption(
      ColumnFamilyOptions columnFamilyOptions, LookupOptions lookupOptions) {
    // Auto compaction is disabled because the state is bulk-loaded and then mostly read.
    columnFamilyOptions.setDisableAutoCompactions(true);

    BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
    blockBasedTableConfig.setBlockCache(
        new LRUCache(lookupOptions.blockCacheCapacity(), lookupOptions.numShardBits()));
    columnFamilyOptions.setTableFormatConfig(blockBasedTableConfig);

    LOG.info("set db options[disable_auto_compactions={}]", true);
    LOG.info("{}", lookupOptions);
  }
}
Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.amoro.flink.lookup;

import static org.apache.amoro.flink.lookup.LookupMetrics.SECONDARY_CACHE_SIZE;

import org.apache.flink.table.data.RowData;
import org.apache.flink.types.RowKind;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.function.Predicate;
import java.util.stream.Collectors;

/**
 * Use secondary index to lookup. Working for the situation where the join keys don't match the
 * mixed-format table's primary keys.
 *
 * <p>Example: SELECT * FROM t1 JOIN t2 for system_time as of t1.pt as dim ON t1.user_name =
 * dim.user_name
 *
 * <p>t2 as a mixed-format table with primary keys: user_name, city_name.
 */
public class SecondaryIndexTable extends UniqueIndexTable {
  private static final Logger LOG = LoggerFactory.getLogger(SecondaryIndexTable.class);
  private static final long serialVersionUID = 8707586070315884365L;
  // Maps join-key field names to their positions in the projected schema.
  private final int[] secondaryKeyIndexMapping;
  // Secondary index: join key -> set of serialized primary keys.
  private final RocksDBSetSpilledState setState;

  private final LookupOptions lookupOptions;

  public SecondaryIndexTable(
      RowDataStateFactory rowDataStateFactory,
      List<String> primaryKeys,
      List<String> joinKeys,
      Schema projectSchema,
      LookupOptions lookupOptions,
      Predicate<RowData> rowDataPredicate) {
    super(rowDataStateFactory, primaryKeys, projectSchema, lookupOptions, rowDataPredicate);

    this.setState =
        rowDataStateFactory.createSetState(
            "secondaryIndex",
            createKeySerializer(projectSchema, joinKeys),
            createKeySerializer(projectSchema, primaryKeys),
            createValueSerializer(projectSchema),
            lookupOptions);

    List<String> fields =
        projectSchema.asStruct().fields().stream()
            .map(Types.NestedField::name)
            .collect(Collectors.toList());
    secondaryKeyIndexMapping = joinKeys.stream().mapToInt(fields::indexOf).toArray();
    this.lookupOptions = lookupOptions;
  }

  @Override
  public void open() {
    super.open();
    setState.open();
    setState.addGauge(SECONDARY_CACHE_SIZE, () -> setState.guavaCache.size());
  }

  /**
   * Looks up rows by join key: resolve the primary keys through the secondary index, then fetch
   * each row from the record state.
   */
  @Override
  public List<RowData> get(RowData key) throws IOException {
    Collection<ByteArrayWrapper> uniqueKeys = setState.get(key);
    if (!uniqueKeys.isEmpty()) {
      List<RowData> result = new ArrayList<>(uniqueKeys.size());
      for (ByteArrayWrapper uniqueKey : uniqueKeys) {
        recordState.get(uniqueKey.bytes).ifPresent(result::add);
      }
      return result;
    }
    return Collections.emptyList();
  }

  /** Applies a changelog stream: maintains both the record state and the secondary index. */
  @Override
  public void upsert(Iterator<RowData> dataStream) throws IOException {
    while (dataStream.hasNext()) {
      RowData value = dataStream.next();
      if (filter(value)) {
        continue;
      }
      RowData uniqueKey = new KeyRowData(uniqueKeyIndexMapping, value);
      RowData joinKey = new KeyRowData(secondaryKeyIndexMapping, value);
      byte[] uniqueKeyBytes = recordState.serializeKey(uniqueKey);

      if (value.getRowKind() == RowKind.INSERT || value.getRowKind() == RowKind.UPDATE_AFTER) {
        recordState.put(uniqueKeyBytes, value);
        setState.put(joinKey, uniqueKeyBytes);
      } else {
        recordState.delete(uniqueKeyBytes);
        setState.delete(joinKey, uniqueKeyBytes);
      }
    }
    cleanUp();
  }

  /** Bulk-loads the initial snapshot through the asynchronous write path of both states. */
  @Override
  public void initialize(Iterator<RowData> dataStream) throws IOException {
    while (dataStream.hasNext()) {
      RowData value = dataStream.next();
      if (filter(value)) {
        continue;
      }
      RowData uniqueKey = new KeyRowData(uniqueKeyIndexMapping, value);
      RowData joinKey = new KeyRowData(secondaryKeyIndexMapping, value);
      byte[] uniqueKeyBytes = recordState.serializeKey(uniqueKey);

      recordState.asyncWrite(value.getRowKind(), uniqueKeyBytes, value);
      setState.asyncWrite(joinKey, uniqueKeyBytes);
    }
    recordState.checkConcurrentFailed();
    setState.checkConcurrentFailed();
  }

  @Override
  public boolean initialized() {
    return recordState.initialized() && setState.initialized();
  }

  @Override
  public void cleanUp() {
    if (lookupOptions.isTTLAfterWriteValidated()) {
      setState.guavaCache.cleanUp();
    }
  }

  @Override
  public void waitInitializationCompleted() {
    super.waitInitializationCompleted();
    LOG.info("Waiting for Set State initialization");
    setState.waitWriteRocksDBDone();
    LOG.info("Queue is empty row, try to bulk tmp map into rocksdb");
    setState.bulkIntoRocksDB();
    LOG.info("The concurrent threads have finished writing data into the Set State.");
    setState.initializationCompleted();
  }

  @Override
  public void close() {
    // BUGFIX: super.close() already closes recordState; previously recordState was closed twice
    // while setState was never closed, leaking its RocksDB-backed resources.
    super.close();
    setState.close();
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.amoro.flink.lookup;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.Schema;

import java.util.List;
import java.util.function.Predicate;

/** Factory abstraction for building the {@link KVTable} used by lookup joins. */
public interface TableFactory {

  /**
   * Builds a {@link KVTable} for the given key layout.
   *
   * @param rowDataStateFactory factory producing the RocksDB-backed lookup states
   * @param primaryKeys primary-key field names of the table
   * @param joinKeys join-key field names used by the lookup
   * @param projectSchema projected schema of the rows kept in the table
   * @param config Flink configuration carrying lookup options
   * @param predicate optional row filter applied before storing rows; may be null
   * @return a table ready to be opened
   */
  KVTable create(
      RowDataStateFactory rowDataStateFactory,
      List<String> primaryKeys,
      List<String> joinKeys,
      Schema projectSchema,
      Configuration config,
      Predicate<RowData> predicate);
}
*/

package org.apache.amoro.flink.lookup;

import static org.apache.amoro.flink.lookup.LookupMetrics.UNIQUE_CACHE_SIZE;

import org.apache.amoro.utils.SchemaUtil;
import org.apache.flink.table.data.RowData;
import org.apache.flink.types.RowKind;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.function.Predicate;
import java.util.stream.Collectors;

/**
 * Use a unique index to lookup. Working for the situation where the join keys include the
 * mixed-format table's primary keys, so every lookup key resolves to at most one row.
 */
public class UniqueIndexTable implements KVTable {
  private static final Logger LOG = LoggerFactory.getLogger(UniqueIndexTable.class);
  private static final long serialVersionUID = -6537777722200330050L;
  /** Primary-key -> row state backed by RocksDB. */
  protected final RocksDBRecordState recordState;

  /** Positions of the primary-key fields within the projected schema. */
  protected int[] uniqueKeyIndexMapping;

  /** Optional row filter; rows it rejects are skipped entirely. */
  protected final Predicate<RowData> rowDataPredicate;

  public UniqueIndexTable(
      RowDataStateFactory rowDataStateFactory,
      List<String> primaryKeys,
      Schema projectSchema,
      LookupOptions lookupOptions,
      Predicate<RowData> rowDataPredicate) {

    this.recordState =
        rowDataStateFactory.createRecordState(
            "uniqueIndex",
            createKeySerializer(projectSchema, primaryKeys),
            createValueSerializer(projectSchema),
            lookupOptions);
    List<String> fieldNames =
        projectSchema.asStruct().fields().stream()
            .map(Types.NestedField::name)
            .collect(Collectors.toList());
    this.uniqueKeyIndexMapping = primaryKeys.stream().mapToInt(fieldNames::indexOf).toArray();
    this.rowDataPredicate = rowDataPredicate;
  }

  @Override
  public void open() {
    recordState.open();
    recordState.addGauge(UNIQUE_CACHE_SIZE, () -> recordState.guavaCache.size());
  }

  /** Returns the single row stored under the key, or an empty list if absent. */
  @Override
  public List<RowData> get(RowData key) throws IOException {
    return recordState.get(key).map(Collections::singletonList).orElse(Collections.emptyList());
  }

  /** Applies a changelog stream: INSERT/UPDATE_AFTER upsert the row, any other kind deletes it. */
  @Override
  public void upsert(Iterator<RowData> dataStream) throws IOException {
    while (dataStream.hasNext()) {
      RowData row = dataStream.next();
      if (filter(row)) {
        continue;
      }
      RowData pk = new KeyRowData(uniqueKeyIndexMapping, row);
      RowKind kind = row.getRowKind();
      if (kind != RowKind.INSERT && kind != RowKind.UPDATE_AFTER) {
        recordState.delete(pk);
      } else {
        recordState.put(pk, row);
      }
    }
  }

  /** Bulk-loads the initial snapshot through the record state's asynchronous write path. */
  @Override
  public void initialize(Iterator<RowData> dataStream) throws IOException {
    while (dataStream.hasNext()) {
      RowData row = dataStream.next();
      if (filter(row)) {
        continue;
      }
      recordState.asyncWrite(new KeyRowData(uniqueKeyIndexMapping, row), row);
    }
    recordState.checkConcurrentFailed();
  }

  @Override
  public boolean filter(RowData value) {
    return predicate(value);
  }

  /** True when the row should be skipped (i.e. the predicate exists and rejects it). */
  protected boolean predicate(RowData value) {
    if (rowDataPredicate == null) {
      return false;
    }
    return !rowDataPredicate.test(value);
  }

  @Override
  public boolean initialized() {
    return recordState.initialized();
  }

  @Override
  public void waitInitializationCompleted() {
    LOG.info("Waiting for Record State initialization");
    recordState.waitWriteRocksDBDone();
    LOG.info("The concurrent threads have finished writing data into the Record State.");
    recordState.initializationCompleted();
  }

  /** Builds a serializer over only the given key fields, in the given order. */
  protected BinaryRowDataSerializerWrapper createKeySerializer(
      Schema mixedTableSchema, List<String> keys) {
    return new BinaryRowDataSerializerWrapper(SchemaUtil.selectInOrder(mixedTableSchema, keys));
  }

  /** Builds a serializer over the full projected schema. */
  protected BinaryRowDataSerializerWrapper createValueSerializer(Schema projectSchema) {
    return new BinaryRowDataSerializerWrapper(projectSchema);
  }

  @Override
  public void close() {
    recordState.close();
  }
}
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/filter/RowDataPredicate.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/filter/RowDataPredicate.java new file mode 100644 index 0000000000..00e2769eab --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/filter/RowDataPredicate.java @@ -0,0 +1,307 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/

package org.apache.amoro.flink.lookup.filter;

import static org.apache.flink.table.types.logical.utils.LogicalTypeChecks.getPrecision;

import org.apache.amoro.shade.guava32.com.google.common.base.MoreObjects;
import org.apache.amoro.shade.guava32.com.google.common.collect.Iterables;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.logical.DecimalType;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.util.Preconditions;

import java.io.Serializable;
import java.util.Arrays;
import java.util.function.Predicate;
import java.util.stream.Collectors;

/**
 * A predicate to be used in a filter operation on a {@link RowData} object. It can be constructed
 * from various comparison operators on a field or from boolean operators with other predicates.
 *
 * <p>The {@code test} method will apply the predicate to a {@link RowData} object, returning true
 * if the predicate is satisfied by the given data.
 */
public class RowDataPredicate implements Predicate<RowData>, Serializable {
  private static final long serialVersionUID = 1L;
  private Opt opt;
  private final String fieldName;
  private final int fieldIndex;
  private final DataType dataType;
  private Serializable[] parameters;
  private final RowDataPredicate[] leftPredicates;
  private final RowDataPredicate[] rightPredicates;

  /** Constructor used for testing purposes. */
  public RowDataPredicate(
      Opt opt,
      String fieldName,
      int fieldIndex,
      DataType dataType,
      Serializable[] parameters,
      RowDataPredicate[] leftPredicates,
      RowDataPredicate[] rightPredicates) {
    this.opt = opt;
    this.fieldName = fieldName;
    this.fieldIndex = fieldIndex;
    this.dataType = dataType;
    this.parameters = parameters;
    this.leftPredicates = leftPredicates;
    this.rightPredicates = rightPredicates;
  }

  /**
   * Constructor for logical operation, when left and right side of the operation is not a simple
   * comparison.
   */
  public RowDataPredicate(
      Opt opt, RowDataPredicate[] leftPredicates, RowDataPredicate[] rightPredicates) {
    this(opt, null, -1, null, null, leftPredicates, rightPredicates);
  }

  /** Constructor for simple comparison operator. */
  public RowDataPredicate(String fieldName, int fieldIndex, DataType dataType) {
    this(null, fieldName, fieldIndex, dataType, null, null, null);
  }

  /** Constructor for comparing value to a fixed value or NULL value. */
  public RowDataPredicate(Serializable[] parameters) {
    this(null, null, -1, null, parameters, null, null);
  }

  /** Test if the RowData record satisfies this predicate. */
  @Override
  public boolean test(RowData rowData) {
    boolean result;
    Object val;
    switch (opt) {
      case EQUALS:
        val = getter(rowData);
        result = compareEquals(val);
        break;
      case NOT_EQUALS:
        val = getter(rowData);
        result = !compareEquals(val);
        break;
      case GREATER_THAN:
        val = getter(rowData);
        result = compareGreaterThan(val);
        break;
      case GREATER_THAN_OR_EQUAL:
        val = getter(rowData);
        result = compareGreaterThanOrEqual(val);
        break;
      case LESS_THAN:
        val = getter(rowData);
        result = compareLessThan(val);
        break;
      case LESS_THAN_OR_EQUAL:
        val = getter(rowData);
        result = compareLessThanOrEqual(val);
        break;
      case IS_NOT_NULL:
        val = getter(rowData);
        result = compareIsNotNull(val);
        break;
      case IS_NULL:
        val = getter(rowData);
        result = compareIsNull(val);
        break;
      case AND:
        Preconditions.checkNotNull(leftPredicates);
        Preconditions.checkNotNull(rightPredicates);
        // Short-circuit: skip the right side when the left side already failed.
        result = Arrays.stream(leftPredicates).allMatch(p -> p.test(rowData));
        if (!result) {
          return false;
        }
        result = Arrays.stream(rightPredicates).allMatch(p -> p.test(rowData));
        break;
      case OR:
        Preconditions.checkNotNull(leftPredicates);
        Preconditions.checkNotNull(rightPredicates);
        // Short-circuit: skip the right side when the left side already succeeded.
        result = Arrays.stream(leftPredicates).allMatch(p -> p.test(rowData));
        if (result) {
          return true;
        }
        result = Arrays.stream(rightPredicates).allMatch(p -> p.test(rowData));
        break;
      default:
        throw new IllegalArgumentException("Unsupported opt: " + opt);
    }

    return result;
  }

  public Serializable[] parameters() {
    return parameters;
  }

  /**
   * Combines this RowDataPredicate with another using the specified operator.
   *
   * <p>Note: this mutates and returns {@code this}, adopting {@code that}'s parameters.
   *
   * @param operator the operator to use for the combination
   * @param that the other RowDataPredicate to combine with this one
   * @return the combined RowDataPredicate
   */
  public RowDataPredicate combine(Opt operator, RowDataPredicate that) {
    this.opt = operator;
    if (that == null) {
      this.parameters = null;
    } else {
      this.parameters = that.parameters;
    }
    return this;
  }

  // The literal is always the first comparand, so the comparison direction is inverted:
  // "field <= literal" holds when literal.compareTo(field) >= 0.
  private boolean compareLessThanOrEqual(Object val) {
    return compareLiteral(dataType, parameters[0], val) >= 0;
  }

  private boolean compareLessThan(Object val) {
    return compareLiteral(dataType, parameters[0], val) > 0;
  }

  private boolean compareGreaterThanOrEqual(Object val) {
    return compareLiteral(dataType, parameters[0], val) <= 0;
  }

  private boolean compareIsNotNull(Object val) {
    return val != null;
  }

  private boolean compareIsNull(Object val) {
    return val == null;
  }

  private boolean compareGreaterThan(Object val) {
    return compareLiteral(dataType, parameters[0], val) < 0;
  }

  private boolean compareEquals(Object val) {
    if (parameters[0] == null && val == null) {
      return true;
    }
    if (parameters[0] == null || val == null) {
      return false;
    }
    return compareLiteral(dataType, parameters[0], val) == 0;
  }

  /** Extracts the field value from the row as a comparable Java object, or null. */
  Object getter(RowData rowData) {
    int pos = fieldIndex;
    if (rowData.isNullAt(pos)) {
      return null;
    }
    Preconditions.checkNotNull(dataType);
    LogicalType logicalType = dataType.getLogicalType();
    switch (logicalType.getTypeRoot()) {
      case CHAR:
      case VARCHAR:
        return rowData.getString(pos).toString();
      case BOOLEAN:
        return rowData.getBoolean(pos);
      case BINARY:
      case VARBINARY:
        return rowData.getBinary(pos);
      case DECIMAL:
        DecimalType decimalType = (DecimalType) logicalType;
        return rowData
            .getDecimal(pos, decimalType.getPrecision(), decimalType.getScale())
            .toBigDecimal();
      case TINYINT:
        return rowData.getByte(pos);
      case SMALLINT:
        return rowData.getShort(pos);
      case INTEGER:
      case DATE:
      case INTERVAL_YEAR_MONTH:
      case TIME_WITHOUT_TIME_ZONE:
        return rowData.getInt(pos);
      case BIGINT:
      case INTERVAL_DAY_TIME:
        return rowData.getLong(pos);
      case FLOAT:
        return rowData.getFloat(pos);
      case DOUBLE:
        return rowData.getDouble(pos);
      case TIMESTAMP_WITHOUT_TIME_ZONE:
      case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
        final int timestampPrecision = getPrecision(logicalType);
        return rowData.getTimestamp(pos, timestampPrecision).getMillisecond();
      default:
        throw new IllegalArgumentException(
            String.format("Not supported datatype: %s, field: %s", dataType, fieldName));
    }
  }

  @SuppressWarnings({"unchecked", "rawtypes"})
  private static int compareLiteral(DataType type, Object v1, Object v2) {
    if (v1 instanceof Comparable) {
      return ((Comparable) v1).compareTo(v2);
    } else {
      throw new RuntimeException(String.format("Unsupported type: %s, val: %s", type, v1));
    }
  }

  @Override
  public String toString() {
    return MoreObjects.toStringHelper(RowDataPredicate.class)
        .add("\n\topt", opt)
        .add("\n\tfieldName", fieldName)
        .add("\n\tfieldIndex", fieldIndex)
        .add("\n\tdataType", dataType)
        // BUGFIX: print the array contents, not the array's identity hash.
        .add("\n\tparameters", Arrays.toString(parameters))
        .add(
            "\n\tleftPredicates",
            leftPredicates == null
                ? "[]"
                : Iterables.toString(
                    Arrays.stream(leftPredicates)
                        .map(predicate -> predicate.toString().replaceAll("\n", "\n\t"))
                        .collect(Collectors.toList())))
        .add(
            "\n\trightPredicates",
            rightPredicates == null
                ? "[]"
                : Iterables.toString(
                    Arrays.stream(rightPredicates)
                        .map(predicate -> predicate.toString().replaceAll("\n", "\n\t"))
                        .collect(Collectors.toList())))
        .toString();
  }

  public enum Opt {
    AND,
    OR,
    EQUALS,
    GREATER_THAN,
    GREATER_THAN_OR_EQUAL,
    LESS_THAN,
    LESS_THAN_OR_EQUAL,
    NOT_EQUALS,
    IS_NULL,
    IS_NOT_NULL,
    TO_TIMESTAMP,
    MINUS,
    PLUS,
    DIVIDE,
    TIMES
  }
}
+ */ + +package org.apache.amoro.flink.lookup.filter; + +import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.AND; +import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.DIVIDE; +import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.EQUALS; +import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.GREATER_THAN; +import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.GREATER_THAN_OR_EQUAL; +import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.IS_NOT_NULL; +import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.IS_NULL; +import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.LESS_THAN; +import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.LESS_THAN_OR_EQUAL; +import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.MINUS; +import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.NOT_EQUALS; +import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.OR; +import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.PLUS; +import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.TIMES; + +import org.apache.flink.table.expressions.CallExpression; +import org.apache.flink.table.expressions.Expression; +import org.apache.flink.table.expressions.ExpressionDefaultVisitor; +import org.apache.flink.table.expressions.FieldReferenceExpression; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.expressions.TypeLiteralExpression; +import org.apache.flink.table.expressions.ValueLiteralExpression; +import org.apache.flink.table.functions.BuiltInFunctionDefinitions; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.util.Preconditions; + +import java.io.Serializable; +import java.math.BigDecimal; +import java.sql.Date; +import 
java.sql.Timestamp; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +/** + * This class implements the visitor pattern for traversing expressions and building a {@link + * RowDataPredicate} out of them. + * + *

It supports a limited set of built-in functions, such as EQUALS, LESS_THAN, GREATER_THAN, + * NOT_EQUALS, etc. + */ +public class RowDataPredicateExpressionVisitor + extends ExpressionDefaultVisitor> { + + /** + * A map from field names to their respective indices in the input row. + * + *

Start from 0. + */ + private final Map fieldIndexMap; + /** A map from field names to their respective data types */ + private final Map fieldDataTypeMap; + + public RowDataPredicateExpressionVisitor( + Map fieldIndexMap, Map fieldDataTypeMap) { + this.fieldIndexMap = fieldIndexMap; + this.fieldDataTypeMap = fieldDataTypeMap; + } + + /** + * Visits a {@link CallExpression} and renders it as a {@link RowDataPredicate}. + * + * @param call the call expression to visit + * @return an optional {@link RowDataPredicate} + */ + @Override + public Optional visit(CallExpression call) { + if (BuiltInFunctionDefinitions.EQUALS.equals(call.getFunctionDefinition())) { + return renderBinaryOperator(EQUALS, call.getResolvedChildren()); + } + if (BuiltInFunctionDefinitions.LESS_THAN.equals(call.getFunctionDefinition())) { + return renderBinaryOperator(LESS_THAN, call.getResolvedChildren()); + } + if (BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL.equals(call.getFunctionDefinition())) { + return renderBinaryOperator(LESS_THAN_OR_EQUAL, call.getResolvedChildren()); + } + if (BuiltInFunctionDefinitions.GREATER_THAN.equals(call.getFunctionDefinition())) { + return renderBinaryOperator(GREATER_THAN, call.getResolvedChildren()); + } + if (BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL.equals(call.getFunctionDefinition())) { + return renderBinaryOperator(GREATER_THAN_OR_EQUAL, call.getResolvedChildren()); + } + if (BuiltInFunctionDefinitions.NOT_EQUALS.equals(call.getFunctionDefinition())) { + return renderBinaryOperator(NOT_EQUALS, call.getResolvedChildren()); + } + if (BuiltInFunctionDefinitions.OR.equals(call.getFunctionDefinition())) { + return renderBinaryOperator(OR, call.getResolvedChildren()); + } + if (BuiltInFunctionDefinitions.AND.equals(call.getFunctionDefinition())) { + return renderBinaryOperator(AND, call.getResolvedChildren()); + } + if (BuiltInFunctionDefinitions.IS_NULL.equals(call.getFunctionDefinition())) { + return renderUnaryOperator(IS_NULL, 
call.getResolvedChildren().get(0)); + } + if (BuiltInFunctionDefinitions.IS_NOT_NULL.equals(call.getFunctionDefinition())) { + return renderUnaryOperator(IS_NOT_NULL, call.getResolvedChildren().get(0)); + } + if (BuiltInFunctionDefinitions.PLUS.equals(call.getFunctionDefinition())) { + return arithmeticOperator(PLUS, call); + } + if (BuiltInFunctionDefinitions.MINUS.equals(call.getFunctionDefinition())) { + return arithmeticOperator(MINUS, call); + } + if (BuiltInFunctionDefinitions.TIMES.equals(call.getFunctionDefinition())) { + return arithmeticOperator(TIMES, call); + } + if (BuiltInFunctionDefinitions.DIVIDE.equals(call.getFunctionDefinition())) { + return arithmeticOperator(DIVIDE, call); + } + if (BuiltInFunctionDefinitions.CAST.equals(call.getFunctionDefinition())) { + return castOperator(call); + } + throw new IllegalArgumentException( + String.format( + "Not supported build-in function: %s, CallExpression: %s, for RowDataPredicateExpressionVisitor", + call.getFunctionDefinition(), call)); + } + + @Override + public Optional visit(ValueLiteralExpression valueLiteralExpression) { + LogicalType tpe = valueLiteralExpression.getOutputDataType().getLogicalType(); + Serializable[] params = new Serializable[1]; + switch (tpe.getTypeRoot()) { + case CHAR: + case VARCHAR: + params[0] = valueLiteralExpression.getValueAs(String.class).orElse(null); + return Optional.of(new RowDataPredicate(params)); + case BOOLEAN: + params[0] = valueLiteralExpression.getValueAs(Boolean.class).orElse(null); + return Optional.of(new RowDataPredicate(params)); + case DECIMAL: + params[0] = valueLiteralExpression.getValueAs(BigDecimal.class).orElse(null); + return Optional.of(new RowDataPredicate(params)); + case TINYINT: + params[0] = valueLiteralExpression.getValueAs(Byte.class).orElse(null); + return Optional.of(new RowDataPredicate(params)); + case SMALLINT: + params[0] = valueLiteralExpression.getValueAs(Short.class).orElse(null); + return Optional.of(new RowDataPredicate(params)); 
+ case INTEGER: + params[0] = valueLiteralExpression.getValueAs(Integer.class).orElse(null); + return Optional.of(new RowDataPredicate(params)); + case BIGINT: + params[0] = valueLiteralExpression.getValueAs(Long.class).orElse(null); + return Optional.of(new RowDataPredicate(params)); + case FLOAT: + params[0] = valueLiteralExpression.getValueAs(Float.class).orElse(null); + return Optional.of(new RowDataPredicate(params)); + case DOUBLE: + params[0] = valueLiteralExpression.getValueAs(Double.class).orElse(null); + return Optional.of(new RowDataPredicate(params)); + case DATE: + params[0] = + valueLiteralExpression.getValueAs(LocalDate.class).map(Date::valueOf).orElse(null); + return Optional.of(new RowDataPredicate(params)); + case TIME_WITHOUT_TIME_ZONE: + params[0] = valueLiteralExpression.getValueAs(java.sql.Time.class).orElse(null); + return Optional.of(new RowDataPredicate(params)); + case TIMESTAMP_WITHOUT_TIME_ZONE: + params[0] = + valueLiteralExpression + .getValueAs(LocalDateTime.class) + .map(Timestamp::valueOf) + .orElse(null); + return Optional.of(new RowDataPredicate(params)); + default: + return Optional.empty(); + } + } + + @Override + public Optional visit(FieldReferenceExpression fieldReferenceExpression) { + String fieldName = fieldReferenceExpression.getName(); + int fieldIndex = fieldIndexMap.get(fieldName); + DataType dataType = fieldDataTypeMap.get(fieldName); + return Optional.of(new RowDataPredicate(fieldName, fieldIndex, dataType)); + } + + @Override + protected Optional defaultMethod(Expression expression) { + return Optional.empty(); + } + + protected Optional arithmeticOperator( + RowDataPredicate.Opt arithmeticOpt, CallExpression call) { + List resolvedChildren = call.getResolvedChildren(); + Optional leftPredicate = resolvedChildren.get(0).accept(this); + Optional rightPredicate = resolvedChildren.get(1).accept(this); + Serializable left = leftPredicate.get().parameters()[0]; + Serializable right = rightPredicate.get().parameters()[0]; 
+ if (left instanceof Number && right instanceof Number) { + Serializable result; + switch (arithmeticOpt) { + case MINUS: + result = ((Number) left).longValue() - ((Number) right).longValue(); + break; + case TIMES: + result = ((Number) left).longValue() * ((Number) right).longValue(); + break; + case PLUS: + result = ((Number) left).longValue() + ((Number) right).longValue(); + break; + case DIVIDE: + result = ((Number) left).longValue() / ((Number) right).longValue(); + break; + default: + throw new IllegalArgumentException( + String.format( + "Not supported arithmetic opt: %s, call expression: %s", arithmeticOpt, call)); + } + return Optional.of(new RowDataPredicate(new Serializable[] {result})); + } + throw new IllegalArgumentException( + String.format( + "arithmetic operator: %s only supported numerical parameters, call expression: %s", + arithmeticOpt, call)); + } + + protected Optional castOperator(CallExpression call) { + List resolvedChildren = call.getResolvedChildren(); + Optional leftPredicate = resolvedChildren.get(0).accept(this); + if (resolvedChildren.size() != 2) { + throw new IllegalArgumentException( + String.format( + "cast operator's children expressions should be 2. call expression: %s", call)); + } + if (resolvedChildren.get(1) instanceof TypeLiteralExpression) { + Class type = resolvedChildren.get(1).getOutputDataType().getConversionClass(); + Serializable se = (Serializable) type.cast(leftPredicate.get().parameters()[0]); + return Optional.of(new RowDataPredicate(new Serializable[] {se})); + } + throw new IllegalArgumentException( + String.format( + "cast operator's children expressions should be 2. 
call expression: %s", call)); + } + + protected Optional renderUnaryOperator( + RowDataPredicate.Opt opt, ResolvedExpression resolvedExpression) { + if (resolvedExpression instanceof FieldReferenceExpression) { + Optional leftPredicate = resolvedExpression.accept(this); + return leftPredicate.map(rowDataPredicate -> rowDataPredicate.combine(opt, null)); + } + return Optional.empty(); + } + + protected Optional renderBinaryOperator( + RowDataPredicate.Opt opt, List resolvedExpressions) { + Optional leftPredicate = resolvedExpressions.get(0).accept(this); + + Optional rightPredicate = resolvedExpressions.get(1).accept(this); + + if (AND.equals(opt) || OR.equals(opt)) { + Preconditions.checkArgument(leftPredicate.isPresent()); + Preconditions.checkArgument(rightPredicate.isPresent()); + return Optional.of( + new RowDataPredicate( + opt, + new RowDataPredicate[] {leftPredicate.get()}, + new RowDataPredicate[] {rightPredicate.get()})); + } + + return leftPredicate.flatMap(left -> rightPredicate.map(right -> left.combine(opt, right))); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/metric/MetricConstant.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/metric/MetricConstant.java new file mode 100644 index 0000000000..e86ac102f9 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/metric/MetricConstant.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
/**
 * Metric name constants used by the mixed-format Flink connector.
 *
 * <p>Non-instantiable constant holder: final class with a private constructor.
 */
public final class MetricConstant {

  /**
   * The start time of mixed-format table's initialization when it used as build table in temporal
   * join.
   */
  public static final String TEMPORAL_TABLE_INITIALIZATION_START_TIMESTAMP =
      "temporalTableInitializationStartTimestamp";

  /**
   * The end time of mixed-format table's initialization when it used as build table in temporal
   * join.
   */
  public static final String TEMPORAL_TABLE_INITIALIZATION_END_TIMESTAMP =
      "temporalTableInitializationEndTimestamp";

  /** Utility class; prevent instantiation. */
  private MetricConstant() {}
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.metric; + +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.Schema; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.time.LocalDateTime; +import java.time.ZoneId; + +/** + * A generator that generates the latency metrics of the writing operators in flink applications. 
+ */ +public class MetricsGenerator implements Serializable { + private static final Logger LOG = LoggerFactory.getLogger(MetricsGenerator.class); + private long currentLatency; + private final boolean latencyEnable; + private final boolean metricEnable; + private final Schema schema; + private final RowType flinkSchema; + private RowData.FieldGetter modifyTimeGetter; + private boolean findColumn = false; + + private MetricsGenerator( + boolean latencyEnable, + Schema schema, + RowType flinkSchema, + String modifyTimeColumn, + boolean metricEnable) { + this.latencyEnable = latencyEnable; + this.schema = schema; + this.metricEnable = metricEnable; + this.flinkSchema = flinkSchema; + checkColumnExist(modifyTimeColumn); + } + + private void checkColumnExist(String modifyTimeColumn) { + if (!this.latencyEnable) { + return; + } + if (modifyTimeColumn == null || this.schema.findField(modifyTimeColumn) == null) { + LOG.warn("can't find event time column {}", modifyTimeColumn); + findColumn = false; + } else { + findColumn = true; + int modifyTimeColumnIndex = flinkSchema.getFieldIndex(modifyTimeColumn); + LogicalType type = flinkSchema.getTypeAt(modifyTimeColumnIndex); + LOG.info( + "event latency with column {}, index {}, type {}", + modifyTimeColumn, + modifyTimeColumnIndex, + type); + modifyTimeGetter = RowData.createFieldGetter(type, modifyTimeColumnIndex); + } + } + + public static MetricsGenerator empty(boolean metricEnable) { + return new MetricsGenerator(false, null, null, null, metricEnable); + } + + public static MetricsGenerator newGenerator( + Schema schema, RowType flinkSchema, String modifyTimeColumn, boolean metricEnable) { + return new MetricsGenerator(true, schema, flinkSchema, modifyTimeColumn, metricEnable); + } + + public boolean enable() { + return latencyEnable; + } + + public boolean isMetricEnable() { + return metricEnable; + } + + public void recordLatency(StreamRecord element) { + if (latencyEnable) { + if (findColumn) { + RowData rowData = 
element.getValue(); + if (rowData.getRowKind() == RowKind.UPDATE_BEFORE + || rowData.getRowKind() == RowKind.DELETE) { + return; + } + + Object value = modifyTimeGetter.getFieldOrNull(rowData); + if (value == null) { + return; + } + if (value instanceof LocalDateTime) { + LocalDateTime localDateTime = (LocalDateTime) value; + long eventTime = localDateTime.atZone(ZoneId.systemDefault()).toInstant().toEpochMilli(); + this.currentLatency = System.currentTimeMillis() - eventTime; + } else if (value instanceof Long) { + this.currentLatency = System.currentTimeMillis() - (Long) value; + } else { + LOG.warn("eventTimeColumn is not LocalDateTime/Long, {}", value.getClass()); + } + } else if (element.hasTimestamp()) { + this.currentLatency = System.currentTimeMillis() - element.getTimestamp(); + } + } + } + + public long getCurrentLatency() { + return currentLatency; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/planner/calcite/FlinkTypeSystem.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/planner/calcite/FlinkTypeSystem.java new file mode 100644 index 0000000000..deb6639d12 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/planner/calcite/FlinkTypeSystem.java @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.amoro.flink.planner.calcite;

import static org.apache.calcite.sql.type.SqlTypeName.DECIMAL;
import static org.apache.flink.table.planner.utils.ShortcutUtils.unwrapTypeFactory;

import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.rel.type.RelDataTypeFactory;
import org.apache.calcite.rel.type.RelDataTypeFactoryImpl;
import org.apache.calcite.rel.type.RelDataTypeSystemImpl;
import org.apache.calcite.sql.type.SqlTypeName;
import org.apache.calcite.sql.type.SqlTypeUtil;
import org.apache.flink.annotation.Internal;
import org.apache.flink.table.planner.calcite.FlinkTypeFactory;
import org.apache.flink.table.types.logical.DecimalType;
import org.apache.flink.table.types.logical.LocalZonedTimestampType;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.TimestampType;
import org.apache.flink.table.types.logical.utils.LogicalTypeMerging;
import org.apache.flink.util.function.QuadFunction;

import javax.annotation.Nullable;

/**
 * Custom type system for Flink.
 *
 * <p>Copied from flink-1.18.
 *
 * <p>NOTE(review): keep this class textually in sync with the upstream Flink copy when upgrading;
 * avoid local refactors that would make future diffs against upstream harder.
 */
@Internal
public class FlinkTypeSystem extends RelDataTypeSystemImpl {

  /** Shared singleton — the type system carries no mutable state. */
  public static final FlinkTypeSystem INSTANCE = new FlinkTypeSystem();
  /** System default decimal: DECIMAL(38, 18). */
  public static final DecimalType DECIMAL_SYSTEM_DEFAULT =
      new DecimalType(DecimalType.MAX_PRECISION, 18);

  private FlinkTypeSystem() {}

  @Override
  public int getMaxNumericPrecision() {
    // set the maximum precision of a NUMERIC or DECIMAL type to DecimalType.MAX_PRECISION.
    return DecimalType.MAX_PRECISION;
  }

  @Override
  public int getMaxNumericScale() {
    // the max scale can't be greater than precision
    return DecimalType.MAX_PRECISION;
  }

  @Override
  public int getDefaultPrecision(SqlTypeName typeName) {
    switch (typeName) {
      case VARCHAR:
      case VARBINARY:
        // Calcite will limit the length of the VARCHAR field to 65536
        return Integer.MAX_VALUE;
      case TIMESTAMP:
        // by default we support timestamp with microseconds precision (Timestamp(6))
        return TimestampType.DEFAULT_PRECISION;
      case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
        // by default we support timestamp with local time zone with microseconds precision
        // Timestamp(6) with local time zone
        return LocalZonedTimestampType.DEFAULT_PRECISION;
    }
    // all other types fall back to Calcite's defaults
    return super.getDefaultPrecision(typeName);
  }

  @Override
  public int getMaxPrecision(SqlTypeName typeName) {
    switch (typeName) {
      case VARCHAR:
      case CHAR:
      case VARBINARY:
      case BINARY:
        return Integer.MAX_VALUE;

      case TIMESTAMP:
        // The maximum precision of TIMESTAMP is 3 in Calcite,
        // change it to 9 to support nanoseconds precision
        return TimestampType.MAX_PRECISION;

      case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
        // The maximum precision of TIMESTAMP_WITH_LOCAL_TIME_ZONE is 3 in Calcite,
        // change it to 9 to support nanoseconds precision
        return LocalZonedTimestampType.MAX_PRECISION;
    }
    return super.getMaxPrecision(typeName);
  }

  @Override
  public boolean shouldConvertRaggedUnionTypesToVarying() {
    // when union a number of CHAR types of different lengths, we should cast to a VARCHAR
    // this fixes the problem of CASE WHEN with different length string literals but get wrong
    // result with additional space suffix
    return true;
  }

  @Override
  public RelDataType deriveAvgAggType(RelDataTypeFactory typeFactory, RelDataType argRelDataType) {
    // delegate AVG result-type derivation to Flink's LogicalTypeMerging rules
    LogicalType argType = FlinkTypeFactory.toLogicalType(argRelDataType);
    LogicalType resultType = LogicalTypeMerging.findAvgAggType(argType);
    return unwrapTypeFactory(typeFactory).createFieldTypeFromLogicalType(resultType);
  }

  @Override
  public RelDataType deriveSumType(RelDataTypeFactory typeFactory, RelDataType argRelDataType) {
    // delegate SUM result-type derivation to Flink's LogicalTypeMerging rules
    LogicalType argType = FlinkTypeFactory.toLogicalType(argRelDataType);
    LogicalType resultType = LogicalTypeMerging.findSumAggType(argType);
    return unwrapTypeFactory(typeFactory).createFieldTypeFromLogicalType(resultType);
  }

  @Override
  public RelDataType deriveDecimalPlusType(
      RelDataTypeFactory typeFactory, RelDataType type1, RelDataType type2) {
    return deriveDecimalType(
        typeFactory, type1, type2, LogicalTypeMerging::findAdditionDecimalType);
  }

  @Override
  public RelDataType deriveDecimalModType(
      RelDataTypeFactory typeFactory, RelDataType type1, RelDataType type2) {
    return deriveDecimalRelDataType(
        typeFactory,
        type1,
        type2,
        (p1, s1, p2, s2) -> {
          // both operands are scale-0 (integral) decimals: MOD keeps the divisor's type
          if (s1 == 0 && s2 == 0) {
            return type2;
          }
          DecimalType result = LogicalTypeMerging.findModuloDecimalType(p1, s1, p2, s2);
          return typeFactory.createSqlType(DECIMAL, result.getPrecision(), result.getScale());
        });
  }

  @Override
  public RelDataType deriveDecimalDivideType(
      RelDataTypeFactory typeFactory, RelDataType type1, RelDataType type2) {
    return deriveDecimalType(
        typeFactory, type1, type2, LogicalTypeMerging::findDivisionDecimalType);
  }

  @Override
  public RelDataType deriveDecimalMultiplyType(
      RelDataTypeFactory typeFactory, RelDataType type1, RelDataType type2) {
    return deriveDecimalType(
        typeFactory, type1, type2, LogicalTypeMerging::findMultiplicationDecimalType);
  }

  /** Use derivation from {@link LogicalTypeMerging} to derive decimal type. */
  private @Nullable RelDataType deriveDecimalType(
      RelDataTypeFactory typeFactory,
      RelDataType type1,
      RelDataType type2,
      QuadFunction<Integer, Integer, Integer, Integer, DecimalType> deriveImpl) {
    return deriveDecimalRelDataType(
        typeFactory,
        type1,
        type2,
        (p1, s1, p2, s2) -> {
          DecimalType result = deriveImpl.apply(p1, s1, p2, s2);
          return typeFactory.createSqlType(DECIMAL, result.getPrecision(), result.getScale());
        });
  }

  /**
   * Applies {@code deriveImpl} to the adjusted precisions/scales of both operands, or returns
   * {@code null} (meaning "use Calcite's default derivation") when the operands are not a
   * decimal-exact-numeric combination.
   */
  private @Nullable RelDataType deriveDecimalRelDataType(
      RelDataTypeFactory typeFactory,
      RelDataType type1,
      RelDataType type2,
      QuadFunction<Integer, Integer, Integer, Integer, RelDataType> deriveImpl) {
    if (canDeriveDecimal(type1, type2)) {
      RelDataType decType1 = adjustType(typeFactory, type1);
      RelDataType decType2 = adjustType(typeFactory, type2);
      return deriveImpl.apply(
          decType1.getPrecision(),
          decType1.getScale(),
          decType2.getPrecision(),
          decType2.getScale());
    } else {
      return null;
    }
  }

  /**
   * Java numeric will always have invalid precision/scale, use its default decimal precision/scale
   * instead.
   */
  private RelDataType adjustType(RelDataTypeFactory typeFactory, RelDataType relDataType) {
    return RelDataTypeFactoryImpl.isJavaType(relDataType)
        ? typeFactory.decimalOf(relDataType)
        : relDataType;
  }

  // decimal derivation applies only when both operands are exact numerics and at least one is
  // an actual DECIMAL
  private boolean canDeriveDecimal(RelDataType type1, RelDataType type2) {
    return SqlTypeUtil.isExactNumeric(type1)
        && SqlTypeUtil.isExactNumeric(type2)
        && (SqlTypeUtil.isDecimal(type1) || SqlTypeUtil.isDecimal(type2));
  }
}
+ */ + +package org.apache.amoro.flink.read; + +import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableList; +import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableMap; +import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; +import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RawValueData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.iceberg.MetadataColumns; +import org.apache.iceberg.Schema; +import org.apache.iceberg.parquet.ParquetValueReader; +import org.apache.iceberg.parquet.ParquetValueReaders; +import org.apache.iceberg.parquet.TypeWithSchemaVisitor; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.time.Instant; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +public class AdaptHiveFlinkParquetReaders { + private AdaptHiveFlinkParquetReaders() {} + + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema) { + return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); + } + + 
@SuppressWarnings("unchecked") + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema, Map idToConstant) { + return (ParquetValueReader) + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); + } + + private static class ReadBuilder extends TypeWithSchemaVisitor> { + private final MessageType type; + private final Map idToConstant; + + ReadBuilder(MessageType type, Map idToConstant) { + this.type = type; + this.idToConstant = idToConstant; + } + + @Override + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { + return struct(expected, message.asGroupType(), fieldReaders); + } + + @Override + public ParquetValueReader struct( + Types.StructType expected, GroupType struct, List> fieldReaders) { + // match the expected struct's order + Map> readersById = Maps.newHashMap(); + Map typesById = Maps.newHashMap(); + List fields = struct.getFields(); + for (int i = 0; i < fields.size(); i += 1) { + Type fieldType = fields.get(i); + if (fieldReaders.get(i) != null) { + int fieldD = type.getMaxDefinitionLevel(path(fieldType.getName())) - 1; + if (fieldType.getId() != null) { + int id = fieldType.getId().intValue(); + readersById.put(id, ParquetValueReaders.option(fieldType, fieldD, fieldReaders.get(i))); + typesById.put(id, fieldType); + } + } + } + + List expectedFields = + expected != null ? 
expected.fields() : ImmutableList.of(); + List> reorderedFields = + Lists.newArrayListWithExpectedSize(expectedFields.size()); + List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); + for (Types.NestedField field : expectedFields) { + int id = field.fieldId(); + if (idToConstant.containsKey(id)) { + // containsKey is used because the constant may be null + reorderedFields.add(ParquetValueReaders.constant(idToConstant.get(id))); + types.add(null); + } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { + reorderedFields.add(ParquetValueReaders.position()); + types.add(null); + } else if (id == MetadataColumns.IS_DELETED.fieldId()) { + reorderedFields.add(ParquetValueReaders.constant(false)); + types.add(null); + } else { + ParquetValueReader reader = readersById.get(id); + if (reader != null) { + reorderedFields.add(reader); + types.add(typesById.get(id)); + } else { + reorderedFields.add(ParquetValueReaders.nulls()); + types.add(null); + } + } + } + + return new RowDataReader(types, reorderedFields); + } + + @Override + public ParquetValueReader list( + Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { + if (expectedList == null) { + return null; + } + + GroupType repeated = array.getFields().get(0).asGroupType(); + String[] repeatedPath = currentPath(); + + int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; + int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; + + Type elementType = repeated.getType(0); + int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; + + return new ArrayReader<>( + repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); + } + + @Override + public ParquetValueReader map( + Types.MapType expectedMap, + GroupType map, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { + if (expectedMap == null) { + return null; + } + + GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); + String[] 
repeatedPath = currentPath(); + + int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; + int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; + + Type keyType = repeatedKeyValue.getType(0); + int keyD = type.getMaxDefinitionLevel(path(keyType.getName())) - 1; + Type valueType = repeatedKeyValue.getType(1); + int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; + + return new MapReader<>( + repeatedD, + repeatedR, + ParquetValueReaders.option(keyType, keyD, keyReader), + ParquetValueReaders.option(valueType, valueD, valueReader)); + } + + @Override + @SuppressWarnings("CyclomaticComplexity") + public ParquetValueReader primitive( + org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { + if (expected == null) { + return null; + } + + ColumnDescriptor desc = type.getColumnDescription(currentPath()); + + if (primitive.getOriginalType() != null) { + switch (primitive.getOriginalType()) { + case ENUM: + case JSON: + case UTF8: + return new StringReader(desc); + case INT_8: + case INT_16: + case INT_32: + if (expected.typeId() == Types.LongType.get().typeId()) { + return new ParquetValueReaders.IntAsLongReader(desc); + } else { + return new ParquetValueReaders.UnboxedReader<>(desc); + } + case TIME_MICROS: + return new LossyMicrosToMillisTimeReader(desc); + case TIME_MILLIS: + return new MillisTimeReader(desc); + case DATE: + case INT_64: + return new ParquetValueReaders.UnboxedReader<>(desc); + case TIMESTAMP_MICROS: + if (((Types.TimestampType) expected).shouldAdjustToUTC()) { + return new MicrosToTimestampTzReader(desc); + } else { + return new MicrosToTimestampReader(desc); + } + case TIMESTAMP_MILLIS: + if (((Types.TimestampType) expected).shouldAdjustToUTC()) { + return new MillisToTimestampTzReader(desc); + } else { + return new MillisToTimestampReader(desc); + } + case DECIMAL: + DecimalLogicalTypeAnnotation decimal = + (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); + switch 
(primitive.getPrimitiveTypeName()) { + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + return new BinaryDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); + case INT64: + return new LongDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); + case INT32: + return new IntegerDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); + default: + throw new UnsupportedOperationException( + "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); + } + case BSON: + return new ParquetValueReaders.ByteArrayReader(desc); + default: + throw new UnsupportedOperationException( + "Unsupported logical type: " + primitive.getOriginalType()); + } + } + + switch (primitive.getPrimitiveTypeName()) { + case FIXED_LEN_BYTE_ARRAY: + case BINARY: + return new ParquetValueReaders.ByteArrayReader(desc); + case INT32: + if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.LONG) { + return new ParquetValueReaders.IntAsLongReader(desc); + } else { + return new ParquetValueReaders.UnboxedReader<>(desc); + } + case FLOAT: + if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.DOUBLE) { + return new ParquetValueReaders.FloatAsDoubleReader(desc); + } else { + return new ParquetValueReaders.UnboxedReader<>(desc); + } + case BOOLEAN: + case INT64: + case DOUBLE: + return new ParquetValueReaders.UnboxedReader<>(desc); + case INT96: + Types.TimestampType tsMicrosType = (Types.TimestampType) expected; + if (tsMicrosType.shouldAdjustToUTC()) { + return new TimestampIntWithTZ96Reader(desc); + } else { + return new TimestampIntWithOutTZ96Reader(desc); + } + default: + throw new UnsupportedOperationException("Unsupported type: " + primitive); + } + } + } + + private static class TimestampIntWithOutTZ96Reader + extends ParquetValueReaders.PrimitiveReader { + private static final long UNIX_EPOCH_JULIAN = 2_440_588L; + + TimestampIntWithOutTZ96Reader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData 
read(TimestampData reuse) { + final ByteBuffer byteBuffer = + column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); + final long timeOfDayNanos = byteBuffer.getLong(); + final int julianDay = byteBuffer.getInt(); + + return TimestampData.fromLocalDateTime( + Instant.ofEpochMilli(TimeUnit.DAYS.toMillis(julianDay - UNIX_EPOCH_JULIAN)) + .plusNanos(timeOfDayNanos) + .atZone(ZoneId.systemDefault()) + .toLocalDateTime()); + } + } + + private static class TimestampIntWithTZ96Reader + extends ParquetValueReaders.PrimitiveReader { + private static final long UNIX_EPOCH_JULIAN = 2_440_588L; + + private TimestampIntWithTZ96Reader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData reuse) { + final ByteBuffer byteBuffer = + column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); + final long timeOfDayNanos = byteBuffer.getLong(); + final int julianDay = byteBuffer.getInt(); + + return TimestampData.fromInstant( + Instant.ofEpochMilli(TimeUnit.DAYS.toMillis(julianDay - UNIX_EPOCH_JULIAN)) + .plusNanos(timeOfDayNanos)); + } + } + + private static class BinaryDecimalReader + extends ParquetValueReaders.PrimitiveReader { + private final int precision; + private final int scale; + + BinaryDecimalReader(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData read(DecimalData ignored) { + Binary binary = column.nextBinary(); + BigDecimal bigDecimal = new BigDecimal(new BigInteger(binary.getBytes()), scale); + // TODO: need a unit test to write-read-validate decimal via FlinkParquetWrite/Reader + return DecimalData.fromBigDecimal(bigDecimal, precision, scale); + } + } + + private static class IntegerDecimalReader + extends ParquetValueReaders.PrimitiveReader { + private final int precision; + private final int scale; + + IntegerDecimalReader(ColumnDescriptor desc, int precision, int scale) { + super(desc); + 
this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData read(DecimalData ignored) { + return DecimalData.fromUnscaledLong(column.nextInteger(), precision, scale); + } + } + + private static class LongDecimalReader extends ParquetValueReaders.PrimitiveReader { + private final int precision; + private final int scale; + + LongDecimalReader(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData read(DecimalData ignored) { + return DecimalData.fromUnscaledLong(column.nextLong(), precision, scale); + } + } + + private static class MicrosToTimestampTzReader + extends ParquetValueReaders.UnboxedReader { + MicrosToTimestampTzReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData ignored) { + long value = readLong(); + return TimestampData.fromLocalDateTime( + Instant.ofEpochSecond( + Math.floorDiv(value, 1000_000L), Math.floorMod(value, 1000_000L) * 1000) + .atOffset(ZoneOffset.UTC) + .toLocalDateTime()); + } + + @Override + public long readLong() { + return column.nextLong(); + } + } + + private static class MicrosToTimestampReader + extends ParquetValueReaders.UnboxedReader { + MicrosToTimestampReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData ignored) { + long value = readLong(); + return TimestampData.fromInstant( + Instant.ofEpochSecond( + Math.floorDiv(value, 1000_000L), Math.floorMod(value, 1000_000L) * 1000)); + } + + @Override + public long readLong() { + return column.nextLong(); + } + } + + private static class MillisToTimestampReader + extends ParquetValueReaders.UnboxedReader { + MillisToTimestampReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData ignored) { + long millis = readLong(); + return TimestampData.fromEpochMillis(millis); + } + + @Override + public long 
readLong() { + return column.nextLong(); + } + } + + private static class MillisToTimestampTzReader + extends ParquetValueReaders.UnboxedReader { + MillisToTimestampTzReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData ignored) { + long millis = readLong(); + return TimestampData.fromLocalDateTime( + Instant.ofEpochMilli(millis).atOffset(ZoneOffset.UTC).toLocalDateTime()); + } + + @Override + public long readLong() { + return column.nextLong(); + } + } + + private static class StringReader extends ParquetValueReaders.PrimitiveReader { + StringReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public StringData read(StringData ignored) { + Binary binary = column.nextBinary(); + ByteBuffer buffer = binary.toByteBuffer(); + if (buffer.hasArray()) { + return StringData.fromBytes( + buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining()); + } else { + return StringData.fromBytes(binary.getBytes()); + } + } + } + + private static class LossyMicrosToMillisTimeReader + extends ParquetValueReaders.PrimitiveReader { + LossyMicrosToMillisTimeReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public Integer read(Integer reuse) { + // Discard microseconds since Flink uses millisecond unit for TIME type. 
+ return (int) Math.floorDiv(column.nextLong(), 1000L); + } + } + + private static class MillisTimeReader extends ParquetValueReaders.PrimitiveReader { + MillisTimeReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public Integer read(Integer reuse) { + return (int) column.nextLong(); + } + } + + private static class ArrayReader + extends ParquetValueReaders.RepeatedReader { + private int readPos = 0; + private int writePos = 0; + + ArrayReader(int definitionLevel, int repetitionLevel, ParquetValueReader reader) { + super(definitionLevel, repetitionLevel, reader); + } + + @Override + protected ReusableArrayData newListData(ArrayData reuse) { + this.readPos = 0; + this.writePos = 0; + + if (reuse instanceof ReusableArrayData) { + return (ReusableArrayData) reuse; + } else { + return new ReusableArrayData(); + } + } + + @Override + @SuppressWarnings("unchecked") + protected E getElement(ReusableArrayData list) { + E value = null; + if (readPos < list.capacity()) { + value = (E) list.values[readPos]; + } + + readPos += 1; + + return value; + } + + @Override + protected void addElement(ReusableArrayData reused, E element) { + if (writePos >= reused.capacity()) { + reused.grow(); + } + + reused.values[writePos] = element; + + writePos += 1; + } + + @Override + protected ArrayData buildList(ReusableArrayData list) { + list.setNumElements(writePos); + return list; + } + } + + private static class MapReader + extends ParquetValueReaders.RepeatedKeyValueReader { + private int readPos = 0; + private int writePos = 0; + + private final ParquetValueReaders.ReusableEntry entry = + new ParquetValueReaders.ReusableEntry<>(); + private final ParquetValueReaders.ReusableEntry nullEntry = + new ParquetValueReaders.ReusableEntry<>(); + + MapReader( + int definitionLevel, + int repetitionLevel, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { + super(definitionLevel, repetitionLevel, keyReader, valueReader); + } + + @Override + protected 
ReusableMapData newMapData(MapData reuse) { + this.readPos = 0; + this.writePos = 0; + + if (reuse instanceof ReusableMapData) { + return (ReusableMapData) reuse; + } else { + return new ReusableMapData(); + } + } + + @Override + @SuppressWarnings("unchecked") + protected Map.Entry getPair(ReusableMapData map) { + Map.Entry kv = nullEntry; + if (readPos < map.capacity()) { + entry.set((K) map.keys.values[readPos], (V) map.values.values[readPos]); + kv = entry; + } + + readPos += 1; + + return kv; + } + + @Override + protected void addPair(ReusableMapData map, K key, V value) { + if (writePos >= map.capacity()) { + map.grow(); + } + + map.keys.values[writePos] = key; + map.values.values[writePos] = value; + + writePos += 1; + } + + @Override + protected MapData buildMap(ReusableMapData map) { + map.setNumElements(writePos); + return map; + } + } + + private static class RowDataReader + extends ParquetValueReaders.StructReader { + private final int numFields; + + RowDataReader(List types, List> readers) { + super(types, readers); + this.numFields = readers.size(); + } + + @Override + protected GenericRowData newStructData(RowData reuse) { + if (reuse instanceof GenericRowData) { + return (GenericRowData) reuse; + } else { + return new GenericRowData(numFields); + } + } + + @Override + protected Object getField(GenericRowData intermediate, int pos) { + return intermediate.getField(pos); + } + + @Override + protected RowData buildStruct(GenericRowData struct) { + return struct; + } + + @Override + protected void set(GenericRowData row, int pos, Object value) { + row.setField(pos, value); + } + + @Override + protected void setNull(GenericRowData row, int pos) { + row.setField(pos, null); + } + + @Override + protected void setBoolean(GenericRowData row, int pos, boolean value) { + row.setField(pos, value); + } + + @Override + protected void setInteger(GenericRowData row, int pos, int value) { + row.setField(pos, value); + } + + @Override + protected void 
setLong(GenericRowData row, int pos, long value) { + row.setField(pos, value); + } + + @Override + protected void setFloat(GenericRowData row, int pos, float value) { + row.setField(pos, value); + } + + @Override + protected void setDouble(GenericRowData row, int pos, double value) { + row.setField(pos, value); + } + } + + private static class ReusableMapData implements MapData { + private final ReusableArrayData keys; + private final ReusableArrayData values; + + private int numElements; + + private ReusableMapData() { + this.keys = new ReusableArrayData(); + this.values = new ReusableArrayData(); + } + + private void grow() { + keys.grow(); + values.grow(); + } + + private int capacity() { + return keys.capacity(); + } + + public void setNumElements(int numElements) { + this.numElements = numElements; + keys.setNumElements(numElements); + values.setNumElements(numElements); + } + + @Override + public int size() { + return numElements; + } + + @Override + public ReusableArrayData keyArray() { + return keys; + } + + @Override + public ReusableArrayData valueArray() { + return values; + } + } + + private static class ReusableArrayData implements ArrayData { + private static final Object[] EMPTY = new Object[0]; + + private Object[] values = EMPTY; + private int numElements = 0; + + private void grow() { + if (values.length == 0) { + this.values = new Object[20]; + } else { + Object[] old = values; + this.values = new Object[old.length << 1]; + // copy the old array in case it has values that can be reused + System.arraycopy(old, 0, values, 0, old.length); + } + } + + private int capacity() { + return values.length; + } + + public void setNumElements(int numElements) { + this.numElements = numElements; + } + + @Override + public int size() { + return numElements; + } + + @Override + public boolean isNullAt(int ordinal) { + return null == values[ordinal]; + } + + @Override + public boolean getBoolean(int ordinal) { + return (boolean) values[ordinal]; + } + + @Override 
+ public byte getByte(int ordinal) { + return (byte) values[ordinal]; + } + + @Override + public short getShort(int ordinal) { + return (short) values[ordinal]; + } + + @Override + public int getInt(int ordinal) { + return (int) values[ordinal]; + } + + @Override + public long getLong(int ordinal) { + return (long) values[ordinal]; + } + + @Override + public float getFloat(int ordinal) { + return (float) values[ordinal]; + } + + @Override + public double getDouble(int ordinal) { + return (double) values[ordinal]; + } + + @Override + public StringData getString(int pos) { + return (StringData) values[pos]; + } + + @Override + public DecimalData getDecimal(int pos, int precision, int scale) { + return (DecimalData) values[pos]; + } + + @Override + public TimestampData getTimestamp(int pos, int precision) { + return (TimestampData) values[pos]; + } + + @SuppressWarnings("unchecked") + @Override + public RawValueData getRawValue(int pos) { + return (RawValueData) values[pos]; + } + + @Override + public byte[] getBinary(int ordinal) { + return (byte[]) values[ordinal]; + } + + @Override + public ArrayData getArray(int ordinal) { + return (ArrayData) values[ordinal]; + } + + @Override + public MapData getMap(int ordinal) { + return (MapData) values[ordinal]; + } + + @Override + public RowData getRow(int pos, int numFields) { + return (RowData) values[pos]; + } + + @Override + public boolean[] toBooleanArray() { + return ArrayUtil.toPrimitive((Boolean[]) values); + } + + @Override + public byte[] toByteArray() { + return ArrayUtil.toPrimitive((Byte[]) values); + } + + @Override + public short[] toShortArray() { + return ArrayUtil.toPrimitive((Short[]) values); + } + + @Override + public int[] toIntArray() { + return ArrayUtil.toPrimitive((Integer[]) values); + } + + @Override + public long[] toLongArray() { + return ArrayUtil.toPrimitive((Long[]) values); + } + + @Override + public float[] toFloatArray() { + return ArrayUtil.toPrimitive((Float[]) values); + } + + 
@Override + public double[] toDoubleArray() { + return ArrayUtil.toPrimitive((Double[]) values); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/FlinkSplitPlanner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/FlinkSplitPlanner.java new file mode 100644 index 0000000000..f591ebe527 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/FlinkSplitPlanner.java @@ -0,0 +1,288 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read; + +import org.apache.amoro.data.DataFileType; +import org.apache.amoro.flink.read.hybrid.split.ChangelogSplit; +import org.apache.amoro.flink.read.hybrid.split.MergeOnReadSplit; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.hybrid.split.SnapshotSplit; +import org.apache.amoro.scan.ChangeTableIncrementalScan; +import org.apache.amoro.scan.CombinedScanTask; +import org.apache.amoro.scan.KeyedTableScan; +import org.apache.amoro.scan.MixedFileScanTask; +import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; +import org.apache.amoro.table.KeyedTable; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.CloseableIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +/** + * An util class that plans mixed-format table(base and change) or just plans change table. invoked + * by mixed-format enumerator. 
+ */ +public class FlinkSplitPlanner { + private static final Logger LOG = LoggerFactory.getLogger(FlinkSplitPlanner.class); + + private FlinkSplitPlanner() {} + + public static List planFullTable( + KeyedTable keyedTable, AtomicInteger splitCount) { + CloseableIterable combinedScanTasks = keyedTable.newScan().planTasks(); + BaseAndChangeTask baseAndChangeTask = BaseAndChangeTask.of(combinedScanTasks); + return planFullTable(baseAndChangeTask, splitCount); + } + + /** + * Plans full table scanning for a {@link KeyedTable} with optional filters and a specified split + * count. + * + * @param keyedTable The {@link KeyedTable} to scan. + * @param filters Optional list of filters to apply to the scan. + * @param splitCount The atomic integer to track the split count. + * @return The list of planned {@link MixedFormatSplit} included {@link SnapshotSplit}, {@link + * ChangelogSplit}. + */ + public static List planFullTable( + KeyedTable keyedTable, List filters, AtomicInteger splitCount) { + KeyedTableScan keyedTableScan = keyedTable.newScan(); + if (filters != null) { + filters.forEach(keyedTableScan::filter); + } + CloseableIterable combinedScanTasks = keyedTableScan.planTasks(); + BaseAndChangeTask baseAndChangeTask = BaseAndChangeTask.of(combinedScanTasks); + return planFullTable(baseAndChangeTask, splitCount); + } + + private static List planFullTable( + BaseAndChangeTask baseAndChangeTask, AtomicInteger splitCount) { + Collection baseTasks = baseAndChangeTask.allBaseTasks(); + List allSplits = + baseTasks.stream() + .map( + mixedFileScanTask -> + new SnapshotSplit( + Collections.singleton(mixedFileScanTask), splitCount.incrementAndGet())) + .collect(Collectors.toList()); + + Collection changeTasks = baseAndChangeTask.transactionTasks(); + List changeSplits = planChangeTable(changeTasks, splitCount); + allSplits.addAll(changeSplits); + + return allSplits; + } + + /** + * Plans full table scanning for a {@link KeyedTable} with optional filters and a specified split + 
* count. + * + * @param keyedTable The {@link KeyedTable} to scan. + * @param filters Optional list of filters to apply to the scan. + * @param splitCount The atomic integer to track the split count. + * @return The list of planned {@link MixedFormatSplit} included {@link MergeOnReadSplit}. + */ + public static List mergeOnReadPlan( + KeyedTable keyedTable, List filters, AtomicInteger splitCount) { + KeyedTableScan keyedTableScan = keyedTable.newScan(); + if (filters != null) { + filters.forEach(keyedTableScan::filter); + } + CloseableIterable combinedScanTasks = keyedTableScan.planTasks(); + List morSplits = Lists.newArrayList(); + try (CloseableIterator initTasks = combinedScanTasks.iterator()) { + + while (initTasks.hasNext()) { + CombinedScanTask combinedScanTask = initTasks.next(); + combinedScanTask + .tasks() + .forEach( + keyedTableScanTask -> + morSplits.add(new MergeOnReadSplit(splitCount.get(), keyedTableScanTask))); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + return morSplits; + } + + public static List planChangeTable( + ChangeTableIncrementalScan tableIncrementalScan, AtomicInteger splitCount) { + CloseableIterable tasks = tableIncrementalScan.planFiles(); + BaseAndChangeTask baseAndChangeTask = BaseAndChangeTask.ofIceberg(tasks); + return planChangeTable(baseAndChangeTask.transactionTasks(), splitCount); + } + + private static List planChangeTable( + Collection transactionTasks, AtomicInteger splitCount) { + List changeTasks = new ArrayList<>(transactionTasks.size()); + transactionTasks.forEach( + transactionTask -> { + PartitionAndNodeGroup partitionAndNodeGroup = + new PartitionAndNodeGroup() + .insertFileScanTask(transactionTask.insertTasks) + .deleteFileScanTask(transactionTask.deleteTasks) + .splitCount(splitCount); + changeTasks.addAll(partitionAndNodeGroup.planSplits()); + }); + return changeTasks; + } + + private static class TransactionTask { + private Set insertTasks; + private Set deleteTasks; + Long 
transactionId; + + public TransactionTask(Long transactionId) { + this.transactionId = transactionId; + } + + public void putInsertTask(MixedFileScanTask insert) { + if (insertTasks == null) { + insertTasks = new HashSet<>(); + } + insertTasks.add(insert); + } + + public void putDeleteTask(MixedFileScanTask delete) { + if (deleteTasks == null) { + deleteTasks = new HashSet<>(); + } + deleteTasks.add(delete); + } + } + + public static class BaseAndChangeTask { + Collection allBaseTasks; + Collection changeTableTasks; + + private BaseAndChangeTask( + Collection allBaseTasks, Map changeTableTaskMap) { + this.allBaseTasks = allBaseTasks; + if (changeTableTaskMap == null || changeTableTaskMap.isEmpty()) { + this.changeTableTasks = Collections.emptyList(); + } else { + this.changeTableTasks = + changeTableTaskMap.values().stream() + .sorted(Comparator.comparing(o -> o.transactionId)) + .collect(Collectors.toList()); + } + } + + public static BaseAndChangeTask ofIceberg(CloseableIterable tasks) { + try (CloseableIterator tasksIterator = tasks.iterator()) { + Map transactionTasks = new HashMap<>(); + long startTime = System.currentTimeMillis(); + int count = 0; + while (tasksIterator.hasNext()) { + count++; + MixedFileScanTask fileScanTask = (MixedFileScanTask) tasksIterator.next(); + if (fileScanTask.file().type().equals(DataFileType.INSERT_FILE)) { + taskMap(Collections.singleton(fileScanTask), true, transactionTasks); + } else if (fileScanTask.file().type().equals(DataFileType.EQ_DELETE_FILE)) { + taskMap(Collections.singleton(fileScanTask), false, transactionTasks); + } else { + throw new IllegalArgumentException( + String.format( + "DataFileType %s is not supported during change log reading period.", + fileScanTask.file().type())); + } + } + LOG.info( + "Read {} change log from {} in {} ms", + count, + tasksIterator.getClass(), + System.currentTimeMillis() - startTime); + return new BaseAndChangeTask(Collections.emptySet(), transactionTasks); + } catch (IOException e) 
{ + throw new UncheckedIOException(e); + } + } + + public static BaseAndChangeTask of(CloseableIterable combinedScanTasks) { + try (CloseableIterator initTasks = combinedScanTasks.iterator()) { + final Set allBaseTasks = new HashSet<>(); + final Map transactionTasks = new HashMap<>(); + + while (initTasks.hasNext()) { + CombinedScanTask combinedScanTask = initTasks.next(); + combinedScanTask + .tasks() + .forEach( + keyedTableScanTask -> { + allBaseTasks.addAll(keyedTableScanTask.baseTasks()); + + taskMap(keyedTableScanTask.insertTasks(), true, transactionTasks); + taskMap(keyedTableScanTask.mixedEquityDeletes(), false, transactionTasks); + }); + } + List baseTasks = + allBaseTasks.stream() + .sorted(Comparator.comparing(t -> t.file().transactionId())) + .collect(Collectors.toList()); + + return new BaseAndChangeTask(baseTasks, transactionTasks); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + private static void taskMap( + Collection tasks, + boolean insert, + Map transactionTaskMap) { + tasks.forEach( + task -> { + long transactionId = task.file().transactionId(); + TransactionTask tasksInSingleTransaction = + transactionTaskMap.getOrDefault(transactionId, new TransactionTask(transactionId)); + if (insert) { + tasksInSingleTransaction.putInsertTask(task); + } else { + tasksInSingleTransaction.putDeleteTask(task); + } + transactionTaskMap.put(transactionId, tasksInSingleTransaction); + }); + } + + public Collection allBaseTasks() { + return allBaseTasks; + } + + public Collection transactionTasks() { + return changeTableTasks; + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/MixedFormatSource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/MixedFormatSource.java new file mode 100644 index 0000000000..5597c0e2ed --- /dev/null +++ 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/MixedFormatSource.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read; + +import org.apache.amoro.flink.read.hybrid.assigner.ShuffleSplitAssigner; +import org.apache.amoro.flink.read.hybrid.assigner.SplitAssigner; +import org.apache.amoro.flink.read.hybrid.assigner.StaticSplitAssigner; +import org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatSourceEnumState; +import org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatSourceEnumStateSerializer; +import org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatSourceEnumerator; +import org.apache.amoro.flink.read.hybrid.enumerator.StaticMixedFormatSourceEnumerator; +import org.apache.amoro.flink.read.hybrid.reader.MixedFormatSourceReader; +import org.apache.amoro.flink.read.hybrid.reader.ReaderFunction; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitSerializer; +import org.apache.amoro.flink.read.source.MixedFormatScanContext; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import 
org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.connector.source.Boundedness; +import org.apache.flink.api.connector.source.Source; +import org.apache.flink.api.connector.source.SourceReader; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.api.connector.source.SplitEnumerator; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.core.io.SimpleVersionedSerializer; + +/** + * Mixed-format Source based of FLIP-27. + * + *

If MixedFormatSource is used as a build table in lookup join, it will be implemented by + * temporal join. Two source should use processing time as watermark. MixedFormatSource will + * generate watermark after first splits planned by MixedFormatSourceEnumerator having been + * finished. + */ +public class MixedFormatSource + implements Source, ResultTypeQueryable { + private static final long serialVersionUID = 1L; + private final MixedFormatScanContext scanContext; + private final ReaderFunction readerFunction; + private final TypeInformation typeInformation; + private final MixedFormatTableLoader loader; + private final String tableName; + /** + * generate mixed-format watermark. This is only for lookup join mixed-format table, and + * mixed-format table is used as build table, i.e. right table. + */ + private final boolean dimTable; + + public MixedFormatSource( + MixedFormatTableLoader loader, + MixedFormatScanContext scanContext, + ReaderFunction readerFunction, + TypeInformation typeInformation, + String tableName, + boolean dimTable) { + this.loader = loader; + this.scanContext = scanContext; + this.readerFunction = readerFunction; + this.typeInformation = typeInformation; + this.tableName = tableName; + this.dimTable = dimTable; + } + + @Override + public Boundedness getBoundedness() { + return scanContext.isStreaming() ? 
Boundedness.CONTINUOUS_UNBOUNDED : Boundedness.BOUNDED; + } + + @Override + public SourceReader createReader(SourceReaderContext readerContext) { + return new MixedFormatSourceReader<>( + readerFunction, readerContext.getConfiguration(), readerContext, dimTable); + } + + @Override + public SplitEnumerator createEnumerator( + SplitEnumeratorContext enumContext) { + return createEnumerator(enumContext, null); + } + + private SplitEnumerator createEnumerator( + SplitEnumeratorContext enumContext, MixedFormatSourceEnumState enumState) { + SplitAssigner splitAssigner; + if (scanContext.isStreaming()) { + splitAssigner = new ShuffleSplitAssigner(enumContext, tableName, enumState); + return new MixedFormatSourceEnumerator( + enumContext, splitAssigner, loader, scanContext, enumState, dimTable); + } else { + splitAssigner = new StaticSplitAssigner(enumState); + return new StaticMixedFormatSourceEnumerator( + enumContext, splitAssigner, loader, scanContext, null); + } + } + + @Override + public SplitEnumerator restoreEnumerator( + SplitEnumeratorContext enumContext, MixedFormatSourceEnumState checkpoint) { + return createEnumerator(enumContext, checkpoint); + } + + @Override + public SimpleVersionedSerializer getSplitSerializer() { + return new MixedFormatSplitSerializer(); + } + + @Override + public SimpleVersionedSerializer getEnumeratorCheckpointSerializer() { + return new MixedFormatSourceEnumStateSerializer(); + } + + @Override + public TypeInformation getProducedType() { + return typeInformation; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/MixedIncrementalLoader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/MixedIncrementalLoader.java new file mode 100644 index 0000000000..04417f297c --- /dev/null +++ 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/MixedIncrementalLoader.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read; + +import org.apache.amoro.flink.read.hybrid.enumerator.ContinuousEnumerationResult; +import org.apache.amoro.flink.read.hybrid.enumerator.ContinuousSplitPlanner; +import org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatEnumeratorOffset; +import org.apache.amoro.flink.read.hybrid.reader.DataIteratorReaderFunction; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.hive.io.reader.AbstractAdaptHiveKeyedDataReader; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.io.CloseableIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayDeque; +import java.util.List; +import java.util.Queue; +import java.util.concurrent.atomic.AtomicReference; + +/** + * This is a mixed-format table(mixed iceberg, mixed-hive) incremental loader. + * + *

This loader is used to load data by the merge on read approach first, then by the incremental + * pull approach. + * + *

Merge on read approach only contain INSERT rows. + * + *

Incremental pull approach contains INSERT, DELETE, UPDATE_BEFORE, and UPDATE_AFTER. + * + *

Support projection and filter push-down to speed up the loading process. + */ +public class MixedIncrementalLoader implements AutoCloseable { + private static final Logger LOG = LoggerFactory.getLogger(MixedIncrementalLoader.class); + private final ContinuousSplitPlanner continuousSplitPlanner; + private final DataIteratorReaderFunction readerFunction; + private AbstractAdaptHiveKeyedDataReader flinkMORDataReader; + private final List filters; + private final AtomicReference enumeratorPosition; + private final Queue splitQueue; + + public MixedIncrementalLoader( + ContinuousSplitPlanner continuousSplitPlanner, + AbstractAdaptHiveKeyedDataReader flinkMORDataReader, + DataIteratorReaderFunction readerFunction, + List filters) { + this.continuousSplitPlanner = continuousSplitPlanner; + this.flinkMORDataReader = flinkMORDataReader; + this.readerFunction = readerFunction; + this.filters = filters; + this.enumeratorPosition = new AtomicReference<>(); + this.splitQueue = new ArrayDeque<>(); + } + + public MixedIncrementalLoader( + ContinuousSplitPlanner continuousSplitPlanner, + DataIteratorReaderFunction readerFunction, + List filters) { + this.continuousSplitPlanner = continuousSplitPlanner; + this.readerFunction = readerFunction; + this.filters = filters; + this.enumeratorPosition = new AtomicReference<>(); + this.splitQueue = new ArrayDeque<>(); + } + + public boolean hasNext() { + if (splitQueue.isEmpty()) { + ContinuousEnumerationResult planResult = + continuousSplitPlanner.planSplits(enumeratorPosition.get(), filters); + if (!planResult.isEmpty()) { + planResult.splits().forEach(split -> LOG.info("Putting this split into queue: {}.", split)); + splitQueue.addAll(planResult.splits()); + } + if (!planResult.toOffset().isEmpty()) { + enumeratorPosition.set(planResult.toOffset()); + } + LOG.info( + "Currently, queue contain {} splits, scan position is {}.", + splitQueue.size(), + enumeratorPosition.get()); + return !splitQueue.isEmpty(); + } + return true; + } + + 
public CloseableIterator next() { + MixedFormatSplit split = splitQueue.poll(); + if (split == null) { + throw new IllegalStateException("next() called, but no more valid splits"); + } + + LOG.info("Fetching data by this split:{}.", split); + if (split.isMergeOnReadSplit()) { + return flinkMORDataReader.readData(split.asMergeOnReadSplit().keyedTableScanTask()); + } + return readerFunction.createDataIterator(split); + } + + @Override + public void close() throws Exception { + continuousSplitPlanner.close(); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/PartitionAndNodeGroup.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/PartitionAndNodeGroup.java new file mode 100644 index 0000000000..a8b1d5ca0c --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/PartitionAndNodeGroup.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read; + +import org.apache.amoro.flink.read.hybrid.split.ChangelogSplit; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.scan.MixedFileScanTask; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * This is a group of the partitions and nodes of the mixed-format table, it can plan different + * nodes and different partitions into different {@link MixedFormatSplit}. + */ +public class PartitionAndNodeGroup { + AtomicInteger splitCount = new AtomicInteger(); + Collection insertTasks; + Collection deleteTasks; + + public PartitionAndNodeGroup insertFileScanTask(Set insertTasks) { + this.insertTasks = insertTasks; + return this; + } + + public PartitionAndNodeGroup deleteFileScanTask(Set deleteTasks) { + this.deleteTasks = deleteTasks; + return this; + } + + public PartitionAndNodeGroup splitCount(AtomicInteger splitCount) { + this.splitCount = splitCount; + return this; + } + + List planSplits() { + Map> nodes = new HashMap<>(); + plan(true, nodes); + plan(false, nodes); + + List splits = new ArrayList<>(); + + nodes + .values() + .forEach( + indexNodes -> + indexNodes + .values() + .forEach( + node -> + splits.add( + new ChangelogSplit( + node.inserts, node.deletes, splitCount.incrementAndGet())))); + return splits; + } + + /** + * Split the collection of {@link MixedFileScanTask} into different groups. + * + * @param insert if plan insert files or not + * @param nodes the key of nodes is partition info which the file located, the value of nodes is + * hashmap of mixed-format tree node id and {@link Node} + */ + private void plan(boolean insert, Map> nodes) { + Collection tasks = insert ? 
insertTasks : deleteTasks; + if (tasks == null) { + return; + } + + tasks.forEach( + task -> { + String partitionKey = task.file().partition().toString(); + Long nodeId = task.file().node().getId(); + Map indexNodes = nodes.getOrDefault(partitionKey, new HashMap<>()); + Node node = indexNodes.getOrDefault(nodeId, new Node()); + if (insert) { + node.addInsert(task); + } else { + node.addDelete(task); + } + indexNodes.put(nodeId, node); + nodes.put(partitionKey, indexNodes); + }); + } + + private static class Node { + List inserts = new ArrayList<>(1); + List deletes = new ArrayList<>(1); + + void addInsert(MixedFileScanTask task) { + inserts.add(task); + } + + void addDelete(MixedFileScanTask task) { + deletes.add(task); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/ShuffleSplitAssigner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/ShuffleSplitAssigner.java new file mode 100644 index 0000000000..cd0671fe84 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/ShuffleSplitAssigner.java @@ -0,0 +1,342 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.assigner; + +import org.apache.amoro.data.DataTreeNode; +import org.apache.amoro.data.PrimaryKeyedFile; +import org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatSourceEnumState; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; +import org.apache.amoro.scan.MixedFileScanTask; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.util.FlinkRuntimeException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.PriorityBlockingQueue; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; +import java.util.stream.Collectors; + +/** + * According to Mark, Index TreeNodes and subtaskId assigning a split to special subtask to read. 
+ */ +public class ShuffleSplitAssigner implements SplitAssigner { + private static final Logger LOG = LoggerFactory.getLogger(ShuffleSplitAssigner.class); + + private static final long POLL_TIMEOUT = 200; + private final SplitEnumeratorContext enumeratorContext; + + private int totalParallelism; + private int totalSplitNum; + private Long currentMaskOfTreeNode; + private final Object lock = new Object(); + + /** + * Key is the partition data and file index of the mixed-format file, Value is flink application + * subtaskId. + */ + private final Map partitionIndexSubtaskMap; + /** Key is subtaskId, Value is the queue of unAssigned mixed-format splits. */ + private final Map> subtaskSplitMap; + + private CompletableFuture availableFuture; + + @VisibleForTesting + public ShuffleSplitAssigner(SplitEnumeratorContext enumeratorContext) { + this.enumeratorContext = enumeratorContext; + this.totalParallelism = enumeratorContext.currentParallelism(); + this.partitionIndexSubtaskMap = new ConcurrentHashMap<>(); + this.subtaskSplitMap = new ConcurrentHashMap<>(); + } + + public ShuffleSplitAssigner( + SplitEnumeratorContext enumeratorContext, + String tableName, + @Nullable MixedFormatSourceEnumState enumState) { + this.enumeratorContext = enumeratorContext; + this.partitionIndexSubtaskMap = new ConcurrentHashMap<>(); + this.subtaskSplitMap = new ConcurrentHashMap<>(); + if (enumState == null) { + this.totalParallelism = enumeratorContext.currentParallelism(); + LOG.info( + "Mixed-format source enumerator current parallelism is {} for table {}", + totalParallelism, + tableName); + } else { + LOG.info( + "Mixed-format source restored {} splits from state for table {}", + enumState.pendingSplits().size(), + tableName); + deserializePartitionIndex( + Objects.requireNonNull( + enumState.shuffleSplitRelation(), + "The partition index and subtask state couldn't be null.")); + enumState + .pendingSplits() + .forEach(state -> 
onDiscoveredSplits(Collections.singleton(state.toSourceSplit()))); + } + } + + @Override + public Split getNext() { + throw new UnsupportedOperationException( + "ShuffleSplitAssigner couldn't support this operation."); + } + + @Override + public Split getNext(int subtaskId) { + return getNextSplit(subtaskId) + .map(Split::of) + .orElseGet(isEmpty() ? Split::unavailable : Split::subtaskUnavailable); + } + + private Optional getNextSplit(int subTaskId) { + int currentParallelism = enumeratorContext.currentParallelism(); + if (totalParallelism != currentParallelism) { + throw new FlinkRuntimeException( + String.format( + "Source parallelism has been changed, before parallelism is %s, now is %s", + totalParallelism, currentParallelism)); + } + if (subtaskSplitMap.containsKey(subTaskId)) { + PriorityBlockingQueue queue = subtaskSplitMap.get(subTaskId); + + MixedFormatSplit mixedFormatSplit = null; + try { + mixedFormatSplit = queue.poll(POLL_TIMEOUT, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + LOG.warn("interruptedException", e); + } + if (mixedFormatSplit == null) { + LOG.debug( + "Subtask {}, couldn't retrieve mixed-format source split in the queue.", subTaskId); + return Optional.empty(); + } else { + LOG.info( + "get next mixed-format split taskIndex {}, totalSplitNum {}, mixed-format split {}.", + mixedFormatSplit.taskIndex(), + totalSplitNum, + mixedFormatSplit); + return Optional.of(mixedFormatSplit); + } + } else { + LOG.debug( + "Subtask {}, it's an idle subtask due to the empty queue with this subtask.", subTaskId); + return Optional.empty(); + } + } + + @Override + public void onDiscoveredSplits(Collection splits) { + splits.forEach(this::putSplitIntoQueue); + // only complete pending future if new splits are discovered + completeAvailableFuturesIfNeeded(); + } + + @Override + public void onUnassignedSplits(Collection splits) { + onDiscoveredSplits(splits); + } + + void putSplitIntoQueue(final MixedFormatSplit split) { + List 
exactlyTreeNodes = getExactlyTreeNodes(split); + + PrimaryKeyedFile file = findAnyFileInSplit(split); + + for (DataTreeNode node : exactlyTreeNodes) { + long partitionIndexKey = Math.abs(file.partition().toString().hashCode() + node.index()); + int subtaskId = + partitionIndexSubtaskMap.computeIfAbsent( + partitionIndexKey, key -> (partitionIndexSubtaskMap.size() + 1) % totalParallelism); + LOG.info( + "partition = {}, (mask, index) = ({}, {}), subtaskId = {}", + file.partition().toString(), + node.mask(), + node.index(), + subtaskId); + + PriorityBlockingQueue queue = + subtaskSplitMap.getOrDefault(subtaskId, new PriorityBlockingQueue<>()); + MixedFormatSplit copiedSplit = split.copy(); + copiedSplit.modifyTreeNode(node); + LOG.info("put split into queue: {}", copiedSplit); + queue.add(copiedSplit); + totalSplitNum = totalSplitNum + 1; + subtaskSplitMap.put(subtaskId, queue); + } + } + + @Override + public Collection state() { + List mixedFormatSplitStates = new ArrayList<>(); + subtaskSplitMap.forEach( + (key, value) -> + mixedFormatSplitStates.addAll( + value.stream().map(MixedFormatSplitState::new).collect(Collectors.toList()))); + + return mixedFormatSplitStates; + } + + @Override + public synchronized CompletableFuture isAvailable() { + if (availableFuture == null) { + availableFuture = new CompletableFuture<>(); + } + return availableFuture; + } + + public boolean isEmpty() { + if (subtaskSplitMap.isEmpty()) { + return true; + } + for (Map.Entry> entry : + subtaskSplitMap.entrySet()) { + if (!entry.getValue().isEmpty()) { + return false; + } + } + return true; + } + + @Override + public void close() throws IOException { + subtaskSplitMap.clear(); + partitionIndexSubtaskMap.clear(); + } + + public long[] serializePartitionIndex() { + int prefixParams = 3; + long[] shuffleSplitRelation = new long[partitionIndexSubtaskMap.size() * 2 + prefixParams]; + shuffleSplitRelation[0] = totalParallelism; + shuffleSplitRelation[1] = totalSplitNum; + 
shuffleSplitRelation[2] = currentMaskOfTreeNode == null ? -1 : currentMaskOfTreeNode; + + int i = prefixParams; + for (Map.Entry entry : partitionIndexSubtaskMap.entrySet()) { + shuffleSplitRelation[i++] = entry.getKey(); + shuffleSplitRelation[i++] = entry.getValue(); + } + return shuffleSplitRelation; + } + + void deserializePartitionIndex(long[] shuffleSplitRelation) { + int prefixParams = 3; + this.totalParallelism = (int) shuffleSplitRelation[0]; + this.totalSplitNum = (int) shuffleSplitRelation[1]; + this.currentMaskOfTreeNode = shuffleSplitRelation[2] == -1 ? null : shuffleSplitRelation[2]; + + for (int i = prefixParams; i < shuffleSplitRelation.length; i++) { + partitionIndexSubtaskMap.put(shuffleSplitRelation[i], (int) shuffleSplitRelation[++i]); + } + } + + /** + * Different data files may locate in different layers when multi snapshots are committed, so + * mixed-format source reading should consider emitting the records and keeping ordering. + * According to the dataTreeNode of the mixed-format split and the currentMaskOfTreeNode, return + * the exact tree node list which may move up or go down layers in the mixed-format tree. + * + *

+   * |mask=0          o
+   * |             /     \
+   * |mask=1     o        o
+   * |         /   \    /   \
+   * |mask=3  o     o  o     o
+   * 
+ * + * @param mixedFormatSplit Mixed-format split. + * @return The exact tree node list. + */ + public List getExactlyTreeNodes(MixedFormatSplit mixedFormatSplit) { + DataTreeNode dataTreeNode = mixedFormatSplit.dataTreeNode(); + long mask = dataTreeNode.mask(); + + synchronized (lock) { + if (currentMaskOfTreeNode == null) { + currentMaskOfTreeNode = mask; + } + } + + return scanTreeNode(dataTreeNode); + } + + private List scanTreeNode(DataTreeNode dataTreeNode) { + long mask = dataTreeNode.mask(); + if (mask == currentMaskOfTreeNode) { + return Collections.singletonList(dataTreeNode); + } else if (mask > currentMaskOfTreeNode) { + // move up one layer + return scanTreeNode(dataTreeNode.parent()); + } else { + // go down one layer + List allNodes = new ArrayList<>(); + allNodes.addAll(scanTreeNode(dataTreeNode.left())); + allNodes.addAll(scanTreeNode(dataTreeNode.right())); + return allNodes; + } + } + + /** + * In one mixed-format split, the partitions, mask and index of the files are the same. + * + * @param mixedFormatSplit mixed-format source split + * @return anyone primary keyed file in the mixed-format split. 
+ */ + private PrimaryKeyedFile findAnyFileInSplit(MixedFormatSplit mixedFormatSplit) { + AtomicReference file = new AtomicReference<>(); + if (mixedFormatSplit.isChangelogSplit()) { + List mixedFileScanTasks = + new ArrayList<>(mixedFormatSplit.asChangelogSplit().insertTasks()); + mixedFileScanTasks.addAll(mixedFormatSplit.asChangelogSplit().deleteTasks()); + mixedFileScanTasks.stream().findFirst().ifPresent(task -> file.set(task.file())); + if (file.get() != null) { + return file.get(); + } + } + + List mixedFileScanTasks = + new ArrayList<>(mixedFormatSplit.asSnapshotSplit().insertTasks()); + mixedFileScanTasks.stream().findFirst().ifPresent(task -> file.set(task.file())); + if (file.get() != null) { + return file.get(); + } + throw new FlinkRuntimeException("Couldn't find a primaryKeyedFile."); + } + + private synchronized void completeAvailableFuturesIfNeeded() { + if (availableFuture != null && !isEmpty()) { + availableFuture.complete(null); + } + availableFuture = null; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/Split.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/Split.java new file mode 100644 index 0000000000..265710ee1c --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/Split.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.assigner; + +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.util.Preconditions; + +/** This is a wrapper Split of {@link MixedFormatSplit} with split status. */ +public class Split { + + public enum Status { + AVAILABLE, + + /** Assigner has pending splits. But current subtask doesn't have pending splits. */ + SUBTASK_UNAVAILABLE, + + /** Assigner doesn't have pending splits. */ + UNAVAILABLE + } + + private final Status status; + private final MixedFormatSplit split; + + private Split(Status status) { + this.status = status; + this.split = null; + } + + private Split(MixedFormatSplit split) { + Preconditions.checkNotNull(split, "Split cannot be null"); + this.status = Status.AVAILABLE; + this.split = split; + } + + @VisibleForTesting + public Status status() { + return status; + } + + public boolean isAvailable() { + return status == Status.AVAILABLE; + } + + public boolean isUnavailable() { + return status == Status.UNAVAILABLE; + } + + public MixedFormatSplit split() { + return split; + } + + private static final Split UNAVAILABLE = new Split(Status.UNAVAILABLE); + private static final Split SUBTASK_UNAVAILABLE = new Split(Status.SUBTASK_UNAVAILABLE); + + public static Split unavailable() { + return UNAVAILABLE; + } + + public static Split subtaskUnavailable() { + return SUBTASK_UNAVAILABLE; + } + + public static Split of(MixedFormatSplit mixedFormatSplit) { + return new Split(mixedFormatSplit); + } +} 
diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/SplitAssigner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/SplitAssigner.java new file mode 100644 index 0000000000..8e7b36a40b --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/SplitAssigner.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read.hybrid.assigner; + +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; + +import java.io.Closeable; +import java.util.Collection; +import java.util.concurrent.CompletableFuture; + +/** An interface SplitAssigner for {@link MixedFormatSplit} */ +public interface SplitAssigner extends Closeable { + + default void open() {} + + Split getNext(); + + Split getNext(int subtaskId); + + /** Add new splits discovered by enumerator */ + void onDiscoveredSplits(Collection splits); + + /** Forward addSplitsBack event (for failed reader) to assigner */ + void onUnassignedSplits(Collection splits); + + /** + * Some assigner (like event time alignment) may rack in-progress splits to advance watermark upon + * completed splits + */ + default void onCompletedSplits(Collection completedSplitIds) {} + + Collection state(); + + /** + * Enumerator can get a notification via CompletableFuture when the assigner has more splits + * available later. Enumerator should schedule assignment in the thenAccept action of the future. + * + *

Assigner will return the same future if this method is called again before the previous + * future is completed. + * + *

The future can be completed from other thread, e.g. the coordinator thread from another + * thread for event time alignment. + * + *

If enumerator need to trigger action upon the future completion, it may want to run it in + * the coordinator thread using {@link SplitEnumeratorContext#runInCoordinatorThread(Runnable)}. + */ + CompletableFuture isAvailable(); +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/StaticSplitAssigner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/StaticSplitAssigner.java new file mode 100644 index 0000000000..ff39e4124c --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/StaticSplitAssigner.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read.hybrid.assigner; + +import org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatSourceEnumState; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.Optional; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.PriorityBlockingQueue; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +/** This is a static split assigner which is used for batch mode. */ +public class StaticSplitAssigner implements SplitAssigner { + private static final Logger LOG = LoggerFactory.getLogger(StaticSplitAssigner.class); + + private static final long POLL_TIMEOUT = 200; + private int totalSplitNum; + + private final PriorityBlockingQueue splitQueue; + + private CompletableFuture availableFuture; + + public StaticSplitAssigner(@Nullable MixedFormatSourceEnumState enumState) { + this.splitQueue = new PriorityBlockingQueue<>(); + if (enumState != null) { + Collection splitStates = enumState.pendingSplits(); + splitStates.forEach( + state -> onDiscoveredSplits(Collections.singleton(state.toSourceSplit()))); + } + } + + @Override + public Split getNext() { + return getNextSplit().map(Split::of).orElseGet(Split::unavailable); + } + + @Override + public Split getNext(int subtaskId) { + return getNext(); + } + + private Optional getNextSplit() { + MixedFormatSplit mixedFormatSplit = null; + try { + mixedFormatSplit = splitQueue.poll(POLL_TIMEOUT, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + LOG.warn("Interrupted when polling splits from the split queue", e); + } + if (mixedFormatSplit == null) { + LOG.debug( + "Couldn't retrieve mixed-format source split from the queue, as the queue is 
empty."); + return Optional.empty(); + } else { + LOG.info( + "Assigning the mixed-format split, task index is {}, total number of splits is {}, mixed-format split is {}.", + mixedFormatSplit.taskIndex(), + totalSplitNum, + mixedFormatSplit); + return Optional.of(mixedFormatSplit); + } + } + + @Override + public void onDiscoveredSplits(Collection splits) { + splits.forEach(this::putSplitIntoQueue); + totalSplitNum += splits.size(); + // only complete pending future if new splits are discovered + completeAvailableFuturesIfNeeded(); + } + + @Override + public void onUnassignedSplits(Collection splits) { + onDiscoveredSplits(splits); + } + + void putSplitIntoQueue(final MixedFormatSplit split) { + splitQueue.put(split); + } + + @Override + public Collection state() { + return splitQueue.stream().map(MixedFormatSplitState::new).collect(Collectors.toList()); + } + + @Override + public synchronized CompletableFuture isAvailable() { + if (availableFuture == null) { + availableFuture = new CompletableFuture<>(); + } + return availableFuture; + } + + public boolean isEmpty() { + return splitQueue.isEmpty(); + } + + @Override + public void close() throws IOException { + splitQueue.clear(); + } + + private synchronized void completeAvailableFuturesIfNeeded() { + if (availableFuture != null && !isEmpty()) { + availableFuture.complete(null); + } + availableFuture = null; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/AbstractMixedFormatEnumerator.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/AbstractMixedFormatEnumerator.java new file mode 100644 index 0000000000..49a0f07a3f --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/AbstractMixedFormatEnumerator.java @@ -0,0 +1,183 @@ +/* + * Licensed to 
the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.enumerator; + +import org.apache.amoro.flink.read.hybrid.assigner.Split; +import org.apache.amoro.flink.read.hybrid.assigner.SplitAssigner; +import org.apache.amoro.flink.read.hybrid.reader.ReaderStartedEvent; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.hybrid.split.SplitRequestEvent; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.connector.source.SourceEvent; +import org.apache.flink.api.connector.source.SplitEnumerator; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicReference; + +/** The abstract mixed-format source enumerator. 
*/ +public abstract class AbstractMixedFormatEnumerator + implements SplitEnumerator { + private static final Logger LOG = LoggerFactory.getLogger(AbstractMixedFormatEnumerator.class); + private final SplitEnumeratorContext enumeratorContext; + private final SplitAssigner assigner; + private final Map readersAwaitingSplit; + private final AtomicReference> availableFuture; + + AbstractMixedFormatEnumerator( + SplitEnumeratorContext enumeratorContext, SplitAssigner assigner) { + this.enumeratorContext = enumeratorContext; + this.assigner = assigner; + this.readersAwaitingSplit = new ConcurrentHashMap<>(); + this.availableFuture = new AtomicReference<>(); + } + + @Override + public void start() {} + + @Override + public void close() throws IOException { + assigner.close(); + } + + @Override + public void handleSplitRequest(int subtaskId, @Nullable String requesterHostname) { + throw new UnsupportedOperationException( + String.format( + "Received invalid default split request event " + + "from subtask %d as mixed-format source uses custom split request event", + subtaskId)); + } + + @Override + public void handleSourceEvent(int subtaskId, SourceEvent sourceEvent) { + if (sourceEvent instanceof SplitRequestEvent) { + SplitRequestEvent splitRequestEvent = (SplitRequestEvent) sourceEvent; + LOG.info("Received request split event from subtask {}", subtaskId); + assigner.onCompletedSplits(splitRequestEvent.finishedSplitIds()); + readersAwaitingSplit.put(subtaskId, String.valueOf(splitRequestEvent.requesterHostname())); + assignSplits(); + } else if (sourceEvent instanceof ReaderStartedEvent) { + LOG.info("Received ReaderStartEvent from subtask {}", subtaskId); + } else { + throw new IllegalArgumentException( + String.format( + "Received unknown event from subtask %d: %s", + subtaskId, sourceEvent.getClass().getCanonicalName())); + } + } + + @Override + public void addReader(int subtaskId) { + LOG.info("Added reader: {}", subtaskId); + } + + @Override + public void 
addSplitsBack(List splits, int subtaskId) { + LOG.info("addSplitsBack from subtaskId {}, splits {}.", subtaskId, splits); + assigner.onUnassignedSplits(splits); + } + + /** return true if enumerator should wait for splits like in the continuous enumerator case. */ + protected abstract boolean shouldWaitForMoreSplits(); + + protected void assignSplits() { + LOG.info( + "Assign mixed-format splits to {} readers, subtasks:{}.", + readersAwaitingSplit.size(), + readersAwaitingSplit.keySet().toArray()); + final Iterator> awaitingReader = + readersAwaitingSplit.entrySet().iterator(); + while (awaitingReader.hasNext()) { + final Map.Entry nextAwaiting = awaitingReader.next(); + + // if the reader that requested another split has failed in the meantime, remove + // it from the list of waiting readers + if (!enumeratorContext.registeredReaders().containsKey(nextAwaiting.getKey())) { + LOG.info( + "Due to this reader doesn't registered in the enumerator context any more, so remove this subtask reader" + + " [{}] from the awaiting reader map.", + nextAwaiting.getKey()); + awaitingReader.remove(); + continue; + } + + final int awaitingSubtask = nextAwaiting.getKey(); + final Split nextSplit = assigner.getNext(awaitingSubtask); + if (nextSplit.isAvailable()) { + MixedFormatSplit mixedFormatSplit = nextSplit.split(); + LOG.info( + "assign a mixed-format split to subtaskId {}, taskIndex {}, mixed-format split {}.", + awaitingSubtask, + mixedFormatSplit.taskIndex(), + mixedFormatSplit); + enumeratorContext.assignSplit(mixedFormatSplit, awaitingSubtask); + awaitingReader.remove(); + } else if (nextSplit.isUnavailable()) { + if (!shouldWaitForMoreSplits()) { + LOG.info("No more splits available for subtask {}", awaitingSubtask); + enumeratorContext.signalNoMoreSplits(awaitingSubtask); + awaitingReader.remove(); + } else { + fetchAvailableFutureIfNeeded(); + break; + } + } + } + } + + private synchronized void fetchAvailableFutureIfNeeded() { + if (availableFuture.get() != null) { + 
return; + } + + CompletableFuture future = + assigner + .isAvailable() + .thenAccept( + ignore -> + // Must run assignSplits in coordinator thread + // because the future may be completed from other threads. + // E.g., in event time alignment assigner, + // watermark advancement from another source may + // cause the available future to be completed + enumeratorContext.runInCoordinatorThread( + () -> { + LOG.debug("Executing callback of assignSplits"); + availableFuture.set(null); + assignSplits(); + })); + availableFuture.set(future); + LOG.debug("Registered callback for future available splits"); + } + + @VisibleForTesting + public Map getReadersAwaitingSplit() { + return readersAwaitingSplit; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousEnumerationResult.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousEnumerationResult.java new file mode 100644 index 0000000000..ff9c610187 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousEnumerationResult.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.enumerator; + +import org.apache.amoro.flink.read.FlinkSplitPlanner; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.shade.guava32.com.google.common.base.MoreObjects; +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; + +/** + * The result that contains {@link MixedFormatSplit}s and is generated by {@link FlinkSplitPlanner}. + */ +public class ContinuousEnumerationResult { + public static final ContinuousEnumerationResult EMPTY = + new ContinuousEnumerationResult( + Collections.emptyList(), null, MixedFormatEnumeratorOffset.empty()); + + private final Collection splits; + private final MixedFormatEnumeratorOffset fromOffset; + private final MixedFormatEnumeratorOffset toOffset; + + /** + * @param splits should never be null. But it can be an empty collection + * @param fromOffset can be null + * @param toOffset should never be null. 
But it can have null snapshotId and snapshotTimestampMs + */ + public ContinuousEnumerationResult( + Collection splits, + MixedFormatEnumeratorOffset fromOffset, + MixedFormatEnumeratorOffset toOffset) { + Preconditions.checkArgument(splits != null, "Invalid to splits collection: null"); + Preconditions.checkArgument(toOffset != null, "Invalid end position: null"); + this.splits = splits; + this.fromOffset = fromOffset; + this.toOffset = toOffset; + } + + public Collection splits() { + return splits; + } + + public MixedFormatEnumeratorOffset fromOffset() { + return fromOffset; + } + + public MixedFormatEnumeratorOffset toOffset() { + return toOffset; + } + + public boolean isEmpty() { + return null == splits || splits.isEmpty(); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("splits", Arrays.toString(splits.toArray())) + .add("fromPosition", fromOffset) + .add("toPosition", toOffset) + .toString(); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousSplitPlanner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousSplitPlanner.java new file mode 100644 index 0000000000..804ae9db81 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousSplitPlanner.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.enumerator; + +import org.apache.commons.compress.utils.Lists; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.expressions.Expression; + +import java.io.Closeable; +import java.util.List; + +/** This interface is introduced so that we can plug in a different split planner for unit test */ +@Internal +public interface ContinuousSplitPlanner extends Closeable { + + /** Discover the files appended between {@code lastPosition} and current table snapshot */ + default ContinuousEnumerationResult planSplits(MixedFormatEnumeratorOffset lastPosition) { + return planSplits(lastPosition, Lists.newArrayList()); + } + + /** + * Discover the files appended between {@code lastPosition} and current table snapshot, filter the + * data with expressions. 
+ */ + ContinuousEnumerationResult planSplits( + MixedFormatEnumeratorOffset lastPosition, List filters); +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousSplitPlannerImpl.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousSplitPlannerImpl.java new file mode 100644 index 0000000000..b513975ffb --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousSplitPlannerImpl.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read.hybrid.enumerator; + +import static org.apache.amoro.flink.read.FlinkSplitPlanner.planChangeTable; +import static org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatEnumeratorOffset.EARLIEST_SNAPSHOT_ID; +import static org.apache.amoro.flink.util.MixedFormatUtils.loadMixedTable; + +import org.apache.amoro.flink.read.FlinkSplitPlanner; +import org.apache.amoro.flink.read.hybrid.split.ChangelogSplit; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.hybrid.split.SnapshotSplit; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.scan.ChangeTableIncrementalScan; +import org.apache.amoro.table.KeyedTable; +import org.apache.commons.collections.CollectionUtils; +import org.apache.flink.annotation.Internal; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.expressions.Expression; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * Continuous planning {@link KeyedTable} by {@link MixedFormatEnumeratorOffset} and generate a + * {@link ContinuousEnumerationResult}. + * + *

{@link ContinuousEnumerationResult#splits()} includes the {@link SnapshotSplit}s and {@link + * ChangelogSplit}s. + */ +@Internal +public class ContinuousSplitPlannerImpl implements ContinuousSplitPlanner { + private static final Logger LOG = LoggerFactory.getLogger(ContinuousSplitPlannerImpl.class); + + protected transient KeyedTable table; + protected final MixedFormatTableLoader loader; + protected static final AtomicInteger SPLIT_COUNT = new AtomicInteger(); + + public ContinuousSplitPlannerImpl(MixedFormatTableLoader loader) { + this.loader = loader; + } + + @Override + public void close() throws IOException { + if (loader != null) { + loader.close(); + } + } + + @Override + public ContinuousEnumerationResult planSplits( + MixedFormatEnumeratorOffset lastOffset, List filters) { + if (table == null) { + table = loadMixedTable(loader).asKeyedTable(); + } + table.refresh(); + if (lastOffset != null) { + return discoverIncrementalSplits(lastOffset, filters); + } else { + return discoverInitialSplits(filters); + } + } + + protected ContinuousEnumerationResult discoverIncrementalSplits( + MixedFormatEnumeratorOffset lastPosition, List filters) { + long fromChangeSnapshotId = lastPosition.changeSnapshotId(); + Snapshot changeSnapshot = table.changeTable().currentSnapshot(); + if (changeSnapshot != null && changeSnapshot.snapshotId() != fromChangeSnapshotId) { + long snapshotId = changeSnapshot.snapshotId(); + ChangeTableIncrementalScan changeTableScan = + table.changeTable().newScan().useSnapshot(snapshotId); + if (filters != null) { + for (Expression filter : filters) { + changeTableScan = changeTableScan.filter(filter); + } + } + + if (fromChangeSnapshotId != Long.MIN_VALUE) { + Snapshot snapshot = table.changeTable().snapshot(fromChangeSnapshotId); + changeTableScan = changeTableScan.fromSequence(snapshot.sequenceNumber()); + } + + List changeSplit = planChangeTable(changeTableScan, SPLIT_COUNT); + return new ContinuousEnumerationResult( + changeSplit, 
lastPosition, MixedFormatEnumeratorOffset.of(snapshotId, null)); + } + return ContinuousEnumerationResult.EMPTY; + } + + protected ContinuousEnumerationResult discoverInitialSplits(List filters) { + Snapshot changeSnapshot = table.changeTable().currentSnapshot(); + // todo ShuffleSplitAssigner doesn't support MergeOnReadSplit right now, + // because it doesn't implement the dataTreeNode() method + // fix AMORO-1950 in the future. + List mixedFormatSplits = + FlinkSplitPlanner.planFullTable(table, filters, SPLIT_COUNT); + + long changeStartSnapshotId = + changeSnapshot != null ? changeSnapshot.snapshotId() : EARLIEST_SNAPSHOT_ID; + if (changeSnapshot == null && CollectionUtils.isEmpty(mixedFormatSplits)) { + LOG.info("There have no change snapshot, and no base splits in table: {}.", table); + return ContinuousEnumerationResult.EMPTY; + } + + return new ContinuousEnumerationResult( + mixedFormatSplits, null, MixedFormatEnumeratorOffset.of(changeStartSnapshotId, null)); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/InitializationFinishedEvent.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/InitializationFinishedEvent.java new file mode 100644 index 0000000000..b2ad16e819 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/InitializationFinishedEvent.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.enumerator; + +import org.apache.amoro.flink.read.hybrid.reader.MixedFormatSourceReader; +import org.apache.flink.api.connector.source.SourceEvent; + +/** {@link MixedFormatSourceReader} won't set timestamp to RowData until receiving this Event. */ +public class InitializationFinishedEvent implements SourceEvent { + private static final long serialVersionUID = 1L; + + public static final InitializationFinishedEvent INSTANCE = new InitializationFinishedEvent(); +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MergeOnReadIncrementalPlanner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MergeOnReadIncrementalPlanner.java new file mode 100644 index 0000000000..8f669ed914 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MergeOnReadIncrementalPlanner.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.enumerator; + +import static org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatEnumeratorOffset.EARLIEST_SNAPSHOT_ID; + +import org.apache.amoro.flink.read.FlinkSplitPlanner; +import org.apache.amoro.flink.read.hybrid.split.ChangelogSplit; +import org.apache.amoro.flink.read.hybrid.split.MergeOnReadSplit; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.commons.collections.CollectionUtils; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.expressions.Expression; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; + +/** + * A planner for merge-on-read scanning by {@link this#discoverInitialSplits} and incremental + * scanning by {@link this#discoverIncrementalSplits(MixedFormatEnumeratorOffset, List)}. + * + *

{@link ContinuousEnumerationResult#splits()} includes the {@link MergeOnReadSplit}s and {@link + * ChangelogSplit}s. + */ +public class MergeOnReadIncrementalPlanner extends ContinuousSplitPlannerImpl { + private static final Logger LOG = LoggerFactory.getLogger(MergeOnReadIncrementalPlanner.class); + + public MergeOnReadIncrementalPlanner(MixedFormatTableLoader loader) { + super(loader); + } + + @Override + protected ContinuousEnumerationResult discoverInitialSplits(List filters) { + Snapshot changeSnapshot = table.changeTable().currentSnapshot(); + + List mixedFormatSplits = + FlinkSplitPlanner.mergeOnReadPlan(table, filters, SPLIT_COUNT); + + long changeStartSnapshotId = + changeSnapshot != null ? changeSnapshot.snapshotId() : EARLIEST_SNAPSHOT_ID; + if (changeSnapshot == null && CollectionUtils.isEmpty(mixedFormatSplits)) { + LOG.info("There have no change snapshot, and no base splits in table: {}.", table); + return ContinuousEnumerationResult.EMPTY; + } + + return new ContinuousEnumerationResult( + mixedFormatSplits, null, MixedFormatEnumeratorOffset.of(changeStartSnapshotId, null)); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MergeOnReadPlannerImpl.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MergeOnReadPlannerImpl.java new file mode 100644 index 0000000000..84b276ec89 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MergeOnReadPlannerImpl.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.enumerator; + +import static org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatEnumeratorOffset.EARLIEST_SNAPSHOT_ID; +import static org.apache.amoro.flink.util.MixedFormatUtils.loadMixedTable; + +import org.apache.amoro.flink.read.FlinkSplitPlanner; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.table.KeyedTable; +import org.apache.commons.collections.CollectionUtils; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.expressions.Expression; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +/** Used for MergeOnRead, only for the bounded reading and return append stream. 
*/ +public class MergeOnReadPlannerImpl implements ContinuousSplitPlanner { + private static final Logger LOG = LoggerFactory.getLogger(MergeOnReadPlannerImpl.class); + + protected transient KeyedTable table; + protected final MixedFormatTableLoader loader; + protected static final AtomicInteger SPLIT_COUNT = new AtomicInteger(); + + public MergeOnReadPlannerImpl(MixedFormatTableLoader loader) { + this.loader = loader; + } + + @Override + public ContinuousEnumerationResult planSplits( + MixedFormatEnumeratorOffset ignored, List filters) { + // todo support mor the table from the specific offset in the future + if (table == null) { + table = loadMixedTable(loader).asKeyedTable(); + } + table.refresh(); + return discoverInitialSplits(filters); + } + + protected ContinuousEnumerationResult discoverInitialSplits(List filters) { + Snapshot changeSnapshot = table.changeTable().currentSnapshot(); + List mixedFormatSplits = + FlinkSplitPlanner.mergeOnReadPlan(table, filters, SPLIT_COUNT); + + long changeStartSnapshotId = + changeSnapshot != null ? 
changeSnapshot.snapshotId() : EARLIEST_SNAPSHOT_ID; + if (changeSnapshot == null && CollectionUtils.isEmpty(mixedFormatSplits)) { + LOG.info("There have no change snapshot, and no base splits in table: {}.", table); + return ContinuousEnumerationResult.EMPTY; + } + + return new ContinuousEnumerationResult( + mixedFormatSplits, null, MixedFormatEnumeratorOffset.of(changeStartSnapshotId, null)); + } + + @Override + public void close() throws IOException { + if (loader != null) { + loader.close(); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatEnumeratorOffset.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatEnumeratorOffset.java new file mode 100644 index 0000000000..5c9beed199 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatEnumeratorOffset.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * The enumerator offset indicate the snapshot id of the change table, or the timestamp of snapshot.
 *
 * <p>Either field may be {@code null}; {@link #EARLIEST_SNAPSHOT_ID} ({@code Long.MIN_VALUE}) marks
 * the earliest position. Instances are mutable: the same-named setter methods overwrite the stored
 * values in place.
 */
public class MixedFormatEnumeratorOffset {
  // Shared sentinel; isEmpty() also treats any offset equal to this as empty.
  private static final MixedFormatEnumeratorOffset EMPTY = of(Long.MIN_VALUE, Long.MIN_VALUE);

  /** use Long.MIN_VALUE to indicate the earliest offset */
  public static final long EARLIEST_SNAPSHOT_ID = Long.MIN_VALUE;

  private Long changeSnapshotId;
  private Long snapshotTimestampMs;

  private MixedFormatEnumeratorOffset(Long changeSnapshotId, Long snapshotTimestampMs) {
    this.changeSnapshotId = changeSnapshotId;
    this.snapshotTimestampMs = snapshotTimestampMs;
  }

  /** Creates an offset from a change-table snapshot id and/or a snapshot timestamp (ms). */
  public static MixedFormatEnumeratorOffset of(Long changeSnapshotId, Long snapshotTimestampMs) {
    return new MixedFormatEnumeratorOffset(changeSnapshotId, snapshotTimestampMs);
  }

  /** Returns the shared empty offset (both fields set to {@code Long.MIN_VALUE}). */
  public static MixedFormatEnumeratorOffset empty() {
    return EMPTY;
  }

  public Long changeSnapshotId() {
    return changeSnapshotId;
  }

  /** Mutates this offset's change-table snapshot id. */
  public void changeSnapshotId(long changeSnapshotId) {
    this.changeSnapshotId = changeSnapshotId;
  }

  public Long snapshotTimestampMs() {
    return snapshotTimestampMs;
  }

  /** Mutates this offset's snapshot timestamp (milliseconds). */
  public void snapshotTimestampMs(Long snapshotTimestamp) {
    this.snapshotTimestampMs = snapshotTimestamp;
  }

  /** True when both fields are null, or when this offset equals the {@link #EMPTY} sentinel. */
  public boolean isEmpty() {
    return (changeSnapshotId == null && snapshotTimestampMs == null) || equals(EMPTY);
  }

  @Override
  public int hashCode() {
    return Objects.hashCode(changeSnapshotId, snapshotTimestampMs);
  }

  @Override
  public String toString() {
    return MoreObjects.toStringHelper(this)
        .add("changeSnapshotId", changeSnapshotId)
        .add("snapshotTimestamp", snapshotTimestampMs)
        .toString();
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }
    MixedFormatEnumeratorOffset other = (MixedFormatEnumeratorOffset) o;
    return Objects.equal(changeSnapshotId, other.changeSnapshotId())
        && Objects.equal(snapshotTimestampMs, other.snapshotTimestampMs());
  }
}
/**
 * Serializer that serializes and deserializes mixed-format enumerator {@link
 * MixedFormatEnumeratorOffset}.
 *
 * <p>Wire format (version 1): for each of the two nullable fields, a boolean presence flag
 * followed by the long value when present.
 */
class MixedFormatEnumeratorOffsetSerializer
    implements SimpleVersionedSerializer<MixedFormatEnumeratorOffset> {
  public static final MixedFormatEnumeratorOffsetSerializer INSTANCE =
      new MixedFormatEnumeratorOffsetSerializer();

  private static final int VERSION = 1;

  // One reusable output buffer per thread, to avoid re-allocating on every serialize() call.
  private static final ThreadLocal<DataOutputSerializer> SERIALIZER_CACHE =
      ThreadLocal.withInitial(() -> new DataOutputSerializer(128));

  @Override
  public int getVersion() {
    return VERSION;
  }

  @Override
  public byte[] serialize(MixedFormatEnumeratorOffset position) throws IOException {
    return serializeV1(position);
  }

  @Override
  public MixedFormatEnumeratorOffset deserialize(int version, byte[] serialized)
      throws IOException {
    switch (version) {
      case 1:
        return deserializeV1(serialized);
      default:
        throw new IOException("Unknown version: " + version);
    }
  }

  /** Writes the presence-flag + value encoding described on the class. */
  private byte[] serializeV1(MixedFormatEnumeratorOffset position) throws IOException {
    DataOutputSerializer out = SERIALIZER_CACHE.get();
    out.writeBoolean(position.changeSnapshotId() != null);
    if (position.changeSnapshotId() != null) {
      out.writeLong(position.changeSnapshotId());
    }
    out.writeBoolean(position.snapshotTimestampMs() != null);
    if (position.snapshotTimestampMs() != null) {
      out.writeLong(position.snapshotTimestampMs());
    }
    byte[] result = out.getCopyOfBuffer();
    // Reset the cached buffer so the next serialize() on this thread starts clean.
    out.clear();
    return result;
  }

  /** Inverse of {@link #serializeV1}; absent flags yield null fields. */
  private MixedFormatEnumeratorOffset deserializeV1(byte[] serialized) throws IOException {
    DataInputDeserializer in = new DataInputDeserializer(serialized);
    Long snapshotId = null;
    if (in.readBoolean()) {
      snapshotId = in.readLong();
    }

    Long snapshotTimestampMs = null;
    if (in.readBoolean()) {
      snapshotTimestampMs = in.readLong();
    }

    return MixedFormatEnumeratorOffset.of(snapshotId, snapshotTimestampMs);
  }
}
/**
 * State that contains pending mixed-format splits and last enumerator offset in mixed-format source
 * enumerator {@link MixedFormatSourceEnumerator}.
 *
 * <p>All components are stored and returned as-is (no defensive copies).
 */
public class MixedFormatSourceEnumState {
  /** Offset of the last enumeration round; null when nothing has been enumerated yet. */
  @Nullable private final MixedFormatEnumeratorOffset lastEnumeratedOffset;
  private final Collection<MixedFormatSplitState> pendingSplits;
  // NOTE(review): the raw array is stored and exposed without copying — callers share the
  // same instance; confirm no caller mutates it after snapshotting.
  @Nullable private final long[] shuffleSplitRelation;
  /** First-plan splits tracked for dim-table (temporal join) mode; null otherwise. */
  @Nullable private final TemporalJoinSplits temporalJoinSplits;

  public MixedFormatSourceEnumState(
      Collection<MixedFormatSplitState> pendingSplits,
      @Nullable MixedFormatEnumeratorOffset lastEnumeratedOffset,
      @Nullable long[] shuffleSplitRelation,
      @Nullable TemporalJoinSplits temporalJoinSplits) {
    this.pendingSplits = pendingSplits;
    this.lastEnumeratedOffset = lastEnumeratedOffset;
    this.shuffleSplitRelation = shuffleSplitRelation;
    this.temporalJoinSplits = temporalJoinSplits;
  }

  @Nullable
  public MixedFormatEnumeratorOffset lastEnumeratedOffset() {
    return lastEnumeratedOffset;
  }

  public Collection<MixedFormatSplitState> pendingSplits() {
    return pendingSplits;
  }

  @Nullable
  public long[] shuffleSplitRelation() {
    return shuffleSplitRelation;
  }

  @Nullable
  public TemporalJoinSplits temporalJoinSplits() {
    return temporalJoinSplits;
  }
}
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumStateSerializer.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.enumerator; + +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitSerializer; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; +import org.apache.amoro.flink.read.hybrid.split.TemporalJoinSplits; +import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.flink.util.InstantiationUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Collection; +import java.util.Objects; + +/** + * Serializer that serializes and deserializes mixed-format enumerator {@link + * MixedFormatSourceEnumState}. 
+ */ +public class MixedFormatSourceEnumStateSerializer + implements SimpleVersionedSerializer { + + private static final Logger LOGGER = + LoggerFactory.getLogger(MixedFormatSourceEnumStateSerializer.class); + private static final int VERSION = 1; + private final MixedFormatSplitSerializer splitSerializer = MixedFormatSplitSerializer.INSTANCE; + private final MixedFormatEnumeratorOffsetSerializer offsetSerializer = + MixedFormatEnumeratorOffsetSerializer.INSTANCE; + + private static final ThreadLocal SERIALIZER_CACHE = + ThreadLocal.withInitial(() -> new DataOutputSerializer(1024)); + + @Override + public int getVersion() { + return VERSION; + } + + @Override + public byte[] serialize(MixedFormatSourceEnumState mixedFormatSourceEnumState) + throws IOException { + return serializeV1(mixedFormatSourceEnumState); + } + + private byte[] serializeV1(MixedFormatSourceEnumState enumState) throws IOException { + DataOutputSerializer out = SERIALIZER_CACHE.get(); + + out.writeBoolean(enumState.lastEnumeratedOffset() != null); + if (enumState.lastEnumeratedOffset() != null) { + out.writeInt(offsetSerializer.getVersion()); + byte[] positionBytes = offsetSerializer.serialize(enumState.lastEnumeratedOffset()); + out.writeInt(positionBytes.length); + out.write(positionBytes); + } + + out.writeInt(splitSerializer.getVersion()); + out.writeInt(enumState.pendingSplits().size()); + for (MixedFormatSplitState splitState : enumState.pendingSplits()) { + byte[] splitBytes = splitSerializer.serialize(splitState.toSourceSplit()); + out.writeInt(splitBytes.length); + out.write(splitBytes); + } + + out.writeBoolean(enumState.shuffleSplitRelation() != null); + if (enumState.shuffleSplitRelation() != null) { + long[] shuffleSplitRelation = enumState.shuffleSplitRelation(); + out.writeInt(Objects.requireNonNull(shuffleSplitRelation).length); + for (long l : shuffleSplitRelation) { + out.writeLong(l); + } + } + + out.writeBoolean(enumState.temporalJoinSplits() != null); + if 
(enumState.temporalJoinSplits() != null) { + byte[] temporalJoinSplits = InstantiationUtil.serializeObject(enumState.temporalJoinSplits()); + out.writeInt(temporalJoinSplits.length); + out.write(temporalJoinSplits); + } + + byte[] result = out.getCopyOfBuffer(); + out.clear(); + return result; + } + + @Override + public MixedFormatSourceEnumState deserialize(int version, byte[] serialized) throws IOException { + switch (version) { + case 1: + return deserializeV1(serialized); + default: + throw new IOException("Unknown version: " + version); + } + } + + private MixedFormatSourceEnumState deserializeV1(byte[] serialized) throws IOException { + DataInputDeserializer in = new DataInputDeserializer(serialized); + + MixedFormatEnumeratorOffset enumeratorOffset = null; + if (in.readBoolean()) { + int version = in.readInt(); + byte[] positionBytes = new byte[in.readInt()]; + in.read(positionBytes); + enumeratorOffset = offsetSerializer.deserialize(version, positionBytes); + } + + int splitSerializerVersion = in.readInt(); + int splitCount = in.readInt(); + Collection pendingSplits = Lists.newArrayListWithCapacity(splitCount); + for (int i = 0; i < splitCount; ++i) { + byte[] splitBytes = new byte[in.readInt()]; + in.read(splitBytes); + MixedFormatSplit split = splitSerializer.deserialize(splitSerializerVersion, splitBytes); + pendingSplits.add(new MixedFormatSplitState(split)); + } + + long[] shuffleSplitRelation = null; + if (in.readBoolean()) { + int length = in.readInt(); + shuffleSplitRelation = new long[length]; + for (int i = 0; i < length; i++) { + shuffleSplitRelation[i] = in.readLong(); + } + } + + TemporalJoinSplits temporalJoinSplits = null; + if (in.readBoolean()) { + byte[] bytes = new byte[in.readInt()]; + in.read(bytes); + try { + temporalJoinSplits = + InstantiationUtil.deserializeObject(bytes, TemporalJoinSplits.class.getClassLoader()); + } catch (ClassNotFoundException e) { + throw new RuntimeException("deserialize FirstSplit error", e); + } + } + + 
return new MixedFormatSourceEnumState( + pendingSplits, enumeratorOffset, shuffleSplitRelation, temporalJoinSplits); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumerator.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumerator.java new file mode 100644 index 0000000000..92929d2a94 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumerator.java @@ -0,0 +1,264 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Enumerator for mixed-format source, assign {@link MixedFormatSplit} to mixed-format source reader
 * {@link HybridSplitReader}.
 *
 * <p>Splits are planned asynchronously and periodically (every {@code snapshotDiscoveryIntervalMs}
 * ms) via {@code context.callAsync}. In dim-table (temporal join) mode the first planned batch is
 * tracked as {@link TemporalJoinSplits}; once readers finish that batch, every reader is notified
 * with {@link InitializationFinishedEvent}.
 */
public class MixedFormatSourceEnumerator extends AbstractMixedFormatEnumerator {
  private static final Logger LOG = LoggerFactory.getLogger(MixedFormatSourceEnumerator.class);
  // Lazily loaded in start(); transient since enumerators are not serialized.
  private transient KeyedTable keyedTable;
  /**
   * To record the snapshotId at the first planSplits.
   *
   * <p>If its value is null, it means that we don't need to generate watermark. Won't check.
   */
  private transient volatile TemporalJoinSplits temporalJoinSplits = null;

  private final MixedFormatTableLoader loader;
  private final SplitEnumeratorContext<MixedFormatSplit> context;
  private final ContinuousSplitPlanner continuousSplitPlanner;
  private final SplitAssigner splitAssigner;
  private final MixedFormatScanContext scanContext;
  private final long snapshotDiscoveryIntervalMs;
  /**
   * If true, using mixed-format table as build table. {@link MixedFormatSourceEnumerator} will
   * notify {@link MixedFormatSourceReader} after MixedFormatReaders have finished reading all
   * {@link TemporalJoinSplits}. Then {@link MixedFormatSourceReader} will emit a Watermark values
   * Long.MAX_VALUE. Advancing TemporalJoinOperator's watermark can trigger the join operation and
   * push the results to downstream. The watermark of Long.MAX_VALUE avoids affecting the watermark
   * defined by user arbitrary probe side
   */
  private final boolean dimTable;

  // Set when a reader event arrives before the first planning round has created
  // temporalJoinSplits; the deferred notification is then replayed during planSplits().
  private volatile boolean sourceEventBeforeFirstPlan = false;
  /**
   * snapshotId for the last enumerated snapshot. next incremental enumeration should be based off
   * this as the starting position.
   */
  private final AtomicReference<MixedFormatEnumeratorOffset> enumeratorPosition;

  // Guards against overlapping planning rounds.
  // NOTE(review): get() followed by set(true) is check-then-act, not an atomic
  // compareAndSet — confirm callAsync cannot invoke two planning rounds concurrently.
  private final AtomicBoolean lock = new AtomicBoolean(false);

  /**
   * @param enumContext Flink enumerator context used for async planning and reader events
   * @param splitAssigner receives discovered splits and serves reader requests
   * @param loader loader for the mixed-format (keyed) table
   * @param scanContext scan options: startup mode, filters, monitor interval, ...
   * @param enumState restored checkpoint state, or null on a fresh start
   * @param dimTable whether this source is the build side of a temporal join
   */
  public MixedFormatSourceEnumerator(
      SplitEnumeratorContext<MixedFormatSplit> enumContext,
      SplitAssigner splitAssigner,
      MixedFormatTableLoader loader,
      MixedFormatScanContext scanContext,
      @Nullable MixedFormatSourceEnumState enumState,
      boolean dimTable) {
    super(enumContext, splitAssigner);
    this.loader = loader;
    this.context = enumContext;
    this.splitAssigner = splitAssigner;
    this.scanContext = scanContext;
    this.continuousSplitPlanner = new ContinuousSplitPlannerImpl(loader);
    this.snapshotDiscoveryIntervalMs = scanContext.monitorInterval().toMillis();
    this.enumeratorPosition = new AtomicReference<>();
    if (enumState != null) {
      // Restore position and temporal-join bookkeeping from the checkpointed state.
      this.enumeratorPosition.set(enumState.lastEnumeratedOffset());
      this.temporalJoinSplits = enumState.temporalJoinSplits();
    }
    this.dimTable = dimTable;
    LOG.info("dimTable: {}", dimTable);
  }

  @Override
  public void start() {
    if (keyedTable == null) {
      keyedTable = loadMixedTable(loader).asKeyedTable();
    }
    // In 'latest' startup mode with no restored position, anchor enumeration at the change
    // table's current snapshot (or EARLIEST_SNAPSHOT_ID if the change table has no snapshot).
    if (enumeratorPosition.get() == null
        && SCAN_STARTUP_MODE_LATEST.equalsIgnoreCase(scanContext.scanStartupMode())) {
      keyedTable.refresh();
      Snapshot snapshot = keyedTable.changeTable().currentSnapshot();
      long snapshotId = snapshot == null ? EARLIEST_SNAPSHOT_ID : snapshot.snapshotId();
      enumeratorPosition.set(MixedFormatEnumeratorOffset.of(snapshotId, null));
      LOG.info(
          "{} is {}, the current snapshot id of the change table {} is {}.",
          SCAN_STARTUP_MODE.key(),
          SCAN_STARTUP_MODE_LATEST,
          keyedTable.id(),
          snapshotId);
    }
    if (snapshotDiscoveryIntervalMs > 0) {
      LOG.info(
          "Starting the MixedFormatSourceEnumerator with mixed-format table {} snapshot discovery interval of {} ms.",
          keyedTable,
          snapshotDiscoveryIntervalMs);
      // planSplits runs on a worker thread; handleResultOfSplits runs back on the coordinator.
      context.callAsync(
          this::planSplits, this::handleResultOfSplits, 0, snapshotDiscoveryIntervalMs);
    }
  }

  /** One planning round; also captures the first batch as TemporalJoinSplits in dim-table mode. */
  private ContinuousEnumerationResult planSplits() {
    ContinuousEnumerationResult result = doPlanSplits();
    if (dimTable && temporalJoinSplits == null) {
      temporalJoinSplits = new TemporalJoinSplits(result.splits(), context.metricGroup());
      // the first SourceEvent may be faster than plan splits
      if (result.isEmpty() && sourceEventBeforeFirstPlan) {
        notifyReaders();
      }
    }
    return result;
  }

  /** Plans splits from the current offset unless a previous round is still running. */
  private ContinuousEnumerationResult doPlanSplits() {
    if (lock.get()) {
      LOG.info("prefix plan splits thread haven't finished.");
      return ContinuousEnumerationResult.EMPTY;
    }
    lock.set(true);
    LOG.info("begin to plan splits current offset {}.", enumeratorPosition.get());
    Optional.ofNullable(scanContext.filters())
        .ifPresent(
            filters ->
                filters.forEach(
                    expression ->
                        LOG.info(
                            "mixed-format source filter expression: {}.", expression.toString())));
    return continuousSplitPlanner.planSplits(enumeratorPosition.get(), scanContext.filters());
  }

  /** Callback for planSplits: hands splits to the assigner and advances the offset. */
  private void handleResultOfSplits(ContinuousEnumerationResult enumerationResult, Throwable t) {
    if (t != null) {
      // Release the lock before failing so a restart isn't permanently blocked.
      lock.set(false);
      throw new FlinkRuntimeException("Failed to scan mixed-format table due to ", t);
    }
    if (!enumerationResult.isEmpty()) {
      splitAssigner.onDiscoveredSplits(enumerationResult.splits());
    }
    if (!enumerationResult.toOffset().isEmpty()) {
      enumeratorPosition.set(enumerationResult.toOffset());
    }
    LOG.info(
        "handled result of splits, discover splits size {}, latest offset {}.",
        enumerationResult.splits().size(),
        enumeratorPosition.get());
    lock.set(false);
  }

  @Override
  public void handleSourceEvent(int subtaskId, SourceEvent sourceEvent) {
    super.handleSourceEvent(subtaskId, sourceEvent);
    if (sourceEvent instanceof SplitRequestEvent) {
      SplitRequestEvent splitRequestEvent = (SplitRequestEvent) sourceEvent;
      Collection<String> finishedSplitIds = splitRequestEvent.finishedSplitIds();
      if (dimTable) {
        checkAndNotifyReader(finishedSplitIds);
      }
    } else if (sourceEvent instanceof ReaderStartedEvent) {
      if (!dimTable || temporalJoinSplits == null || !temporalJoinSplits.hasNotifiedReader()) {
        return;
      }
      // If tm failover, the reader may not be notified and watermark will not be retrieved in
      // reader.
      sourceEventBeforeFirstPlan = true;
      LOG.info("send InitializationFinishedEvent to reader again.");
      context.sendEventToSourceReader(subtaskId, InitializationFinishedEvent.INSTANCE);
    } else {
      throw new IllegalArgumentException(
          String.format(
              "Received unknown event from subtask %d: %s",
              subtaskId, sourceEvent.getClass().getCanonicalName()));
    }
  }

  /**
   * Check whether all first splits have been finished or not. After all finished, enumerator will
   * send a {@link InitializationFinishedEvent} to notify all {@link MixedFormatSourceReader}.
   *
   * @param finishedSplitIds split ids the readers just reported as finished
   */
  public void checkAndNotifyReader(Collection<String> finishedSplitIds) {
    if (temporalJoinSplits == null) {
      // First planning round hasn't happened yet; defer the notification to planSplits().
      sourceEventBeforeFirstPlan = true;
      return;
    }

    if (temporalJoinSplits.hasNotifiedReader()
        || !temporalJoinSplits.removeAndReturnIfAllFinished(finishedSplitIds)) {
      return;
    }
    notifyReaders();
  }

  /** Broadcasts InitializationFinishedEvent to every reader, then marks/clears the batch. */
  private void notifyReaders() {
    LOG.info("all splits finished, send events to readers");
    IntStream.range(0, context.currentParallelism())
        .forEach(i -> context.sendEventToSourceReader(i, InitializationFinishedEvent.INSTANCE));
    temporalJoinSplits.clear();
    temporalJoinSplits.notifyReader();
  }

  @Override
  public MixedFormatSourceEnumState snapshotState(long checkpointId) throws Exception {
    long[] shuffleSplitRelation = null;
    if (splitAssigner instanceof ShuffleSplitAssigner) {
      shuffleSplitRelation = ((ShuffleSplitAssigner) splitAssigner).serializePartitionIndex();
    }
    return new MixedFormatSourceEnumState(
        splitAssigner.state(), enumeratorPosition.get(), shuffleSplitRelation, temporalJoinSplits);
  }

  @Override
  public void close() throws IOException {
    continuousSplitPlanner.close();
    splitAssigner.close();
    super.close();
  }

  @Override
  protected boolean shouldWaitForMoreSplits() {
    // Streaming enumerator: new splits keep arriving, so readers must keep waiting.
    return true;
  }
}
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.enumerator; + +import static org.apache.amoro.flink.util.MixedFormatUtils.loadMixedTable; + +import org.apache.amoro.flink.read.hybrid.assigner.SplitAssigner; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.source.MixedFormatScanContext; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.table.KeyedTable; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.iceberg.flink.source.ScanContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.util.Collection; + +/** + * This is a static mixed-format source enumerator, used for bounded source scan. Working enabled + * only just {@link ScanContext#STREAMING} is equal to false. 
/**
 * This is a static mixed-format source enumerator, used for bounded source scan. Working enabled
 * only just {@link ScanContext#STREAMING} is equal to false.
 *
 * <p>All splits are planned once in {@link #start()}; on restore from checkpoint state no
 * re-enumeration happens (the assigner already holds the pending splits).
 */
public class StaticMixedFormatSourceEnumerator extends AbstractMixedFormatEnumerator {
  private static final Logger LOG =
      LoggerFactory.getLogger(StaticMixedFormatSourceEnumerator.class);
  private final SplitAssigner assigner;
  private final MixedFormatTableLoader loader;
  // Lazily loaded in start(); transient since enumerators are not serialized.
  private transient KeyedTable keyedTable;
  private final MixedFormatScanContext scanContext;
  private final boolean shouldEnumerate;
  private final ContinuousSplitPlanner splitPlanner;

  public StaticMixedFormatSourceEnumerator(
      SplitEnumeratorContext<MixedFormatSplit> enumeratorContext,
      SplitAssigner assigner,
      MixedFormatTableLoader loader,
      MixedFormatScanContext scanContext,
      @Nullable MixedFormatSourceEnumState enumState) {
    super(enumeratorContext, assigner);
    this.loader = loader;
    this.assigner = assigner;
    this.scanContext = scanContext;
    // split enumeration is not needed during a restore scenario
    this.shouldEnumerate = enumState == null;
    this.splitPlanner = new MergeOnReadPlannerImpl(loader);
  }

  @Override
  public void start() {
    super.start();
    if (keyedTable == null) {
      keyedTable = loadMixedTable(loader).asKeyedTable();
    }
    if (shouldEnumerate) {
      // Refresh both stores so the one-shot plan sees the latest committed snapshots.
      keyedTable.baseTable().refresh();
      keyedTable.changeTable().refresh();
      Collection<MixedFormatSplit> splits =
          splitPlanner.planSplits(null, scanContext.filters()).splits();
      assigner.onDiscoveredSplits(splits);
      LOG.info(
          "Discovered {} splits from table {} during job initialization",
          splits.size(),
          keyedTable.name());
    }
  }

  @Override
  protected boolean shouldWaitForMoreSplits() {
    // Bounded scan: once the assigner is empty there will be no further splits.
    return false;
  }

  @Override
  public MixedFormatSourceEnumState snapshotState(long checkpointId) throws Exception {
    return new MixedFormatSourceEnumState(assigner.state(), null, null, null);
  }
}
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ArrayBatchRecords.java new file mode 100644 index 0000000000..ad51b2da93 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ArrayBatchRecords.java @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.reader; + +import org.apache.amoro.flink.read.source.DataIterator; +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.file.src.util.Pool; +import org.apache.flink.table.data.RowData; + +import javax.annotation.Nullable; + +import java.util.Collections; +import java.util.Set; + +/** + * {@link RecordsWithSplitIds} is used to pass a batch of records from fetcher to source reader. + * Batching is to improve the efficiency for records handover. + * + *

{@link RecordsWithSplitIds} interface can encapsulate batches from multiple splits. This is + * the case for Kafka source where fetchers can retrieve records from multiple Kafka partitions at + * the same time. + * + *

For file-based sources like Iceberg, readers always read one split/file at a time. Hence, we + * will only have a batch of records for one split here. + * + *

This class uses array to store a batch of records from the same file (with the same + * fileOffset). + */ +class ArrayBatchRecords implements RecordsWithSplitIds> { + @Nullable private String splitId; + @Nullable private final Pool.Recycler recycler; + @Nullable private final T[] records; + private final int numberOfRecords; + private final Set finishedSplits; + private final MixedFormatRecordWithOffset recordWithOffset; + + // point to current read position within the records array + private int position; + + private RecordPosition[] recordPositions; + + private ArrayBatchRecords( + @Nullable String splitId, + @Nullable Pool.Recycler recycler, + @Nullable T[] records, + int numberOfRecords, + int fileOffset, + long startingRecordOffset, + Set finishedSplits) { + Preconditions.checkArgument(numberOfRecords >= 0, "numberOfRecords can't be negative"); + Preconditions.checkArgument(fileOffset >= 0, "fileOffset can't be negative"); + Preconditions.checkArgument(startingRecordOffset >= 0, "numberOfRecords can't be negative"); + + this.splitId = splitId; + this.recycler = recycler; + this.records = records; + this.numberOfRecords = numberOfRecords; + this.finishedSplits = + Preconditions.checkNotNull(finishedSplits, "finishedSplits can be empty but not null"); + this.recordWithOffset = new MixedFormatRecordWithOffset<>(); + + this.position = 0; + } + + private ArrayBatchRecords( + @Nullable String splitId, + @Nullable Pool.Recycler recycler, + @Nullable T[] records, + int numberOfRecords, + RecordPosition[] positions, + Set finishedSplits) { + Preconditions.checkArgument(numberOfRecords >= 0, "numberOfRecords can't be negative"); + + this.splitId = splitId; + this.recycler = recycler; + this.records = records; + this.numberOfRecords = numberOfRecords; + this.recordPositions = Preconditions.checkNotNull(positions, "recordPositions can't be null"); + this.finishedSplits = + Preconditions.checkNotNull(finishedSplits, "finishedSplits can be empty but not null"); + 
this.recordWithOffset = new MixedFormatRecordWithOffset<>(); + + this.position = 0; + } + + @Nullable + @Override + public String nextSplit() { + String nextSplit = this.splitId; + // set the splitId to null to indicate no more splits + // this class only contains record for one split + this.splitId = null; + return nextSplit; + } + + @Nullable + @Override + public MixedFormatRecordWithOffset nextRecordFromSplit() { + if (position < numberOfRecords) { + setRecordWithOffset(); + position++; + return recordWithOffset; + } else { + return null; + } + } + + private void setRecordWithOffset() { + assert records != null; + assert recordPositions[position] != null; + RecordPosition offset = recordPositions[position]; + Preconditions.checkArgument( + offset.currentInsertFileOffset() >= 0 || offset.currentDeleteFileOffset() >= 0, + "fileOffset can't be negative"); + Preconditions.checkArgument( + offset.currentInsertRecordOffset() >= 0, "numberOfRecords can't be negative"); + Preconditions.checkArgument( + offset.currentDeleteRecordOffset() >= 0, "numberOfRecords can't be negative"); + recordWithOffset.set( + records[position], + offset.currentInsertFileOffset(), + offset.currentInsertRecordOffset(), + offset.currentDeleteFileOffset(), + offset.currentDeleteRecordOffset()); + } + + /** + * This method is called when all records from this batch has been emitted. If recycler is set, it + * should be called to return the records array back to pool. + */ + @Override + public void recycle() { + if (recycler != null) { + recycler.recycle(records); + } + } + + @Override + public Set finishedSplits() { + return finishedSplits; + } + + @VisibleForTesting + T[] records() { + return records; + } + + @VisibleForTesting + int numberOfRecords() { + return numberOfRecords; + } + + /** + * Create a ArrayBatchRecords backed up an array with records from the same file + * + * @param splitId Iceberg source only read from one split a time. 
We never have multiple records + * from multiple splits. + * @param recycler Because {@link DataIterator} with {@link RowData} returns an iterator of reused + * RowData object, we need to clone RowData eagerly when constructing a batch of records. We + * can use object pool to reuse the RowData array object which can be expensive to create. + * This recycler can be provided to recycle the array object back to pool after read is + * exhausted. If the {@link DataIterator} returns an iterator of non-reused objects, we don't + * need to clone objects. It is cheap to just create the batch array. Hence, we don't need + * object pool and recycler can be set to null. + * @param records an array (maybe reused) holding a batch of records + * @param numberOfRecords actual number of records in the array + * @param positions fileOffset and recordOffset for all records in this batch + * @param record type + */ + public static RecordsWithSplitIds> forRecords( + String splitId, + Pool.Recycler recycler, + T[] records, + int numberOfRecords, + RecordPosition[] positions) { + return new ArrayBatchRecords<>( + splitId, recycler, records, numberOfRecords, positions, Collections.emptySet()); + } + + /** + * Create ab ArrayBatchRecords with only finished split id + * + * @param splitId for the split that is just exhausted + */ + public static ArrayBatchRecords finishedSplit(String splitId) { + return new ArrayBatchRecords<>(null, null, null, 0, 0, 0, Collections.singleton(splitId)); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ArrayPoolDataIteratorBatcher.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ArrayPoolDataIteratorBatcher.java new file mode 100644 index 0000000000..75bcb1abe7 --- /dev/null +++ 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ArrayPoolDataIteratorBatcher.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.reader; + +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SOURCE_READER_FETCH_BATCH_RECORD_COUNT; + +import org.apache.amoro.flink.read.source.DataIterator; +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.SourceReaderOptions; +import org.apache.flink.connector.file.src.util.Pool; +import org.apache.iceberg.io.CloseableIterator; + +import java.io.IOException; +import java.util.NoSuchElementException; + +/** This implementation stores record batch in array from recyclable pool */ +class ArrayPoolDataIteratorBatcher implements DataIteratorBatcher { + private final int batchSize; + private final int handoverQueueSize; + private final RecordFactory recordFactory; + + private transient Pool pool; + + 
ArrayPoolDataIteratorBatcher(ReadableConfig config, RecordFactory recordFactory) { + this.batchSize = config.get(SOURCE_READER_FETCH_BATCH_RECORD_COUNT); + this.handoverQueueSize = config.get(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY); + this.recordFactory = recordFactory; + } + + @Override + public CloseableIterator>> batch( + String splitId, DataIterator inputIterator) { + Preconditions.checkArgument(inputIterator != null, "Input data iterator can't be null"); + // lazily create pool as it is not serializable + if (pool == null) { + this.pool = createPoolOfBatches(handoverQueueSize); + } + return new ArrayPoolBatchIterator(splitId, inputIterator, pool); + } + + private Pool createPoolOfBatches(int numBatches) { + Pool poolOfBatches = new Pool<>(numBatches); + for (int batchId = 0; batchId < numBatches; batchId++) { + T[] batch = recordFactory.createBatch(batchSize); + poolOfBatches.add(batch); + } + + return poolOfBatches; + } + + private class ArrayPoolBatchIterator + implements CloseableIterator>> { + + private final String splitId; + private final DataIterator inputIterator; + private final Pool pool; + + ArrayPoolBatchIterator(String splitId, DataIterator inputIterator, Pool pool) { + this.splitId = splitId; + this.inputIterator = inputIterator; + this.pool = pool; + } + + @Override + public boolean hasNext() { + return inputIterator.hasNext(); + } + + @Override + public RecordsWithSplitIds> next() { + if (!inputIterator.hasNext()) { + throw new NoSuchElementException(); + } + + T[] batch = getCachedEntry(); + int recordCount = 0; + + RecordPosition[] positions = initPositionArray(); + while (inputIterator.hasNext() && recordCount < batchSize) { + // The record produced by inputIterator can be reused like for the + // MixedFormatRecordWithOffset + // case. + // inputIterator.next() can't be called again until the copy is made + // since the record is not consumed immediately. 
+ T nextRecord = inputIterator.next(); + recordFactory.clone(nextRecord, batch, recordCount); + positions[recordCount].set(inputIterator); + recordCount++; + if (!inputIterator.currentFileHasNext()) { + // break early so that records in the ArrayResultIterator + // have the same fileOffset. + break; + } + } + return ArrayBatchRecords.forRecords(splitId, pool.recycler(), batch, recordCount, positions); + } + + @Override + public void close() throws IOException { + inputIterator.close(); + } + + private T[] getCachedEntry() { + try { + return pool.pollEntry(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while waiting for array pool entry", e); + } + } + + private RecordPosition[] initPositionArray() { + RecordPosition[] positions = new RecordPosition[batchSize]; + for (int i = 0; i < batchSize; i++) { + positions[i] = new RecordPosition(); + } + return positions; + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/DataIteratorBatcher.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/DataIteratorBatcher.java new file mode 100644 index 0000000000..1119f6a67d --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/DataIteratorBatcher.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.reader; + +import org.apache.amoro.flink.read.source.DataIterator; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; +import org.apache.iceberg.io.CloseableIterator; + +import java.io.Serializable; + +/** + * Batcher converts iterator of T into iterator of batched {@code + * RecordsWithSplitIds>}, as FLIP-27's {@link SplitReader#fetch()} returns + * batched records. + */ +@FunctionalInterface +public interface DataIteratorBatcher extends Serializable { + CloseableIterator>> batch( + String splitId, DataIterator inputIterator); +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/DataIteratorReaderFunction.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/DataIteratorReaderFunction.java new file mode 100644 index 0000000000..273d2b539f --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/DataIteratorReaderFunction.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.reader; + +import org.apache.amoro.flink.read.hybrid.split.ChangelogSplit; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.source.ChangeLogDataIterator; +import org.apache.amoro.flink.read.source.DataIterator; +import org.apache.amoro.flink.read.source.MergeOnReadDataIterator; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.iceberg.io.CloseableIterator; + +/** A {@link ReaderFunction} implementation that uses {@link DataIterator}. 
*/ +public abstract class DataIteratorReaderFunction implements ReaderFunction { + private final DataIteratorBatcher batcher; + + public DataIteratorReaderFunction(DataIteratorBatcher batcher) { + this.batcher = batcher; + } + + public abstract DataIterator createDataIterator(MixedFormatSplit split); + + @Override + public CloseableIterator>> apply( + MixedFormatSplit split) { + DataIterator inputIterator = createDataIterator(split); + if (inputIterator instanceof MergeOnReadDataIterator) { + inputIterator.seek(0, split.asMergeOnReadSplit().recordOffset()); + } else if (inputIterator instanceof ChangeLogDataIterator) { + ChangeLogDataIterator changelogInputIterator = (ChangeLogDataIterator) inputIterator; + ChangelogSplit changelogSplit = split.asChangelogSplit(); + changelogInputIterator.seek( + changelogSplit.insertFileOffset(), + changelogSplit.deleteFileOffset(), + changelogSplit.insertRecordOffset(), + changelogSplit.deleteRecordOffset()); + } else { + inputIterator.seek( + split.asSnapshotSplit().insertFileOffset(), split.asSnapshotSplit().insertRecordOffset()); + } + return batcher.batch(split.splitId(), inputIterator); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/HybridSplitReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/HybridSplitReader.java new file mode 100644 index 0000000000..1b0073e1a5 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/HybridSplitReader.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.reader; + +import org.apache.amoro.flink.read.hybrid.split.ChangelogSplit; +import org.apache.amoro.flink.read.hybrid.split.MergeOnReadSplit; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.hybrid.split.SnapshotSplit; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.connector.base.source.reader.RecordsBySplits; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; +import org.apache.flink.connector.base.source.reader.splitreader.SplitsAddition; +import org.apache.flink.connector.base.source.reader.splitreader.SplitsChange; +import org.apache.iceberg.io.CloseableIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.ArrayDeque; +import java.util.Collections; +import java.util.Queue; + +/** + * A hybrid source split reader that could read {@link SnapshotSplit} and {@link ChangelogSplit}. 
+ */ +public class HybridSplitReader + implements SplitReader, MixedFormatSplit> { + private static final Logger LOG = LoggerFactory.getLogger(HybridSplitReader.class); + + private final ReaderFunction openSplitFunction; + private final int indexOfSubtask; + private final Queue splits; + + private CloseableIterator>> currentReader; + private String currentSplitId; + + public HybridSplitReader(ReaderFunction openSplitFunction, SourceReaderContext context) { + this.openSplitFunction = openSplitFunction; + this.indexOfSubtask = context.getIndexOfSubtask(); + this.splits = new ArrayDeque<>(); + } + + @Override + public RecordsWithSplitIds> fetch() throws IOException { + if (currentReader == null) { + if (splits.isEmpty()) { + return new RecordsBySplits<>(Collections.emptyMap(), Collections.emptySet()); + } + MixedFormatSplit mixedFormatSplit = splits.poll(); + currentReader = openSplitFunction.apply(mixedFormatSplit); + currentSplitId = mixedFormatSplit.splitId(); + } + if (currentReader.hasNext()) { + // Because Iterator#next() doesn't support checked exception, + // we need to wrap and unwrap the checked IOException with UncheckedIOException + try { + return currentReader.next(); + } catch (UncheckedIOException e) { + throw e.getCause(); + } + } else { + return finishSplit(); + } + } + + @Override + public void handleSplitsChanges(SplitsChange splitsChange) { + if (!(splitsChange instanceof SplitsAddition)) { + throw new UnsupportedOperationException( + String.format("The SplitChange type of %s is not supported.", splitsChange.getClass())); + } + LOG.info("Handling a split change {}.", splitsChange); + + splitsChange + .splits() + .forEach( + mixedFormatSplit -> { + if (mixedFormatSplit instanceof SnapshotSplit + || mixedFormatSplit instanceof ChangelogSplit + || mixedFormatSplit instanceof MergeOnReadSplit) { + splits.add(mixedFormatSplit); + } else { + throw new IllegalArgumentException( + String.format( + "As of now, The %s of SourceSplit type is unsupported, 
available source splits are %s, %s.", + mixedFormatSplit.getClass().getSimpleName(), + SnapshotSplit.class.getSimpleName(), + ChangelogSplit.class.getSimpleName())); + } + }); + } + + @Override + public void wakeUp() {} + + @Override + public void close() throws Exception { + currentSplitId = null; + if (currentReader != null) { + currentReader.close(); + } + } + + private RecordsWithSplitIds> finishSplit() throws IOException { + if (currentReader != null) { + currentReader.close(); + currentReader = null; + } + ArrayBatchRecords finishRecords = ArrayBatchRecords.finishedSplit(currentSplitId); + LOG.info("Split reader {} finished split: {}", indexOfSubtask, currentSplitId); + currentSplitId = null; + return finishRecords; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatRecordEmitter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatRecordEmitter.java new file mode 100644 index 0000000000..1c29b51874 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatRecordEmitter.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.reader; + +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; +import org.apache.flink.api.connector.source.SourceOutput; +import org.apache.flink.connector.base.source.reader.RecordEmitter; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.data.utils.JoinedRowData; +import org.apache.flink.util.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Emitter that emit {@link T} to the next flink operator and update the record offset of {@link T} + * into split state. + */ +public class MixedFormatRecordEmitter + implements RecordEmitter, T, MixedFormatSplitState> { + + public static final Logger LOGGER = LoggerFactory.getLogger(MixedFormatRecordEmitter.class); + + /** It signifies whether the Long.MIN_VALUE need to be set into RowData. 
*/ + public boolean populateRowTime; + + public MixedFormatRecordEmitter(boolean populateRowTime) { + this.populateRowTime = populateRowTime; + } + + @Override + public void emitRecord( + MixedFormatRecordWithOffset element, + SourceOutput sourceOutput, + MixedFormatSplitState split) + throws Exception { + T record = element.record(); + if (!populateRowTime) { + sourceOutput.collect(record); + } else { + Preconditions.checkArgument( + record instanceof RowData, + "Custom watermark strategy doesn't support %s, except RowData for now.", + record.getClass()); + RowData rowData = + new JoinedRowData( + (RowData) record, GenericRowData.of(TimestampData.fromEpochMillis(Long.MIN_VALUE))); + rowData.setRowKind(((RowData) record).getRowKind()); + sourceOutput.collect((T) rowData); + } + split.updateOffset( + new Object[] { + element.insertFileOffset(), + element.insertRecordOffset(), + element.deleteFileOffset(), + element.deleteRecordOffset() + }); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatRecordWithOffset.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatRecordWithOffset.java new file mode 100644 index 0000000000..dff5dbe3e8 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatRecordWithOffset.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.reader; + +/** A record along with the reader position to be stored in the checkpoint. */ +public class MixedFormatRecordWithOffset { + private T record; + + private int insertFileOffset; + private long insertRecordOffset; + private int deleteFileOffset; + private long deleteRecordOffset; + + public T record() { + return record; + } + + public void record(T record) { + this.record = record; + } + + public int insertFileOffset() { + return insertFileOffset; + } + + public long insertRecordOffset() { + return insertRecordOffset; + } + + public int deleteFileOffset() { + return deleteFileOffset; + } + + public long deleteRecordOffset() { + return deleteRecordOffset; + } + + public void set( + T newRecord, + int insertFileOffset, + long insertRecordOffset, + int deleteFileOffset, + long deleteRecordOffset) { + this.record = newRecord; + this.insertFileOffset = insertFileOffset; + this.deleteFileOffset = deleteFileOffset; + this.insertRecordOffset = insertRecordOffset; + this.deleteRecordOffset = deleteRecordOffset; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatSourceReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatSourceReader.java new file mode 100644 index 0000000000..caa7b6f837 --- /dev/null +++ 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatSourceReader.java @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.reader; + +import org.apache.amoro.flink.read.MixedFormatSource; +import org.apache.amoro.flink.read.hybrid.enumerator.InitializationFinishedEvent; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; +import org.apache.amoro.flink.read.hybrid.split.SplitRequestEvent; +import org.apache.amoro.flink.util.FlinkClassReflectionUtil; +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; +import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; +import org.apache.flink.api.common.eventtime.Watermark; +import org.apache.flink.api.common.eventtime.WatermarkOutputMultiplexer; +import org.apache.flink.api.connector.source.ReaderOutput; +import org.apache.flink.api.connector.source.SourceEvent; +import org.apache.flink.api.connector.source.SourceOutput; +import org.apache.flink.api.connector.source.SourceReaderContext; +import 
org.apache.flink.configuration.Configuration; +import org.apache.flink.connector.base.source.reader.SingleThreadMultiplexSourceReaderBase; +import org.apache.flink.core.io.InputStatus; +import org.apache.flink.streaming.api.operators.source.ProgressiveTimestampsAndWatermarks; +import org.apache.flink.streaming.api.operators.source.SourceOutputWithWatermarks; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collection; +import java.util.Collections; +import java.util.Map; + +/** + * Mixed-format source reader that is created by a {@link + * MixedFormatSource#createReader(SourceReaderContext)}. + */ +public class MixedFormatSourceReader + extends SingleThreadMultiplexSourceReaderBase< + MixedFormatRecordWithOffset, T, MixedFormatSplit, MixedFormatSplitState> { + + public static final Logger LOGGER = LoggerFactory.getLogger(MixedFormatSourceReader.class); + + public ReaderOutput output; + /** SourceEvents may be received before this#pollNext. */ + private volatile boolean maxWatermarkToBeEmitted = false; + + public MixedFormatSourceReader( + ReaderFunction readerFunction, + Configuration config, + SourceReaderContext context, + boolean populateRowTime) { + super( + () -> new HybridSplitReader<>(readerFunction, context), + new MixedFormatRecordEmitter(populateRowTime), + config, + context); + } + + @Override + public void start() { + // We request a split only if we did not get splits during the checkpoint restore. + // Otherwise, reader restarts will keep requesting more and more splits. 
+ if (getNumberOfCurrentlyAssignedSplits() == 0) { + requestSplit(Collections.emptyList()); + } + context.sendSourceEventToCoordinator(ReaderStartedEvent.INSTANCE); + } + + @Override + protected void onSplitFinished(Map finishedSplitIds) { + requestSplit(Lists.newArrayList(finishedSplitIds.keySet())); + } + + @Override + protected MixedFormatSplitState initializedState(MixedFormatSplit split) { + return new MixedFormatSplitState(split); + } + + @Override + protected MixedFormatSplit toSplitType(String splitId, MixedFormatSplitState splitState) { + return splitState.toSourceSplit(); + } + + private void requestSplit(Collection finishedSplitIds) { + context.sendSourceEventToCoordinator(new SplitRequestEvent(finishedSplitIds)); + } + + @Override + public void handleSourceEvents(SourceEvent sourceEvent) { + if (!(sourceEvent instanceof InitializationFinishedEvent)) { + return; + } + LOGGER.info("receive InitializationFinishedEvent"); + maxWatermarkToBeEmitted = true; + emitWatermarkIfNeeded(); + } + + private void emitWatermarkIfNeeded() { + if (this.output == null || !maxWatermarkToBeEmitted) { + return; + } + LOGGER.info("emit watermark"); + output.emitWatermark(new Watermark(Long.MAX_VALUE)); + maxWatermarkToBeEmitted = false; + } + + @Override + public InputStatus pollNext(ReaderOutput output) throws Exception { + this.output = output; + emitWatermarkIfNeeded(); + return super.pollNext(wrapOutput(output)); + } + + public ReaderOutput wrapOutput(ReaderOutput output) { + if (!(output instanceof SourceOutputWithWatermarks)) { + return output; + } + return new MixedFormatReaderOutput<>(output); + } + + /** + * There is a case that the watermark in {@link WatermarkOutputMultiplexer.OutputState} has been + * updated, but watermark has not been emitted for that when {@link + * WatermarkOutputMultiplexer#onPeriodicEmit} called, the outputState has been removed by {@link + * WatermarkOutputMultiplexer#unregisterOutput(String)} after split finished. 
Wrap {@link + * ReaderOutput} to call {@link + * ProgressiveTimestampsAndWatermarks.SplitLocalOutputs#emitPeriodicWatermark()} when split + * finishes. + */ + static class MixedFormatReaderOutput implements ReaderOutput { + + private final ReaderOutput internal; + + public MixedFormatReaderOutput(ReaderOutput readerOutput) { + Preconditions.checkArgument( + readerOutput instanceof SourceOutputWithWatermarks, + "readerOutput should be SourceOutputWithWatermarks, but was %s", + readerOutput.getClass()); + this.internal = readerOutput; + } + + @Override + public void collect(T record) { + internal.collect(record); + } + + @Override + public void collect(T record, long timestamp) { + internal.collect(record, timestamp); + } + + @Override + public void emitWatermark(Watermark watermark) { + internal.emitWatermark(watermark); + } + + @Override + public void markIdle() { + internal.markIdle(); + } + + @Override + public void markActive() { + internal.markActive(); + } + + @Override + public SourceOutput createOutputForSplit(String splitId) { + return internal.createOutputForSplit(splitId); + } + + @Override + public void releaseOutputForSplit(String splitId) { + Object splitLocalOutput = FlinkClassReflectionUtil.getSplitLocalOutput(internal); + FlinkClassReflectionUtil.emitPeriodWatermark(splitLocalOutput); + internal.releaseOutputForSplit(splitId); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ReaderFunction.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ReaderFunction.java new file mode 100644 index 0000000000..23266f9391 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ReaderFunction.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.reader; + +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.iceberg.io.CloseableIterator; + +import java.io.Serializable; +import java.util.function.Function; + +/** + * This function that accepts one {@link MixedFormatSplit} and produces an iterator of {@link + * MixedFormatRecordWithOffset }. + */ +@FunctionalInterface +public interface ReaderFunction + extends Serializable, + Function< + MixedFormatSplit, + CloseableIterator>>> {} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ReaderStartedEvent.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ReaderStartedEvent.java new file mode 100644 index 0000000000..baa3e41527 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ReaderStartedEvent.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.reader; + +import org.apache.flink.api.connector.source.SourceEvent; + +/** It denotes {@link MixedFormatSourceReader} is starting. */ +public class ReaderStartedEvent implements SourceEvent { + private static final long serialVersionUID = 1L; + + public static final ReaderStartedEvent INSTANCE = new ReaderStartedEvent(); +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RecordFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RecordFactory.java new file mode 100644 index 0000000000..44b8c2c93a --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RecordFactory.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
/**
 * Pool-friendly record factory for the FLIP-27 source path.
 *
 * <p>{@code SplitReader#fetch()} returns records in batches, while the underlying DataIterator
 * for RowData hands out reused objects. Implementations therefore (1) allocate batch arrays that
 * can be recycled via an object pool, and (2) deep-copy each reused element from the iterator
 * into a slot of the batch array.
 *
 * @param <T> the record type held by the batch
 */
interface RecordFactory<T> extends Serializable {

  /** Allocates a recyclable batch array with {@code batchSize} slots. */
  T[] createBatch(int batchSize);

  /** Copies {@code from} into {@code batch[position]}, cloning as needed so the source can be reused. */
  void clone(T from, T[] batch, int position);
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.reader; + +import org.apache.amoro.flink.read.source.ChangeLogDataIterator; +import org.apache.amoro.flink.read.source.DataIterator; + +/** This class contains the file offset and record offset with actual record. */ +public class RecordPosition { + private int currentInsertFileOffset; + private int currentDeleteFileOffset; + private long currentInsertRecordOffset; + private long currentDeleteRecordOffset; + + public RecordPosition() {} + + void set(DataIterator dataIterator) { + if (dataIterator instanceof ChangeLogDataIterator) { + ChangeLogDataIterator changelog = (ChangeLogDataIterator) dataIterator; + currentInsertFileOffset = changelog.insertFileOffset(); + currentInsertRecordOffset = changelog.insertRecordOffset(); + currentDeleteFileOffset = changelog.deleteFileOffset(); + currentDeleteRecordOffset = changelog.deleteRecordOffset(); + } else { + currentInsertFileOffset = dataIterator.fileOffset(); + currentInsertRecordOffset = dataIterator.recordOffset(); + } + } + + public int currentInsertFileOffset() { + return currentInsertFileOffset; + } + + public int currentDeleteFileOffset() { + return currentDeleteFileOffset; + } + + public long currentInsertRecordOffset() { + return currentInsertRecordOffset; + } + + public long currentDeleteRecordOffset() { + return currentDeleteRecordOffset; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataReaderFunction.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataReaderFunction.java new file mode 100644 index 0000000000..ce49544e0d --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataReaderFunction.java @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read.hybrid.reader; + +import static org.apache.amoro.flink.shuffle.RowKindUtil.convertToFlinkRowKind; +import static org.apache.amoro.utils.SchemaUtil.changeWriteSchema; +import static org.apache.amoro.utils.SchemaUtil.fillUpIdentifierFields; + +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.source.ChangeLogDataIterator; +import org.apache.amoro.flink.read.source.DataIterator; +import org.apache.amoro.flink.read.source.FileScanTaskReader; +import org.apache.amoro.flink.read.source.FlinkKeyedMORDataReader; +import org.apache.amoro.flink.read.source.FlinkUnkyedDataReader; +import org.apache.amoro.flink.read.source.MergeOnReadDataIterator; +import org.apache.amoro.flink.util.MixedFormatUtils; +import org.apache.amoro.io.AuthenticatedFileIO; +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; +import org.apache.amoro.table.PrimaryKeySpec; +import org.apache.amoro.utils.NodeFilter; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.data.RowDataUtil; + +import java.util.Collections; + +/** + * This Function accept a {@link MixedFormatSplit} and produces an {@link DataIterator} of {@link + * RowData}. 
/**
 * Accepts a {@link MixedFormatSplit} and produces a {@link DataIterator} of Flink {@link RowData},
 * choosing a merge-on-read, snapshot, or changelog reader according to the split type.
 */
public class RowDataReaderFunction extends DataIteratorReaderFunction<RowData> {
  private static final long serialVersionUID = 1446614576495721883L;
  private final Schema tableSchema;
  // Read schema; may contain primary-key fields added on top of the projection
  // (see fillUpReadSchema), which are stripped again by removeMixedFormatMetaColumn.
  private final Schema readSchema;
  private final String nameMapping;
  private final boolean caseSensitive;
  private final AuthenticatedFileIO io;
  private final PrimaryKeySpec primaryKeySpec;
  /** The accurate selected columns size if the mixed-format source projected */
  private final int columnSize;
  /**
   * The index of the mixed-format file offset field in the read schema Refer to {@link
   * this#wrapFileOffsetColumnMeta}
   */
  private final int fileOffsetIndex;

  // Whether readers may reuse the same RowData instance across records.
  private final boolean reuse;

  /** Convenience constructor defaulting {@code reuse} to {@code false}. */
  public RowDataReaderFunction(
      ReadableConfig config,
      Schema tableSchema,
      Schema projectedSchema,
      PrimaryKeySpec primaryKeySpec,
      String nameMapping,
      boolean caseSensitive,
      AuthenticatedFileIO io) {
    this(
        config,
        tableSchema,
        projectedSchema,
        primaryKeySpec,
        nameMapping,
        caseSensitive,
        io,
        false);
  }

  /**
   * @param config Flink read configuration passed to the batcher
   * @param tableSchema full table schema; must not be null
   * @param projectedSchema projection, or null to read the whole table
   * @param primaryKeySpec primary-key spec used to fill identifier fields into the projection
   * @param nameMapping Iceberg name-mapping string, may be null
   * @param caseSensitive whether column lookup is case-sensitive
   * @param io authenticated file IO used to open data files
   * @param reuse whether to reuse RowData objects between records
   */
  public RowDataReaderFunction(
      ReadableConfig config,
      Schema tableSchema,
      Schema projectedSchema,
      PrimaryKeySpec primaryKeySpec,
      String nameMapping,
      boolean caseSensitive,
      AuthenticatedFileIO io,
      boolean reuse) {
    // The batcher's record arity follows the plain projected schema (not the filled-up read
    // schema): emitted rows are trimmed back to columnSize via removeMixedFormatMetaColumn.
    super(
        new ArrayPoolDataIteratorBatcher<>(
            config,
            new RowDataRecordFactory(
                FlinkSchemaUtil.convert(readSchema(tableSchema, projectedSchema)))));
    this.tableSchema = tableSchema;
    this.readSchema = fillUpReadSchema(tableSchema, projectedSchema, primaryKeySpec);
    this.primaryKeySpec = primaryKeySpec;
    this.nameMapping = nameMapping;
    this.caseSensitive = caseSensitive;
    this.io = io;
    // Add file offset column after readSchema.
    this.fileOffsetIndex = readSchema.columns().size();
    this.columnSize =
        projectedSchema == null ? readSchema.columns().size() : projectedSchema.columns().size();
    this.reuse = reuse;
  }

  /**
   * Builds the iterator matching the split type: merge-on-read, snapshot (insert files only), or
   * changelog (insert + delete files with row-kind transformation).
   *
   * @throws IllegalArgumentException for unknown split types
   */
  @Override
  public DataIterator<RowData> createDataIterator(MixedFormatSplit split) {
    if (split.isMergeOnReadSplit()) {
      FlinkKeyedMORDataReader morDataReader =
          new FlinkKeyedMORDataReader(
              io,
              tableSchema,
              readSchema,
              primaryKeySpec,
              nameMapping,
              caseSensitive,
              RowDataUtil::convertConstant,
              reuse);
      return new MergeOnReadDataIterator(
          morDataReader, split.asMergeOnReadSplit().keyedTableScanTask(), io);
    } else if (split.isSnapshotSplit()) {
      FileScanTaskReader<RowData> rowDataReader =
          new FlinkUnkyedDataReader(
              io,
              tableSchema,
              readSchema,
              primaryKeySpec,
              nameMapping,
              caseSensitive,
              RowDataUtil::convertConstant,
              Collections.singleton(split.dataTreeNode()),
              reuse);
      // Snapshot reads have no per-record file offset; Long.MIN_VALUE is a placeholder.
      return new DataIterator<>(
          rowDataReader,
          split.asSnapshotSplit().insertTasks(),
          rowData -> Long.MIN_VALUE,
          this::removeMixedFormatMetaColumn);
    } else if (split.isChangelogSplit()) {
      // Changelog reads append a file-offset meta column to both schemas so the iterator can
      // merge insert and delete streams in file order.
      FileScanTaskReader<RowData> rowDataReader =
          new FlinkUnkyedDataReader(
              io,
              wrapFileOffsetColumnMeta(tableSchema),
              wrapFileOffsetColumnMeta(readSchema),
              primaryKeySpec,
              nameMapping,
              caseSensitive,
              RowDataUtil::convertConstant,
              Collections.singleton(split.dataTreeNode()),
              reuse);
      return new ChangeLogDataIterator<>(
          rowDataReader,
          split.asChangelogSplit().insertTasks(),
          split.asChangelogSplit().deleteTasks(),
          this::mixedFormatFileOffset,
          this::removeMixedFormatMetaColumn,
          this::transformRowKind);
    } else {
      throw new IllegalArgumentException(
          String.format(
              "As of now this split %s is not supported.", split.getClass().getSimpleName()));
    }
  }

  /** Appends the file-offset meta column to {@code schema} (changelog reads only). */
  private Schema wrapFileOffsetColumnMeta(Schema schema) {
    return changeWriteSchema(schema);
  }

  /** Reads the appended file-offset meta column from a changelog row. */
  long mixedFormatFileOffset(RowData rowData) {
    return rowData.getLong(fileOffsetIndex);
  }

  /**
   * @param rowData It may have more columns than readSchema. Refer to {@link
   *     FlinkUnkyedDataReader}'s annotation.
   */
  RowData removeMixedFormatMetaColumn(RowData rowData) {
    return MixedFormatUtils.removeMixedFormatMetaColumn(rowData, columnSize);
  }

  /** Maps the mixed-format change action onto the Flink {@code RowKind} of the row. */
  RowData transformRowKind(ChangeLogDataIterator.ChangeActionTrans<RowData> trans) {
    RowData rowData = trans.row();
    rowData.setRowKind(convertToFlinkRowKind(trans.changeAction()));
    return rowData;
  }

  /**
   * If the projected schema is not null, this method will check and fill up the identifierFields of
   * the tableSchema and the projected schema.
   *
   * <p>projectedSchema may not include the primary keys, but the {@link NodeFilter} must filter the
   * record with the value of the primary keys. So the mixed-format reader function schema must
   * include the primary keys.
   *
   * @param tableSchema table schema
   * @param projectedSchema projected schema
   * @return a new Schema on which includes the identifier fields.
   */
  private static Schema fillUpReadSchema(
      Schema tableSchema, Schema projectedSchema, PrimaryKeySpec primaryKeySpec) {
    Preconditions.checkNotNull(tableSchema, "Table schema can't be null");
    return projectedSchema == null
        ? tableSchema
        : fillUpIdentifierFields(tableSchema, projectedSchema, primaryKeySpec);
  }

  /** The plain read schema: the projection as-is, or the full table schema when not projected. */
  private static Schema readSchema(Schema tableSchema, Schema projectedSchema) {
    Preconditions.checkNotNull(tableSchema, "Table schema can't be null");
    return projectedSchema == null ? tableSchema : projectedSchema;
  }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.reader; + +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.InternalSerializers; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.flink.data.RowDataUtil; + +/** A factory create a batch of empty {@link RowData}s. */ +class RowDataRecordFactory implements RecordFactory { + private final RowType rowType; + private final TypeSerializer[] fieldSerializers; + private final RowData.FieldGetter[] fieldGetters; + + RowDataRecordFactory(RowType rowType) { + this.rowType = rowType; + this.fieldSerializers = createFieldSerializers(rowType); + this.fieldGetters = createFieldGetters(rowType); + } + + static TypeSerializer[] createFieldSerializers(RowType rowType) { + return rowType.getChildren().stream() + .map(InternalSerializers::create) + .toArray(TypeSerializer[]::new); + } + + static RowData.FieldGetter[] createFieldGetters(RowType rowType) { + RowData.FieldGetter[] getters = new RowData.FieldGetter[rowType.getFieldCount()]; + for (int i = 0; i < rowType.getFieldCount(); i++) { + getters[i] = RowData.createFieldGetter(rowType.getTypeAt(i), i); + } + return getters; + } + + @Override + public RowData[] createBatch(int batchSize) { + RowData[] arr = new RowData[batchSize]; + for (int i = 0; i < batchSize; ++i) { + arr[i] = new GenericRowData(rowType.getFieldCount()); + } + return arr; + } + + @Override + public void 
clone(RowData from, RowData[] batch, int position) { + // Set the return value from RowDataUtil.clone back to the array. + // Clone method returns same clone target object (reused) if it is a GenericRowData. + // Clone method will allocate a new GenericRowData object + // if the target object is NOT a GenericRowData. + // So we should always set the clone return value back to the array. + batch[position] = + RowDataUtil.clone(from, batch[position], rowType, fieldSerializers, fieldGetters); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/ChangelogSplit.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/ChangelogSplit.java new file mode 100644 index 0000000000..2b3129ef8b --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/ChangelogSplit.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read.hybrid.split; + +import org.apache.amoro.data.DataTreeNode; +import org.apache.amoro.data.PrimaryKeyedFile; +import org.apache.amoro.scan.MixedFileScanTask; +import org.apache.amoro.shade.guava32.com.google.common.base.MoreObjects; +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; +import org.apache.amoro.utils.FileScanTaskUtil; + +import java.util.Collection; +import java.util.Optional; + +/** A changelog split generated during planning change table. */ +public class ChangelogSplit extends MixedFormatSplit { + private static final long serialVersionUID = 1L; + private final int taskIndex; + private final Collection insertScanTasks; + private final Collection deleteScanTasks; + private int insertFileOffset; + private long insertRecordOffset; + private int deleteFileOffset; + private long deleteRecordOffset; + private DataTreeNode dataTreeNode; + + public ChangelogSplit( + Collection insertScanTasks, + Collection deleteScanTasks, + int taskIndex) { + Preconditions.checkArgument(insertScanTasks.size() > 0 || deleteScanTasks.size() > 0); + this.taskIndex = taskIndex; + this.insertScanTasks = insertScanTasks; + this.deleteScanTasks = deleteScanTasks; + Optional task = insertScanTasks.stream().findFirst(); + PrimaryKeyedFile file = + task.isPresent() ? 
task.get().file() : deleteScanTasks.stream().findFirst().get().file(); + this.dataTreeNode = DataTreeNode.of(file.node().mask(), file.node().index()); + } + + @Override + public Integer taskIndex() { + return taskIndex; + } + + @Override + public DataTreeNode dataTreeNode() { + return dataTreeNode; + } + + @Override + public void modifyTreeNode(DataTreeNode expectedNode) { + Preconditions.checkNotNull(expectedNode); + this.dataTreeNode = expectedNode; + } + + @Override + public void updateOffset(Object[] offsets) { + Preconditions.checkArgument(offsets.length == 4); + insertFileOffset = (int) offsets[0]; + insertRecordOffset = (long) offsets[1]; + deleteFileOffset = (int) offsets[2]; + deleteRecordOffset = (long) offsets[3]; + } + + @Override + public MixedFormatSplit copy() { + return new ChangelogSplit(insertScanTasks, deleteScanTasks, taskIndex); + } + + @Override + public String splitId() { + return MoreObjects.toStringHelper(this) + .add("insertTasks", FileScanTaskUtil.toString(insertScanTasks)) + .add("mixedFormatEquityDeletes", FileScanTaskUtil.toString(deleteScanTasks)) + .toString(); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("insertTasks", FileScanTaskUtil.toString(insertScanTasks)) + .add("mixedFormatEquityDeletes", FileScanTaskUtil.toString(deleteScanTasks)) + .add("dataTreeNode", dataTreeNode.toString()) + .toString(); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof ChangelogSplit)) { + return false; + } + ChangelogSplit other = (ChangelogSplit) obj; + return splitId().equals(other.splitId()) + && insertFileOffset == other.insertFileOffset + && insertRecordOffset == other.insertRecordOffset + && deleteFileOffset == other.deleteFileOffset + && deleteRecordOffset == other.deleteRecordOffset + && taskIndex == other.taskIndex; + } + + public int insertFileOffset() { + return insertFileOffset; + } + + public long insertRecordOffset() { + return insertRecordOffset; + } + + 
public int deleteFileOffset() { + return deleteFileOffset; + } + + public long deleteRecordOffset() { + return deleteRecordOffset; + } + + public Collection insertTasks() { + return insertScanTasks; + } + + public Collection deleteTasks() { + return deleteScanTasks; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MergeOnReadSplit.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MergeOnReadSplit.java new file mode 100644 index 0000000000..ccb2bc8996 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MergeOnReadSplit.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read.hybrid.split; + +import org.apache.amoro.scan.KeyedTableScanTask; +import org.apache.amoro.shade.guava32.com.google.common.base.MoreObjects; +import org.apache.amoro.utils.FileScanTaskUtil; +import org.apache.flink.util.Preconditions; + +public class MergeOnReadSplit extends MixedFormatSplit { + private static final long serialVersionUID = 1L; + private final int taskIndex; + private final KeyedTableScanTask keyedTableScanTask; + private long recordOffset; + + public MergeOnReadSplit(int taskIndex, KeyedTableScanTask keyedTableScanTask) { + this.taskIndex = taskIndex; + this.keyedTableScanTask = keyedTableScanTask; + } + + public KeyedTableScanTask keyedTableScanTask() { + return keyedTableScanTask; + } + + @Override + public Integer taskIndex() { + return taskIndex; + } + + @Override + public void updateOffset(Object[] offsets) { + Preconditions.checkArgument(offsets.length == 2); + // offsets[0] is file offset, but we don't need it + recordOffset = (long) offsets[1]; + } + + @Override + public MixedFormatSplit copy() { + return new MergeOnReadSplit(taskIndex, keyedTableScanTask); + } + + @Override + public String splitId() { + return MoreObjects.toStringHelper(this) + .add("insertTasks", FileScanTaskUtil.toString(keyedTableScanTask.insertTasks())) + .add("baseTasks", FileScanTaskUtil.toString(keyedTableScanTask.baseTasks())) + .add( + "mixedFormatEquityDeletes", + FileScanTaskUtil.toString(keyedTableScanTask.mixedEquityDeletes())) + .toString(); + } + + public long recordOffset() { + return recordOffset; + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof MergeOnReadSplit)) { + return false; + } + MergeOnReadSplit other = (MergeOnReadSplit) obj; + return splitId().equals(other.splitId()) + && recordOffset == other.recordOffset + && taskIndex == other.taskIndex; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("\ninsertTasks", 
FileScanTaskUtil.toString(keyedTableScanTask.insertTasks())) + .add("\nbaseTasks", FileScanTaskUtil.toString(keyedTableScanTask.baseTasks())) + .add( + "\nmixedFormatEquityDeletes", + FileScanTaskUtil.toString(keyedTableScanTask.mixedEquityDeletes())) + .add("\ncost", keyedTableScanTask.cost() / 1024 + " KB") + .add("\nrecordCount", keyedTableScanTask.recordCount()) + .toString(); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplit.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplit.java new file mode 100644 index 0000000000..79818f9952 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplit.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.split; + +import org.apache.amoro.data.DataTreeNode; +import org.apache.flink.api.connector.source.SourceSplit; + +import java.io.Serializable; + +/** An abstract mixed-format source split. 
/**
 * An abstract mixed-format source split. Concrete subclasses are {@code SnapshotSplit},
 * {@link ChangelogSplit} and {@link MergeOnReadSplit}.
 */
public abstract class MixedFormatSplit
    implements SourceSplit, Serializable, Comparable<MixedFormatSplit> {
  private static final long serialVersionUID = 1L;

  /** Index of the plan task this split originates from; drives {@link #compareTo}. */
  public abstract Integer taskIndex();

  // Only splits bound to a data tree node (snapshot/changelog) override these two.
  public DataTreeNode dataTreeNode() {
    throw new UnsupportedOperationException("This operation is not supported right now.");
  }

  public void modifyTreeNode(DataTreeNode expectedNode) {
    throw new UnsupportedOperationException("This operation is not supported right now.");
  }

  /** Checks whether this split is a snapshot split. */
  public final boolean isSnapshotSplit() {
    // Exact class match on purpose: a subclass of SnapshotSplit would not count.
    return getClass() == SnapshotSplit.class;
  }

  /** Checks whether this split is a changelog split. */
  public final boolean isChangelogSplit() {
    return getClass() == ChangelogSplit.class;
  }

  /** Checks whether this split is a merge-on-read split. */
  public final boolean isMergeOnReadSplit() {
    return getClass() == MergeOnReadSplit.class;
  }

  /** Casts this split into a {@link SnapshotSplit}. */
  public final SnapshotSplit asSnapshotSplit() {
    return (SnapshotSplit) this;
  }

  /** Casts this split into a {@link ChangelogSplit}. */
  public final ChangelogSplit asChangelogSplit() {
    return (ChangelogSplit) this;
  }

  /** Casts this split into a {@link MergeOnReadSplit}. */
  public final MergeOnReadSplit asMergeOnReadSplit() {
    return (MergeOnReadSplit) this;
  }

  /**
   * update split current file offset and record offset if this split is {@link SnapshotSplit}
   * recordOffsets means [insertFileOffset, insertRecordOffset] if this split is {@link
   * ChangelogSplit} recordOffsets means [insertFileOffset, insertRecordOffset, deleteFileOffset,
   * deleteRecordOffset, ]
   *
   * @param recordOffsets [insertFileOffset, insertRecordOffset]
   */
  public abstract void updateOffset(Object[] recordOffsets);

  // NOTE(review): ordering by taskIndex alone is not consistent with the subclasses' equals()
  // (two distinct splits can compare as 0) — fine for priority ordering, but avoid using splits
  // in sorted sets/maps; confirm intended usage.
  @Override
  public int compareTo(MixedFormatSplit that) {
    return this.taskIndex().compareTo(that.taskIndex());
  }

  /** Returns a copy of this split with read-progress offsets reset. */
  public abstract MixedFormatSplit copy();
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.split; + +import org.apache.amoro.log.Bytes; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.util.FlinkRuntimeException; +import org.apache.flink.util.InstantiationUtil; + +import java.io.IOException; + +/** Serializer that serializes and deserializes {@link MixedFormatSplit}. */ +public class MixedFormatSplitSerializer implements SimpleVersionedSerializer { + public static final MixedFormatSplitSerializer INSTANCE = new MixedFormatSplitSerializer(); + private static final int VERSION = 1; + + private static final byte SNAPSHOT_SPLIT_FLAG = 1; + private static final byte CHANGELOG_SPLIT_FLAG = 2; + private static final byte MOR_SPLIT_FLAG = 3; + + @Override + public int getVersion() { + return VERSION; + } + + @Override + public byte[] serialize(MixedFormatSplit split) throws IOException { + if (split == null) { + return new byte[0]; + } + if (split.isMergeOnReadSplit()) { + MergeOnReadSplit mergeOnReadSplit = (MergeOnReadSplit) split; + byte[] content = InstantiationUtil.serializeObject(mergeOnReadSplit); + return Bytes.mergeByte(new byte[] {MOR_SPLIT_FLAG}, content); + } else if (split.isSnapshotSplit()) { + SnapshotSplit snapshotSplit = (SnapshotSplit) split; + byte[] content = InstantiationUtil.serializeObject(snapshotSplit); + return Bytes.mergeByte(new byte[] {SNAPSHOT_SPLIT_FLAG}, content); + } else if (split.isChangelogSplit()) { + ChangelogSplit changelogSplit = (ChangelogSplit) split; + byte[] content = InstantiationUtil.serializeObject(changelogSplit); + 
return Bytes.mergeByte(new byte[] {CHANGELOG_SPLIT_FLAG}, content); + } else { + throw new IllegalArgumentException( + String.format( + "This mixed-format split is not supported, class %s.", + split.getClass().getSimpleName())); + } + } + + @Override + public MixedFormatSplit deserialize(int version, byte[] serialized) throws IOException { + if (serialized.length == 0) { + return null; + } + try { + byte flag = serialized[0]; + if (version == VERSION) { + byte[] content = Bytes.subByte(serialized, 1, serialized.length - 1); + if (flag == MOR_SPLIT_FLAG) { + return InstantiationUtil.deserializeObject( + content, MergeOnReadSplit.class.getClassLoader()); + } else if (flag == SNAPSHOT_SPLIT_FLAG) { + return InstantiationUtil.deserializeObject( + content, SnapshotSplit.class.getClassLoader()); + } else if (flag == CHANGELOG_SPLIT_FLAG) { + return InstantiationUtil.deserializeObject( + content, ChangelogSplit.class.getClassLoader()); + } else { + throw new IllegalArgumentException( + String.format( + "this flag split %s is unsupported. 
available: %s, %s, and %s.", + flag, SNAPSHOT_SPLIT_FLAG, CHANGELOG_SPLIT_FLAG, MOR_SPLIT_FLAG)); + } + } + } catch (ClassNotFoundException e) { + throw new FlinkRuntimeException("deserialize split failed", e); + } + throw new FlinkRuntimeException( + String.format("this version %s is not supported during deserialize split.", version)); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplitState.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplitState.java new file mode 100644 index 0000000000..5548d917c6 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplitState.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.split; + +import org.apache.flink.util.FlinkRuntimeException; + +/** This is the mutable state for each mixed-format source split.
*/ +public class MixedFormatSplitState { + private final MixedFormatSplit mixedFormatSplit; + + private int currentInsertFileOffset; + private long currentInsertRecordOffset; + private int currentDeleteFileOffset; + private long currentDeleteRecordOffset; + + public MixedFormatSplitState(MixedFormatSplit mixedFormatSplit) { + this.mixedFormatSplit = mixedFormatSplit; + } + + public MixedFormatSplit toSourceSplit() { + if (mixedFormatSplit.isMergeOnReadSplit()) { + MergeOnReadSplit mergeOnReadSplit = (MergeOnReadSplit) mixedFormatSplit; + mergeOnReadSplit.updateOffset( + new Object[] {currentInsertFileOffset, currentInsertRecordOffset}); + return mergeOnReadSplit; + } else if (mixedFormatSplit.isSnapshotSplit()) { + SnapshotSplit snapshotSplit = (SnapshotSplit) mixedFormatSplit; + snapshotSplit.updateOffset(new Object[] {currentInsertFileOffset, currentInsertRecordOffset}); + return snapshotSplit; + } else if (mixedFormatSplit.isChangelogSplit()) { + ChangelogSplit changelogSplit = (ChangelogSplit) mixedFormatSplit; + changelogSplit.updateOffset( + new Object[] { + currentInsertFileOffset, + currentInsertRecordOffset, + currentDeleteFileOffset, + currentDeleteRecordOffset + }); + return changelogSplit; + } + + throw new FlinkRuntimeException( + String.format( + "As of now this source split is unsupported %s, available split are %s, %s, and %s", + mixedFormatSplit.getClass().getSimpleName(), + SnapshotSplit.class.getSimpleName(), + ChangelogSplit.class.getSimpleName(), + MergeOnReadSplit.class.getSimpleName())); + } + + public void updateOffset(Object[] offsets) { + currentInsertFileOffset = (int) offsets[0]; + currentInsertRecordOffset = (long) offsets[1]; + if (mixedFormatSplit.isChangelogSplit()) { + currentDeleteFileOffset = (int) offsets[2]; + currentDeleteRecordOffset = (long) offsets[3]; + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/SnapshotSplit.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/SnapshotSplit.java new file mode 100644 index 0000000000..fd1bcab730 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/SnapshotSplit.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.split; + +import org.apache.amoro.data.DataTreeNode; +import org.apache.amoro.data.PrimaryKeyedFile; +import org.apache.amoro.scan.MixedFileScanTask; +import org.apache.amoro.shade.guava32.com.google.common.base.MoreObjects; +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; +import org.apache.amoro.utils.FileScanTaskUtil; + +import java.util.Collection; + +/** A snapshot split generated during planning base table. 
*/ +public class SnapshotSplit extends MixedFormatSplit { + private static final long serialVersionUID = 1L; + private final int taskIndex; + private final Collection insertScanTasks; + private int insertFileOffset; + private long insertRecordOffset; + private DataTreeNode dataTreeNode; + + public SnapshotSplit(Collection insertScanTasks, int taskIndex) { + Preconditions.checkArgument(insertScanTasks.size() > 0); + this.insertScanTasks = insertScanTasks; + this.taskIndex = taskIndex; + PrimaryKeyedFile file = insertScanTasks.stream().findFirst().get().file(); + this.dataTreeNode = DataTreeNode.of(file.node().mask(), file.node().index()); + } + + @Override + public String splitId() { + return MoreObjects.toStringHelper(this) + .add("insertTasks", FileScanTaskUtil.toString(insertScanTasks)) + .toString(); + } + + @Override + public Integer taskIndex() { + return taskIndex; + } + + @Override + public DataTreeNode dataTreeNode() { + return dataTreeNode; + } + + @Override + public void modifyTreeNode(DataTreeNode expectedNode) { + Preconditions.checkNotNull(expectedNode); + this.dataTreeNode = expectedNode; + } + + public Collection insertTasks() { + return insertScanTasks; + } + + @Override + public void updateOffset(Object[] offsets) { + Preconditions.checkArgument(offsets.length == 2); + insertFileOffset = (int) offsets[0]; + insertRecordOffset = (long) offsets[1]; + } + + @Override + public MixedFormatSplit copy() { + return new SnapshotSplit(insertScanTasks, taskIndex); + } + + public int insertFileOffset() { + return insertFileOffset; + } + + public long insertRecordOffset() { + return insertRecordOffset; + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("insertTasks", FileScanTaskUtil.toString(insertScanTasks)) + .add("dataTreeNode", dataTreeNode.toString()) + .toString(); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof SnapshotSplit)) { + return false; + } + SnapshotSplit other = 
(SnapshotSplit) obj; + return splitId().equals(other.splitId()) + && insertFileOffset == other.insertFileOffset + && insertRecordOffset == other.insertRecordOffset + && taskIndex == other.taskIndex; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/SplitRequestEvent.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/SplitRequestEvent.java new file mode 100644 index 0000000000..b1eec7f40d --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/SplitRequestEvent.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.split; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.connector.source.SourceEvent; + +import java.util.Collection; +import java.util.Collections; + +/** We can remove this class once FLINK-21364 is resolved. 
*/ +@Internal +public class SplitRequestEvent implements SourceEvent { + private static final long serialVersionUID = 1L; + + private final Collection finishedSplitIds; + private final String requesterHostname; + + public SplitRequestEvent() { + this(Collections.emptyList()); + } + + public SplitRequestEvent(Collection finishedSplitIds) { + this(finishedSplitIds, null); + } + + public SplitRequestEvent(Collection finishedSplitIds, String requesterHostname) { + this.finishedSplitIds = finishedSplitIds; + this.requesterHostname = requesterHostname; + } + + public Collection finishedSplitIds() { + return finishedSplitIds; + } + + public String requesterHostname() { + return requesterHostname; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/TemporalJoinSplits.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/TemporalJoinSplits.java new file mode 100644 index 0000000000..6d6d79f84c --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/TemporalJoinSplits.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.split; + +import static org.apache.amoro.flink.metric.MetricConstant.TEMPORAL_TABLE_INITIALIZATION_END_TIMESTAMP; +import static org.apache.amoro.flink.metric.MetricConstant.TEMPORAL_TABLE_INITIALIZATION_START_TIMESTAMP; + +import org.apache.flink.api.connector.source.SourceSplit; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.util.CollectionUtil; +import org.apache.flink.util.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.time.LocalDateTime; +import java.util.Collection; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +/** + * If using mixed-format table as build-table, TemporalJoinSplits can record the first splits + * planned by Enumerator. + */ +public class TemporalJoinSplits implements Serializable { + + public static final long serialVersionUID = 1L; + public static final Logger LOGGER = LoggerFactory.getLogger(TemporalJoinSplits.class); + + private final transient MetricGroup metricGroup; + private final long startTimeMs = System.currentTimeMillis(); + private Map splits; + private long unfinishedCount; + /** transient because it is necessary to notify reader again after failover. 
*/ + private transient boolean hasNotifiedReader = false; + + public TemporalJoinSplits(Collection splits, MetricGroup metricGroup) { + Preconditions.checkNotNull(splits, "plan splits should not be null"); + this.splits = + splits.stream().map(SourceSplit::splitId).collect(Collectors.toMap((k) -> k, (i) -> false)); + + unfinishedCount = this.splits.size(); + LOGGER.info("init splits at {}, size:{}", LocalDateTime.now(), unfinishedCount); + this.metricGroup = metricGroup; + if (metricGroup != null) { + metricGroup.gauge(TEMPORAL_TABLE_INITIALIZATION_START_TIMESTAMP, () -> startTimeMs); + } + } + + public Map getSplits() { + return splits; + } + + public synchronized void addSplitsBack(Collection splits) { + if (this.splits == null || CollectionUtil.isNullOrEmpty(splits)) { + return; + } + splits.forEach( + (p) -> { + Boolean finished = this.splits.get(p.splitId()); + if (finished == null || !finished) { + return; + } + unfinishedCount++; + LOGGER.debug("add back split:{} at {}", p, LocalDateTime.now()); + this.splits.put(p.splitId(), false); + }); + } + + /** + * Remove finished splits. + * + * @return True if all splits are finished, otherwise false. 
+ */ + public synchronized boolean removeAndReturnIfAllFinished(Collection finishedSplitIds) { + if (splits == null) { + return true; + } + if (CollectionUtil.isNullOrEmpty(finishedSplitIds)) { + return unfinishedCount == 0; + } + + finishedSplitIds.forEach( + (p) -> { + Boolean finished = this.splits.get(p); + if (finished == null || finished) { + return; + } + unfinishedCount--; + this.splits.put(p, true); + LOGGER.debug("finish split:{} at {}", p, LocalDateTime.now()); + }); + if (unfinishedCount == 0) { + LOGGER.info("finish all splits at {}", LocalDateTime.now()); + if (metricGroup != null) { + metricGroup.gauge(TEMPORAL_TABLE_INITIALIZATION_END_TIMESTAMP, System::currentTimeMillis); + } + return true; + } + return false; + } + + public synchronized void clear() { + if (unfinishedCount == 0) { + this.splits = null; + } + } + + public boolean hasNotifiedReader() { + return hasNotifiedReader; + } + + public void notifyReader() { + this.hasNotifiedReader = true; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + TemporalJoinSplits that = (TemporalJoinSplits) o; + return startTimeMs == that.startTimeMs + && unfinishedCount == that.unfinishedCount + && hasNotifiedReader == that.hasNotifiedReader + && Objects.equals(metricGroup, that.metricGroup) + && Objects.equals(splits, that.splits); + } + + @Override + public int hashCode() { + return Objects.hash(metricGroup, startTimeMs, splits, unfinishedCount, hasNotifiedReader); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaPartitionSplitReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaPartitionSplitReader.java new file mode 100644 index 0000000000..0b104e5c9a --- /dev/null +++ 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaPartitionSplitReader.java @@ -0,0 +1,499 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.internals; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; +import org.apache.flink.connector.base.source.reader.splitreader.SplitsAddition; +import org.apache.flink.connector.base.source.reader.splitreader.SplitsChange; +import org.apache.flink.connector.kafka.source.KafkaSourceOptions; +import org.apache.flink.connector.kafka.source.metrics.KafkaSourceReaderMetrics; +import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; +import org.apache.flink.util.FlinkRuntimeException; +import org.apache.flink.util.Preconditions; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import 
org.apache.kafka.clients.consumer.ConsumerRecords; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.clients.consumer.OffsetCommitCallback; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.errors.WakeupException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.StringJoiner; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +/** A {@link SplitReader} implementation that reads records from Kafka partitions. */ +@Internal +public class KafkaPartitionSplitReader + implements SplitReader, KafkaPartitionSplit> { + private static final Logger LOG = LoggerFactory.getLogger(KafkaPartitionSplitReader.class); + protected static final long POLL_TIMEOUT = 10000L; + + protected final KafkaConsumer consumer; + private final Map stoppingOffsets; + private final String groupId; + private final int subtaskId; + + protected final KafkaSourceReaderMetrics kafkaSourceReaderMetrics; + + // Tracking empty splits that has not been added to finished splits in fetch() + private final Set emptySplits = new HashSet<>(); + + public KafkaPartitionSplitReader( + Properties props, + SourceReaderContext context, + KafkaSourceReaderMetrics kafkaSourceReaderMetrics) { + this.subtaskId = context.getIndexOfSubtask(); + this.kafkaSourceReaderMetrics = kafkaSourceReaderMetrics; + Properties consumerProps = new Properties(); + consumerProps.putAll(props); + consumerProps.setProperty(ConsumerConfig.CLIENT_ID_CONFIG, createConsumerClientId(props)); + this.consumer = new 
KafkaConsumer<>(consumerProps); + this.stoppingOffsets = new HashMap<>(); + this.groupId = consumerProps.getProperty(ConsumerConfig.GROUP_ID_CONFIG); + + // Metric registration + maybeRegisterKafkaConsumerMetrics(props, kafkaSourceReaderMetrics, consumer); + this.kafkaSourceReaderMetrics.registerNumBytesIn(consumer); + } + + @Override + public RecordsWithSplitIds> fetch() throws IOException { + ConsumerRecords consumerRecords; + try { + consumerRecords = consumer.poll(Duration.ofMillis(POLL_TIMEOUT)); + } catch (WakeupException | IllegalStateException e) { + // IllegalStateException will be thrown if the consumer is not assigned any partitions. + // This happens if all assigned partitions are invalid or empty (starting offset >= + // stopping offset). We just mark empty partitions as finished and return an empty + // record container, and this consumer will be closed by SplitFetcherManager. + KafkaPartitionSplitRecords recordsBySplits = + new KafkaPartitionSplitRecords(ConsumerRecords.empty(), kafkaSourceReaderMetrics); + markEmptySplitsAsFinished(recordsBySplits); + return recordsBySplits; + } + KafkaPartitionSplitRecords recordsBySplits = + new KafkaPartitionSplitRecords(consumerRecords, kafkaSourceReaderMetrics); + List finishedPartitions = new ArrayList<>(); + for (TopicPartition tp : consumerRecords.partitions()) { + long stoppingOffset = getStoppingOffset(tp); + final List> recordsFromPartition = consumerRecords.records(tp); + + if (recordsFromPartition.size() > 0) { + final ConsumerRecord lastRecord = + recordsFromPartition.get(recordsFromPartition.size() - 1); + + // After processing a record with offset of "stoppingOffset - 1", the split reader + // should not continue fetching because the record with stoppingOffset may not + // exist. Keep polling will just block forever. 
+ if (lastRecord.offset() >= stoppingOffset - 1) { + recordsBySplits.setPartitionStoppingOffset(tp, stoppingOffset); + finishSplitAtRecord( + tp, stoppingOffset, lastRecord.offset(), finishedPartitions, recordsBySplits); + } + } + // Track this partition's record lag if it never appears before + kafkaSourceReaderMetrics.maybeAddRecordsLagMetric(consumer, tp); + } + + markEmptySplitsAsFinished(recordsBySplits); + + // Unassign the partitions that has finished. + if (!finishedPartitions.isEmpty()) { + finishedPartitions.forEach(kafkaSourceReaderMetrics::removeRecordsLagMetric); + unassignPartitions(finishedPartitions); + } + + // Update numBytesIn + kafkaSourceReaderMetrics.updateNumBytesInCounter(); + + return recordsBySplits; + } + + protected void markEmptySplitsAsFinished(KafkaPartitionSplitRecords recordsBySplits) { + // Some splits are discovered as empty when handling split additions. These splits should be + // added to finished splits to clean up states in split fetcher and source reader. + if (!emptySplits.isEmpty()) { + recordsBySplits.finishedSplits.addAll(emptySplits); + emptySplits.clear(); + } + } + + @Override + public void handleSplitsChanges(SplitsChange splitsChange) { + // Get all the partition assignments and stopping offsets. + if (!(splitsChange instanceof SplitsAddition)) { + throw new UnsupportedOperationException( + String.format("The SplitChange type of %s is not supported.", splitsChange.getClass())); + } + + // Assignment. + List newPartitionAssignments = new ArrayList<>(); + // Starting offsets. + Map partitionsStartingFromSpecifiedOffsets = new HashMap<>(); + List partitionsStartingFromEarliest = new ArrayList<>(); + List partitionsStartingFromLatest = new ArrayList<>(); + // Stopping offsets. + List partitionsStoppingAtLatest = new ArrayList<>(); + Set partitionsStoppingAtCommitted = new HashSet<>(); + + // Parse the starting and stopping offsets. 
+ splitsChange + .splits() + .forEach( + s -> { + newPartitionAssignments.add(s.getTopicPartition()); + parseStartingOffsets( + s, + partitionsStartingFromEarliest, + partitionsStartingFromLatest, + partitionsStartingFromSpecifiedOffsets); + parseStoppingOffsets(s, partitionsStoppingAtLatest, partitionsStoppingAtCommitted); + // Track the new topic partition in metrics + kafkaSourceReaderMetrics.registerTopicPartition(s.getTopicPartition()); + }); + + // Assign new partitions. + newPartitionAssignments.addAll(consumer.assignment()); + consumer.assign(newPartitionAssignments); + + // Seek on the newly assigned partitions to their stating offsets. + seekToStartingOffsets( + partitionsStartingFromEarliest, + partitionsStartingFromLatest, + partitionsStartingFromSpecifiedOffsets); + // Setup the stopping offsets. + acquireAndSetStoppingOffsets(partitionsStoppingAtLatest, partitionsStoppingAtCommitted); + + // After acquiring the starting and stopping offsets, remove the empty splits if necessary. + removeEmptySplits(); + + maybeLogSplitChangesHandlingResult(splitsChange); + } + + @Override + public void wakeUp() { + consumer.wakeup(); + } + + @Override + public void close() throws Exception { + consumer.close(); + } + + // --------------- + + public void notifyCheckpointComplete( + Map offsetsToCommit, + OffsetCommitCallback offsetCommitCallback) { + consumer.commitAsync(offsetsToCommit, offsetCommitCallback); + } + + @VisibleForTesting + KafkaConsumer consumer() { + return consumer; + } + + // --------------- private helper method ---------------------- + + private void parseStartingOffsets( + KafkaPartitionSplit split, + List partitionsStartingFromEarliest, + List partitionsStartingFromLatest, + Map partitionsStartingFromSpecifiedOffsets) { + TopicPartition tp = split.getTopicPartition(); + // Parse starting offsets. 
+ if (split.getStartingOffset() == KafkaPartitionSplit.EARLIEST_OFFSET) { + partitionsStartingFromEarliest.add(tp); + } else if (split.getStartingOffset() == KafkaPartitionSplit.LATEST_OFFSET) { + partitionsStartingFromLatest.add(tp); + } else if (split.getStartingOffset() == KafkaPartitionSplit.COMMITTED_OFFSET) { + // Do nothing here, the consumer will first try to get the committed offsets of + // these partitions by default. + } else { + partitionsStartingFromSpecifiedOffsets.put(tp, split.getStartingOffset()); + } + } + + private void parseStoppingOffsets( + KafkaPartitionSplit split, + List partitionsStoppingAtLatest, + Set partitionsStoppingAtCommitted) { + TopicPartition tp = split.getTopicPartition(); + split + .getStoppingOffset() + .ifPresent( + stoppingOffset -> { + if (stoppingOffset >= 0) { + stoppingOffsets.put(tp, stoppingOffset); + } else if (stoppingOffset == KafkaPartitionSplit.LATEST_OFFSET) { + partitionsStoppingAtLatest.add(tp); + } else if (stoppingOffset == KafkaPartitionSplit.COMMITTED_OFFSET) { + partitionsStoppingAtCommitted.add(tp); + } else { + // This should not happen. 
+ throw new FlinkRuntimeException( + String.format( + "Invalid stopping offset %d for partition %s", stoppingOffset, tp)); + } + }); + } + + private void seekToStartingOffsets( + List partitionsStartingFromEarliest, + List partitionsStartingFromLatest, + Map partitionsStartingFromSpecifiedOffsets) { + + if (!partitionsStartingFromEarliest.isEmpty()) { + LOG.trace("Seeking starting offsets to beginning: {}", partitionsStartingFromEarliest); + consumer.seekToBeginning(partitionsStartingFromEarliest); + } + + if (!partitionsStartingFromLatest.isEmpty()) { + LOG.trace("Seeking starting offsets to end: {}", partitionsStartingFromLatest); + consumer.seekToEnd(partitionsStartingFromLatest); + } + + if (!partitionsStartingFromSpecifiedOffsets.isEmpty()) { + LOG.trace( + "Seeking starting offsets to specified offsets: {}", + partitionsStartingFromSpecifiedOffsets); + partitionsStartingFromSpecifiedOffsets.forEach(consumer::seek); + } + } + + private void acquireAndSetStoppingOffsets( + List partitionsStoppingAtLatest, + Set partitionsStoppingAtCommitted) { + Map endOffset = consumer.endOffsets(partitionsStoppingAtLatest); + stoppingOffsets.putAll(endOffset); + if (!partitionsStoppingAtCommitted.isEmpty()) { + retryOnWakeup( + () -> consumer.committed(partitionsStoppingAtCommitted), + "getting committed offset as stopping offsets") + .forEach( + (tp, offsetAndMetadata) -> { + Preconditions.checkNotNull( + offsetAndMetadata, + String.format( + "Partition %s should stop at committed offset. 
" + + "But there is no committed offset of this partition for group %s", + tp, groupId)); + stoppingOffsets.put(tp, offsetAndMetadata.offset()); + }); + } + } + + private void removeEmptySplits() { + List emptyPartitions = new ArrayList<>(); + // If none of the partitions have any records, + for (TopicPartition tp : consumer.assignment()) { + if (retryOnWakeup( + () -> consumer.position(tp), "getting starting offset to check if split is empty") + >= getStoppingOffset(tp)) { + emptyPartitions.add(tp); + } + } + if (!emptyPartitions.isEmpty()) { + LOG.debug( + "These assigning splits are empty and will be marked as finished in later fetch: {}", + emptyPartitions); + // Add empty partitions to empty split set for later cleanup in fetch() + emptySplits.addAll( + emptyPartitions.stream().map(KafkaPartitionSplit::toSplitId).collect(Collectors.toSet())); + // Un-assign partitions from Kafka consumer + unassignPartitions(emptyPartitions); + } + } + + private void maybeLogSplitChangesHandlingResult(SplitsChange splitsChange) { + if (LOG.isDebugEnabled()) { + StringJoiner splitsInfo = new StringJoiner(","); + for (KafkaPartitionSplit split : splitsChange.splits()) { + long startingOffset = + retryOnWakeup( + () -> consumer.position(split.getTopicPartition()), "logging starting position"); + long stoppingOffset = getStoppingOffset(split.getTopicPartition()); + splitsInfo.add( + String.format( + "[%s, start:%d, stop: %d]", + split.getTopicPartition(), startingOffset, stoppingOffset)); + } + LOG.debug("SplitsChange handling result: {}", splitsInfo); + } + } + + protected void unassignPartitions(Collection partitionsToUnassign) { + Collection newAssignment = new HashSet<>(consumer.assignment()); + newAssignment.removeAll(partitionsToUnassign); + consumer.assign(newAssignment); + } + + private String createConsumerClientId(Properties props) { + String prefix = props.getProperty(KafkaSourceOptions.CLIENT_ID_PREFIX.key()); + return prefix + "-" + subtaskId; + } + + protected void 
finishSplitAtRecord( + TopicPartition tp, + long stoppingOffset, + long currentOffset, + List finishedPartitions, + KafkaPartitionSplitRecords recordsBySplits) { + LOG.debug( + "{} has reached stopping offset {}, current offset is {}", + tp, + stoppingOffset, + currentOffset); + finishedPartitions.add(tp); + recordsBySplits.addFinishedSplit(KafkaPartitionSplit.toSplitId(tp)); + } + + protected long getStoppingOffset(TopicPartition tp) { + return stoppingOffsets.getOrDefault(tp, Long.MAX_VALUE); + } + + private void maybeRegisterKafkaConsumerMetrics( + Properties props, + KafkaSourceReaderMetrics kafkaSourceReaderMetrics, + KafkaConsumer consumer) { + final Boolean needToRegister = + KafkaSourceOptions.getOption( + props, KafkaSourceOptions.REGISTER_KAFKA_CONSUMER_METRICS, Boolean::parseBoolean); + if (needToRegister) { + kafkaSourceReaderMetrics.registerKafkaConsumerMetrics(consumer); + } + } + + /** + * Catch {@link WakeupException} in Kafka consumer call and retry the invocation on exception. + * + *
+ * <p>This helper function handles a race condition as below:
+ *
+ * <ol>
+ *   <li>Fetcher thread finishes a {@link KafkaConsumer#poll(Duration)} call
+ *   <li>Task thread assigns new splits so invokes {@link #wakeUp()}, then the wakeup is recorded
+ *       and held by the consumer
+ *   <li>Later fetcher thread invokes {@link #handleSplitsChanges(SplitsChange)}, and interactions
+ *       with consumer will throw {@link WakeupException} because of the previously held wakeup in
+ *       the consumer
+ * </ol>
+ *
+ * <p>
Under this case we need to catch the {@link WakeupException} and retry the operation. + */ + private V retryOnWakeup(Supplier consumerCall, String description) { + try { + return consumerCall.get(); + } catch (WakeupException we) { + LOG.info( + "Caught WakeupException while executing Kafka consumer call for {}. Will retry the consumer call.", + description); + return consumerCall.get(); + } + } + + // ---------------- private helper class ------------------------ + + public static class KafkaPartitionSplitRecords + implements RecordsWithSplitIds> { + + private final Set finishedSplits = new HashSet<>(); + private final Map stoppingOffsets = new HashMap<>(); + private final ConsumerRecords consumerRecords; + private final KafkaSourceReaderMetrics metrics; + private final Iterator splitIterator; + private Iterator> recordIterator; + private TopicPartition currentTopicPartition; + private Long currentSplitStoppingOffset; + + public KafkaPartitionSplitRecords( + ConsumerRecords consumerRecords, KafkaSourceReaderMetrics metrics) { + this.consumerRecords = consumerRecords; + this.splitIterator = consumerRecords.partitions().iterator(); + this.metrics = metrics; + } + + public void setPartitionStoppingOffset(TopicPartition topicPartition, long stoppingOffset) { + stoppingOffsets.put(topicPartition, stoppingOffset); + } + + public void addFinishedSplit(String splitId) { + finishedSplits.add(splitId); + } + + @Nullable + @Override + public String nextSplit() { + if (splitIterator.hasNext()) { + currentTopicPartition = splitIterator.next(); + recordIterator = consumerRecords.records(currentTopicPartition).iterator(); + currentSplitStoppingOffset = + stoppingOffsets.getOrDefault(currentTopicPartition, Long.MAX_VALUE); + return currentTopicPartition.toString(); + } else { + currentTopicPartition = null; + recordIterator = null; + currentSplitStoppingOffset = null; + return null; + } + } + + @Nullable + @Override + public ConsumerRecord nextRecordFromSplit() { + 
Preconditions.checkNotNull( + currentTopicPartition, + "Make sure nextSplit() did not return null before iterate over the records split."); + if (recordIterator.hasNext()) { + final ConsumerRecord record = recordIterator.next(); + // Only emit records before stopping offset + if (record.offset() < currentSplitStoppingOffset) { + metrics.recordCurrentOffset(currentTopicPartition, record.offset()); + return record; + } + } + return null; + } + + @Override + public Set finishedSplits() { + return finishedSplits; + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSource.java new file mode 100644 index 0000000000..56317638a7 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSource.java @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read.internals; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.annotation.PublicEvolving; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.common.serialization.DeserializationSchema; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.connector.source.Boundedness; +import org.apache.flink.api.connector.source.Source; +import org.apache.flink.api.connector.source.SourceReader; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.api.connector.source.SplitEnumerator; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.synchronization.FutureCompletingBlockingQueue; +import org.apache.flink.connector.kafka.source.KafkaSourceBuilder; +import org.apache.flink.connector.kafka.source.enumerator.KafkaSourceEnumState; +import org.apache.flink.connector.kafka.source.enumerator.KafkaSourceEnumStateSerializer; +import org.apache.flink.connector.kafka.source.enumerator.KafkaSourceEnumerator; +import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer; +import org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriber; +import org.apache.flink.connector.kafka.source.metrics.KafkaSourceReaderMetrics; +import org.apache.flink.connector.kafka.source.reader.KafkaPartitionSplitReader; +import org.apache.flink.connector.kafka.source.reader.KafkaRecordEmitter; +import org.apache.flink.connector.kafka.source.reader.KafkaSourceReader; +import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema; +import 
org.apache.flink.connector.kafka.source.reader.fetcher.KafkaSourceFetcherManager; +import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; +import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplitSerializer; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.util.UserCodeClassLoader; +import org.apache.kafka.clients.consumer.ConsumerRecord; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.util.Collection; +import java.util.Properties; +import java.util.function.Consumer; +import java.util.function.Supplier; + +/** + * The Source implementation of Kafka. Please use a {@link KafkaSourceBuilder} to construct a {@link + * KafkaSource}. The following example shows how to create a KafkaSource emitting records of + * String type. + * + *
+ * <pre>{@code
+ * KafkaSource<String> source = KafkaSource
+ *     .builder()
+ *     .setBootstrapServers(KafkaSourceTestEnv.brokerConnectionStrings)
+ *     .setGroupId("MyGroup")
+ *     .setTopics(Arrays.asList(TOPIC1, TOPIC2))
+ *     .setDeserializer(new TestingKafkaRecordDeserializationSchema())
+ *     .setStartingOffsets(OffsetsInitializer.earliest())
+ *     .build();
+ * }</pre>
+ * + * @param the output type of the source. + */ +@PublicEvolving +public class KafkaSource + implements Source, ResultTypeQueryable { + private static final long serialVersionUID = -8755372893283732098L; + // Users can choose only one of the following ways to specify the topics to consume from. + private final KafkaSubscriber subscriber; + // Users can specify the starting / stopping offset initializer. + private final OffsetsInitializer startingOffsetsInitializer; + private final OffsetsInitializer stoppingOffsetsInitializer; + // Boundedness + private final Boundedness boundedness; + private final KafkaRecordDeserializationSchema deserializationSchema; + // The configurations. + protected final Properties props; + + protected KafkaSource( + KafkaSubscriber subscriber, + OffsetsInitializer startingOffsetsInitializer, + @Nullable OffsetsInitializer stoppingOffsetsInitializer, + Boundedness boundedness, + KafkaRecordDeserializationSchema deserializationSchema, + Properties props) { + this.subscriber = subscriber; + this.startingOffsetsInitializer = startingOffsetsInitializer; + this.stoppingOffsetsInitializer = stoppingOffsetsInitializer; + this.boundedness = boundedness; + this.deserializationSchema = deserializationSchema; + this.props = props; + } + + @Override + public Boundedness getBoundedness() { + return this.boundedness; + } + + @Internal + @Override + public SourceReader createReader(SourceReaderContext readerContext) + throws Exception { + return createReader(readerContext, (ignore) -> {}); + } + + @VisibleForTesting + SourceReader createReader( + SourceReaderContext readerContext, Consumer> splitFinishedHook) + throws Exception { + FutureCompletingBlockingQueue>> + elementsQueue = new FutureCompletingBlockingQueue<>(); + deserializationSchema.open( + new DeserializationSchema.InitializationContext() { + @Override + public MetricGroup getMetricGroup() { + return readerContext.metricGroup().addGroup("deserializer"); + } + + @Override + public 
UserCodeClassLoader getUserCodeClassLoader() { + return readerContext.getUserCodeClassLoader(); + } + }); + final KafkaSourceReaderMetrics kafkaSourceReaderMetrics = + new KafkaSourceReaderMetrics(readerContext.metricGroup()); + + Supplier splitReaderSupplier = + () -> new KafkaPartitionSplitReader(props, readerContext, kafkaSourceReaderMetrics); + KafkaRecordEmitter recordEmitter = new KafkaRecordEmitter<>(deserializationSchema); + + return new KafkaSourceReader<>( + elementsQueue, + new KafkaSourceFetcherManager(elementsQueue, splitReaderSupplier::get, splitFinishedHook), + recordEmitter, + toConfiguration(props), + readerContext, + kafkaSourceReaderMetrics); + } + + @Internal + @Override + public SplitEnumerator createEnumerator( + SplitEnumeratorContext enumContext) { + return new KafkaSourceEnumerator( + subscriber, + startingOffsetsInitializer, + stoppingOffsetsInitializer, + props, + enumContext, + boundedness); + } + + @Internal + @Override + public SplitEnumerator restoreEnumerator( + SplitEnumeratorContext enumContext, KafkaSourceEnumState checkpoint) + throws IOException { + return new KafkaSourceEnumerator( + subscriber, + startingOffsetsInitializer, + stoppingOffsetsInitializer, + props, + enumContext, + boundedness, + checkpoint.assignedPartitions()); + } + + @Internal + @Override + public SimpleVersionedSerializer getSplitSerializer() { + return new KafkaPartitionSplitSerializer(); + } + + @Internal + @Override + public SimpleVersionedSerializer getEnumeratorCheckpointSerializer() { + return new KafkaSourceEnumStateSerializer(); + } + + @Override + public TypeInformation getProducedType() { + return deserializationSchema.getProducedType(); + } + + // ----------- private helper methods --------------- + + private Configuration toConfiguration(Properties props) { + Configuration config = new Configuration(); + props.stringPropertyNames().forEach(key -> config.setString(key, props.getProperty(key))); + return config; + } + + @VisibleForTesting + 
Configuration getConfiguration() { + return toConfiguration(props); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSourceFetcherManager.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSourceFetcherManager.java new file mode 100644 index 0000000000..728f41ab07 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSourceFetcherManager.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read.internals; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.SourceReaderBase; +import org.apache.flink.connector.base.source.reader.fetcher.SingleThreadFetcherManager; +import org.apache.flink.connector.base.source.reader.fetcher.SplitFetcher; +import org.apache.flink.connector.base.source.reader.fetcher.SplitFetcherTask; +import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; +import org.apache.flink.connector.base.source.reader.synchronization.FutureCompletingBlockingQueue; +import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.clients.consumer.OffsetCommitCallback; +import org.apache.kafka.common.TopicPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Collection; +import java.util.Map; +import java.util.function.Consumer; +import java.util.function.Supplier; + +/** + * The SplitFetcherManager for Kafka source. This class is needed to help commit the offsets to + * Kafka using the KafkaConsumer inside the {@link KafkaPartitionSplitReader}. + */ +public class KafkaSourceFetcherManager + extends SingleThreadFetcherManager, KafkaPartitionSplit> { + private static final Logger LOG = LoggerFactory.getLogger(KafkaSourceFetcherManager.class); + + /** + * Creates a new SplitFetcherManager with a single I/O threads. + * + * @param elementsQueue The queue that is used to hand over data from the I/O thread (the + * fetchers) to the reader (which emits the records and book-keeps the state. This must be the + * same queue instance that is also passed to the {@link SourceReaderBase}. 
+ * @param splitReaderSupplier The factory for the split reader that connects to the source system. + * @param splitFinishedHook Hook for handling finished splits in split fetchers. + */ + public KafkaSourceFetcherManager( + FutureCompletingBlockingQueue>> + elementsQueue, + Supplier, KafkaPartitionSplit>> + splitReaderSupplier, + Consumer> splitFinishedHook, + Configuration configuration) { + super(elementsQueue, splitReaderSupplier, configuration, splitFinishedHook); + } + + public void commitOffsets( + Map offsetsToCommit, OffsetCommitCallback callback) { + LOG.debug("Committing offsets {}", offsetsToCommit); + if (offsetsToCommit.isEmpty()) { + return; + } + SplitFetcher, KafkaPartitionSplit> splitFetcher = + fetchers.get(0); + if (splitFetcher != null) { + // The fetcher thread is still running. This should be the majority of the cases. + enqueueOffsetsCommitTask(splitFetcher, offsetsToCommit, callback); + } else { + splitFetcher = createSplitFetcher(); + enqueueOffsetsCommitTask(splitFetcher, offsetsToCommit, callback); + startFetcher(splitFetcher); + } + } + + private void enqueueOffsetsCommitTask( + SplitFetcher, KafkaPartitionSplit> splitFetcher, + Map offsetsToCommit, + OffsetCommitCallback callback) { + KafkaPartitionSplitReader kafkaReader = + (KafkaPartitionSplitReader) splitFetcher.getSplitReader(); + + splitFetcher.enqueueTask( + new SplitFetcherTask() { + @Override + public boolean run() throws IOException { + kafkaReader.notifyCheckpointComplete(offsetsToCommit, callback); + return true; + } + + @Override + public void wakeUp() {} + }); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSourceReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSourceReader.java new file mode 100644 index 0000000000..706d163da9 --- /dev/null +++ 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSourceReader.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.internals; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.connector.base.source.reader.RecordEmitter; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.SingleThreadMultiplexSourceReaderBase; +import org.apache.flink.connector.base.source.reader.synchronization.FutureCompletingBlockingQueue; +import org.apache.flink.connector.kafka.source.KafkaSourceOptions; +import org.apache.flink.connector.kafka.source.metrics.KafkaSourceReaderMetrics; +import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; +import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplitState; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import 
org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.common.TopicPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; + +/** The source reader for Kafka partitions. */ +@Internal +public class KafkaSourceReader + extends SingleThreadMultiplexSourceReaderBase< + ConsumerRecord, T, KafkaPartitionSplit, KafkaPartitionSplitState> { + private static final Logger LOG = LoggerFactory.getLogger(KafkaSourceReader.class); + // These maps need to be concurrent because it will be accessed by both the main thread + // and the split fetcher thread in the callback. + private final SortedMap> offsetsToCommit; + private final ConcurrentMap offsetsOfFinishedSplits; + private final KafkaSourceReaderMetrics kafkaSourceReaderMetrics; + private final boolean commitOffsetsOnCheckpoint; + + public KafkaSourceReader( + FutureCompletingBlockingQueue>> + elementsQueue, + KafkaSourceFetcherManager kafkaSourceFetcherManager, + RecordEmitter, T, KafkaPartitionSplitState> recordEmitter, + Configuration config, + SourceReaderContext context, + KafkaSourceReaderMetrics kafkaSourceReaderMetrics) { + super(elementsQueue, kafkaSourceFetcherManager, recordEmitter, config, context); + this.offsetsToCommit = Collections.synchronizedSortedMap(new TreeMap<>()); + this.offsetsOfFinishedSplits = new ConcurrentHashMap<>(); + this.kafkaSourceReaderMetrics = kafkaSourceReaderMetrics; + this.commitOffsetsOnCheckpoint = config.get(KafkaSourceOptions.COMMIT_OFFSETS_ON_CHECKPOINT); + if (!commitOffsetsOnCheckpoint) { + LOG.warn( + "Offset commit on checkpoint is disabled. 
Consuming offset will not be reported back to Kafka cluster."); + } + } + + @Override + protected void onSplitFinished(Map finishedSplitIds) { + finishedSplitIds.forEach( + (ignored, splitState) -> { + if (splitState.getCurrentOffset() >= 0) { + offsetsOfFinishedSplits.put( + splitState.getTopicPartition(), + new OffsetAndMetadata(splitState.getCurrentOffset())); + } + }); + } + + @Override + public List snapshotState(long checkpointId) { + List splits = super.snapshotState(checkpointId); + if (!commitOffsetsOnCheckpoint) { + return splits; + } + + if (splits.isEmpty() && offsetsOfFinishedSplits.isEmpty()) { + offsetsToCommit.put(checkpointId, Collections.emptyMap()); + } else { + Map offsetsMap = + offsetsToCommit.computeIfAbsent(checkpointId, id -> new HashMap<>()); + // Put the offsets of the active splits. + for (KafkaPartitionSplit split : splits) { + // If the checkpoint is triggered before the partition starting offsets + // is retrieved, do not commit the offsets for those partitions. + if (split.getStartingOffset() >= 0) { + offsetsMap.put( + split.getTopicPartition(), new OffsetAndMetadata(split.getStartingOffset())); + } + } + // Put offsets of all the finished splits. + offsetsMap.putAll(offsetsOfFinishedSplits); + } + return splits; + } + + @Override + public void notifyCheckpointComplete(long checkpointId) throws Exception { + LOG.debug("Committing offsets for checkpoint {}", checkpointId); + if (!commitOffsetsOnCheckpoint) { + return; + } + + Map committedPartitions = offsetsToCommit.get(checkpointId); + if (committedPartitions == null) { + LOG.debug( + "Offsets for checkpoint {} either do not exist or have already been committed.", + checkpointId); + return; + } + + ((KafkaSourceFetcherManager) splitFetcherManager) + .commitOffsets( + committedPartitions, + (ignored, e) -> { + // The offset commit here is needed by the external monitoring. It won't + // break Flink job's correctness if we fail to commit the offset here. 
+ if (e != null) { + kafkaSourceReaderMetrics.recordFailedCommit(); + LOG.warn("Failed to commit consumer offsets for checkpoint {}", checkpointId, e); + } else { + LOG.debug("Successfully committed offsets for checkpoint {}", checkpointId); + kafkaSourceReaderMetrics.recordSucceededCommit(); + // If the finished topic partition has been committed, we remove it + // from the offsets of the finished splits map. + committedPartitions.forEach( + (tp, offset) -> + kafkaSourceReaderMetrics.recordCommittedOffset(tp, offset.offset())); + offsetsOfFinishedSplits + .entrySet() + .removeIf(entry -> committedPartitions.containsKey(entry.getKey())); + while (!offsetsToCommit.isEmpty() && offsetsToCommit.firstKey() <= checkpointId) { + offsetsToCommit.remove(offsetsToCommit.firstKey()); + } + } + }); + } + + @Override + protected KafkaPartitionSplitState initializedState(KafkaPartitionSplit split) { + return new KafkaPartitionSplitState(split); + } + + @Override + protected KafkaPartitionSplit toSplitType(String splitId, KafkaPartitionSplitState splitState) { + return splitState.toKafkaPartitionSplit(); + } + + // ------------------------ + + public SortedMap> getOffsetsToCommit() { + return offsetsToCommit; + } + + @VisibleForTesting + int getNumAliveFetchers() { + return splitFetcherManager.getNumAliveFetchers(); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/metrics/KafkaConsumerMetricConstants.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/metrics/KafkaConsumerMetricConstants.java new file mode 100644 index 0000000000..103392af21 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/metrics/KafkaConsumerMetricConstants.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.internals.metrics; + +import org.apache.flink.annotation.Internal; + +/** + * A collection of Kafka consumer metrics related constant strings. + * + *

The names must not be changed, as that would break backward compatibility for the consumer's + * metrics. + */ +@Internal +public class KafkaConsumerMetricConstants { + + public static final String KAFKA_LATENCY_METRIC_NAME = "kafkaLatency"; +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/ChangeLogDataIterator.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/ChangeLogDataIterator.java new file mode 100644 index 0000000000..05d2df58d2 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/ChangeLogDataIterator.java @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package org.apache.amoro.flink.read.source;

import static org.apache.amoro.data.ChangeAction.DELETE;
import static org.apache.amoro.data.ChangeAction.INSERT;
import static org.apache.amoro.data.ChangeAction.UPDATE_AFTER;
import static org.apache.amoro.data.ChangeAction.UPDATE_BEFORE;

import org.apache.amoro.data.ChangeAction;
import org.apache.amoro.scan.MixedFileScanTask;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.function.Function;

/**
 * Replays the change log appended to a mixed-format change table in order, by merging an
 * insert-file stream and a delete-file stream into a single ordered stream of change events.
 *
 * <p>When the buffered delete and insert records share the same file offset, they are emitted as
 * an UPDATE_BEFORE / UPDATE_AFTER pair; otherwise whichever record has the smaller offset is
 * emitted first, carrying its own change action.
 *
 * @param <T> the output record type
 */
public class ChangeLogDataIterator<T> extends DataIterator<T> {
  /** Iterator over the insert files of the change table. */
  private final DataIterator<T> insertDataIterator;
  /** Iterator over the delete files; stays empty when there are no delete tasks. */
  private DataIterator<T> deleteDataIterator = empty();

  private final Function<T, T> mixedFormatMetaColumnRemover;
  private final Function<ChangeActionTrans<T>, T> changeActionTransformer;

  /** Single-slot buffers holding the next pending record of each stream. */
  private final QueueHolder<T> insertHolder = new QueueHolder<>();

  private final QueueHolder<T> deleteHolder = new QueueHolder<>();

  public ChangeLogDataIterator(
      FileScanTaskReader<T> fileScanTaskReader,
      Collection<MixedFileScanTask> insertTasks,
      Collection<MixedFileScanTask> deleteTasks,
      Function<T, Long> mixedFormatFileOffsetGetter,
      Function<T, T> mixedFormatMetaColumnRemover,
      Function<ChangeActionTrans<T>, T> changeActionTransformer) {
    // The parent iterator itself carries no tasks; all reading is delegated to the two
    // stream-specific iterators below.
    super(
        fileScanTaskReader,
        Collections.emptyList(),
        mixedFormatFileOffsetGetter,
        mixedFormatMetaColumnRemover);
    this.insertDataIterator =
        new DataIterator<>(
            fileScanTaskReader,
            insertTasks,
            mixedFormatFileOffsetGetter,
            mixedFormatMetaColumnRemover);
    if (deleteTasks != null && !deleteTasks.isEmpty()) {
      this.deleteDataIterator =
          new DataIterator<>(
              fileScanTaskReader,
              deleteTasks,
              mixedFormatFileOffsetGetter,
              mixedFormatMetaColumnRemover);
    }
    this.mixedFormatMetaColumnRemover = mixedFormatMetaColumnRemover;
    this.changeActionTransformer = changeActionTransformer;
  }

  /**
   * Positions both underlying iterators; each stream keeps its own (file, record) offset pair.
   */
  public void seek(
      int startingInsertFileOffset,
      int startingDeleteFileOffset,
      long startingInsertRecordOffset,
      long startingDeleteRecordOffset) {
    insertDataIterator.seek(startingInsertFileOffset, startingInsertRecordOffset);
    deleteDataIterator.seek(startingDeleteFileOffset, startingDeleteRecordOffset);
  }

  @Override
  public void seek(int startingFileOffset, long startingRecordOffset) {
    // The single-offset form is ambiguous for a two-stream iterator; callers must use the
    // four-argument overload above.
    throw new UnsupportedOperationException(
        "This operation is not supported in change log data iterator.");
  }

  /** Refills one stream's buffer when it is empty and that stream still has records. */
  private void loadQueueHolder(boolean insert) {
    DataIterator<T> source = insert ? insertDataIterator : deleteDataIterator;
    QueueHolder<T> holder = insert ? insertHolder : deleteHolder;
    if (source.hasNext() && holder.isEmpty()) {
      T pending = source.next();
      long pendingOffset = source.currentMixedFormatFileOffset();
      holder.put(pending, insert ? INSERT : DELETE, pendingOffset);
    }
  }

  @Override
  public boolean hasNext() {
    loadQueueHolder(false);
    loadQueueHolder(true);

    return deleteHolder.isNotEmpty() || insertHolder.isNotEmpty();
  }

  @Override
  public boolean currentFileHasNext() {
    return deleteDataIterator.currentFileHasNext()
        || insertDataIterator.currentFileHasNext()
        || deleteHolder.isNotEmpty()
        || insertHolder.isNotEmpty();
  }

  @Override
  public T next() {
    final T emitted;
    if (deleteHolder.isEmpty() && insertHolder.isNotEmpty()) {
      emitted = take(insertHolder);
    } else if (deleteHolder.isNotEmpty() && insertHolder.isEmpty()) {
      emitted = take(deleteHolder);
    } else if (deleteHolder.equalTo(insertHolder)) {
      // Same offset: this delete/insert pair encodes an update. Emit the delete side as
      // UPDATE_BEFORE now and retag the buffered insert so it is emitted as UPDATE_AFTER next.
      emitted =
          changeActionTransformer.apply(ChangeActionTrans.of(deleteHolder.nextRow, UPDATE_BEFORE));
      insertHolder.changeAction = UPDATE_AFTER;
      deleteHolder.clean();
    } else if (deleteHolder.lesser(insertHolder)) {
      emitted = take(deleteHolder);
    } else {
      emitted = take(insertHolder);
    }

    return mixedFormatMetaColumnRemover.apply(emitted);
  }

  /** Emits the buffered record of {@code holder} with its current change action, then clears it. */
  private T take(QueueHolder<T> holder) {
    T transformed =
        changeActionTransformer.apply(ChangeActionTrans.of(holder.nextRow, holder.changeAction));
    holder.clean();
    return transformed;
  }

  @Override
  public void close() throws IOException {
    insertDataIterator.close();
    deleteDataIterator.close();
  }

  public int insertFileOffset() {
    return insertDataIterator.fileOffset();
  }

  public long insertRecordOffset() {
    return insertDataIterator.recordOffset();
  }

  public int deleteFileOffset() {
    return deleteDataIterator.fileOffset();
  }

  public long deleteRecordOffset() {
    return deleteDataIterator.recordOffset();
  }

  /** Single-slot buffer for the next pending record of one stream; empty when nextRow is null. */
  private static class QueueHolder<T> {
    T nextRow;
    ChangeAction changeAction;
    Long nextOffset;

    public QueueHolder() {}

    boolean isEmpty() {
      return nextRow == null;
    }

    boolean isNotEmpty() {
      return nextRow != null;
    }

    public void put(T nextRow, ChangeAction changeAction, Long nextOffset) {
      this.nextRow = nextRow;
      this.changeAction = changeAction;
      this.nextOffset = nextOffset;
    }

    public T get() {
      return nextRow;
    }

    boolean lesser(QueueHolder<T> that) {
      return this.nextOffset.compareTo(that.nextOffset) < 0;
    }

    boolean equalTo(QueueHolder<T> that) {
      return this.nextOffset.compareTo(that.nextOffset) == 0;
    }

    void clean() {
      // changeAction is intentionally left as-is; emptiness is tracked via nextRow only.
      nextRow = null;
      nextOffset = null;
    }
  }

  /** Pairs a record with the change action it carries, for the transformer callback. */
  public static class ChangeActionTrans<T> {
    protected final T row;
    protected final ChangeAction changeAction;

    private ChangeActionTrans(T row, ChangeAction changeAction) {
      this.row = row;
      this.changeAction = changeAction;
    }

    public static <T> ChangeActionTrans<T> of(T row, ChangeAction changeAction) {
      return new ChangeActionTrans<>(row, changeAction);
    }

    public T row() {
      return row;
    }

    public ChangeAction changeAction() {
      return changeAction;
    }
  }
}
package org.apache.amoro.flink.read.source;

import org.apache.amoro.scan.MixedFileScanTask;
import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions;
import org.apache.flink.annotation.Internal;
import org.apache.iceberg.io.CloseableIterator;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.function.Function;

/**
 * Flink data iterator that reads a collection of {@link MixedFileScanTask}s sequentially and
 * exposes them as one {@link CloseableIterator}.
 *
 * @param <T> the output data type returned by this iterator
 */
@Internal
public class DataIterator<T> implements CloseableIterator<T> {

  private final FileScanTaskReader<T> fileScanTaskReader;
  private final int taskSize;

  private Iterator<MixedFileScanTask> tasks;
  private CloseableIterator<T> currentIterator;
  // Index of the file currently being read; -1 while still on the initial empty iterator.
  private int fileOffset;
  // Offset (within the current file) of the record that next() should return when called.
  private long recordOffset;
  // Offset of the last returned record inside the data file, as reported by fileOffsetGetter;
  // it is incremental within inserting and deleting files of the same tree-node group.
  private long currentFileOffset;
  private final Function<T, Long> fileOffsetGetter;
  private final Function<T, T> metaColumnRemover;

  /** Builds an iterator over no tasks; hasNext() is immediately false. */
  public DataIterator() {
    this(null, Collections.emptyList(), t -> Long.MIN_VALUE, t -> t);
  }

  public DataIterator(
      FileScanTaskReader<T> fileScanTaskReader,
      Collection<MixedFileScanTask> tasks,
      Function<T, Long> fileOffsetGetter,
      Function<T, T> metaColumnRemover) {
    this.fileScanTaskReader = fileScanTaskReader;
    this.tasks = tasks.iterator();
    this.taskSize = tasks.size();
    this.fileOffsetGetter = fileOffsetGetter;
    this.metaColumnRemover = metaColumnRemover;

    // Start from an empty iterator that does not belong to any split file; the real file
    // iterators are opened lazily by updateCurrentIterator().
    this.currentIterator = CloseableIterator.empty();
    this.fileOffset = -1;
    this.recordOffset = 0L;
    this.currentFileOffset = 0L;
  }

  /**
   * Resumes reading at position (startingFileOffset, startingRecordOffset): the next call to
   * next() returns the record at that offset. E.g. seeking to (file=0, record=1) makes next()
   * return the second row of file 0. Must be called before any other iterator action.
   */
  public void seek(int startingFileOffset, long startingRecordOffset) {
    if (taskSize == 0) {
      // No files at all: nothing to position.
      return;
    }
    Preconditions.checkState(
        fileOffset == -1, "Seek should be called before any other iterator actions");
    Preconditions.checkState(
        startingFileOffset < taskSize,
        "Invalid starting file offset %s for combined scan task with %s files.",
        startingFileOffset,
        taskSize);
    // Drop every file before the starting one.
    for (int skipped = 0; skipped < startingFileOffset; skipped++) {
      tasks.next();
    }

    updateCurrentIterator();
    // Drop records within the starting file until the requested record offset is reached.
    for (long skipped = 0; skipped < startingRecordOffset; skipped++) {
      if (currentFileHasNext() && hasNext()) {
        next();
      } else {
        throw new IllegalStateException(
            String.format(
                "Invalid starting record offset %d for file %d from FileScanTask List.",
                startingRecordOffset, startingFileOffset));
      }
    }

    fileOffset = startingFileOffset;
    recordOffset = startingRecordOffset;
  }

  @Override
  public boolean hasNext() {
    updateCurrentIterator();
    return currentIterator.hasNext();
  }

  @Override
  public T next() {
    updateCurrentIterator();
    recordOffset += 1;
    T row = currentIterator.next();
    currentFileOffset = fileOffsetGetter.apply(row);
    return metaColumnRemover.apply(row);
  }

  public boolean currentFileHasNext() {
    return currentIterator.hasNext();
  }

  /** Advances to the next file whenever the current per-file iterator is exhausted. */
  private void updateCurrentIterator() {
    try {
      while (!currentIterator.hasNext() && tasks.hasNext()) {
        currentIterator.close();
        currentIterator = openTaskIterator(tasks.next());
        fileOffset += 1;
        recordOffset = 0L;
      }
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
  }

  private CloseableIterator<T> openTaskIterator(MixedFileScanTask scanTask) {
    return fileScanTaskReader.open(scanTask);
  }

  @Override
  public void close() throws IOException {
    // Only the live per-file iterator holds resources; unopened tasks need no cleanup.
    currentIterator.close();
    tasks = null;
  }

  public int fileOffset() {
    return fileOffset;
  }

  public long recordOffset() {
    return recordOffset;
  }

  public long currentMixedFormatFileOffset() {
    return currentFileOffset;
  }

  static <T> DataIterator<T> empty() {
    return new EmptyIterator<>();
  }

  /** Always-empty iterator; seek is a no-op so callers may seek it unconditionally. */
  private static class EmptyIterator<T> extends DataIterator<T> {

    public EmptyIterator() {
      super(null, Collections.emptyList(), t -> Long.MIN_VALUE, t -> t);
    }

    @Override
    public boolean hasNext() {
      return false;
    }

    @Override
    public T next() {
      throw new NoSuchElementException();
    }

    @Override
    public void seek(int startingFileOffset, long startingRecordOffset) {}
  }
}
package org.apache.amoro.flink.read.source;

import org.apache.flink.annotation.Internal;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.io.CloseableIterator;

import java.io.Serializable;

/**
 * Serializable reader that turns a single {@link FileScanTask} into a {@link CloseableIterator}
 * of records.
 *
 * @param <T> the output data type produced by the returned iterator
 */
@Internal
public interface FileScanTaskReader<T> extends Serializable {

  /** Opens the given scan task; the returned iterator must be closed by the caller. */
  CloseableIterator<T> open(FileScanTask fileScanTask);
}
package org.apache.amoro.flink.read.source;

import org.apache.amoro.flink.read.AdaptHiveFlinkParquetReaders;
import org.apache.amoro.hive.io.reader.AbstractAdaptHiveKeyedDataReader;
import org.apache.amoro.io.AuthenticatedFileIO;
import org.apache.amoro.table.PrimaryKeySpec;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.RowType;
import org.apache.iceberg.Schema;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.flink.FlinkSchemaUtil;
import org.apache.iceberg.flink.RowDataWrapper;
import org.apache.iceberg.flink.data.FlinkOrcReader;
import org.apache.iceberg.orc.OrcRowReader;
import org.apache.iceberg.parquet.ParquetValueReader;
import org.apache.iceberg.types.Type;
import org.apache.orc.TypeDescription;
import org.apache.parquet.schema.MessageType;

import java.util.Map;
import java.util.function.BiFunction;
import java.util.function.Function;

/**
 * Merge-on-read data reader for keyed mixed-format tables that produces Flink {@link RowData},
 * plugging the Flink Parquet/ORC value readers into the generic adapt-hive keyed reader.
 */
public class FlinkKeyedMORDataReader extends AbstractAdaptHiveKeyedDataReader<RowData> {

  public FlinkKeyedMORDataReader(
      AuthenticatedFileIO fileIO,
      Schema tableSchema,
      Schema projectedSchema,
      PrimaryKeySpec primaryKeySpec,
      String nameMapping,
      boolean caseSensitive,
      BiFunction<Type, Object, Object> convertConstant,
      boolean reuseContainer) {
    super(
        fileIO,
        tableSchema,
        projectedSchema,
        primaryKeySpec,
        nameMapping,
        caseSensitive,
        convertConstant,
        reuseContainer);
  }

  /** Supplies the Flink-specific Parquet value-reader factory for the given projection. */
  @Override
  protected Function<MessageType, ParquetValueReader<?>> getParquetReaderFunction(
      Schema projectSchema, Map<Integer, ?> idToConstant) {
    return fileSchema ->
        AdaptHiveFlinkParquetReaders.buildReader(projectSchema, fileSchema, idToConstant);
  }

  /** Supplies the Flink-specific ORC row-reader factory for the given projection. */
  @Override
  protected Function<TypeDescription, OrcRowReader<?>> getOrcReaderFunction(
      Schema projectSchema, Map<Integer, ?> idToConstant) {
    return fileSchema -> new FlinkOrcReader(projectSchema, fileSchema, idToConstant);
  }

  /** Wraps RowData records as Iceberg {@link StructLike} for key comparison/filtering. */
  @Override
  protected Function<Schema, Function<RowData, StructLike>> toStructLikeFunction() {
    return schema -> {
      RowType rowType = FlinkSchemaUtil.convert(schema);
      return new RowDataWrapper(rowType, schema.asStruct())::wrap;
    };
  }
}
package org.apache.amoro.flink.read.source;

import org.apache.amoro.data.DataTreeNode;
import org.apache.amoro.flink.read.AdaptHiveFlinkParquetReaders;
import org.apache.amoro.hive.io.reader.AbstractAdaptHiveUnkeyedDataReader;
import org.apache.amoro.io.AuthenticatedFileIO;
import org.apache.amoro.io.reader.DeleteFilter;
import org.apache.amoro.scan.MixedFileScanTask;
import org.apache.amoro.table.PrimaryKeySpec;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.RowType;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Schema;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.flink.FlinkSchemaUtil;
import org.apache.iceberg.flink.RowDataWrapper;
import org.apache.iceberg.flink.data.FlinkOrcReader;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.CloseableIterator;
import org.apache.iceberg.orc.OrcRowReader;
import org.apache.iceberg.parquet.ParquetValueReader;
import org.apache.iceberg.types.Type;
import org.apache.orc.TypeDescription;
import org.apache.parquet.schema.MessageType;

import java.util.Map;
import java.util.Set;
import java.util.function.BiFunction;
import java.util.function.Function;

/**
 * Mixed-format table reader that accepts a {@link FileScanTask} and produces a
 * {@link CloseableIterator} of {@link RowData}. The rows read from this reader may carry more
 * columns than the original schema: {@link DeleteFilter} appends extra columns after the original
 * ones, so the output must be projected before being sent downstream — this can be handled in
 * {@link DataIterator#next()}.
 */
public class FlinkUnkyedDataReader extends AbstractAdaptHiveUnkeyedDataReader<RowData>
    implements FileScanTaskReader<RowData> {
  private static final long serialVersionUID = -6773693031945244386L;

  public FlinkUnkyedDataReader(
      AuthenticatedFileIO fileIO,
      Schema tableSchema,
      Schema projectedSchema,
      String nameMapping,
      boolean caseSensitive,
      BiFunction<Type, Object, Object> convertConstant,
      boolean reuseContainer) {
    super(
        fileIO,
        tableSchema,
        projectedSchema,
        nameMapping,
        caseSensitive,
        convertConstant,
        reuseContainer);
  }

  public FlinkUnkyedDataReader(
      AuthenticatedFileIO fileIO,
      Schema tableSchema,
      Schema projectedSchema,
      PrimaryKeySpec primaryKeySpec,
      String nameMapping,
      boolean caseSensitive,
      BiFunction<Type, Object, Object> convertConstant,
      Set<DataTreeNode> sourceNodes,
      boolean reuseContainer) {
    super(
        fileIO,
        tableSchema,
        projectedSchema,
        primaryKeySpec,
        nameMapping,
        caseSensitive,
        convertConstant,
        sourceNodes,
        reuseContainer);
  }

  /** Supplies the Flink-specific Parquet value-reader factory for the given projection. */
  @Override
  protected Function<MessageType, ParquetValueReader<?>> getParquetReaderFunction(
      Schema projectedSchema, Map<Integer, ?> idToConstant) {
    return fileSchema ->
        AdaptHiveFlinkParquetReaders.buildReader(projectedSchema, fileSchema, idToConstant);
  }

  /** Supplies the Flink-specific ORC row-reader factory for the given projection. */
  @Override
  protected Function<TypeDescription, OrcRowReader<?>> getOrcReaderFunction(
      Schema projectSchema, Map<Integer, ?> idToConstant) {
    return fileSchema -> new FlinkOrcReader(projectSchema, fileSchema, idToConstant);
  }

  /** Wraps RowData records as Iceberg {@link StructLike} for delete matching/filtering. */
  @Override
  protected Function<Schema, Function<RowData, StructLike>> toStructLikeFunction() {
    return schema -> {
      RowType rowType = FlinkSchemaUtil.convert(schema);
      return new RowDataWrapper(rowType, schema.asStruct())::wrap;
    };
  }

  @Override
  public CloseableIterator<RowData> open(FileScanTask fileScanTask) {
    MixedFileScanTask mixedFileScanTask = (MixedFileScanTask) fileScanTask;
    CloseableIterable<RowData> rows = readData(mixedFileScanTask);
    // Open the iterator inside the authenticated context of the table's file IO.
    return fileIO.doAs(rows::iterator);
  }
}
package org.apache.amoro.flink.read.source;

import org.apache.amoro.io.AuthenticatedFileIO;
import org.apache.amoro.scan.KeyedTableScanTask;
import org.apache.flink.table.data.RowData;
import org.apache.flink.util.Preconditions;
import org.apache.iceberg.io.CloseableIterator;

import java.io.IOException;

/**
 * Iterator that reads a single {@link KeyedTableScanTask} in a merge-on-read (MOR) way while
 * tracking file and record offsets for checkpoint/restore. Every interaction with the underlying
 * iterator is executed inside the authenticated context of the table's {@link
 * AuthenticatedFileIO}.
 */
public class MergeOnReadDataIterator extends DataIterator<RowData> {
  // Offsets reported to the framework; one MOR task is treated as a single logical "file".
  private int fileOffset;
  private long recordOffset;
  private final CloseableIterator<RowData> iterator;

  public MergeOnReadDataIterator(
      FlinkKeyedMORDataReader flinkKeyedMORDataReader,
      KeyedTableScanTask keyedTableScanTask,
      AuthenticatedFileIO io) {
    super();
    this.iterator =
        IteratorWithIO.of(io, io.doAs(() -> flinkKeyedMORDataReader.readData(keyedTableScanTask)));
  }

  /**
   * Skips {@code startingRecordOffset} records so that the next call to next() resumes from the
   * checkpointed position. {@code startingFileOffset} is only recorded, not used for skipping,
   * because each MOR task reads exactly one merged stream.
   *
   * @throws IllegalStateException if the stream holds fewer records than the requested offset
   */
  @Override
  public void seek(int startingFileOffset, long startingRecordOffset) {
    Preconditions.checkNotNull(iterator, "iterator is null in the MergeOnReadDataIterator.");
    // skip records within the file
    for (long i = 0; i < startingRecordOffset; ++i) {
      if (hasNext()) {
        next();
      } else {
        throw new IllegalStateException(
            String.format(
                "Invalid starting record offset %d for file %d from KeyedTableScanTask.",
                startingRecordOffset, startingFileOffset));
      }
    }
    this.fileOffset = startingFileOffset;
    this.recordOffset = startingRecordOffset;
  }

  @Override
  public boolean hasNext() {
    return iterator.hasNext();
  }

  @Override
  public RowData next() {
    return iterator.next();
  }

  // Fix: this overrides DataIterator#currentFileHasNext() and was missing @Override,
  // unlike every other overriding method in this class.
  @Override
  public boolean currentFileHasNext() {
    return iterator.hasNext();
  }

  @Override
  public int fileOffset() {
    return fileOffset;
  }

  @Override
  public long recordOffset() {
    return recordOffset;
  }

  @Override
  public void close() throws IOException {
    // close the current iterator
    if (iterator != null) {
      iterator.close();
    }
  }

  /**
   * Decorates a {@link CloseableIterator} so that hasNext/next/close all run within the
   * authenticated context of the given file IO.
   */
  static class IteratorWithIO implements CloseableIterator<RowData> {
    private final AuthenticatedFileIO io;
    private final CloseableIterator<RowData> iterator;

    private IteratorWithIO(AuthenticatedFileIO io, CloseableIterator<RowData> iterator) {
      this.io = io;
      this.iterator = iterator;
    }

    static IteratorWithIO of(AuthenticatedFileIO io, CloseableIterator<RowData> iterator) {
      Preconditions.checkNotNull(io);
      // Fix: fail fast on a null delegate instead of deferring the NPE to the first
      // hasNext()/next()/close() call.
      Preconditions.checkNotNull(iterator);
      return new IteratorWithIO(io, iterator);
    }

    @Override
    public void close() throws IOException {
      io.doAs(
          () -> {
            iterator.close();
            return null;
          });
    }

    @Override
    public boolean hasNext() {
      return io.doAs(iterator::hasNext);
    }

    @Override
    public RowData next() {
      return io.doAs(iterator::next);
    }
  }
}
package org.apache.amoro.flink.read.source;

import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING;

import org.apache.amoro.flink.table.descriptors.MixedFormatValidator;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.TimeUtils;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.flink.FlinkConfigOptions;
import org.apache.iceberg.flink.FlinkReadOptions;
import org.apache.iceberg.flink.source.ScanContext;
import org.apache.iceberg.flink.source.StreamingStartingStrategy;

import java.io.Serializable;
import java.time.Duration;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.TimeUnit;

/**
 * Scan context for mixed-format sources. Extends the Iceberg Flink {@link ScanContext} with the
 * mixed-format specific scan-startup mode ("earliest"/"latest") and an explicit batch-mode flag.
 */
public class MixedFormatScanContext extends ScanContext implements Serializable {

  private static final long serialVersionUID = 1L;

  // Lower-cased and validated against the MixedFormatValidator constants in Builder#build().
  private final String scanStartupMode;
  private final boolean batchMode;

  protected MixedFormatScanContext(Builder builder) {
    super(
        builder.caseSensitive,
        builder.snapshotId,
        builder.startingStrategy,
        builder.startSnapshotTimestamp,
        builder.startSnapshotId,
        builder.endSnapshotId,
        builder.asOfTimestamp,
        builder.splitSize,
        builder.splitLookback,
        builder.splitOpenFileCost,
        builder.isStreaming,
        builder.monitorInterval,
        builder.nameMapping,
        builder.projectedSchema,
        builder.filters,
        builder.limit,
        builder.includeColumnStats,
        builder.includeStatsForColumns,
        builder.exposeLocality,
        builder.planParallelism,
        builder.maxPlanningSnapshotCount,
        builder.maxAllowedPlanningFailures,
        builder.watermarkColumn,
        builder.watermarkColumnTimeUnit,
        builder.branch,
        builder.tag,
        builder.startTag,
        builder.endTag);
    this.scanStartupMode = builder.scanStartupMode;
    this.batchMode = builder.batchMode;
  }

  public boolean caseSensitive() {
    return caseSensitive;
  }

  public Long snapshotId() {
    return snapshotId;
  }

  public Long startSnapshotId() {
    return startSnapshotId;
  }

  public Long endSnapshotId() {
    return endSnapshotId;
  }

  public Long asOfTimestamp() {
    return asOfTimestamp;
  }

  public Long splitSize() {
    return splitSize;
  }

  public Integer splitLookback() {
    return splitLookback;
  }

  public Long splitOpenFileCost() {
    return splitOpenFileCost;
  }

  public boolean isStreaming() {
    return isStreaming;
  }

  public Duration monitorInterval() {
    return monitorInterval;
  }

  public String nameMapping() {
    return nameMapping;
  }

  public Schema project() {
    return schema;
  }

  /** Only working for base store right now. */
  public List<Expression> filters() {
    return filters;
  }

  public long limit() {
    return limit;
  }

  public static Builder contextBuilder() {
    return new Builder();
  }

  public String scanStartupMode() {
    return scanStartupMode;
  }

  public boolean isBatchMode() {
    return batchMode;
  }

  /** Fluent builder; defaults mirror the Iceberg {@link FlinkReadOptions} option defaults. */
  public static class Builder {
    private boolean caseSensitive = FlinkReadOptions.CASE_SENSITIVE_OPTION.defaultValue();
    private Long snapshotId = FlinkReadOptions.SNAPSHOT_ID.defaultValue();
    private StreamingStartingStrategy startingStrategy =
        FlinkReadOptions.STARTING_STRATEGY_OPTION.defaultValue();
    private Long startSnapshotTimestamp = FlinkReadOptions.START_SNAPSHOT_TIMESTAMP.defaultValue();
    private Long startSnapshotId = FlinkReadOptions.START_SNAPSHOT_ID.defaultValue();
    private Long endSnapshotId = FlinkReadOptions.END_SNAPSHOT_ID.defaultValue();
    private Long asOfTimestamp = FlinkReadOptions.AS_OF_TIMESTAMP.defaultValue();
    private Long splitSize = FlinkReadOptions.SPLIT_SIZE_OPTION.defaultValue();
    private Integer splitLookback = FlinkReadOptions.SPLIT_LOOKBACK_OPTION.defaultValue();
    private Long splitOpenFileCost = FlinkReadOptions.SPLIT_FILE_OPEN_COST_OPTION.defaultValue();
    private boolean isStreaming = FlinkReadOptions.STREAMING_OPTION.defaultValue();
    private Duration monitorInterval =
        TimeUtils.parseDuration(FlinkReadOptions.MONITOR_INTERVAL_OPTION.defaultValue());
    private String nameMapping;
    private Schema projectedSchema;
    private List<Expression> filters;
    private long limit = FlinkReadOptions.LIMIT_OPTION.defaultValue();
    private boolean includeColumnStats =
        FlinkReadOptions.INCLUDE_COLUMN_STATS_OPTION.defaultValue();
    private Collection<String> includeStatsForColumns = null;
    private boolean exposeLocality;
    private Integer planParallelism =
        FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue();
    private int maxPlanningSnapshotCount = MAX_PLANNING_SNAPSHOT_COUNT.defaultValue();

    private int maxAllowedPlanningFailures =
        FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION.defaultValue();
    private String watermarkColumn = FlinkReadOptions.WATERMARK_COLUMN_OPTION.defaultValue();
    private TimeUnit watermarkColumnTimeUnit =
        FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT_OPTION.defaultValue();
    private String branch = FlinkReadOptions.BRANCH.defaultValue();

    private String tag = FlinkReadOptions.TAG.defaultValue();

    private String startTag = FlinkReadOptions.START_TAG.defaultValue();

    private String endTag = FlinkReadOptions.END_TAG.defaultValue();
    private String scanStartupMode;
    private boolean batchMode = false;

    private Builder() {}

    public Builder caseSensitive(boolean caseSensitive) {
      this.caseSensitive = caseSensitive;
      return this;
    }

    public Builder useSnapshotId(Long snapshotId) {
      this.snapshotId = snapshotId;
      return this;
    }

    public Builder useTag(String tag) {
      this.tag = tag;
      return this;
    }

    public Builder useBranch(String branch) {
      this.branch = branch;
      return this;
    }

    public Builder startingStrategy(StreamingStartingStrategy startingStrategy) {
      this.startingStrategy = startingStrategy;
      return this;
    }

    public Builder startSnapshotTimestamp(Long startSnapshotTimestamp) {
      this.startSnapshotTimestamp = startSnapshotTimestamp;
      return this;
    }

    public Builder startSnapshotId(Long startSnapshotId) {
      this.startSnapshotId = startSnapshotId;
      return this;
    }

    public Builder endSnapshotId(Long endSnapshotId) {
      this.endSnapshotId = endSnapshotId;
      return this;
    }

    public Builder startTag(String startTag) {
      this.startTag = startTag;
      return this;
    }

    public Builder endTag(String endTag) {
      this.endTag = endTag;
      return this;
    }

    public Builder asOfTimestamp(Long asOfTimestamp) {
      this.asOfTimestamp = asOfTimestamp;
      return this;
    }

    public Builder splitSize(Long splitSize) {
      this.splitSize = splitSize;
      return this;
    }

    public Builder splitLookback(Integer splitLookback) {
      this.splitLookback = splitLookback;
      return this;
    }

    public Builder splitOpenFileCost(Long splitOpenFileCost) {
      this.splitOpenFileCost = splitOpenFileCost;
      return this;
    }

    public Builder streaming(boolean streaming) {
      this.isStreaming = streaming;
      return this;
    }

    public Builder monitorInterval(Duration monitorInterval) {
      this.monitorInterval = monitorInterval;
      return this;
    }

    public Builder nameMapping(String nameMapping) {
      this.nameMapping = nameMapping;
      return this;
    }

    public Builder project(Schema projectedSchema) {
      this.projectedSchema = projectedSchema;
      return this;
    }

    public Builder filters(List<Expression> filters) {
      this.filters = filters;
      return this;
    }

    public Builder limit(long limit) {
      this.limit = limit;
      return this;
    }

    public Builder exposeLocality(boolean exposeLocality) {
      this.exposeLocality = exposeLocality;
      return this;
    }

    public Builder planParallelism(Integer parallelism) {
      this.planParallelism = parallelism;
      return this;
    }

    public Builder maxPlanningSnapshotCount(int maxPlanningSnapshotCount) {
      this.maxPlanningSnapshotCount = maxPlanningSnapshotCount;
      return this;
    }

    Builder maxAllowedPlanningFailures(int maxAllowedPlanningFailures) {
      this.maxAllowedPlanningFailures = maxAllowedPlanningFailures;
      return this;
    }

    public Builder scanStartupMode(String scanStartupMode) {
      this.scanStartupMode = scanStartupMode;
      return this;
    }

    public Builder includeColumnStats(boolean includeColumnStats) {
      this.includeColumnStats = includeColumnStats;
      return this;
    }

    public Builder batchMode(boolean batchMode) {
      this.batchMode = batchMode;
      return this;
    }

    /** Populates the builder from table/connector properties via a Flink Configuration view. */
    public Builder fromProperties(Map<String, String> properties) {
      Configuration config = new Configuration();
      properties.forEach(config::setString);

      return this.useSnapshotId(config.get(SNAPSHOT_ID))
          .useTag(config.get(TAG))
          .useBranch(config.get(BRANCH))
          .startTag(config.get(START_TAG))
          .endTag(config.get(END_TAG))
          .caseSensitive(config.get(CASE_SENSITIVE))
          .asOfTimestamp(config.get(AS_OF_TIMESTAMP))
          .startingStrategy(config.get(STARTING_STRATEGY))
          .startSnapshotTimestamp(config.get(START_SNAPSHOT_TIMESTAMP))
          .startSnapshotId(config.get(START_SNAPSHOT_ID))
          .endSnapshotId(config.get(END_SNAPSHOT_ID))
          .splitSize(config.get(SPLIT_SIZE))
          .splitLookback(config.get(SPLIT_LOOKBACK))
          .splitOpenFileCost(config.get(SPLIT_FILE_OPEN_COST))
          .streaming(config.get(STREAMING))
          .monitorInterval(config.get(MONITOR_INTERVAL))
          .nameMapping(properties.get(DEFAULT_NAME_MAPPING))
          .scanStartupMode(properties.get(MixedFormatValidator.SCAN_STARTUP_MODE.key()))
          .includeColumnStats(config.get(INCLUDE_COLUMN_STATS))
          .maxPlanningSnapshotCount(config.get(MAX_PLANNING_SNAPSHOT_COUNT))
          // NOTE(review): this re-passes the builder's current field value, so the
          // max-allowed-planning-failures property (if any) is never read from `properties`
          // — confirm whether a config option should be consulted here.
          .maxAllowedPlanningFailures(maxAllowedPlanningFailures);
    }

    /**
     * Validates the scan-startup mode and the streaming/batch combination, then builds the
     * context.
     */
    public MixedFormatScanContext build() {
      scanStartupMode = scanStartupMode == null ? null : scanStartupMode.toLowerCase();
      Preconditions.checkArgument(
          Objects.isNull(scanStartupMode)
              || Objects.equals(scanStartupMode, MixedFormatValidator.SCAN_STARTUP_MODE_EARLIEST)
              || Objects.equals(scanStartupMode, MixedFormatValidator.SCAN_STARTUP_MODE_LATEST),
          String.format(
              "only support %s, %s when %s is %s",
              MixedFormatValidator.SCAN_STARTUP_MODE_EARLIEST,
              MixedFormatValidator.SCAN_STARTUP_MODE_LATEST,
              MixedFormatValidator.MIXED_FORMAT_READ_MODE,
              MixedFormatValidator.MIXED_FORMAT_READ_FILE));
      Preconditions.checkArgument(
          !(isStreaming && batchMode),
          String.format(
              "only support %s = false when execution.runtime-mode is batch", STREAMING.key()));
      return new MixedFormatScanContext(this);
    }
  }
}
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.source.log; + +import static org.apache.amoro.log.LogData.MAGIC_NUMBER; +import static org.apache.flink.util.Preconditions.checkArgument; +import static org.apache.flink.util.Preconditions.checkNotNull; + +import org.apache.amoro.flink.read.source.log.kafka.LogKafkaPartitionSplit; +import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.FlinkRuntimeException; +import org.apache.kafka.common.TopicPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Set; +import java.util.TreeMap; + +/** + * According to upstreamId and partition topic dealing with the flip message, when should begin to + * retract message and when to end it. + */ +public class LogSourceHelper implements Serializable { + private static final Logger LOG = LoggerFactory.getLogger(LogSourceHelper.class); + private static final long serialVersionUID = 1L; + + /** Record the topic partitions that are in retracting state. */ + private final Map retractingInfo; + /** + * Key: topic partition + "_" + upstream job id + "_" + epicNo, generated by {@link + * #combineTopicPartitionAndUpstreamIdAndEpicNo)} method. 
Value: epic start offset + */ + private final NavigableMap upstreamEpicStartOffsets; + + public LogSourceHelper() { + retractingInfo = new HashMap<>(); + upstreamEpicStartOffsets = new TreeMap<>(); + } + + public void initializedState(KafkaPartitionSplit s) { + if (!(s instanceof LogKafkaPartitionSplit)) { + return; + } + LogKafkaPartitionSplit split = (LogKafkaPartitionSplit) s; + if (split.isRetracting()) { + retractingInfo.put( + split.getTopicPartition(), + EpicRetractingInfo.of( + split.getRetractingEpicNo(), split.getRetractingUpstreamId(), + split.getRetractStopOffset(), split.getRevertStartOffset())); + } + Map upStreamEpicStartOffsets = split.getUpStreamEpicStartOffsets(); + + upStreamEpicStartOffsets.forEach( + (upstreamEpic, offset) -> { + String key = + combineTopicPartitionAndUpstreamIdAndEpicNo(split.getTopicPartition(), upstreamEpic); + upstreamEpicStartOffsets.putIfAbsent(key, offset); + }); + } + + /** + * Turn row kind of a row. + * + *

+   * +I -> -D
+   * -D -> +I
+   * -U -> +U
+   * +U -> -U
+   * 
+ * + * @param rowData Before reset row + * @return After reset row kind. + */ + public RowData turnRowKind(RowData rowData) { + switch (rowData.getRowKind()) { + case INSERT: + rowData.setRowKind(RowKind.DELETE); + break; + case DELETE: + rowData.setRowKind(RowKind.INSERT); + break; + case UPDATE_AFTER: + rowData.setRowKind(RowKind.UPDATE_BEFORE); + break; + case UPDATE_BEFORE: + rowData.setRowKind(RowKind.UPDATE_AFTER); + break; + default: + throw new FlinkRuntimeException("unKnown ChangeAction=" + rowData.getRowKind()); + } + LOG.debug("after retract a row, ChangeAction={}", rowData.getRowKind()); + return rowData; + } + + public Set getRetractTopicPartitions() { + return retractingInfo.keySet(); + } + + public EpicRetractingInfo getRetractInfo(TopicPartition topicPartition) { + EpicRetractingInfo info = retractingInfo.get(topicPartition); + if (info == null) { + throw new IllegalStateException( + String.format( + "the topic partition: %s, %d is not in retracting state", + topicPartition.topic(), topicPartition.partition())); + } + return info; + } + + public void suspendRetracting(TopicPartition tp) { + EpicRetractingInfo info = retractingInfo.remove(tp); + clearEpicStartOffsetsBeforeOrEqual(tp, info.upstreamId, info.epicNo); + } + + public void suspendRetracting(Collection tps) { + tps.forEach(this::suspendRetracting); + } + + /** + * clear the epic start offsets before or equal the epicNo in the topicPartition. 
+ * + * @param tp + * @param upstreamId + * @param epicNo + */ + public void clearEpicStartOffsetsBeforeOrEqual( + TopicPartition tp, String upstreamId, long epicNo) { + String key = combineTopicPartitionAndUpstreamIdAndEpicNo(tp, upstreamId, epicNo); + NavigableMap beforeOrEqual = upstreamEpicStartOffsets.headMap(key, true); + + String prefix = combineTopicPartitionAndUpstreamId(tp, upstreamId); + for (String s : beforeOrEqual.keySet()) { + if (!s.contains(prefix)) { + continue; + } + upstreamEpicStartOffsets.remove(s); + } + } + + /** + * @param revertStartingOffset the offset where job revert to normal read starts from. It should + * skip the flip which has been read. + */ + public void startRetracting( + TopicPartition tp, String upstreamId, long epicNo, long revertStartingOffset) { + String key = combineTopicPartitionAndUpstreamIdAndEpicNo(tp, upstreamId, epicNo); + if (!upstreamEpicStartOffsets.containsKey(key)) { + // data have not been read, so that it's unnecessary to retract + return; + } + long retractStoppingOffset = upstreamEpicStartOffsets.get(key); + + retractingInfo.put( + tp, + new EpicRetractingInfo(epicNo, upstreamId, retractStoppingOffset, revertStartingOffset)); + } + + public void initialEpicStartOffsetIfEmpty( + TopicPartition tp, String upstreamId, long epicNo, long startOffset) { + String key = combineTopicPartitionAndUpstreamIdAndEpicNo(tp, upstreamId, epicNo); + upstreamEpicStartOffsets.putIfAbsent(key, startOffset); + } + + private String combineTopicPartitionAndUpstreamIdAndEpicNo( + TopicPartition tp, String upstreamId, long epicNo) { + return combineTopicPartitionAndUpstreamId(tp, upstreamId) + "_" + epicNo; + } + + private String combineTopicPartitionAndUpstreamIdAndEpicNo( + TopicPartition tp, String upstreamIdAndEpicNo) { + return combineTopicPartition(tp) + "_" + upstreamIdAndEpicNo; + } + + private String combineTopicPartitionAndUpstreamId(TopicPartition tp, String upstreamId) { + return combineTopicPartition(tp) + "_" + 
upstreamId; + } + + private String combineTopicPartition(TopicPartition tp) { + return tp.topic() + "_" + tp.partition(); + } + + public static boolean checkMagicNum(byte[] value) { + checkNotNull(value); + checkArgument(value.length >= 3); + return value[0] == MAGIC_NUMBER[0] + && value[1] == MAGIC_NUMBER[1] + && value[2] == MAGIC_NUMBER[2]; + } + + public static class EpicRetractingInfo implements Serializable { + private static final long serialVersionUID = 1L; + private final long epicNo; + private final String upstreamId; + private final long retractStoppingOffset; + private final long revertStartingOffset; + + public EpicRetractingInfo( + long epicNo, String upstreamId, long retractStoppingOffset, long revertStartingOffset) { + this.epicNo = epicNo; + this.upstreamId = upstreamId; + this.retractStoppingOffset = retractStoppingOffset; + this.revertStartingOffset = revertStartingOffset; + } + + private static EpicRetractingInfo of( + long epicNo, String upstreamId, long retractStopOffset, long revertStartOffset) { + return new EpicRetractingInfo(epicNo, upstreamId, retractStopOffset, revertStartOffset); + } + + public long getEpicNo() { + return epicNo; + } + + public String getUpstreamId() { + return upstreamId; + } + + public long getRetractStoppingOffset() { + return retractStoppingOffset; + } + + public long getRevertStartingOffset() { + return revertStartingOffset; + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplit.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplit.java new file mode 100644 index 0000000000..06a6ddf415 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplit.java @@ -0,0 +1,85 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.source.log.kafka; + +import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; + +import java.util.NavigableMap; + +public class LogKafkaPartitionSplit extends KafkaPartitionSplit { + + /** + * Denote reader is in retracting read mode. In this mode, data would be read in reverse order and + * opposite RowKind. + */ + private final boolean retracting; + /** The offset where job retract stops, i.e. Read reversely ends. */ + private final Long retractStopOffset; + /** + * The offset where job revert to normal read starts from. It should skip the flip which has been + * read. + */ + private final Long revertStartOffset; + /** + * The epic No. which has finished checkpoint. The data whose epic No. larger than it should be + * retracted. + */ + private final Long retractingEpicNo; + /** The upstream JobId which should be retracted. 
*/ + private final String retractingUpstreamId; + /** Key: upstream job id + "_" + epicNo Value: epic start offset */ + private final NavigableMap upStreamEpicStartOffsets; + + public boolean isRetracting() { + return retracting; + } + + public Long getRetractStopOffset() { + return retractStopOffset; + } + + public Long getRevertStartOffset() { + return revertStartOffset; + } + + public NavigableMap getUpStreamEpicStartOffsets() { + return upStreamEpicStartOffsets; + } + + public Long getRetractingEpicNo() { + return retractingEpicNo; + } + + public String getRetractingUpstreamId() { + return retractingUpstreamId; + } + + public LogKafkaPartitionSplit(LogKafkaPartitionSplitState splitState) { + super( + splitState.getTopicPartition(), + splitState.getCurrentOffset(), + splitState.getStoppingOffset().orElse(NO_STOPPING_OFFSET)); + retracting = splitState.isRetracting(); + retractStopOffset = splitState.getRetractStopOffset(); + revertStartOffset = splitState.getRevertStartOffset(); + upStreamEpicStartOffsets = splitState.getUpstreamEpicStartOffsets(); + retractingEpicNo = splitState.getRetractingEpicNo(); + retractingUpstreamId = splitState.getRetractingUpstreamId(); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplitReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplitReader.java new file mode 100644 index 0000000000..f8ba4af61d --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplitReader.java @@ -0,0 +1,443 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.source.log.kafka; + +import static org.apache.amoro.flink.read.source.log.LogSourceHelper.checkMagicNum; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.LOG_CONSUMER_CHANGELOG_MODE_APPEND_ONLY; + +import org.apache.amoro.flink.read.internals.KafkaPartitionSplitReader; +import org.apache.amoro.flink.read.source.log.LogSourceHelper; +import org.apache.amoro.flink.shuffle.LogRecordV1; +import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; +import org.apache.amoro.log.LogData; +import org.apache.amoro.log.LogDataJsonDeserialization; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.kafka.source.metrics.KafkaSourceReaderMetrics; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.Schema; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.ConsumerRecords; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.errors.WakeupException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import 
java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Properties; +import java.util.Set; + +/** + * This reader supports read log data in log-store. If {@link + * MixedFormatValidator#MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE} values true, reader would + * read data consistently with file-store. Some data would be written into log-store repeatedly if + * upstream job failovers several times, so it's necessary to retract these data to guarantee the + * consistency with file-store. + * + *
+ * The data in log-store with Flip like: 1 2 3 4 5   6 7 8 9  Flip  6 7 8 9 10 11 12   13 14
+ *                                       ckp-1     |ckp-2   |     | ckp-2            | ckp-3
+ * The data reads like: 1 2 3 4 5 6 7 8 9 -9 -8 -7 -6 6 7 8 9 10 11 12 13 14
+ *
+ * The implementation of reading consistently lists below:
+ * 1. read data normally {@link #readNormal()}
+ *    - convert data to {@link LogRecordWithRetractInfo} in {@link #convertToLogRecord(ConsumerRecords)}. If it comes to
+ *    a Flip, the remaining records of that poll batch are discarded.
+ *    - save retracting info {@link LogSourceHelper.EpicRetractingInfo} in
+ *    {@link LogSourceHelper#startRetracting(TopicPartition, String, long, long)}.
+ *    - record the epic start offsets
+ *    {@link LogSourceHelper#initialEpicStartOffsetIfEmpty(TopicPartition, String, long, long)} in {@link #convertToLogRecord(ConsumerRecords)}
+ *    - handle normal data like {@link KafkaPartitionSplitReader}
+ * 2. read data reversely {@link #readReversely} if some topic partitions come into Flip,
+ *  i.e. {@link LogSourceHelper#getRetractTopicPartitions()}
+ *    - record the consumer's current positions as the stopping offsets, stoppingOffsetsFromConsumer.
+ *    - reset consumer to the offset: current position - batchSize
+ *    - poll data until stoppingOffsetsFromConsumer {@link #pollToDesignatedPositions}
+ *    - locate the stop offset in the batch data {@link #findIndexOfOffset(List, long)}, and start from it to read
+ *    reversely, stop at {@link LogSourceHelper.EpicRetractingInfo#getRetractStoppingOffset()}
+ *    - suspend retract {@link LogSourceHelper#suspendRetracting(TopicPartition)} when it comes to
+ *    {@link LogSourceHelper.EpicRetractingInfo#getRetractStoppingOffset()}, else repeat {@link #readReversely} in next
+ *    {@link #fetch()}
+ * 3. write offset and retract info into splitState in
+ * {@link LogKafkaPartitionSplitState#updateState(LogRecordWithRetractInfo)}
+ * 4. initialize state from the restored split in {@link LogSourceHelper#initializedState}
+ * 
+ */ +public class LogKafkaPartitionSplitReader extends KafkaPartitionSplitReader { + + private static final Logger LOG = LoggerFactory.getLogger(LogKafkaPartitionSplitReader.class); + + private final LogDataJsonDeserialization logDataJsonDeserialization; + private final LogSourceHelper logReadHelper; + private final boolean logRetractionEnable; + private final boolean logConsumerAppendOnly; + + public LogKafkaPartitionSplitReader( + Properties props, + SourceReaderContext context, + KafkaSourceReaderMetrics kafkaSourceReaderMetrics, + Schema schema, + boolean logRetractionEnable, + LogSourceHelper logReadHelper, + String logConsumerChangelogMode) { + super(props, context, kafkaSourceReaderMetrics); + + this.logDataJsonDeserialization = + new LogDataJsonDeserialization<>( + schema, LogRecordV1.factory, LogRecordV1.arrayFactory, LogRecordV1.mapFactory); + this.logRetractionEnable = logRetractionEnable; + this.logReadHelper = logReadHelper; + this.logConsumerAppendOnly = + LOG_CONSUMER_CHANGELOG_MODE_APPEND_ONLY.equalsIgnoreCase(logConsumerChangelogMode); + } + + public static int RETRACT_SIZE = 500; + public static long RETRACT_FETCH_MAX_ROUND = 5; + + @Override + public RecordsWithSplitIds> fetch() throws IOException { + KafkaPartitionSplitRecords recordsBySplits; + Set retractTps; + if (logRetractionEnable + && !(retractTps = logReadHelper.getRetractTopicPartitions()).isEmpty()) { + recordsBySplits = readReversely(retractTps); + } else { + recordsBySplits = readNormal(); + } + + return recordsBySplits; + } + + private KafkaPartitionSplitRecords readNormal() throws IOException { + ConsumerRecords consumerRecords; + try { + consumerRecords = consumer.poll(Duration.ofMillis(POLL_TIMEOUT)); + } catch (WakeupException | IllegalStateException e) { + // IllegalStateException will be thrown if the consumer is not assigned any partitions. + // This happens if all assigned partitions are invalid or empty (starting offset >= + // stopping offset). 
We just mark empty partitions as finished and return an empty + // record container, and this consumer will be closed by SplitFetcherManager. + KafkaPartitionSplitRecords recordsBySplits = + new KafkaPartitionSplitRecords(ConsumerRecords.empty(), kafkaSourceReaderMetrics); + markEmptySplitsAsFinished(recordsBySplits); + return recordsBySplits; + } + + ConsumerRecords logRecords = convertToLogRecord(consumerRecords); + KafkaPartitionSplitRecords recordsBySplits = + new KafkaPartitionSplitRecords(logRecords, kafkaSourceReaderMetrics); + + List finishedPartitions = new ArrayList<>(); + for (TopicPartition tp : logRecords.partitions()) { + long stoppingOffset = getStoppingOffset(tp); + final List> recordsFromPartition = logRecords.records(tp); + + if (recordsFromPartition.size() > 0) { + final ConsumerRecord lastRecord = + recordsFromPartition.get(recordsFromPartition.size() - 1); + + // After processing a record with offset of "stoppingOffset - 1", the split reader + // should not continue fetching because the record with stoppingOffset may not + // exist. Keep polling will just block forever. + if (lastRecord.offset() >= stoppingOffset - 1) { + recordsBySplits.setPartitionStoppingOffset(tp, stoppingOffset); + finishSplitAtRecord( + tp, stoppingOffset, lastRecord.offset(), finishedPartitions, recordsBySplits); + } + } + // Track this partition's record lag if it never appears before + kafkaSourceReaderMetrics.maybeAddRecordsLagMetric(consumer, tp); + } + + markEmptySplitsAsFinished(recordsBySplits); + + // Unassign the partitions that has finished. 
+ if (!finishedPartitions.isEmpty()) { + finishedPartitions.forEach(kafkaSourceReaderMetrics::removeRecordsLagMetric); + unassignPartitions(finishedPartitions); + } + + // Update numBytesIn + kafkaSourceReaderMetrics.updateNumBytesInCounter(); + + return recordsBySplits; + } + + private ConsumerRecords convertToLogRecord( + ConsumerRecords consumerRecords) throws IOException { + Map>> records = new HashMap<>(); + + for (TopicPartition tp : consumerRecords.partitions()) { + List> rs = consumerRecords.records(tp); + List> recordsForSplit = new ArrayList<>(rs.size()); + records.put(tp, recordsForSplit); + + for (ConsumerRecord consumerRecord : rs) { + byte[] value = consumerRecord.value(); + boolean magicFormat = checkMagicNum(value); + if (!magicFormat) { + throw new UnsupportedOperationException( + "Can't deserialize mixed-format log queue message due to it does not contain magic number."); + } + + LogData logData = logDataJsonDeserialization.deserialize(value); + if (!logData.getFlip() && filterByRowKind(logData.getActualValue())) { + LOG.info( + "filter the rowData, because of logConsumerAppendOnly is true, and rowData={}.", + logData.getActualValue()); + continue; + } + + final long currentOffset = consumerRecord.offset(); + + if (logData.getFlip()) { + if (logRetractionEnable) { + logReadHelper.startRetracting( + tp, logData.getUpstreamId(), logData.getEpicNo(), currentOffset + 1); + break; + } else { + continue; + } + } + + if (logRetractionEnable) { + logReadHelper.initialEpicStartOffsetIfEmpty( + tp, logData.getUpstreamId(), logData.getEpicNo(), currentOffset); + } + recordsForSplit.add(LogRecordWithRetractInfo.of(consumerRecord, logData)); + } + } + return new ConsumerRecords<>(records); + } + + /** read reversely in retracting mode */ + private KafkaPartitionSplitRecords readReversely(Set retractTps) + throws IOException { + Set origin = consumer.assignment(); + consumer.assign(retractTps); + + // stop in current offsets, the msg in the offset would be read 
+ Map stoppingOffsetsFromConsumer = new HashMap<>(); + for (TopicPartition tp : retractTps) { + // the next poll offset + long offset = consumer.position(tp); + stoppingOffsetsFromConsumer.put(tp, Math.max(0, offset - 1)); + long startFrom = Math.max(0, offset - RETRACT_SIZE); + LOG.info("consumer reset offset to: {}", startFrom); + consumer.seek(tp, startFrom); + } + Map>> records = + pollToDesignatedPositions(stoppingOffsetsFromConsumer); + + Map>> logRecords = new HashMap<>(); + + Set finishRetract = new HashSet<>(); + for (Map.Entry>> entry : + records.entrySet()) { + TopicPartition tp = entry.getKey(); + List> consumerRecords = entry.getValue(); + + List> recordsForSplit = + new ArrayList<>(consumerRecords.size()); + logRecords.put(tp, recordsForSplit); + + long stoppingOffsetFromConsumer = stoppingOffsetsFromConsumer.get(tp); + LogSourceHelper.EpicRetractingInfo retractingInfo = logReadHelper.getRetractInfo(tp); + // stoppingOffsetFromConsumer is the offset queried from consumer, it may be larger than flip + // offset because + // kafka poll batch records every time. + // revertStartingOffset is the offset after flip, so it should minus 2 to get the offset + // before flip. 
+ long stoppingOffset = + Math.min(stoppingOffsetFromConsumer, retractingInfo.getRevertStartingOffset() - 2); + int startIndex = findIndexOfOffset(consumerRecords, stoppingOffset); + + for (int i = startIndex; i >= 0; i--) { + ConsumerRecord r = consumerRecords.get(i); + + if (r.offset() < retractingInfo.getRetractStoppingOffset()) { + finishRetract.add(tp); + break; + } + LogData logData = logDataJsonDeserialization.deserialize(r.value()); + + if (!Objects.equals(logData.getUpstreamId(), retractingInfo.getUpstreamId()) + || logData.getEpicNo() <= retractingInfo.getEpicNo()) { + LOG.debug( + "won't retract other job or the success ckp epic data, upstreamId: {}, epicNo: {}", + logData.getUpstreamId(), + logData.getEpicNo()); + } else { + RowData actualValue = logReadHelper.turnRowKind(logData.getActualValue()); + recordsForSplit.add( + LogRecordWithRetractInfo.ofRetract( + r, + retractingInfo.getRetractStoppingOffset(), + retractingInfo.getRevertStartingOffset(), + retractingInfo.getEpicNo(), + logData, + actualValue)); + } + + if (r.offset() == retractingInfo.getRetractStoppingOffset()) { + finishRetract.add(tp); + break; + } + } + } + + suspendRetracting(finishRetract); + consumer.assign(origin); + + return new KafkaPartitionSplitRecords( + new ConsumerRecords<>(logRecords), kafkaSourceReaderMetrics); + } + + private void suspendRetracting(Set finishRetract) { + revertConsumer(finishRetract); + logReadHelper.suspendRetracting(finishRetract); + } + + /** revert consumer to original offset after flip */ + public void revertConsumer(Set finishRetract) { + for (TopicPartition tp : finishRetract) { + LogSourceHelper.EpicRetractingInfo retractingInfo = logReadHelper.getRetractInfo(tp); + long revert = retractingInfo.getRevertStartingOffset(); + consumer.seek(tp, revert); + } + } + + /** + * @param records should be in order of kafka. 
+ * @param offset Kafka offset + * @return the index in records + */ + private int findIndexOfOffset(List> records, long offset) { + int last = records.size() - 1; + int idx = Math.min(RETRACT_SIZE, last); + + long diff = -1; + while (idx >= 0 && idx <= last && (diff = records.get(idx).offset() - offset) != 0) { + if (diff > 0) { + idx--; + } else { + idx++; + } + } + if (diff == 0) { + LOG.debug("start index is: {}", idx); + return idx; + } + LOG.info( + "topic: {}, partition: {}, records' offset range: [{}, {}], need to find: {}", + records.get(0).topic(), + records.get(0).partition(), + records.get(0).offset(), + records.get(last).offset(), + offset); + throw new IllegalStateException("can not find offset in records"); + } + + /** + * @param stoppingOffsets the stopping offset is the position which should be read. + * @return value in map may contain some useless records. It should be filtered. + */ + private Map>> pollToDesignatedPositions( + Map stoppingOffsets) { + ConsumerRecords consumerRecords; + try { + consumerRecords = consumer.poll(Duration.ofMillis(POLL_TIMEOUT)); + } catch (WakeupException we) { + LOG.error("consume reversely error"); + return Collections.EMPTY_MAP; + } + + Map>> recordsForTps = new HashMap<>(); + + int unfinished = stoppingOffsets.size(); + int round = 0; + + Set unfinishedTps = new HashSet<>(); + while (unfinished > 0 && round++ < RETRACT_FETCH_MAX_ROUND) { + unfinishedTps.clear(); + + for (TopicPartition tp : consumerRecords.partitions()) { + recordsForTps.putIfAbsent(tp, new ArrayList<>(RETRACT_SIZE)); + List> records = recordsForTps.get(tp); + + records.addAll(consumerRecords.records(tp)); + + long stoppingOffset = stoppingOffsets.get(tp); + if (records.get(records.size() - 1).offset() >= stoppingOffset) { + unfinished--; + LOG.info( + "reach the stopping offset. stopping offset: {}, tp: {}. 
data size:{}", + stoppingOffset, + tp, + records.size()); + } else { + unfinishedTps.add(tp); + } + } + if (unfinished == 0) { + break; + } + consumer.assign(unfinishedTps); + } + + if (unfinished > 0) { + LOG.error("can not poll msg to designated positions. unfinished: {}", unfinishedTps); + for (TopicPartition tp : unfinishedTps) { + List> records = recordsForTps.get(tp); + LOG.info( + "tp: {}, polled offset:{}, stopping offset: {}", + tp, + records.get(records.size() - 1).offset(), + stoppingOffsets.get(tp)); + } + throw new UnsupportedOperationException("poll msg reversely error"); + } + + return recordsForTps; + } + + /** + * filter the rowData only works during {@link + * MixedFormatValidator#MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE} is false and {@link + * MixedFormatValidator#MIXED_FORMAT_CONSUMER_CHANGELOG_MODE} is {@link + * MixedFormatValidator#LOG_CONSUMER_CHANGELOG_MODE_APPEND_ONLY} and rowData.rowKind != INSERT + * + * @param rowData the judged data + * @return true means should be filtered. + */ + boolean filterByRowKind(RowData rowData) { + return !logRetractionEnable + && logConsumerAppendOnly + && !rowData.getRowKind().equals(RowKind.INSERT); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplitState.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplitState.java new file mode 100644 index 0000000000..2e499d05f9 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplitState.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.source.log.kafka; + +import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; +import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplitState; +import org.apache.flink.table.data.RowData; + +import javax.annotation.Nullable; + +import java.util.NavigableMap; +import java.util.TreeMap; + +public class LogKafkaPartitionSplitState extends KafkaPartitionSplitState { + + /** + * Denote reader is in retracting read mode. In this mode, data would be read in reverse order and + * opposite RowKind. 
+ */ + private boolean retracting; + /** @see LogKafkaPartitionSplit#retractStopOffset */ + @Nullable private Long retractStopOffset; + /** @see LogKafkaPartitionSplit#revertStartOffset */ + @Nullable private Long revertStartOffset; + /** @see LogKafkaPartitionSplit#retractingEpicNo */ + @Nullable private Long retractingEpicNo; + /** @see LogKafkaPartitionSplit#retractingUpstreamId */ + @Nullable private String retractingUpstreamId; + /** Key: upstream job id + "_" + epicNo, Value: epic start offset */ + private final NavigableMap upstreamEpicStartOffsets; + + public LogKafkaPartitionSplitState(KafkaPartitionSplit s) { + super(s); + + if (!(s instanceof LogKafkaPartitionSplit)) { + retracting = false; + upstreamEpicStartOffsets = new TreeMap<>(); + return; + } + LogKafkaPartitionSplit partitionSplit = (LogKafkaPartitionSplit) s; + upstreamEpicStartOffsets = partitionSplit.getUpStreamEpicStartOffsets(); + retracting = partitionSplit.isRetracting(); + revertStartOffset = partitionSplit.getRevertStartOffset(); + retractStopOffset = partitionSplit.getRetractStopOffset(); + retractingEpicNo = partitionSplit.getRetractingEpicNo(); + retractingUpstreamId = partitionSplit.getRetractingUpstreamId(); + } + + public void initEpicStartOffsetIfEmpty(String upstreamId, long epicNo, long offset) { + String key = combineUpstreamIdAndEpicNo(upstreamId, epicNo); + upstreamEpicStartOffsets.putIfAbsent(key, offset); + } + + public void updateState(LogRecordWithRetractInfo record) { + if (record.isRetracting()) { + setCurrentOffset(record.offset() - 1); + revertStartOffset = record.getRevertStartingOffset(); + retractStopOffset = record.getRetractStoppingOffset(); + retractingEpicNo = record.getRetractingEpicNo(); + retractingUpstreamId = record.getLogData().getUpstreamId(); + } else { + setCurrentOffset(record.offset() + 1); + } + initEpicStartOffsetIfEmpty( + record.getLogData().getUpstreamId(), record.getLogData().getEpicNo(), record.offset()); + + // todo: clear useless epic start 
offset in state + retracting = record.isRetracting(); + } + + public boolean isRetracting() { + return retracting; + } + + public Long getRetractStopOffset() { + return retractStopOffset; + } + + public Long getRevertStartOffset() { + return revertStartOffset; + } + + public NavigableMap getUpstreamEpicStartOffsets() { + return upstreamEpicStartOffsets; + } + + public Long getRetractingEpicNo() { + return retractingEpicNo; + } + + public String getRetractingUpstreamId() { + return retractingUpstreamId; + } + + private String combineUpstreamIdAndEpicNo(String upstreamId, long epicNo) { + return upstreamId + "_" + epicNo; + } + + public LogKafkaPartitionSplit toLogKafkaPartitionSplit() { + return new LogKafkaPartitionSplit(this); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaRecordEmitter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaRecordEmitter.java new file mode 100644 index 0000000000..b4b2f9628b --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaRecordEmitter.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.source.log.kafka; + +import org.apache.flink.api.connector.source.SourceOutput; +import org.apache.flink.connector.kafka.source.reader.KafkaRecordEmitter; +import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema; +import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplitState; +import org.apache.flink.table.data.RowData; +import org.apache.kafka.clients.consumer.ConsumerRecord; + +public class LogKafkaRecordEmitter extends KafkaRecordEmitter { + + public LogKafkaRecordEmitter(KafkaRecordDeserializationSchema deserializationSchema) { + super(deserializationSchema); + } + + @Override + public void emitRecord( + ConsumerRecord consumerRecord, + SourceOutput output, + KafkaPartitionSplitState splitState) + throws Exception { + LogRecordWithRetractInfo element = (LogRecordWithRetractInfo) consumerRecord; + output.collect(element.getActualValue(), element.timestamp()); + ((LogKafkaPartitionSplitState) splitState).updateState(element); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSource.java new file mode 100644 index 0000000000..b3854f0743 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSource.java @@ 
-0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.source.log.kafka; + +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_LOG_CONSUMER_CHANGELOG_MODE; + +import org.apache.amoro.flink.read.internals.KafkaSource; +import org.apache.amoro.flink.read.internals.KafkaSourceFetcherManager; +import org.apache.amoro.flink.read.source.log.LogSourceHelper; +import org.apache.amoro.flink.util.CompatibleFlinkPropertyUtil; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.connector.source.Boundedness; +import org.apache.flink.api.connector.source.SourceReader; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.synchronization.FutureCompletingBlockingQueue; +import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer; +import 
org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriber; +import org.apache.flink.connector.kafka.source.metrics.KafkaSourceReaderMetrics; +import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema; +import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.kafka.clients.consumer.ConsumerRecord; + +import javax.annotation.Nullable; + +import java.util.Map; +import java.util.Properties; +import java.util.function.Supplier; + +/** + * The Source implementation of LogKafka. + * + *
{@code
+ * LogKafkaSource source = LogKafkaSource.builder(mixedFormatSchema, configuration)
+ *    .setTopics(Arrays.asList(TOPIC1))
+ *    .setStartingOffsets(OffsetsInitializer.earliest())
+ *    .setProperties(properties)
+ *    .build();
+ * }
+ * + *

See {@link LogKafkaSourceBuilder} for more details. + */ +public class LogKafkaSource extends KafkaSource { + private static final long serialVersionUID = 1L; + + /** read schema, only contains the selected fields */ + private final Schema schema; + + private final boolean logRetractionEnable; + private final String logConsumerChangelogMode; + + LogKafkaSource( + KafkaSubscriber subscriber, + OffsetsInitializer startingOffsetsInitializer, + @Nullable OffsetsInitializer stoppingOffsetsInitializer, + Boundedness boundedness, + KafkaRecordDeserializationSchema deserializationSchema, + Properties props, + Schema schema, + Map tableProperties) { + super( + subscriber, + startingOffsetsInitializer, + stoppingOffsetsInitializer, + boundedness, + deserializationSchema, + props); + this.schema = schema; + logRetractionEnable = + CompatibleFlinkPropertyUtil.propertyAsBoolean( + tableProperties, + MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.key(), + MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.defaultValue()); + logConsumerChangelogMode = + CompatibleFlinkPropertyUtil.propertyAsString( + tableProperties, + MIXED_FORMAT_LOG_CONSUMER_CHANGELOG_MODE.key(), + MIXED_FORMAT_LOG_CONSUMER_CHANGELOG_MODE.defaultValue()); + } + + /** + * Get a logKafkaSourceBuilder to build a {@link LogKafkaSource}. + * + * @return a Log Kafka source builder. + */ + public static LogKafkaSourceBuilder builder(Schema schema, Map tableProperties) { + return new LogKafkaSourceBuilder(schema, tableProperties); + } + + @Override + public SourceReader createReader( + SourceReaderContext readerContext) { + FutureCompletingBlockingQueue>> + elementsQueue = new FutureCompletingBlockingQueue<>(); + LogSourceHelper logReadHelper = logRetractionEnable ? 
new LogSourceHelper() : null; + + final KafkaSourceReaderMetrics kafkaSourceReaderMetrics = + new KafkaSourceReaderMetrics(readerContext.metricGroup()); + Supplier splitReaderSupplier = + () -> + new LogKafkaPartitionSplitReader( + props, + readerContext, + kafkaSourceReaderMetrics, + schema, + logRetractionEnable, + logReadHelper, + logConsumerChangelogMode); + LogKafkaRecordEmitter recordEmitter = new LogKafkaRecordEmitter(null); + + return new LogKafkaSourceReader<>( + elementsQueue, + new KafkaSourceFetcherManager( + elementsQueue, + splitReaderSupplier::get, + (ignore) -> {}, + readerContext.getConfiguration()), + recordEmitter, + toConfiguration(props), + readerContext, + kafkaSourceReaderMetrics, + logReadHelper); + } + + @Override + public TypeInformation getProducedType() { + RowType rowType = FlinkSchemaUtil.convert(schema); + return InternalTypeInfo.of(rowType); + } + + // ----------- private helper methods --------------- + + private Configuration toConfiguration(Properties props) { + Configuration config = new Configuration(); + props.stringPropertyNames().forEach(key -> config.setString(key, props.getProperty(key))); + return config; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSourceBuilder.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSourceBuilder.java new file mode 100644 index 0000000000..2956965ea4 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSourceBuilder.java @@ -0,0 +1,578 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.source.log.kafka; + +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE_EARLIEST; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE_GROUP_OFFSETS; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE_LATEST; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE_SPECIFIC_OFFSETS; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE_TIMESTAMP; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_SPECIFIC_OFFSETS; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_TIMESTAMP_MILLIS; +import static org.apache.amoro.flink.util.CompatibleFlinkPropertyUtil.fetchLogstorePrefixProperties; +import static org.apache.amoro.flink.util.CompatibleFlinkPropertyUtil.getLogTopic; +import static org.apache.amoro.table.TableProperties.LOG_STORE_ADDRESS; +import static org.apache.amoro.table.TableProperties.LOG_STORE_MESSAGE_TOPIC; +import static org.apache.flink.util.Preconditions.checkNotNull; +import static org.apache.kafka.clients.CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG; + +import 
org.apache.amoro.flink.table.descriptors.MixedFormatValidator; +import org.apache.amoro.table.TableProperties; +import org.apache.amoro.utils.CompatiblePropertyUtil; +import org.apache.flink.api.connector.source.Boundedness; +import org.apache.flink.connector.kafka.source.KafkaSource; +import org.apache.flink.connector.kafka.source.KafkaSourceBuilder; +import org.apache.flink.connector.kafka.source.KafkaSourceOptions; +import org.apache.flink.connector.kafka.source.enumerator.initializer.NoStoppingOffsetsInitializer; +import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer; +import org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriber; +import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema; +import org.apache.flink.table.api.ValidationException; +import org.apache.flink.table.data.RowData; +import org.apache.flink.util.Preconditions; +import org.apache.iceberg.Schema; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.serialization.ByteArrayDeserializer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Random; +import java.util.Set; +import java.util.regex.Pattern; + +/** + * The @builder class for {@link LogKafkaSource} to make it easier for the users to construct a + * {@link LogKafkaSource}. + * + *

{@code
+ * LogKafkaSource source = LogKafkaSource.builder(mixedFormatSchema, configuration)
+ *    .setTopics(Arrays.asList(TOPIC1))
+ *    .setStartingOffsets(OffsetsInitializer.earliest())
+ *    .setProperties(properties)
+ *    .build();
+ * }
+ */ +public class LogKafkaSourceBuilder { + private static final Logger LOG = LoggerFactory.getLogger(KafkaSourceBuilder.class); + private static final String[] REQUIRED_CONFIGS = { + ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, ConsumerConfig.GROUP_ID_CONFIG + }; + private static final String PARTITION = "partition"; + private static final String OFFSET = "offset"; + // The subscriber specifies the partitions to subscribe to. + private KafkaSubscriber subscriber; + // Users can specify the starting / stopping offset initializer. + private OffsetsInitializer startingOffsetsInitializer; + private OffsetsInitializer stoppingOffsetsInitializer; + // Boundedness + private Boundedness boundedness; + private KafkaRecordDeserializationSchema deserializationSchema; + // The configurations. + protected Properties kafkaProperties; + + private final Schema schema; + private final Map tableProperties; + + /** + * @param schema read schema, only contains the selected fields + * @param tableProperties mixed-format table properties, maybe include Flink SQL hints. + */ + LogKafkaSourceBuilder(Schema schema, Map tableProperties) { + this.subscriber = null; + this.startingOffsetsInitializer = OffsetsInitializer.earliest(); + this.stoppingOffsetsInitializer = new NoStoppingOffsetsInitializer(); + this.boundedness = Boundedness.CONTINUOUS_UNBOUNDED; + this.deserializationSchema = null; + this.kafkaProperties = fetchLogstorePrefixProperties(tableProperties); + this.schema = schema; + this.tableProperties = tableProperties; + setupKafkaProperties(); + } + + /** + * Sets the bootstrap servers for the KafkaConsumer of the LogKafkaSource. + * + * @param bootstrapServers the bootstrap servers of the Kafka cluster. + * @return this LogKafkaSourceBuilder. + */ + public LogKafkaSourceBuilder setBootstrapServers(String bootstrapServers) { + return setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); + } + + /** + * Sets the consumer group id of the LogKafkaSource. 
+ * + * @param groupId the group id of the LogKafkaSource. + * @return this LogKafkaSourceBuilder. + */ + public LogKafkaSourceBuilder setGroupId(String groupId) { + return setProperty(ConsumerConfig.GROUP_ID_CONFIG, groupId); + } + + /** + * Set a list of topics the LogKafkaSource should consume from. All the topics in the list should + * have existed in the Kafka cluster. Otherwise an exception will be thrown. To allow some of the + * topics to be created lazily, please use {@link #setTopicPattern(Pattern)} instead. + */ + public LogKafkaSourceBuilder setTopics(List topics) { + ensureSubscriberIsNull("topics"); + subscriber = KafkaSubscriber.getTopicListSubscriber(topics); + return this; + } + + /** + * Set a list of topics the LogKafkaSource should consume from. All the topics in the list should + * have existed in the Kafka cluster. Otherwise an exception will be thrown. To allow some of the + * topics to be created lazily, please use {@link #setTopicPattern(Pattern)} instead. + * + * @param topics the list of topics to consume from. + * @return this LogKafkaSourceBuilder. + */ + public LogKafkaSourceBuilder setTopics(String... topics) { + return setTopics(Arrays.asList(topics)); + } + + /** + * Set a topic pattern to consume from use the java {@link Pattern}. + * + * @param topicPattern the pattern of the topic name to consume from. + * @return this LogKafkaSourceBuilder. + */ + public LogKafkaSourceBuilder setTopicPattern(Pattern topicPattern) { + ensureSubscriberIsNull("topic pattern"); + subscriber = KafkaSubscriber.getTopicPatternSubscriber(topicPattern); + return this; + } + + /** + * Set a set of partitions to consume from. + * + * @param partitions the set of partitions to consume from. + * @return this LogKafkaSourceBuilder. 
+ */ + public LogKafkaSourceBuilder setPartitions(Set partitions) { + ensureSubscriberIsNull("partitions"); + subscriber = KafkaSubscriber.getPartitionSetSubscriber(partitions); + return this; + } + + /** + * Specify from which offsets the LogKafkaSource should start consume from by providing an {@link + * OffsetsInitializer}. + * + *

The following {@link OffsetsInitializer}s are commonly used and provided out of the box. + * Users can also implement their own {@link OffsetsInitializer} for custom behaviors. + * + *

    + *
  • {@link OffsetsInitializer#earliest()} - starting from the earliest offsets. This is also + * the default {@link OffsetsInitializer} of the KafkaSource for starting offsets. + *
  • {@link OffsetsInitializer#latest()} - starting from the latest offsets. + *
  • {@link OffsetsInitializer#committedOffsets()} - starting from the committed offsets of + * the consumer group. + *
  • {@link + * OffsetsInitializer#committedOffsets(org.apache.kafka.clients.consumer.OffsetResetStrategy)} + * - starting from the committed offsets of the consumer group. If there is no committed + * offsets, starting from the offsets specified by the {@link + * org.apache.kafka.clients.consumer.OffsetResetStrategy OffsetResetStrategy}. + *
  • {@link OffsetsInitializer#offsets(Map)} - starting from the specified offsets for each + * partition. + *
  • {@link OffsetsInitializer#timestamp(long)} - starting from the specified timestamp for + * each partition. Note that the guarantee here is that all the records in Kafka whose + * {@link org.apache.kafka.clients.consumer.ConsumerRecord#timestamp()} is greater than the + * given starting timestamp will be consumed. However, it is possible that some consumer + * records whose timestamp is smaller than the given starting timestamp are also consumed. + *
+ * + * @param startingOffsetsInitializer the {@link OffsetsInitializer} setting the starting offsets + * for the Source. + * @return this LogKafkaSourceBuilder. + */ + public LogKafkaSourceBuilder setStartingOffsets(OffsetsInitializer startingOffsetsInitializer) { + this.startingOffsetsInitializer = startingOffsetsInitializer; + LOG.info("Setting LogKafkaSource starting offset: {}", startingOffsetsInitializer); + return this; + } + + /** + * By default the LogKafkaSource is set to run in {@link Boundedness#CONTINUOUS_UNBOUNDED} manner + * and thus never stops until the Flink job fails or is canceled. To let the KafkaSource run as a + * streaming source but still stops at some point, one can set an {@link OffsetsInitializer} to + * specify the stopping offsets for each partition. When all the partitions have reached their + * stopping offsets, the KafkaSource will then exit. + * + *

This method is different from {@link #setBounded(OffsetsInitializer)} that after setting the + * stopping offsets with this method, {@link KafkaSource#getBoundedness()} will still return + * {@link Boundedness#CONTINUOUS_UNBOUNDED} even though it will stop at the stopping offsets + * specified by the stopping offsets {@link OffsetsInitializer}. + * + *

The following {@link OffsetsInitializer} are commonly used and provided out of the box. + * Users can also implement their own {@link OffsetsInitializer} for custom behaviors. + * + *

    + *
  • {@link OffsetsInitializer#latest()} - stop at the latest offsets of the partitions when + * the KafkaSource starts to run. + *
  • {@link OffsetsInitializer#committedOffsets()} - stops at the committed offsets of the + * consumer group. + *
  • {@link OffsetsInitializer#offsets(Map)} - stops at the specified offsets for each + * partition. + *
  • {@link OffsetsInitializer#timestamp(long)} - stops at the specified timestamp for each + * partition. The guarantee of setting the stopping timestamp is that no Kafka records whose + * {@link org.apache.kafka.clients.consumer.ConsumerRecord#timestamp()} is greater than the + * given stopping timestamp will be consumed. However, it is possible that some records + * whose timestamp is smaller than the specified stopping timestamp are not consumed. + *
+ * + * @param stoppingOffsetsInitializer The {@link OffsetsInitializer} to specify the stopping + * offset. + * @return this LogKafkaSourceBuilder. + * @see #setBounded(OffsetsInitializer) + */ + public LogKafkaSourceBuilder setUnbounded(OffsetsInitializer stoppingOffsetsInitializer) { + this.boundedness = Boundedness.CONTINUOUS_UNBOUNDED; + this.stoppingOffsetsInitializer = stoppingOffsetsInitializer; + return this; + } + + /** + * By default the LogKafkaSource is set to run in {@link Boundedness#CONTINUOUS_UNBOUNDED} manner + * and thus never stops until the Flink job fails or is canceled. To let the KafkaSource run in + * {@link Boundedness#BOUNDED} manner and stops at some point, one can set an {@link + * OffsetsInitializer} to specify the stopping offsets for each partition. When all the partitions + * have reached their stopping offsets, the KafkaSource will then exit. + * + *

This method is different from {@link #setUnbounded(OffsetsInitializer)} that after setting + * the stopping offsets with this method, {@link KafkaSource#getBoundedness()} will return {@link + * Boundedness#BOUNDED} instead of {@link Boundedness#CONTINUOUS_UNBOUNDED}. + * + *

The following {@link OffsetsInitializer} are commonly used and provided out of the box. + * Users can also implement their own {@link OffsetsInitializer} for custom behaviors. + * + *

    + *
  • {@link OffsetsInitializer#latest()} - stop at the latest offsets of the partitions when + * the KafkaSource starts to run. + *
  • {@link OffsetsInitializer#committedOffsets()} - stops at the committed offsets of the + * consumer group. + *
  • {@link OffsetsInitializer#offsets(Map)} - stops at the specified offsets for each + * partition. + *
  • {@link OffsetsInitializer#timestamp(long)} - stops at the specified timestamp for each + * partition. The guarantee of setting the stopping timestamp is that no Kafka records whose + * {@link org.apache.kafka.clients.consumer.ConsumerRecord#timestamp()} is greater than the + * given stopping timestamp will be consumed. However, it is possible that some records + * whose timestamp is smaller than the specified stopping timestamp are not consumed. + *
+ * + * @param stoppingOffsetsInitializer the {@link OffsetsInitializer} to specify the stopping + * offsets. + * @return this LogKafkaSourceBuilder. + * @see #setUnbounded(OffsetsInitializer) + */ + public LogKafkaSourceBuilder setBounded(OffsetsInitializer stoppingOffsetsInitializer) { + this.boundedness = Boundedness.BOUNDED; + this.stoppingOffsetsInitializer = stoppingOffsetsInitializer; + return this; + } + + /** + * Sets the {@link KafkaRecordDeserializationSchema deserializer} of the {@link + * org.apache.kafka.clients.consumer.ConsumerRecord ConsumerRecord} for LogKafkaSource. + * + * @param recordDeserializer the deserializer for Kafka {@link + * org.apache.kafka.clients.consumer.ConsumerRecord ConsumerRecord}. + * @return this LogKafkaSourceBuilder. + */ + public LogKafkaSourceBuilder setDeserializer( + KafkaRecordDeserializationSchema recordDeserializer) { + this.deserializationSchema = recordDeserializer; + return this; + } + + /** + * Sets the client id prefix of this LogKafkaSource. + * + * @param prefix the client id prefix to use for this LogKafkaSource. + * @return this LogKafkaSourceBuilder. + */ + public LogKafkaSourceBuilder setClientIdPrefix(String prefix) { + return setProperty(KafkaSourceOptions.CLIENT_ID_PREFIX.key(), prefix); + } + + /** + * Set an arbitrary property for the LogKafkaSource and LogKafkaConsumer. The valid keys can be + * found in {@link ConsumerConfig} and {@link KafkaSourceOptions}. + * + *

Note that the following keys will be overridden by the builder when the KafkaSource is + * created. + * + *

    + *
  • key.deserializer is always set to {@link ByteArrayDeserializer}. + *
  • value.deserializer is always set to {@link ByteArrayDeserializer}. + *
  • auto.offset.reset.strategy is overridden by {@link + * OffsetsInitializer#getAutoOffsetResetStrategy()} for the starting offsets, which is by + * default {@link OffsetsInitializer#earliest()}. + *
  • partition.discovery.interval.ms is overridden to -1 when {@link + * #setBounded(OffsetsInitializer)} has been invoked. + *
+ * + * @param key the key of the property. + * @param value the value of the property. + * @return this LogKafkaSourceBuilder. + */ + public LogKafkaSourceBuilder setProperty(String key, String value) { + kafkaProperties.setProperty(key, value); + return this; + } + + /** + * Set arbitrary properties for the LogKafkaSource and LogKafkaConsumer. The valid keys can be + * found in {@link ConsumerConfig} and {@link KafkaSourceOptions}. + * + *

Note that the following keys will be overridden by the builder when the KafkaSource is + * created. + * + *

    + *
  • key.deserializer is always set to {@link ByteArrayDeserializer}. + *
  • value.deserializer is always set to {@link ByteArrayDeserializer}. + *
  • auto.offset.reset.strategy is overridden by {@link + * OffsetsInitializer#getAutoOffsetResetStrategy()} for the starting offsets, which is by + * default {@link OffsetsInitializer#earliest()}. + *
  • partition.discovery.interval.ms is overridden to -1 when {@link + * #setBounded(OffsetsInitializer)} has been invoked. + *
  • client.id is overridden to the "client.id.prefix-RANDOM_LONG", or + * "group.id-RANDOM_LONG" if the client id prefix is not set. + *
+ * + * @param props the properties to set for the LogKafkaSource. + * @return this LogKafkaSourceBuilder. + */ + public LogKafkaSourceBuilder setProperties(Properties props) { + this.kafkaProperties.putAll(props); + return this; + } + + /** + * Build the {@link LogKafkaSource}. + * + * @return a LogKafkaSource with the settings made for this builder. + */ + public LogKafkaSource build() { + sanityCheck(); + parseAndSetRequiredProperties(); + return new LogKafkaSource( + subscriber, + startingOffsetsInitializer, + stoppingOffsetsInitializer, + boundedness, + deserializationSchema, + kafkaProperties, + schema, + tableProperties); + } + + private void setupKafkaProperties() { + if (tableProperties.containsKey(TableProperties.LOG_STORE_ADDRESS)) { + kafkaProperties.put( + BOOTSTRAP_SERVERS_CONFIG, tableProperties.get(TableProperties.LOG_STORE_ADDRESS)); + } + if (tableProperties.containsKey(TableProperties.LOG_STORE_MESSAGE_TOPIC)) { + setTopics(getLogTopic(tableProperties)); + } + + kafkaProperties.putIfAbsent( + "properties.key.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer"); + kafkaProperties.putIfAbsent( + "properties.value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer"); + kafkaProperties.putIfAbsent( + "properties.key.deserializer", + "org.apache.kafka.common.serialization.ByteArrayDeserializer"); + kafkaProperties.putIfAbsent( + "properties.value.deserializer", + "org.apache.kafka.common.serialization.ByteArrayDeserializer"); + + setupStartupMode(); + } + + private void setupStartupMode() { + String startupMode = + CompatiblePropertyUtil.propertyAsString( + tableProperties, SCAN_STARTUP_MODE.key(), SCAN_STARTUP_MODE.defaultValue()) + .toLowerCase(); + + switch (startupMode) { + case SCAN_STARTUP_MODE_EARLIEST: + setStartingOffsets(OffsetsInitializer.earliest()); + break; + case SCAN_STARTUP_MODE_LATEST: + setStartingOffsets(OffsetsInitializer.latest()); + break; + case SCAN_STARTUP_MODE_TIMESTAMP: + long 
startupTimestampMillis = + Long.parseLong( + Preconditions.checkNotNull( + tableProperties.get(SCAN_STARTUP_TIMESTAMP_MILLIS.key()), + String.format( + "'%s' should be set in '%s' mode", + SCAN_STARTUP_TIMESTAMP_MILLIS.key(), SCAN_STARTUP_MODE_TIMESTAMP))); + setStartingOffsets(OffsetsInitializer.timestamp(startupTimestampMillis)); + break; + case SCAN_STARTUP_MODE_GROUP_OFFSETS: + setStartingOffsets(OffsetsInitializer.committedOffsets()); + break; + case SCAN_STARTUP_MODE_SPECIFIC_OFFSETS: + Map specificOffsets = new HashMap<>(); + String specificOffsetsStrOpt = + Preconditions.checkNotNull( + tableProperties.get(SCAN_STARTUP_SPECIFIC_OFFSETS.key()), + String.format( + "'%s' should be set in '%s' mode", + SCAN_STARTUP_SPECIFIC_OFFSETS.key(), SCAN_STARTUP_MODE_SPECIFIC_OFFSETS)); + final Map offsetMap = + parseSpecificOffsets(specificOffsetsStrOpt, SCAN_STARTUP_SPECIFIC_OFFSETS.key()); + offsetMap.forEach( + (partition, offset) -> { + final TopicPartition topicPartition = + new TopicPartition(getLogTopic(tableProperties).get(0), partition); + specificOffsets.put(topicPartition, offset); + }); + setStartingOffsets(OffsetsInitializer.offsets(specificOffsets)); + break; + default: + throw new ValidationException( + String.format( + "%s only support '%s', '%s', '%s', '%s', '%s'. 
But input is '%s'", + MixedFormatValidator.SCAN_STARTUP_MODE, + SCAN_STARTUP_MODE_LATEST, + SCAN_STARTUP_MODE_EARLIEST, + SCAN_STARTUP_MODE_TIMESTAMP, + SCAN_STARTUP_MODE_GROUP_OFFSETS, + SCAN_STARTUP_MODE_SPECIFIC_OFFSETS, + startupMode)); + } + } + + // ------------- private helpers -------------- + + private void ensureSubscriberIsNull(String attemptingSubscribeMode) { + if (subscriber != null) { + throw new IllegalStateException( + String.format( + "Cannot use %s for consumption because a %s is already set for consumption.", + attemptingSubscribeMode, subscriber.getClass().getSimpleName())); + } + } + + private void parseAndSetRequiredProperties() { + maybeOverride( + ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName(), true); + maybeOverride( + ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, + ByteArrayDeserializer.class.getName(), + true); + maybeOverride(ConsumerConfig.GROUP_ID_CONFIG, "KafkaSource-" + new Random().nextLong(), false); + maybeOverride(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false", false); + maybeOverride( + ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, + startingOffsetsInitializer.getAutoOffsetResetStrategy().name().toLowerCase(), + true); + + // If the source is bounded, do not run periodic partition discovery. + maybeOverride( + KafkaSourceOptions.PARTITION_DISCOVERY_INTERVAL_MS.key(), + "-1", + boundedness == Boundedness.BOUNDED); + + // If the client id prefix is not set, reuse the consumer group id as the client id prefix. 
+ maybeOverride( + KafkaSourceOptions.CLIENT_ID_PREFIX.key(), + kafkaProperties.getProperty(ConsumerConfig.GROUP_ID_CONFIG), + false); + } + + private boolean maybeOverride(String key, String value, boolean override) { + boolean overridden = false; + String userValue = kafkaProperties.getProperty(key); + if (userValue != null) { + if (override) { + LOG.warn( + String.format( + "Property %s is provided but will be overridden from %s to %s", + key, userValue, value)); + kafkaProperties.setProperty(key, value); + overridden = true; + } + } else { + kafkaProperties.setProperty(key, value); + } + return overridden; + } + + private void sanityCheck() { + // Check required configs. + checkNotNull( + kafkaProperties.getProperty(BOOTSTRAP_SERVERS_CONFIG), + String.format("Property %s is required but not provided", LOG_STORE_ADDRESS)); + // Check required settings. + checkNotNull( + subscriber, + String.format("No topic is specified, '%s' should be set.", LOG_STORE_MESSAGE_TOPIC)); + } + + public static Map parseSpecificOffsets( + String specificOffsetsStr, String optionKey) { + final Map offsetMap = new HashMap<>(); + final String[] pairs = specificOffsetsStr.split(";"); + final String validationExceptionMessage = + String.format( + "Invalid properties '%s' should follow the format " + + "'partition:0,offset:42;partition:1,offset:300', but is '%s'.", + optionKey, specificOffsetsStr); + + if (pairs.length == 0) { + throw new ValidationException(validationExceptionMessage); + } + + for (String pair : pairs) { + if (null == pair || pair.length() == 0 || !pair.contains(",")) { + throw new ValidationException(validationExceptionMessage); + } + + final String[] kv = pair.split(","); + if (kv.length != 2 || !kv[0].startsWith(PARTITION + ':') || !kv[1].startsWith(OFFSET + ':')) { + throw new ValidationException(validationExceptionMessage); + } + + String partitionValue = kv[0].substring(kv[0].indexOf(":") + 1); + String offsetValue = kv[1].substring(kv[1].indexOf(":") + 1); + try { 
+ final Integer partition = Integer.valueOf(partitionValue); + final Long offset = Long.valueOf(offsetValue); + offsetMap.put(partition, offset); + } catch (NumberFormatException e) { + throw new ValidationException(validationExceptionMessage, e); + } + } + return offsetMap; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSourceReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSourceReader.java new file mode 100644 index 0000000000..f49250b8ff --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSourceReader.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read.source.log.kafka; + +import org.apache.amoro.flink.read.internals.KafkaSourceFetcherManager; +import org.apache.amoro.flink.read.internals.KafkaSourceReader; +import org.apache.amoro.flink.read.source.log.LogSourceHelper; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.connector.base.source.reader.RecordEmitter; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.synchronization.FutureCompletingBlockingQueue; +import org.apache.flink.connector.kafka.source.metrics.KafkaSourceReaderMetrics; +import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; +import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplitState; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +/** The source reader for Kafka partitions. 
/**
 * The source reader for Kafka partitions.
 *
 * <p>Delegates all fetching and record emission to {@link KafkaSourceReader}; it only customizes
 * how per-split state is created and converted back to splits, so that log-store retraction
 * bookkeeping ({@link LogKafkaPartitionSplitState}) survives checkpoints.
 */
public class LogKafkaSourceReader<T> extends KafkaSourceReader<T> {

  private static final Logger LOG = LoggerFactory.getLogger(LogKafkaSourceReader.class);

  /** Helper tracking retraction progress per split; {@code null} when retraction is disabled. */
  @Nullable private final LogSourceHelper logReadHelper;

  public LogKafkaSourceReader(
      FutureCompletingBlockingQueue<RecordsWithSplitIds<ConsumerRecord<byte[], byte[]>>>
          elementsQueue,
      KafkaSourceFetcherManager kafkaSourceFetcherManager,
      RecordEmitter<ConsumerRecord<byte[], byte[]>, T, KafkaPartitionSplitState> recordEmitter,
      Configuration config,
      SourceReaderContext context,
      KafkaSourceReaderMetrics kafkaSourceReaderMetrics,
      @Nullable LogSourceHelper logReadHelper) {
    super(
        elementsQueue,
        kafkaSourceFetcherManager,
        recordEmitter,
        config,
        context,
        kafkaSourceReaderMetrics);

    this.logReadHelper = logReadHelper;
  }

  @Override
  protected KafkaPartitionSplitState initializedState(KafkaPartitionSplit split) {
    // Let the helper seed its retraction bookkeeping for this split before state is created.
    if (logReadHelper != null) {
      logReadHelper.initializedState(split);
    }
    return new LogKafkaPartitionSplitState(split);
  }

  @Override
  protected KafkaPartitionSplit toSplitType(String splitId, KafkaPartitionSplitState splitState) {
    // State for this reader is always created as LogKafkaPartitionSplitState in
    // initializedState above, so the downcast restores the retraction offsets on snapshot.
    return ((LogKafkaPartitionSplitState) splitState).toLogKafkaPartitionSplit();
  }
}
/**
 * A {@link ConsumerRecord} enriched with log-store retraction metadata.
 *
 * <p>Carries the deserialized {@link LogData} alongside the raw Kafka record, plus the offsets
 * that describe an in-progress retraction pass (reading a span of the topic backwards with
 * opposite RowKind to undo previously emitted rows).
 *
 * @param <T> the type of the actual value carried by the log data
 */
public class LogRecordWithRetractInfo<T> extends ConsumerRecord<byte[], byte[]> {

  /**
   * Denote reader is in retracting read mode. In this mode, data would be read in reverse order and
   * opposite RowKind.
   */
  private final boolean retracting;
  /** @see LogKafkaPartitionSplit#retractStopOffset */
  private final Long retractStoppingOffset;
  /** @see LogKafkaPartitionSplit#revertStartOffset */
  private final Long revertStartingOffset;
  /** @see LogKafkaPartitionSplit#retractingEpicNo */
  private final Long retractingEpicNo;

  // Deserialized payload of this record.
  private final LogData<T> logData;
  // The value to emit downstream; for retracting reads this may differ from logData's value.
  private final T actualValue;

  public LogRecordWithRetractInfo(
      ConsumerRecord<byte[], byte[]> consumerRecord,
      boolean retracting,
      Long retractStoppingOffset,
      Long revertStartingOffset,
      Long retractingEpicNo,
      LogData<T> logData,
      T actualValue) {
    // Copy every field of the wrapped record so this behaves as a drop-in ConsumerRecord.
    super(
        consumerRecord.topic(),
        consumerRecord.partition(),
        consumerRecord.offset(),
        consumerRecord.timestamp(),
        consumerRecord.timestampType(),
        consumerRecord.serializedKeySize(),
        consumerRecord.serializedValueSize(),
        consumerRecord.key(),
        consumerRecord.value(),
        consumerRecord.headers(),
        consumerRecord.leaderEpoch());
    this.retracting = retracting;
    this.retractStoppingOffset = retractStoppingOffset;
    this.revertStartingOffset = revertStartingOffset;
    this.retractingEpicNo = retractingEpicNo;
    this.logData = logData;
    this.actualValue = actualValue;
  }

  /** Factory for a record observed while a retraction pass is active. */
  public static <T> LogRecordWithRetractInfo<T> ofRetract(
      ConsumerRecord<byte[], byte[]> consumerRecord,
      Long retractStoppingOffset,
      Long revertStartingOffset,
      Long retractingEpicNo,
      LogData<T> logData,
      T actualValue) {
    return new LogRecordWithRetractInfo<>(
        consumerRecord,
        true,
        retractStoppingOffset,
        revertStartingOffset,
        retractingEpicNo,
        logData,
        actualValue);
  }

  /** Factory for a normal (non-retracting) record; offsets are left null. */
  public static <T> LogRecordWithRetractInfo<T> of(
      ConsumerRecord<byte[], byte[]> consumerRecord, LogData<T> logData) {
    return new LogRecordWithRetractInfo<>(
        consumerRecord, false, null, null, null, logData, logData.getActualValue());
  }

  public boolean isRetracting() {
    return retracting;
  }

  public Long getRetractStoppingOffset() {
    return retractStoppingOffset;
  }

  public Long getRevertStartingOffset() {
    return revertStartingOffset;
  }

  public LogData<T> getLogData() {
    return logData;
  }

  public Long getRetractingEpicNo() {
    return retractingEpicNo;
  }

  public T getActualValue() {
    return actualValue;
  }
}
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.shuffle; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; +import org.apache.amoro.table.DistributionHashMode; +import org.apache.flink.api.common.functions.Partitioner; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.table.data.RowData; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Objects; +import java.util.Random; + +/** Shuffle RowData with same key to same subtask, to make sure cdc data with same key in order. 
*/ +public class ReadShuffleRulePolicy implements ShuffleRulePolicy { + private static final Logger LOG = LoggerFactory.getLogger(ReadShuffleRulePolicy.class); + + private final ShuffleHelper helper; + + private final DistributionHashMode distributionHashMode; + + public ReadShuffleRulePolicy(ShuffleHelper helper) { + this( + helper, + DistributionHashMode.autoSelect(helper.isPrimaryKeyExist(), helper.isPartitionKeyExist())); + } + + public ReadShuffleRulePolicy(ShuffleHelper helper, DistributionHashMode distributionHashMode) { + this.helper = helper; + this.distributionHashMode = distributionHashMode; + Preconditions.checkArgument(distributionHashMode != DistributionHashMode.AUTO); + } + + @Override + public KeySelector generateKeySelector() { + return new PrimaryKeySelector(); + } + + @Override + public Partitioner generatePartitioner() { + return new RoundRobinPartitioner(distributionHashMode, helper); + } + + @Override + public DistributionHashMode getPolicyType() { + return distributionHashMode; + } + + /** return ShuffleKey */ + static class PrimaryKeySelector implements KeySelector { + @Override + public ShuffleKey getKey(RowData value) throws Exception { + return new ShuffleKey(value); + } + } + + /** Circular polling feed a streamRecord into a special factor node */ + static class RoundRobinPartitioner implements Partitioner { + private final ShuffleHelper helper; + private final DistributionHashMode distributionHashMode; + private Random random = null; + + RoundRobinPartitioner(DistributionHashMode distributionHashMode, ShuffleHelper helper) { + this.distributionHashMode = distributionHashMode; + this.helper = helper; + if (!distributionHashMode.isSupportPartition() + && !distributionHashMode.isSupportPrimaryKey()) { + random = new Random(); + } + } + + @Override + public int partition(ShuffleKey key, int numPartitions) { + if (helper != null) { + helper.open(); + } + checkNotNull(key); + RowData row = checkNotNull(key.getRow()); + + Integer pkHashCode = 
null; + if (distributionHashMode.isSupportPrimaryKey()) { + pkHashCode = helper.hashKeyValue(row); + } + // shuffle by mixed-format partition for partitioned table + Integer partitionHashCode = null; + if (distributionHashMode.isSupportPartition()) { + partitionHashCode = helper.hashPartitionValue(row); + } + if (pkHashCode != null && partitionHashCode != null) { + return Math.abs(Objects.hash(pkHashCode, partitionHashCode)) % numPartitions; + } else if (pkHashCode != null) { + return pkHashCode % numPartitions; + } else if (partitionHashCode != null) { + return partitionHashCode % numPartitions; + } else { + return random.nextInt(numPartitions); + } + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/RoundRobinShuffleRulePolicy.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/RoundRobinShuffleRulePolicy.java new file mode 100644 index 0000000000..46a5ca26ef --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/RoundRobinShuffleRulePolicy.java @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.shuffle; + +import static org.apache.flink.util.Preconditions.checkArgument; +import static org.apache.flink.util.Preconditions.checkNotNull; + +import org.apache.amoro.data.DataTreeNode; +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; +import org.apache.amoro.shade.guava32.com.google.common.collect.Sets; +import org.apache.amoro.table.DistributionHashMode; +import org.apache.flink.api.common.functions.Partitioner; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.table.data.RowData; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * After the primary key value hash is modulated based on concurrency, the row is routed to + * different subtask write + * + *

+ */ +public class RoundRobinShuffleRulePolicy implements ShuffleRulePolicy { + private static final Logger LOG = LoggerFactory.getLogger(RoundRobinShuffleRulePolicy.class); + + private final ShuffleHelper helper; + + private final int downStreamOperatorParallelism; + + private final int fileSplit; + + private int factor = -1; + + private Map> subtaskTreeNodes; + + private final DistributionHashMode distributionHashMode; + + public RoundRobinShuffleRulePolicy(int downStreamOperatorParallelism, int fileSplit) { + this(null, downStreamOperatorParallelism, fileSplit); + } + + public RoundRobinShuffleRulePolicy( + ShuffleHelper helper, int downStreamOperatorParallelism, int fileSplit) { + this( + helper, + downStreamOperatorParallelism, + fileSplit, + DistributionHashMode.autoSelect(helper.isPrimaryKeyExist(), helper.isPartitionKeyExist())); + } + + public RoundRobinShuffleRulePolicy( + ShuffleHelper helper, + int downStreamOperatorParallelism, + int fileSplit, + DistributionHashMode distributionHashMode) { + this.helper = helper; + this.downStreamOperatorParallelism = downStreamOperatorParallelism; + this.fileSplit = fileSplit; + this.distributionHashMode = distributionHashMode; + Preconditions.checkArgument(distributionHashMode != DistributionHashMode.NONE); + Preconditions.checkArgument(distributionHashMode != DistributionHashMode.AUTO); + } + + @Override + public KeySelector generateKeySelector() { + return new PrimaryKeySelector(); + } + + @Override + public Partitioner generatePartitioner() { + getSubtaskTreeNodes(); + return new RoundRobinPartitioner( + downStreamOperatorParallelism, factor, distributionHashMode, helper); + } + + @Override + public DistributionHashMode getPolicyType() { + return distributionHashMode; + } + + @Override + public Map> getSubtaskTreeNodes() { + if (this.subtaskTreeNodes == null) { + this.subtaskTreeNodes = initSubtaskFactorMap(this.downStreamOperatorParallelism); + return this.subtaskTreeNodes; + } + return this.subtaskTreeNodes; + 
} + + /** + * get factor sequence and writer subtask id mapping relationship Key:subtask id Value:treeNodes + * + * @return + */ + private Map> initSubtaskFactorMap(final int writerParallelism) { + Map> subtaskTreeNodes = new HashMap<>(writerParallelism); + if (distributionHashMode.isSupportPrimaryKey()) { + factor = fileSplit; + // every writer may accept all node data for partitioned table + if (distributionHashMode.isSupportPartition()) { + IntStream.range(0, writerParallelism) + .forEach( + subtaskId -> { + subtaskTreeNodes.put( + subtaskId, + IntStream.range(0, factor) + .mapToObj(index -> DataTreeNode.of(factor - 1, index)) + .collect(Collectors.toSet())); + }); + } else { + if (factor < writerParallelism) { + int actualDepth = getActualDepth(writerParallelism); + factor = (int) Math.pow(2, actualDepth - 1); + } + final int finalMask = factor - 1; + + IntStream.range(0, factor) + .forEach( + sequence -> { + int subtaskId = getSubtaskId(sequence, writerParallelism); + if (!subtaskTreeNodes.containsKey(subtaskId)) { + Set treeNodes = new HashSet<>(); + treeNodes.add(DataTreeNode.of(finalMask, sequence)); + subtaskTreeNodes.put(subtaskId, treeNodes); + } else { + subtaskTreeNodes.get(subtaskId).add(DataTreeNode.of(finalMask, sequence)); + } + }); + } + } else { + IntStream.range(0, writerParallelism) + .forEach( + subtaskId -> { + subtaskTreeNodes.put(subtaskId, Sets.newHashSet(DataTreeNode.of(0, 0))); + }); + } + subtaskTreeNodes.forEach( + (subtaskId, treeNodes) -> LOG.info("subtaskId={}, treeNodes={}.", subtaskId, treeNodes)); + return subtaskTreeNodes; + } + + private static int getActualDepth(int numPartitions) { + return (int) Math.ceil(Math.log(numPartitions) / Math.log(2)) + 1; + } + + private static int getSubtaskId(int sequence, int parallelism) { + return sequence % parallelism; + } + + /** return ShuffleKey */ + static class PrimaryKeySelector implements KeySelector { + @Override + public ShuffleKey getKey(RowData value) throws Exception { + return 
new ShuffleKey(value); + } + } + + /** Circular polling feed a streamRecord into a special factor node */ + static class RoundRobinPartitioner implements Partitioner { + private final int downStreamOperatorParallelism; + private final int factor; + private final ShuffleHelper helper; + private final DistributionHashMode distributionHashMode; + + RoundRobinPartitioner( + int downStreamOperatorParallelism, + int factor, + DistributionHashMode distributionHashMode, + ShuffleHelper helper) { + this.downStreamOperatorParallelism = downStreamOperatorParallelism; + this.factor = factor; + this.distributionHashMode = distributionHashMode; + this.helper = helper; + } + + @Override + public int partition(ShuffleKey key, int numPartitions) { + if (helper != null) { + helper.open(); + } + checkNotNull(key); + RowData row = checkNotNull(key.getRow()); + + checkArgument( + numPartitions == this.downStreamOperatorParallelism, + String.format( + "shuffle mixed-format record numPartition:%s is diff with writer parallelism:%s.", + numPartitions, this.downStreamOperatorParallelism)); + Integer factorIndex = null; + if (distributionHashMode.isSupportPrimaryKey()) { + long pkHashCode = helper.hashKeyValue(row); + factorIndex = (int) (pkHashCode % this.factor); + } + // shuffle by mixed-format tree node and partition for partitioned table + Integer partitionHashCode = null; + if (distributionHashMode.isSupportPartition()) { + partitionHashCode = helper.hashPartitionValue(row); + } + if (factorIndex != null && partitionHashCode != null) { + return Math.abs(Objects.hash(factorIndex, partitionHashCode)) % numPartitions; + } else if (factorIndex != null) { + return factorIndex % numPartitions; + } else if (partitionHashCode != null) { + return partitionHashCode % numPartitions; + } else { + return 0; + } + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleHelper.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleHelper.java new file mode 100644 index 0000000000..5edcabc131 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleHelper.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.shuffle; + +import static org.apache.iceberg.IcebergSchemaUtil.projectPartition; + +import org.apache.amoro.data.PrimaryKeyData; +import org.apache.amoro.table.KeyedTable; +import org.apache.amoro.table.MixedTable; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.util.CollectionUtil; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.types.Types; + +import java.io.Serializable; + +/** This helper operates to one mixed-format table and the data of the table. 
*/ +public class ShuffleHelper implements Serializable { + private static final long serialVersionUID = 1L; + + private boolean primaryKeyExist = false; + private PrimaryKeyData primaryKeyData; + private PartitionKey partitionKey; + private RowType rowType; + private Types.StructType struct; + private transient RowDataWrapper rowDataWrapper; + + public static ShuffleHelper EMPTY = new ShuffleHelper(); + + public static ShuffleHelper build(MixedTable table, Schema schema, RowType rowType) { + PartitionKey partitionKey = null; + + if (table.spec() != null && !CollectionUtil.isNullOrEmpty(table.spec().fields())) { + partitionKey = new PartitionKey(projectPartition(table.spec(), schema), schema); + } + schema = addFieldsNotInMixedFormat(schema, rowType); + + if (table.isUnkeyedTable()) { + return new ShuffleHelper(rowType, schema.asStruct(), partitionKey); + } + + KeyedTable keyedTable = table.asKeyedTable(); + PrimaryKeyData primaryKeyData = new PrimaryKeyData(keyedTable.primaryKeySpec(), schema); + return new ShuffleHelper( + keyedTable.primaryKeySpec().primaryKeyExisted(), + primaryKeyData, + partitionKey, + rowType, + schema.asStruct()); + } + + /** + * If using mixed-format table as build table, there will be an additional implicit field, valuing + * process time. + * + * @param schema The physical schema in mixed-format table. + * @param rowType Flink RowData type. + * @return the mixed-format Schema with additional implicit field. 
+ */ + public static Schema addFieldsNotInMixedFormat(Schema schema, RowType rowType) { + Types.NestedField[] nestedFields = new Types.NestedField[rowType.getFieldCount()]; + + for (int i = 0; i < nestedFields.length; i++) { + RowType.RowField field = rowType.getFields().get(i); + Types.NestedField nestedField; + if ((nestedField = schema.findField(field.getName())) != null) { + nestedFields[i] = nestedField; + } else { + // for now, there is only one case that virtual watermark exist in RowData, but not in + // mixed-format table schema. + nestedFields[i] = + Types.NestedField.optional(-1, field.getName(), Types.TimestampType.withoutZone()); + } + } + return new Schema(nestedFields); + } + + /** Should open firstly to initial RowDataWrapper, because it cannot be serialized. */ + public void open() { + if (rowDataWrapper != null) { + return; + } + if (rowType != null && struct != null) { + rowDataWrapper = new RowDataWrapper(rowType, struct); + } + } + + public ShuffleHelper() {} + + public ShuffleHelper(RowType rowType, Types.StructType structType, PartitionKey partitionKey) { + this(false, null, partitionKey, rowType, structType); + } + + public ShuffleHelper( + boolean primaryKeyExist, + PrimaryKeyData primaryKeyData, + PartitionKey partitionKey, + RowType rowType, + Types.StructType structType) { + this(primaryKeyExist, primaryKeyData, null, partitionKey, rowType, structType); + } + + public ShuffleHelper( + boolean primaryKeyExist, + PrimaryKeyData primaryKeyData, + RowDataWrapper rowDataWrapper, + PartitionKey partitionKey, + RowType rowType, + Types.StructType structType) { + this.primaryKeyExist = primaryKeyExist; + this.primaryKeyData = primaryKeyData; + this.rowDataWrapper = rowDataWrapper; + this.partitionKey = partitionKey; + this.rowType = rowType; + this.struct = structType; + } + + public boolean isPrimaryKeyExist() { + return primaryKeyExist; + } + + public boolean isPartitionKeyExist() { + return partitionKey != null && partitionKey.size() > 0; + } 
+ + public int hashPartitionValue(RowData rowData) { + partitionKey.partition(rowDataWrapper.wrap(rowData)); + int hashcode = Math.abs(partitionKey.hashCode()); + return hashcode == Integer.MIN_VALUE ? Integer.MAX_VALUE : hashcode; + } + + public int hashKeyValue(RowData rowData) { + primaryKeyData.primaryKey(rowDataWrapper.wrap(rowData)); + return primaryKeyData.hashCode(); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleKey.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleKey.java new file mode 100644 index 0000000000..dd34b40615 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleKey.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.shuffle; + +import org.apache.flink.table.data.RowData; + +public class ShuffleKey { + private final RowData row; + + public ShuffleKey(RowData row) { + this.row = row; + } + + public RowData getRow() { + return row; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleRulePolicy.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleRulePolicy.java new file mode 100644 index 0000000000..4264778ebe --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleRulePolicy.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.amoro.flink.shuffle;
+
+import org.apache.amoro.data.DataTreeNode;
+import org.apache.amoro.table.DistributionHashMode;
+import org.apache.flink.api.common.functions.Partitioner;
+import org.apache.flink.api.java.functions.KeySelector;
+
+import java.io.Serializable;
+import java.util.Map;
+import java.util.Set;
+
+/** Policy deciding how a stream record is shuffled to writer subtasks by its primary keys. */
+public interface ShuffleRulePolicy extends Serializable {
+
+  /**
+   * Generates the key selector used to extract the shuffle key from each record.
+   *
+   * @return the key selector applied to the stream before partitioning
+   */
+  KeySelector generateKeySelector();
+
+  /**
+   * Generates the partitioner that routes each extracted key to a writer subtask.
+   *
+   * @return the custom partitioner implementing this shuffle policy
+   */
+  Partitioner generatePartitioner();
+
+  /**
+   * Get shuffle type.
+   *
+   * @return the {@link DistributionHashMode} describing this policy
+   */
+  DistributionHashMode getPolicyType();
+
+  /**
+   * Gets the mapping from writer subtask id (key) to the set of data tree nodes (value) it owns.
+   *
+   * @return the subtask-to-tree-nodes mapping, or null when the policy has no such mapping
+   */
+  default Map> getSubtaskTreeNodes() {
+    return null;
+  }
+}
diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/FlinkSource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/FlinkSource.java
new file mode 100644
index 0000000000..3e4080e8a8
--- /dev/null
+++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/FlinkSource.java
@@ -0,0 +1,316 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.table; + +import org.apache.amoro.flink.interceptor.ProxyFactory; +import org.apache.amoro.flink.read.MixedFormatSource; +import org.apache.amoro.flink.read.hybrid.reader.RowDataReaderFunction; +import org.apache.amoro.flink.read.source.MixedFormatScanContext; +import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; +import org.apache.amoro.flink.util.CompatibleFlinkPropertyUtil; +import org.apache.amoro.flink.util.IcebergClassUtil; +import org.apache.amoro.flink.util.MixedFormatUtils; +import org.apache.amoro.flink.util.ProxyUtil; +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; +import org.apache.amoro.table.MixedTable; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.io.InputFormat; +import org.apache.flink.api.dag.Transformation; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSource; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.InputFormatSourceFunction; +import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.StreamSource; 
+import org.apache.flink.streaming.api.transformations.LegacySourceTransformation; +import org.apache.flink.streaming.api.transformations.OneInputTransformation; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.connector.ProviderContext; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.source.FlinkInputFormat; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; + +/** An util class create mixed-format source data stream. */ +public class FlinkSource { + private FlinkSource() {} + + public static Builder forRowData() { + return new Builder(); + } + + public static final class Builder { + + private static final Logger LOG = LoggerFactory.getLogger(Builder.class); + private static final String MIXED_FORMAT_FILE_TRANSFORMATION = "mixed-format-file"; + private ProviderContext context; + private StreamExecutionEnvironment env; + private MixedTable mixedTable; + private MixedFormatTableLoader tableLoader; + private TableSchema projectedSchema; + private List filters; + private ReadableConfig flinkConf = new Configuration(); + private final Map properties = new HashMap<>(); + private long limit = -1L; + private WatermarkStrategy watermarkStrategy = WatermarkStrategy.noWatermarks(); + private final MixedFormatScanContext.Builder contextBuilder = + MixedFormatScanContext.contextBuilder(); + private boolean batchMode = false; + + private Builder() {} + + public Builder context(ProviderContext context) { + this.context = context; + return this; + } + + public Builder env(StreamExecutionEnvironment env) { + this.env = env; + 
return this; + } + + public Builder mixedFormatTable(MixedTable mixedTable) { + this.mixedTable = mixedTable; + properties.putAll(mixedTable.properties()); + return this; + } + + public Builder tableLoader(MixedFormatTableLoader tableLoader) { + this.tableLoader = tableLoader; + return this; + } + + public Builder project(TableSchema tableSchema) { + this.projectedSchema = tableSchema; + return this; + } + + public Builder limit(long limit) { + this.limit = limit; + contextBuilder.limit(limit); + return this; + } + + public Builder filters(List filters) { + this.filters = filters; + contextBuilder.filters(filters); + return this; + } + + public Builder flinkConf(ReadableConfig flinkConf) { + this.flinkConf = flinkConf; + return this; + } + + public Builder properties(Map properties) { + this.properties.putAll(properties); + return this; + } + + public Builder watermarkStrategy(WatermarkStrategy watermarkStrategy) { + if (watermarkStrategy != null) { + this.watermarkStrategy = watermarkStrategy; + } + return this; + } + + public Builder batchMode(boolean batchMode) { + this.batchMode = batchMode; + return this; + } + + public DataStream build() { + Preconditions.checkNotNull(env, "StreamExecutionEnvironment should not be null"); + loadTableIfNeeded(); + + if (mixedTable.isUnkeyedTable()) { + String scanStartupMode = properties.get(MixedFormatValidator.SCAN_STARTUP_MODE.key()); + return buildUnkeyedTableSource(scanStartupMode); + } + + boolean dimTable = + CompatibleFlinkPropertyUtil.propertyAsBoolean( + properties, + MixedFormatValidator.DIM_TABLE_ENABLE.key(), + MixedFormatValidator.DIM_TABLE_ENABLE.defaultValue()); + RowType rowType; + + if (projectedSchema == null) { + contextBuilder.project(mixedTable.schema()); + rowType = FlinkSchemaUtil.convert(mixedTable.schema()); + } else { + contextBuilder.project( + FlinkSchemaUtil.convert( + mixedTable.schema(), + org.apache.amoro.flink.FlinkSchemaUtil.filterWatermark(projectedSchema))); + // If dim table is enabled, we 
reserve a RowTime field in Emitter. + if (dimTable) { + rowType = org.apache.amoro.flink.FlinkSchemaUtil.toRowType(projectedSchema); + } else { + rowType = + org.apache.amoro.flink.FlinkSchemaUtil.toRowType( + org.apache.amoro.flink.FlinkSchemaUtil.filterWatermark(projectedSchema)); + } + } + MixedFormatScanContext scanContext = + contextBuilder.fromProperties(properties).batchMode(batchMode).build(); + + RowDataReaderFunction rowDataReaderFunction = + new RowDataReaderFunction( + flinkConf, + mixedTable.schema(), + scanContext.project(), + mixedTable.asKeyedTable().primaryKeySpec(), + scanContext.nameMapping(), + scanContext.caseSensitive(), + mixedTable.io()); + + int scanParallelism = + flinkConf.getOptional(MixedFormatValidator.SCAN_PARALLELISM).orElse(env.getParallelism()); + DataStreamSource sourceStream = + env.fromSource( + new MixedFormatSource<>( + tableLoader, + scanContext, + rowDataReaderFunction, + InternalTypeInfo.of(rowType), + mixedTable.name(), + dimTable), + watermarkStrategy, + MixedFormatSource.class.getName()) + .setParallelism(scanParallelism); + context.generateUid(MIXED_FORMAT_FILE_TRANSFORMATION).ifPresent(sourceStream::uid); + return sourceStream; + } + + private void loadTableIfNeeded() { + if (tableLoader == null || mixedTable != null) { + return; + } + mixedTable = MixedFormatUtils.loadMixedTable(tableLoader); + properties.putAll(mixedTable.properties()); + } + + public DataStream buildUnkeyedTableSource(String scanStartupMode) { + scanStartupMode = scanStartupMode == null ? 
null : scanStartupMode.toLowerCase(); + Preconditions.checkArgument( + Objects.isNull(scanStartupMode) + || Objects.equals(scanStartupMode, MixedFormatValidator.SCAN_STARTUP_MODE_EARLIEST) + || Objects.equals(scanStartupMode, MixedFormatValidator.SCAN_STARTUP_MODE_LATEST), + String.format( + "only support %s, %s when %s is %s", + MixedFormatValidator.SCAN_STARTUP_MODE_EARLIEST, + MixedFormatValidator.SCAN_STARTUP_MODE_LATEST, + MixedFormatValidator.MIXED_FORMAT_READ_MODE, + MixedFormatValidator.MIXED_FORMAT_READ_FILE)); + org.apache.iceberg.flink.source.FlinkSource.Builder builder = + org.apache.iceberg.flink.source.FlinkSource.forRowData() + .env(env) + .project(org.apache.amoro.flink.FlinkSchemaUtil.filterWatermark(projectedSchema)) + .tableLoader(tableLoader) + .filters(filters) + .properties(properties) + .flinkConf(flinkConf) + .limit(limit); + if (MixedFormatValidator.SCAN_STARTUP_MODE_LATEST.equalsIgnoreCase(scanStartupMode)) { + Optional startSnapshotOptional = + Optional.ofNullable(tableLoader.loadTable().currentSnapshot()); + if (startSnapshotOptional.isPresent()) { + Snapshot snapshot = startSnapshotOptional.get(); + LOG.info( + "Get starting snapshot id {} based on scan startup mode {}", + snapshot.snapshotId(), + scanStartupMode); + builder.startSnapshotId(snapshot.snapshotId()); + } + } + DataStream origin = builder.build(); + return wrapKrb(origin).assignTimestampsAndWatermarks(watermarkStrategy); + } + + /** extract op from dataStream, and wrap krb support */ + private DataStream wrapKrb(DataStream ds) { + IcebergClassUtil.clean(env); + Transformation origin = ds.getTransformation(); + int scanParallelism = + flinkConf + .getOptional(MixedFormatValidator.SCAN_PARALLELISM) + .orElse(origin.getParallelism()); + + if (origin instanceof OneInputTransformation) { + OneInputTransformation tf = + (OneInputTransformation) ds.getTransformation(); + OneInputStreamOperatorFactory op = (OneInputStreamOperatorFactory) tf.getOperatorFactory(); + ProxyFactory 
inputFormatProxyFactory = + IcebergClassUtil.getInputFormatProxyFactory(op, mixedTable.io(), mixedTable.schema()); + + if (tf.getInputs().isEmpty()) { + return env.addSource( + new UnkeyedInputFormatSourceFunction(inputFormatProxyFactory, tf.getOutputType())) + .setParallelism(scanParallelism); + } + + LegacySourceTransformation tfSource = (LegacySourceTransformation) tf.getInputs().get(0); + StreamSource source = tfSource.getOperator(); + SourceFunction function = IcebergClassUtil.getSourceFunction(source); + + SourceFunction functionProxy = + (SourceFunction) ProxyUtil.getProxy(function, mixedTable.io()); + DataStreamSource sourceStream = + env.addSource(functionProxy, tfSource.getName(), tfSource.getOutputType()); + context.generateUid(MIXED_FORMAT_FILE_TRANSFORMATION).ifPresent(sourceStream::uid); + if (sourceStream instanceof ParallelSourceFunction) { + sourceStream.setParallelism(scanParallelism); + } + return sourceStream.transform( + tf.getName(), + tf.getOutputType(), + new UnkeyedInputFormatOperatorFactory(inputFormatProxyFactory)); + } + + LegacySourceTransformation tfSource = (LegacySourceTransformation) origin; + StreamSource source = tfSource.getOperator(); + InputFormatSourceFunction function = + (InputFormatSourceFunction) IcebergClassUtil.getSourceFunction(source); + + InputFormat inputFormatProxy = + (InputFormat) ProxyUtil.getProxy(function.getFormat(), mixedTable.io()); + DataStreamSource sourceStream = + env.createInput(inputFormatProxy, tfSource.getOutputType()) + .setParallelism(scanParallelism); + context.generateUid(MIXED_FORMAT_FILE_TRANSFORMATION).ifPresent(sourceStream::uid); + return sourceStream; + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/LogDynamicSource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/LogDynamicSource.java new file mode 100644 index 0000000000..08316ef1c5 --- 
/dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/LogDynamicSource.java @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.table; + +import static org.apache.flink.table.connector.ChangelogMode.insertOnly; + +import org.apache.amoro.flink.read.source.log.kafka.LogKafkaSource; +import org.apache.amoro.flink.read.source.log.kafka.LogKafkaSourceBuilder; +import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; +import org.apache.amoro.flink.util.CompatibleFlinkPropertyUtil; +import org.apache.amoro.table.MixedTable; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.connector.source.Boundedness; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.source.DataStreamScanProvider; +import org.apache.flink.table.connector.source.DynamicTableSource; +import 
org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.connector.source.abilities.SupportsProjectionPushDown; +import org.apache.flink.table.connector.source.abilities.SupportsWatermarkPushDown; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.Preconditions; +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Types.NestedField; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.util.Arrays; +import java.util.List; +import java.util.Optional; +import java.util.Properties; +import java.util.stream.Collectors; + +/** This is a log source table api, create log queue consumer e.g. {@link LogKafkaSource} */ +public class LogDynamicSource + implements ScanTableSource, SupportsWatermarkPushDown, SupportsProjectionPushDown { + + private static final Logger LOG = LoggerFactory.getLogger(LogDynamicSource.class); + + private final MixedTable mixedTable; + private final Schema schema; + private final ReadableConfig tableOptions; + private final Optional consumerChangelogMode; + private final boolean logRetractionEnable; + + /** Watermark strategy that is used to generate per-partition watermark. */ + protected @Nullable WatermarkStrategy watermarkStrategy; + + /** Data type to configure the formats. */ + + /** Indices that determine the value fields and the target position in the produced row. */ + protected int[] projectedFields; + + /** Properties for the logStore consumer. 
*/ + protected final Properties properties; + + private static final ChangelogMode ALL_KINDS = + ChangelogMode.newBuilder() + .addContainedKind(RowKind.INSERT) + .addContainedKind(RowKind.UPDATE_BEFORE) + .addContainedKind(RowKind.UPDATE_AFTER) + .addContainedKind(RowKind.DELETE) + .build(); + + public LogDynamicSource( + Properties properties, Schema schema, ReadableConfig tableOptions, MixedTable mixedTable) { + this.schema = schema; + this.tableOptions = tableOptions; + this.consumerChangelogMode = + tableOptions.getOptional(MixedFormatValidator.MIXED_FORMAT_LOG_CONSUMER_CHANGELOG_MODE); + this.logRetractionEnable = + CompatibleFlinkPropertyUtil.propertyAsBoolean( + mixedTable.properties(), + MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.key(), + MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.defaultValue()); + this.mixedTable = mixedTable; + this.properties = properties; + } + + public LogDynamicSource( + Properties properties, + Schema schema, + ReadableConfig tableOptions, + MixedTable mixedTable, + boolean logRetractionEnable, + Optional consumerChangelogMode) { + this.schema = schema; + this.tableOptions = tableOptions; + this.consumerChangelogMode = consumerChangelogMode; + this.logRetractionEnable = logRetractionEnable; + this.mixedTable = mixedTable; + this.properties = properties; + } + + protected LogKafkaSource createKafkaSource() { + Schema projectedSchema = getProjectSchema(schema); + LOG.info("Schema used for create KafkaSource is: {}", projectedSchema); + + LogKafkaSourceBuilder kafkaSourceBuilder = + LogKafkaSource.builder(projectedSchema, mixedTable.properties()); + kafkaSourceBuilder.setProperties(properties); + + LOG.info("build log kafka source"); + return kafkaSourceBuilder.build(); + } + + @Override + public ChangelogMode getChangelogMode() { + String changeLogMode = + consumerChangelogMode.orElse( + mixedTable.isKeyedTable() + ? 
MixedFormatValidator.LOG_CONSUMER_CHANGELOG_MODE_ALL_KINDS + : MixedFormatValidator.LOG_CONSUMER_CHANGELOG_MODE_APPEND_ONLY); + switch (changeLogMode) { + case MixedFormatValidator.LOG_CONSUMER_CHANGELOG_MODE_APPEND_ONLY: + if (logRetractionEnable) { + throw new IllegalArgumentException( + String.format( + "Only %s is false when %s is %s", + MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.key(), + MixedFormatValidator.MIXED_FORMAT_LOG_CONSUMER_CHANGELOG_MODE.key(), + MixedFormatValidator.LOG_CONSUMER_CHANGELOG_MODE_APPEND_ONLY)); + } + return insertOnly(); + case MixedFormatValidator.LOG_CONSUMER_CHANGELOG_MODE_ALL_KINDS: + return ALL_KINDS; + default: + throw new UnsupportedOperationException( + String.format( + "As of now, %s can't support this option %s.", + MixedFormatValidator.MIXED_FORMAT_LOG_CONSUMER_CHANGELOG_MODE.key(), + consumerChangelogMode)); + } + } + + @Override + public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) { + final LogKafkaSource kafkaSource = createKafkaSource(); + + return new DataStreamScanProvider() { + @Override + public DataStream produceDataStream(StreamExecutionEnvironment execEnv) { + if (watermarkStrategy == null) { + watermarkStrategy = WatermarkStrategy.noWatermarks(); + } + int scanParallelism = + tableOptions + .getOptional(MixedFormatValidator.SCAN_PARALLELISM) + .orElse(execEnv.getParallelism()); + return execEnv + .fromSource(kafkaSource, watermarkStrategy, "LogStoreSource-" + mixedTable.name()) + .setParallelism(scanParallelism); + } + + @Override + public boolean isBounded() { + return kafkaSource.getBoundedness() == Boundedness.BOUNDED; + } + }; + } + + @Override + public DynamicTableSource copy() { + return new LogDynamicSource( + this.properties, + this.schema, + this.tableOptions, + this.mixedTable, + this.logRetractionEnable, + this.consumerChangelogMode); + } + + @Override + public String asSummaryString() { + return "Mixed-format Log: " + mixedTable.name(); + } + + 
@Override + public void applyWatermark(WatermarkStrategy watermarkStrategy) { + this.watermarkStrategy = watermarkStrategy; + } + + @Override + public boolean supportsNestedProjection() { + return false; + } + + @Override + public void applyProjection(int[][] projectFields) { + this.projectedFields = new int[projectFields.length]; + for (int i = 0; i < projectFields.length; i++) { + Preconditions.checkArgument( + projectFields[i].length == 1, "Don't support nested projection now."); + this.projectedFields[i] = projectFields[i][0]; + } + } + + private Schema getProjectSchema(Schema projectedSchema) { + if (projectedFields != null) { + List projectedSchemaColumns = projectedSchema.columns(); + projectedSchema = + new Schema( + Arrays.stream(projectedFields) + .mapToObj(projectedSchemaColumns::get) + .collect(Collectors.toList())); + } + return projectedSchema; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedDynamicTableFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedDynamicTableFactory.java new file mode 100644 index 0000000000..d9ca8ea5bd --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedDynamicTableFactory.java @@ -0,0 +1,265 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.table; + +import static org.apache.amoro.table.TableProperties.ENABLE_LOG_STORE; +import static org.apache.amoro.table.TableProperties.ENABLE_LOG_STORE_DEFAULT; +import static org.apache.flink.api.common.RuntimeExecutionMode.BATCH; +import static org.apache.flink.configuration.ExecutionOptions.RUNTIME_MODE; +import static org.apache.flink.streaming.connectors.kafka.table.KafkaConnectorOptions.PROPS_BOOTSTRAP_SERVERS; +import static org.apache.flink.streaming.connectors.kafka.table.KafkaConnectorOptions.PROPS_GROUP_ID; +import static org.apache.flink.streaming.connectors.kafka.table.KafkaConnectorOptions.SCAN_STARTUP_MODE; +import static org.apache.flink.streaming.connectors.kafka.table.KafkaConnectorOptions.SCAN_STARTUP_TIMESTAMP_MILLIS; +import static org.apache.flink.streaming.connectors.kafka.table.KafkaConnectorOptions.SCAN_TOPIC_PARTITION_DISCOVERY; +import static org.apache.flink.streaming.connectors.kafka.table.KafkaConnectorOptions.SINK_PARTITIONER; +import static org.apache.flink.streaming.connectors.kafka.table.KafkaConnectorOptions.TOPIC; + +import org.apache.amoro.flink.InternalCatalogBuilder; +import org.apache.amoro.flink.catalog.MixedCatalog; +import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions; +import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; +import org.apache.amoro.flink.util.CompatibleFlinkPropertyUtil; +import org.apache.amoro.flink.util.MixedFormatUtils; +import org.apache.amoro.table.MixedTable; +import org.apache.amoro.table.TableIdentifier; +import 
org.apache.amoro.utils.CompatiblePropertyUtil; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.connector.kafka.source.KafkaSourceOptions; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.ObjectIdentifier; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.factories.DynamicTableSinkFactory; +import org.apache.flink.table.factories.DynamicTableSourceFactory; +import org.apache.flink.table.factories.FactoryUtil; +import org.apache.flink.table.utils.TableSchemaUtils; +import org.apache.flink.util.Preconditions; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.util.PropertyUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.time.Duration; +import java.util.Collections; +import java.util.HashSet; +import java.util.Map; +import java.util.Optional; +import java.util.Properties; +import java.util.Set; + +/** A factory generates {@link MixedFormatDynamicSource} and {@link MixedFormatDynamicSink} */ +public class MixedDynamicTableFactory + implements DynamicTableSourceFactory, DynamicTableSinkFactory { + private static final Logger LOG = LoggerFactory.getLogger(MixedDynamicTableFactory.class); + public static final String IDENTIFIER = "mixed-format"; + private InternalCatalogBuilder internalCatalogBuilder; + private String internalCatalogName; + + public MixedDynamicTableFactory(MixedCatalog mixedCatalog) { + this.internalCatalogBuilder = mixedCatalog.catalogBuilder(); + this.internalCatalogName = mixedCatalog.amsCatalogName(); + } + + public MixedDynamicTableFactory() {} + + @Override + public 
DynamicTableSource createDynamicTableSource(Context context) { + CatalogTable catalogTable = context.getCatalogTable(); + ObjectIdentifier identifier = context.getObjectIdentifier(); + ObjectPath objectPath; + + FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context); + Configuration options = (Configuration) helper.getOptions(); + + InternalCatalogBuilder actualBuilder = internalCatalogBuilder; + String actualCatalogName = internalCatalogName; + + // It denotes create table by ddl 'connector' option, not through catalog.db.tableName + if (actualBuilder == null || actualCatalogName == null) { + actualCatalogName = options.get(MixedFormatValidator.MIXED_FORMAT_CATALOG); + Preconditions.checkNotNull( + actualCatalogName, + String.format("%s should be set", MixedFormatValidator.MIXED_FORMAT_CATALOG.key())); + String amsUri = options.get(CatalogFactoryOptions.AMS_URI); + actualBuilder = + InternalCatalogBuilder.builder() + .amsUri(amsUri) + .catalogName(actualCatalogName) + .properties(options.toMap()); + } + + if (options.containsKey(MixedFormatValidator.MIXED_FORMAT_DATABASE.key()) + && options.containsKey(MixedFormatValidator.MIXED_FORMAT_TABLE.key())) { + objectPath = + new ObjectPath( + options.get(MixedFormatValidator.MIXED_FORMAT_DATABASE), + options.get(MixedFormatValidator.MIXED_FORMAT_TABLE)); + } else { + objectPath = new ObjectPath(identifier.getDatabaseName(), identifier.getObjectName()); + } + MixedFormatTableLoader tableLoader = + createTableLoader(objectPath, actualCatalogName, actualBuilder, options.toMap()); + MixedTable mixedTable = MixedFormatUtils.loadMixedTable(tableLoader); + + Configuration confWithAll = Configuration.fromMap(mixedTable.properties()); + + ScanTableSource mixedFormatDynamicSource; + + String readMode = + PropertyUtil.propertyAsString( + mixedTable.properties(), + MixedFormatValidator.MIXED_FORMAT_READ_MODE, + MixedFormatValidator.MIXED_READ_MODE_DEFAULT); + + boolean dimTable = + 
CompatibleFlinkPropertyUtil.propertyAsBoolean( + mixedTable.properties(), + MixedFormatValidator.DIM_TABLE_ENABLE.key(), + MixedFormatValidator.DIM_TABLE_ENABLE.defaultValue()); + + TableSchema tableSchema; + if (!dimTable) { + tableSchema = + org.apache.amoro.flink.FlinkSchemaUtil.getPhysicalSchema(catalogTable.getSchema()); + } else { + tableSchema = + org.apache.amoro.flink.FlinkSchemaUtil.getPhysicalSchemaForDimTable( + catalogTable.getSchema()); + } + + switch (readMode) { + case MixedFormatValidator.MIXED_FORMAT_READ_FILE: + boolean batchMode = context.getConfiguration().get(RUNTIME_MODE).equals(BATCH); + LOG.info("Building a file reader in {} runtime mode", batchMode ? "batch" : "streaming"); + mixedFormatDynamicSource = + new MixedFormatFileSource(tableLoader, tableSchema, mixedTable, confWithAll, batchMode); + break; + case MixedFormatValidator.MIXED_FORMAT_READ_LOG: + default: + Preconditions.checkArgument( + CompatiblePropertyUtil.propertyAsBoolean( + mixedTable.properties(), ENABLE_LOG_STORE, ENABLE_LOG_STORE_DEFAULT), + String.format("Read log should enable %s at first", ENABLE_LOG_STORE)); + mixedFormatDynamicSource = createLogSource(mixedTable, context, confWithAll); + } + + return generateDynamicTableSource( + identifier.getObjectName(), mixedFormatDynamicSource, mixedTable, tableLoader); + } + + protected DynamicTableSource generateDynamicTableSource( + String tableName, + ScanTableSource mixedFormatDynamicSource, + MixedTable mixedTable, + MixedFormatTableLoader tableLoader) { + return new MixedFormatDynamicSource( + tableName, mixedFormatDynamicSource, mixedTable, mixedTable.properties(), tableLoader); + } + + @Override + public MixedFormatDynamicSink createDynamicTableSink(Context context) { + CatalogTable catalogTable = context.getCatalogTable(); + + ObjectIdentifier identifier = context.getObjectIdentifier(); + Map options = catalogTable.getOptions(); + + MixedFormatTableLoader tableLoader = + createTableLoader( + new 
ObjectPath(identifier.getDatabaseName(), identifier.getObjectName()), + internalCatalogName, + internalCatalogBuilder, + options); + + MixedTable table = MixedFormatUtils.loadMixedTable(tableLoader); + return new MixedFormatDynamicSink(catalogTable, tableLoader, table.isKeyedTable()); + } + + private static MixedFormatTableLoader createTableLoader( + ObjectPath tablePath, + String internalCatalogName, + InternalCatalogBuilder catalogBuilder, + Map flinkTableProperties) { + TableIdentifier identifier = + TableIdentifier.of( + internalCatalogName, tablePath.getDatabaseName(), tablePath.getObjectName()); + + return MixedFormatTableLoader.of(identifier, catalogBuilder, flinkTableProperties); + } + + @Override + public String factoryIdentifier() { + return IDENTIFIER; + } + + @Override + public Set> requiredOptions() { + return Collections.emptySet(); + } + + @Override + public Set> optionalOptions() { + final Set> options = new HashSet<>(); + options.add(TOPIC); + options.add(PROPS_BOOTSTRAP_SERVERS); + options.add(PROPS_GROUP_ID); + options.add(SCAN_STARTUP_MODE); + options.add(SCAN_STARTUP_TIMESTAMP_MILLIS); + options.add(SINK_PARTITIONER); + options.add(MixedFormatValidator.MIXED_FORMAT_CATALOG); + options.add(MixedFormatValidator.MIXED_FORMAT_TABLE); + options.add(MixedFormatValidator.MIXED_FORMAT_DATABASE); + options.add(MixedFormatValidator.DIM_TABLE_ENABLE); + options.add(CatalogFactoryOptions.AMS_URI); + + // lookup + options.add(MixedFormatValidator.LOOKUP_CACHE_MAX_ROWS); + options.add(MixedFormatValidator.LOOKUP_RELOADING_INTERVAL); + options.add(MixedFormatValidator.LOOKUP_CACHE_TTL_AFTER_WRITE); + + options.add(MixedFormatValidator.ROCKSDB_AUTO_COMPACTIONS); + options.add(MixedFormatValidator.ROCKSDB_WRITING_THREADS); + options.add(MixedFormatValidator.ROCKSDB_BLOCK_CACHE_CAPACITY); + options.add(MixedFormatValidator.ROCKSDB_BLOCK_CACHE_NUM_SHARD_BITS); + return options; + } + + private ScanTableSource createLogSource( + MixedTable mixedTable, Context 
context, ReadableConfig tableOptions) { + CatalogTable catalogTable = context.getCatalogTable(); + TableSchema physicalSchema = TableSchemaUtils.getPhysicalSchema(catalogTable.getSchema()); + Schema schema = FlinkSchemaUtil.convert(physicalSchema); + + final Properties properties = OptionsUtil.getKafkaProperties(mixedTable.properties()); + + // add topic-partition discovery + final Optional partitionDiscoveryInterval = + tableOptions.getOptional(SCAN_TOPIC_PARTITION_DISCOVERY).map(Duration::toMillis); + properties.setProperty( + KafkaSourceOptions.PARTITION_DISCOVERY_INTERVAL_MS.key(), + partitionDiscoveryInterval.orElse(-1L).toString()); + + LOG.info("build log source"); + return new LogDynamicSource(properties, schema, tableOptions, mixedTable); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatDynamicSink.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatDynamicSink.java new file mode 100644 index 0000000000..5de853ec21 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatDynamicSink.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.table; + +import org.apache.amoro.flink.util.MixedFormatUtils; +import org.apache.amoro.flink.write.FlinkSink; +import org.apache.amoro.table.MixedTable; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.ProviderContext; +import org.apache.flink.table.connector.sink.DataStreamSinkProvider; +import org.apache.flink.table.connector.sink.DynamicTableSink; +import org.apache.flink.table.connector.sink.abilities.SupportsOverwrite; +import org.apache.flink.table.connector.sink.abilities.SupportsPartitioning; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.RowKind; +import org.apache.hadoop.security.UserGroupInformation; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Map; + +/** Flink table api that generates sink operators. 
*/ +public class MixedFormatDynamicSink + implements DynamicTableSink, SupportsPartitioning, SupportsOverwrite { + + public static final Logger LOG = LoggerFactory.getLogger(MixedFormatDynamicSink.class); + + private final MixedFormatTableLoader tableLoader; + private final CatalogTable flinkTable; + private final boolean primaryKeyExisted; + private boolean overwrite = false; + + MixedFormatDynamicSink( + CatalogTable flinkTable, MixedFormatTableLoader tableLoader, boolean primaryKeyExisted) { + this.tableLoader = tableLoader; + this.flinkTable = flinkTable; + this.primaryKeyExisted = primaryKeyExisted; + } + + @Override + public ChangelogMode getChangelogMode(ChangelogMode changelogMode) { + ChangelogMode.Builder builder = ChangelogMode.newBuilder().addContainedKind(RowKind.INSERT); + if (primaryKeyExisted) { + builder + .addContainedKind(RowKind.UPDATE_BEFORE) + .addContainedKind(RowKind.UPDATE_AFTER) + .addContainedKind(RowKind.DELETE); + } + return builder.build(); + } + + @Override + public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { + MixedTable table = MixedFormatUtils.loadMixedTable(tableLoader); + + return new DataStreamSinkProvider() { + @Override + public DataStreamSink consumeDataStream( + ProviderContext providerContext, DataStream dataStream) { + DataStreamSink ds = + FlinkSink.forRowData(dataStream) + .context(providerContext) + .table(table) + .flinkSchema(flinkTable.getSchema()) + .tableLoader(tableLoader) + .overwrite(overwrite) + .build(); + UserGroupInformation.reset(); + LOG.info("ugi reset"); + return ds; + } + }; + } + + @Override + public DynamicTableSink copy() { + return this; + } + + @Override + public String asSummaryString() { + return "mixed-format"; + } + + @Override + public void applyStaticPartition(Map map) { + // ignore + } + + @Override + public void applyOverwrite(boolean newOverwrite) { + this.overwrite = newOverwrite; + } +} diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatDynamicSource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatDynamicSource.java new file mode 100644 index 0000000000..2588a8b789 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatDynamicSource.java @@ -0,0 +1,384 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.table; + +import org.apache.amoro.flink.lookup.KVTableFactory; +import org.apache.amoro.flink.lookup.MixedFormatRowDataLookupFunction; +import org.apache.amoro.flink.lookup.filter.RowDataPredicate; +import org.apache.amoro.flink.lookup.filter.RowDataPredicateExpressionVisitor; +import org.apache.amoro.flink.read.hybrid.reader.DataIteratorReaderFunction; +import org.apache.amoro.flink.read.hybrid.reader.RowDataReaderFunction; +import org.apache.amoro.flink.read.source.FlinkKeyedMORDataReader; +import org.apache.amoro.flink.util.FilterUtil; +import org.apache.amoro.flink.util.IcebergAndFlinkFilters; +import org.apache.amoro.hive.io.reader.AbstractAdaptHiveKeyedDataReader; +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; +import org.apache.amoro.table.MixedTable; +import org.apache.amoro.utils.SchemaUtil; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.source.DataStreamScanProvider; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.connector.source.LookupTableSource; +import org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.connector.source.abilities.SupportsFilterPushDown; +import org.apache.flink.table.connector.source.abilities.SupportsLimitPushDown; +import org.apache.flink.table.connector.source.abilities.SupportsProjectionPushDown; +import org.apache.flink.table.connector.source.abilities.SupportsWatermarkPushDown; +import org.apache.flink.table.connector.source.lookup.LookupFunctionProvider; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.expressions.CallExpression; +import org.apache.flink.table.expressions.ResolvedExpression; +import 
org.apache.flink.table.functions.BuiltInFunctionDefinitions; +import org.apache.flink.table.functions.FunctionIdentifier; +import org.apache.flink.table.functions.LookupFunction; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.iceberg.Schema; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.data.RowDataUtil; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.BiFunction; +import java.util.stream.Collectors; + +/** Flink table api that generates source operators. */ +public class MixedFormatDynamicSource + implements ScanTableSource, + SupportsFilterPushDown, + SupportsProjectionPushDown, + SupportsLimitPushDown, + SupportsWatermarkPushDown, + LookupTableSource { + + private static final Logger LOG = LoggerFactory.getLogger(MixedFormatDynamicSource.class); + + protected final String tableName; + + protected final ScanTableSource mixedFormatDynamicSource; + protected final MixedTable mixedTable; + protected final Map properties; + + protected int[] projectFields; + protected List filters; + protected ResolvedExpression flinkExpression; + protected final MixedFormatTableLoader tableLoader; + + @Nullable protected WatermarkStrategy watermarkStrategy; + + /** + * @param tableName tableName + * @param mixedFormatDynamicSource underlying source + * @param mixedTable mixedTable + * @param properties With all mixed-format table properties and sql options + * @param tableLoader + */ + public MixedFormatDynamicSource( + String tableName, + ScanTableSource mixedFormatDynamicSource, 
+ MixedTable mixedTable, + Map properties, + MixedFormatTableLoader tableLoader) { + this.tableName = tableName; + this.mixedFormatDynamicSource = mixedFormatDynamicSource; + this.mixedTable = mixedTable; + this.properties = properties; + this.tableLoader = tableLoader; + } + + public MixedFormatDynamicSource( + String tableName, + ScanTableSource mixedFormatDynamicSource, + MixedTable mixedTable, + Map properties, + MixedFormatTableLoader tableLoader, + int[] projectFields, + List filters, + ResolvedExpression flinkExpression) { + this.tableName = tableName; + this.mixedFormatDynamicSource = mixedFormatDynamicSource; + this.mixedTable = mixedTable; + this.properties = properties; + this.tableLoader = tableLoader; + this.projectFields = projectFields; + this.filters = filters; + this.flinkExpression = flinkExpression; + } + + @Override + public ChangelogMode getChangelogMode() { + return mixedFormatDynamicSource.getChangelogMode(); + } + + @Override + public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) { + ScanRuntimeProvider origin = mixedFormatDynamicSource.getScanRuntimeProvider(scanContext); + Preconditions.checkArgument( + origin instanceof DataStreamScanProvider, + "file or log ScanRuntimeProvider should be DataStreamScanProvider, but provided is " + + origin.getClass()); + return origin; + } + + @Override + public DynamicTableSource copy() { + return new MixedFormatDynamicSource( + tableName, + mixedFormatDynamicSource, + mixedTable, + properties, + tableLoader, + projectFields, + filters, + flinkExpression); + } + + @Override + public String asSummaryString() { + return "Mixed-format Dynamic Source"; + } + + @Override + public Result applyFilters(List filters) { + IcebergAndFlinkFilters icebergAndFlinkFilters = + FilterUtil.convertFlinkExpressToIceberg(filters); + this.filters = icebergAndFlinkFilters.expressions(); + + if (filters.size() == 1) { + flinkExpression = filters.get(0); + } else if (filters.size() >= 2) { + flinkExpression 
= and(filters.get(0), filters.get(1)); + for (int i = 2; i < filters.size(); i++) { + flinkExpression = and(flinkExpression, filters.subList(i, i + 1).get(0)); + } + } + + if (mixedFormatDynamicSource instanceof SupportsFilterPushDown) { + return ((SupportsFilterPushDown) mixedFormatDynamicSource).applyFilters(filters); + } else { + return Result.of(Collections.emptyList(), filters); + } + } + + @Override + public boolean supportsNestedProjection() { + if (mixedFormatDynamicSource instanceof SupportsProjectionPushDown) { + return ((SupportsProjectionPushDown) mixedFormatDynamicSource).supportsNestedProjection(); + } else { + return false; + } + } + + protected CallExpression and(ResolvedExpression left, ResolvedExpression right) { + return CallExpression.permanent( + FunctionIdentifier.of(BuiltInFunctionDefinitions.AND.getName()), + BuiltInFunctionDefinitions.AND, + Arrays.asList(left, right), + DataTypes.BOOLEAN()); + } + + @Override + public void applyProjection(int[][] projectedFields, DataType producedDataType) { + projectFields = new int[projectedFields.length]; + for (int i = 0; i < projectedFields.length; i++) { + Preconditions.checkArgument( + projectedFields[i].length == 1, "Don't support nested projection now."); + projectFields[i] = projectedFields[i][0]; + } + + if (mixedFormatDynamicSource instanceof SupportsProjectionPushDown) { + ((SupportsProjectionPushDown) mixedFormatDynamicSource) + .applyProjection(projectedFields, producedDataType); + } + } + + @Override + public void applyLimit(long newLimit) { + if (mixedFormatDynamicSource instanceof SupportsLimitPushDown) { + ((SupportsLimitPushDown) mixedFormatDynamicSource).applyLimit(newLimit); + } + } + + @Override + public void applyWatermark(WatermarkStrategy watermarkStrategy) { + if (mixedFormatDynamicSource instanceof SupportsWatermarkPushDown) { + ((SupportsWatermarkPushDown) mixedFormatDynamicSource).applyWatermark(watermarkStrategy); + } + } + + @Override + public LookupRuntimeProvider 
getLookupRuntimeProvider(LookupContext context) { + int[] joinKeys = new int[context.getKeys().length]; + for (int i = 0; i < context.getKeys().length; i++) { + Preconditions.checkArgument( + context.getKeys()[i].length == 1, + "Mixed-format lookup join doesn't support the row field as a joining key."); + joinKeys[i] = context.getKeys()[i][0]; + } + + return LookupFunctionProvider.of(getLookupFunction(joinKeys)); + } + + protected LookupFunction getLookupFunction(int[] joinKeys) { + Schema projectedSchema = getProjectedSchema(); + + List joinKeyNames = getJoinKeyNames(joinKeys, projectedSchema); + + Configuration config = new Configuration(); + properties.forEach(config::setString); + + Optional rowDataPredicate = + generatePredicate(projectedSchema, flinkExpression); + + AbstractAdaptHiveKeyedDataReader flinkMORDataReader = + generateMORReader(mixedTable, projectedSchema); + DataIteratorReaderFunction readerFunction = + generateReaderFunction(mixedTable, projectedSchema); + + return new MixedFormatRowDataLookupFunction( + KVTableFactory.INSTANCE, + mixedTable, + joinKeyNames, + projectedSchema, + filters, + tableLoader, + config, + rowDataPredicate.orElse(null), + flinkMORDataReader, + readerFunction); + } + + protected DataIteratorReaderFunction generateReaderFunction( + MixedTable mixedTable, Schema projectedSchema) { + return new RowDataReaderFunction( + new Configuration(), + mixedTable.schema(), + projectedSchema, + mixedTable.asKeyedTable().primaryKeySpec(), + null, + true, + mixedTable.io(), + true); + } + + protected AbstractAdaptHiveKeyedDataReader generateMORReader( + MixedTable mixedTable, Schema projectedSchema) { + BiFunction convertConstant = new ConvertTask(); + + return new FlinkKeyedMORDataReader( + mixedTable.io(), + mixedTable.schema(), + projectedSchema, + mixedTable.asKeyedTable().primaryKeySpec(), + null, + true, + convertConstant, + true); + } + + static class ConvertTask implements BiFunction, Serializable { + private static final long 
serialVersionUID = 4607513893568225789L; + + @Override + public Object apply(Type t, Object u) { + return RowDataUtil.convertConstant(t, u); + } + } + + protected List getJoinKeyNames(int[] joinKeys, Schema projectedSchema) { + return Arrays.stream(joinKeys) + .mapToObj(index -> projectedSchema.columns().get(index).name()) + .collect(Collectors.toList()); + } + + protected Schema getProjectedSchema() { + Schema mixedFormatTableSchema = mixedTable.schema(); + Schema projectedSchema; + if (projectFields == null) { + LOG.info("The projected fields is null."); + projectedSchema = mixedTable.schema(); + } else { + if (mixedTable.isUnkeyedTable()) { + throw new UnsupportedOperationException("Unkeyed table doesn't support lookup join."); + } + List primaryKeys = mixedTable.asKeyedTable().primaryKeySpec().fieldNames(); + List projectFieldList = + Arrays.stream(projectFields).boxed().collect(Collectors.toList()); + List columns = mixedFormatTableSchema.columns(); + for (int i = 0; i < mixedFormatTableSchema.columns().size(); i++) { + if (primaryKeys.contains(columns.get(i).name()) && !projectFieldList.contains(i)) { + projectFieldList.add(i); + LOG.info( + "Add identifier field {} to projected schema, due to this field is mismatched.", + columns.get(i).name()); + } + } + + List projectedFieldNames = + projectFieldList.stream() + .map(index -> columns.get(index).name()) + .collect(Collectors.toList()); + projectedSchema = SchemaUtil.selectInOrder(mixedFormatTableSchema, projectedFieldNames); + LOG.info("The projected schema {}.\n table schema {}.", projectedSchema, mixedTable.schema()); + } + return projectedSchema; + } + + protected Optional generatePredicate( + final Schema projectedSchema, final ResolvedExpression flinkExpression) { + if (flinkExpression == null) { + return Optional.empty(); + } + + final Map fieldIndexMap = new HashMap<>(); + final Map fieldDataTypeMap = new HashMap<>(); + List fields = projectedSchema.asStruct().fields(); + for (int i = 0; i < 
fields.size(); i++) { + Types.NestedField field = fields.get(i); + fieldIndexMap.put(field.name(), i); + fieldDataTypeMap.put( + field.name(), + TypeConversions.fromLogicalToDataType(FlinkSchemaUtil.convert(field.type()))); + } + + RowDataPredicateExpressionVisitor visitor = + generateExpressionVisitor(fieldIndexMap, fieldDataTypeMap); + return flinkExpression.accept(visitor); + } + + protected RowDataPredicateExpressionVisitor generateExpressionVisitor( + Map fieldIndexMap, Map fieldDataTypeMap) { + return new RowDataPredicateExpressionVisitor(fieldIndexMap, fieldDataTypeMap); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatFileSource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatFileSource.java new file mode 100644 index 0000000000..a408e387bd --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatFileSource.java @@ -0,0 +1,244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.table; + +import static org.apache.flink.api.common.RuntimeExecutionMode.BATCH; +import static org.apache.flink.configuration.ExecutionOptions.RUNTIME_MODE; + +import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; +import org.apache.amoro.flink.util.CompatibleFlinkPropertyUtil; +import org.apache.amoro.flink.util.FilterUtil; +import org.apache.amoro.flink.util.IcebergAndFlinkFilters; +import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableList; +import org.apache.amoro.table.MixedTable; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.ProviderContext; +import org.apache.flink.table.connector.source.DataStreamScanProvider; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.connector.source.abilities.SupportsFilterPushDown; +import org.apache.flink.table.connector.source.abilities.SupportsLimitPushDown; +import org.apache.flink.table.connector.source.abilities.SupportsProjectionPushDown; +import org.apache.flink.table.connector.source.abilities.SupportsWatermarkPushDown; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.types.DataType; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.Preconditions; +import org.apache.iceberg.expressions.Expression; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.util.Arrays; 
+import java.util.List; + +/** Flink table api that generates mixed-format base/change file source operators. */ +public class MixedFormatFileSource + implements ScanTableSource, + SupportsFilterPushDown, + SupportsProjectionPushDown, + SupportsLimitPushDown, + SupportsWatermarkPushDown { + + private static final Logger LOG = LoggerFactory.getLogger(MixedFormatFileSource.class); + + private int[] projectedFields; + private long limit; + private List filters; + private final MixedTable table; + @Nullable protected WatermarkStrategy watermarkStrategy; + + private final MixedFormatTableLoader loader; + private final TableSchema tableSchema; + private final ReadableConfig readableConfig; + private final boolean batchMode; + + private MixedFormatFileSource(MixedFormatFileSource toCopy) { + this.loader = toCopy.loader; + this.tableSchema = toCopy.tableSchema; + this.projectedFields = toCopy.projectedFields; + this.limit = toCopy.limit; + this.filters = toCopy.filters; + this.readableConfig = toCopy.readableConfig; + this.table = toCopy.table; + this.watermarkStrategy = toCopy.watermarkStrategy; + this.batchMode = toCopy.batchMode; + } + + public MixedFormatFileSource( + MixedFormatTableLoader loader, + TableSchema tableSchema, + int[] projectedFields, + MixedTable table, + long limit, + List filters, + ReadableConfig readableConfig, + boolean batchMode) { + this.loader = loader; + this.tableSchema = tableSchema; + this.projectedFields = projectedFields; + this.limit = limit; + this.table = table; + this.filters = filters; + this.readableConfig = readableConfig; + this.batchMode = batchMode; + } + + public MixedFormatFileSource( + MixedFormatTableLoader loader, + TableSchema tableSchema, + MixedTable table, + ReadableConfig readableConfig, + boolean batchMode) { + this(loader, tableSchema, null, table, -1, ImmutableList.of(), readableConfig, batchMode); + } + + @Override + public void applyProjection(int[][] projectFields) { + this.projectedFields = new 
int[projectFields.length]; + for (int i = 0; i < projectFields.length; i++) { + Preconditions.checkArgument( + projectFields[i].length == 1, "Don't support nested projection now."); + this.projectedFields[i] = projectFields[i][0]; + } + } + + private DataStream createDataStream( + ProviderContext providerContext, StreamExecutionEnvironment execEnv) { + return FlinkSource.forRowData() + .context(providerContext) + .env(execEnv) + .tableLoader(loader) + .mixedFormatTable(table) + .project(getProjectedSchema()) + .limit(limit) + .filters(filters) + .flinkConf(readableConfig) + .batchMode(execEnv.getConfiguration().get(RUNTIME_MODE).equals(BATCH)) + .watermarkStrategy(watermarkStrategy) + .build(); + } + + private TableSchema getProjectedSchema() { + if (projectedFields == null) { + return tableSchema; + } else { + String[] fullNames = tableSchema.getFieldNames(); + DataType[] fullTypes = tableSchema.getFieldDataTypes(); + + String[] projectedColumns = + Arrays.stream(projectedFields).mapToObj(i -> fullNames[i]).toArray(String[]::new); + TableSchema.Builder builder = + TableSchema.builder() + .fields( + projectedColumns, + Arrays.stream(projectedFields) + .mapToObj(i -> fullTypes[i]) + .toArray(DataType[]::new)); + boolean dimTable = + CompatibleFlinkPropertyUtil.propertyAsBoolean( + table.properties(), + MixedFormatValidator.DIM_TABLE_ENABLE.key(), + MixedFormatValidator.DIM_TABLE_ENABLE.defaultValue()); + if (dimTable) { + builder.watermark(tableSchema.getWatermarkSpecs().get(0)); + } + + TableSchema ts = builder.build(); + LOG.info("TableSchema after projection:{}", ts); + return ts; + } + } + + @Override + public void applyLimit(long newLimit) { + this.limit = newLimit; + } + + @Override + public Result applyFilters(List flinkFilters) { + IcebergAndFlinkFilters icebergAndFlinkFilters = + FilterUtil.convertFlinkExpressToIceberg(flinkFilters); + this.filters = icebergAndFlinkFilters.expressions(); + return Result.of(icebergAndFlinkFilters.acceptedFilters(), 
flinkFilters); + } + + @Override + public boolean supportsNestedProjection() { + // TODO: support nested projection + return false; + } + + @Override + public ChangelogMode getChangelogMode() { + if (table.isUnkeyedTable() || batchMode) { + return ChangelogMode.insertOnly(); + } + return ChangelogMode.newBuilder() + .addContainedKind(RowKind.DELETE) + .addContainedKind(RowKind.INSERT) + .addContainedKind(RowKind.UPDATE_AFTER) + .addContainedKind(RowKind.UPDATE_BEFORE) + .build(); + } + + @Override + public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderContext) { + return new DataStreamScanProvider() { + @Override + public DataStream produceDataStream( + ProviderContext providerContext, StreamExecutionEnvironment execEnv) { + return createDataStream(providerContext, execEnv); + } + + @Override + public boolean isBounded() { + return org.apache.iceberg.flink.source.FlinkSource.isBounded(table.properties()); + } + }; + } + + @Override + public DynamicTableSource copy() { + return new MixedFormatFileSource(this); + } + + @Override + public String asSummaryString() { + return "Mixed-Format File Source"; + } + + @Override + public void applyWatermark(WatermarkStrategy watermarkStrategy) { + Configuration conf = Configuration.fromMap(table.properties()); + boolean dimTable = + CompatibleFlinkPropertyUtil.propertyAsBoolean(conf, MixedFormatValidator.DIM_TABLE_ENABLE); + if (!dimTable) { + this.watermarkStrategy = watermarkStrategy; + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatTableLoader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatTableLoader.java new file mode 100644 index 0000000000..d7282739fb --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatTableLoader.java @@ -0,0 +1,152 @@ 
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.amoro.flink.table;

import org.apache.amoro.flink.InternalCatalogBuilder;
import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions;
import org.apache.amoro.flink.interceptor.FlinkTablePropertiesInvocationHandler;
import org.apache.amoro.mixed.MixedFormatCatalog;
import org.apache.amoro.shade.guava32.com.google.common.base.MoreObjects;
import org.apache.amoro.table.MixedTable;
import org.apache.amoro.table.TableIdentifier;
import org.apache.iceberg.Table;
import org.apache.iceberg.flink.TableLoader;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Loads a proxy table whose properties are the union of the mixed-format table properties and the
 * Flink table (DDL) properties.
 */
public class MixedFormatTableLoader implements TableLoader {

  private static final long serialVersionUID = 1L;

  protected final InternalCatalogBuilder catalogBuilder;
  protected final TableIdentifier tableIdentifier;
  protected final Map<String, String> flinkTableProperties;

  /**
   * Which internal store of a keyed table to expose through {@link #loadTable()}: the base table
   * ({@code true}) or the change table ({@code false}). Kept for compatibility with the Iceberg
   * committer, which expects a plain Iceberg {@link Table}.
   */
  protected boolean loadBaseForKeyedTable;

  // Rebuilt lazily via open(); catalogs are not serializable, hence transient.
  protected transient MixedFormatCatalog mixedFormatCatalog;

  public static MixedFormatTableLoader of(
      TableIdentifier tableIdentifier, InternalCatalogBuilder catalogBuilder) {
    return of(tableIdentifier, catalogBuilder, new HashMap<>());
  }

  public static MixedFormatTableLoader of(
      TableIdentifier tableIdentifier,
      InternalCatalogBuilder catalogBuilder,
      Map<String, String> flinkTableProperties) {
    return new MixedFormatTableLoader(tableIdentifier, catalogBuilder, flinkTableProperties);
  }

  public static MixedFormatTableLoader of(
      TableIdentifier tableIdentifier, Map<String, String> flinkTableProperties) {
    // The AMS endpoint is carried inside the Flink table properties in this variant.
    String metastoreUri = flinkTableProperties.get(CatalogFactoryOptions.AMS_URI.key());
    return new MixedFormatTableLoader(
        tableIdentifier,
        InternalCatalogBuilder.builder().amsUri(metastoreUri),
        flinkTableProperties);
  }

  public static MixedFormatTableLoader of(
      TableIdentifier tableIdentifier,
      String metastoreUri,
      Map<String, String> flinkTableProperties) {
    return new MixedFormatTableLoader(
        tableIdentifier,
        InternalCatalogBuilder.builder().amsUri(metastoreUri),
        flinkTableProperties);
  }

  protected MixedFormatTableLoader(
      TableIdentifier tableIdentifier,
      InternalCatalogBuilder catalogBuilder,
      Map<String, String> flinkTableProperties) {
    this(tableIdentifier, catalogBuilder, flinkTableProperties, null);
  }

  protected MixedFormatTableLoader(
      TableIdentifier tableIdentifier,
      InternalCatalogBuilder catalogBuilder,
      Map<String, String> flinkTableProperties,
      Boolean loadBaseForKeyedTable) {
    this.catalogBuilder = catalogBuilder;
    this.tableIdentifier = tableIdentifier;
    this.flinkTableProperties = flinkTableProperties;
    // null is treated as "base table" (the default); only an explicit FALSE selects change table.
    this.loadBaseForKeyedTable = !Boolean.FALSE.equals(loadBaseForKeyedTable);
  }

  @Override
  public void open() {
    mixedFormatCatalog = catalogBuilder.build();
  }

  @Override
  public boolean isOpen() {
    return mixedFormatCatalog != null;
  }

  /**
   * Loads the mixed-format table wrapped in a proxy that overlays the Flink table properties on
   * top of the catalog-stored properties.
   */
  public MixedTable loadMixedFormatTable() {
    MixedTable original = mixedFormatCatalog.loadTable(tableIdentifier);
    FlinkTablePropertiesInvocationHandler handler =
        new FlinkTablePropertiesInvocationHandler(flinkTableProperties, original);
    return (MixedTable) handler.getProxy();
  }

  /** Switches which internal store of a keyed table subsequent {@link #loadTable()} calls return. */
  public void switchLoadInternalTableForKeyedTable(boolean loadBaseForKeyedTable) {
    this.loadBaseForKeyedTable = loadBaseForKeyedTable;
  }

  @Override
  public Table loadTable() {
    MixedTable table = loadMixedFormatTable();

    if (table.isKeyedTable()) {
      return loadBaseForKeyedTable
          ? table.asKeyedTable().baseTable()
          : table.asKeyedTable().changeTable();
    }
    // An unkeyed mixed table is expected to also be an Iceberg Table; fail fast otherwise.
    if (!(table instanceof Table)) {
      throw new UnsupportedOperationException(
          String.format("table type mismatched. It's %s", table.getClass()));
    }
    return (Table) table;
  }

  @Override
  public TableLoader clone() {
    return new MixedFormatTableLoader(
        tableIdentifier, catalogBuilder, flinkTableProperties, loadBaseForKeyedTable);
  }

  @Override
  public void close() throws IOException {}

  @Override
  public String toString() {
    return MoreObjects.toStringHelper(this).add("tableIdentifier", tableIdentifier).toString();
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.amoro.flink.table;

import org.apache.amoro.shade.guava32.com.google.common.collect.Maps;

import java.util.Map;
import java.util.Properties;

/** Helpers for extracting Kafka and catalog properties from table options. */
public class OptionsUtil {

  // Options whose keys start with this prefix are forwarded with the prefix stripped.
  public static final String PROPERTIES_PREFIX = "properties.";

  /**
   * Collects all options prefixed with {@value #PROPERTIES_PREFIX} into a {@link Properties}
   * object, stripping the prefix from each key.
   */
  public static Properties getKafkaProperties(Map<String, String> tableOptions) {
    final Properties kafkaProperties = new Properties();

    if (hasProperties(tableOptions)) {
      for (Map.Entry<String, String> entry : tableOptions.entrySet()) {
        String key = entry.getKey();
        if (key.startsWith(PROPERTIES_PREFIX)) {
          kafkaProperties.put(key.substring(PROPERTIES_PREFIX.length()), entry.getValue());
        }
      }
    }
    return kafkaProperties;
  }

  /**
   * Returns a copy of {@code options} where keys carrying the {@value #PROPERTIES_PREFIX} prefix
   * have the prefix removed; all other entries are kept verbatim.
   */
  public static Map<String, String> getCatalogProperties(Map<String, String> options) {
    Map<String, String> catalogProperties = Maps.newHashMap();
    for (Map.Entry<String, String> entry : options.entrySet()) {
      String key = entry.getKey();
      String normalizedKey =
          key.startsWith(PROPERTIES_PREFIX) ? key.substring(PROPERTIES_PREFIX.length()) : key;
      catalogProperties.put(normalizedKey, entry.getValue());
    }
    return catalogProperties;
  }

  /** Decides if the table options contain any key starting with the 'properties.' prefix. */
  private static boolean hasProperties(Map<String, String> tableOptions) {
    return tableOptions.keySet().stream().anyMatch(k -> k.startsWith(PROPERTIES_PREFIX));
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.amoro.flink.table;

import org.apache.amoro.TableFormat;
import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions;
import org.apache.amoro.flink.table.descriptors.MixedFormatValidator;
import org.apache.amoro.shade.guava32.com.google.common.collect.Sets;
import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.catalog.AbstractCatalog;
import org.apache.flink.table.catalog.ObjectIdentifier;
import org.apache.flink.table.connector.sink.DynamicTableSink;
import org.apache.flink.table.connector.source.DynamicTableSource;
import org.apache.flink.table.factories.DynamicTableSinkFactory;
import org.apache.flink.table.factories.DynamicTableSourceFactory;
import org.apache.flink.table.factories.Factory;
import org.apache.flink.table.factories.FactoryUtil;
import org.apache.flink.util.Preconditions;

import java.util.Map;
import java.util.Optional;
import java.util.Set;

/**
 * UnifiedDynamicTableFactory is a factory for creating dynamic table sources and sinks. It
 * dispatches creation to the factory of the underlying catalog registered for the table's declared
 * {@code table.format}, implementing both DynamicTableSourceFactory and DynamicTableSinkFactory.
 */
public class UnifiedDynamicTableFactory
    implements DynamicTableSourceFactory, DynamicTableSinkFactory {

  private final Map<TableFormat, AbstractCatalog> availableCatalogs;

  public UnifiedDynamicTableFactory(Map<TableFormat, AbstractCatalog> availableCatalogs) {
    this.availableCatalogs =
        Preconditions.checkNotNull(availableCatalogs, "availableCatalogs cannot be null");
  }

  @Override
  public DynamicTableSink createDynamicTableSink(Context context) {
    ObjectIdentifier identifier = context.getObjectIdentifier();
    TableFormat tableFormat = declaredTableFormat(context);

    return getOriginalCatalog(tableFormat)
        .flatMap(AbstractCatalog::getFactory)
        .filter(factory -> factory instanceof DynamicTableSinkFactory)
        .map(factory -> ((DynamicTableSinkFactory) factory).createDynamicTableSink(context))
        .orElseThrow(() -> unsupportedFormat(tableFormat, identifier));
  }

  @Override
  public DynamicTableSource createDynamicTableSource(Context context) {
    ObjectIdentifier identifier = context.getObjectIdentifier();
    TableFormat tableFormat = declaredTableFormat(context);

    return getOriginalCatalog(tableFormat)
        .flatMap(AbstractCatalog::getFactory)
        .filter(factory -> factory instanceof DynamicTableSourceFactory)
        .map(factory -> ((DynamicTableSourceFactory) factory).createDynamicTableSource(context))
        .orElseThrow(() -> unsupportedFormat(tableFormat, identifier));
  }

  /** Reads the {@code table.format} option declared on the table being created. */
  private TableFormat declaredTableFormat(Context context) {
    FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context);
    Configuration options = (Configuration) helper.getOptions();
    return TableFormat.valueOf(options.get(MixedFormatValidator.TABLE_FORMAT));
  }

  /** Builds the error raised when no suitable catalog/factory exists for the format. */
  private static UnsupportedOperationException unsupportedFormat(
      TableFormat tableFormat, ObjectIdentifier identifier) {
    return new UnsupportedOperationException(
        String.format(
            "Invalid catalog or factory for table format: %s, table: %s.",
            tableFormat, identifier));
  }

  private Optional<AbstractCatalog> getOriginalCatalog(TableFormat format) {
    // BUG FIX: Optional.of(map.get(...)) threw NullPointerException when the format had no
    // registered catalog; ofNullable lets orElseThrow surface the intended
    // UnsupportedOperationException instead.
    return Optional.ofNullable(availableCatalogs.get(format));
  }

  @Override
  public String factoryIdentifier() {
    return CatalogFactoryOptions.UNIFIED_IDENTIFIER;
  }

  @Override
  public Set<ConfigOption<?>> requiredOptions() {
    // Union of the required options of every delegate factory, plus 'table.format' itself.
    Set<ConfigOption<?>> requiredOptions = Sets.newHashSet();
    availableCatalogs.forEach(
        (format, catalog) -> {
          Optional<Factory> factory = catalog.getFactory();
          factory.ifPresent(value -> requiredOptions.addAll(value.requiredOptions()));
        });
    requiredOptions.add(MixedFormatValidator.TABLE_FORMAT);
    return requiredOptions;
  }

  @Override
  public Set<ConfigOption<?>> optionalOptions() {
    // Union of the optional options of every delegate factory.
    Set<ConfigOption<?>> optionalOptions = Sets.newHashSet();
    availableCatalogs.forEach(
        (format, catalog) -> {
          Optional<Factory> factory = catalog.getFactory();
          factory.ifPresent(value -> optionalOptions.addAll(value.optionalOptions()));
        });
    return optionalOptions;
  }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.table; + +import org.apache.amoro.flink.interceptor.ProxyFactory; +import org.apache.amoro.flink.util.IcebergClassUtil; +import org.apache.flink.api.common.operators.MailboxExecutor; +import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.StreamOperator; +import org.apache.flink.streaming.api.operators.StreamOperatorParameters; +import org.apache.flink.streaming.api.operators.YieldingOperatorFactory; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.flink.source.FlinkInputFormat; +import org.apache.iceberg.flink.source.FlinkInputSplit; +import org.apache.iceberg.flink.source.StreamingReaderOperator; + +public class UnkeyedInputFormatOperatorFactory extends AbstractStreamOperatorFactory + implements YieldingOperatorFactory, + OneInputStreamOperatorFactory { + + private final ProxyFactory factory; + + private transient MailboxExecutor mailboxExecutor; + + public UnkeyedInputFormatOperatorFactory(ProxyFactory factory) { + this.factory = factory; + } + + @Override + public void setMailboxExecutor(MailboxExecutor mailboxExecutor) { + this.mailboxExecutor = mailboxExecutor; + } + + @SuppressWarnings("unchecked") + @Override + public > O createStreamOperator( + StreamOperatorParameters parameters) { + StreamingReaderOperator operator = + IcebergClassUtil.newStreamingReaderOperator( + factory.getInstance(), processingTimeService, mailboxExecutor); + 
operator.setup( + parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); + return (O) operator; + } + + @Override + public Class getStreamOperatorClass(ClassLoader classLoader) { + return StreamingReaderOperator.class; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnkeyedInputFormatSourceFunction.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnkeyedInputFormatSourceFunction.java new file mode 100644 index 0000000000..9a85051e49 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnkeyedInputFormatSourceFunction.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.amoro.flink.table;

import org.apache.amoro.flink.interceptor.ProxyFactory;
import org.apache.flink.api.common.io.RichInputFormat;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.io.InputSplit;
import org.apache.flink.metrics.Counter;
import org.apache.flink.runtime.jobgraph.tasks.InputSplitProvider;
import org.apache.flink.runtime.jobgraph.tasks.InputSplitProviderException;
import org.apache.flink.streaming.api.functions.source.InputFormatSourceFunction;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
import org.apache.flink.streaming.api.operators.StreamingRuntimeContext;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.source.FlinkInputFormat;
import org.apache.iceberg.flink.source.FlinkInputSplit;

import java.util.Iterator;
import java.util.NoSuchElementException;

/**
 * Copy of {@link InputFormatSourceFunction}, adapted to create its {@link FlinkInputFormat}
 * through a {@link ProxyFactory} so kerberos/property interception can be applied.
 */
public class UnkeyedInputFormatSourceFunction extends RichParallelSourceFunction<RowData> {
  private static final long serialVersionUID = 1L;

  private final TypeInformation<RowData> typeInfo;
  private transient TypeSerializer<RowData> serializer;

  // Created in open() from formatFactory; never serialized with a live value.
  private FlinkInputFormat format;
  private final ProxyFactory<FlinkInputFormat> formatFactory;

  private transient InputSplitProvider provider;
  private transient Iterator<InputSplit> splitIterator;

  private volatile boolean isRunning = true;

  @SuppressWarnings("unchecked")
  public UnkeyedInputFormatSourceFunction(
      ProxyFactory<FlinkInputFormat> formatFactory, TypeInformation<RowData> typeInfo) {
    this.formatFactory = formatFactory;
    this.typeInfo = typeInfo;
  }

  @Override
  @SuppressWarnings("unchecked")
  public void open(Configuration parameters) throws Exception {
    StreamingRuntimeContext streamingContext = (StreamingRuntimeContext) getRuntimeContext();

    format = formatFactory.getInstance();
    if (format instanceof RichInputFormat) {
      format.setRuntimeContext(streamingContext);
    }
    format.configure(parameters);

    provider = streamingContext.getInputSplitProvider();
    serializer = typeInfo.createSerializer(getRuntimeContext().getExecutionConfig());
    splitIterator = getInputSplits();
    // No splits at all means there is nothing to run.
    isRunning = splitIterator.hasNext();
  }

  @Override
  public void run(SourceContext<RowData> ctx) throws Exception {
    try {
      Counter completedSplitsCounter =
          getRuntimeContext().getMetricGroup().counter("numSplitsProcessed");
      if (isRunning && format instanceof RichInputFormat) {
        format.openInputFormat();
      }

      RowData reuse = serializer.createInstance();
      while (isRunning) {
        format.open((FlinkInputSplit) splitIterator.next());

        // For each record we also check whether cancel() was called, via the isRunning flag.
        while (isRunning && !format.reachedEnd()) {
          reuse = format.nextRecord(reuse);
          if (reuse == null) {
            break;
          }
          ctx.collect(reuse);
        }
        format.close();
        completedSplitsCounter.inc();

        if (isRunning) {
          isRunning = splitIterator.hasNext();
        }
      }
    } finally {
      format.close();
      if (format instanceof RichInputFormat) {
        format.closeInputFormat();
      }
      isRunning = false;
    }
  }

  @Override
  public void cancel() {
    isRunning = false;
  }

  @Override
  public void close() throws Exception {
    format.close();
    if (format instanceof RichInputFormat) {
      format.closeInputFormat();
    }
  }

  /**
   * Returns the {@code InputFormat}. Only needed so the input split assigner can be set on the
   * {@code StreamGraph}.
   */
  public FlinkInputFormat getFormat() {
    return format;
  }

  /** Lazily pulls splits from the {@link InputSplitProvider}, one at a time. */
  private Iterator<InputSplit> getInputSplits() {

    return new Iterator<InputSplit>() {

      // Prefetched split, consumed by next().
      private InputSplit pending;

      // Set once the provider reports no more splits; hasNext() is then permanently false.
      private boolean drained;

      @Override
      public boolean hasNext() {
        if (drained) {
          return false;
        }

        if (pending != null) {
          return true;
        }

        final InputSplit fetched;
        try {
          fetched = provider.getNextInputSplit(getRuntimeContext().getUserCodeClassLoader());
        } catch (InputSplitProviderException e) {
          throw new RuntimeException("Could not retrieve next input split.", e);
        }

        if (fetched == null) {
          drained = true;
          return false;
        }
        this.pending = fetched;
        return true;
      }

      @Override
      public InputSplit next() {
        if (this.pending == null && !hasNext()) {
          throw new NoSuchElementException();
        }

        final InputSplit result = this.pending;
        this.pending = null;
        return result;
      }

      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }
    };
  }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.table.descriptors; + +import static org.apache.flink.configuration.description.TextElement.text; + +import org.apache.amoro.TableFormat; +import org.apache.commons.lang.StringUtils; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.description.Description; +import org.apache.flink.table.api.ValidationException; +import org.apache.flink.table.catalog.CatalogBaseTable; +import org.apache.flink.table.descriptors.ConnectorDescriptorValidator; +import org.apache.flink.table.descriptors.DescriptorProperties; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.util.Preconditions; + +import java.time.Duration; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** Validate mixed-format table properties. 
*/ +public class MixedFormatValidator extends ConnectorDescriptorValidator { + + public static final String MIXED_FORMAT_EMIT_LOG = "log"; + public static final String MIXED_FORMAT_EMIT_FILE = "file"; + + public static final String MIXED_FORMAT_EMIT_AUTO = "auto"; + + public static final String MIXED_FORMAT_READ_FILE = "file"; + public static final String MIXED_FORMAT_READ_LOG = "log"; + + public static final String MIXED_FORMAT_READ_MODE = "mixed-format.read.mode"; + public static final String MIXED_READ_MODE_DEFAULT = MIXED_FORMAT_READ_FILE; + + public static final String MIXED_FORMAT_LATENCY_METRIC_ENABLE = "metrics.event-latency.enabled"; + public static final boolean MIXED_FORMAT_LATENCY_METRIC_ENABLE_DEFAULT = false; + + @Deprecated + public static final String MIXED_FORMAT_LATENCY_METRIC_ENABLE_LEGACY = + "metrics.event-latency.enable"; + + public static final String MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE = "metrics.enabled"; + public static final boolean MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE_DEFAULT = false; + + @Deprecated + public static final String MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE_LEGACY = "metrics.enable"; + + public static final String BASE_WRITE_LOCATION = "base.write.location"; + public static final String BASE_WRITE_LOCATION_SUFFIX = "/init"; + + public static final String MIXED_FORMAT_WRITE_MAX_OPEN_FILE_SIZE = "write.open-files.size.max"; + public static final long MIXED_FORMAT_WRITE_MAX_OPEN_FILE_SIZE_DEFAULT = + 671088640L; // 640M = 5 * 128M + + // log.consumer.changelog.mode + public static final String LOG_CONSUMER_CHANGELOG_MODE_APPEND_ONLY = "append-only"; + public static final String LOG_CONSUMER_CHANGELOG_MODE_ALL_KINDS = "all-kinds"; + + // file scan startup mode + public static final String SCAN_STARTUP_MODE_EARLIEST = "earliest"; + public static final String SCAN_STARTUP_MODE_LATEST = "latest"; + public static final String SCAN_STARTUP_MODE_TIMESTAMP = "timestamp"; + public static final String SCAN_STARTUP_MODE_GROUP_OFFSETS = 
"group-offsets"; + public static final String SCAN_STARTUP_MODE_SPECIFIC_OFFSETS = "specific-offsets"; + + public static final ConfigOption MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE = + ConfigOptions.key("log-store.consistency-guarantee.enabled") + .booleanType() + .defaultValue(false) + .withDescription("Flag hidden kafka read retraction enable or not."); + + @Deprecated + public static final ConfigOption MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE_LEGACY = + ConfigOptions.key("log-store.consistency-guarantee.enable") + .booleanType() + .defaultValue(false) + .withDescription("Flag hidden kafka read retraction enable or not."); + + public static final ConfigOption MIXED_FORMAT_LOG_CONSUMER_CHANGELOG_MODE = + ConfigOptions.key("log.consumer.changelog.modes") + .stringType() + .defaultValue("all-kinds") + .withDescription( + Description.builder() + .text("Describe what changelog modes does the log consumer support ") + .list( + text("'all-kinds' (log consumer support +I/-D/-U/+U)"), + text("'append-only' (log consumer only support +I)")) + .build()) + .withDescription("Describe what changelog modes does the log consumer support."); + + public static final ConfigOption SOURCE_READER_FETCH_BATCH_RECORD_COUNT = + ConfigOptions.key("table.exec.iceberg.fetch-batch-record-count") + .intType() + .defaultValue(2048) + .withDescription("The target number of records for Iceberg reader fetch batch."); + + public static final ConfigOption SCAN_STARTUP_MODE = + ConfigOptions.key("scan.startup.mode") + .stringType() + .defaultValue(SCAN_STARTUP_MODE_LATEST) + .withDescription( + String.format( + "Optional startup mode for mixed-format source, valid values are " + + "\"earliest\" or \"latest\", \"timestamp\". 
If %s values %s, \"earliest\":" + + " read earliest table data including base and change files from" + + " the current snapshot, \"latest\": read all incremental data in the change table starting from the" + + " current snapshot (the current snapshot will be excluded), \"timestamp\" has not supported yet." + + " If %s values %s, \"earliest\": start from the earliest offset possible." + + " \"latest\": start from the latest offset," + + " \"timestamp\": start from user-supplied timestamp for each partition.", + MIXED_FORMAT_READ_MODE, + MIXED_FORMAT_READ_FILE, + MIXED_FORMAT_READ_MODE, + MIXED_FORMAT_READ_LOG)); + + public static final ConfigOption SCAN_STARTUP_TIMESTAMP_MILLIS = + ConfigOptions.key("scan.startup.timestamp-millis") + .longType() + .noDefaultValue() + .withDescription("Optional timestamp used in case of \"timestamp\" startup mode"); + + public static final ConfigOption SCAN_STARTUP_SPECIFIC_OFFSETS = + ConfigOptions.key("scan.startup.specific-offsets") + .stringType() + .noDefaultValue() + .withDescription("Optional timestamp used in case of \"timestamp\" startup mode"); + + public static final ConfigOption SUBMIT_EMPTY_SNAPSHOTS = + ConfigOptions.key("submit.empty.snapshots") + .booleanType() + .defaultValue(false) + .withDescription( + "Optional submit empty snapshots to the mixed-format table, false means that writers will not emit" + + " empty WriteResults to the committer operator, and reduce the number of snapshots in File Cache; true" + + " means this job will submit empty snapshots to the table, it is suitable with some valid reasons, e.g." 
+ + " advance watermark metadata stored in the table(https://github.com/apache/iceberg/pull/5561)."); + + public static final ConfigOption MIXED_FORMAT_CATALOG = + ConfigOptions.key("mixed-format.catalog") + .stringType() + .noDefaultValue() + .withDescription("underlying mixed-format catalog name."); + + public static final ConfigOption MIXED_FORMAT_DATABASE = + ConfigOptions.key("mixed-format.database") + .stringType() + .noDefaultValue() + .withDescription("underlying mixed-format database name."); + + public static final ConfigOption MIXED_FORMAT_TABLE = + ConfigOptions.key("mixed-format.table") + .stringType() + .noDefaultValue() + .withDescription("underlying mixed-format table name."); + + public static final ConfigOption DIM_TABLE_ENABLE = + ConfigOptions.key("dim-table.enabled") + .booleanType() + .defaultValue(false) + .withDescription( + "If it is true, mixed-format source will generate watermark after stock data being read"); + + @Deprecated + public static final ConfigOption DIM_TABLE_ENABLE_LEGACY = + ConfigOptions.key("dim-table.enable") + .booleanType() + .defaultValue(false) + .withDescription( + "If it is true, mixed-format source will generate watermark after stock data being read"); + + public static final ConfigOption MIXED_FORMAT_EMIT_MODE = + ConfigOptions.key("mixed-format.emit.mode") + .stringType() + .defaultValue(MIXED_FORMAT_EMIT_AUTO) + .withDescription( + "file, log, auto. 
e.g.\n" + + "'file' means only writing data into filestore.\n" + + "'log' means only writing data into logstore.\n" + + "'file,log' means writing data into both filestore and logstore.\n" + + "'auto' means writing data into filestore if the logstore of the mixed-format table is disabled;" + + " Also means writing data into both filestore and logstore if the logstore of the mixed-format table" + + " is enabled.\n" + + "'auto' is recommended."); + + public static final ConfigOption AUTO_EMIT_LOGSTORE_WATERMARK_GAP = + ConfigOptions.key("mixed-format.emit.auto-write-to-logstore.watermark-gap") + .durationType() + .noDefaultValue() + .withDescription( + "Only enabled when 'mixed-format.emit.mode'='auto', if the watermark of the mixed-format writers" + + " is greater than the current system timestamp subtracts the specific value, writers will also write" + + " data into the logstore.\n" + + "This value must be greater than 0."); + + public static final ConfigOption LOG_STORE_CATCH_UP = + ConfigOptions.key("log-store.catch-up") + .booleanType() + .defaultValue(false) + .withDescription( + "If it is true, mixed-format source will emit data to filestore and logstore. If it is false," + + " mixed-format source will only emit data to filestore."); + + public static final ConfigOption LOG_STORE_CATCH_UP_TIMESTAMP = + ConfigOptions.key("log-store.catch-up-timestamp") + .longType() + .defaultValue(0L) + .withDescription( + "Mark the time to start double writing (the logstore of mixed-format table catches up with the" + + " historical data)."); + + public static final ConfigOption LOOKUP_CACHE_MAX_ROWS = + ConfigOptions.key("lookup.cache.max-rows") + .longType() + .defaultValue(10000L) + .withDescription( + "The maximum number of rows in the lookup cache, beyond which the oldest row will expire." 
+ + " By default, lookup cache is 10000."); + + public static final ConfigOption LOOKUP_CACHE_TTL_AFTER_WRITE = + ConfigOptions.key("lookup.cache.ttl-after-write") + .durationType() + .defaultValue(Duration.ZERO) + .withDescription("The TTL after which the row will expire in the lookup cache."); + + public static final ConfigOption LOOKUP_RELOADING_INTERVAL = + ConfigOptions.key("lookup.reloading.interval") + .durationType() + .defaultValue(Duration.ofSeconds(10)) + .withDescription( + "Configuration option for specifying the interval in seconds to reload lookup data in RocksDB." + + "\nThe default value is 10 seconds."); + + public static final ConfigOption ROCKSDB_AUTO_COMPACTIONS = + ConfigOptions.key("rocksdb.auto-compactions") + .booleanType() + .defaultValue(false) + .withDescription( + "Enable automatic compactions during the initialization process." + + "\nAfter the initialization completed, will enable the auto_compaction."); + + public static final ConfigOption ROCKSDB_WRITING_THREADS = + ConfigOptions.key("rocksdb.writing-threads") + .intType() + .defaultValue(5) + .withDescription("Writing data into rocksDB thread number."); + + public static final ConfigOption ROCKSDB_BLOCK_CACHE_CAPACITY = + ConfigOptions.key("rocksdb.block-cache.capacity") + .longType() + .defaultValue(32 * 1024 * 1024L) + .withDescription( + "Use the LRUCache strategy for blocks, the size of the BlockCache can be configured based on " + + "your memory requirements and available system resources. Default is 32MB."); + + public static final ConfigOption ROCKSDB_BLOCK_CACHE_NUM_SHARD_BITS = + ConfigOptions.key("rocksdb.block-cache.numShardBits") + .intType() + .defaultValue(-1) + .withDescription( + "Use the LRUCache strategy for blocks. The cache is sharded to 2^numShardBits shards, by hash " + + " of the key. 
Default is -1, means it is automatically determined: every shard will be at least 512KB and" + + " number of shard bits will not exceed 6."); + + public static final ConfigOption TABLE_FORMAT = + ConfigOptions.key("table.format") + .stringType() + .defaultValue(TableFormat.MIXED_ICEBERG.name()) + .withDescription( + String.format( + "The format of the table, valid values are %s, %s, %s or %s, and Flink choose '%s' as default format.", + TableFormat.ICEBERG.name(), + TableFormat.MIXED_ICEBERG.name(), + TableFormat.MIXED_HIVE.name(), + TableFormat.PAIMON.name(), + TableFormat.MIXED_ICEBERG.name())); + + public static final ConfigOption SCAN_PARALLELISM = + ConfigOptions.key("source.parallelism") + .intType() + .noDefaultValue() + .withDescription( + "Defines a custom parallelism for the source. " + + "By default, if this option is not defined, the planner will derive the parallelism " + + "for each statement individually by also considering the global configuration."); + + @Override + public void validate(DescriptorProperties properties) { + String emitMode = properties.getString(MIXED_FORMAT_EMIT_MODE.key()); + if (StringUtils.isBlank(emitMode)) { + throw new ValidationException("None value for property '" + MIXED_FORMAT_EMIT_MODE.key()); + } + + String[] actualEmitModes = emitMode.split(","); + List modeList = + Arrays.asList(MIXED_FORMAT_EMIT_FILE, MIXED_FORMAT_EMIT_LOG, MIXED_FORMAT_EMIT_AUTO); + for (String mode : actualEmitModes) { + if (!modeList.contains(mode)) { + throw new ValidationException( + "Unknown value for property '" + + MIXED_FORMAT_EMIT_MODE.key() + + "'.\n" + + "Supported values are " + + modeList.stream() + .collect(Collectors.toMap(v -> v, v -> DescriptorProperties.noValidation())) + .keySet() + + " but was: " + + mode); + } + + Preconditions.checkArgument( + !MIXED_FORMAT_EMIT_AUTO.equals(mode) || actualEmitModes.length == 1, + "The value of property '" + + MIXED_FORMAT_EMIT_MODE.key() + + "' must be only 'auto' when it is included."); + } + 
} + + public static Configuration asConfiguration(Map options) { + final Configuration configuration = new Configuration(); + options.forEach(configuration::setString); + return configuration; + } + + private static RowType getRowType(CatalogBaseTable flinkTable) { + return (RowType) flinkTable.getSchema().toRowDataType().getLogicalType(); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/CompatibleFlinkPropertyUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/CompatibleFlinkPropertyUtil.java new file mode 100644 index 0000000000..fbe22e6c80 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/CompatibleFlinkPropertyUtil.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.util; + +import static org.apache.flink.streaming.connectors.kafka.table.KafkaConnectorOptions.TOPIC; + +import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; +import org.apache.amoro.table.TableProperties; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.iceberg.util.PropertyUtil; + +import java.util.List; +import java.util.Map; +import java.util.Properties; + +/** PropertyUtil compatible with legacy flink properties */ +public class CompatibleFlinkPropertyUtil { + + private CompatibleFlinkPropertyUtil() {} + + public static boolean propertyAsBoolean( + Map properties, String property, boolean defaultValue) { + return PropertyUtil.propertyAsBoolean( + properties, getCompatibleProperty(properties, property), defaultValue); + } + + public static boolean propertyAsBoolean( + ReadableConfig config, ConfigOption configOption) { + ConfigOption legacyProperty = getLegacyProperty(configOption); + if (legacyProperty != null + && config.getOptional(legacyProperty).isPresent() + && !config.getOptional(configOption).isPresent()) { + return config.get(legacyProperty); + } else { + return config.get(configOption); + } + } + + public static double propertyAsDouble( + Map properties, String property, double defaultValue) { + return PropertyUtil.propertyAsDouble( + properties, getCompatibleProperty(properties, property), defaultValue); + } + + public static int propertyAsInt( + Map properties, String property, int defaultValue) { + return PropertyUtil.propertyAsInt( + properties, getCompatibleProperty(properties, property), defaultValue); + } + + public static long propertyAsLong( + Map properties, String property, long defaultValue) { + return PropertyUtil.propertyAsLong( + properties, getCompatibleProperty(properties, property), defaultValue); + } + + public static String propertyAsString( + Map 
properties, String property, String defaultValue) { + return PropertyUtil.propertyAsString( + properties, getCompatibleProperty(properties, property), defaultValue); + } + + private static String getCompatibleProperty(Map properties, String property) { + String legacyProperty = getLegacyProperty(property); + if (legacyProperty != null + && properties.containsKey(legacyProperty) + && !properties.containsKey(property)) { + return legacyProperty; + } else { + return property; + } + } + + private static String getLegacyProperty(String property) { + if (property == null) { + return null; + } + if (MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.key().equals(property)) { + return MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE_LEGACY.key(); + } else if (MixedFormatValidator.DIM_TABLE_ENABLE.key().equals(property)) { + return MixedFormatValidator.DIM_TABLE_ENABLE_LEGACY.key(); + } + switch (property) { + case MixedFormatValidator.MIXED_FORMAT_LATENCY_METRIC_ENABLE: + return MixedFormatValidator.MIXED_FORMAT_LATENCY_METRIC_ENABLE_LEGACY; + case MixedFormatValidator.MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE: + return MixedFormatValidator.MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE_LEGACY; + default: + return null; + } + } + + private static ConfigOption getLegacyProperty(ConfigOption configOption) { + if (configOption == null) { + return null; + } + if (MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE + .key() + .equals(configOption.key())) { + return MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE_LEGACY; + } else if (MixedFormatValidator.DIM_TABLE_ENABLE.key().equals(configOption.key())) { + return MixedFormatValidator.DIM_TABLE_ENABLE_LEGACY; + } + return null; + } + + /** + * Get log-store properties from table properties and flink options, whose prefix is {@link + * TableProperties#LOG_STORE_PROPERTIES_PREFIX}. 
+ * + * @param tableOptions including table properties and flink options + * @return Properties. The keys in it have no {@link TableProperties#LOG_STORE_PROPERTIES_PREFIX}. + */ + public static Properties fetchLogstorePrefixProperties(Map tableOptions) { + final Properties properties = new Properties(); + + if (hasPrefix(tableOptions, TableProperties.LOG_STORE_PROPERTIES_PREFIX)) { + tableOptions.keySet().stream() + .filter(key -> key.startsWith(TableProperties.LOG_STORE_PROPERTIES_PREFIX)) + .forEach( + key -> { + final String value = tableOptions.get(key); + final String subKey = + key.substring((TableProperties.LOG_STORE_PROPERTIES_PREFIX).length()); + properties.put(subKey, value); + }); + } + return properties; + } + + public static boolean hasPrefix(Map tableOptions, String prefix) { + return tableOptions.keySet().stream().anyMatch(k -> k.startsWith(prefix)); + } + + public static List getLogTopic(Map tableProperties) { + Configuration conf = new Configuration(); + conf.setString(TOPIC.key(), tableProperties.get(TableProperties.LOG_STORE_MESSAGE_TOPIC)); + return conf.get(TOPIC); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/DateTimeUtils.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/DateTimeUtils.java new file mode 100644 index 0000000000..77e9f83a47 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/DateTimeUtils.java @@ -0,0 +1,1797 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.util; + +import static java.time.temporal.ChronoField.DAY_OF_MONTH; +import static java.time.temporal.ChronoField.HOUR_OF_DAY; +import static java.time.temporal.ChronoField.MINUTE_OF_HOUR; +import static java.time.temporal.ChronoField.MONTH_OF_YEAR; +import static java.time.temporal.ChronoField.NANO_OF_SECOND; +import static java.time.temporal.ChronoField.SECOND_OF_MINUTE; +import static java.time.temporal.ChronoField.YEAR; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.api.TableException; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.TimestampType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.sql.Timestamp; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.time.DateTimeException; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.ZoneId; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.format.DateTimeParseException; +import java.time.temporal.TemporalAccessor; +import java.util.Collections; +import java.util.Date; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.TimeZone; + +/** + * Utility functions for datetime types: date, time, 
timestamp. + * + *

These utils include: + * + *

    + *
  • {@code parse[type]}: methods for parsing strings to date/time/timestamp + *
  • {@code format[type]}: methods for formatting date/time/timestamp + *
  • {@code to[externalTypeName]} and {@code toInternal}: methods for converting values from + * internal date/time/timestamp types from/to java.sql or java.time types + *
  • Various operations on timestamp, including floor, ceil and extract + *
  • {@link TimeUnit} and {@link TimeUnitRange} enums + *
+ * + *

Currently, this class is a bit messy because it includes a mix of functionalities both from + * common and planner. We should strive to reduce the number of functionalities here, eventually + * moving some methods closer to where they're needed. Connectors and formats should not use this + * class, but rather if a functionality is necessary, it should be part of the public APIs of our + * type system (e.g a new method in {@link TimestampData} or in {@link TimestampType}). Methods used + * only by the planner should live inside the planner whenever is possible. + * + *

Copied from flink-1.18 + */ +@Internal +public class DateTimeUtils { + + private static final Logger LOG = LoggerFactory.getLogger(DateTimeUtils.class); + + /** The julian date of the epoch, 1970-01-01. */ + public static final int EPOCH_JULIAN = 2440588; + + /** The number of milliseconds in a second. */ + private static final long MILLIS_PER_SECOND = 1000L; + + /** The number of milliseconds in a minute. */ + private static final long MILLIS_PER_MINUTE = 60000L; + + /** The number of milliseconds in an hour. */ + private static final long MILLIS_PER_HOUR = 3600000L; // = 60 * 60 * 1000 + + /** + * The number of milliseconds in a day. + * + *

This is the modulo 'mask' used when converting TIMESTAMP values to DATE and TIME values. + */ + public static final long MILLIS_PER_DAY = 86400000L; // = 24 * 60 * 60 * 1000 + + /** The SimpleDateFormat string for ISO dates, "yyyy-MM-dd". */ + private static final String DATE_FORMAT_STRING = "yyyy-MM-dd"; + + /** The SimpleDateFormat string for ISO times, "HH:mm:ss". */ + private static final String TIME_FORMAT_STRING = "HH:mm:ss"; + + /** The SimpleDateFormat string for ISO timestamps, "yyyy-MM-dd HH:mm:ss". */ + private static final String TIMESTAMP_FORMAT_STRING = + DATE_FORMAT_STRING + " " + TIME_FORMAT_STRING; + + /** The UTC time zone. */ + public static final TimeZone UTC_ZONE = TimeZone.getTimeZone("UTC"); + + /** The local time zone. */ + public static final TimeZone LOCAL_TZ = TimeZone.getDefault(); + + /** The valid minimum epoch milliseconds ('0000-01-01 00:00:00.000 UTC+0'). */ + private static final long MIN_EPOCH_MILLS = -62167219200000L; + + /** The valid minimum epoch seconds ('0000-01-01 00:00:00 UTC+0'). */ + private static final long MIN_EPOCH_SECONDS = -62167219200L; + + /** The valid maximum epoch milliseconds ('9999-12-31 23:59:59.999 UTC+0'). */ + private static final long MAX_EPOCH_MILLS = 253402300799999L; + + /** The valid maximum epoch seconds ('9999-12-31 23:59:59 UTC+0'). */ + private static final long MAX_EPOCH_SECONDS = 253402300799L; + + private static final DateTimeFormatter DEFAULT_TIMESTAMP_FORMATTER = + new DateTimeFormatterBuilder() + .appendPattern("yyyy-[MM][M]-[dd][d]") + .optionalStart() + .appendPattern(" [HH][H]:[mm][m]:[ss][s]") + .appendFraction(NANO_OF_SECOND, 0, 9, true) + .optionalEnd() + .toFormatter(); + + /** + * A ThreadLocal cache map for SimpleDateFormat, because SimpleDateFormat is not thread-safe. + * (string_format) => formatter + */ + private static final ThreadLocalCache FORMATTER_CACHE = + ThreadLocalCache.of(SimpleDateFormat::new); + + /** A ThreadLocal cache map for DateTimeFormatter. 
(string_format) => formatter */ + private static final ThreadLocalCache DATETIME_FORMATTER_CACHE = + ThreadLocalCache.of(DateTimeFormatter::ofPattern); + + /** A ThreadLocal cache map for TimeZone. (string_zone_id) => TimeZone */ + private static final ThreadLocalCache TIMEZONE_CACHE = + ThreadLocalCache.of(TimeZone::getTimeZone); + + // -------------------------------------------------------------------------------------------- + // java.sql Date/Time/Timestamp --> internal data types + // -------------------------------------------------------------------------------------------- + + /** + * Converts the internal representation of a SQL DATE (int) to the Java type used for UDF + * parameters ({@link java.sql.Date}). + */ + public static java.sql.Date toSQLDate(int v) { + // note that, in this case, can't handle Daylight Saving Time + final long t = v * MILLIS_PER_DAY; + return new java.sql.Date(t - LOCAL_TZ.getOffset(t)); + } + + /** + * Converts the internal representation of a SQL TIME (int) to the Java type used for UDF + * parameters ({@link java.sql.Time}). + */ + public static java.sql.Time toSQLTime(int v) { + // note that, in this case, can't handle Daylight Saving Time + return new java.sql.Time(v - LOCAL_TZ.getOffset(v)); + } + + /** + * Converts the internal representation of a SQL TIMESTAMP (long) to the Java type used for UDF + * parameters ({@link Timestamp}). + */ + public static Timestamp toSQLTimestamp(long v) { + return new Timestamp(v - LOCAL_TZ.getOffset(v)); + } + + /** + * Converts the Java type used for UDF parameters of SQL DATE type ({@link java.sql.Date}) to + * internal representation (int). + * + *

Converse of {@link #toSQLDate(int)}. + */ + public static int toInternal(java.sql.Date date) { + long ts = date.getTime() + LOCAL_TZ.getOffset(date.getTime()); + return (int) (ts / MILLIS_PER_DAY); + } + + /** + * Converts the Java type used for UDF parameters of SQL TIME type ({@link java.sql.Time}) to + * internal representation (int). + * + *

Converse of {@link #toSQLTime(int)}. + */ + public static int toInternal(java.sql.Time time) { + long ts = time.getTime() + LOCAL_TZ.getOffset(time.getTime()); + return (int) (ts % MILLIS_PER_DAY); + } + + /** + * Converts the Java type used for UDF parameters of SQL TIMESTAMP type ({@link Timestamp}) to + * internal representation (long). + * + *

Converse of {@link #toSQLTimestamp(long)}. + */ + public static long toInternal(Timestamp ts) { + long time = ts.getTime(); + return time + LOCAL_TZ.getOffset(time); + } + + public static int toInternal(LocalDate date) { + return ymdToUnixDate(date.getYear(), date.getMonthValue(), date.getDayOfMonth()); + } + + public static int toInternal(LocalTime time) { + return time.getHour() * (int) MILLIS_PER_HOUR + + time.getMinute() * (int) MILLIS_PER_MINUTE + + time.getSecond() * (int) MILLIS_PER_SECOND + + time.getNano() / 1000_000; + } + + // -------------------------------------------------------------------------------------------- + // Java 8 time conversion + // -------------------------------------------------------------------------------------------- + + public static LocalDate toLocalDate(int date) { + return julianToLocalDate(date + EPOCH_JULIAN); + } + + private static LocalDate julianToLocalDate(int julian) { + // this shifts the epoch back to astronomical year -4800 instead of the + // start of the Christian era in year AD 1 of the proleptic Gregorian + // calendar. 
+ int j = julian + 32044; + int g = j / 146097; + int dg = j % 146097; + int c = (dg / 36524 + 1) * 3 / 4; + int dc = dg - c * 36524; + int b = dc / 1461; + int db = dc % 1461; + int a = (db / 365 + 1) * 3 / 4; + int da = db - a * 365; + + // integer number of full years elapsed since March 1, 4801 BC + int y = g * 400 + c * 100 + b * 4 + a; + // integer number of full months elapsed since the last March 1 + int m = (da * 5 + 308) / 153 - 2; + // number of days elapsed since day 1 of the month + int d = da - (m + 4) * 153 / 5 + 122; + int year = y - 4800 + (m + 2) / 12; + int month = (m + 2) % 12 + 1; + int day = d + 1; + return LocalDate.of(year, month, day); + } + + private static int ymdToUnixDate(int year, int month, int day) { + final int julian = ymdToJulian(year, month, day); + return julian - EPOCH_JULIAN; + } + + private static int ymdToJulian(int year, int month, int day) { + int a = (14 - month) / 12; + int y = year + 4800 - a; + int m = month + 12 * a - 3; + return day + (153 * m + 2) / 5 + 365 * y + y / 4 - y / 100 + y / 400 - 32045; + } + + public static LocalTime toLocalTime(int time) { + int h = time / 3600000; + int time2 = time % 3600000; + int m = time2 / 60000; + int time3 = time2 % 60000; + int s = time3 / 1000; + int ms = time3 % 1000; + return LocalTime.of(h, m, s, ms * 1000_000); + } + + public static LocalDateTime toLocalDateTime(long timestamp) { + int date = (int) (timestamp / MILLIS_PER_DAY); + int time = (int) (timestamp % MILLIS_PER_DAY); + if (time < 0) { + --date; + time += MILLIS_PER_DAY; + } + LocalDate localDate = toLocalDate(date); + LocalTime localTime = toLocalTime(time); + return LocalDateTime.of(localDate, localTime); + } + + // -------------------------------------------------------------------------------------------- + // Numeric -> Timestamp conversion + // -------------------------------------------------------------------------------------------- + + public static TimestampData toTimestampData(long v, int precision) { + 
switch (precision) { + case 0: + if (MIN_EPOCH_SECONDS <= v && v <= MAX_EPOCH_SECONDS) { + return timestampDataFromEpochMills(v * MILLIS_PER_SECOND); + } else { + return null; + } + case 3: + return timestampDataFromEpochMills(v); + default: + throw new TableException( + "The precision value '" + + precision + + "' for function " + + "TO_TIMESTAMP_LTZ(numeric, precision) is unsupported," + + " the supported value is '0' for second or '3' for millisecond."); + } + } + + public static TimestampData toTimestampData(double v, int precision) { + switch (precision) { + case 0: + if (MIN_EPOCH_SECONDS <= v && v <= MAX_EPOCH_SECONDS) { + return timestampDataFromEpochMills((long) (v * MILLIS_PER_SECOND)); + } else { + return null; + } + case 3: + return timestampDataFromEpochMills((long) v); + default: + throw new TableException( + "The precision value '" + + precision + + "' for function " + + "TO_TIMESTAMP_LTZ(numeric, precision) is unsupported," + + " the supported value is '0' for second or '3' for millisecond."); + } + } + + public static TimestampData toTimestampData(DecimalData v, int precision) { + long epochMills; + switch (precision) { + case 0: + epochMills = + v.toBigDecimal().setScale(0, RoundingMode.DOWN).longValue() * MILLIS_PER_SECOND; + return timestampDataFromEpochMills(epochMills); + case 3: + epochMills = toMillis(v); + return timestampDataFromEpochMills(epochMills); + default: + throw new TableException( + "The precision value '" + + precision + + "' for function " + + "TO_TIMESTAMP_LTZ(numeric, precision) is unsupported," + + " the supported value is '0' for second or '3' for millisecond."); + } + } + + private static TimestampData timestampDataFromEpochMills(long epochMills) { + if (MIN_EPOCH_MILLS <= epochMills && epochMills <= MAX_EPOCH_MILLS) { + return TimestampData.fromEpochMillis(epochMills); + } + return null; + } + + private static long toMillis(DecimalData v) { + return v.toBigDecimal().setScale(0, RoundingMode.DOWN).longValue(); + } + + // 
-------------------------------------------------------------------------------------------- + // Parsing functions + // -------------------------------------------------------------------------------------------- + + public static TimestampData parseTimestampData(String dateStr) throws DateTimeException { + // Precision is hardcoded to match signature of TO_TIMESTAMP + // https://issues.apache.org/jira/browse/FLINK-14925 + return parseTimestampData(dateStr, 3); + } + + public static TimestampData parseTimestampData(String dateStr, int precision) + throws DateTimeException { + return TimestampData.fromLocalDateTime( + fromTemporalAccessor(DEFAULT_TIMESTAMP_FORMATTER.parse(dateStr), precision)); + } + + public static TimestampData parseTimestampData(String dateStr, int precision, TimeZone timeZone) + throws DateTimeException { + return TimestampData.fromInstant( + fromTemporalAccessor(DEFAULT_TIMESTAMP_FORMATTER.parse(dateStr), precision) + .atZone(timeZone.toZoneId()) + .toInstant()); + } + + public static TimestampData parseTimestampData(String dateStr, String format) { + DateTimeFormatter formatter = DATETIME_FORMATTER_CACHE.get(format); + + try { + TemporalAccessor accessor = formatter.parse(dateStr); + // Precision is hardcoded to match signature of TO_TIMESTAMP + // https://issues.apache.org/jira/browse/FLINK-14925 + LocalDateTime ldt = fromTemporalAccessor(accessor, 3); + return TimestampData.fromLocalDateTime(ldt); + } catch (DateTimeParseException e) { + // fall back to support cases like '1999-9-10 05:20:10' or '1999-9-10' + try { + dateStr = dateStr.trim(); + int space = dateStr.indexOf(' '); + if (space >= 0) { + Timestamp ts = Timestamp.valueOf(dateStr); + return TimestampData.fromTimestamp(ts); + } else { + java.sql.Date dt = java.sql.Date.valueOf(dateStr); + return TimestampData.fromLocalDateTime( + LocalDateTime.of(dt.toLocalDate(), LocalTime.MIDNIGHT)); + } + } catch (IllegalArgumentException ie) { + return null; + } + } + } + + /** + * This is 
similar to {@link LocalDateTime#from(TemporalAccessor)}, but it's less strict and + * introduces default values. + */ + private static LocalDateTime fromTemporalAccessor(TemporalAccessor accessor, int precision) { + // complement year with 1970 + int year = accessor.isSupported(YEAR) ? accessor.get(YEAR) : 1970; + // complement month with 1 + int month = accessor.isSupported(MONTH_OF_YEAR) ? accessor.get(MONTH_OF_YEAR) : 1; + // complement day with 1 + int day = accessor.isSupported(DAY_OF_MONTH) ? accessor.get(DAY_OF_MONTH) : 1; + // complement hour with 0 + int hour = accessor.isSupported(HOUR_OF_DAY) ? accessor.get(HOUR_OF_DAY) : 0; + // complement minute with 0 + int minute = accessor.isSupported(MINUTE_OF_HOUR) ? accessor.get(MINUTE_OF_HOUR) : 0; + // complement second with 0 + int second = accessor.isSupported(SECOND_OF_MINUTE) ? accessor.get(SECOND_OF_MINUTE) : 0; + // complement nano_of_second with 0 + int nanoOfSecond = accessor.isSupported(NANO_OF_SECOND) ? accessor.get(NANO_OF_SECOND) : 0; + + if (precision == 0) { + nanoOfSecond = 0; + } else if (precision != 9) { + nanoOfSecond = (int) floor(nanoOfSecond, powerX(10, 9 - precision)); + } + + return LocalDateTime.of(year, month, day, hour, minute, second, nanoOfSecond); + } + + /** + * Parse date time string to timestamp based on the given time zone and format. Returns null if + * parsing failed. + * + * @param dateStr the date time string + * @param format date time string format + * @param tz the time zone + */ + private static long parseTimestampMillis(String dateStr, String format, TimeZone tz) + throws ParseException { + SimpleDateFormat formatter = FORMATTER_CACHE.get(format); + formatter.setTimeZone(tz); + return formatter.parse(dateStr).getTime(); + } + + /** + * Parse date time string to timestamp based on the given time zone string and format. Returns + * null if parsing failed. 
+ * + * @param dateStr the date time string + * @param tzStr the time zone id string + */ + private static long parseTimestampTz(String dateStr, String tzStr) throws ParseException { + TimeZone tz = TIMEZONE_CACHE.get(tzStr); + return parseTimestampMillis(dateStr, DateTimeUtils.TIMESTAMP_FORMAT_STRING, tz); + } + + /** Returns the epoch days since 1970-01-01. */ + public static int parseDate(String dateStr, String fromFormat) { + // It is OK to use UTC, we just want get the epoch days + // TODO use offset, better performance + long ts = internalParseTimestampMillis(dateStr, fromFormat, TimeZone.getTimeZone("UTC")); + ZoneId zoneId = ZoneId.of("UTC"); + Instant instant = Instant.ofEpochMilli(ts); + ZonedDateTime zdt = ZonedDateTime.ofInstant(instant, zoneId); + return ymdToUnixDate(zdt.getYear(), zdt.getMonthValue(), zdt.getDayOfMonth()); + } + + public static Integer parseDate(String s) { + // allow timestamp str to date, e.g. 2017-12-12 09:30:00.0 + int ws1 = s.indexOf(" "); + if (ws1 > 0) { + s = s.substring(0, ws1); + } + int hyphen1 = s.indexOf('-'); + int y; + int m; + int d; + if (hyphen1 < 0) { + if (!isInteger(s.trim())) { + return null; + } + y = Integer.parseInt(s.trim()); + m = 1; + d = 1; + } else { + if (!isInteger(s.substring(0, hyphen1).trim())) { + return null; + } + y = Integer.parseInt(s.substring(0, hyphen1).trim()); + final int hyphen2 = s.indexOf('-', hyphen1 + 1); + if (hyphen2 < 0) { + if (!isInteger(s.substring(hyphen1 + 1).trim())) { + return null; + } + m = Integer.parseInt(s.substring(hyphen1 + 1).trim()); + d = 1; + } else { + if (!isInteger(s.substring(hyphen1 + 1, hyphen2).trim())) { + return null; + } + m = Integer.parseInt(s.substring(hyphen1 + 1, hyphen2).trim()); + if (!isInteger(s.substring(hyphen2 + 1).trim())) { + return null; + } + d = Integer.parseInt(s.substring(hyphen2 + 1).trim()); + } + } + if (!isIllegalDate(y, m, d)) { + return null; + } + return ymdToUnixDate(y, m, d); + } + + public static Integer parseTime(String v) { 
+ final int start = 0; + final int colon1 = v.indexOf(':', start); + // timezone hh:mm:ss[.ssssss][[+|-]hh:mm:ss] + // refer https://www.w3.org/TR/NOTE-datetime + int timezoneHour; + int timezoneMinute; + int hour; + int minute; + int second; + int milli; + int operator = -1; + int end = v.length(); + int timezone = v.indexOf('-', start); + if (timezone < 0) { + timezone = v.indexOf('+', start); + operator = 1; + } + if (timezone < 0) { + timezoneHour = 0; + timezoneMinute = 0; + } else { + end = timezone; + final int colon3 = v.indexOf(':', timezone); + if (colon3 < 0) { + if (!isInteger(v.substring(timezone + 1).trim())) { + return null; + } + timezoneHour = Integer.parseInt(v.substring(timezone + 1).trim()); + timezoneMinute = 0; + } else { + if (!isInteger(v.substring(timezone + 1, colon3).trim())) { + return null; + } + timezoneHour = Integer.parseInt(v.substring(timezone + 1, colon3).trim()); + if (!isInteger(v.substring(colon3 + 1).trim())) { + return null; + } + timezoneMinute = Integer.parseInt(v.substring(colon3 + 1).trim()); + } + } + if (colon1 < 0) { + if (!isInteger(v.substring(start, end).trim())) { + return null; + } + hour = Integer.parseInt(v.substring(start, end).trim()); + minute = 0; + second = 0; + milli = 0; + } else { + if (!isInteger(v.substring(start, colon1).trim())) { + return null; + } + hour = Integer.parseInt(v.substring(start, colon1).trim()); + final int colon2 = v.indexOf(':', colon1 + 1); + if (colon2 < 0) { + if (!isInteger(v.substring(colon1 + 1, end).trim())) { + return null; + } + minute = Integer.parseInt(v.substring(colon1 + 1, end).trim()); + second = 0; + milli = 0; + } else { + if (!isInteger(v.substring(colon1 + 1, colon2).trim())) { + return null; + } + minute = Integer.parseInt(v.substring(colon1 + 1, colon2).trim()); + int dot = v.indexOf('.', colon2); + if (dot < 0) { + if (!isInteger(v.substring(colon2 + 1, end).trim())) { + return null; + } + second = Integer.parseInt(v.substring(colon2 + 1, end).trim()); + milli = 
0; + } else { + if (!isInteger(v.substring(colon2 + 1, dot).trim())) { + return null; + } + second = Integer.parseInt(v.substring(colon2 + 1, dot).trim()); + milli = parseFraction(v.substring(dot + 1, end).trim()); + } + } + } + hour += operator * timezoneHour; + minute += operator * timezoneMinute; + return hour * (int) MILLIS_PER_HOUR + + minute * (int) MILLIS_PER_MINUTE + + second * (int) MILLIS_PER_SECOND + + milli; + } + + /** + * Parses a fraction, multiplying the first character by {@code multiplier}, the second character + * by {@code multiplier / 10}, the third character by {@code multiplier / 100}, and so forth. + * + *

   * For example, {@code parseFraction("1234")} yields {@code 123}.
   */
  private static int parseFraction(String v) {
    // Hard-coded 3-digit (millisecond) result: the first digit weighs 100.
    int multiplier = 100;
    int r = 0;
    for (int i = 0; i < v.length(); i++) {
      char c = v.charAt(i);
      // Non-digit characters contribute 0 instead of failing.
      int x = c < '0' || c > '9' ? 0 : (c - '0');
      r += multiplier * x;
      if (multiplier < 10) {
        // We're at the last digit. Check for rounding.
        if (i + 1 < v.length() && v.charAt(i + 1) >= '5') {
          ++r;
        }
        break;
      }
      multiplier /= 10;
    }
    return r;
  }

  // --------------------------------------------------------------------------------------------
  // Format
  // --------------------------------------------------------------------------------------------

  /** Formats a timestamp with the given pattern, interpreting the instant in UTC. */
  public static String formatTimestamp(TimestampData ts, String format) {
    return formatTimestamp(ts, format, ZoneId.of("UTC"));
  }

  /** Formats a timestamp with the given pattern in the given time zone. */
  public static String formatTimestamp(TimestampData ts, String format, TimeZone zone) {
    return formatTimestamp(ts, format, zone.toZoneId());
  }

  /**
   * Formats a timestamp as "yyyy-MM-dd HH:mm:ss[.fraction]", trimming trailing zeros of the
   * fraction down to {@code precision} digits.
   */
  private static String formatTimestamp(TimestampData ts, int precision) {
    LocalDateTime ldt = ts.toLocalDateTime();

    String fraction = pad(9, ldt.getNano());
    while (fraction.length() > precision && fraction.endsWith("0")) {
      fraction = fraction.substring(0, fraction.length() - 1);
    }

    StringBuilder ymdhms =
        ymdhms(
            new StringBuilder(),
            ldt.getYear(),
            ldt.getMonthValue(),
            ldt.getDayOfMonth(),
            ldt.getHour(),
            ldt.getMinute(),
            ldt.getSecond());

    if (fraction.length() > 0) {
      ymdhms.append(".").append(fraction);
    }

    return ymdhms.toString();
  }

  /** Formats a TIMESTAMP_LTZ value after shifting it into the given time zone. */
  public static String formatTimestamp(TimestampData ts, TimeZone tz, int precision) {
    return formatTimestamp(timestampWithLocalZoneToTimestamp(ts, tz), precision);
  }

  private static String formatTimestamp(TimestampData ts, String format, ZoneId zoneId) {
    // Formatter instances are cached per pattern string.
    DateTimeFormatter formatter = DATETIME_FORMATTER_CACHE.get(format);
    Instant instant = ts.toInstant();
    return LocalDateTime.ofInstant(instant, zoneId).format(formatter);
  }

  /**
   * Re-formats a datetime string from {@code fromFormat} to {@code toFormat} in the given time
   * zone.
   *
   * @return the re-formatted string, or null if {@code dateStr} cannot be parsed
   */
  public static String formatTimestampString(
      String dateStr, String fromFormat, String toFormat, TimeZone tz) {
    SimpleDateFormat fromFormatter = FORMATTER_CACHE.get(fromFormat);
    fromFormatter.setTimeZone(tz);
    SimpleDateFormat toFormatter = FORMATTER_CACHE.get(toFormat);
    toFormatter.setTimeZone(tz);
    try {
      return toFormatter.format(fromFormatter.parse(dateStr));
    } catch (ParseException e) {
      LOG.error(
          "Exception when formatting: '"
              + dateStr
              + "' from: '"
              + fromFormat
              + "' to: '"
              + toFormat
              + "'",
          e);
      return null;
    }
  }

  public static String formatTimestampString(String dateStr, String toFormat, TimeZone tz) {
    // use yyyy-MM-dd HH:mm:ss as default
    return formatTimestampString(dateStr, TIMESTAMP_FORMAT_STRING, toFormat, tz);
  }

  public static String formatTimestampString(String dateStr, String toFormat) {
    return formatTimestampString(dateStr, toFormat, UTC_ZONE);
  }

  /** Formats epoch milliseconds with the given pattern in the given time zone. */
  public static String formatTimestampMillis(long ts, String format, TimeZone tz) {
    SimpleDateFormat formatter = FORMATTER_CACHE.get(format);
    formatter.setTimeZone(tz);
    Date dateTime = new Date(ts);
    return formatter.format(dateTime);
  }

  /** Formats a time-of-day (millis since midnight) as "HH:mm:ss[.fraction]". */
  public static String formatTimestampMillis(int time, int precision) {
    final StringBuilder buf = new StringBuilder(8 + (precision > 0 ?
precision + 1 : 0)); + formatTimestampMillis(buf, time, precision); + return buf.toString(); + } + + private static void formatTimestampMillis(StringBuilder buf, int time, int precision) { + // we copy this method from Calcite DateTimeUtils but add the following changes + // time may be negative which means time milli seconds before 00:00:00 + // this maybe a bug in calcite avatica + while (time < 0) { + time += MILLIS_PER_DAY; + } + int h = time / 3600000; + int time2 = time % 3600000; + int m = time2 / 60000; + int time3 = time2 % 60000; + int s = time3 / 1000; + int ms = time3 % 1000; + int2(buf, h); + buf.append(':'); + int2(buf, m); + buf.append(':'); + int2(buf, s); + if (precision > 0) { + buf.append('.'); + while (precision > 0) { + buf.append((char) ('0' + (ms / 100))); + ms = ms % 100; + ms = ms * 10; + + // keep consistent with Timestamp.toString() + if (ms == 0) { + break; + } + + --precision; + } + } + } + + private static void int2(StringBuilder buf, int i) { + buf.append((char) ('0' + (i / 10) % 10)); + buf.append((char) ('0' + i % 10)); + } + + /** Helper for CAST({date} AS VARCHAR(n)). */ + public static String formatDate(int date) { + final StringBuilder buf = new StringBuilder(10); + formatDate(buf, date); + return buf.toString(); + } + + private static void formatDate(StringBuilder buf, int date) { + julianToString(buf, date + EPOCH_JULIAN); + } + + private static void julianToString(StringBuilder buf, int julian) { + // this shifts the epoch back to astronomical year -4800 instead of the + // start of the Christian era in year AD 1 of the proleptic Gregorian + // calendar. 
+ int j = julian + 32044; + int g = j / 146097; + int dg = j % 146097; + int c = (dg / 36524 + 1) * 3 / 4; + int dc = dg - c * 36524; + int b = dc / 1461; + int db = dc % 1461; + int a = (db / 365 + 1) * 3 / 4; + int da = db - a * 365; + + // integer number of full years elapsed since March 1, 4801 BC + int y = g * 400 + c * 100 + b * 4 + a; + // integer number of full months elapsed since the last March 1 + int m = (da * 5 + 308) / 153 - 2; + // number of days elapsed since day 1 of the month + int d = da - (m + 4) * 153 / 5 + 122; + int year = y - 4800 + (m + 2) / 12; + int month = (m + 2) % 12 + 1; + int day = d + 1; + int4(buf, year); + buf.append('-'); + int2(buf, month); + buf.append('-'); + int2(buf, day); + } + + public static String formatIntervalYearMonth(int v) { + final StringBuilder buf = new StringBuilder(); + if (v >= 0) { + buf.append('+'); + } else { + buf.append('-'); + v = -v; + } + final int y = v / 12; + final int m = v % 12; + buf.append(y); + buf.append('-'); + number(buf, m, 2); + return buf.toString(); + } + + public static StringBuilder number(StringBuilder buf, int v, int n) { + for (int k = digitCount(v); k < n; k++) { + buf.append('0'); + } + return buf.append(v); + } + + private static int digitCount(int v) { + for (int n = 1; ; n++) { + v /= 10; + if (v == 0) { + return n; + } + } + } + + private static long roundUp(long dividend, long divisor) { + long remainder = dividend % divisor; + dividend -= remainder; + if (remainder * 2 > divisor) { + dividend += divisor; + } + return dividend; + } + + private static void fraction(StringBuilder buf, int scale, long ms) { + if (scale > 0) { + buf.append('.'); + long v1 = scale == 3 ? ms : scale == 2 ? ms / 10 : scale == 1 ? 
ms / 100 : 0; + number(buf, (int) v1, scale); + } + } + + private static long powerX(long a, long b) { + long x = 1; + while (b > 0) { + x *= a; + --b; + } + return x; + } + + public static String formatIntervalDayTime(long v) { + final int scale = 3; + final StringBuilder buf = new StringBuilder(); + if (v >= 0) { + buf.append('+'); + } else { + buf.append('-'); + v = -v; + } + final long ms; + final long s; + final long m; + final long h; + final long d; + v = roundUp(v, powerX(10, 3 - scale)); + ms = v % 1000; + v /= 1000; + s = v % 60; + v /= 60; + m = v % 60; + v /= 60; + h = v % 24; + v /= 24; + d = v; + buf.append((int) d); + buf.append(' '); + number(buf, (int) h, 2); + buf.append(':'); + number(buf, (int) m, 2); + buf.append(':'); + number(buf, (int) s, 2); + fraction(buf, scale, ms); + return buf.toString(); + } + + private static long internalParseTimestampMillis(String dateStr, String format, TimeZone tz) { + SimpleDateFormat formatter = FORMATTER_CACHE.get(format); + formatter.setTimeZone(tz); + try { + Date date = formatter.parse(dateStr); + return date.getTime(); + } catch (ParseException e) { + LOG.error( + String.format( + "Exception when parsing datetime string '%s' in format '%s'", dateStr, format), + e); + return Long.MIN_VALUE; + } + } + + // -------------------------------------------------------------------------------------------- + // EXTRACT + // -------------------------------------------------------------------------------------------- + + private static final TimestampType REUSE_TIMESTAMP_TYPE = new TimestampType(9); + + public static long extractFromDate(TimeUnitRange range, long date) { + return extractFromDate(range, (int) date); + } + + public static long extractFromDate(TimeUnitRange range, int date) { + switch (range) { + case EPOCH: + return date * 86400L; + default: + return julianExtract(range, date + 2440588); + } + } + + private static int julianExtract(TimeUnitRange range, int julian) { + int j = julian + 32044; + int g = j 
/ 146097; + int dg = j % 146097; + int c = (dg / 36524 + 1) * 3 / 4; + int dc = dg - c * 36524; + int b = dc / 1461; + int db = dc % 1461; + int a = (db / 365 + 1) * 3 / 4; + int da = db - a * 365; + int y = g * 400 + c * 100 + b * 4 + a; + int m = (da * 5 + 308) / 153 - 2; + int d = da - (m + 4) * 153 / 5 + 122; + int year = y - 4800 + (m + 2) / 12; + int month = (m + 2) % 12 + 1; + int day = d + 1; + switch (range) { + case YEAR: + return year; + case YEAR_TO_MONTH: + case DAY_TO_SECOND: + case DAY_TO_MINUTE: + case DAY_TO_HOUR: + case HOUR: + case HOUR_TO_MINUTE: + case HOUR_TO_SECOND: + case MINUTE_TO_SECOND: + case MINUTE: + case SECOND: + case EPOCH: + default: + throw new AssertionError(range); + case MONTH: + return month; + case DAY: + return day; + case ISOYEAR: + int weekNumber = getIso8601WeekNumber(julian, year, month, day); + if (weekNumber == 1 && month == 12) { + return year + 1; + } else { + if (month == 1 && weekNumber > 50) { + return year - 1; + } + + return year; + } + case QUARTER: + return (month + 2) / 3; + case DOW: + return (int) floorMod(julian + 1, 7L) + 1; + case ISODOW: + return (int) floorMod(julian, 7L) + 1; + case WEEK: + return getIso8601WeekNumber(julian, year, month, day); + case DOY: + long janFirst = ymdToJulian(year, 1, 1); + return (int) ((long) julian - janFirst) + 1; + case DECADE: + return year / 10; + case CENTURY: + return year > 0 ? (year + 99) / 100 : (year - 99) / 100; + case MILLENNIUM: + return year > 0 ? 
(year + 999) / 1000 : (year - 999) / 1000; + } + } + + private static long firstMondayOfFirstWeek(int year) { + long janFirst = ymdToJulian(year, 1, 1); + long janFirstDow = floorMod(janFirst + 1L, 7L); + return janFirst + (11L - janFirstDow) % 7L - 3L; + } + + private static int getIso8601WeekNumber(int julian, int year, int month, int day) { + long fmofw = firstMondayOfFirstWeek(year); + if (month == 12 && day > 28) { + return 31 - day + 4 > 7 - ((int) floorMod(julian, 7L) + 1) + && 31 - day + (int) (floorMod(julian, 7L) + 1L) >= 4 + ? (int) ((long) julian - fmofw) / 7 + 1 + : 1; + } else if (month == 1 && day < 5) { + return 4 - day <= 7 - ((int) floorMod(julian, 7L) + 1) + && day - (int) (floorMod(julian, 7L) + 1L) >= -3 + ? 1 + : (int) ((long) julian - firstMondayOfFirstWeek(year - 1)) / 7 + 1; + } else { + return (int) ((long) julian - fmofw) / 7 + 1; + } + } + + private static long floorDiv(long x, long y) { + long r = x / y; + if ((x ^ y) < 0L && r * y != x) { + --r; + } + + return r; + } + + private static long floorMod(long x, long y) { + return x - floorDiv(x, y) * y; + } + + private static long divide(long res, BigDecimal value) { + if (value.equals(BigDecimal.ONE)) { + return res; + } else if (value.compareTo(BigDecimal.ONE) < 0 && value.signum() == 1) { + BigDecimal reciprocal = BigDecimal.ONE.divide(value, RoundingMode.UNNECESSARY); + return reciprocal.multiply(BigDecimal.valueOf(res)).longValue(); + } else { + return res / value.longValue(); + } + } + + private static long mod(long res, BigDecimal value) { + if (value.equals(BigDecimal.ONE)) { + return res; + } else { + return res % value.longValue(); + } + } + + private static BigDecimal getFactor(TimeUnit unit) { + switch (unit) { + case DAY: + return BigDecimal.ONE; + case HOUR: + return TimeUnit.DAY.multiplier; + case MINUTE: + return TimeUnit.HOUR.multiplier; + case SECOND: + return TimeUnit.MINUTE.multiplier; + case MILLISECOND: + case MICROSECOND: + case NANOSECOND: + return 
TimeUnit.SECOND.multiplier; + case YEAR: + return BigDecimal.ONE; + case MONTH: + return TimeUnit.YEAR.multiplier; + case QUARTER: + return TimeUnit.YEAR.multiplier; + case DECADE: + case CENTURY: + case MILLENNIUM: + return BigDecimal.ONE; + default: + throw new IllegalArgumentException("Invalid start unit."); + } + } + + // -------------------------------------------------------------------------------------------- + // Floor/Ceil/Convert tz + // -------------------------------------------------------------------------------------------- + + public static long timestampFloor(TimeUnitRange range, long ts, TimeZone tz) { + // assume that we are at UTC timezone, just for algorithm performance + long offset = tz.getOffset(ts); + long utcTs = ts + offset; + + switch (range) { + case HOUR: + return floor(utcTs, MILLIS_PER_HOUR) - offset; + case DAY: + return floor(utcTs, MILLIS_PER_DAY) - offset; + case MILLENNIUM: + case CENTURY: + case DECADE: + case MONTH: + case YEAR: + case QUARTER: + case WEEK: + int days = (int) (utcTs / MILLIS_PER_DAY + EPOCH_JULIAN); + return julianDateFloor(range, days, true) * MILLIS_PER_DAY - offset; + default: + // for MINUTE and SECONDS etc..., + // it is more effective to use arithmetic Method + throw new AssertionError(range); + } + } + + /** + * Keep the algorithm consistent with Calcite DateTimeUtils.julianDateFloor, but here we take time + * zone into account. 
+ */ + public static long timestampCeil(TimeUnitRange range, long ts, TimeZone tz) { + // assume that we are at UTC timezone, just for algorithm performance + long offset = tz.getOffset(ts); + long utcTs = ts + offset; + + switch (range) { + case HOUR: + return ceil(utcTs, MILLIS_PER_HOUR) - offset; + case DAY: + return ceil(utcTs, MILLIS_PER_DAY) - offset; + case MILLENNIUM: + case CENTURY: + case DECADE: + case MONTH: + case YEAR: + case QUARTER: + case WEEK: + int days = (int) (utcTs / MILLIS_PER_DAY + EPOCH_JULIAN); + return julianDateFloor(range, days, false) * MILLIS_PER_DAY - offset; + default: + // for MINUTE and SECONDS etc..., + // it is more effective to use arithmetic Method + throw new AssertionError(range); + } + } + + private static long floor(long a, long b) { + long r = a % b; + if (r < 0) { + return a - r - b; + } else { + return a - r; + } + } + + private static long ceil(long a, long b) { + long r = a % b; + if (r > 0) { + return a - r + b; + } else { + return a - r; + } + } + + private static long julianDateFloor(TimeUnitRange range, int julian, boolean floor) { + // Algorithm the book "Astronomical Algorithms" by Jean Meeus, 1998 + int b = 0; + int c = 0; + if (julian > 2299160) { + int a = julian + 32044; + b = (4 * a + 3) / 146097; + c = a - b * 146097 / 4; + } else { + b = 0; + c = julian + 32082; + } + int d = (4 * c + 3) / 1461; + int e = c - (1461 * d) / 4; + int m = (5 * e + 2) / 153; + int day = e - (153 * m + 2) / 5 + 1; + int month = m + 3 - 12 * (m / 10); + int quarter = (month + 2) / 3; + int year = b * 100 + d - 4800 + (m / 10); + switch (range) { + case MILLENNIUM: + return floor + ? ymdToUnixDate(1000 * ((year + 999) / 1000) - 999, 1, 1) + : ymdToUnixDate(1000 * ((year + 999) / 1000) + 1, 1, 1); + case CENTURY: + return floor + ? ymdToUnixDate(100 * ((year + 99) / 100) - 99, 1, 1) + : ymdToUnixDate(100 * ((year + 99) / 100) + 1, 1, 1); + case DECADE: + return floor + ? 
ymdToUnixDate(10 * (year / 10), 1, 1) + : ymdToUnixDate(10 * (1 + year / 10), 1, 1); + case YEAR: + if (!floor && (month > 1 || day > 1)) { + year += 1; + } + return ymdToUnixDate(year, 1, 1); + case MONTH: + if (!floor && day > 1) { + month += 1; + } + return ymdToUnixDate(year, month, 1); + case QUARTER: + if (!floor && (month > 1 || day > 1)) { + quarter += 1; + } + return ymdToUnixDate(year, quarter * 3 - 2, 1); + case WEEK: + int dow = (int) floorMod(julian + 1, 7); // sun=0, sat=6 + int offset = dow; + if (!floor && offset > 0) { + offset -= 7; + } + return ymdToUnixDate(year, month, day) - offset; + case DAY: + int res = ymdToUnixDate(year, month, day); + return floor ? res : res + 1; + default: + throw new AssertionError(range); + } + } + + /** + * Convert datetime string from a time zone to another time zone. + * + * @param dateStr the date time string + * @param tzFrom the original time zone + * @param tzTo the target time zone + */ + public static String convertTz(String dateStr, String tzFrom, String tzTo) { + try { + return formatTimestampTz(parseTimestampTz(dateStr, tzFrom), tzTo); + } catch (ParseException e) { + return null; + } + } + + private static String formatTimestampTz(long ts, String tzStr) { + TimeZone tz = TIMEZONE_CACHE.get(tzStr); + return formatTimestampMillis(ts, DateTimeUtils.TIMESTAMP_FORMAT_STRING, tz); + } + + // -------------------------------------------------------------------------------------------- + // TIMESTAMP to DATE/TIME utils + // -------------------------------------------------------------------------------------------- + + /** + * Get date from a timestamp. + * + * @param ts the timestamp in milliseconds. + * @return the date in days. + */ + public static int timestampMillisToDate(long ts) { + int days = (int) (ts / MILLIS_PER_DAY); + if (days < 0) { + days = days - 1; + } + return days; + } + + /** + * Get time from a timestamp. + * + * @param ts the timestamp in milliseconds. + * @return the time in milliseconds. 
+ */ + public static int timestampMillisToTime(long ts) { + return (int) (ts % MILLIS_PER_DAY); + } + + // -------------------------------------------------------------------------------------------- + // UNIX TIME + // -------------------------------------------------------------------------------------------- + + public static long fromTimestamp(long ts) { + return ts; + } + + /** + * Convert unix timestamp (seconds since '1970-01-01 00:00:00' UTC) to datetime string in the + * "yyyy-MM-dd HH:mm:ss" format. + */ + public static String formatUnixTimestamp(long unixtime, TimeZone tz) { + return formatUnixTimestamp(unixtime, TIMESTAMP_FORMAT_STRING, tz); + } + + /** + * Convert unix timestamp (seconds since '1970-01-01 00:00:00' UTC) to datetime string in the + * given format. + */ + public static String formatUnixTimestamp(long unixtime, String format, TimeZone tz) { + SimpleDateFormat formatter = FORMATTER_CACHE.get(format); + formatter.setTimeZone(tz); + Date date = new Date(unixtime * 1000); + try { + return formatter.format(date); + } catch (Exception e) { + LOG.error("Exception when formatting.", e); + return null; + } + } + + public static long toTimestampMillis(LocalDateTime dateTime) { + return unixTimestamp( + dateTime.getYear(), + dateTime.getMonthValue(), + dateTime.getDayOfMonth(), + dateTime.getHour(), + dateTime.getMinute(), + dateTime.getSecond(), + dateTime.getNano() / 1000_000); + } + + private static long unixTimestamp( + int year, int month, int day, int hour, int minute, int second, int mills) { + final int date = ymdToUnixDate(year, month, day); + return (long) date * MILLIS_PER_DAY + + (long) hour * MILLIS_PER_HOUR + + (long) minute * MILLIS_PER_MINUTE + + (long) second * MILLIS_PER_SECOND + + mills; + } + + /** Returns a Unix timestamp in seconds since '1970-01-01 00:00:00' UTC as an unsigned integer. 
*/ + public static long unixTimestamp() { + return System.currentTimeMillis() / 1000; + } + + /** Returns the value of the timestamp to seconds since '1970-01-01 00:00:00' UTC. */ + public static long unixTimestamp(long ts) { + return ts / 1000; + } + + /** + * Returns the value of the argument as an unsigned integer in seconds since '1970-01-01 00:00:00' + * UTC. + */ + public static long unixTimestamp(String dateStr, TimeZone tz) { + return unixTimestamp(dateStr, TIMESTAMP_FORMAT_STRING, tz); + } + + /** + * Returns the value of the argument as an unsigned integer in seconds since '1970-01-01 00:00:00' + * UTC. + */ + public static long unixTimestamp(String dateStr, String format, TimeZone tz) { + long ts = internalParseTimestampMillis(dateStr, format, tz); + if (ts == Long.MIN_VALUE) { + return Long.MIN_VALUE; + } else { + // return the seconds + return ts / 1000; + } + } + + // -------------------------------------------------------------------------------------------- + // TIMESTAMP to TIMESTAMP_LTZ conversions + // -------------------------------------------------------------------------------------------- + + public static TimestampData timestampToTimestampWithLocalZone(TimestampData ts, TimeZone tz) { + return TimestampData.fromInstant(ts.toLocalDateTime().atZone(tz.toZoneId()).toInstant()); + } + + public static TimestampData timestampWithLocalZoneToTimestamp(TimestampData ts, TimeZone tz) { + return TimestampData.fromLocalDateTime(LocalDateTime.ofInstant(ts.toInstant(), tz.toZoneId())); + } + + public static int timestampWithLocalZoneToDate(TimestampData ts, TimeZone tz) { + return toInternal( + LocalDateTime.ofInstant(Instant.ofEpochMilli(ts.getMillisecond()), tz.toZoneId()) + .toLocalDate()); + } + + public static int timestampWithLocalZoneToTime(TimestampData ts, TimeZone tz) { + return toInternal( + LocalDateTime.ofInstant(Instant.ofEpochMilli(ts.getMillisecond()), tz.toZoneId()) + .toLocalTime()); + } + + public static TimestampData 
dateToTimestampWithLocalZone(int date, TimeZone tz) { + return TimestampData.fromInstant( + LocalDateTime.of(toLocalDate(date), LocalTime.MIDNIGHT).atZone(tz.toZoneId()).toInstant()); + } + + public static TimestampData timeToTimestampWithLocalZone(int time, TimeZone tz) { + return TimestampData.fromInstant(toLocalDateTime(time).atZone(tz.toZoneId()).toInstant()); + } + + private static boolean isInteger(String s) { + boolean isInt = s.length() > 0; + for (int i = 0; i < s.length(); i++) { + if (s.charAt(i) < '0' || s.charAt(i) > '9') { + isInt = false; + break; + } + } + return isInt; + } + + private static boolean isLeapYear(int s) { + return s % 400 == 0 || (s % 4 == 0 && s % 100 != 0); + } + + private static boolean isIllegalDate(int y, int m, int d) { + int[] monthOf31Days = new int[] {1, 3, 5, 7, 8, 10, 12}; + if (y < 0 || y > 9999 || m < 1 || m > 12 || d < 1 || d > 31) { + return false; + } + if (m == 2 && d > 28) { + if (!(isLeapYear(y) && d == 29)) { + return false; + } + } + if (d == 31) { + for (int i : monthOf31Days) { + if (i == m) { + return true; + } + } + return false; + } + return true; + } + + private static String pad(int length, long v) { + StringBuilder s = new StringBuilder(Long.toString(v)); + while (s.length() < length) { + s.insert(0, "0"); + } + return s.toString(); + } + + /** Appends hour:minute:second to a buffer; assumes they are valid. */ + private static StringBuilder hms(StringBuilder b, int h, int m, int s) { + int2(b, h); + b.append(':'); + int2(b, m); + b.append(':'); + int2(b, s); + return b; + } + + /** Appends year-month-day and hour:minute:second to a buffer; assumes they are valid. */ + private static StringBuilder ymdhms( + StringBuilder b, int year, int month, int day, int h, int m, int s) { + ymd(b, year, month, day); + b.append(' '); + hms(b, h, m, s); + return b; + } + + /** Appends year-month-day to a buffer; assumes they are valid. 
*/ + private static StringBuilder ymd(StringBuilder b, int year, int month, int day) { + int4(b, year); + b.append('-'); + int2(b, month); + b.append('-'); + int2(b, day); + return b; + } + + private static void int4(StringBuilder buf, int i) { + buf.append((char) ('0' + (i / 1000) % 10)); + buf.append((char) ('0' + (i / 100) % 10)); + buf.append((char) ('0' + (i / 10) % 10)); + buf.append((char) ('0' + i % 10)); + } + + public static TimestampData truncate(TimestampData ts, int precision) { + String fraction = Integer.toString(ts.toLocalDateTime().getNano()); + if (fraction.length() <= precision) { + return ts; + } else { + // need to truncate + if (precision <= 3) { + return TimestampData.fromEpochMillis(zeroLastDigits(ts.getMillisecond(), 3 - precision)); + } else { + return TimestampData.fromEpochMillis( + ts.getMillisecond(), (int) zeroLastDigits(ts.getNanoOfMillisecond(), 9 - precision)); + } + } + } + + private static long zeroLastDigits(long l, int n) { + long tenToTheN = (long) Math.pow(10, n); + return (l / tenToTheN) * tenToTheN; + } + + public static long unixDateCeil(TimeUnitRange range, long date) { + return julianDateFloor(range, (int) date + 2440588, false); + } + + public static long unixDateFloor(TimeUnitRange range, long date) { + return julianDateFloor(range, (int) date + EPOCH_JULIAN, true); + } + + public static long unixTimestampFloor(TimeUnitRange range, long timestamp) { + int date = (int) (timestamp / MILLIS_PER_DAY); + final long f = julianDateFloor(range, date + EPOCH_JULIAN, true); + return f * MILLIS_PER_DAY; + } + + public static long unixTimestampCeil(TimeUnitRange range, long timestamp) { + int date = (int) (timestamp / MILLIS_PER_DAY); + final long f = julianDateFloor(range, date + EPOCH_JULIAN, false); + return f * MILLIS_PER_DAY; + } + + // -------------------------------------------------------------------------------------------- + // ADD/REMOVE months + // 
--------------------------------------------------------------------------------------------

  /**
   * Adds a given number of months to a timestamp, represented as the number of milliseconds since
   * the epoch.
   */
  public static long addMonths(long timestamp, int m) {
    // Split off the time-of-day so only the date part is month-shifted.
    final long millis = DateTimeUtils.floorMod(timestamp, DateTimeUtils.MILLIS_PER_DAY);
    timestamp -= millis;
    final long x = addMonths((int) (timestamp / DateTimeUtils.MILLIS_PER_DAY), m);
    return x * DateTimeUtils.MILLIS_PER_DAY + millis;
  }

  /** Adds a given number of months to a date, represented as the number of days since the epoch. */
  public static int addMonths(int date, int m) {
    int y0 = (int) extractFromDate(TimeUnitRange.YEAR, date);
    int m0 = (int) extractFromDate(TimeUnitRange.MONTH, date);
    int d0 = (int) extractFromDate(TimeUnitRange.DAY, date);
    m0 += m;
    // Re-normalize so that month stays in 1..12 and the year absorbs the overflow.
    int deltaYear = (int) DateTimeUtils.floorDiv(m0, 12);
    y0 += deltaYear;
    m0 = (int) DateTimeUtils.floorMod(m0, 12);
    if (m0 == 0) {
      y0 -= 1;
      m0 += 12;
    }

    // Clamp the day to the length of the target month (e.g. Jan 31 + 1 month -> Feb 28/29).
    int last = lastDay(y0, m0);
    if (d0 > last) {
      d0 = last;
    }
    return ymdToUnixDate(y0, m0, d0);
  }

  /** Returns the number of days in the given month of the given year. */
  private static int lastDay(int y, int m) {
    switch (m) {
      case 2:
        return y % 4 == 0 && (y % 100 != 0 || y % 400 == 0) ? 29 : 28;
      case 4:
      case 6:
      case 9:
      case 11:
        return 30;
      default:
        return 31;
    }
  }

  /**
   * Finds the number of months between two dates, each represented as the number of days since the
   * epoch.
   */
  public static int subtractMonths(int date0, int date1) {
    if (date0 < date1) {
      return -subtractMonths(date1, date0);
    }
    // Start with an estimate.
    // Since no month has more than 31 days, the estimate is <= the true value.
    int m = (date0 - date1) / 31;
    while (true) {
      int date2 = addMonths(date1, m);
      if (date2 >= date0) {
        return m;
      }
      int date3 = addMonths(date1, m + 1);
      if (date3 > date0) {
        return m;
      }
      ++m;
    }
  }

  /** Finds the number of whole months between two epoch-millisecond timestamps. */
  public static int subtractMonths(long t0, long t1) {
    final long millis0 = DateTimeUtils.floorMod(t0, DateTimeUtils.MILLIS_PER_DAY);
    final int d0 = (int) DateTimeUtils.floorDiv(t0 - millis0, DateTimeUtils.MILLIS_PER_DAY);
    final long millis1 = DateTimeUtils.floorMod(t1, DateTimeUtils.MILLIS_PER_DAY);
    final int d1 = (int) DateTimeUtils.floorDiv(t1 - millis1, DateTimeUtils.MILLIS_PER_DAY);
    int x = subtractMonths(d0, d1);
    final long d2 = addMonths(d1, x);
    // If the dates match but the time-of-day falls short, the last month is not complete.
    if (d2 == d0 && millis0 < millis1) {
      --x;
    }
    return x;
  }

  // --------------------------------------------------------------------------------------------
  // TimeUnit and TimeUnitRange enums
  // --------------------------------------------------------------------------------------------

  /**
   * Enumeration of time units used to construct an interval.
   *
   *

Only {@link #YEAR}, {@link #MONTH}, {@link #DAY}, {@link #HOUR}, {@link #MINUTE}, {@link + * #SECOND} can be the unit of a SQL interval. + * + *

The others ({@link #QUARTER}, {@link #WEEK}, {@link #MILLISECOND}, {@link #DOW}, {@link + * #DOY}, {@link #EPOCH}, {@link #DECADE}, {@link #CENTURY}, {@link #MILLENNIUM}, {@link + * #MICROSECOND}, {@link #NANOSECOND}, {@link #ISODOW} and {@link #ISOYEAR}) are convenient to use + * internally, when converting to and from UNIX timestamps. And also may be arguments to the + * {@code EXTRACT}, {@code TIMESTAMPADD} and {@code TIMESTAMPDIFF} functions. + */ + public enum TimeUnit { + YEAR(true, ' ', BigDecimal.valueOf(12) /* months */, null), + MONTH(true, '-', BigDecimal.ONE /* months */, BigDecimal.valueOf(12)), + DAY(false, '-', BigDecimal.valueOf(MILLIS_PER_DAY), null), + HOUR(false, ' ', BigDecimal.valueOf(MILLIS_PER_HOUR), BigDecimal.valueOf(24)), + MINUTE(false, ':', BigDecimal.valueOf(MILLIS_PER_MINUTE), BigDecimal.valueOf(60)), + SECOND(false, ':', BigDecimal.valueOf(MILLIS_PER_SECOND), BigDecimal.valueOf(60)), + + QUARTER(true, '*', BigDecimal.valueOf(3) /* months */, BigDecimal.valueOf(4)), + ISOYEAR(true, ' ', BigDecimal.valueOf(12) /* months */, null), + WEEK(false, '*', BigDecimal.valueOf(MILLIS_PER_DAY * 7), BigDecimal.valueOf(53)), + MILLISECOND(false, '.', BigDecimal.ONE, BigDecimal.valueOf(1000)), + MICROSECOND(false, '.', BigDecimal.ONE.scaleByPowerOfTen(-3), BigDecimal.valueOf(1000_000)), + NANOSECOND(false, '.', BigDecimal.ONE.scaleByPowerOfTen(-6), BigDecimal.valueOf(1000_000_000)), + DOW(false, '-', null, null), + ISODOW(false, '-', null, null), + DOY(false, '-', null, null), + EPOCH(false, '*', null, null), + DECADE(true, '*', BigDecimal.valueOf(120) /* months */, null), + CENTURY(true, '*', BigDecimal.valueOf(1200) /* months */, null), + MILLENNIUM(true, '*', BigDecimal.valueOf(12000) /* months */, null); + + public final boolean yearMonth; + public final char separator; + public final BigDecimal multiplier; + private final BigDecimal limit; + + private static final TimeUnit[] CACHED_VALUES = values(); + + TimeUnit(boolean yearMonth, char 
separator, BigDecimal multiplier, BigDecimal limit) { + this.yearMonth = yearMonth; + this.separator = separator; + this.multiplier = multiplier; + this.limit = limit; + } + + /** + * Returns the TimeUnit associated with an ordinal. The value returned is null if the ordinal is + * not a member of the TimeUnit enumeration. + */ + public static TimeUnit getValue(int ordinal) { + return ordinal < 0 || ordinal >= CACHED_VALUES.length ? null : CACHED_VALUES[ordinal]; + } + + /** + * Returns whether a given value is valid for a field of this time unit. + * + * @param field Field value + * @return Whether value + */ + public boolean isValidValue(BigDecimal field) { + return field.compareTo(BigDecimal.ZERO) >= 0 && (limit == null || field.compareTo(limit) < 0); + } + } + + /** + * A range of time units. The first is more significant than the other (e.g. year-to-day) or the + * same as the other (e.g. month). + */ + public enum TimeUnitRange { + YEAR(TimeUnit.YEAR, null), + YEAR_TO_MONTH(TimeUnit.YEAR, TimeUnit.MONTH), + MONTH(TimeUnit.MONTH, null), + DAY(TimeUnit.DAY, null), + DAY_TO_HOUR(TimeUnit.DAY, TimeUnit.HOUR), + DAY_TO_MINUTE(TimeUnit.DAY, TimeUnit.MINUTE), + DAY_TO_SECOND(TimeUnit.DAY, TimeUnit.SECOND), + HOUR(TimeUnit.HOUR, null), + HOUR_TO_MINUTE(TimeUnit.HOUR, TimeUnit.MINUTE), + HOUR_TO_SECOND(TimeUnit.HOUR, TimeUnit.SECOND), + MINUTE(TimeUnit.MINUTE, null), + MINUTE_TO_SECOND(TimeUnit.MINUTE, TimeUnit.SECOND), + SECOND(TimeUnit.SECOND, null), + + // non-standard time units cannot participate in ranges + ISOYEAR(TimeUnit.ISOYEAR, null), + QUARTER(TimeUnit.QUARTER, null), + WEEK(TimeUnit.WEEK, null), + MILLISECOND(TimeUnit.MILLISECOND, null), + MICROSECOND(TimeUnit.MICROSECOND, null), + NANOSECOND(TimeUnit.NANOSECOND, null), + DOW(TimeUnit.DOW, null), + ISODOW(TimeUnit.ISODOW, null), + DOY(TimeUnit.DOY, null), + EPOCH(TimeUnit.EPOCH, null), + DECADE(TimeUnit.DECADE, null), + CENTURY(TimeUnit.CENTURY, null), + MILLENNIUM(TimeUnit.MILLENNIUM, null); + + public 
final TimeUnit startUnit; + public final TimeUnit endUnit; + + private static final Map, TimeUnitRange> MAP = createMap(); + + /** + * Creates a TimeUnitRange. + * + * @param startUnit Start time unit + * @param endUnit End time unit + */ + TimeUnitRange(TimeUnit startUnit, TimeUnit endUnit) { + assert startUnit != null; + this.startUnit = startUnit; + this.endUnit = endUnit; + } + + /** + * Returns a {@code TimeUnitRange} with a given start and end unit. + * + * @param startUnit Start unit + * @param endUnit End unit + * @return Time unit range, or null if not valid + */ + public static TimeUnitRange of(TimeUnit startUnit, TimeUnit endUnit) { + return MAP.get(new Pair<>(startUnit, endUnit)); + } + + private static Map, TimeUnitRange> createMap() { + Map, TimeUnitRange> map = new HashMap<>(); + for (TimeUnitRange value : values()) { + map.put(new Pair<>(value.startUnit, value.endUnit), value); + } + return Collections.unmodifiableMap(map); + } + + /** Whether this is in the YEAR-TO-MONTH family of intervals. */ + public boolean monthly() { + return ordinal() <= MONTH.ordinal(); + } + + /** + * Immutable pair of values of the same type. + * + * @param the element type + */ + private static class Pair { + final E left; + final E right; + + private Pair(E left, E right) { + this.left = left; + this.right = right; + } + + @Override + public int hashCode() { + int k = (left == null) ? 0 : left.hashCode(); + int k1 = (right == null) ? 
0 : right.hashCode(); + return ((k << 4) | k) ^ k1; + } + + @Override + public boolean equals(Object obj) { + return obj == this + || obj instanceof Pair + && Objects.equals(left, ((Pair) obj).left) + && Objects.equals(right, ((Pair) obj).right); + } + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/FilterUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/FilterUtil.java new file mode 100644 index 0000000000..9fc777d51d --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/FilterUtil.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.util; + +import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkFilters; + +import java.util.List; +import java.util.Optional; + +public class FilterUtil { + + public static IcebergAndFlinkFilters convertFlinkExpressToIceberg( + List flinkFilters) { + List acceptedFilters = Lists.newArrayList(); + List expressions = Lists.newArrayList(); + + for (ResolvedExpression resolvedExpression : flinkFilters) { + Optional icebergExpression = FlinkFilters.convert(resolvedExpression); + if (icebergExpression.isPresent()) { + expressions.add(icebergExpression.get()); + acceptedFilters.add(resolvedExpression); + } + } + return IcebergAndFlinkFilters.of(expressions, acceptedFilters); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/FlinkClassReflectionUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/FlinkClassReflectionUtil.java new file mode 100644 index 0000000000..6b181b510d --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/FlinkClassReflectionUtil.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
/** A util class to handle the reflection operation of Flink class. */
public class FlinkClassReflectionUtil {

  public static final Logger LOG = LoggerFactory.getLogger(FlinkClassReflectionUtil.class);

  /**
   * Reflectively reads the private {@code splitLocalOutputs} field from the given {@link
   * ReaderOutput}, returning {@code null} on any failure (including a null input).
   *
   * <p>NOTE(review): this assumes the target field is declared on {@code
   * ProgressiveTimestampsAndWatermarks.class.getDeclaredClasses()[2]}. The JVM does not guarantee
   * the order of {@code getDeclaredClasses()}; confirm the index against the Flink 1.17 sources
   * before upgrading Flink.
   *
   * @param readerOutput the reader output to inspect; may be null
   * @return the value of the {@code splitLocalOutputs} field, or null if unavailable
   */
  public static Object getSplitLocalOutput(ReaderOutput readerOutput) {
    if (readerOutput == null) {
      return null;
    }
    try {
      return ReflectionUtil.getField(
          (Class) ProgressiveTimestampsAndWatermarks.class.getDeclaredClasses()[2],
          readerOutput,
          "splitLocalOutputs");
    } catch (Exception e) {
      // Best-effort: a Flink-internal layout change should not fail the job.
      LOG.warn("extract internal watermark error", e);
    }
    return null;
  }

  /**
   * Reflectively invokes the private {@code emitPeriodicWatermark()} method on the given
   * split-local output object. A no-op when the argument is null; reflection failures are logged
   * and swallowed.
   *
   * <p>NOTE(review): like {@link #getSplitLocalOutput}, this depends on the declared-class index
   * ({@code [1]}) of a Flink-internal class — verify against the matching Flink version.
   *
   * @param splitLocalOutput the object obtained via {@link #getSplitLocalOutput}; may be null
   */
  public static void emitPeriodWatermark(@Nullable Object splitLocalOutput) {
    if (splitLocalOutput == null) {
      return;
    }
    try {
      Method method =
          ProgressiveTimestampsAndWatermarks.class.getDeclaredClasses()[1].getDeclaredMethod(
              "emitPeriodicWatermark");
      method.setAccessible(true);
      method.invoke(splitLocalOutput);
    } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) {
      LOG.warn("no method found", e);
    }
  }
}
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/IcebergAndFlinkFilters.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/IcebergAndFlinkFilters.java new file mode 100644 index 0000000000..bfba02d779 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/IcebergAndFlinkFilters.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.util; + +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.iceberg.expressions.Expression; + +import java.util.List; + +public class IcebergAndFlinkFilters { + + List expressions; + List acceptedFilters; + + private IcebergAndFlinkFilters( + List expressions, List acceptedFilters) { + this.expressions = expressions; + this.acceptedFilters = acceptedFilters; + } + + public static IcebergAndFlinkFilters of( + List expressions, List acceptedFilters) { + return new IcebergAndFlinkFilters(expressions, acceptedFilters); + } + + public List expressions() { + return expressions; + } + + public List acceptedFilters() { + return acceptedFilters; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/IcebergClassUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/IcebergClassUtil.java new file mode 100644 index 0000000000..8dcc3eb1bc --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/IcebergClassUtil.java @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/** A util that generates Apache Iceberg writer and committer operators via reflection. */
public class IcebergClassUtil {
  // Fully-qualified names of Iceberg Flink classes that are package-private and therefore must be
  // created reflectively from this package.
  // NOTE(review): ICEBERG_SCAN_CONTEXT_CLASS appears unused — ScanContext is imported and
  // referenced directly below; confirm and consider removing.
  private static final String ICEBERG_SCAN_CONTEXT_CLASS =
      "org.apache.iceberg.flink.source.ScanContext";
  private static final String ICEBERG_PARTITION_SELECTOR_CLASS =
      "org.apache.iceberg.flink.sink.PartitionKeySelector";
  private static final String ICEBERG_FILE_COMMITTER_CLASS =
      "org.apache.iceberg.flink.sink.IcebergFilesCommitter";
  private static final String ICEBERG_FILE_WRITER_CLASS =
      "org.apache.iceberg.flink.sink.IcebergStreamWriter";

  /**
   * Reflectively instantiates Iceberg's package-private {@code PartitionKeySelector}.
   *
   * @param spec the Iceberg partition spec
   * @param schema the Iceberg table schema
   * @param flinkSchema the Flink row type of the written records
   * @return a key selector that routes rows by partition key
   * @throws RuntimeException wrapping any reflection failure
   */
  public static KeySelector newPartitionKeySelector(
      PartitionSpec spec, Schema schema, RowType flinkSchema) {
    try {
      Class clazz = forName(ICEBERG_PARTITION_SELECTOR_CLASS);
      Constructor c = clazz.getConstructor(PartitionSpec.class, Schema.class, RowType.class);
      c.setAccessible(true);
      return (KeySelector) c.newInstance(spec, schema, flinkSchema);
    } catch (NoSuchMethodException
        | IllegalAccessException
        | InvocationTargetException
        | InstantiationException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Reflectively instantiates Iceberg's package-private {@code IcebergFilesCommitter}.
   *
   * <p>NOTE(review): the constructor signature (TableLoader, boolean, Map, Integer, String,
   * PartitionSpec) is version-specific to the pinned Iceberg release — re-verify on every Iceberg
   * upgrade.
   *
   * @param tableLoader loader for the target Iceberg table
   * @param replacePartitions whether commits overwrite (replace) partitions
   * @param branch the branch to commit to
   * @param spec the partition spec used for the commit
   * @return the committer operator
   * @throws RuntimeException wrapping any reflection failure
   */
  public static OneInputStreamOperator newIcebergFilesCommitter(
      TableLoader tableLoader, boolean replacePartitions, String branch, PartitionSpec spec) {
    try {
      Class clazz = forName(ICEBERG_FILE_COMMITTER_CLASS);
      Constructor c =
          clazz.getDeclaredConstructor(
              TableLoader.class,
              boolean.class,
              Map.class,
              Integer.class,
              String.class,
              PartitionSpec.class);
      c.setAccessible(true);
      return (OneInputStreamOperator)
          c.newInstance(
              tableLoader,
              replacePartitions,
              new HashMap<>(),
              ThreadPools.WORKER_THREAD_POOL_SIZE,
              branch,
              spec);
    } catch (NoSuchMethodException
        | IllegalAccessException
        | InvocationTargetException
        | InstantiationException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Same as {@link #newIcebergFilesCommitter(TableLoader, boolean, String, PartitionSpec)}, but
   * wraps the committer in a proxy that runs its methods under the given authenticated file IO
   * (e.g. for Kerberos).
   */
  public static OneInputStreamOperator newIcebergFilesCommitter(
      TableLoader tableLoader,
      boolean replacePartitions,
      String branch,
      PartitionSpec spec,
      AuthenticatedFileIO authenticatedFileIO) {
    OneInputStreamOperator obj =
        newIcebergFilesCommitter(tableLoader, replacePartitions, branch, spec);
    return (OneInputStreamOperator) ProxyUtil.getProxy(obj, authenticatedFileIO);
  }

  /**
   * Builds a {@link ProxyFactory} that creates Iceberg's package-private {@code
   * IcebergStreamWriter} instances wrapped with the authenticated file IO.
   *
   * @param fullTableName the fully-qualified table name passed to the writer
   * @param taskWriterFactory factory producing task writers for the table
   * @param authenticatedFileIO file IO used to authenticate writer operations
   */
  public static ProxyFactory getIcebergStreamWriterProxyFactory(
      String fullTableName,
      TaskWriterFactory taskWriterFactory,
      AuthenticatedFileIO authenticatedFileIO) {
    Class clazz = forName(ICEBERG_FILE_WRITER_CLASS);
    return (ProxyFactory)
        ProxyUtil.getProxyFactory(
            clazz,
            authenticatedFileIO,
            new Class[] {String.class, TaskWriterFactory.class},
            new Object[] {fullTableName, taskWriterFactory});
  }

  /**
   * Reflectively instantiates Iceberg's {@link StreamingReaderOperator} via its private
   * constructor.
   *
   * @throws RuntimeException wrapping any reflection failure
   */
  public static StreamingReaderOperator newStreamingReaderOperator(
      FlinkInputFormat format, ProcessingTimeService timeService, MailboxExecutor mailboxExecutor) {
    try {
      Constructor<StreamingReaderOperator> c =
          StreamingReaderOperator.class.getDeclaredConstructor(
              FlinkInputFormat.class, ProcessingTimeService.class, MailboxExecutor.class);
      c.setAccessible(true);
      return c.newInstance(format, timeService, mailboxExecutor);
    } catch (IllegalAccessException
        | NoSuchMethodException
        | InvocationTargetException
        | InstantiationException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Extracts the private {@code format} field from the {@code OperatorFactory} inner class of
   * {@link StreamingReaderOperator}.
   *
   * <p>NOTE(review): if no declared class named "OperatorFactory" is found, {@code clazz} stays
   * null and the subsequent {@code getDeclaredField} call throws NPE rather than a descriptive
   * error — confirm this cannot happen with the pinned Iceberg version.
   */
  public static FlinkInputFormat getInputFormat(OneInputStreamOperatorFactory operatorFactory) {
    try {
      Class[] classes = StreamingReaderOperator.class.getDeclaredClasses();
      Class clazz = null;
      // Scan by simple name instead of index so declared-class ordering does not matter.
      for (Class c : classes) {
        if ("OperatorFactory".equals(c.getSimpleName())) {
          clazz = c;
          break;
        }
      }
      Field field = clazz.getDeclaredField("format");
      field.setAccessible(true);
      return (FlinkInputFormat) (field.get(operatorFactory));
    } catch (IllegalAccessException | NoSuchFieldException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Builds a {@link ProxyFactory} for {@link FlinkInputFormat}, copying the private state
   * (tableLoader, io, encryption, context) out of an existing input format and substituting the
   * given table schema and authenticated file IO.
   */
  public static ProxyFactory getInputFormatProxyFactory(
      OneInputStreamOperatorFactory operatorFactory,
      AuthenticatedFileIO authenticatedFileIO,
      Schema tableSchema) {
    FlinkInputFormat inputFormat = getInputFormat(operatorFactory);
    TableLoader tableLoader =
        ReflectionUtil.getField(FlinkInputFormat.class, inputFormat, "tableLoader");
    FileIO io = ReflectionUtil.getField(FlinkInputFormat.class, inputFormat, "io");
    EncryptionManager encryption =
        ReflectionUtil.getField(FlinkInputFormat.class, inputFormat, "encryption");
    Object context = ReflectionUtil.getField(FlinkInputFormat.class, inputFormat, "context");

    return ProxyUtil.getProxyFactory(
        FlinkInputFormat.class,
        authenticatedFileIO,
        new Class[] {
          TableLoader.class, Schema.class, FileIO.class, EncryptionManager.class, ScanContext.class
        },
        new Object[] {tableLoader, tableSchema, io, encryption, context});
  }

  // Loads a class by name, converting the checked ClassNotFoundException to unchecked.
  private static Class forName(String className) {
    try {
      return Class.forName(className);
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Extracts the wrapped {@link SourceFunction} from an {@link AbstractUdfStreamOperator} via its
   * private {@code userFunction} field.
   */
  public static SourceFunction getSourceFunction(AbstractUdfStreamOperator source) {
    try {
      Field field = AbstractUdfStreamOperator.class.getDeclaredField("userFunction");
      field.setAccessible(true);
      return (SourceFunction) (field.get(source));
    } catch (IllegalAccessException | NoSuchFieldException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Clears the accumulated transformations of the given execution environment via reflection.
   *
   * <p>NOTE(review): mutates Flink-internal private state ({@code transformations}); verify the
   * field still exists when upgrading Flink.
   */
  public static void clean(StreamExecutionEnvironment env) {
    try {
      Field field = StreamExecutionEnvironment.class.getDeclaredField("transformations");
      field.setAccessible(true);
      ((List) (field.get(env))).clear();
    } catch (IllegalAccessException | NoSuchFieldException e) {
      throw new RuntimeException(e);
    }
  }
}
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.util; + +import org.apache.amoro.flink.lookup.LookupOptions; +import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; +import org.apache.flink.configuration.Configuration; + +public class LookupUtil { + + public static LookupOptions convertLookupOptions(Configuration config) { + return new LookupOptions.Builder() + .lruMaximumSize(config.get(MixedFormatValidator.LOOKUP_CACHE_MAX_ROWS)) + .writeRecordThreadNum(config.get(MixedFormatValidator.ROCKSDB_WRITING_THREADS)) + .ttlAfterWrite(config.get(MixedFormatValidator.LOOKUP_CACHE_TTL_AFTER_WRITE)) + .blockCacheCapacity(config.get(MixedFormatValidator.ROCKSDB_BLOCK_CACHE_CAPACITY)) + .blockCacheNumShardBits(config.get(MixedFormatValidator.ROCKSDB_BLOCK_CACHE_NUM_SHARD_BITS)) + .build(); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/MixedFormatUtils.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/MixedFormatUtils.java new file mode 100644 index 0000000000..cbd94f2413 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/MixedFormatUtils.java @@ -0,0 +1,276 @@ 
/** An util that loads mixed-format table, build mixed-format log writer and so on. */
public class MixedFormatUtils {

  public static final Logger LOG = LoggerFactory.getLogger(MixedFormatUtils.class);

  /**
   * Opens the loader, loads the mixed-format table, then closes the loader.
   *
   * <p>NOTE(review): if {@code loadMixedFormatTable()} throws, the loader is left open — consider
   * try-with-resources; confirm the loader's lifecycle allows closing before the table is used.
   *
   * @param tableLoader the loader for the mixed-format table
   * @return the loaded table
   * @throws UncheckedIOException if closing the loader fails
   */
  public static MixedTable loadMixedTable(MixedFormatTableLoader tableLoader) {
    tableLoader.open();
    MixedTable table = tableLoader.loadMixedFormatTable();
    try {
      tableLoader.close();
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
    return table;
  }

  /**
   * Returns the primary-key field names of the table, or an empty list for unkeyed tables.
   *
   * @param table the mixed-format table
   * @return primary-key field names in spec order
   */
  public static List<String> getPrimaryKeys(MixedTable table) {
    if (table.isUnkeyedTable()) {
      return Collections.emptyList();
    }
    return table.asKeyedTable().primaryKeySpec().fields().stream()
        .map(PrimaryKeySpec.PrimaryKeyField::fieldName)
        .collect(Collectors.toList());
  }

  /**
   * Builds a metrics generator for the writer.
   *
   * <p>When event-latency metrics are enabled, the generator tracks latency based on the table's
   * configured event-time field; otherwise an empty generator is returned.
   *
   * @param metricsEventLatency whether event-latency metrics are requested
   * @param metricsEnable whether metrics are enabled at all
   * @param mixedTable the target table (source of the event-time column property)
   * @param flinkSchemaRowType the Flink row type of written records
   * @param writeSchema the Iceberg write schema (currently unused here — TODO confirm)
   * @return the metrics generator
   */
  public static MetricsGenerator getMetricsGenerator(
      boolean metricsEventLatency,
      boolean metricsEnable,
      MixedTable mixedTable,
      RowType flinkSchemaRowType,
      Schema writeSchema) {
    MetricsGenerator metricsGenerator;
    if (metricsEventLatency) {
      String modifyTimeColumn = mixedTable.properties().get(TableProperties.TABLE_EVENT_TIME_FIELD);
      metricsGenerator =
          MetricsGenerator.newGenerator(
              mixedTable.schema(), flinkSchemaRowType, modifyTimeColumn, metricsEnable);
    } else {
      metricsGenerator = MetricsGenerator.empty(metricsEnable);
    }
    return metricsGenerator;
  }

  /**
   * Decides whether the log-store (WAL) writer should be enabled.
   *
   * <ul>
   *   <li>emit mode contains "log": log store must be configured, otherwise a {@link
   *       ValidationException} is thrown;
   *   <li>emit mode is "auto": follows the table's {@code log-store.enabled} property;
   *   <li>otherwise: disabled.
   * </ul>
   *
   * @param properties the mixed-format table properties
   * @param emitMode the configured emit mode string
   * @return true if records should also be written to the log store
   */
  public static boolean mixedFormatWALWriterEnable(
      Map<String, String> properties, String emitMode) {
    boolean streamEnable =
        CompatiblePropertyUtil.propertyAsBoolean(
            properties, ENABLE_LOG_STORE, TableProperties.ENABLE_LOG_STORE_DEFAULT);

    if (emitMode.contains(MixedFormatValidator.MIXED_FORMAT_EMIT_LOG)) {
      if (!streamEnable) {
        throw new ValidationException(
            "emit to kafka was set, but no kafka config be found, please set kafka config first");
      }
      return true;
    } else if (emitMode.equals(MixedFormatValidator.MIXED_FORMAT_EMIT_AUTO)) {
      LOG.info(
          "mixed-format emit mode is auto, and the mixed-format table {} is {}",
          ENABLE_LOG_STORE,
          streamEnable);
      return streamEnable;
    }

    return false;
  }

  /**
   * Builds the log writer, or returns null when the WAL writer is disabled (see {@link
   * #mixedFormatWALWriterEnable}).
   *
   * <p>Only log-store data version "v1" is supported; in "auto" emit mode an {@link
   * AutomaticLogWriter} is created (writes to the log store only when the watermark gap triggers),
   * otherwise a {@link HiddenLogWriter}.
   *
   * @param properties mixed-format table properties
   * @param producerConfig producer config set via the Java API; may be null
   * @param topic the log-store topic; falls back to the table property when null
   * @param tableSchema the Flink table schema of written records
   * @param emitMode the configured emit mode string
   * @param helper shuffle helper used for record routing
   * @param tableLoader mixed-format table loader
   * @param watermarkWriteGap watermark gap that triggers automatic writing to log storage
   * @return the log writer, or null if log writing is disabled
   * @throws UnsupportedOperationException for unsupported log data versions
   */
  public static MixedFormatLogWriter buildLogWriter(
      Map<String, String> properties,
      @Nullable Properties producerConfig,
      @Nullable String topic,
      TableSchema tableSchema,
      String emitMode,
      ShuffleHelper helper,
      MixedFormatTableLoader tableLoader,
      Duration watermarkWriteGap) {
    if (!mixedFormatWALWriterEnable(properties, emitMode)) {
      return null;
    }

    if (topic == null) {
      topic =
          CompatibleFlinkPropertyUtil.propertyAsString(properties, LOG_STORE_MESSAGE_TOPIC, null);
    }
    Preconditions.checkNotNull(
        topic,
        String.format("Topic should be specified. It can be set by '%s'", LOG_STORE_MESSAGE_TOPIC));

    producerConfig = combineTableAndUnderlyingLogstoreProperties(properties, producerConfig);

    String version =
        properties.getOrDefault(LOG_STORE_DATA_VERSION, LOG_STORE_DATA_VERSION_DEFAULT);
    if (LOG_STORE_DATA_VERSION_DEFAULT.equals(version)) {
      if (emitMode.equals(MixedFormatValidator.MIXED_FORMAT_EMIT_AUTO)) {
        LOG.info(
            "mixed-format emit mode is auto, and we will build automatic log writer: AutomaticLogWriter(v1)");
        return new AutomaticLogWriter(
            FlinkSchemaUtil.convert(tableSchema),
            producerConfig,
            topic,
            new HiddenKafkaFactory<>(),
            LogRecordV1.FIELD_GETTER_FACTORY,
            IdGenerator.generateUpstreamId(),
            helper,
            tableLoader,
            watermarkWriteGap);
      }

      LOG.info("build log writer: HiddenLogWriter(v1)");
      return new HiddenLogWriter(
          FlinkSchemaUtil.convert(tableSchema),
          producerConfig,
          topic,
          new HiddenKafkaFactory<>(),
          LogRecordV1.FIELD_GETTER_FACTORY,
          IdGenerator.generateUpstreamId(),
          helper);
    }
    throw new UnsupportedOperationException(
        "don't support log version '" + version + "'. only support 'v1' or empty");
  }

  /**
   * Extract and combine the properties for underlying log store queue.
   *
   * <p>Properties supplied programmatically ({@code producerConfig}) take priority over the
   * log-store-prefixed table properties; Kafka (de)serializers and bootstrap servers get sensible
   * defaults when the log store type is Kafka.
   *
   * @param tableProperties mixed-format table properties
   * @param producerConfig can be set by java API; may be null
   * @return properties with tableProperties and producerConfig which has higher priority
   */
  private static Properties combineTableAndUnderlyingLogstoreProperties(
      Map<String, String> tableProperties, Properties producerConfig) {
    Properties finalProp;
    Properties underlyingLogStoreProps =
        CompatibleFlinkPropertyUtil.fetchLogstorePrefixProperties(tableProperties);
    if (producerConfig == null) {
      finalProp = underlyingLogStoreProps;
    } else {
      // producerConfig wins: table-derived entries are only added when absent.
      underlyingLogStoreProps
          .stringPropertyNames()
          .forEach(k -> producerConfig.putIfAbsent(k, underlyingLogStoreProps.get(k)));
      finalProp = producerConfig;
    }

    String logStoreAddress =
        CompatibleFlinkPropertyUtil.propertyAsString(tableProperties, LOG_STORE_ADDRESS, null);

    String logType =
        CompatibleFlinkPropertyUtil.propertyAsString(
            tableProperties, LOG_STORE_TYPE, LOG_STORE_STORAGE_TYPE_DEFAULT);
    if (logType.equals(LOG_STORE_STORAGE_TYPE_KAFKA)) {
      // Default to byte-array (de)serializers; the log writer handles encoding itself.
      finalProp.putIfAbsent(
          "key.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
      finalProp.putIfAbsent(
          "value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
      finalProp.putIfAbsent(
          "key.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
      finalProp.putIfAbsent(
          "value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");

      if (logStoreAddress != null) {
        finalProp.putIfAbsent(BOOTSTRAP_SERVERS_CONFIG, logStoreAddress);
      }

      Preconditions.checkArgument(
          finalProp.containsKey(BOOTSTRAP_SERVERS_CONFIG),
          String.format("%s should be set", LOG_STORE_ADDRESS));
    }

    return finalProp;
  }

  /**
   * Returns whether file (table) writing is enabled: when the emit mode contains "file" or is
   * "auto".
   */
  public static boolean fileWriterEnable(String emitMode) {
    return emitMode.contains(MixedFormatValidator.MIXED_FORMAT_EMIT_FILE)
        || emitMode.equals(MixedFormatValidator.MIXED_FORMAT_EMIT_AUTO);
  }

  /**
   * Returns whether the write targets the base store. Currently this simply mirrors the overwrite
   * flag; the method exists to centralize (and log) the decision.
   */
  public static boolean isToBase(boolean overwrite) {
    boolean toBase = overwrite;
    LOG.info("is write to base:{}", toBase);
    return toBase;
  }

  /**
   * Strips the trailing mixed-format meta columns from a row by copying only the first {@code
   * columnSize} fields into a new row with the same row kind.
   *
   * @param rowData the row to shrink; must be a {@link GenericRowData}
   * @param columnSize the number of leading (non-meta) columns to keep
   * @return a new row containing only the non-meta columns
   * @throws UnsupportedOperationException if the row is not a {@link GenericRowData}
   */
  public static RowData removeMixedFormatMetaColumn(RowData rowData, int columnSize) {
    GenericRowData newRowData = new GenericRowData(rowData.getRowKind(), columnSize);
    if (rowData instanceof GenericRowData) {
      GenericRowData before = (GenericRowData) rowData;
      for (int i = 0; i < newRowData.getArity(); i++) {
        newRowData.setField(i, before.getField(i));
      }
      return newRowData;
    }
    throw new UnsupportedOperationException(
        String.format(
            "Can't remove mixed-format meta column from this RowData %s",
            rowData.getClass().getSimpleName()));
  }
}
+ */ + +package org.apache.amoro.flink.util; + +import static org.apache.flink.table.types.logical.LogicalTypeRoot.ROW; + +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.FieldsDataType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.flink.util.Preconditions; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.ListIterator; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * {@link Projection} represents a list of (possibly nested) indexes that can be used to project + * data types. A row projection includes both reducing the accessible fields and reordering them. + * + *

NOTE: Copied from Flink. + */ +public abstract class Projection { + + // sealed class + private Projection() {} + + /** + * Projects a (possibly nested) row data type by returning a new data type that only includes + * fields of the given index paths. + * + *

When extracting nested fields, the name of the resulting fields is the full path of the + * field separated by {@code _}. For example, the field {@code b} inside the row field {@code a} + * of the root {@link DataType} is named {@code a_b} in the result {@link DataType}. In case of + * naming conflicts the postfix notation '_$%d' is used, where {@code %d} is an arbitrary number, + * in order to generate a unique field name. For example if the root {@link DataType} includes + * both a field {@code a_b} and a nested row {@code a} with field {@code b}, the result {@link + * DataType} will contain one field named {@code a_b} and the other named {@code a_b_1}. + */ + public abstract DataType project(DataType dataType); + + /** Same as {@link #project(DataType)}, but accepting and returning {@link LogicalType}. */ + public LogicalType project(LogicalType logicalType) { + return this.project(TypeConversions.fromLogicalToDataType(logicalType)).getLogicalType(); + } + + /** @return {@code true} whether this projection is nested or not. */ + public abstract boolean isNested(); + + /** + * Perform a difference of this {@link Projection} with another {@link Projection}. The result of + * this operation is a new {@link Projection} retaining the same ordering of this instance but + * with the indexes from {@code other} removed. For example: + * + *

+   * 
+   * [4, 1, 0, 3, 2] - [4, 2] = [1, 0, 2]
+   * 
+   * 
+ * + *

Note how the index {@code 3} in the minuend becomes {@code 2} because it's rescaled to + * project correctly a {@link RowData} or arity 3. + * + * @param other the subtrahend + * @throws IllegalArgumentException when {@code other} is nested. + */ + public abstract Projection difference(Projection other); + + /** + * Complement this projection. The returned projection is an ordered projection of fields from 0 + * to {@code fieldsNumber} except the indexes in this {@link Projection}. For example: + * + *

+   * 
+   * [4, 2].complement(5) = [0, 1, 3]
+   * 
+   * 
+ * + * @param fieldsNumber the size of the universe + * @throws IllegalStateException if this projection is nested. + */ + public abstract Projection complement(int fieldsNumber); + + /** Like {@link #complement(int)}, using the {@code dataType} fields count. */ + public Projection complement(DataType dataType) { + return complement(dataType.getLogicalType().getChildren().size()); + } + + /** + * Convert this instance to a projection of top level indexes. The array represents the mapping of + * the fields of the original {@link DataType}. For example, {@code [0, 2, 1]} specifies to + * include in the following order the 1st field, the 3rd field and the 2nd field of the row. + * + * @throws IllegalStateException if this projection is nested. + */ + public abstract int[] toTopLevelIndexes(); + + /** + * Convert this instance to a nested projection index paths. The array represents the mapping of + * the fields of the original {@link DataType}, including nested rows. For example, {@code [[0, 2, + * 1], ...]} specifies to include the 2nd field of the 3rd field of the 1st field in the top-level + * row. + */ + public abstract int[][] toNestedIndexes(); + + /** + * Create an empty {@link Projection}, that is a projection that projects no fields, returning an + * empty {@link DataType}. + */ + public static Projection empty() { + return EmptyProjection.INSTANCE; + } + + /** + * Create a {@link Projection} of the provided {@code indexes}. + * + * @see #toTopLevelIndexes() + */ + public static Projection of(int[] indexes) { + if (indexes.length == 0) { + return empty(); + } + return new TopLevelProjection(indexes); + } + + /** + * Create a {@link Projection} of the provided {@code indexes}. + * + * @see #toNestedIndexes() + */ + public static Projection of(int[][] indexes) { + if (indexes.length == 0) { + return empty(); + } + return new NestedProjection(indexes); + } + + /** Create a {@link Projection} of a field range. 
*/ + public static Projection range(int startInclusive, int endExclusive) { + return new TopLevelProjection(IntStream.range(startInclusive, endExclusive).toArray()); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof Projection)) { + return false; + } + Projection other = (Projection) o; + if (!this.isNested() && !other.isNested()) { + return Arrays.equals(this.toTopLevelIndexes(), other.toTopLevelIndexes()); + } + return Arrays.deepEquals(this.toNestedIndexes(), other.toNestedIndexes()); + } + + @Override + public int hashCode() { + if (isNested()) { + return Arrays.deepHashCode(toNestedIndexes()); + } + return Arrays.hashCode(toTopLevelIndexes()); + } + + @Override + public String toString() { + if (isNested()) { + return "Nested projection = " + Arrays.deepToString(toNestedIndexes()); + } + return "Top level projection = " + Arrays.toString(toTopLevelIndexes()); + } + + private static class EmptyProjection extends Projection { + + static final EmptyProjection INSTANCE = new EmptyProjection(); + + private EmptyProjection() {} + + @Override + public DataType project(DataType dataType) { + return new NestedProjection(toNestedIndexes()).project(dataType); + } + + @Override + public boolean isNested() { + return false; + } + + @Override + public Projection difference(Projection projection) { + return this; + } + + @Override + public Projection complement(int fieldsNumber) { + return new TopLevelProjection(IntStream.range(0, fieldsNumber).toArray()); + } + + @Override + public int[] toTopLevelIndexes() { + return new int[0]; + } + + @Override + public int[][] toNestedIndexes() { + return new int[0][]; + } + } + + private static class NestedProjection extends Projection { + + final int[][] projection; + final boolean nested; + + NestedProjection(int[][] projection) { + this.projection = projection; + this.nested = Arrays.stream(projection).anyMatch(arr -> arr.length > 1); + } + + @Override + public DataType 
project(DataType dataType) { + final List updatedFields = new ArrayList<>(); + final List updatedChildren = new ArrayList<>(); + Set nameDomain = new HashSet<>(); + int duplicateCount = 0; + for (int[] indexPath : this.projection) { + DataType fieldType = dataType.getChildren().get(indexPath[0]); + LogicalType fieldLogicalType = fieldType.getLogicalType(); + StringBuilder builder = + new StringBuilder( + ((RowType) dataType.getLogicalType()).getFieldNames().get(indexPath[0])); + for (int index = 1; index < indexPath.length; index++) { + Preconditions.checkArgument( + fieldLogicalType.getTypeRoot() == ROW, "Row data type expected."); + RowType rowtype = ((RowType) fieldLogicalType); + builder.append("_").append(rowtype.getFieldNames().get(indexPath[index])); + fieldLogicalType = rowtype.getFields().get(indexPath[index]).getType(); + fieldType = fieldType.getChildren().get(indexPath[index]); + } + String path = builder.toString(); + while (nameDomain.contains(path)) { + path = builder.append("_$").append(duplicateCount++).toString(); + } + updatedFields.add(new RowType.RowField(path, fieldLogicalType)); + updatedChildren.add(fieldType); + nameDomain.add(path); + } + return new FieldsDataType( + new RowType(dataType.getLogicalType().isNullable(), updatedFields), + dataType.getConversionClass(), + updatedChildren); + } + + @Override + public boolean isNested() { + return nested; + } + + @Override + public Projection difference(Projection other) { + if (other.isNested()) { + throw new IllegalArgumentException( + "Cannot perform difference between nested projection and nested projection"); + } + if (other instanceof EmptyProjection) { + return this; + } + if (!this.isNested()) { + return new TopLevelProjection(toTopLevelIndexes()).difference(other); + } + + // Extract the indexes to exclude and sort them + int[] indexesToExclude = other.toTopLevelIndexes(); + indexesToExclude = Arrays.copyOf(indexesToExclude, indexesToExclude.length); + Arrays.sort(indexesToExclude); + + 
List resultProjection = + Arrays.stream(projection).collect(Collectors.toCollection(ArrayList::new)); + + ListIterator resultProjectionIterator = resultProjection.listIterator(); + while (resultProjectionIterator.hasNext()) { + int[] indexArr = resultProjectionIterator.next(); + + // Let's check if the index is inside the indexesToExclude array + int searchResult = Arrays.binarySearch(indexesToExclude, indexArr[0]); + if (searchResult >= 0) { + // Found, we need to remove it + resultProjectionIterator.remove(); + } else { + // Not found, let's compute the offset. + // Offset is the index where the projection index should be inserted in the + // indexesToExclude array + int offset = (-(searchResult) - 1); + if (offset != 0) { + indexArr[0] = indexArr[0] - offset; + } + } + } + + return new NestedProjection(resultProjection.toArray(new int[0][])); + } + + @Override + public Projection complement(int fieldsNumber) { + if (isNested()) { + throw new IllegalStateException("Cannot perform complement of a nested projection"); + } + return new TopLevelProjection(toTopLevelIndexes()).complement(fieldsNumber); + } + + @Override + public int[] toTopLevelIndexes() { + if (isNested()) { + throw new IllegalStateException( + "Cannot convert a nested projection to a top level projection"); + } + return Arrays.stream(projection).mapToInt(arr -> arr[0]).toArray(); + } + + @Override + public int[][] toNestedIndexes() { + return projection; + } + } + + private static class TopLevelProjection extends Projection { + + final int[] projection; + + TopLevelProjection(int[] projection) { + this.projection = projection; + } + + @Override + public DataType project(DataType dataType) { + return new NestedProjection(toNestedIndexes()).project(dataType); + } + + @Override + public boolean isNested() { + return false; + } + + @Override + public Projection difference(Projection other) { + if (other.isNested()) { + throw new IllegalArgumentException( + "Cannot perform difference between top level 
projection and nested projection"); + } + if (other instanceof EmptyProjection) { + return this; + } + + // Extract the indexes to exclude and sort them + int[] indexesToExclude = other.toTopLevelIndexes(); + indexesToExclude = Arrays.copyOf(indexesToExclude, indexesToExclude.length); + Arrays.sort(indexesToExclude); + + List resultProjection = + Arrays.stream(projection).boxed().collect(Collectors.toCollection(ArrayList::new)); + + ListIterator resultProjectionIterator = resultProjection.listIterator(); + while (resultProjectionIterator.hasNext()) { + int index = resultProjectionIterator.next(); + + // Let's check if the index is inside the indexesToExclude array + int searchResult = Arrays.binarySearch(indexesToExclude, index); + if (searchResult >= 0) { + // Found, we need to remove it + resultProjectionIterator.remove(); + } else { + // Not found, let's compute the offset. + // Offset is the index where the projection index should be inserted in the + // indexesToExclude array + int offset = (-(searchResult) - 1); + if (offset != 0) { + resultProjectionIterator.set(index - offset); + } + } + } + + return new TopLevelProjection(resultProjection.stream().mapToInt(i -> i).toArray()); + } + + @Override + public Projection complement(int fieldsNumber) { + int[] indexesToExclude = Arrays.copyOf(projection, projection.length); + Arrays.sort(indexesToExclude); + + return new TopLevelProjection( + IntStream.range(0, fieldsNumber) + .filter(i -> Arrays.binarySearch(indexesToExclude, i) < 0) + .toArray()); + } + + @Override + public int[] toTopLevelIndexes() { + return projection; + } + + @Override + public int[][] toNestedIndexes() { + return Arrays.stream(projection).mapToObj(i -> new int[] {i}).toArray(int[][]::new); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ProxyUtil.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ProxyUtil.java new file mode 100644 index 0000000000..a08798e0e7 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ProxyUtil.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.util; + +import net.sf.cglib.proxy.Enhancer; +import net.sf.cglib.proxy.MethodInterceptor; +import org.apache.amoro.flink.interceptor.KerberosInterceptor; +import org.apache.amoro.flink.interceptor.KerberosInvocationHandler; +import org.apache.amoro.flink.interceptor.ProxyFactory; +import org.apache.amoro.io.AuthenticatedFileIO; + +/** + * A proxy util wraps an object with the kerberos authenticate ability by {@link + * KerberosInvocationHandler}. 
+ */ +public class ProxyUtil { + + public static Object getProxy(T obj, KerberosInvocationHandler handler) { + return handler.getProxy(obj); + } + + public static Object getProxy(T obj, AuthenticatedFileIO authenticatedFileIO) { + KerberosInvocationHandler handler = new KerberosInvocationHandler<>(authenticatedFileIO); + return getProxy(obj, handler); + } + + public static T getProxy( + Class clazz, MethodInterceptor interceptor, Class[] argumentTypes, Object[] arguments) { + Enhancer enhancer = new Enhancer(); + enhancer.setSuperclass(clazz); + enhancer.setCallback(interceptor); + return (T) enhancer.create(argumentTypes, arguments); + } + + public static T getProxy( + Class clazz, + AuthenticatedFileIO authenticatedFileIO, + Class[] argumentTypes, + Object[] arguments) { + return getProxy(clazz, new KerberosInterceptor(authenticatedFileIO), argumentTypes, arguments); + } + + public static ProxyFactory getProxyFactory( + Class clazz, + AuthenticatedFileIO authenticatedFileIO, + Class[] argumentTypes, + Object[] arguments) { + return new ProxyFactory( + clazz, new KerberosInterceptor(authenticatedFileIO), argumentTypes, arguments); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ReflectionUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ReflectionUtil.java new file mode 100644 index 0000000000..010a17c74a --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ReflectionUtil.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.util; + +import java.lang.reflect.Field; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +/** An util for reflection. */ +public class ReflectionUtil { + + /** get interfaces of class and its parent */ + public static Class[] getAllInterface(Class clazz) { + if (clazz.equals(Object.class)) { + return new Class[] {}; + } + Class[] current = clazz.getInterfaces(); + Class superClass = clazz.getSuperclass(); + Class[] superInterfaces = getAllInterface(superClass); + + Set> all = new HashSet<>(); + all.addAll(Arrays.asList(current)); + all.addAll(Arrays.asList(superInterfaces)); + + Class[] deduplicated = new Class[all.size()]; + return all.toArray(deduplicated); + } + + public static V getField(Class clazz, O obj, String fieldName) { + try { + Field field = clazz.getDeclaredField(fieldName); + field.setAccessible(true); + Object v = field.get(obj); + return v == null ? 
null : (V) v; + } catch (NoSuchFieldException | IllegalAccessException e) { + throw new RuntimeException(e); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ThreadLocalCache.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ThreadLocalCache.java new file mode 100644 index 0000000000..cf44a0b18e --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ThreadLocalCache.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.util; + +import org.apache.flink.annotation.Internal; + +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.function.Function; + +/** + * Provides a thread local cache with a maximum cache size per thread. + * + *

Note: Values must not be null. + * + *

Copied from flink-1.18 + */ +@Internal +public abstract class ThreadLocalCache { + + private static final int DEFAULT_CACHE_SIZE = 64; + + private final ThreadLocal> cache = new ThreadLocal<>(); + private final int maxSizePerThread; + + protected ThreadLocalCache() { + this(DEFAULT_CACHE_SIZE); + } + + protected ThreadLocalCache(int maxSizePerThread) { + this.maxSizePerThread = maxSizePerThread; + } + + public V get(K key) { + BoundedMap map = cache.get(); + if (map == null) { + map = new BoundedMap<>(maxSizePerThread); + cache.set(map); + } + V value = map.get(key); + if (value == null) { + value = getNewInstance(key); + map.put(key, value); + } + return value; + } + + public abstract V getNewInstance(K key); + + private static class BoundedMap extends LinkedHashMap { + + private static final long serialVersionUID = -211630219014422361L; + + private final int maxSize; + + private BoundedMap(int maxSize) { + this.maxSize = maxSize; + } + + @Override + protected boolean removeEldestEntry(Map.Entry eldest) { + return this.size() > maxSize; + } + } + + public static ThreadLocalCache of(Function creator) { + return new ThreadLocalCache() { + @Override + public V getNewInstance(K key) { + return creator.apply(key); + } + }; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AdaptHiveFlinkAppenderFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AdaptHiveFlinkAppenderFactory.java new file mode 100644 index 0000000000..b801a5815b --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AdaptHiveFlinkAppenderFactory.java @@ -0,0 +1,276 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.write; + +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.deletes.EqualityDeleteWriter; +import org.apache.iceberg.deletes.PositionDeleteWriter; +import org.apache.iceberg.encryption.EncryptedOutputFile; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.data.AdaptHiveFlinkParquetWriters; +import org.apache.iceberg.flink.data.FlinkAvroWriter; +import org.apache.iceberg.flink.data.FlinkOrcWriter; +import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.DeleteSchemaUtil; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.orc.ORC; +import org.apache.iceberg.parquet.AdaptHiveParquet; + +import java.io.IOException; +import java.io.Serializable; +import 
java.io.UncheckedIOException; +import java.util.Map; + +public class AdaptHiveFlinkAppenderFactory implements FileAppenderFactory, Serializable { + private final Schema schema; + private final RowType flinkSchema; + private final Map props; + private final PartitionSpec spec; + private final int[] equalityFieldIds; + private final Schema eqDeleteRowSchema; + private final Schema posDeleteRowSchema; + + private RowType eqDeleteFlinkSchema = null; + private RowType posDeleteFlinkSchema = null; + + public AdaptHiveFlinkAppenderFactory( + Schema schema, RowType flinkSchema, Map props, PartitionSpec spec) { + this(schema, flinkSchema, props, spec, null, null, null); + } + + public AdaptHiveFlinkAppenderFactory( + Schema schema, + RowType flinkSchema, + Map props, + PartitionSpec spec, + int[] equalityFieldIds, + Schema eqDeleteRowSchema, + Schema posDeleteRowSchema) { + this.schema = schema; + this.flinkSchema = flinkSchema; + this.props = props; + this.spec = spec; + this.equalityFieldIds = equalityFieldIds; + this.eqDeleteRowSchema = eqDeleteRowSchema; + this.posDeleteRowSchema = posDeleteRowSchema; + } + + private RowType lazyEqDeleteFlinkSchema() { + if (eqDeleteFlinkSchema == null) { + Preconditions.checkNotNull(eqDeleteRowSchema, "Equality delete row schema shouldn't be null"); + this.eqDeleteFlinkSchema = FlinkSchemaUtil.convert(eqDeleteRowSchema); + } + return eqDeleteFlinkSchema; + } + + private RowType lazyPosDeleteFlinkSchema() { + if (posDeleteFlinkSchema == null) { + Preconditions.checkNotNull(posDeleteRowSchema, "Pos-delete row schema shouldn't be null"); + this.posDeleteFlinkSchema = FlinkSchemaUtil.convert(posDeleteRowSchema); + } + return this.posDeleteFlinkSchema; + } + + @Override + public FileAppender newAppender(OutputFile outputFile, FileFormat format) { + MetricsConfig metricsConfig = MetricsConfig.fromProperties(props); + try { + switch (format) { + case AVRO: + return Avro.write(outputFile) + .createWriterFunc(ignore -> new 
FlinkAvroWriter(flinkSchema)) + .setAll(props) + .schema(schema) + .metricsConfig(metricsConfig) + .overwrite() + .build(); + + case ORC: + return ORC.write(outputFile) + .createWriterFunc( + (schema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, schema)) + .setAll(props) + .metricsConfig(metricsConfig) + .schema(schema) + .overwrite() + .build(); + + case PARQUET: + return AdaptHiveParquet.write(outputFile) + .createWriterFunc( + msgType -> AdaptHiveFlinkParquetWriters.buildWriter(flinkSchema, msgType)) + .setAll(props) + .metricsConfig(metricsConfig) + .schema(schema) + .overwrite() + .build(); + + default: + throw new UnsupportedOperationException("Cannot write unknown file format: " + format); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Override + public DataWriter newDataWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { + return new DataWriter<>( + newAppender(file.encryptingOutputFile(), format), + format, + file.encryptingOutputFile().location(), + spec, + partition, + file.keyMetadata()); + } + + @Override + public EqualityDeleteWriter newEqDeleteWriter( + EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { + Preconditions.checkState( + equalityFieldIds != null && equalityFieldIds.length > 0, + "Equality field ids shouldn't be null or empty when creating equality-delete writer"); + Preconditions.checkNotNull( + eqDeleteRowSchema, + "Equality delete row schema shouldn't be null when creating equality-delete writer"); + + MetricsConfig metricsConfig = MetricsConfig.fromProperties(props); + try { + switch (format) { + case AVRO: + return Avro.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc(ignore -> new FlinkAvroWriter(lazyEqDeleteFlinkSchema())) + .withPartition(partition) + .overwrite() + .setAll(props) + .rowSchema(eqDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .equalityFieldIds(equalityFieldIds) + 
.buildEqualityWriter(); + + case PARQUET: + return AdaptHiveParquet.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc( + msgType -> + AdaptHiveFlinkParquetWriters.buildWriter(lazyEqDeleteFlinkSchema(), msgType)) + .withPartition(partition) + .overwrite() + .setAll(props) + .metricsConfig(metricsConfig) + .rowSchema(eqDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .equalityFieldIds(equalityFieldIds) + .buildEqualityWriter(); + + case ORC: + return ORC.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc( + (schema, typDesc) -> + FlinkOrcWriter.buildWriter(lazyEqDeleteFlinkSchema(), schema)) + .withPartition(partition) + .overwrite() + .setAll(props) + .metricsConfig(metricsConfig) + .rowSchema(eqDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .equalityFieldIds(equalityFieldIds) + .buildEqualityWriter(); + + default: + throw new UnsupportedOperationException( + "Cannot write equality-deletes for unsupported file format: " + format); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Override + public PositionDeleteWriter newPosDeleteWriter( + EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { + MetricsConfig metricsConfig = MetricsConfig.fromProperties(props); + try { + switch (format) { + case AVRO: + return Avro.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc(ignore -> new FlinkAvroWriter(lazyPosDeleteFlinkSchema())) + .withPartition(partition) + .overwrite() + .setAll(props) + .rowSchema(posDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .buildPositionWriter(); + + case PARQUET: + RowType flinkPosDeleteSchema = + FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); + return AdaptHiveParquet.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc( + msgType -> + 
AdaptHiveFlinkParquetWriters.buildWriter(flinkPosDeleteSchema, msgType)) + .withPartition(partition) + .overwrite() + .setAll(props) + .metricsConfig(metricsConfig) + .rowSchema(posDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .transformPaths(path -> StringData.fromString(path.toString())) + .buildPositionWriter(); + + case ORC: + RowType orcPosDeleteSchema = + FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); + return ORC.writeDeletes(outputFile.encryptingOutputFile()) + .createWriterFunc( + (schema, typDesc) -> FlinkOrcWriter.buildWriter(orcPosDeleteSchema, schema)) + .withPartition(partition) + .overwrite() + .setAll(props) + .metricsConfig(metricsConfig) + .rowSchema(posDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(outputFile.keyMetadata()) + .buildPositionWriter(); + + default: + throw new UnsupportedOperationException( + "Cannot write pos-deletes for unsupported file format: " + format); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticDoubleWriteStatus.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticDoubleWriteStatus.java new file mode 100644 index 0000000000..7193dd646b --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticDoubleWriteStatus.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.write; + +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; +import org.apache.amoro.flink.util.MixedFormatUtils; +import org.apache.amoro.table.MixedTable; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.iceberg.UpdateProperties; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.time.Duration; +import java.util.Map; + +/** This is an automatic logstore writer util class. */ +public class AutomaticDoubleWriteStatus implements Serializable { + private static final Logger LOG = LoggerFactory.getLogger(AutomaticDoubleWriteStatus.class); + + private static final long serialVersionUID = 1L; + private final MixedFormatTableLoader tableLoader; + private final AutomaticWriteSpecification specification; + private MixedTable table; + private transient boolean shouldDoubleWrite = false; + private int subtaskId; + + public AutomaticDoubleWriteStatus( + MixedFormatTableLoader tableLoader, Duration writeLogstoreWatermarkGap) { + this.tableLoader = tableLoader; + this.specification = new AutomaticWriteSpecification(writeLogstoreWatermarkGap); + } + + public void setup(int indexOfThisSubtask) { + this.subtaskId = indexOfThisSubtask; + } + + public void open() { + table = MixedFormatUtils.loadMixedTable(tableLoader); + sync(); + } + + public boolean isDoubleWrite() { + return shouldDoubleWrite; + } + + public void processWatermark(Watermark mark) { + if 
(isDoubleWrite()) { + return; + } + if (specification.shouldDoubleWrite(mark.getTimestamp())) { + shouldDoubleWrite = true; + LOG.info( + "processWatermark {}, subTaskId is {}, should double write is true.", mark, subtaskId); + LOG.info( + "begin update mixed-format table, set {} to true", + MixedFormatValidator.LOG_STORE_CATCH_UP.key()); + UpdateProperties updateProperties = table.updateProperties(); + updateProperties.set(MixedFormatValidator.LOG_STORE_CATCH_UP.key(), String.valueOf(true)); + updateProperties.set( + MixedFormatValidator.LOG_STORE_CATCH_UP_TIMESTAMP.key(), + String.valueOf(System.currentTimeMillis())); + updateProperties.remove(MixedFormatValidator.AUTO_EMIT_LOGSTORE_WATERMARK_GAP.key()); + updateProperties.commit(); + LOG.info("end update mixed-format table."); + } + } + + public void sync() { + table.refresh(); + Map properties = table.properties(); + shouldDoubleWrite = + !properties.containsKey(MixedFormatValidator.AUTO_EMIT_LOGSTORE_WATERMARK_GAP.key()); + LOG.info( + "AutomaticDoubleWriteStatus sync, subTaskId: {}, should double write: {}", + subtaskId, + shouldDoubleWrite); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticLogWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticLogWriter.java new file mode 100644 index 0000000000..03fc90e63a --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticLogWriter.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.write; + +import org.apache.amoro.flink.shuffle.ShuffleHelper; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; +import org.apache.amoro.flink.write.hidden.HiddenLogWriter; +import org.apache.amoro.flink.write.hidden.LogMsgFactory; +import org.apache.amoro.log.LogData; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateSnapshotContext; +import org.apache.flink.streaming.api.graph.StreamConfig; +import org.apache.flink.streaming.api.operators.Output; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.runtime.tasks.StreamTask; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; + +import java.time.Duration; +import java.util.Properties; + +/** + * This is an automatic logstore writer util class. It will write logstore when the system current + * timestamp is greater than the watermark of all subtasks plus the {@link + * MixedFormatValidator#AUTO_EMIT_LOGSTORE_WATERMARK_GAP} value. 
+ */ +public class AutomaticLogWriter extends MixedFormatLogWriter { + private final AutomaticDoubleWriteStatus status; + private final MixedFormatLogWriter mixedFormatLogWriter; + + public AutomaticLogWriter( + Schema schema, + Properties producerConfig, + String topic, + LogMsgFactory factory, + LogData.FieldGetterFactory fieldGetterFactory, + byte[] jobId, + ShuffleHelper helper, + MixedFormatTableLoader tableLoader, + Duration writeLogstoreWatermarkGap) { + this.mixedFormatLogWriter = + new HiddenLogWriter( + schema, producerConfig, topic, factory, fieldGetterFactory, jobId, helper); + this.status = new AutomaticDoubleWriteStatus(tableLoader, writeLogstoreWatermarkGap); + } + + @Override + public void setup( + StreamTask containingTask, StreamConfig config, Output> output) { + super.setup(containingTask, config, output); + mixedFormatLogWriter.setup(containingTask, config, output); + status.setup(getRuntimeContext().getIndexOfThisSubtask()); + } + + @Override + public void initializeState(StateInitializationContext context) throws Exception { + super.initializeState(context); + mixedFormatLogWriter.initializeState(context); + } + + @Override + public void open() throws Exception { + super.open(); + mixedFormatLogWriter.open(); + status.open(); + } + + @Override + public void processElement(StreamRecord element) throws Exception { + if (status.isDoubleWrite()) { + mixedFormatLogWriter.processElement(element); + } + } + + @Override + public void processWatermark(Watermark mark) throws Exception { + status.processWatermark(mark); + super.processWatermark(mark); + } + + @Override + public void prepareSnapshotPreBarrier(long checkpointId) throws Exception { + if (status.isDoubleWrite()) { + mixedFormatLogWriter.prepareSnapshotPreBarrier(checkpointId); + } else { + status.sync(); + } + } + + @Override + public void snapshotState(StateSnapshotContext context) throws Exception { + if (status.isDoubleWrite()) { + mixedFormatLogWriter.snapshotState(context); + } + } + + 
@Override + public void notifyCheckpointComplete(long checkpointId) throws Exception { + if (status.isDoubleWrite()) { + mixedFormatLogWriter.notifyCheckpointComplete(checkpointId); + } + } + + @Override + public void notifyCheckpointAborted(long checkpointId) throws Exception { + if (status.isDoubleWrite()) { + mixedFormatLogWriter.notifyCheckpointAborted(checkpointId); + } + } + + @Override + public void close() throws Exception { + if (status.isDoubleWrite()) { + mixedFormatLogWriter.close(); + } + } + + @Override + public void endInput() throws Exception { + if (status.isDoubleWrite()) { + mixedFormatLogWriter.endInput(); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticWriteSpecification.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticWriteSpecification.java new file mode 100644 index 0000000000..6ad19f8868 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticWriteSpecification.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.write; + +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.AUTO_EMIT_LOGSTORE_WATERMARK_GAP; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.io.Serializable; +import java.time.Duration; + +/** Automatic write specification. */ +public class AutomaticWriteSpecification implements Serializable { + private static final Logger LOG = LoggerFactory.getLogger(AutomaticWriteSpecification.class); + + private static final long serialVersionUID = 1L; + public final Duration writeLogstoreWatermarkGap; + + public AutomaticWriteSpecification(@Nullable Duration writeLogstoreWatermarkGap) { + this.writeLogstoreWatermarkGap = writeLogstoreWatermarkGap; + } + + /** + * Returns whether the automatic writing is enabled. + * + * @param watermark the watermark of the operator + * @return true: double write, false: single write. + */ + public boolean shouldDoubleWrite(long watermark) { + // The writeLogstoreWatermarkGap is null, which means that the logstore writer is enabled + // immediately once the job + // is launched. 
+ if (writeLogstoreWatermarkGap == null) { + LOG.info( + "The logstore writer is enabled and the {} is null," + + " so double write immediately once the job is launched.", + AUTO_EMIT_LOGSTORE_WATERMARK_GAP.key()); + return true; + } + long now = System.currentTimeMillis(); + boolean result = watermark >= now - writeLogstoreWatermarkGap.toMillis(); + if (result) { + LOG.info( + "The logstore writer is enabled and the {} is {}, the watermark has caught up, {}.", + AUTO_EMIT_LOGSTORE_WATERMARK_GAP.key(), + writeLogstoreWatermarkGap, + watermark); + } else { + LOG.debug( + "The logstore writer is enabled and the {} is {}, the watermark has not caught up, {}.", + AUTO_EMIT_LOGSTORE_WATERMARK_GAP.key(), + writeLogstoreWatermarkGap, + watermark); + } + return result; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkBaseTaskWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkBaseTaskWriter.java new file mode 100644 index 0000000000..c63cdd5555 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkBaseTaskWriter.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.write; + +import org.apache.amoro.io.AuthenticatedFileIO; +import org.apache.amoro.io.writer.BaseTaskWriter; +import org.apache.amoro.io.writer.OutputFileFactory; +import org.apache.amoro.table.KeyedTable; +import org.apache.amoro.table.PrimaryKeySpec; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.io.FileAppenderFactory; + +/** + * task writer for {@link KeyedTable#baseTable()}. Dev should make sure outputFileFactory write to + * base table's location + */ +public class FlinkBaseTaskWriter extends BaseTaskWriter { + + private final RowDataWrapper wrapper; + + public FlinkBaseTaskWriter( + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory outputFileFactory, + AuthenticatedFileIO io, + long targetFileSize, + long mask, + Schema schema, + RowType flinkSchema, + PartitionSpec spec, + PrimaryKeySpec primaryKeySpec) { + super( + format, + appenderFactory, + outputFileFactory, + io, + targetFileSize, + mask, + schema, + spec, + primaryKeySpec, + false); + this.wrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); + } + + @Override + protected StructLike asStructLike(RowData data) { + return wrapper.wrap(data); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkChangeTaskWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkChangeTaskWriter.java new file mode 100644 index 0000000000..50aaacd086 --- /dev/null +++ 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkChangeTaskWriter.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.write; + +import org.apache.amoro.data.ChangeAction; +import org.apache.amoro.data.PrimaryKeyData; +import org.apache.amoro.io.AuthenticatedFileIO; +import org.apache.amoro.io.writer.ChangeTaskWriter; +import org.apache.amoro.io.writer.OutputFileFactory; +import org.apache.amoro.table.KeyedTable; +import org.apache.amoro.table.PrimaryKeySpec; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.utils.JoinedRowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.io.FileAppenderFactory; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + +/** + * task writer for {@link KeyedTable#changeTable()} 
()}. Dev should make sure outputFileFactory + * write to change table's location + */ +public class FlinkChangeTaskWriter extends ChangeTaskWriter { + + private final RowDataWrapper wrapper; + private final boolean upsert; + private final Set hasUpdateBeforeKeys = new HashSet<>(); + + public FlinkChangeTaskWriter( + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory outputFileFactory, + AuthenticatedFileIO io, + long targetFileSize, + long mask, + Schema schema, + RowType flinkSchema, + PartitionSpec spec, + PrimaryKeySpec primaryKeySpec, + boolean upsert) { + super( + format, + appenderFactory, + outputFileFactory, + io, + targetFileSize, + mask, + schema, + spec, + primaryKeySpec, + false); + this.wrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); + this.upsert = upsert; + } + + @Override + protected StructLike asStructLike(RowData data) { + return wrapper.wrap(data); + } + + @Override + protected RowData appendMetaColumns(RowData data, Long fileOffset) { + return new JoinedRowData(data, GenericRowData.of(fileOffset)); + } + + @Override + public void write(RowData row) throws IOException { + processMultiUpdateAfter(row); + if (upsert && RowKind.INSERT.equals(row.getRowKind())) { + row.setRowKind(RowKind.DELETE); + super.write(row); + row.setRowKind(RowKind.INSERT); + } + super.write(row); + } + + @Override + protected ChangeAction action(RowData data) { + switch (data.getRowKind()) { + case DELETE: + return ChangeAction.DELETE; + case INSERT: + return ChangeAction.INSERT; + case UPDATE_BEFORE: + return ChangeAction.UPDATE_BEFORE; + case UPDATE_AFTER: + return ChangeAction.UPDATE_AFTER; + } + return ChangeAction.INSERT; + } + + /** Turn update_after to insert if there isn't update_after followed by update_before. 
*/ + private void processMultiUpdateAfter(RowData row) { + RowKind rowKind = row.getRowKind(); + if (RowKind.UPDATE_BEFORE.equals(rowKind) || RowKind.UPDATE_AFTER.equals(rowKind)) { + PrimaryKeyData primaryKey = getPrimaryKey(); + primaryKey.primaryKey(asStructLike(row)); + + if (RowKind.UPDATE_AFTER.equals(rowKind)) { + if (!hasUpdateBeforeKeys.contains(primaryKey)) { + row.setRowKind(RowKind.INSERT); + } else { + hasUpdateBeforeKeys.remove(primaryKey); + } + } else { + PrimaryKeyData copyKey = primaryKey.copy(); + hasUpdateBeforeKeys.add(copyKey); + } + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkSink.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkSink.java new file mode 100644 index 0000000000..38f0ea532e --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkSink.java @@ -0,0 +1,444 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.write; + +import static org.apache.amoro.flink.FlinkSchemaUtil.getPhysicalSchema; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.AUTO_EMIT_LOGSTORE_WATERMARK_GAP; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_EMIT_FILE; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_EMIT_MODE; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE_DEFAULT; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_WRITE_MAX_OPEN_FILE_SIZE; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_WRITE_MAX_OPEN_FILE_SIZE_DEFAULT; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SUBMIT_EMPTY_SNAPSHOTS; +import static org.apache.amoro.table.TableProperties.WRITE_DISTRIBUTION_HASH_MODE; +import static org.apache.amoro.table.TableProperties.WRITE_DISTRIBUTION_HASH_MODE_DEFAULT; +import static org.apache.amoro.table.TableProperties.WRITE_DISTRIBUTION_MODE; +import static org.apache.amoro.table.TableProperties.WRITE_DISTRIBUTION_MODE_DEFAULT; +import static org.apache.flink.table.factories.FactoryUtil.SINK_PARALLELISM; + +import org.apache.amoro.flink.metric.MetricsGenerator; +import org.apache.amoro.flink.shuffle.RoundRobinShuffleRulePolicy; +import org.apache.amoro.flink.shuffle.ShuffleHelper; +import org.apache.amoro.flink.shuffle.ShuffleKey; +import org.apache.amoro.flink.shuffle.ShuffleRulePolicy; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; +import org.apache.amoro.flink.util.CompatibleFlinkPropertyUtil; +import org.apache.amoro.flink.util.IcebergClassUtil; +import 
org.apache.amoro.flink.util.MixedFormatUtils; +import org.apache.amoro.flink.util.ProxyUtil; +import org.apache.amoro.table.DistributionHashMode; +import org.apache.amoro.table.MixedTable; +import org.apache.amoro.table.TableProperties; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.api.java.typeutils.TypeExtractor; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; +import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.api.functions.sink.DiscardingSink; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.connector.ProviderContext; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.util.Preconditions; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.sink.TaskWriterFactory; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.PropertyUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.time.Duration; +import java.util.Properties; + +/** + * An util generates mixed-format sink operator including log writer, file writer and file committer + * operators. 
+ */ +public class FlinkSink { + private static final Logger LOG = LoggerFactory.getLogger(FlinkSink.class); + + public static final String FILES_COMMITTER_NAME = "FilesCommitter"; + + public static Builder forRowData(DataStream input) { + return new Builder().forRowData(input); + } + + public static class Builder { + private DataStream rowDataInput = null; + private ProviderContext context; + private MixedTable table; + private MixedFormatTableLoader tableLoader; + private TableSchema flinkSchema; + private Properties producerConfig; + private String topic; + private boolean overwrite = false; + private final String branch = SnapshotRef.MAIN_BRANCH; + private DistributionHashMode distributionMode = null; + + private Builder() {} + + private Builder forRowData(DataStream newRowDataInput) { + this.rowDataInput = newRowDataInput; + return this; + } + + public Builder context(ProviderContext context) { + this.context = context; + return this; + } + + public Builder table(MixedTable table) { + this.table = table; + return this; + } + + public Builder flinkSchema(TableSchema flinkSchema) { + this.flinkSchema = flinkSchema; + return this; + } + + public Builder producerConfig(Properties producerConfig) { + this.producerConfig = producerConfig; + return this; + } + + public Builder topic(String topic) { + this.topic = topic; + return this; + } + + public Builder tableLoader(MixedFormatTableLoader tableLoader) { + this.tableLoader = tableLoader; + return this; + } + + public Builder overwrite(boolean overwrite) { + this.overwrite = overwrite; + return this; + } + + public Builder distribute(DistributionHashMode distributionMode) { + this.distributionMode = distributionMode; + return this; + } + + DataStreamSink withEmit( + DataStream input, + MixedFormatLogWriter logWriter, + MixedFormatFileWriter fileWriter, + OneInputStreamOperator committer, + int writeOperatorParallelism, + MetricsGenerator metricsGenerator, + String emitMode) { + SingleOutputStreamOperator 
writerStream = + input + .transform( + MixedFormatWriter.class.getName(), + TypeExtractor.createTypeInfo(WriteResult.class), + new MixedFormatWriter<>(logWriter, fileWriter, metricsGenerator)) + .name(String.format("MixedFormatWriter %s(%s)", table.name(), emitMode)) + .setParallelism(writeOperatorParallelism); + + if (committer != null) { + writerStream = + writerStream + .transform(FILES_COMMITTER_NAME, Types.VOID, committer) + .setParallelism(1) + .setMaxParallelism(1); + } + + return writerStream + .addSink(new DiscardingSink<>()) + .name(String.format("MixedFormatSink %s", table.name())) + .setParallelism(1); + } + + public DataStreamSink build() { + Preconditions.checkNotNull(tableLoader, "table loader can not be null"); + initTableIfNeeded(); + + Configuration config = new Configuration(); + table.properties().forEach(config::setString); + + RowType flinkSchemaRowType = + (RowType) getPhysicalSchema(flinkSchema).toRowDataType().getLogicalType(); + Schema writeSchema = + TypeUtil.reassignIds( + FlinkSchemaUtil.convert(getPhysicalSchema(flinkSchema)), table.schema()); + + int writeOperatorParallelism = + PropertyUtil.propertyAsInt( + table.properties(), + SINK_PARALLELISM.key(), + rowDataInput.getExecutionEnvironment().getParallelism()); + + DistributionHashMode distributionMode = getDistributionHashMode(); + LOG.info("take effect distribute mode: {}", distributionMode); + ShuffleHelper helper = ShuffleHelper.build(table, writeSchema, flinkSchemaRowType); + + ShuffleRulePolicy shufflePolicy = + buildShuffleRulePolicy( + helper, writeOperatorParallelism, distributionMode, overwrite, table); + LOG.info( + "shuffle policy config={}, actual={}", + distributionMode, + shufflePolicy == null ? 
DistributionMode.NONE : distributionMode.getDesc()); + + String emitMode = + table + .properties() + .getOrDefault(MIXED_FORMAT_EMIT_MODE.key(), MIXED_FORMAT_EMIT_MODE.defaultValue()); + final boolean metricsEventLatency = + CompatibleFlinkPropertyUtil.propertyAsBoolean( + table.properties(), + MixedFormatValidator.MIXED_FORMAT_LATENCY_METRIC_ENABLE, + MixedFormatValidator.MIXED_FORMAT_LATENCY_METRIC_ENABLE_DEFAULT); + + final boolean metricsEnable = + CompatibleFlinkPropertyUtil.propertyAsBoolean( + table.properties(), + MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE, + MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE_DEFAULT); + + final Duration watermarkWriteGap = config.get(AUTO_EMIT_LOGSTORE_WATERMARK_GAP); + + MixedFormatFileWriter fileWriter = + createFileWriter( + table, shufflePolicy, overwrite, flinkSchemaRowType, emitMode, tableLoader); + + MixedFormatLogWriter logWriter = + MixedFormatUtils.buildLogWriter( + table.properties(), + producerConfig, + topic, + flinkSchema, + emitMode, + helper, + tableLoader, + watermarkWriteGap); + + MetricsGenerator metricsGenerator = + MixedFormatUtils.getMetricsGenerator( + metricsEventLatency, metricsEnable, table, flinkSchemaRowType, writeSchema); + + if (shufflePolicy != null) { + rowDataInput = + rowDataInput.partitionCustom( + shufflePolicy.generatePartitioner(), shufflePolicy.generateKeySelector()); + } + + return withEmit( + rowDataInput, + logWriter, + fileWriter, + createFileCommitter(table, tableLoader, overwrite, branch, table.spec(), emitMode), + writeOperatorParallelism, + metricsGenerator, + emitMode); + } + + private void initTableIfNeeded() { + if (table == null) { + table = MixedFormatUtils.loadMixedTable(tableLoader); + } + } + + /** + * Transform {@link org.apache.iceberg.TableProperties#WRITE_DISTRIBUTION_MODE} to + * ShufflePolicyType + */ + private DistributionHashMode getDistributionHashMode() { + if (distributionMode != null) { + return distributionMode; + } + + String modeName = + PropertyUtil.propertyAsString( + 
table.properties(), WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_DEFAULT); + + DistributionMode mode = DistributionMode.fromName(modeName); + switch (mode) { + case NONE: + return DistributionHashMode.NONE; + case HASH: + String hashMode = + PropertyUtil.propertyAsString( + table.properties(), + WRITE_DISTRIBUTION_HASH_MODE, + WRITE_DISTRIBUTION_HASH_MODE_DEFAULT); + return DistributionHashMode.valueOfDesc(hashMode); + case RANGE: + LOG.warn( + "Fallback to use 'none' distribution mode, because {}={} is not supported in flink now", + WRITE_DISTRIBUTION_MODE, + DistributionMode.RANGE.modeName()); + return DistributionHashMode.NONE; + default: + return DistributionHashMode.AUTO; + } + } + + @Nullable + public static ShuffleRulePolicy buildShuffleRulePolicy( + ShuffleHelper helper, + int writeOperatorParallelism, + DistributionHashMode distributionHashMode, + boolean overwrite, + MixedTable table) { + if (distributionHashMode == DistributionHashMode.AUTO) { + distributionHashMode = + DistributionHashMode.autoSelect( + helper.isPrimaryKeyExist(), helper.isPartitionKeyExist()); + } + if (distributionHashMode == DistributionHashMode.NONE) { + return null; + } else { + if (distributionHashMode.mustByPrimaryKey() && !helper.isPrimaryKeyExist()) { + throw new IllegalArgumentException( + "illegal shuffle policy " + + distributionHashMode.getDesc() + + " for table without primary key"); + } + if (distributionHashMode.mustByPartition() && !helper.isPartitionKeyExist()) { + throw new IllegalArgumentException( + "illegal shuffle policy " + + distributionHashMode.getDesc() + + " for table without partition"); + } + int writeFileSplit; + if (MixedFormatUtils.isToBase(overwrite)) { + writeFileSplit = + PropertyUtil.propertyAsInt( + table.properties(), + TableProperties.BASE_FILE_INDEX_HASH_BUCKET, + TableProperties.BASE_FILE_INDEX_HASH_BUCKET_DEFAULT); + } else { + writeFileSplit = + PropertyUtil.propertyAsInt( + table.properties(), + 
TableProperties.CHANGE_FILE_INDEX_HASH_BUCKET, + TableProperties.CHANGE_FILE_INDEX_HASH_BUCKET_DEFAULT); + } + + return new RoundRobinShuffleRulePolicy( + helper, writeOperatorParallelism, writeFileSplit, distributionHashMode); + } + } + } + + public static MixedFormatFileWriter createFileWriter( + MixedTable mixedTable, + ShuffleRulePolicy shufflePolicy, + boolean overwrite, + RowType flinkSchema, + MixedFormatTableLoader tableLoader) { + return createFileWriter( + mixedTable, shufflePolicy, overwrite, flinkSchema, MIXED_FORMAT_EMIT_FILE, tableLoader); + } + + public static MixedFormatFileWriter createFileWriter( + MixedTable mixedTable, + ShuffleRulePolicy shufflePolicy, + boolean overwrite, + RowType flinkSchema, + String emitMode, + MixedFormatTableLoader tableLoader) { + if (!MixedFormatUtils.fileWriterEnable(emitMode)) { + return null; + } + long maxOpenFilesSizeBytes = + PropertyUtil.propertyAsLong( + mixedTable.properties(), + MIXED_FORMAT_WRITE_MAX_OPEN_FILE_SIZE, + MIXED_FORMAT_WRITE_MAX_OPEN_FILE_SIZE_DEFAULT); + LOG.info( + "with maxOpenFilesSizeBytes = {}MB, close biggest/earliest file to avoid OOM", + maxOpenFilesSizeBytes >> 20); + + int minFileSplitCount = + PropertyUtil.propertyAsInt( + mixedTable.properties(), + TableProperties.CHANGE_FILE_INDEX_HASH_BUCKET, + TableProperties.CHANGE_FILE_INDEX_HASH_BUCKET_DEFAULT); + + boolean upsert = + mixedTable.isKeyedTable() + && PropertyUtil.propertyAsBoolean( + mixedTable.properties(), + TableProperties.UPSERT_ENABLED, + TableProperties.UPSERT_ENABLED_DEFAULT); + boolean submitEmptySnapshot = + PropertyUtil.propertyAsBoolean( + mixedTable.properties(), + SUBMIT_EMPTY_SNAPSHOTS.key(), + SUBMIT_EMPTY_SNAPSHOTS.defaultValue()); + + return new MixedFormatFileWriter( + shufflePolicy, + createTaskWriterFactory(mixedTable, overwrite, flinkSchema), + minFileSplitCount, + tableLoader, + upsert, + submitEmptySnapshot); + } + + private static TaskWriterFactory createTaskWriterFactory( + MixedTable mixedTable, boolean 
overwrite, RowType flinkSchema) { + return new MixedFormatRowDataTaskWriterFactory(mixedTable, flinkSchema, overwrite); + } + + public static OneInputStreamOperator createFileCommitter( + MixedTable mixedTable, + MixedFormatTableLoader tableLoader, + boolean overwrite, + String branch, + PartitionSpec spec) { + return createFileCommitter( + mixedTable, tableLoader, overwrite, branch, spec, MIXED_FORMAT_EMIT_FILE); + } + + public static OneInputStreamOperator createFileCommitter( + MixedTable mixedTable, + MixedFormatTableLoader tableLoader, + boolean overwrite, + String branch, + PartitionSpec spec, + String emitMode) { + if (!MixedFormatUtils.fileWriterEnable(emitMode)) { + return null; + } + tableLoader.switchLoadInternalTableForKeyedTable(MixedFormatUtils.isToBase(overwrite)); + return (OneInputStreamOperator) + ProxyUtil.getProxy( + IcebergClassUtil.newIcebergFilesCommitter( + tableLoader, overwrite, branch, spec, mixedTable.io()), + mixedTable.io()); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkTaskWriterBuilder.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkTaskWriterBuilder.java new file mode 100644 index 0000000000..1fec38f803 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkTaskWriterBuilder.java @@ -0,0 +1,289 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.write; + +import org.apache.amoro.hive.io.writer.AdaptHiveOperateToTableRelation; +import org.apache.amoro.hive.io.writer.AdaptHiveOutputFileFactory; +import org.apache.amoro.hive.table.HiveLocationKind; +import org.apache.amoro.hive.table.SupportHive; +import org.apache.amoro.hive.utils.TableTypeUtil; +import org.apache.amoro.io.writer.CommonOutputFileFactory; +import org.apache.amoro.io.writer.OutputFileFactory; +import org.apache.amoro.io.writer.SortedPosDeleteWriter; +import org.apache.amoro.io.writer.TaskWriterBuilder; +import org.apache.amoro.properties.HiveTableProperties; +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; +import org.apache.amoro.table.BaseLocationKind; +import org.apache.amoro.table.ChangeLocationKind; +import org.apache.amoro.table.KeyedTable; +import org.apache.amoro.table.LocationKind; +import org.apache.amoro.table.MixedTable; +import org.apache.amoro.table.PrimaryKeySpec; +import org.apache.amoro.table.TableProperties; +import org.apache.amoro.table.UnkeyedTable; +import org.apache.amoro.table.WriteOperationKind; +import org.apache.amoro.utils.SchemaUtil; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; +import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import 
org.apache.iceberg.flink.sink.FlinkAppenderFactory; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.PropertyUtil; + +import java.util.Locale; + +public class FlinkTaskWriterBuilder implements TaskWriterBuilder { + + private final MixedTable table; + private Long transactionId; + private int partitionId = 0; + private long taskId = 0; + private RowType flinkSchema; + private long mask; + + private FlinkTaskWriterBuilder(MixedTable table) { + this.table = table; + } + + public FlinkTaskWriterBuilder withTransactionId(Long transactionId) { + this.transactionId = transactionId; + return this; + } + + public FlinkTaskWriterBuilder withPartitionId(int partitionId) { + this.partitionId = partitionId; + return this; + } + + public FlinkTaskWriterBuilder withTaskId(long taskId) { + this.taskId = taskId; + return this; + } + + public FlinkTaskWriterBuilder withFlinkSchema(RowType flinkSchema) { + this.flinkSchema = flinkSchema; + return this; + } + + public FlinkTaskWriterBuilder withMask(long mask) { + this.mask = mask; + return this; + } + + @Override + public TaskWriter buildWriter(WriteOperationKind writeOperationKind) { + LocationKind locationKind = + AdaptHiveOperateToTableRelation.INSTANT.getLocationKindsFromOperateKind( + table, writeOperationKind); + return buildWriter(locationKind); + } + + @Override + public TaskWriter buildWriter(LocationKind locationKind) { + if (locationKind == ChangeLocationKind.INSTANT) { + return buildChangeWriter(); + } else if (locationKind == BaseLocationKind.INSTANT + || locationKind == HiveLocationKind.INSTANT) { + return buildBaseWriter(locationKind); + } else { + throw new IllegalArgumentException("Not support Location Kind:" + locationKind); + } + } + + private FlinkBaseTaskWriter buildBaseWriter(LocationKind locationKind) { + Preconditions.checkArgument(transactionId == null); + FileFormat fileFormat = + 
FileFormat.valueOf( + (table + .properties() + .getOrDefault( + TableProperties.BASE_FILE_FORMAT, TableProperties.BASE_FILE_FORMAT_DEFAULT) + .toUpperCase(Locale.ENGLISH))); + long fileSizeBytes = + PropertyUtil.propertyAsLong( + table.properties(), + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); + + String baseLocation; + EncryptionManager encryptionManager; + Schema schema; + Table icebergTable; + PrimaryKeySpec primaryKeySpec = null; + if (table.isKeyedTable()) { + KeyedTable keyedTable = table.asKeyedTable(); + baseLocation = keyedTable.baseLocation(); + encryptionManager = keyedTable.baseTable().encryption(); + schema = keyedTable.baseTable().schema(); + primaryKeySpec = keyedTable.primaryKeySpec(); + icebergTable = keyedTable.baseTable(); + } else { + UnkeyedTable table = this.table.asUnkeyedTable(); + baseLocation = table.location(); + encryptionManager = table.encryption(); + schema = table.schema(); + icebergTable = table; + } + + Schema selectSchema = + TypeUtil.reassignIds( + FlinkSchemaUtil.convert(FlinkSchemaUtil.toSchema(flinkSchema)), schema); + boolean hiveConsistentWriteEnabled = + PropertyUtil.propertyAsBoolean( + table.properties(), + HiveTableProperties.HIVE_CONSISTENT_WRITE_ENABLED, + HiveTableProperties.HIVE_CONSISTENT_WRITE_ENABLED_DEFAULT); + + OutputFileFactory outputFileFactory = + locationKind == HiveLocationKind.INSTANT + ? new AdaptHiveOutputFileFactory( + ((SupportHive) table).hiveLocation(), + table.spec(), + fileFormat, + table.io(), + encryptionManager, + partitionId, + taskId, + transactionId, + hiveConsistentWriteEnabled) + : new CommonOutputFileFactory( + baseLocation, + table.spec(), + fileFormat, + table.io(), + encryptionManager, + partitionId, + taskId, + transactionId); + FileAppenderFactory appenderFactory = + TableTypeUtil.isHive(table) + ? 
new AdaptHiveFlinkAppenderFactory( + schema, flinkSchema, table.properties(), table.spec()) + : new FlinkAppenderFactory( + icebergTable, + schema, + flinkSchema, + table.properties(), + table.spec(), + null, + null, + null); + return new FlinkBaseTaskWriter( + fileFormat, + appenderFactory, + outputFileFactory, + table.io(), + fileSizeBytes, + mask, + selectSchema, + flinkSchema, + table.spec(), + primaryKeySpec); + } + + private TaskWriter buildChangeWriter() { + if (table.isUnkeyedTable()) { + throw new IllegalArgumentException("UnKeyed table UnSupport change writer"); + } + Preconditions.checkArgument(transactionId == null); + + FileFormat fileFormat = + FileFormat.valueOf( + (table + .properties() + .getOrDefault( + TableProperties.BASE_FILE_FORMAT, TableProperties.BASE_FILE_FORMAT_DEFAULT) + .toUpperCase(Locale.ENGLISH))); + long fileSizeBytes = + PropertyUtil.propertyAsLong( + table.properties(), + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); + + KeyedTable keyedTable = table.asKeyedTable(); + Schema selectSchema = + TypeUtil.reassignIds( + FlinkSchemaUtil.convert(FlinkSchemaUtil.toSchema(flinkSchema)), + keyedTable.baseTable().schema()); + Schema changeSchemaWithMeta = SchemaUtil.changeWriteSchema(keyedTable.baseTable().schema()); + RowType flinkSchemaWithMeta = FlinkSchemaUtil.convert(changeSchemaWithMeta); + + OutputFileFactory outputFileFactory = + new CommonOutputFileFactory( + keyedTable.changeLocation(), + keyedTable.spec(), + fileFormat, + keyedTable.io(), + keyedTable.baseTable().encryption(), + partitionId, + taskId, + transactionId); + FileAppenderFactory appenderFactory = + TableTypeUtil.isHive(table) + ? 
new AdaptHiveFlinkAppenderFactory( + changeSchemaWithMeta, + flinkSchemaWithMeta, + keyedTable.properties(), + keyedTable.spec()) + : new FlinkAppenderFactory( + keyedTable.changeTable(), + changeSchemaWithMeta, + flinkSchemaWithMeta, + keyedTable.properties(), + keyedTable.spec(), + null, + null, + null); + boolean upsert = + table.isKeyedTable() + && PropertyUtil.propertyAsBoolean( + table.properties(), + TableProperties.UPSERT_ENABLED, + TableProperties.UPSERT_ENABLED_DEFAULT); + return new FlinkChangeTaskWriter( + fileFormat, + appenderFactory, + outputFileFactory, + keyedTable.io(), + fileSizeBytes, + mask, + selectSchema, + flinkSchema, + keyedTable.spec(), + keyedTable.primaryKeySpec(), + upsert); + } + + @Override + public SortedPosDeleteWriter buildBasePosDeleteWriter( + long mask, long index, StructLike partitionKey) { + throw new UnsupportedOperationException("flink not support position delete"); + } + + public static FlinkTaskWriterBuilder buildFor(MixedTable table) { + return new FlinkTaskWriterBuilder(table); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatFileWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatFileWriter.java new file mode 100644 index 0000000000..da7bf7285d --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatFileWriter.java @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.write; + +import org.apache.amoro.data.DataTreeNode; +import org.apache.amoro.flink.shuffle.ShuffleKey; +import org.apache.amoro.flink.shuffle.ShuffleRulePolicy; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.flink.util.MixedFormatUtils; +import org.apache.amoro.shade.guava32.com.google.common.collect.Sets; +import org.apache.amoro.table.MixedTable; +import org.apache.commons.lang.ArrayUtils; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateSnapshotContext; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.flink.sink.TaskWriterFactory; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** This is mixed-format table includes writing file data to un keyed table and keyed table. 
*/
+public class MixedFormatFileWriter extends AbstractStreamOperator
+    implements OneInputStreamOperator, BoundedOneInput {
+
+  private static final long serialVersionUID = 1L;
+  private static final Logger LOG = LoggerFactory.getLogger(MixedFormatFileWriter.class);
+
+  // Decides which data-tree nodes this subtask writes; null means the default node layout.
+  private final ShuffleRulePolicy shuffleRule;
+
+  private final TaskWriterFactory taskWriterFactory;
+  // Number of root nodes used to split files of a keyed table when no shuffle rule is given.
+  private final int minFileSplitCount;
+  private final MixedFormatTableLoader tableLoader;
+  // When true, an empty WriteResult is still emitted so the committer can create empty snapshots.
+  private final boolean submitEmptySnapshot;
+
+  private transient TaskWriter writer;
+  private transient int subTaskId;
+  private transient int attemptId;
+  /**
+   * Load table in runtime, because that table's refresh method will be invoked in serialization.
+   * And it will set {@link org.apache.hadoop.security.UserGroupInformation#authenticationMethod} to
+   * KERBEROS if mixed-format's table is KERBEROS enabled. It will cause ugi relevant exception when
+   * deploy to yarn cluster.
+   */
+  private transient MixedTable table;
+
+  public MixedFormatFileWriter(
+      ShuffleRulePolicy shuffleRule,
+      TaskWriterFactory taskWriterFactory,
+      int minFileSplitCount,
+      MixedFormatTableLoader tableLoader,
+      boolean upsert,
+      boolean submitEmptySnapshot) {
+    this.shuffleRule = shuffleRule;
+    this.taskWriterFactory = taskWriterFactory;
+    this.minFileSplitCount = minFileSplitCount;
+    this.tableLoader = tableLoader;
+    this.submitEmptySnapshot = submitEmptySnapshot;
+    // NOTE(review): 'upsert' is only logged here, never stored — confirm it is intentionally
+    // unused by this operator.
+    LOG.info(
+        "MixedFormatFileWriter is created with minFileSplitCount: {}, upsert: {}, submitEmptySnapshot: {}",
+        minFileSplitCount,
+        upsert,
+        submitEmptySnapshot);
+  }
+
+  @Override
+  public void open() {
+    this.attemptId = getRuntimeContext().getAttemptNumber();
+    table = MixedFormatUtils.loadMixedTable(tableLoader);
+
+    // subTaskId was assigned in initializeState(), which Flink invokes before open().
+    long mask = getMask(subTaskId);
+    initTaskWriterFactory(mask);
+
+    // Create the writer inside the table's file-IO context (e.g. Kerberos-authenticated).
+    this.writer = table.io().doAs(taskWriterFactory::create);
+  }
+
+  @Override
+  public void initializeState(StateInitializationContext context) throws Exception {
+    super.initializeState(context);
+
+    this.subTaskId = getRuntimeContext().getIndexOfThisSubtask();
+  }
+
+  @Override
+  public void snapshotState(StateSnapshotContext context) throws Exception {
+    super.snapshotState(context);
+  }
+
+  // Pushes the mask into the factory (when supported) and binds it to this subtask/attempt.
+  private void initTaskWriterFactory(long mask) {
+    if (taskWriterFactory instanceof MixedFormatRowDataTaskWriterFactory) {
+      ((MixedFormatRowDataTaskWriterFactory) taskWriterFactory).setMask(mask);
+    }
+    taskWriterFactory.initialize(subTaskId, attemptId);
+  }
+
+  // Computes this subtask's node mask: from the shuffle rule when present, otherwise from the
+  // default layout (minFileSplitCount nodes for keyed tables, the single node (0,0) otherwise).
+  private long getMask(int subTaskId) {
+    Set initRootNodes;
+    if (shuffleRule != null) {
+      initRootNodes = shuffleRule.getSubtaskTreeNodes().get(subTaskId);
+    } else {
+      if (table.isKeyedTable()) {
+        initRootNodes =
+            IntStream.range(0, minFileSplitCount)
+                .mapToObj(index -> DataTreeNode.of(minFileSplitCount - 1, index))
+                .collect(Collectors.toSet());
+      } else {
+        initRootNodes = Sets.newHashSet();
+        initRootNodes.add(DataTreeNode.of(0, 0));
+      }
+    }
+
+    // Any element works: the mask is taken from a single representative node of the set.
+    return initRootNodes.iterator().next().mask();
+  }
+
+  @Override
+  public void prepareSnapshotPreBarrier(long checkpointId) throws Exception {
+    table
+        .io()
+        .doAs(
+            () -> {
+              completeAndEmitFiles();
+
+              // Drop the writer; a fresh one is created lazily on the next record.
+              this.writer = null;
+              return null;
+            });
+  }
+
+  @Override
+  public void endInput() throws Exception {
+    table
+        .io()
+        .doAs(
+            () -> {
+              completeAndEmitFiles();
+              return null;
+            });
+  }
+
+  private void completeAndEmitFiles() throws IOException {
+    // For bounded stream, it may don't enable the checkpoint mechanism so we'd better to emit the
+    // remaining
+    // completed files to downstream before closing the writer so that we won't miss any of them.
+    if (writer != null) {
+      emit(writer.complete());
+    }
+  }
+
+  @Override
+  public void processElement(StreamRecord element) throws Exception {
+    RowData row = element.getValue();
+    table
+        .io()
+        .doAs(
+            () -> {
+              // Writer may have been dropped at the last barrier; recreate on demand.
+              if (writer == null) {
+                this.writer = taskWriterFactory.create();
+              }
+              writer.write(row);
+              return null;
+            });
+  }
+
+  @Override
+  public void close() throws Exception {
+    super.close();
+    if (writer != null) {
+      table
+          .io()
+          .doAs(
+              () -> {
+                writer.close();
+                return null;
+              });
+      writer = null;
+    }
+  }
+
+  private void emit(WriteResult writeResult) {
+    if (shouldEmit(writeResult)) {
+      // Only emit a non-empty WriteResult to committer operator, thus avoiding submitting too much
+      // empty snapshots.
+      output.collect(new StreamRecord<>(writeResult));
+    }
+  }
+
+  /**
+   * Whether to emit the WriteResult.
+   *
+   * @param writeResult the WriteResult to emit
+   * @return true if the WriteResult should be emitted, or the WriteResult isn't empty, false only
+   *     if the WriteResult is empty and the submitEmptySnapshot is false.
+   */
+  private boolean shouldEmit(WriteResult writeResult) {
+    return submitEmptySnapshot
+        || (writeResult != null
+            && (!ArrayUtils.isEmpty(writeResult.dataFiles())
+                || !ArrayUtils.isEmpty(writeResult.deleteFiles())
+                || !ArrayUtils.isEmpty(writeResult.referencedDataFiles())));
+  }
+
+  @VisibleForTesting
+  public TaskWriter getWriter() {
+    return writer;
+  }
+}
diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatLogWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatLogWriter.java
new file mode 100644
index 0000000000..ed4dd10e1f
--- /dev/null
+++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatLogWriter.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.amoro.flink.write;
+
+import org.apache.flink.streaming.api.operators.AbstractStreamOperator;
+import org.apache.flink.streaming.api.operators.BoundedOneInput;
+import org.apache.flink.streaming.api.operators.OneInputStreamOperator;
+import org.apache.flink.table.data.RowData;
+
+/**
+ * This is a common abstract mixed-format log writer.
+ *
+ * <p>Concrete subclasses (e.g. the hidden log writers) implement the actual log emission; this
+ * type only fixes the operator shape.
+ */
+public abstract class MixedFormatLogWriter extends AbstractStreamOperator
+    implements OneInputStreamOperator, BoundedOneInput {}
diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatRowDataTaskWriterFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatRowDataTaskWriterFactory.java
new file mode 100644
index 0000000000..d6f128a1b8
--- /dev/null
+++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatRowDataTaskWriterFactory.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.amoro.flink.write;
+
+import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions;
+import org.apache.amoro.table.MixedTable;
+import org.apache.amoro.table.WriteOperationKind;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.iceberg.flink.sink.TaskWriterFactory;
+import org.apache.iceberg.io.TaskWriter;
+
+/** This is an mixed-format table writer factory. */
+public class MixedFormatRowDataTaskWriterFactory implements TaskWriterFactory {
+
+  private final MixedTable table;
+  // Logical row type of the records this factory's writers accept.
+  private final RowType flinkSchema;
+
+  // True for INSERT OVERWRITE jobs; selects the OVERWRITE operation when building writers.
+  private final boolean overwrite;
+
+  // Node mask; must be assigned through setMask() before create() is invoked.
+  private transient Long mask = null;
+  // Flink writers never carry an explicit transaction id, so this stays null.
+  private final transient Long transactionId = null;
+  private transient Integer taskId = null;
+  private transient Integer attemptId = null;
+
+  public MixedFormatRowDataTaskWriterFactory(
+      MixedTable table, RowType flinkSchema, boolean overwrite) {
+    this.table = table;
+    this.flinkSchema = flinkSchema;
+    this.overwrite = overwrite;
+  }
+
+  public void setMask(long mask) {
+    this.mask = mask;
+  }
+
+  @Override
+  public void initialize(int taskId, int attemptId) {
+    this.taskId = taskId;
+    this.attemptId = attemptId;
+  }
+
+  @Override
+  public TaskWriter create() {
+    Preconditions.checkNotNull(
+        mask, "Mask should be set first. Invoke setMask() before this method");
+
+    // NOTE(review): attemptId is passed as the builder's partition id and taskId as its task id —
+    // confirm this ordering matches FlinkTaskWriterBuilder's expectations.
+    FlinkTaskWriterBuilder builder =
+        FlinkTaskWriterBuilder.buildFor(table)
+            .withTaskId(taskId)
+            .withMask(mask)
+            .withTransactionId(transactionId)
+            .withFlinkSchema(flinkSchema)
+            .withPartitionId(attemptId);
+    if (overwrite) {
+      return builder.buildWriter(WriteOperationKind.OVERWRITE);
+    } else {
+      return builder.buildWriter(WriteOperationKind.APPEND);
+    }
+  }
+}
diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatWriter.java
new file mode 100644
index 0000000000..e5dfb7a4a7
--- /dev/null
+++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatWriter.java
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.amoro.flink.write; + +import org.apache.amoro.flink.metric.MetricsGenerator; +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.metrics.Meter; +import org.apache.flink.metrics.MeterView; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateSnapshotContext; +import org.apache.flink.streaming.api.graph.StreamConfig; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.api.operators.Input; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.operators.Output; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.runtime.streamrecord.LatencyMarker; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.runtime.tasks.StreamTask; +import org.apache.flink.streaming.runtime.watermarkstatus.WatermarkStatus; +import org.apache.flink.table.data.RowData; +import org.apache.flink.util.OutputTag; + +import java.util.Objects; + +/** + * This is the general entry of an mixed-format writer that wraps different operators insides. 
+ *
+ * @param
+ */
+public class MixedFormatWriter extends AbstractStreamOperator
+    implements OneInputStreamOperator, BoundedOneInput {
+
+  // Throughput meters; only registered when metricsGenerator.isMetricEnable() is true.
+  private transient Meter meterFlowRate;
+
+  private transient Meter meterSpeed;
+
+  // Wrapped operators: the file writer receives records and the main output; the log writer is
+  // optional and may be null.
+  private final AbstractStreamOperator fileWriter;
+  private final MixedFormatLogWriter logWriter;
+  private final MetricsGenerator metricsGenerator;
+
+  private static final String INFLUXDB_TAG_NAME = "mixed_format_task_id";
+
+  public MixedFormatWriter(
+      MixedFormatLogWriter logWriter,
+      AbstractStreamOperator fileWriter,
+      MetricsGenerator metricsGenerator) {
+    this.logWriter = logWriter;
+    this.fileWriter = fileWriter;
+    this.metricsGenerator = metricsGenerator;
+  }
+
+  @Override
+  public void setup(
+      StreamTask containingTask, StreamConfig config, Output> output) {
+    super.setup(containingTask, config, output);
+    if (logWriter != null) {
+      // The log writer gets a no-op output so it cannot pollute the main data stream.
+      logWriter.setup(containingTask, config, EMPTY_OUTPUT);
+    }
+    if (fileWriter != null) {
+      fileWriter.setup(containingTask, config, output);
+    }
+  }
+
+  @Override
+  public void open() throws Exception {
+    ExecutionConfig.GlobalJobParameters globalJobParameters =
+        getRuntimeContext().getExecutionConfig().getGlobalJobParameters();
+    String taskId =
+        Objects.nonNull(globalJobParameters.toMap().get(INFLUXDB_TAG_NAME))
+            ? globalJobParameters.toMap().get(INFLUXDB_TAG_NAME)
+            : "null";
+    // latency
+    if (metricsGenerator.enable()) {
+      getRuntimeContext()
+          .getMetricGroup()
+          .addGroup(INFLUXDB_TAG_NAME, taskId)
+          .gauge("record-latency", metricsGenerator::getCurrentLatency);
+      LOG.info("add metrics record-latency");
+    }
+    if (metricsGenerator.isMetricEnable()) {
+      // speed
+      // NOTE(review): meterFlowRate is registered but never marked anywhere in this class —
+      // confirm whether something else updates it or whether marking was lost.
+      meterFlowRate =
+          getRuntimeContext()
+              .getMetricGroup()
+              .addGroup(INFLUXDB_TAG_NAME, taskId)
+              .meter("record-meter", new MeterView(60));
+      LOG.info("add metrics record-meter");
+      // rate of flow
+      meterSpeed =
+          getRuntimeContext()
+              .getMetricGroup()
+              .addGroup(INFLUXDB_TAG_NAME, taskId)
+              .meter("record-count", new MeterView(60));
+      LOG.info("add metrics record-count");
+    }
+    if (logWriter != null) {
+      logWriter.open();
+    }
+    if (fileWriter != null) {
+      fileWriter.open();
+    }
+  }
+
+  @Override
+  public void initializeState(StateInitializationContext context) throws Exception {
+    if (logWriter != null) {
+      logWriter.initializeState(context);
+    }
+    if (fileWriter != null) {
+      fileWriter.initializeState(context);
+    }
+  }
+
+  @Override
+  public void prepareSnapshotPreBarrier(long checkpointId) throws Exception {
+    if (logWriter != null) {
+      logWriter.prepareSnapshotPreBarrier(checkpointId);
+    }
+    if (fileWriter != null) {
+      fileWriter.prepareSnapshotPreBarrier(checkpointId);
+    }
+  }
+
+  @Override
+  public void snapshotState(StateSnapshotContext context) throws Exception {
+    if (logWriter != null) {
+      logWriter.snapshotState(context);
+    }
+    if (fileWriter != null) {
+      fileWriter.snapshotState(context);
+    }
+  }
+
+  @Override
+  public void endInput() throws Exception {
+    if (logWriter != null) {
+      logWriter.endInput();
+    }
+    if (fileWriter instanceof BoundedOneInput) {
+      ((BoundedOneInput) fileWriter).endInput();
+    }
+  }
+
+  @Override
+  public void processElement(StreamRecord element) throws Exception {
+    if (metricsGenerator.isMetricEnable()) {
+      meterSpeed.markEvent();
+    }
+    if (logWriter != null) {
+      logWriter.processElement(element);
+    }
+    if (fileWriter instanceof Input) {
+      ((Input) fileWriter).processElement(element);
+    }
+    metricsGenerator.recordLatency(element);
+  }
+
+  @Override
+  public void processWatermark(Watermark mark) throws Exception {
+    if (logWriter != null) {
+      logWriter.processWatermark(mark);
+    }
+    if (fileWriter instanceof Input) {
+      ((Input) fileWriter).processWatermark(mark);
+    }
+    super.processWatermark(mark);
+  }
+
+  @Override
+  public void close() throws Exception {
+    super.close();
+    if (logWriter != null) {
+      logWriter.close();
+    }
+    if (fileWriter != null) {
+      fileWriter.close();
+    }
+  }
+
+  // No-op sink handed to the log writer so its output is discarded.
+  private static final Output> EMPTY_OUTPUT =
+      new Output>() {
+        @Override
+        public void emitWatermark(Watermark watermark) {}
+
+        @Override
+        public void emitWatermarkStatus(WatermarkStatus watermarkStatus) {}
+
+        @Override
+        public void collect(OutputTag outputTag, StreamRecord streamRecord) {}
+
+        @Override
+        public void collect(StreamRecord rowDataStreamRecord) {}
+
+        @Override
+        public void emitLatencyMarker(LatencyMarker latencyMarker) {}
+
+        @Override
+        public void close() {}
+      };
+}
diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/AbstractHiddenLogWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/AbstractHiddenLogWriter.java
new file mode 100644
index 0000000000..dfd04264b0
--- /dev/null
+++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/AbstractHiddenLogWriter.java
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.write.hidden; + +import static org.apache.amoro.shade.guava32.com.google.common.base.Preconditions.checkNotNull; + +import org.apache.amoro.data.ChangeAction; +import org.apache.amoro.flink.shuffle.LogRecordV1; +import org.apache.amoro.flink.shuffle.ShuffleHelper; +import org.apache.amoro.flink.write.MixedFormatLogWriter; +import org.apache.amoro.log.FormatVersion; +import org.apache.amoro.log.LogData; +import org.apache.amoro.log.LogDataJsonSerialization; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.typeutils.base.IntSerializer; +import org.apache.flink.api.common.typeutils.base.StringSerializer; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateSnapshotContext; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Properties; + +/** + * This is an abstract log queue writer. 
Sending flip message to the kafka topic when the operator
+ * occurs restoring, through the {@link GlobalFlipCommitter} commit {@link
+ * GlobalFlipCommitter.CommitRequest} to the jobMaster. {@link this#processElement(StreamRecord)}
+ * will process records after all operators has sent flip message to the jobMaster and the jobMaster
+ * has finished handling these requests.
+ */
+public abstract class AbstractHiddenLogWriter extends MixedFormatLogWriter {
+  public static final Logger LOG = LoggerFactory.getLogger(AbstractHiddenLogWriter.class);
+
+  private static final long serialVersionUID = 1L;
+  private int subtaskId;
+  // Stores this job's identity so a restored run can be recognized as the same logical job.
+  private transient ListState hiddenLogJobIdentifyState;
+  // Stores the parallelism of the previous run; flips are only sent when it is unchanged.
+  private transient ListState parallelismState;
+  // Restored checkpoint id; null on a fresh start.
+  private transient Long ckpComplete;
+  private final Schema schema;
+  private final Properties producerConfig;
+  private final String topic;
+  private final ShuffleHelper helper;
+  protected final LogMsgFactory factory;
+  protected LogMsgFactory.Producer producer;
+
+  // True only after a restore where a flip had to be sent; gates processElement().
+  private transient boolean shouldCheckFlipSent = false;
+  private transient boolean flipSentSucceed = false;
+
+  private GlobalFlipCommitter flipCommitter;
+  private final LogData.FieldGetterFactory fieldGetterFactory;
+  protected transient LogDataJsonSerialization logDataJsonSerialization;
+
+  protected FormatVersion logVersion = FormatVersion.FORMAT_VERSION_V1;
+  protected byte[] jobIdentify;
+  // start from 1L, epicNo is similar to checkpoint id.
+  protected long epicNo = 1L;
+
+  // The flip record sent after restore; null when the job started fresh.
+  protected transient LogData logFlip;
+
+  public AbstractHiddenLogWriter(
+      Schema schema,
+      Properties producerConfig,
+      String topic,
+      LogMsgFactory factory,
+      LogData.FieldGetterFactory fieldGetterFactory,
+      byte[] jobId,
+      ShuffleHelper helper) {
+    this.schema = schema;
+    this.producerConfig = checkNotNull(producerConfig);
+    this.topic = checkNotNull(topic);
+    this.factory = factory;
+    this.fieldGetterFactory = fieldGetterFactory;
+    this.jobIdentify = jobId;
+    this.helper = helper;
+  }
+
+  @Override
+  public void initializeState(StateInitializationContext context) throws Exception {
+    super.initializeState(context);
+    subtaskId = getRuntimeContext().getIndexOfThisSubtask();
+
+    hiddenLogJobIdentifyState =
+        context
+            .getOperatorStateStore()
+            .getListState(
+                new ListStateDescriptor<>(
+                    "hidden-wal-writer-job-identify", StringSerializer.INSTANCE));
+
+    parallelismState =
+        context
+            .getOperatorStateStore()
+            .getListState(
+                new ListStateDescriptor<>(
+                    "job-" + Arrays.toString(jobIdentify) + "-parallelism",
+                    IntSerializer.INSTANCE));
+    // init flip committer function
+    flipCommitter =
+        new GlobalFlipCommitter(
+            getRuntimeContext().getGlobalAggregateManager(),
+            new GlobalFlipCommitter.FlipCommitFunction(
+                getRuntimeContext().getNumberOfParallelSubtasks(),
+                schema,
+                fieldGetterFactory,
+                factory,
+                producerConfig,
+                topic,
+                helper));
+    int parallelism = getRuntimeContext().getNumberOfParallelSubtasks();
+
+    if (context.isRestored() && parallelismSame(parallelism)) {
+      ckpComplete = context.getRestoredCheckpointId().getAsLong();
+
+      // Reuse the job identity persisted by the previous run.
+      jobIdentify =
+          hiddenLogJobIdentifyState.get().iterator().next().getBytes(StandardCharsets.UTF_8);
+
+      epicNo = ckpComplete;
+
+      logFlip =
+          new LogRecordV1(
+              logVersion, jobIdentify, epicNo, true, ChangeAction.INSERT, new GenericRowData(0));
+      // signal flip topic
+      shouldCheckFlipSent = true;
+      flipSentSucceed = flipCommitter.commit(subtaskId, logFlip);
+      // after send flip, epicNo + 1 The epicNo of the data sent by the subsequent processElement()
+      // method will be 1 larger than the flip.epicNo.
+      epicNo++;
+    } else {
+      // Fresh start (or changed parallelism): persist the new job identity instead.
+      hiddenLogJobIdentifyState.clear();
+      hiddenLogJobIdentifyState.add(new String(jobIdentify, StandardCharsets.UTF_8));
+    }
+
+    logDataJsonSerialization =
+        new LogDataJsonSerialization<>(checkNotNull(schema), checkNotNull(fieldGetterFactory));
+
+    producer = factory.createProducer(producerConfig, topic, logDataJsonSerialization, helper);
+
+    parallelismState.clear();
+    parallelismState.add(parallelism);
+
+    LOG.info(
+        "initializeState subtaskId={}, restore={}, lastCkpComplete={}.",
+        subtaskId,
+        context.isRestored(),
+        ckpComplete);
+  }
+
+  // Returns true only when the previous run's recorded parallelism exists and matches.
+  private boolean parallelismSame(int parallelism) throws Exception {
+    if (parallelismState == null
+        || parallelismState.get() == null
+        || !parallelismState.get().iterator().hasNext()) {
+      LOG.info("Can't find out parallelism state, ignore sending flips.");
+      return false;
+    }
+    int beforeParallelism = parallelismState.get().iterator().next();
+    if (beforeParallelism != parallelism) {
+      LOG.warn(
+          "This job restored from state, but has changed parallelism, before:{}, now:{},"
+              + " So ignore sending flips now.",
+          beforeParallelism,
+          parallelism);
+      return false;
+    }
+    return true;
+  }
+
+  @Override
+  public void open() throws Exception {
+    // producer was created in initializeState(), which Flink invokes before open().
+    producer.open();
+  }
+
+  public void processElement(StreamRecord element) throws Exception {
+    int waitCount = 0;
+    // this is a sync step that will check sending flip succeed or not
+    while (shouldCheckFlip() && !alreadySentFlip()) {
+      Thread.sleep(100);
+      if (waitCount++ % 100 == 0) {
+        LOG.info(
+            "Still waiting for sending flip,"
+                + " while the other subtasks have committed to Global State. this subtask is {}.",
+            subtaskId);
+      }
+    }
+  }
+
+  private boolean alreadySentFlip() throws IOException {
+    if (!flipSentSucceed) {
+      flipSentSucceed = flipCommitter.hasCommittedFlip(logFlip);
+    }
+    return flipSentSucceed;
+  }
+
+  private boolean shouldCheckFlip() {
+    return shouldCheckFlipSent;
+  }
+
+  @Override
+  public void prepareSnapshotPreBarrier(long checkpointId) throws Exception {
+    super.prepareSnapshotPreBarrier(checkpointId);
+    LOG.info("prepareSnapshotPreBarrier subtaskId={}, checkpointId={}.", subtaskId, checkpointId);
+  }
+
+  @Override
+  public void snapshotState(StateSnapshotContext context) throws Exception {
+    super.snapshotState(context);
+    // Flush pending log records so they are durable before the checkpoint completes.
+    producer.flush();
+    LOG.info("snapshotState subtaskId={}, checkpointId={}.", subtaskId, context.getCheckpointId());
+    epicNo++;
+  }
+
+  @Override
+  public void close() throws Exception {
+    if (producer != null) {
+      producer.close();
+    }
+  }
+}
diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/GlobalFlipCommitter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/GlobalFlipCommitter.java
new file mode 100644
index 0000000000..5f64438709
--- /dev/null
+++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/GlobalFlipCommitter.java
@@ -0,0 +1,272 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.write.hidden; + +import static org.apache.amoro.shade.guava32.com.google.common.base.Preconditions.checkNotNull; + +import org.apache.amoro.flink.shuffle.ShuffleHelper; +import org.apache.amoro.log.LogData; +import org.apache.amoro.log.LogDataJsonSerialization; +import org.apache.flink.api.common.functions.AggregateFunction; +import org.apache.flink.runtime.taskexecutor.GlobalAggregateManager; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Arrays; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Optional; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.ConcurrentSkipListMap; +import java.util.concurrent.CopyOnWriteArraySet; + +/** This is a global flip committer used by every log writer operator. 
 */
public class GlobalFlipCommitter {
  private static final Logger LOG = LoggerFactory.getLogger(GlobalFlipCommitter.class);

  private static final String AGGREGATE_NAME = "flip-committer";
  private final GlobalAggregateManager aggregateManager;
  private final FlipCommitFunction flipCommitFunction;

  public GlobalFlipCommitter(
      GlobalAggregateManager aggregateManager, FlipCommitFunction flipCommitFunction) {
    this.aggregateManager = aggregateManager;
    this.flipCommitFunction = flipCommitFunction;
  }

  /**
   * Reports this subtask's flip record to the job-wide aggregate and returns whether the flip for
   * the record's epicNo has been committed (i.e. sent to every partition of the log queue).
   *
   * @param subtaskId id of the reporting subtask
   * @param logData flip record carrying the epicNo being committed
   * @return true if the flip for this epicNo has already been sent to the log queue
   * @throws IOException if the global aggregate update fails
   */
  public boolean commit(int subtaskId, LogData<RowData> logData) throws IOException {
    Long committedEpicNo =
        aggregateManager.updateGlobalAggregate(
            AGGREGATE_NAME, new CommitRequest(subtaskId, logData), flipCommitFunction);
    // NOTE(review): if LogData.getEpicNo() returns a boxed Long, '==' compares references here —
    // confirm it returns a primitive long so the comparison unboxes.
    return committedEpicNo != null && committedEpicNo == logData.getEpicNo();
  }

  /**
   * Queries (without registering a new commit request) whether the flip for {@code logData}'s
   * epicNo has already been sent to the log queue.
   */
  public boolean hasCommittedFlip(LogData<RowData> logData) throws IOException {
    Long committedEpicNo =
        aggregateManager.updateGlobalAggregate(
            AGGREGATE_NAME, new CommitRequest(null, logData, true), flipCommitFunction);
    return committedEpicNo != null && committedEpicNo == logData.getEpicNo();
  }

  /**
   * Aggregate function executed on the JobMaster. It accumulates commit requests per epicNo and,
   * once all {@code numberOfTasks} subtasks have reported, sends the flip message to every
   * partition of the target topic exactly once.
   */
  static class FlipCommitFunction
      implements AggregateFunction<CommitRequest, LogGlobalState, Long> {
    private static final long serialVersionUID = 6399278898504357412L;
    private final int numberOfTasks;
    private final LogDataJsonSerialization<RowData> logDataJsonSerialization;
    private final LogMsgFactory<RowData> factory;
    private final Properties producerConfig;
    private final String topic;
    private final ShuffleHelper helper;
    // Lazily created on the first flip send; lives on the JobMaster, hence transient.
    private transient LogMsgFactory.Producer<RowData> producer;

    public FlipCommitFunction(
        int numberOfTasks,
        Schema schema,
        LogData.FieldGetterFactory<RowData> fieldGetterFactory,
        LogMsgFactory<RowData> factory,
        Properties producerConfig,
        String topic,
        ShuffleHelper helper) {
      this.numberOfTasks = numberOfTasks;
      this.factory = checkNotNull(factory);
      this.logDataJsonSerialization =
          new LogDataJsonSerialization<>(checkNotNull(schema), checkNotNull(fieldGetterFactory));
      this.producerConfig = producerConfig;
      this.topic = topic;
      this.helper = helper;
    }

    @Override
    public LogGlobalState createAccumulator() {
      return new LogGlobalState();
    }

    @Override
    public LogGlobalState add(CommitRequest value, LogGlobalState globalState) {
      // Pure queries (checkCommitted == true) must not register a new commit request.
      if (value.checkCommitted) {
        return globalState;
      }
      LOG.info("receive CommitRequest={}.", value);
      NavigableMap<Long, SubAccumulator> accumulator = globalState.accumulators;
      Long epicNo = value.logRecord.getEpicNo();
      accumulator.compute(
          epicNo,
          (cpId, subAccumulator) -> {
            subAccumulator = subAccumulator == null ? new SubAccumulator() : subAccumulator;
            // After the flip was committed, late/duplicate requests are ignored.
            if (!subAccumulator.hasCommittedFlip) {
              subAccumulator.add(value.subtaskId, value);
            }
            return subAccumulator;
          });

      SubAccumulator subAccumulator = globalState.accumulators.get(epicNo);
      if (subAccumulator.taskIds.size() == numberOfTasks) {
        // Synchronous step: send the flip to every partition once all subtasks have reported.
        try {
          LOG.info(
              "already receive {} commit requests. The last subtask received is {}.",
              numberOfTasks,
              value.subtaskId);
          sendFlip(subAccumulator, value);
          LOG.info("sent flip messages success, cost {}ms.", subAccumulator.cost.time());
        } catch (Exception e) {
          LOG.error("sending flip messages to topic failed, subAccumulator:{}.", subAccumulator, e);
          throw new RuntimeException(e);
        }
      } else {
        LOG.info(
            "As of now, global state has received a total of {} commit requests which are {}.",
            subAccumulator.taskIds.size(),
            Arrays.toString(subAccumulator.taskIds.toArray(new Integer[0])));
      }
      return globalState;
    }

    // Sends the flip record to all partitions of the topic, creating the producer on first use.
    private void sendFlip(SubAccumulator subAccumulator, CommitRequest value) throws Exception {
      if (null == producer) {
        producer = factory.createProducer(producerConfig, topic, logDataJsonSerialization, helper);
        producer.open();
      }

      producer.sendToAllPartitions(value.logRecord);
      subAccumulator.committed();
    }

    @Override
    public Long getResult(LogGlobalState globalState) {
      // Find the highest epicNo whose flip message has already been committed to the log queue.
      Optional<Long> result =
          globalState.accumulators.descendingMap().entrySet().stream()
              .filter(entry -> entry.getValue().hasCommittedFlip)
              .findFirst()
              .map(Map.Entry::getKey);
      return result.orElse(null);
    }

    @Override
    public LogGlobalState merge(LogGlobalState a, LogGlobalState b) {
      b.accumulators.forEach(
          (cpId, acc) ->
              a.accumulators.compute(
                  cpId,
                  (key, subAccumulator) -> {
                    subAccumulator = subAccumulator == null ? new SubAccumulator() : subAccumulator;
                    if (!subAccumulator.hasCommittedFlip) {
                      subAccumulator.merge(acc);
                    }
                    return subAccumulator;
                  }));
      return a;
    }
  }

  /** A single subtask's request (or query) concerning the flip of one epicNo. */
  static class CommitRequest implements Serializable {
    private static final long serialVersionUID = 5469815741394678192L;
    // Null when the request is only a query (checkCommitted == true).
    private final Integer subtaskId;
    private final LogData<RowData> logRecord;
    // TRUE means this request only checks whether the commit function has already sent the flip
    // to the topic, without registering a new commit request.
    private final boolean checkCommitted;

    private CommitRequest(Integer subtaskId, LogData<RowData> logRecord) {
      this.subtaskId = subtaskId;
      this.logRecord = logRecord;
      this.checkCommitted = false;
    }

    private CommitRequest(Integer subtaskId, LogData<RowData> logRecord, Boolean checkCommitted) {
      this.subtaskId = subtaskId;
      this.logRecord = logRecord;
      this.checkCommitted = checkCommitted;
    }

    @Override
    public String toString() {
      return "CommitRequest{subtaskId="
          + subtaskId
          + ", flip message="
          + logRecord.toString()
          + "}";
    }
  }

  /** Job-wide accumulator state kept by the JobMaster-side aggregate. */
  static class LogGlobalState implements Serializable {
    private static final long serialVersionUID = 9132207718335661833L;
    // Keys are epicNos, which are not necessarily equal to checkpoint ids.
    private final NavigableMap<Long, SubAccumulator> accumulators;

    public LogGlobalState() {
      accumulators = new ConcurrentSkipListMap<>();
    }
  }

  /** Per-epicNo accumulator: which subtasks have reported and whether the flip was sent. */
  private static class SubAccumulator implements Serializable {
    private static final long serialVersionUID = 1252547231163598559L;
    private final Set<Integer> taskIds = new CopyOnWriteArraySet<>();
    private CommitRequest commitRequest = null;
    // TRUE means the flip message has already been sent to the topic successfully.
    private volatile boolean hasCommittedFlip = false;
    // Tracks how long it took to collect all commit requests.
    private final Cost cost = new Cost();

    void add(int taskId, CommitRequest commitRequest) {
      this.taskIds.add(taskId);
      if (null == this.commitRequest && null != commitRequest) {
        this.commitRequest = commitRequest;
      }
      cost.markStart();
    }

    void committed() {
      this.hasCommittedFlip = true;
      cost.markEnd();
    }

    void merge(SubAccumulator subAccumulator) {
      this.taskIds.addAll(subAccumulator.taskIds);
      this.commitRequest = subAccumulator.commitRequest;
    }

    /** Wall-clock span between the first commit request and the flip being sent. */
    static class Cost implements Serializable {
      private static final long serialVersionUID = 1L;
      Long start;
      Long end;

      // Only valid after both markStart() and markEnd() have been called.
      long time() {
        return end - start;
      }

      void markStart() {
        if (start == null) {
          start = System.currentTimeMillis();
        }
      }

      void markEnd() {
        if (end == null) {
          end = System.currentTimeMillis();
        }
      }
    }
  }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.write.hidden; + +import static org.apache.amoro.flink.shuffle.RowKindUtil.transformFromFlinkRowKind; + +import org.apache.amoro.flink.shuffle.LogRecordV1; +import org.apache.amoro.flink.shuffle.ShuffleHelper; +import org.apache.amoro.log.LogData; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.Schema; + +import java.util.Properties; + +/** This is a hidden log writer. */ +public class HiddenLogWriter extends AbstractHiddenLogWriter { + private static final long serialVersionUID = 1L; + + public HiddenLogWriter( + Schema schema, + Properties producerConfig, + String topic, + LogMsgFactory factory, + LogData.FieldGetterFactory fieldGetterFactory, + byte[] jobId, + ShuffleHelper helper) { + super(schema, producerConfig, topic, factory, fieldGetterFactory, jobId, helper); + } + + @Override + public void endInput() throws Exception { + producer.flush(); + } + + @Override + public void processElement(StreamRecord element) throws Exception { + // check send flip successfully or not + super.processElement(element); + + // continue process element + RowData rowData = element.getValue(); + LogData logData = + new LogRecordV1( + logVersion, + jobIdentify, + epicNo, + false, + transformFromFlinkRowKind(rowData.getRowKind()), + rowData); + producer.send(logData); + output.collect(new StreamRecord<>(rowData)); + } +} diff --git 
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.amoro.flink.write.hidden;

import org.apache.amoro.flink.shuffle.ShuffleHelper;
import org.apache.amoro.log.LogData;
import org.apache.amoro.log.LogDataJsonSerialization;
import org.apache.flink.configuration.Configuration;

import java.io.Serializable;
import java.util.Properties;

/**
 * A factory creates log queue producers or consumers, e.g. kafka or pulsar distributed event
 * streaming platform.
 *
 * @param <T> the in-memory record type carried by the log, e.g. Flink's RowData
 */
public interface LogMsgFactory<T> extends Serializable {

  /**
   * Creates a producer for the given topic.
   *
   * @param producerConfig client configuration passed through to the underlying producer
   * @param topic target topic name
   * @param logDataJsonSerialization serializer turning {@link LogData} into message bytes
   * @param helper optional shuffle helper used for key-based partitioning; may be null
   */
  Producer<T> createProducer(
      Properties producerConfig,
      String topic,
      LogDataJsonSerialization<T> logDataJsonSerialization,
      ShuffleHelper helper);

  Consumer createConsumer();

  /** Writes {@link LogData} records into the log queue. */
  interface Producer<T> {
    // Must be called before the first send.
    void open() throws Exception;

    // Sends to the partition chosen by the producer's partitioning strategy.
    void send(LogData<T> logData) throws Exception;

    // Broadcasts one record (e.g. a flip marker) to every partition of the topic.
    void sendToAllPartitions(LogData<T> logData) throws Exception;

    void flush();

    void close() throws Exception;
  }

  /** Reads records from the log queue. Lifecycle hooks default to no-ops. */
  interface Consumer {

    default void open(Configuration parameters) throws Exception {}

    default void close() throws Exception {}
  }
}
+ */ + +package org.apache.amoro.flink.write.hidden; + +import static org.apache.flink.util.Preconditions.checkArgument; +import static org.apache.flink.util.Preconditions.checkNotNull; + +import org.apache.amoro.flink.shuffle.ShuffleHelper; +import org.apache.amoro.log.LogData; +import org.apache.commons.lang.ArrayUtils; +import org.apache.flink.table.data.RowData; +import org.apache.kafka.common.utils.Utils; + +import java.io.Serializable; +import java.util.concurrent.atomic.AtomicInteger; + +/** This is a log message partitioner that makes sure the record is without out-of-order. */ +public class MixedFormatLogPartitioner implements Serializable { + private static final long serialVersionUID = 9184708069203854226L; + private final AtomicInteger counter = new AtomicInteger(0); + private final ShuffleHelper helper; + + public MixedFormatLogPartitioner(ShuffleHelper shuffleHelper) { + this.helper = shuffleHelper; + } + + public int partition(LogData logData, int[] partitions) { + checkNotNull(logData, "record is null"); + checkArgument(ArrayUtils.isNotEmpty(partitions), "Partitions of the target topic is empty."); + + int partition; + if (helper == null || !helper.isPrimaryKeyExist()) { + int nextValue = nextValue(); + int part = Utils.toPositive(nextValue) % partitions.length; + partition = partitions[part]; + } else { + helper.open(); + long hash = helper.hashKeyValue((RowData) logData.getActualValue()); + partition = partitions[(int) (hash % partitions.length)]; + } + return partition; + } + + private int nextValue() { + return counter.getAndIncrement(); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/kafka/HiddenKafkaFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/kafka/HiddenKafkaFactory.java new file mode 100644 index 0000000000..cd2ccffcdc --- /dev/null +++ 
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.amoro.flink.write.hidden.kafka;

import static org.apache.amoro.shade.guava32.com.google.common.base.Preconditions.checkNotNull;

import org.apache.amoro.flink.shuffle.ShuffleHelper;
import org.apache.amoro.flink.write.hidden.LogMsgFactory;
import org.apache.amoro.flink.write.hidden.MixedFormatLogPartitioner;
import org.apache.amoro.log.LogDataJsonSerialization;

import java.util.Properties;

/** A factory creates kafka log queue producers or consumers. */
public class HiddenKafkaFactory<T> implements LogMsgFactory<T> {
  private static final long serialVersionUID = -1L;

  @Override
  public Producer<T> createProducer(
      Properties producerConfig,
      String topic,
      LogDataJsonSerialization<T> logDataJsonSerialization,
      ShuffleHelper helper) {
    checkNotNull(topic);
    // Partitioning is delegated to the key-aware partitioner so ordering per key is preserved.
    MixedFormatLogPartitioner<T> partitioner = new MixedFormatLogPartitioner<>(helper);
    return new HiddenKafkaProducer<>(producerConfig, topic, logDataJsonSerialization, partitioner);
  }

  @Override
  public Consumer createConsumer() {
    // Reading from the hidden log is handled elsewhere; this factory is write-only for now.
    throw new UnsupportedOperationException("not supported right now");
  }
}
+ */ + +package org.apache.amoro.flink.write.hidden.kafka; + +import static org.apache.kafka.clients.producer.ProducerConfig.TRANSACTIONAL_ID_CONFIG; + +import org.apache.amoro.flink.write.hidden.LogMsgFactory; +import org.apache.amoro.flink.write.hidden.MixedFormatLogPartitioner; +import org.apache.amoro.log.LogData; +import org.apache.amoro.log.LogDataJsonSerialization; +import org.apache.flink.streaming.connectors.kafka.FlinkKafkaErrorCode; +import org.apache.flink.streaming.connectors.kafka.FlinkKafkaException; +import org.apache.flink.streaming.connectors.kafka.internals.FlinkKafkaInternalProducer; +import org.apache.flink.util.ExceptionUtils; +import org.apache.flink.util.FlinkRuntimeException; +import org.apache.kafka.clients.producer.Callback; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.PartitionInfo; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Properties; +import java.util.UUID; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * This is hidden log queue kafka producer that serializes {@link LogData} and emits to the kafka + * topic. + */ +public class HiddenKafkaProducer implements LogMsgFactory.Producer { + private static final Logger LOG = LoggerFactory.getLogger(HiddenKafkaProducer.class); + /** User defined properties for the Kafka Producer. */ + protected final Properties producerConfig; + + private final String topic; + + private final LogDataJsonSerialization logDataJsonSerialization; + + /** The callback than handles error propagation or logging callbacks. */ + @Nullable protected transient Callback callback; + /** Errors encountered in the async producer are stored here. 
*/ + @Nullable protected transient volatile Exception asyncException; + + private transient FlinkKafkaInternalProducer producer; + private transient FlinkKafkaInternalProducer transactionalProducer; + + private final MixedFormatLogPartitioner mixedFormatLogPartitioner; + private int[] partitions; + + public HiddenKafkaProducer( + Properties producerConfig, + String topic, + LogDataJsonSerialization logDataJsonSerialization, + MixedFormatLogPartitioner mixedFormatLogPartitioner) { + this.producerConfig = producerConfig; + this.topic = topic; + this.logDataJsonSerialization = logDataJsonSerialization; + this.mixedFormatLogPartitioner = mixedFormatLogPartitioner; + } + + @Override + public void open() throws Exception { + callback = + (metadata, exception) -> { + if (exception != null && asyncException == null) { + asyncException = exception; + } + acknowledgeMessage(); + }; + producer = createProducer(); + transactionalProducer = createTransactionalProducer(); + transactionalProducer.initTransactions(); + partitions = getPartitionsByTopic(topic, producer); + LOG.info("HiddenKafkaPartition topic:{}, partitions:{}.", topic, partitions); + } + + @Override + public void send(LogData logData) throws Exception { + checkErroneous(); + byte[] message = logDataJsonSerialization.serialize(logData); + int partition = mixedFormatLogPartitioner.partition(logData, partitions); + ProducerRecord producerRecord = + new ProducerRecord<>(topic, partition, null, null, message); + producer.send(producerRecord, callback); + } + + @Override + public void sendToAllPartitions(LogData logData) throws Exception { + checkErroneous(); + byte[] message = logDataJsonSerialization.serialize(logData); + List> recordList = + IntStream.of(partitions) + .mapToObj(i -> new ProducerRecord(topic, i, null, null, message)) + .collect(Collectors.toList()); + LOG.info("sending {} partitions with flip message={}.", recordList.size(), logData); + long start = System.currentTimeMillis(); + try { + 
transactionalProducer.beginTransaction(); + for (ProducerRecord producerRecord : recordList) { + checkErroneous(); + transactionalProducer.send(producerRecord, callback); + } + transactionalProducer.commitTransaction(); + LOG.info("finished flips sending, cost {}ms.", System.currentTimeMillis() - start); + } catch (Throwable e) { + LOG.error("", e); + transactionalProducer.abortTransaction(); + throw new FlinkRuntimeException(e); + } + } + + @Override + public void flush() { + producer.flush(); + } + + @Override + public void close() throws Exception { + try { + if (producer != null) { + producer.close(Duration.ofSeconds(0)); + } + transactionalProducer.close(Duration.ofSeconds(0)); + } catch (Exception e) { + asyncException = ExceptionUtils.firstOrSuppressed(e, asyncException); + } finally { + checkErroneous(); + } + } + + protected FlinkKafkaInternalProducer createTransactionalProducer() { + Properties transactionalProperties = new Properties(); + transactionalProperties.putAll(producerConfig); + transactionalProperties.computeIfAbsent( + TRANSACTIONAL_ID_CONFIG, o -> UUID.randomUUID().toString()); + return new FlinkKafkaInternalProducer<>(transactionalProperties); + } + + protected FlinkKafkaInternalProducer createProducer() { + return new FlinkKafkaInternalProducer<>(producerConfig); + } + + public static int[] getPartitionsByTopic( + String topic, org.apache.kafka.clients.producer.Producer producer) { + // the fetched list is immutable, so we're creating a mutable copy in order to sort it + List partitionsList = new ArrayList<>(producer.partitionsFor(topic)); + + // sort the partitions by partition id to make sure the fetched partition list is the same + // across subtasks + partitionsList.sort(Comparator.comparingInt(PartitionInfo::partition)); + + return partitionsList.stream().mapToInt(PartitionInfo::partition).toArray(); + } + + protected void checkErroneous() throws FlinkKafkaException { + Exception e = asyncException; + if (e != null) { + // prevent 
double throwing + asyncException = null; + throw new FlinkKafkaException( + FlinkKafkaErrorCode.EXTERNAL_ERROR, "Failed to send data to Kafka: " + e.getMessage(), e); + } + } + + /** + * ATTENTION to subclass implementors: When overriding this method, please always call + * {@code super.acknowledgeMessage()} to keep the invariants of the internal bookkeeping of the + * producer. If not, be sure to know what you are doing. + */ + protected void acknowledgeMessage() {} +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/DynamicTableSourceTestBase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/DynamicTableSourceTestBase.java new file mode 100644 index 0000000000..4d0ec20839 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/DynamicTableSourceTestBase.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 */

package org.apache.amoro.flink;

import org.apache.flink.api.common.eventtime.Watermark;
import org.apache.flink.api.common.eventtime.WatermarkGenerator;
import org.apache.flink.api.common.eventtime.WatermarkOutput;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.table.connector.ChangelogMode;
import org.apache.flink.table.connector.source.SourceFunctionProvider;
import org.apache.flink.table.connector.source.abilities.SupportsWatermarkPushDown;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.planner.factories.TableFactoryHarness;

import java.io.Serializable;

/**
 * Test base for a harness-backed scan source that supports watermark push-down: subclasses emit
 * rows and watermarks from {@link #doRun} using the strategy the planner pushed down via
 * {@link #applyWatermark}.
 */
public abstract class DynamicTableSourceTestBase extends TableFactoryHarness.ScanSourceBase
    implements SupportsWatermarkPushDown, Serializable {

  public static final long serialVersionUID = 1L;
  // Watermark strategy handed to us by the planner through applyWatermark().
  private WatermarkStrategy<RowData> watermarkStrategy;

  @Override
  public ChangelogMode getChangelogMode() {
    // The source may produce inserts, updates, and deletes.
    return ChangelogMode.all();
  }

  @Override
  public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderContext) {
    init();
    return SourceFunctionProvider.of(
        new SourceFunction<RowData>() {
          @Override
          public void run(SourceContext<RowData> ctx) {
            // NOTE(review): the strategy's TimestampAssignerSupplier context is passed as null —
            // assumes the pushed-down strategy never dereferences it; confirm in the tests.
            WatermarkGenerator<RowData> generator =
                watermarkStrategy.createWatermarkGenerator(() -> null);
            WatermarkOutput output = new TestWatermarkOutput(ctx);
            doRun(generator, output, ctx);
          }

          @Override
          public void cancel() {}
        },
        // Not bounded: this behaves as an unbounded streaming source.
        false);
  }

  /** Hook for subclasses to prepare state before the runtime provider is built. No-op here. */
  public void init() {}

  /**
   * Produces the test data: implementations emit rows through {@code ctx} and watermarks through
   * {@code output} using {@code generator}.
   */
  public abstract void doRun(
      WatermarkGenerator<RowData> generator,
      WatermarkOutput output,
      SourceFunction.SourceContext<RowData> ctx);

  @Override
  public void applyWatermark(WatermarkStrategy<RowData> watermarkStrategy) {
    this.watermarkStrategy = watermarkStrategy;
  }

  /**
   * Bridges eventtime {@link WatermarkOutput} to the legacy SourceContext watermark API so
   * generated watermarks reach the stream.
   */
  public class TestWatermarkOutput implements WatermarkOutput, Serializable {
    public static final long serialVersionUID = 1L;
    public SourceFunction.SourceContext<RowData> ctx;

    public TestWatermarkOutput(SourceFunction.SourceContext<RowData> ctx) {
      this.ctx = ctx;
    }

    @Override
    public void emitWatermark(Watermark watermark) {
      // Convert the eventtime watermark into the legacy streaming watermark type.
      ctx.emitWatermark(
          new org.apache.flink.streaming.api.watermark.Watermark(watermark.getTimestamp()));
    }

    @Override
    public void markIdle() {}

    @Override
    public void markActive() {}
  }
}
+ */ + +package org.apache.amoro.flink; + +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.flink.write.MixedFormatRowDataTaskWriterFactory; +import org.apache.amoro.table.KeyedTable; +import org.apache.amoro.table.MixedTable; +import org.apache.amoro.table.TableIdentifier; +import org.apache.amoro.table.UnkeyedTable; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; + +import java.util.Arrays; + +/** + * This class contains flink table rowType schema and others, and will replace {@link FlinkTestBase} + * base class in the future. + */ +public interface FlinkTableTestBase { + default TaskWriter createTaskWriter(MixedTable mixedTable, RowType rowType) { + return mixedTable.isKeyedTable() + ? createKeyedTaskWriter((KeyedTable) mixedTable, rowType) + : createUnkeyedTaskWriter((UnkeyedTable) mixedTable, rowType); + } + + default TaskWriter createBaseTaskWriter(MixedTable mixedTable, RowType rowType) { + return mixedTable.isKeyedTable() + ? 
createKeyedTaskWriter((KeyedTable) mixedTable, rowType, true, 3) + : createUnkeyedTaskWriter((UnkeyedTable) mixedTable, rowType); + } + + default TaskWriter createKeyedTaskWriter(KeyedTable keyedTable, RowType rowType) { + return createKeyedTaskWriter(keyedTable, rowType, false, 3); + } + + default TaskWriter createKeyedTaskWriter( + KeyedTable keyedTable, RowType rowType, boolean overwrite, long mask) { + return createTaskWriter(keyedTable, rowType, overwrite, mask); + } + + default TaskWriter createUnkeyedTaskWriter(UnkeyedTable unkeyedTable, RowType rowType) { + return createTaskWriter(unkeyedTable, rowType, false, 3); + } + + default TaskWriter createTaskWriter( + MixedTable mixedTable, RowType rowType, boolean overwrite, long mask) { + MixedFormatRowDataTaskWriterFactory taskWriterFactory = + new MixedFormatRowDataTaskWriterFactory(mixedTable, rowType, overwrite); + taskWriterFactory.setMask(mask); + taskWriterFactory.initialize(0, 0); + return taskWriterFactory.create(); + } + + default void commit(MixedTable mixedTable, WriteResult result, boolean base) { + if (mixedTable.isKeyedTable()) { + KeyedTable keyedTable = mixedTable.asKeyedTable(); + if (base) { + AppendFiles baseAppend = keyedTable.baseTable().newAppend(); + Arrays.stream(result.dataFiles()).forEach(baseAppend::appendFile); + baseAppend.commit(); + } else { + AppendFiles changeAppend = keyedTable.changeTable().newAppend(); + Arrays.stream(result.dataFiles()).forEach(changeAppend::appendFile); + changeAppend.commit(); + } + } else { + if (!base) { + throw new IllegalArgumentException( + String.format( + "mixed-format table %s is a unkeyed table, can't commit to change table", + mixedTable.name())); + } + UnkeyedTable unkeyedTable = mixedTable.asUnkeyedTable(); + AppendFiles baseAppend = unkeyedTable.newAppend(); + Arrays.stream(result.dataFiles()).forEach(baseAppend::appendFile); + baseAppend.commit(); + } + } + + default MixedFormatTableLoader getTableLoader( + String catalogName, String amsUri, 
MixedTable mixedTable) { + TableIdentifier identifier = + TableIdentifier.of( + catalogName, mixedTable.id().getDatabase(), mixedTable.id().getTableName()); + InternalCatalogBuilder internalCatalogBuilder = InternalCatalogBuilder.builder().amsUri(amsUri); + return MixedFormatTableLoader.of(identifier, internalCatalogBuilder, mixedTable.properties()); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/FlinkTestBase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/FlinkTestBase.java new file mode 100644 index 0000000000..cd3501a079 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/FlinkTestBase.java @@ -0,0 +1,324 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink; + +import static org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions.MIXED_ICEBERG_IDENTIFIER; +import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.KAFKA_CONTAINER; +import static org.apache.flink.table.api.config.TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.CatalogTestHelper; +import org.apache.amoro.catalog.TableTestBase; +import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions; +import org.apache.amoro.flink.write.MixedFormatRowDataTaskWriterFactory; +import org.apache.amoro.io.reader.GenericKeyedDataReader; +import org.apache.amoro.scan.CombinedScanTask; +import org.apache.amoro.scan.KeyedTableScanTask; +import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableList; +import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableMap; +import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableSet; +import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; +import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; +import org.apache.amoro.table.KeyedTable; +import org.apache.flink.api.common.restartstrategy.RestartStrategies; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.state.StateBackend; +import org.apache.flink.runtime.state.filesystem.FsStateBackend; +import org.apache.flink.streaming.api.CheckpointingMode; +import org.apache.flink.streaming.api.environment.CheckpointConfig; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.table.api.TableSchema; +import 
org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.test.util.MiniClusterWithClientResource; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.IdentityPartitionConverters; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.MiniClusterResource; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.junit.Before; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.rules.TestName; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ExecutionException; + +public class FlinkTestBase extends TableTestBase { + private static final Logger LOG = LoggerFactory.getLogger(FlinkTestBase.class); + + @ClassRule + public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = + MiniClusterResource.createWithClassloaderCheckDisabled(); + + @Rule public TestName name = new TestName(); + + public static String metastoreUri; + + protected static final int KAFKA_PARTITION_NUMS = 1; + + private volatile StreamTableEnvironment tEnv = null; + protected Map props; + private volatile StreamExecutionEnvironment env = null; + public static final Schema TABLE_SCHEMA = 
BasicTableTestHelper.TABLE_SCHEMA; + public static final TableSchema FLINK_SCHEMA = + TableSchema.builder() + .field("id", DataTypes.INT()) + .field("name", DataTypes.STRING()) + .field("ts", DataTypes.BIGINT()) + .field("op_time", DataTypes.TIMESTAMP()) + .build(); + public static final RowType FLINK_ROW_TYPE = + (RowType) FLINK_SCHEMA.toRowDataType().getLogicalType(); + + public static InternalCatalogBuilder catalogBuilder; + + public FlinkTestBase(CatalogTestHelper catalogTestHelper, TableTestHelper tableTestHelper) { + super(catalogTestHelper, tableTestHelper); + } + + @Before + public void before() throws Exception { + metastoreUri = getCatalogUri(); + catalogBuilder = InternalCatalogBuilder.builder().amsUri(metastoreUri); + } + + public void config() { + props = Maps.newHashMap(); + props.put("type", MIXED_ICEBERG_IDENTIFIER); + props.put(CatalogFactoryOptions.AMS_URI.key(), metastoreUri); + } + + public static void prepare() throws Exception { + KAFKA_CONTAINER.start(); + } + + public static void shutdown() throws Exception { + KAFKA_CONTAINER.close(); + } + + protected StreamTableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + if (tEnv == null) { + this.tEnv = + StreamTableEnvironment.create( + getEnv(), EnvironmentSettings.newInstance().inStreamingMode().build()); + Configuration configuration = tEnv.getConfig().getConfiguration(); + // set low-level key-value options + configuration.setString(TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED.key(), "true"); + } + } + } + return tEnv; + } + + protected StreamExecutionEnvironment getEnv() { + if (env == null) { + synchronized (this) { + if (env == null) { + StateBackend backend = + new FsStateBackend( + "file:///" + System.getProperty("java.io.tmpdir") + "/flink/backend"); + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + env.setParallelism(1); + 
env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE); + env.getCheckpointConfig().setCheckpointInterval(300); + env.getCheckpointConfig() + .enableExternalizedCheckpoints( + CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); + env.setStateBackend(backend); + env.setRestartStrategy(RestartStrategies.noRestart()); + } + } + } + return env; + } + + protected List sql(String query, Object... args) { + TableResult tableResult = getTableEnv().executeSql(String.format(query, args)); + tableResult + .getJobClient() + .ifPresent( + c -> { + try { + c.getJobExecutionResult().get(); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException(e); + } + }); + try (CloseableIterator iter = tableResult.collect()) { + List results = Lists.newArrayList(iter); + return results; + } catch (Exception e) { + LOG.warn("Failed to collect table result", e); + return null; + } + } + + protected TableResult exec(String query, Object... args) { + return exec(getTableEnv(), query, args); + } + + protected static TableResult exec(TableEnvironment env, String query, Object... args) { + return env.executeSql(String.format(query, args)); + } + + protected Set sqlSet(String query, Object... 
args) { + return new HashSet<>(sql(query, args)); + } + + public static List read(KeyedTable table) { + CloseableIterable combinedScanTasks = table.newScan().planTasks(); + Schema schema = table.schema(); + GenericKeyedDataReader genericKeyedDataReader = + new GenericKeyedDataReader( + table.io(), + schema, + schema, + table.primaryKeySpec(), + null, + true, + IdentityPartitionConverters::convertConstant); + ImmutableList.Builder builder = ImmutableList.builder(); + for (CombinedScanTask combinedScanTask : combinedScanTasks) { + for (KeyedTableScanTask keyedTableScanTask : combinedScanTask.tasks()) { + builder.addAll(genericKeyedDataReader.readData(keyedTableScanTask)); + } + } + return builder.build(); + } + + public static Set toRecords(Collection rows) { + GenericRecord record = GenericRecord.create(TABLE_SCHEMA); + ImmutableSet.Builder b = ImmutableSet.builder(); + rows.forEach( + r -> + b.add( + record.copy( + ImmutableMap.of( + "id", + r.getField(0), + "name", + r.getField(1), + "ts", + r.getField(2), + "op_time", + r.getField(3))))); + return b.build(); + } + + public static String toWithClause(Map props) { + StringBuilder builder = new StringBuilder(); + builder.append("("); + int propCount = 0; + for (Map.Entry entry : props.entrySet()) { + if (propCount > 0) { + builder.append(","); + } + builder + .append("'") + .append(entry.getKey()) + .append("'") + .append("=") + .append("'") + .append(entry.getValue()) + .append("'"); + propCount++; + } + builder.append(")"); + return builder.toString(); + } + + protected static RowData createRowData( + Integer id, String name, String dateTime, RowKind rowKind) { + return GenericRowData.ofKind( + rowKind, + id, + StringData.fromString(name), + LocalDateTime.parse(dateTime).toInstant(ZoneOffset.UTC).toEpochMilli(), + TimestampData.fromLocalDateTime(LocalDateTime.parse(dateTime))); + } + + protected static RowData createRowData(RowKind rowKind, Object... 
objects) { + return GenericRowData.ofKind( + rowKind, + objects[0], + StringData.fromString((String) objects[1]), + objects[2], + TimestampData.fromLocalDateTime((LocalDateTime) objects[3])); + } + + protected static RowData createRowData(Integer id, String name, String dateTime) { + return createRowData(id, name, dateTime, RowKind.INSERT); + } + + protected static void commit(KeyedTable keyedTable, WriteResult result, boolean base) { + if (base) { + AppendFiles baseAppend = keyedTable.baseTable().newAppend(); + Arrays.stream(result.dataFiles()).forEach(baseAppend::appendFile); + baseAppend.commit(); + } else { + AppendFiles changeAppend = keyedTable.changeTable().newAppend(); + Arrays.stream(result.dataFiles()).forEach(changeAppend::appendFile); + changeAppend.commit(); + } + } + + protected static TaskWriter createKeyedTaskWriter( + KeyedTable keyedTable, RowType rowType, boolean base) { + return createKeyedTaskWriter(keyedTable, rowType, base, 3); + } + + protected static TaskWriter createKeyedTaskWriter( + KeyedTable keyedTable, RowType rowType, boolean base, long mask) { + MixedFormatRowDataTaskWriterFactory taskWriterFactory = + new MixedFormatRowDataTaskWriterFactory(keyedTable, rowType, base); + taskWriterFactory.setMask(mask); + taskWriterFactory.initialize(0, 0); + return taskWriterFactory.create(); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/TestFlinkSchemaUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/TestFlinkSchemaUtil.java new file mode 100644 index 0000000000..0eeee1bc9d --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/TestFlinkSchemaUtil.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink; + +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.TableSchema; +import org.apache.iceberg.Schema; +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Map; + +public class TestFlinkSchemaUtil { + @Test + public void testFlinkSchemaToIcebergSchema() { + // flinkSchema with physical column,compute column, watermark + TableSchema flinkSchema = + TableSchema.builder() + .field("id", DataTypes.INT().notNull()) + .field("name", DataTypes.STRING()) + .field("ts", DataTypes.TIMESTAMP(6)) + .field("compute_id", DataTypes.INT(), "`id` + 5") + .field("proc", DataTypes.TIMESTAMP_LTZ(), "PROCTIME()") + // org.apache.iceberg.flink.TypeToFlinkType will convert Timestamp to Timestamp(6), so + // we cast datatype manually + .field("ts3", DataTypes.TIMESTAMP(3), "cast(`ts` as timestamp(3))") + .watermark("ts3", "`ts3` - INTERVAL '5' SECOND", DataTypes.TIMESTAMP(3)) + .build(); + + // get physicalSchema from tableSchema and convert into iceberg Schema + Schema icebergSchema = + org.apache.iceberg.flink.FlinkSchemaUtil.convert( + FlinkSchemaUtil.getPhysicalSchema(flinkSchema)); + + Map extraOptions = FlinkSchemaUtil.generateExtraOptionsFrom(flinkSchema); + + // Convert iceberg 
Schema with extraOptions into flink TableSchema + TableSchema fromIcebergSchema = + FlinkSchemaUtil.toSchema(icebergSchema, new ArrayList<>(), extraOptions); + + Assert.assertEquals(flinkSchema, fromIcebergSchema); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkAmoroCatalogITCase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkAmoroCatalogITCase.java new file mode 100644 index 0000000000..92b2a286a1 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkAmoroCatalogITCase.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.catalog; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.apache.amoro.flink.table.AmoroCatalogITCaseBase; +import org.apache.amoro.formats.AmoroCatalogTestHelper; +import org.apache.amoro.formats.paimon.PaimonHadoopCatalogTestHelper; +import org.apache.amoro.formats.paimon.PaimonHiveCatalogTestHelper; +import org.apache.amoro.formats.paimon.PaimonTable; +import org.apache.amoro.hive.TestHMS; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.table.catalog.AbstractCatalog; +import org.apache.flink.table.catalog.Catalog; +import org.apache.flink.table.catalog.CatalogBaseTable; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.types.Row; +import org.apache.paimon.table.FileStoreTable; +import org.junit.After; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.util.Optional; +import java.util.concurrent.TimeUnit; + +/** ITCase for Flink UnifiedCatalog based on AmoroCatalogTestBase */ +@RunWith(value = Parameterized.class) +public class FlinkAmoroCatalogITCase extends AmoroCatalogITCaseBase { + static final TestHMS TEST_HMS = new TestHMS(); + AbstractCatalog flinkCatalog; + + public FlinkAmoroCatalogITCase(AmoroCatalogTestHelper catalogTestHelper) { + super(catalogTestHelper); + } + + @Parameterized.Parameters(name = "{0}") + public static Object[] parameters() { + return new Object[] { + PaimonHiveCatalogTestHelper.defaultHelper(), PaimonHadoopCatalogTestHelper.defaultHelper() + }; + } + + @BeforeClass + public static void beforeAll() throws Exception { + TEST_HMS.before(); + } + + @Before + public void setup() throws Exception { + createDatabase(); + createTable(); + String catalog = "unified_catalog"; + exec( + 
"CREATE CATALOG %s WITH ('type'='unified', 'metastore.url'='%s')", + catalog, getCatalogUrl()); + exec("USE CATALOG %s", catalog); + exec("USE %s", TEST_DB_NAME); + Optional catalogOptional = getTableEnv().getCatalog(catalog); + assertTrue(catalogOptional.isPresent()); + flinkCatalog = (AbstractCatalog) catalogOptional.get(); + assertEquals(catalog, flinkCatalog.getName()); + } + + @After + public void teardown() { + TEST_HMS.after(); + if (flinkCatalog != null) { + flinkCatalog.close(); + } + } + + public void createDatabase() { + try { + catalogTestHelper.createDatabase(TEST_DB_NAME); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + public void createTable() { + try { + catalogTestHelper.createTable(TEST_DB_NAME, TEST_TABLE_NAME); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Test + public void testTableExists() throws Exception { + CatalogBaseTable catalogBaseTable = + flinkCatalog.getTable(new ObjectPath(TEST_DB_NAME, TEST_TABLE_NAME)); + assertNotNull(catalogBaseTable); + PaimonTable paimonTable = + (PaimonTable) catalogTestHelper.amoroCatalog().loadTable(TEST_DB_NAME, TEST_TABLE_NAME); + FileStoreTable originalPaimonTable = (FileStoreTable) paimonTable.originalTable(); + assertEquals( + originalPaimonTable.schema().fields().size(), + catalogBaseTable.getUnresolvedSchema().getColumns().size()); + } + + @Test + public void testInsertAndQuery() throws Exception { + exec("INSERT INTO %s SELECT 1, 'Lily', 1234567890", TEST_TABLE_NAME); + TableResult tableResult = + exec("select * from %s /*+OPTIONS('monitor-interval'='1s')*/ ", TEST_TABLE_NAME); + + tableResult.await(30, TimeUnit.SECONDS); + + Row actualRow = tableResult.collect().next(); + assertEquals(Row.of(1, "Lily", 1234567890).toString(), actualRow.toString()); + } + + @Test + public void testSwitchCurrentCatalog() { + String memCatalog = "mem_catalog"; + exec("create catalog %s with('type'='generic_in_memory')", memCatalog); + exec( + "create table 
%s.`default`.datagen_table(\n" + + " a int,\n" + + " b varchar" + + ") with(\n" + + " 'connector'='datagen',\n" + + " 'number-of-rows'='1'\n" + + ")", + memCatalog); + TableResult tableResult = exec("select * from mem_catalog.`default`.datagen_table"); + assertNotNull(tableResult.collect().next()); + exec("use catalog %s", memCatalog); + tableResult = exec("select * from datagen_table"); + assertNotNull(tableResult.collect().next()); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkCatalogContext.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkCatalogContext.java new file mode 100644 index 0000000000..86d134322d --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkCatalogContext.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.catalog; + +import static org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions.AMS_URI; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.TABLE_FORMAT; + +import org.apache.amoro.TableFormat; +import org.apache.amoro.TestAms; +import org.apache.amoro.api.CatalogMeta; +import org.apache.amoro.flink.catalog.factories.FlinkUnifiedCatalogFactory; +import org.apache.amoro.hive.TestHMS; +import org.apache.amoro.hive.catalog.HiveCatalogTestHelper; +import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.Schema; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.ResolvedCatalogTable; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.factories.FactoryUtil; +import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; +import org.junit.jupiter.params.provider.Arguments; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Stream; + +public class FlinkCatalogContext { + + static final TestHMS TEST_HMS = new TestHMS(); + static final TestAms TEST_AMS = new TestAms(); + static final FlinkUnifiedCatalogFactory FLINK_UNIFIED_CATALOG_FACTORY = + new FlinkUnifiedCatalogFactory(); + + static ResolvedSchema resolvedSchema = + ResolvedSchema.of( + Column.physical("name", DataTypes.STRING()), Column.physical("age", DataTypes.INT())); + static Schema schema = Schema.newBuilder().fromResolvedSchema(resolvedSchema).build(); + + ObjectPath objectPath = new ObjectPath("default", "test_hive_from_flink"); + + static Stream getFlinkCatalogAndTable() { + return Stream.of( + Arguments.of( + initFlinkCatalog(TableFormat.MIXED_HIVE), + 
generateFlinkTable(TableFormat.MIXED_HIVE.toString()), + TableFormat.MIXED_HIVE), + Arguments.of( + initFlinkCatalog(TableFormat.MIXED_ICEBERG), + generateFlinkTable(TableFormat.MIXED_ICEBERG.toString()), + TableFormat.MIXED_ICEBERG), + Arguments.of( + initFlinkCatalog(TableFormat.ICEBERG), + generateFlinkTable(TableFormat.ICEBERG.toString()), + TableFormat.ICEBERG), + Arguments.of( + initFlinkCatalog(TableFormat.PAIMON), + generateFlinkTable(TableFormat.PAIMON.toString()), + TableFormat.PAIMON)); + } + + static ResolvedCatalogTable generateFlinkTable(String tableFormat) { + return new ResolvedCatalogTable( + CatalogTable.of( + schema, + "Flink managed table", + new ArrayList<>(), + new HashMap() { + { + put(TABLE_FORMAT.key(), tableFormat); + } + }), + resolvedSchema); + } + + void initial() throws Exception { + TEST_HMS.before(); + TEST_AMS.before(); + } + + void close() { + TEST_AMS.after(); + TEST_HMS.after(); + } + + static FlinkUnifiedCatalog initFlinkCatalog(TableFormat tableFormat) { + FlinkUnifiedCatalog flinkUnifiedCatalog; + Map factoryOptions = Maps.newHashMap(); + CatalogMeta meta = + HiveCatalogTestHelper.build(TEST_HMS.getHiveConf(), tableFormat) + .buildCatalogMeta(TEST_HMS.getWareHouseLocation()); + meta.setCatalogName(tableFormat.name().toLowerCase()); + + TEST_AMS.getAmsHandler().dropCatalog(meta.getCatalogName()); + TEST_AMS.getAmsHandler().createCatalog(meta); + + factoryOptions.put(AMS_URI.key(), TEST_AMS.getServerUrl() + "/" + meta.getCatalogName()); + final FactoryUtil.DefaultCatalogContext context = + new FactoryUtil.DefaultCatalogContext( + "FLINK_" + tableFormat, + factoryOptions, + new Configuration(), + FlinkCatalogContext.class.getClassLoader()); + flinkUnifiedCatalog = + (FlinkUnifiedCatalog) FLINK_UNIFIED_CATALOG_FACTORY.createCatalog(context); + flinkUnifiedCatalog.open(); + return flinkUnifiedCatalog; + } + + HiveMetaStoreClient getHMSClient() { + return TEST_HMS.getHiveClient(); + } +} diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkUnifiedCatalogITCase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkUnifiedCatalogITCase.java new file mode 100644 index 0000000000..6e9a654bf5 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkUnifiedCatalogITCase.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.catalog; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.catalog.CatalogTestHelper; +import org.apache.amoro.flink.table.CatalogITCaseBase; +import org.apache.amoro.hive.TestHMS; +import org.apache.amoro.hive.catalog.HiveCatalogTestHelper; +import org.apache.amoro.table.TableIdentifier; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.table.catalog.AbstractCatalog; +import org.apache.flink.table.catalog.Catalog; +import org.apache.flink.table.catalog.CatalogBaseTable; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.flink.types.Row; +import org.junit.After; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.util.Optional; +import java.util.concurrent.TimeUnit; + +@RunWith(value = Parameterized.class) +public class FlinkUnifiedCatalogITCase extends CatalogITCaseBase { + static final TestHMS TEST_HMS = new TestHMS(); + AbstractCatalog flinkCatalog; + TableIdentifier identifier; + + public FlinkUnifiedCatalogITCase(CatalogTestHelper catalogTestHelper) { + super(catalogTestHelper, new BasicTableTestHelper(true, false)); + } + + @Parameterized.Parameters(name = "catalogTestHelper = {0}") + public static Object[][] parameters() { + return new Object[][] { + {new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf())}, + {new HiveCatalogTestHelper(TableFormat.MIXED_ICEBERG, TEST_HMS.getHiveConf())}, + {new HiveCatalogTestHelper(TableFormat.ICEBERG, TEST_HMS.getHiveConf())} + }; + } + + @BeforeClass + public static void beforeAll() throws 
Exception { + TEST_HMS.before(); + } + + @Before + public void setup() throws Exception { + String catalog = "unified_catalog"; + exec("CREATE CATALOG %s WITH ('type'='unified', 'ams.uri'='%s')", catalog, getCatalogUri()); + exec("USE CATALOG %s", catalog); + exec("USE %s", tableTestHelper().id().getDatabase()); + Optional catalogOptional = getTableEnv().getCatalog(catalog); + assertTrue(catalogOptional.isPresent()); + flinkCatalog = (AbstractCatalog) catalogOptional.get(); + assertEquals(catalog, flinkCatalog.getName()); + identifier = tableTestHelper().id(); + } + + @After + public void teardown() { + TEST_HMS.after(); + if (flinkCatalog != null) { + flinkCatalog.close(); + } + } + + @Test + public void testTableExists() throws TableNotExistException { + CatalogBaseTable catalogBaseTable = + flinkCatalog.getTable(new ObjectPath(identifier.getDatabase(), identifier.getTableName())); + assertNotNull(catalogBaseTable); + assertEquals( + tableTestHelper().tableSchema().columns().size(), + catalogBaseTable.getUnresolvedSchema().getColumns().size()); + } + + @Test + public void testInsertAndQuery() throws Exception { + exec( + "INSERT INTO %s SELECT 1, 'Lily', 1234567890, TO_TIMESTAMP('2020-01-01 01:02:03')", + identifier.getTableName()); + TableResult tableResult = + exec("select * from %s /*+OPTIONS('monitor-interval'='1s')*/ ", identifier.getTableName()); + + tableResult.await(30, TimeUnit.SECONDS); + + Row actualRow = tableResult.collect().next(); + assertEquals( + Row.of(1, "Lily", 1234567890L, "2020-01-01T01:02:03").toString(), actualRow.toString()); + } + + @Test + public void testSwitchCurrentCatalog() { + String memCatalog = "mem_catalog"; + exec("create catalog %s with('type'='generic_in_memory')", memCatalog); + exec( + "create table %s.`default`.datagen_table(\n" + + " a int,\n" + + " b varchar" + + ") with(\n" + + " 'connector'='datagen',\n" + + " 'number-of-rows'='1'\n" + + ")", + memCatalog); + TableResult tableResult = exec("select * from 
mem_catalog.`default`.datagen_table"); + assertNotNull(tableResult.collect().next()); + exec("use catalog %s", memCatalog); + tableResult = exec("select * from datagen_table"); + assertNotNull(tableResult.collect().next()); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestFlinkUnifiedCatalogs.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestFlinkUnifiedCatalogs.java new file mode 100644 index 0000000000..4536db0c86 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestFlinkUnifiedCatalogs.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.catalog; + +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.TABLE_FORMAT; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.apache.amoro.TableFormat; +import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.Schema; +import org.apache.flink.table.catalog.CatalogBaseTable; +import org.apache.flink.table.catalog.CatalogDatabaseImpl; +import org.apache.flink.table.catalog.CatalogTable; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.ResolvedCatalogTable; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; +import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; +import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; +import org.apache.flink.table.catalog.exceptions.TableNotExistException; +import org.apache.thrift.TException; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +class TestFlinkUnifiedCatalogs { + static FlinkCatalogContext flinkCatalogContext = new FlinkCatalogContext(); + + @BeforeAll + public static void setupCatalogMeta() throws Exception { + flinkCatalogContext.initial(); + } + + @AfterAll + public static void tearDown() { + flinkCatalogContext.close(); + } + + @ParameterizedTest + 
@MethodSource("org.apache.amoro.flink.catalog.FlinkCatalogContext#getFlinkCatalogAndTable") + void testListDatabases(FlinkUnifiedCatalog flinkUnifiedCatalog) throws TException { + List expects = flinkCatalogContext.getHMSClient().getAllDatabases(); + assertEquals(expects, flinkUnifiedCatalog.listDatabases()); + } + + @ParameterizedTest + @MethodSource("org.apache.amoro.flink.catalog.FlinkCatalogContext#getFlinkCatalogAndTable") + void testDatabaseExists(FlinkUnifiedCatalog flinkUnifiedCatalog) { + assertTrue(flinkUnifiedCatalog.databaseExists("default")); + assertFalse(flinkUnifiedCatalog.databaseExists("not_exists_db")); + } + + @ParameterizedTest + @MethodSource("org.apache.amoro.flink.catalog.FlinkCatalogContext#getFlinkCatalogAndTable") + void testCreateAndDropDatabase(FlinkUnifiedCatalog flinkUnifiedCatalog) + throws DatabaseAlreadyExistException, DatabaseNotEmptyException, DatabaseNotExistException { + flinkUnifiedCatalog.createDatabase( + "test", new CatalogDatabaseImpl(Collections.emptyMap(), "test"), false); + assertTrue(flinkUnifiedCatalog.databaseExists("test")); + + flinkUnifiedCatalog.dropDatabase("test", false); + assertFalse(flinkUnifiedCatalog.databaseExists("test")); + } + + @ParameterizedTest + @MethodSource("org.apache.amoro.flink.catalog.FlinkCatalogContext#getFlinkCatalogAndTable") + void testAlterDatabase( + FlinkUnifiedCatalog flinkUnifiedCatalog, CatalogTable table, TableFormat tableFormat) + throws DatabaseNotExistException { + try { + flinkUnifiedCatalog.alterDatabase( + "default", new CatalogDatabaseImpl(Collections.emptyMap(), "default"), false); + } catch (UnsupportedOperationException e) { + // Mixed-format,Iceberg and paimon catalog does not support altering database. 
+ if (!tableFormat.in( + TableFormat.MIXED_HIVE, + TableFormat.MIXED_ICEBERG, + TableFormat.ICEBERG, + TableFormat.PAIMON)) { + throw e; + } + } + } + + @ParameterizedTest + @MethodSource("org.apache.amoro.flink.catalog.FlinkCatalogContext#getFlinkCatalogAndTable") + void testCreateGetAndDropTable( + FlinkUnifiedCatalog flinkUnifiedCatalog, CatalogTable table, TableFormat tableFormat) + throws TableAlreadyExistException, DatabaseNotExistException, TableNotExistException { + ObjectPath objectPath = flinkCatalogContext.objectPath; + + flinkUnifiedCatalog.createTable(flinkCatalogContext.objectPath, table, false); + assertTrue(flinkUnifiedCatalog.tableExists(objectPath)); + + CatalogBaseTable actualTable = flinkUnifiedCatalog.getTable(objectPath); + assertEquals(table.getUnresolvedSchema(), actualTable.getUnresolvedSchema()); + assertEquals(tableFormat.toString(), actualTable.getOptions().get(TABLE_FORMAT.key())); + + flinkUnifiedCatalog.dropTable(objectPath, false); + assertFalse(flinkUnifiedCatalog.tableExists(objectPath)); + } + + @ParameterizedTest + @MethodSource("org.apache.amoro.flink.catalog.FlinkCatalogContext#getFlinkCatalogAndTable") + void testAlterTable( + FlinkUnifiedCatalog flinkUnifiedCatalog, CatalogTable table, TableFormat tableFormat) + throws TableNotExistException, TableAlreadyExistException, DatabaseNotExistException { + try { + flinkUnifiedCatalog.createTable(flinkCatalogContext.objectPath, table, true); + + ResolvedSchema newResolvedSchema = + ResolvedSchema.of( + Column.physical("name", DataTypes.STRING()), + Column.physical("age", DataTypes.INT()), + Column.physical("address", DataTypes.STRING())); + String comment = "Flink new Table"; + Map newProperties = Maps.newHashMap(); + newProperties.put("new_key", "new_value"); + + CatalogBaseTable newTable = + new ResolvedCatalogTable( + CatalogTable.of( + Schema.newBuilder().fromResolvedSchema(newResolvedSchema).build(), + comment, + new ArrayList<>(), + newProperties), + newResolvedSchema); + try { 
+ flinkUnifiedCatalog.alterTable(flinkCatalogContext.objectPath, newTable, false); + } catch (UnsupportedOperationException e) { + // https://github.com/apache/amoro/issues/2 altering Mixed format table is not supported. + // Altering Iceberg schema is also not supported yet. + if (!tableFormat.in( + TableFormat.MIXED_ICEBERG, TableFormat.MIXED_HIVE, TableFormat.ICEBERG)) { + throw e; + } + } + } finally { + flinkUnifiedCatalog.dropTable(flinkCatalogContext.objectPath, true); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestMixedCatalog.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestMixedCatalog.java new file mode 100644 index 0000000000..5e4bb582ba --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestMixedCatalog.java @@ -0,0 +1,589 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.catalog; + +import static org.apache.amoro.flink.FlinkSchemaUtil.COMPUTED_COLUMNS; +import static org.apache.amoro.flink.FlinkSchemaUtil.FLINK_PREFIX; +import static org.apache.amoro.flink.FlinkSchemaUtil.WATERMARK; +import static org.apache.flink.table.api.config.TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED; +import static org.apache.flink.table.descriptors.DescriptorProperties.DATA_TYPE; +import static org.apache.flink.table.descriptors.DescriptorProperties.EXPR; +import static org.apache.flink.table.descriptors.DescriptorProperties.NAME; +import static org.apache.flink.table.descriptors.DescriptorProperties.WATERMARK_ROWTIME; +import static org.apache.flink.table.descriptors.DescriptorProperties.WATERMARK_STRATEGY_DATA_TYPE; +import static org.apache.flink.table.descriptors.DescriptorProperties.WATERMARK_STRATEGY_EXPR; + +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.catalog.CatalogTestBase; +import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions; +import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; +import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; +import org.apache.amoro.table.MixedTable; +import org.apache.amoro.table.TableIdentifier; +import org.apache.flink.api.common.restartstrategy.RestartStrategies; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.state.StateBackend; +import org.apache.flink.runtime.state.filesystem.FsStateBackend; +import org.apache.flink.streaming.api.CheckpointingMode; +import org.apache.flink.streaming.api.environment.CheckpointConfig; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableResult; +import 
org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.flink.util.CollectionUtil; +import org.apache.iceberg.flink.MiniClusterResource; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Test cases for mixed catalog factories, including: + * CatalogFactoryOptions.MIXED_ICEBERG_IDENTIFIER, CatalogFactoryOptions.MIXED_HIVE_IDENTIFIER, + * CatalogFactoryOptions.LEGACY_MIXED_IDENTIFIER + */ +@RunWith(value = Parameterized.class) +public class TestMixedCatalog extends CatalogTestBase { + private String catalogName; + private String catalogFactoryType; + private static final Logger LOG = LoggerFactory.getLogger(TestMixedCatalog.class); + + public TestMixedCatalog(String catalogFactoryType) { + super(new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG)); + this.catalogFactoryType = catalogFactoryType; + this.catalogName = catalogFactoryType + "_catalog"; + } + + @Parameterized.Parameters(name = "catalogFactoryType = {0}") + public static Object[] parameters() { + return new Object[] { + CatalogFactoryOptions.MIXED_ICEBERG_IDENTIFIER, CatalogFactoryOptions.MIXED_HIVE_IDENTIFIER + }; + } + + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); + protected Map props; + + private static final String DB = TableTestHelper.TEST_DB_NAME; + private static final String TABLE = TableTestHelper.TEST_TABLE_NAME; + private volatile StreamExecutionEnvironment env = null; + private volatile 
StreamTableEnvironment tEnv = null; + + @Before + public void before() throws Exception { + props = Maps.newHashMap(); + props.put("type", catalogFactoryType); + props.put(CatalogFactoryOptions.AMS_URI.key(), getCatalogUri()); + sql("CREATE CATALOG " + catalogName + " WITH %s", toWithClause(props)); + sql("USE CATALOG " + catalogName); + sql("CREATE DATABASE " + catalogName + "." + DB); + } + + @After + public void after() { + sql("DROP TABLE IF EXISTS " + catalogName + "." + DB + "." + TABLE); + sql("DROP DATABASE IF EXISTS " + catalogName + "." + DB); + Assert.assertTrue(CollectionUtil.isNullOrEmpty(getMixedFormatCatalog().listDatabases())); + sql("USE CATALOG default_catalog"); + sql("DROP CATALOG " + catalogName); + } + + @Test + public void testMixedCatalog() { + String[] catalogs = getTableEnv().listCatalogs(); + Assert.assertArrayEquals( + Arrays.stream(catalogs).sorted().toArray(), + Stream.of("default_catalog", catalogName).sorted().toArray()); + } + + @Test + public void testDDL() { + sql( + "CREATE TABLE " + + catalogName + + "." + + DB + + "." + + TABLE + + " (" + + " id INT," + + " name STRING," + + " t TIMESTAMP," + + " PRIMARY KEY (id) NOT ENFORCED " + + ") PARTITIONED BY(t) "); + sql("USE " + catalogName + "." + DB); + sql("SHOW tables"); + + Assert.assertTrue( + getMixedFormatCatalog() + .loadTable(TableIdentifier.of(catalogName, DB, TABLE)) + .isKeyedTable()); + } + + @Test + public void testComputeIndex() { + // if compute column before any physical column, will throw exception. + Assert.assertThrows( + org.apache.flink.table.api.TableException.class, + () -> + sql( + "CREATE TABLE " + + catalogName + + "." + + DB + + "." + + TABLE + + " (" + + " id INT," + + " compute_id as id+5 ," + + " proc as PROCTIME() ," + + " name STRING" + + ") ")); + + // compute column must come after all the physical columns + sql( + "CREATE TABLE " + + catalogName + + "." + + DB + + "." 
+ + TABLE + + " (" + + " id INT," + + " proc as PROCTIME() " + + ") "); + } + + @Test + public void testDDLWithVirtualColumn() throws IOException { + // create mixed-format table with compute columns and watermark under mixed-format catalog + // org.apache.iceberg.flink.TypeToFlinkType will convert Timestamp to Timestamp(6), so we cast + // datatype manually + sql( + "CREATE TABLE " + + catalogName + + "." + + DB + + "." + + TABLE + + " (" + + " id INT," + + " name STRING," + + " t TIMESTAMP," + + " t3 as cast(t as TIMESTAMP(3))," + + " compute_id as id+5 ," + + " proc as PROCTIME() ," + + " watermark FOR t3 AS t3 - INTERVAL '5' SECOND, " + + " PRIMARY KEY (id) NOT ENFORCED " + + ") PARTITIONED BY(t) "); + + Map properties = + getMixedFormatCatalog().loadTable(TableIdentifier.of(catalogName, DB, TABLE)).properties(); + + // index for compute columns + int[] computedIndex = {1, 2, 3}; + Arrays.stream(computedIndex) + .forEach( + x -> { + Assert.assertTrue( + properties.containsKey(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, x, NAME))); + Assert.assertTrue( + properties.containsKey(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, x, EXPR))); + Assert.assertTrue( + properties.containsKey( + compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, x, DATA_TYPE))); + }); + + Assert.assertTrue( + properties.containsKey(compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_ROWTIME))); + Assert.assertTrue( + properties.containsKey(compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_STRATEGY_EXPR))); + Assert.assertTrue( + properties.containsKey(compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_STRATEGY_DATA_TYPE))); + + List result = sql("DESC " + catalogName + "." + DB + "." + TABLE + ""); + Assert.assertEquals(6, result.size()); + } + + @Test + public void testDMLWithVirtualColumn() throws IOException { + // create mixed-format table with compute columns under mixed-format catalog + sql( + "CREATE TABLE " + + catalogName + + "." + + DB + + "." 
+ + TABLE + + " (" + + " id INT," + + " t TIMESTAMP(6)," + + " compute_id as id+5 ," + + " proc as PROCTIME(), " + + " PRIMARY KEY (id) NOT ENFORCED " + + ") PARTITIONED BY(t) "); + + // insert values into mixed-format table + insertValue(); + + // select from mixed-format table with compute columns under mixed-format catalog + List rows = + sql( + "SELECT * FROM " + + catalogName + + "." + + DB + + "." + + TABLE + + " /*+ OPTIONS(" + + "'streaming'='false'" + + ") */"); + checkRows(rows); + } + + @Test + public void testReadNotMatchColumn() throws IOException { + // create mixed-format table with compute columns under mixed-format catalog + sql( + "CREATE TABLE " + + catalogName + + "." + + DB + + "." + + TABLE + + " (" + + " id INT," + + " t TIMESTAMP(6)," + + " proc as PROCTIME(), " + + " compute_id as id+5 ," + + " PRIMARY KEY (id) NOT ENFORCED " + + ") PARTITIONED BY(t) "); + + MixedTable amoroTable = + getMixedFormatCatalog().loadTable(TableIdentifier.of(catalogName, DB, TABLE)); + String beforeExpr = + amoroTable.properties().get(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, 2, EXPR)); + // change property "flink.computed-column.2.expr" from "`id` +5" to "`newId` +5" + String afterExpr = "`newId` +5"; + amoroTable + .updateProperties() + .set(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, 2, EXPR), afterExpr) + .commit(); + + Assert.assertNotEquals( + beforeExpr, + amoroTable.properties().get(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, 2, EXPR))); + + // property for expr do not match any columns in amoro, will throw exception. + Assert.assertThrows( + IllegalStateException.class, + () -> sql("DESC " + catalogName + "." + DB + "." + TABLE + "")); + amoroTable + .updateProperties() + .set(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, 2, EXPR), beforeExpr) + .commit(); + + // can get table normally + sql("DESC " + catalogName + "." + DB + "." 
+ TABLE + ""); + } + + @Test + public void testDML() throws IOException { + sql( + "CREATE TABLE default_catalog.default_database." + + TABLE + + " (" + + " id INT," + + " name STRING," + + " t TIMESTAMP," + + " PRIMARY KEY (id) NOT ENFORCED " + + ") PARTITIONED BY(t) " + + " WITH (" + + " 'connector' = 'datagen'," + + " 'fields.id.kind'='sequence'," + + " 'fields.id.start'='1'," + + " 'fields.id.end'='1'" + + ")"); + sql( + "CREATE TABLE " + + catalogName + + "." + + DB + + "." + + TABLE + + " (" + + " id INT," + + " name STRING," + + " t TIMESTAMP," + + " PRIMARY KEY (id) NOT ENFORCED " + + ") PARTITIONED BY(t) "); + + sql( + "INSERT INTO " + + catalogName + + "." + + DB + + "." + + TABLE + + " SELECT * FROM default_catalog.default_database." + + TABLE); + List rows = + sql( + "SELECT * FROM " + + catalogName + + "." + + DB + + "." + + TABLE + + " /*+ OPTIONS(" + + "'streaming'='false'" + + ") */"); + Assert.assertEquals(1, rows.size()); + + sql("DROP TABLE default_catalog.default_database." + TABLE); + } + + private void checkRows(List rows) { + Assert.assertEquals(1, rows.size()); + int id = (int) rows.get(0).getField("id"); + int computeId = (int) rows.get(0).getField("compute_id"); + Assert.assertEquals(1, id); + // computeId should be id+5 + Assert.assertEquals(id + 5, computeId); + Assert.assertEquals(4, rows.get(0).getFieldNames(true).size()); + } + + protected List sql(String query, Object... 
args) { + TableResult tableResult = getTableEnv().executeSql(String.format(query, args)); + tableResult + .getJobClient() + .ifPresent( + c -> { + try { + c.getJobExecutionResult().get(); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException(e); + } + }); + try (CloseableIterator iter = tableResult.collect()) { + List results = Lists.newArrayList(iter); + return results; + } catch (Exception e) { + LOG.warn("Failed to collect table result", e); + return null; + } + } + + protected StreamTableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + if (tEnv == null) { + this.tEnv = + StreamTableEnvironment.create( + getEnv(), EnvironmentSettings.newInstance().inStreamingMode().build()); + Configuration configuration = tEnv.getConfig().getConfiguration(); + // set low-level key-value options + configuration.setString(TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED.key(), "true"); + } + } + } + return tEnv; + } + + protected StreamExecutionEnvironment getEnv() { + if (env == null) { + synchronized (this) { + if (env == null) { + StateBackend backend = + new FsStateBackend( + "file:///" + System.getProperty("java.io.tmpdir") + "/flink/backend"); + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + env.setParallelism(1); + env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE); + env.getCheckpointConfig().setCheckpointInterval(300); + env.getCheckpointConfig() + .enableExternalizedCheckpoints( + CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); + env.setStateBackend(backend); + env.setRestartStrategy(RestartStrategies.noRestart()); + } + } + } + return env; + } + + public static String toWithClause(Map props) { + StringBuilder builder = new StringBuilder(); + builder.append("("); + int propCount = 0; + for (Map.Entry entry : props.entrySet()) { + if (propCount > 0) { + builder.append(","); + } + builder + .append("'") 
+ .append(entry.getKey()) + .append("'") + .append("=") + .append("'") + .append(entry.getValue()) + .append("'"); + propCount++; + } + builder.append(")"); + return builder.toString(); + } + + private String compoundKey(Object... components) { + return Stream.of(components).map(Object::toString).collect(Collectors.joining(".")); + } + + private void insertValue() { + sql( + "CREATE TABLE default_catalog.default_database." + + TABLE + + " (" + + " id INT," + + " t TIMESTAMP," + + " PRIMARY KEY (id) NOT ENFORCED " + + ") PARTITIONED BY(t) " + + " WITH (" + + " 'connector' = 'datagen'," + + " 'fields.id.kind'='sequence'," + + " 'fields.id.start'='1'," + + " 'fields.id.end'='1'" + + ")"); + + sql( + "INSERT INTO " + + catalogName + + "." + + DB + + "." + + TABLE + + " SELECT * FROM default_catalog.default_database." + + TABLE); + + sql("DROP TABLE default_catalog.default_database." + TABLE); + } + + @Test + public void testAlterUnKeyTable() throws Exception { + sql( + "CREATE TABLE " + + catalogName + + "." + + DB + + "." + + TABLE + + " (" + + " id INT," + + " name STRING," + + " t TIMESTAMP" + + ") PARTITIONED BY(t) " + + " WITH (" + + " 'self-optimizing.enabled' = 'false'" + + ")"); + + sql( + "ALTER TABLE " + + catalogName + + "." + + DB + + "." + + TABLE + + " " + + "SET ( 'write.metadata.delete-after-commit.enabled' = 'false')"); + Map unKeyTableProperties = + getMixedFormatCatalog().loadTable(TableIdentifier.of(catalogName, DB, TABLE)).properties(); + Assert.assertEquals( + unKeyTableProperties.get("write.metadata.delete-after-commit.enabled"), "false"); + } + + @Test + public void testAlterKeyTable() throws Exception { + sql( + "CREATE TABLE " + + catalogName + + "." + + DB + + "." + + TABLE + + " (" + + " id INT," + + " name STRING," + + " t TIMESTAMP," + + " PRIMARY KEY (id) NOT ENFORCED " + + ") PARTITIONED BY(t) "); + sql( + "ALTER TABLE " + + catalogName + + "." + + DB + + "." 
+ + TABLE + + " " + + "SET ( 'self-optimizing.group' = 'flink')"); + sql( + "ALTER TABLE " + + catalogName + + "." + + DB + + "." + + TABLE + + " " + + "SET ( 'self-optimizing.enabled' = 'true')"); + + sql( + "ALTER TABLE " + + catalogName + + "." + + DB + + "." + + TABLE + + " " + + "SET ( 'write.upsert.enabled' = 'true')"); + + Map keyTableProperties = + getMixedFormatCatalog().loadTable(TableIdentifier.of(catalogName, DB, TABLE)).properties(); + Assert.assertEquals(keyTableProperties.get("self-optimizing.enabled"), "true"); + Assert.assertEquals(keyTableProperties.get("self-optimizing.group"), "flink"); + Assert.assertEquals(keyTableProperties.get("write.upsert.enabled"), "true"); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestMixedCatalogTablePartitions.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestMixedCatalogTablePartitions.java new file mode 100644 index 0000000000..e951acce03 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestMixedCatalogTablePartitions.java @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.catalog; + +import static java.util.Collections.singletonList; +import static org.apache.flink.table.api.Expressions.$; +import static org.apache.flink.table.expressions.ApiExpressionUtils.valueLiteral; +import static org.apache.flink.table.functions.BuiltInFunctionDefinitions.EQUALS; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.flink.util.DataUtil; +import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableMap; +import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; +import org.apache.flink.streaming.api.datastream.DataStreamSource; +import org.apache.flink.table.api.ApiExpression; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.catalog.CatalogPartitionSpec; +import org.apache.flink.table.catalog.ObjectPath; +import org.apache.flink.table.catalog.exceptions.PartitionSpecInvalidException; +import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.expressions.CallExpression; +import org.apache.flink.table.expressions.FieldReferenceExpression; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.types.RowKind; +import org.junit.Assert; +import org.junit.Test; + +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +public class TestMixedCatalogTablePartitions extends FlinkTestBase { + private final String tableName = "test_partition_table"; + private final String db = "test_partition_db"; + + public TestMixedCatalogTablePartitions() { 
+ super( + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, true)); + } + + public void before() throws Exception { + super.before(); + super.config(); + } + + @Test + public void testListPartitionsUnKeyedTable() throws TableNotPartitionedException { + List data = new LinkedList<>(); + data.add(new Object[] {1, "mark", "2023-10-01"}); + data.add(new Object[] {2, "Gerry", "2023-10-02"}); + + List rows = DataUtil.toRows(data); + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("dt", DataTypes.STRING())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixedCatalog WITH %s", toWithClause(props)); + + sql( + "CREATE TABLE IF NOT EXISTS mixedCatalog." + + db + + "." + + tableName + + "(" + + " id INT, name STRING, dt STRING) PARTITIONED BY (dt)"); + + sql("INSERT INTO %s select * from input", "mixedCatalog." + db + "." 
+ tableName); + ObjectPath objectPath = new ObjectPath(db, tableName); + MixedCatalog mixedCatalog = (MixedCatalog) getTableEnv().getCatalog("mixedCatalog").get(); + List list = mixedCatalog.listPartitions(objectPath); + + List expected = Lists.newArrayList(); + CatalogPartitionSpec partitionSpec1 = + new CatalogPartitionSpec(ImmutableMap.of("dt", "2023-10-01")); + CatalogPartitionSpec partitionSpec2 = + new CatalogPartitionSpec(ImmutableMap.of("dt", "2023-10-02")); + expected.add(partitionSpec1); + expected.add(partitionSpec2); + Assert.assertEquals("Should produce the expected catalog partition specs.", list, expected); + } + + @Test + public void testListPartitionsKeyedTable() throws TableNotPartitionedException { + List data = new LinkedList<>(); + data.add(new Object[] {1, "mark", "2023-10-01"}); + data.add(new Object[] {2, "Gerry", "2023-10-02"}); + data.add(new Object[] {RowKind.DELETE, 2, "Gerry", "2023-10-02"}); + + DataStreamSource rowData = + getEnv() + .fromCollection( + DataUtil.toRowData(data), + InternalTypeInfo.ofFields( + DataTypes.INT().getLogicalType(), + DataTypes.VARCHAR(100).getLogicalType(), + DataTypes.VARCHAR(100).getLogicalType())); + Table input = getTableEnv().fromDataStream(rowData, $("id"), $("name"), $("dt")); + + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixedCatalog WITH %s", toWithClause(props)); + + sql( + "CREATE TABLE IF NOT EXISTS mixedCatalog." + + db + + "." + + tableName + + "(" + + " id INT, name STRING, dt STRING, PRIMARY KEY (id) NOT ENFORCED) PARTITIONED BY (dt)"); + + sql("INSERT INTO %s select * from input", "mixedCatalog." + db + "." 
+ tableName); + ObjectPath objectPath = new ObjectPath(db, tableName); + MixedCatalog mixedCatalog = (MixedCatalog) getTableEnv().getCatalog("mixedCatalog").get(); + List partitionList = mixedCatalog.listPartitions(objectPath); + + List expected = Lists.newArrayList(); + CatalogPartitionSpec partitionSpec1 = + new CatalogPartitionSpec(ImmutableMap.of("dt", "2023-10-01")); + CatalogPartitionSpec partitionSpec2 = + new CatalogPartitionSpec(ImmutableMap.of("dt", "2023-10-02")); + expected.add(partitionSpec1); + expected.add(partitionSpec2); + Assert.assertEquals( + "Should produce the expected catalog partition specs.", partitionList, expected); + } + + @Test + public void testListPartitionsByFilter() + throws TableNotPartitionedException, PartitionSpecInvalidException { + List data = new LinkedList<>(); + data.add(new Object[] {1, "mark", "2023-10-01"}); + data.add(new Object[] {2, "Gerry", "2023-10-02"}); + data.add(new Object[] {2, "mark", "2023-10-02"}); + data.add(new Object[] {2, "Gerry", "2023-10-01"}); + data.add(new Object[] {RowKind.DELETE, 2, "Gerry", "2023-10-02"}); + + DataStreamSource rowData = + getEnv() + .fromCollection( + DataUtil.toRowData(data), + InternalTypeInfo.ofFields( + DataTypes.INT().getLogicalType(), + DataTypes.VARCHAR(100).getLogicalType(), + DataTypes.VARCHAR(100).getLogicalType())); + Table input = getTableEnv().fromDataStream(rowData, $("id"), $("name"), $("dt")); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixedCatalog WITH %s", toWithClause(props)); + sql( + "CREATE TABLE IF NOT EXISTS mixedCatalog." + + db + + "." + + tableName + + "(" + + " id INT, name STRING, dt STRING) PARTITIONED BY (dt,name)"); + sql("INSERT INTO %s select * from input", "mixedCatalog." + db + "." 
+ tableName); + + ResolvedExpression dtRef = new FieldReferenceExpression("dt", DataTypes.STRING(), 0, 3); + CallExpression callExpression = + CallExpression.permanent( + EQUALS, + Arrays.asList(dtRef, valueLiteral("2023-10-01", DataTypes.STRING().notNull())), + DataTypes.BOOLEAN()); + + ObjectPath objectPath = new ObjectPath(db, tableName); + MixedCatalog mixedCatalog = (MixedCatalog) getTableEnv().getCatalog("mixedCatalog").get(); + List list = + mixedCatalog.listPartitionsByFilter(objectPath, singletonList(callExpression)); + + List expected = Lists.newArrayList(); + CatalogPartitionSpec partitionSpec1 = + new CatalogPartitionSpec(ImmutableMap.of("dt", "2023-10-01", "name", "Gerry")); + CatalogPartitionSpec partitionSpec2 = + new CatalogPartitionSpec(ImmutableMap.of("dt", "2023-10-01", "name", "mark")); + expected.add(partitionSpec1); + expected.add(partitionSpec2); + Assert.assertEquals("Should produce the expected catalog partition specs.", list, expected); + + List listCatalogPartitionSpec = + mixedCatalog.listPartitions( + objectPath, + new CatalogPartitionSpec(ImmutableMap.of("dt", "2023-10-01", "name", "Gerry"))); + Assert.assertEquals( + "Should produce the expected catalog partition specs.", listCatalogPartitionSpec.size(), 1); + + try { + mixedCatalog.listPartitions( + objectPath, + new CatalogPartitionSpec(ImmutableMap.of("dt", "2023-10-01", "name1", "Gerry"))); + } catch (Exception e) { + Assert.assertTrue(e instanceof PartitionSpecInvalidException); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaConfigGenerate.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaConfigGenerate.java new file mode 100644 index 0000000000..9b24642735 --- /dev/null +++ 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaConfigGenerate.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.kafka.testutils; + +import static org.apache.kafka.clients.CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG; + +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.common.serialization.ByteArrayDeserializer; +import org.apache.kafka.common.serialization.ByteArraySerializer; +import org.apache.kafka.common.serialization.StringDeserializer; +import org.apache.kafka.common.serialization.StringSerializer; + +import java.util.Properties; + +public interface KafkaConfigGenerate { + + static Properties getProperties() { + Properties properties = new Properties(); + properties.put( + BOOTSTRAP_SERVERS_CONFIG, KafkaContainerTest.KAFKA_CONTAINER.getBootstrapServers()); + properties.put(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, "true"); + properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class); + properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, 
StringSerializer.class); + properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); + properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); + return properties; + } + + static Properties getProperties(Properties properties) { + properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class); + properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class); + properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); + properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); + return properties; + } + + static Properties getPropertiesWithByteArray() { + Properties properties = new Properties(); + properties.put( + BOOTSTRAP_SERVERS_CONFIG, KafkaContainerTest.KAFKA_CONTAINER.getBootstrapServers()); + properties.put(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, "true"); + properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class); + properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class); + properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class); + properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class); + return properties; + } + + static Properties getPropertiesWithByteArray(Properties properties) { + properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class); + properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class); + properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class); + properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class); + return properties; + } + + static Properties getStandardProperties(Properties properties) { + properties.put(ConsumerConfig.GROUP_ID_CONFIG, "mixed-format-tests"); + 
properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); + properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); // read from the beginning. + properties.put("max.partition.fetch.bytes", "256"); + return properties; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaContainerTest.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaContainerTest.java new file mode 100644 index 0000000000..609158255a --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaContainerTest.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.kafka.testutils; + +import static org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate.getProperties; +import static org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate.getPropertiesWithByteArray; +import static org.apache.amoro.table.TableProperties.LOG_STORE_MESSAGE_TOPIC; +import static org.apache.kafka.clients.CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG; + +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.NewTopic; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.consumer.ConsumerRecords; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.common.TopicPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.testcontainers.containers.KafkaContainer; +import org.testcontainers.containers.Network; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.time.Duration; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.stream.Collectors; + +@Testcontainers +public class KafkaContainerTest { + private static final Logger LOG = LoggerFactory.getLogger(KafkaContainerTest.class); + public static String INTER_CONTAINER_KAFKA_ALIAS = "kafka"; + public static Network NETWORK = Network.newNetwork(); + public static String KAFKA = "confluentinc/cp-kafka:7.2.6"; + + @Container + public static KafkaContainer KAFKA_CONTAINER = + KafkaUtil.createKafkaContainer(KAFKA, LOG) + .withStartupTimeout(Duration.ofSeconds(120L)) + .withSharedMemorySize(134217728L) + .withEmbeddedZookeeper() + .withNetwork(NETWORK) + .withNetworkAliases(INTER_CONTAINER_KAFKA_ALIAS); + + public static ConsumerRecords readRecords(String topic) { 
+ Properties properties = getProperties(); + properties.put(ConsumerConfig.ISOLATION_LEVEL_CONFIG, "read_committed"); + KafkaConsumer consumer = new KafkaConsumer<>(properties); + consumer.assign( + consumer.partitionsFor(topic).stream() + .map(partitionInfo -> new TopicPartition(topic, partitionInfo.partition())) + .collect(Collectors.toSet())); + consumer.seekToBeginning(consumer.assignment()); + return consumer.poll(Duration.ofMillis(1000)); + } + + public static ConsumerRecords readRecordsBytes(String topic) { + return (ConsumerRecords) readRecords(topic, getPropertiesWithByteArray()); + } + + public static ConsumerRecords readRecords(String topic, Properties properties) { + properties.put(ConsumerConfig.ISOLATION_LEVEL_CONFIG, "read_committed"); + KafkaConsumer consumer = new KafkaConsumer<>(properties); + consumer.assign( + consumer.partitionsFor(topic).stream() + .map(partitionInfo -> new TopicPartition(topic, partitionInfo.partition())) + .collect(Collectors.toSet())); + consumer.seekToBeginning(consumer.assignment()); + return consumer.poll(Duration.ofMillis(1000)); + } + + public static Integer countAllRecords(String topic, Properties properties) { + properties.put(ConsumerConfig.ISOLATION_LEVEL_CONFIG, "read_committed"); + return KafkaUtil.drainAllRecordsFromTopic(topic, properties).size(); + } + + public static void createTopics(int numPartitions, int replicationFactor, String... topics) { + List newTopics = + Arrays.stream(topics) + .map(topic -> new NewTopic(topic, numPartitions, (short) replicationFactor)) + .collect(Collectors.toList()); + Map params = new HashMap<>(); + params.put(BOOTSTRAP_SERVERS_CONFIG, KAFKA_CONTAINER.getBootstrapServers()); + try (AdminClient admin = AdminClient.create(params)) { + admin.createTopics(newTopics); + } + } + + public static void deleteTopics(String... 
topics) { + Map params = new HashMap<>(); + params.put(BOOTSTRAP_SERVERS_CONFIG, KAFKA_CONTAINER.getBootstrapServers()); + try (AdminClient admin = AdminClient.create(params)) { + admin.deleteTopics(Arrays.asList(topics)); + } + } + + public static Properties getPropertiesByTopic(String topic) { + Properties properties = getPropertiesWithByteArray(getProperties()); + properties.put(LOG_STORE_MESSAGE_TOPIC, topic); + properties.put(ProducerConfig.ACKS_CONFIG, "all"); + return properties; + } + + public static List getPartitionsForTopic(String topic) { + Properties properties = getProperties(); + KafkaConsumer consumer = new KafkaConsumer<>(properties); + return consumer.partitionsFor(topic).stream() + .map(pi -> new TopicPartition(pi.topic(), pi.partition())) + .collect(Collectors.toList()); + } + + public static KafkaProducer getProducer() { + Properties properties = getPropertiesWithByteArray(); + KafkaProducer producer = new KafkaProducer<>(properties); + return producer; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaUtil.java new file mode 100644 index 0000000000..a97f7d0835 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaUtil.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.kafka.testutils; + +import org.apache.flink.util.StringUtils; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.ConsumerRecords; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.common.KafkaException; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.serialization.ByteArrayDeserializer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.testcontainers.containers.KafkaContainer; +import org.testcontainers.containers.output.Slf4jLogConsumer; +import org.testcontainers.utility.DockerImageName; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.stream.Collectors; + +/** Collection of methods to interact with a Kafka cluster. */ +public class KafkaUtil { + + private static final Logger LOG = LoggerFactory.getLogger(KafkaUtil.class); + private static final Duration CONSUMER_POLL_DURATION = Duration.ofSeconds(1); + + private KafkaUtil() {} + + /** + * This method helps to set commonly used Kafka configurations and aligns the internal Kafka log + * levels with the ones used by the capturing logger. 
+ * + * @param dockerImageVersion describing the Kafka image + * @param logger to derive the log level from + * @return configured Kafka container + */ + public static KafkaContainer createKafkaContainer(String dockerImageVersion, Logger logger) { + return createKafkaContainer(dockerImageVersion, logger, null); + } + + /** + * This method helps to set commonly used Kafka configurations and aligns the internal Kafka log + * levels with the ones used by the capturing logger, and set the prefix of logger. + */ + public static KafkaContainer createKafkaContainer( + String dockerImageVersion, Logger logger, String loggerPrefix) { + String logLevel; + if (logger.isTraceEnabled()) { + logLevel = "TRACE"; + } else if (logger.isDebugEnabled()) { + logLevel = "DEBUG"; + } else if (logger.isInfoEnabled()) { + logLevel = "INFO"; + } else if (logger.isWarnEnabled()) { + logLevel = "WARN"; + } else if (logger.isErrorEnabled()) { + logLevel = "ERROR"; + } else { + logLevel = "OFF"; + } + + Slf4jLogConsumer logConsumer = new Slf4jLogConsumer(logger); + if (!StringUtils.isNullOrWhitespaceOnly(loggerPrefix)) { + logConsumer.withPrefix(loggerPrefix); + } + return new KafkaContainer(DockerImageName.parse(dockerImageVersion)) + .withEnv("KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR", "1") + .withEnv("KAFKA_TRANSACTION_STATE_LOG_MIN_ISR", "1") + .withEnv("KAFKA_CONFLUENT_SUPPORT_METRICS_ENABLE", "false") + .withEnv("KAFKA_LOG4J_ROOT_LOGLEVEL", logLevel) + .withEnv("KAFKA_LOG4J_LOGGERS", "state.change.logger=" + logLevel) + .withEnv("KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR", "1") + .withEnv("KAFKA_TRANSACTION_STATE_LOG_MIN_ISR", "1") + .withEnv("KAFKA_CONFLUENT_SUPPORT_METRICS_ENABLE", "false") + .withEnv("KAFKA_TRANSACTION_MAX_TIMEOUT_MS", String.valueOf(Duration.ofHours(2).toMillis())) + .withEnv("KAFKA_LOG4J_TOOLS_ROOT_LOGLEVEL", logLevel) + .withLogConsumer(logConsumer); + } + + /** + * Drain all records available from the given topic from the beginning until the current 
highest + * offset. + * + *

This method will fetch the latest offsets for the partitions once and only return records + * until that point. + * + * @param topic to fetch from + * @param properties used to configure the created {@link KafkaConsumer} + * @param committed determines the mode {@link ConsumerConfig#ISOLATION_LEVEL_CONFIG} with which + * the consumer reads the records. + * @return all {@link ConsumerRecord} in the topic + * @throws KafkaException + */ + public static List> drainAllRecordsFromTopic( + String topic, Properties properties, boolean committed) throws KafkaException { + final Properties consumerConfig = new Properties(); + consumerConfig.putAll(properties); + consumerConfig.put( + ConsumerConfig.ISOLATION_LEVEL_CONFIG, committed ? "read_committed" : "read_uncommitted"); + return drainAllRecordsFromTopic(topic, consumerConfig); + } + + /** + * Drain all records available from the given topic from the beginning until the current highest + * offset. + * + *

This method will fetch the latest offsets for the partitions once and only return records + * until that point. + * + * @param topic to fetch from + * @param properties used to configure the created {@link KafkaConsumer} + * @return all {@link ConsumerRecord} in the topic + * @throws KafkaException + */ + public static List> drainAllRecordsFromTopic( + String topic, Properties properties) throws KafkaException { + final Properties consumerConfig = new Properties(); + consumerConfig.putAll(properties); + consumerConfig.put("key.deserializer", ByteArrayDeserializer.class.getName()); + consumerConfig.put("value.deserializer", ByteArrayDeserializer.class.getName()); + try (KafkaConsumer consumer = new KafkaConsumer<>(consumerConfig)) { + Set topicPartitions = getAllPartitions(consumer, topic); + Map endOffsets = consumer.endOffsets(topicPartitions); + consumer.assign(topicPartitions); + consumer.seekToBeginning(topicPartitions); + + final List> consumerRecords = new ArrayList<>(); + while (!topicPartitions.isEmpty()) { + ConsumerRecords records = consumer.poll(CONSUMER_POLL_DURATION); + LOG.debug("Fetched {} records from topic {}.", records.count(), topic); + + // Remove partitions from polling which have reached its end. 
+ final List finishedPartitions = new ArrayList<>(); + for (final TopicPartition topicPartition : topicPartitions) { + final long position = consumer.position(topicPartition); + final long endOffset = endOffsets.get(topicPartition); + LOG.debug( + "Endoffset {} and current position {} for partition {}", + endOffset, + position, + topicPartition.partition()); + if (endOffset - position > 0) { + continue; + } + finishedPartitions.add(topicPartition); + } + if (topicPartitions.removeAll(finishedPartitions)) { + consumer.assign(topicPartitions); + } + for (ConsumerRecord r : records) { + consumerRecords.add(r); + } + } + return consumerRecords; + } + } + + private static Set getAllPartitions( + KafkaConsumer consumer, String topic) { + return consumer.partitionsFor(topic).stream() + .map(info -> new TopicPartition(info.topic(), info.partition())) + .collect(Collectors.toSet()); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/SuccessException.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/SuccessException.java new file mode 100644 index 0000000000..b114e6b8c8 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/SuccessException.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.kafka.testutils; + +/** Exception that is thrown to terminate a program and indicate success. */ +public class SuccessException extends RuntimeException { + private static final long serialVersionUID = -7011865671593955887L; +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/ByteArraySetSerializerTest.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/ByteArraySetSerializerTest.java new file mode 100644 index 0000000000..8f7aae0606 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/ByteArraySetSerializerTest.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.lookup; + +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashSet; +import java.util.Set; + +public class ByteArraySetSerializerTest { + private static final Logger LOG = LoggerFactory.getLogger(ByteArraySetSerializerTest.class); + + @Test + public void testByteArraySetSerializer() { + Set byteSet = new HashSet<>(); + byte[] data = "Hello".getBytes(); + byteSet.add(new ByteArrayWrapper(data, data.length)); + data = "World".getBytes(); + byteSet.add(new ByteArrayWrapper(data, data.length)); + byte[] serialized = ByteArraySetSerializer.serialize(byteSet); + Set actualSet = ByteArraySetSerializer.deserialize(serialized); + Assert.assertEquals(byteSet.size(), actualSet.size()); + Assert.assertEquals(byteSet, actualSet); + } + + @Test + public void testPerformance() { + Set byteArraySet = new HashSet<>(); + StringBuilder sb = new StringBuilder(); + int num = 10000; + long start = System.currentTimeMillis(); + int totalSize = 4; + for (int i = 0; i < num; i++) { + sb.append(i); + byte[] tmp = sb.toString().getBytes(); + byteArraySet.add(new ByteArrayWrapper(tmp, tmp.length)); + totalSize += 4 + tmp.length; + } + LOG.info("added {} items process time: {}", num, System.currentTimeMillis() - start); + Assert.assertEquals(num, byteArraySet.size()); + + start = System.currentTimeMillis(); + byte[] serialized = ByteArraySetSerializer.serialize(byteArraySet); + long cost = System.currentTimeMillis() - start; + assert serialized != null; + Assert.assertEquals(totalSize, serialized.length); + LOG.info( + "serialized cost: {}, num= {}, result byte array size={}.", cost, num, serialized.length); + + start = System.currentTimeMillis(); + Set actualSet = ByteArraySetSerializer.deserialize(serialized); + cost = System.currentTimeMillis() - start; + 
LOG.info("deserialized cost: {}, num= {}, set size={}.", cost, num, actualSet.size()); + Assert.assertEquals(byteArraySet, actualSet); + + // exists + sb = new StringBuilder(); + start = System.currentTimeMillis(); + for (int i = 0; i < num; i++) { + sb.append(i); + Assert.assertTrue( + actualSet.contains( + new ByteArrayWrapper(sb.toString().getBytes(), sb.toString().getBytes().length))); + } + long end = System.currentTimeMillis(); + LOG.info("contains process time:{}", end - start); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/TestKVTable.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/TestKVTable.java new file mode 100644 index 0000000000..1f830cb6d9 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/TestKVTable.java @@ -0,0 +1,584 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.lookup; + +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.LOOKUP_CACHE_TTL_AFTER_WRITE; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.ROCKSDB_WRITING_THREADS; +import static org.junit.Assert.assertEquals; + +import org.apache.amoro.flink.lookup.filter.RowDataPredicate; +import org.apache.amoro.flink.lookup.filter.RowDataPredicateExpressionVisitor; +import org.apache.amoro.flink.lookup.filter.TestRowDataPredicateBase; +import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputSerializer; +import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.flink.shaded.guava30.com.google.common.cache.Cache; +import org.apache.flink.shaded.guava30.com.google.common.cache.CacheBuilder; +import org.apache.flink.shaded.guava30.com.google.common.collect.Lists; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.catalog.UniqueConstraint; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.binary.BinaryRowData; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.runtime.typeutils.BinaryRowDataSerializer; +import org.apache.flink.table.runtime.typeutils.RowDataSerializer; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.types.Types; +import org.junit.Assert; +import org.junit.Before; 
+import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.rules.TestName; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; + +@SuppressWarnings("OptionalUsedAsFieldOrParameterType") +@RunWith(value = Parameterized.class) +public class TestKVTable extends TestRowDataPredicateBase { + private static final Logger LOG = LoggerFactory.getLogger(TestKVTable.class); + @Rule public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TestName name = new TestName(); + private final Configuration config = new Configuration(); + private final List primaryKeys = Lists.newArrayList("id", "grade"); + private final List primaryKeysDisorder = Lists.newArrayList("grade", "num", "id"); + + private final boolean guavaCacheEnabled; + + private final Schema mixedTableSchema = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "grade", Types.StringType.get()), + Types.NestedField.required(3, "num", Types.IntegerType.get())); + + private String dbPath; + + @Parameterized.Parameters(name = "guavaCacheEnabled = {0}") + public static Object[][] parameters() { + return new Object[][] {{true}, {false}}; + } + + public TestKVTable(boolean guavaCacheEnabled) { + this.guavaCacheEnabled = guavaCacheEnabled; + } + + @Before + public void before() throws IOException { + dbPath = temp.newFolder().getPath(); + if (!guavaCacheEnabled) { + config.set(MixedFormatValidator.LOOKUP_CACHE_MAX_ROWS, 0L); + } + } + + @Test + public void testRowDataSerializer() throws 
IOException { + BinaryRowDataSerializer binaryRowDataSerializer = new BinaryRowDataSerializer(3); + + GenericRowData genericRowData = (GenericRowData) row(1, "2", 3); + RowType rowType = FlinkSchemaUtil.convert(mixedTableSchema); + RowDataSerializer rowDataSerializer = new RowDataSerializer(rowType); + BinaryRowData record = rowDataSerializer.toBinaryRow(genericRowData); + + DataOutputSerializer view = new DataOutputSerializer(32); + binaryRowDataSerializer.serialize(record, view); + System.out.println(Arrays.toString(view.getCopyOfBuffer())); + + BinaryRowData desRowData = + binaryRowDataSerializer.deserialize(new DataInputDeserializer(view.getCopyOfBuffer())); + Assert.assertNotNull(desRowData); + Assert.assertEquals(record.getInt(0), desRowData.getInt(0)); + Assert.assertEquals(record.getInt(1), desRowData.getInt(1)); + Assert.assertEquals(record.getInt(2), desRowData.getInt(2)); + + // test join key rowData + binaryRowDataSerializer = new BinaryRowDataSerializer(2); + List keys = Lists.newArrayList("id", "grade"); + Schema keySchema = mixedTableSchema.select(keys); + rowType = FlinkSchemaUtil.convert(keySchema); + rowDataSerializer = new RowDataSerializer(rowType); + KeyRowData keyRowData = new KeyRowData(new int[] {0, 1}, row(2, "3", 4)); + KeyRowData keyRowData1 = new KeyRowData(new int[] {0, 1}, row(2, "3", 4)); + + BinaryRowData binaryRowData = rowDataSerializer.toBinaryRow(keyRowData); + view.clear(); + binaryRowDataSerializer.serialize(binaryRowData, view); + byte[] rowBytes = view.getCopyOfBuffer(); + + BinaryRowData binaryRowData1 = rowDataSerializer.toBinaryRow(keyRowData1); + view.clear(); + binaryRowDataSerializer.serialize(binaryRowData1, view); + byte[] rowBytes1 = view.getCopyOfBuffer(); + Assert.assertArrayEquals(rowBytes1, rowBytes); + } + + @Test + public void testInitialUniqueKeyTable() throws IOException { + config.setInteger(ROCKSDB_WRITING_THREADS, 5); + List joinKeys = Lists.newArrayList("id", "grade"); + try (UniqueIndexTable 
uniqueIndexTable = (UniqueIndexTable) createTable(joinKeys)) { + uniqueIndexTable.open(); + + // During the initialization phase, the Merge-on-Read approach is used to retrieve data, + // which will only return INSERT data. + // When there are multiple entries with the same primary key, only one entry will be returned. + initTable( + uniqueIndexTable, + upsertStream( + row(RowKind.INSERT, 1, "1", 1), + row(RowKind.INSERT, 2, "2", 2), + row(RowKind.INSERT, 2, "3", 3), + row(RowKind.INSERT, 2, "4", 4), + row(RowKind.INSERT, 2, "5", 5))); + + if (!uniqueIndexTable.initialized()) { + uniqueIndexTable.waitInitializationCompleted(); + } + + assertTable( + uniqueIndexTable, + row(1, "1"), + row(1, "1", 1), + row(2, "2"), + row(2, "2", 2), + row(2, "3"), + row(2, "3", 3), + row(2, "4"), + row(2, "4", 4), + row(2, "5"), + row(2, "5", 5)); + + // upsert table + upsertTable( + uniqueIndexTable, + upsertStream( + row(RowKind.DELETE, 1, "1", 1), + row(RowKind.INSERT, 2, "2", 2), + row(RowKind.DELETE, 2, "2", 2), + row(RowKind.UPDATE_BEFORE, 3, "3", 4), + row(RowKind.UPDATE_AFTER, 3, "3", 5), + row(RowKind.INSERT, 4, "4", 4))); + + assertTable( + uniqueIndexTable, + row(1, "1"), + null, + row(2, "2"), + null, + row(3, "3"), + row(3, "3", 5), + row(4, "4"), + row(4, "4", 4)); + } + } + + @Test + public void testSecondaryKeysMapping() throws IOException { + // primary keys are id and grade. 
+ List joinKeys = Lists.newArrayList("grade", "id"); + try (SecondaryIndexTable secondaryIndexTable = + (SecondaryIndexTable) createTableWithDisorderPK(joinKeys)) { + secondaryIndexTable.open(); + + initTable( + secondaryIndexTable, + upsertStream( + row(RowKind.INSERT, 1, "1", 1), + row(RowKind.INSERT, 2, "2", 2), + row(RowKind.INSERT, 2, "3", 3), + row(RowKind.INSERT, 2, "3", 4), + row(RowKind.INSERT, 2, "5", 5))); + + if (!secondaryIndexTable.initialized()) { + secondaryIndexTable.waitInitializationCompleted(); + } + + assertTableSet(secondaryIndexTable, row("1", 1), row(1, "1", 1)); + assertTableSet(secondaryIndexTable, row("2", 2), row(2, "2", 2)); + + upsertTable( + secondaryIndexTable, + upsertStream( + row(RowKind.DELETE, 1, "1", 1), + row(RowKind.INSERT, 2, "2", 2), + row(RowKind.DELETE, 2, "2", 2), + row(RowKind.UPDATE_BEFORE, 3, "3", 4), + row(RowKind.UPDATE_AFTER, 3, "3", 5), + row(RowKind.INSERT, 3, "4", 4))); + + assertTableSet(secondaryIndexTable, row("1", 1), null); + assertTableSet(secondaryIndexTable, row("3", 2), row(2, "3", 3), row(2, "3", 4)); + assertTableSet(secondaryIndexTable, row("4", 3), row(3, "4", 4)); + } + } + + @Test + public void testInitialSecondaryKeyTable() throws IOException { + config.setInteger(ROCKSDB_WRITING_THREADS, 10); + config.set(LOOKUP_CACHE_TTL_AFTER_WRITE, Duration.ofMinutes(1000)); + // primary keys are id and grade. 
+ List joinKeys = Lists.newArrayList("id"); + try (SecondaryIndexTable secondaryIndexTable = (SecondaryIndexTable) createTable(joinKeys)) { + writeAndAssert(secondaryIndexTable); + } + } + + private void writeAndAssert(SecondaryIndexTable secondaryIndexTable) throws IOException { + secondaryIndexTable.open(); + + initTable( + secondaryIndexTable, + upsertStream( + row(RowKind.INSERT, 1, "1", 1), + row(RowKind.INSERT, 2, "2", 2), + row(RowKind.INSERT, 2, "3", 3), + row(RowKind.INSERT, 2, "4", 4), + row(RowKind.INSERT, 2, "5", 5))); + + if (!secondaryIndexTable.initialized()) { + secondaryIndexTable.waitInitializationCompleted(); + } + + assertTableSet(secondaryIndexTable, row(1), row(1, "1", 1)); + assertTableSet( + secondaryIndexTable, + row(2), + row(2, "2", 2), + row(2, "3", 3), + row(2, "4", 4), + row(2, "5", 5)); + + upsertTable( + secondaryIndexTable, + upsertStream( + row(RowKind.DELETE, 1, "1", 1), + row(RowKind.INSERT, 2, "2", 2), + row(RowKind.DELETE, 2, "2", 2), + row(RowKind.UPDATE_BEFORE, 3, "3", 4), + row(RowKind.UPDATE_AFTER, 3, "3", 5), + row(RowKind.INSERT, 3, "4", 4))); + + assertTableSet(secondaryIndexTable, row(1), null); + assertTableSet(secondaryIndexTable, row(2), row(2, "3", 3), row(2, "4", 4), row(2, "5", 5)); + assertTableSet(secondaryIndexTable, row(3), row(3, "3", 5), row(3, "4", 4)); + } + + @Test + public void testCacheExpired() throws InterruptedException { + Cache cache = + CacheBuilder.newBuilder().expireAfterWrite(Duration.ofSeconds(1)).build(); + cache.put(1, 1); + cache + .asMap() + .compute( + 2, + (k, v) -> { + if (v == null) { + return k; + } + return v; + }); + Assert.assertEquals(Integer.valueOf(1), cache.getIfPresent(1)); + Assert.assertEquals(Integer.valueOf(2), cache.getIfPresent(2)); + Thread.sleep(1001); + Assert.assertEquals(2, cache.size()); + Assert.assertNull(cache.getIfPresent(1)); + Assert.assertNull(cache.getIfPresent(2)); + cache.cleanUp(); + cache.put(3, 3); + Assert.assertEquals(1, cache.size()); + 
Assert.assertNull(cache.getIfPresent(1)); + Assert.assertEquals(Integer.valueOf(3), cache.getIfPresent(3)); + } + + @Test + public void testPredicate() throws IOException { + String filter = "id >= 2 and num < 5 and num > 2"; + Optional rowDataPredicate = generatePredicate(filter); + + KVTable uniqueIndexTable = + createTable(Lists.newArrayList("id", "grade"), rowDataPredicate); + uniqueIndexTable.open(); + initTable( + uniqueIndexTable, + upsertStream( + row(RowKind.INSERT, 1, "1", 1), + row(RowKind.INSERT, 2, "2", 2), + row(RowKind.INSERT, 2, "3", 3), + row(RowKind.INSERT, 3, "4", 4), + row(RowKind.INSERT, 3, "5", 5))); + + if (!uniqueIndexTable.initialized()) { + uniqueIndexTable.waitInitializationCompleted(); + } + + assertTable( + uniqueIndexTable, + row(1, "1"), + null, + row(2, "2"), + null, + row(2, "3"), + row(2, "3", 3), + row(3, "4"), + row(3, "4", 4), + row(3, "5"), + null); + + // upsert table + upsertTable( + uniqueIndexTable, + upsertStream( + row(RowKind.DELETE, 1, "1", 1), + row(RowKind.INSERT, 2, "2", 2), + row(RowKind.DELETE, 2, "2", 2), + row(RowKind.UPDATE_BEFORE, 2, "3", 3), + row(RowKind.UPDATE_AFTER, 2, "3", 5), + row(RowKind.INSERT, 4, "4", 4))); + + assertTable( + uniqueIndexTable, + row(1, "1"), + null, + row(2, "2"), + null, + row(2, "3"), + null, + row(4, "4"), + row(4, "4", 4)); + } + + @Test + public void testSecondaryIndexPredicate() throws IOException { + String filter = "id >= 2 and num < 5 and num > 2"; + Optional rowDataPredicate = generatePredicate(filter); + + // primary keys are id and grade. 
+ List joinKeys = Lists.newArrayList("id"); + try (SecondaryIndexTable secondaryIndexTable = + (SecondaryIndexTable) createTable(joinKeys, rowDataPredicate)) { + secondaryIndexTable.open(); + + initTable( + secondaryIndexTable, + upsertStream( + row(RowKind.INSERT, 1, "1", 1), + row(RowKind.INSERT, 2, "2", 2), + row(RowKind.INSERT, 2, "3", 3), + row(RowKind.INSERT, 2, "4", 4), + row(RowKind.INSERT, 2, "5", 5))); + + if (!secondaryIndexTable.initialized()) { + secondaryIndexTable.waitInitializationCompleted(); + } + + assertTableSet(secondaryIndexTable, row(1), null); + assertTableSet(secondaryIndexTable, row(2), row(2, "3", 3), row(2, "4", 4)); + + upsertTable( + secondaryIndexTable, + upsertStream( + row(RowKind.DELETE, 1, "1", 1), + row(RowKind.INSERT, 2, "2", 2), + row(RowKind.DELETE, 2, "2", 2), + row(RowKind.UPDATE_BEFORE, 3, "3", 4), + row(RowKind.UPDATE_AFTER, 3, "3", 5), + row(RowKind.INSERT, 3, "4", 4))); + + assertTableSet(secondaryIndexTable, row(1), null); + assertTableSet(secondaryIndexTable, row(2), row(2, "3", 3), row(2, "4", 4)); + assertTableSet(secondaryIndexTable, row(3), row(3, "4", 4)); + } + } + + private Optional generatePredicate(String filterSql) { + Map fieldIndexMap = new HashMap<>(); + Map fieldTypeMap = new HashMap<>(); + List fields = mixedTableSchema.asStruct().fields(); + List columns = new ArrayList<>(fields.size()); + for (int i = 0; i < fields.size(); i++) { + String name = fields.get(i).name(); + DataType dataType = + TypeConversions.fromLogicalToDataType(FlinkSchemaUtil.convert(fields.get(i).type())); + fieldIndexMap.put(name, i); + fieldTypeMap.put(name, dataType); + columns.add(i, Column.physical(name, dataType)); + } + ResolvedSchema schema = + new ResolvedSchema( + columns, Collections.emptyList(), UniqueConstraint.primaryKey("", primaryKeys)); + + RowDataPredicateExpressionVisitor rowDataPredicateExpressionVisitor = + new RowDataPredicateExpressionVisitor(fieldIndexMap, fieldTypeMap); + + List expressions = 
resolveSQLFilterToExpression(filterSql, schema); + assertEquals(1, expressions.size()); + return expressions.get(0).accept(rowDataPredicateExpressionVisitor); + } + + private KVTable createTableWithDisorderPK(List joinKeys) { + return createTable(joinKeys, Optional.empty(), true); + } + + private KVTable createTable( + List joinKeys, Optional rowDataPredicate) { + return createTable(joinKeys, rowDataPredicate, false); + } + + private KVTable createTable( + List joinKeys, Optional rowDataPredicate, boolean isDisorderPK) { + return KVTableFactory.INSTANCE.create( + new RowDataStateFactory(dbPath, new UnregisteredMetricsGroup()), + isDisorderPK ? primaryKeysDisorder : primaryKeys, + joinKeys, + mixedTableSchema, + config, + rowDataPredicate.orElse(null)); + } + + private KVTable createTable(List joinKeys) { + return createTable(joinKeys, Optional.empty()); + } + + private void initTable(KVTable table, Iterator initStream) throws IOException { + if (initStream != null) { + table.initialize(initStream); + } + } + + private void upsertTable(KVTable table, Iterator upsertStream, RowData... rows) + throws IOException { + if (upsertStream != null) { + table.upsert(upsertStream); + } + } + + private void assertTable(KVTable table, RowData... rows) throws IOException { + // Loop through the rows array in steps of 2 + for (int i = 0; i < rows.length; i = i + 2) { + // Get the key and expected value at the current index and the next index + RowData key = rows[i], expected = rows[i + 1]; + + List values = table.get(key); + Assert.assertNotNull(values); + if (expected == null) { + Assert.assertEquals(0, values.size()); + continue; + } + Assert.assertEquals(expected.toString(), 1, values.size()); + RowData actual = values.get(0); + assertRecord(expected, actual); + } + } + + private void assertTableSet(KVTable table, RowData key, RowData... 
expects) + throws IOException { + List values = table.get(key); + if (expects == null) { + Assert.assertEquals(0, values.size()); + return; + } + Assert.assertEquals(expects.length, values.size()); + values = values.stream().sorted(compare()).collect(Collectors.toList()); + List expectsAfterSort = + Arrays.stream(expects).sorted(compare()).collect(Collectors.toList()); + for (int i = 0; i < expects.length; i = i + 1) { + // Get the key and expected value at the current index and the next index + RowData expected = expectsAfterSort.get(i); + + RowData actual = values.get(i); + assertRecord(expected, actual); + } + } + + private Comparator compare() { + return Comparator.comparingInt((RowData o) -> o.getInt(0)) + .thenComparing(o -> o.getString(1)) + .thenComparingInt(o -> o.getInt(2)); + } + + private void assertRecord(RowData expected, RowData actual) { + if (!(actual instanceof BinaryRowData)) { + throw new IllegalArgumentException("Only support BinaryRowData"); + } + BinaryRowData binaryRowData = (BinaryRowData) actual; + for (int j = 0; j < binaryRowData.getArity(); j++) { + switch (j) { + case 0: + case 2: + Assert.assertEquals( + String.format("expected:%s, actual:%s.", expected.toString(), actual), + expected.getInt(j), + binaryRowData.getInt(j)); + break; + case 1: + Assert.assertEquals( + String.format("expected:%s, actual:%s.", expected, actual), + expected.getString(j), + binaryRowData.getString(j)); + break; + } + } + } + + RowData row(RowKind rowKind, Object... objects) { + return GenericRowData.ofKind(rowKind, wrapStringData(objects)); + } + + RowData row(Object... objects) { + return GenericRowData.of(wrapStringData(objects)); + } + + Object[] wrapStringData(Object... objects) { + for (int i = 0; i < objects.length; i++) { + if (objects[i] instanceof String) { + objects[i] = StringData.fromString(objects[i].toString()); + } + } + return objects; + } + + Iterator upsertStream(RowData... 
rows) { + return Lists.newArrayList(rows).iterator(); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateAllFieldTypes.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateAllFieldTypes.java new file mode 100644 index 0000000000..b48aa9bfeb --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateAllFieldTypes.java @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.lookup.filter; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import org.apache.amoro.flink.util.DateTimeUtils; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.types.DataType; +import org.junit.Before; +import org.junit.Test; + +import java.math.BigDecimal; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TimeZone; + +/** Test for {@link RowDataPredicate}. 
 */ +public class TestRowDataPredicateAllFieldTypes extends TestRowDataPredicateBase { + protected RowDataPredicateExpressionVisitor visitor; + protected final Map<String, Integer> fieldIndexMap = new HashMap<>(); + protected final Map<String, DataType> fieldDataTypeMap = new HashMap<>(); + List<Column> columns = new ArrayList<>(); + protected ResolvedSchema schema; + + @Before + public void setUp() { + columns.add(0, Column.physical("f0", DataTypes.INT())); + columns.add(1, Column.physical("f1", DataTypes.STRING())); + columns.add(2, Column.physical("f2", DataTypes.CHAR(1))); + columns.add(3, Column.physical("f3", DataTypes.BOOLEAN())); + columns.add(4, Column.physical("f4", DataTypes.BINARY(1))); + columns.add(5, Column.physical("f5", DataTypes.VARBINARY(10))); + columns.add(6, Column.physical("f6", DataTypes.DECIMAL(38, 10))); + columns.add(7, Column.physical("f7", DataTypes.TINYINT())); + columns.add(8, Column.physical("f8", DataTypes.SMALLINT())); + columns.add(9, Column.physical("f9", DataTypes.BIGINT())); + columns.add(10, Column.physical("f10", DataTypes.FLOAT())); + columns.add(11, Column.physical("f11", DataTypes.DOUBLE())); + columns.add(12, Column.physical("f12", DataTypes.DATE())); + columns.add(13, Column.physical("f13", DataTypes.TIME())); + columns.add(14, Column.physical("f14", DataTypes.TIMESTAMP(3))); + columns.add(15, Column.physical("f15", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3))); + schema = new ResolvedSchema(columns, Collections.emptyList(), null); + for (int i = 0; i < columns.size(); i++) { + Column column = columns.get(i); + fieldDataTypeMap.put(column.getName(), column.getDataType()); + fieldIndexMap.put(column.getName(), i); + } + visitor = new RowDataPredicateExpressionVisitor(fieldIndexMap, fieldDataTypeMap); + } + + @Test + public void testInt() { + String equalExpr = "f0 = 2"; + List<ResolvedExpression> resolved = resolveSQLFilterToExpression(equalExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); + 
assertTrue(predicate.test(generateRowData("f0", 2))); + assertFalse(predicate.test(generateRowData("f0", 1))); + } + + @Test + public void testString() { + String equalExpr = "f1 = 'a'"; + List<ResolvedExpression> resolved = resolveSQLFilterToExpression(equalExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); + assertTrue(predicate.test(generateRowData("f1", StringData.fromString("a")))); + assertFalse(predicate.test(generateRowData("f1", StringData.fromString("b")))); + } + + @Test + public void testChar() { + String equalExpr = "f2 = 'a'"; + List<ResolvedExpression> resolved = resolveSQLFilterToExpression(equalExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); + assertTrue(predicate.test(generateRowData("f2", StringData.fromString("a")))); + assertFalse(predicate.test(generateRowData("f2", StringData.fromString("b")))); + } + + @Test + public void testBoolean() { + String equalExpr = "f3 = true"; + List<ResolvedExpression> resolved = resolveSQLFilterToExpression(equalExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); + assertTrue(predicate.test(generateRowData("f3", Boolean.TRUE))); + assertFalse(predicate.test(generateRowData("f3", Boolean.FALSE))); + } + + // @Test + public void testBinary() { + String equalExpr = "f4 = '1'"; // byte[] + List<ResolvedExpression> resolved = resolveSQLFilterToExpression(equalExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); + assertTrue(predicate.test(generateRowData("f4", (byte) 1))); + assertFalse(predicate.test(generateRowData("f4", (byte) 2))); + } + + @Test + public void testDecimal() { + String equalExpr = "f6 = 1.1"; + List<ResolvedExpression> resolved = resolveSQLFilterToExpression(equalExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); + assertTrue( + predicate.test( + 
generateRowData("f6", DecimalData.fromBigDecimal(BigDecimal.valueOf(1.1d), 38, 1)))); + assertFalse( + predicate.test( + generateRowData("f6", DecimalData.fromBigDecimal(BigDecimal.valueOf(1.2d), 38, 1)))); + } + + // @Test + public void testTinyint() { + String equalExpr = "f7 = cast('1' as tinyint)"; // byte + List resolved = resolveSQLFilterToExpression(equalExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); + assertTrue(predicate.test(generateRowData("f7", 1))); + assertFalse(predicate.test(generateRowData("f7", 0))); + } + + // @Test + public void testSmallint() { + String equalExpr = "f8 = 1"; // short + List resolved = resolveSQLFilterToExpression(equalExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); + assertTrue(predicate.test(generateRowData("f8", (short) 1))); + assertFalse(predicate.test(generateRowData("f8", (short) 0))); + } + + @Test + public void testBigint() { + String equalExpr = "f9 = 1"; // long + List resolved = resolveSQLFilterToExpression(equalExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); + assertTrue(predicate.test(generateRowData("f9", 1L))); + assertFalse(predicate.test(generateRowData("f9", 0L))); + } + + // @Test + public void testFloat() { + String equalExpr = "f10 = 1.1"; + List resolved = resolveSQLFilterToExpression(equalExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); + assertTrue(predicate.test(generateRowData("f10", 1.1f))); + assertFalse(predicate.test(generateRowData("f10", 1.2f))); + } + + @Test + public void testDouble() { + String equalExpr = "f11 = 1.1"; + List resolved = resolveSQLFilterToExpression(equalExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); + 
assertTrue(predicate.test(generateRowData("f11", 1.1d))); + assertFalse(predicate.test(generateRowData("f11", 1.2d))); + } + + // @Test + public void testTimestamp() { + String equalExpr = "f14 = TO_TIMESTAMP('2020-01-01 00:00:00', 'yyyy-MM-dd HH:mm:ss')"; + List resolved = resolveSQLFilterToExpression(equalExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); + assertTrue( + predicate.test( + generateRowData( + "f14", TimestampData.fromTimestamp(Timestamp.valueOf("2020-01-01 00:00:00"))))); + assertFalse( + predicate.test( + generateRowData( + "f14", TimestampData.fromTimestamp(Timestamp.valueOf("2020-01-01 00:00:01"))))); + } + + // @Test + public void testUnixTimestamp() { + String equalExpr = "f1 = cast(from_unixtime(unix_timestamp(),'yyyy-MM-dd') as String)"; + List resolved = resolveSQLFilterToExpression(equalExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); + String format = "yyyy-MM-dd"; + String current = + DateTimeUtils.formatUnixTimestamp( + System.currentTimeMillis() / 1000, format, TimeZone.getDefault()); + assertTrue(predicate.test(generateRowData("f1", StringData.fromString(current)))); + assertFalse(predicate.test(generateRowData("f1", StringData.fromString("2020-01-01-01")))); + } + + // @Test + public void testFromUnixTimestampMinus() { + String equalExpr = "f1 = from_unixtime(unix_timestamp()- 3 * 3600,'yyyy-MM-dd')"; + List resolved = resolveSQLFilterToExpression(equalExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); + String format = "yyyy-MM-dd"; + String current = + DateTimeUtils.formatUnixTimestamp( + System.currentTimeMillis() / 1000 - 3 * 3600, format, TimeZone.getDefault()); + assertTrue(predicate.test(generateRowData("f1", StringData.fromString(current)))); + assertFalse(predicate.test(generateRowData("f1", 
StringData.fromString("2020-01-01-01")))); + } + + @Test + public void testArithmetic() { + // bigint type + String arithmeticExpr = "f9 = (1514356320000 + 1) * 10 / 2"; + List resolved = resolveSQLFilterToExpression(arithmeticExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); + assertTrue(predicate.test(generateRowData("f9", 7571781600005L))); + assertFalse(predicate.test(generateRowData("f9", 7571781600004L))); + } + + protected RowData generateRowData(String fieldName, Object val) { + int index = Integer.parseInt(fieldName.substring(1)); + Object[] objects = new Object[columns.size()]; + objects[index] = val; + return GenericRowData.of(objects); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateBase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateBase.java new file mode 100644 index 0000000000..c50d2d38a1 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateBase.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.lookup.filter; + +import org.apache.amoro.flink.planner.calcite.FlinkTypeSystem; +import org.apache.calcite.rex.RexBuilder; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.TableException; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.table.api.bridge.java.internal.StreamTableEnvironmentImpl; +import org.apache.flink.table.catalog.CatalogManager; +import org.apache.flink.table.catalog.FunctionCatalog; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.expressions.resolver.ExpressionResolver; +import org.apache.flink.table.planner.calcite.FlinkContext; +import org.apache.flink.table.planner.calcite.FlinkTypeFactory; +import org.apache.flink.table.planner.delegation.PlannerBase; +import org.apache.flink.table.planner.expressions.RexNodeExpression; +import org.apache.flink.table.planner.plan.utils.RexNodeToExpressionConverter; +import org.apache.flink.table.types.logical.RowType; +import org.junit.Before; + +import java.util.Collections; +import java.util.List; +import java.util.Optional; +import java.util.TimeZone; + +public abstract class TestRowDataPredicateBase { + public static StreamExecutionEnvironment env; + public static TableEnvironment tEnv; + + @Before + public void init() { + env = StreamExecutionEnvironment.getExecutionEnvironment(); + tEnv = StreamTableEnvironment.create(env); + } + + /** + * This method takes in an SQL filter expression and a ResolvedSchema object, and returns a List + * of ResolvedExpression objects. 
+ */ + protected List resolveSQLFilterToExpression( + String sqlExp, ResolvedSchema schema) { + StreamTableEnvironmentImpl tbImpl = (StreamTableEnvironmentImpl) tEnv; + + FlinkContext ctx = ((PlannerBase) tbImpl.getPlanner()).getFlinkContext(); + CatalogManager catMan = tbImpl.getCatalogManager(); + FunctionCatalog funCat = ctx.getFunctionCatalog(); + RowType sourceType = (RowType) schema.toSourceRowDataType().getLogicalType(); + ClassLoader classLoader = tEnv.getClass().getClassLoader(); + FlinkTypeFactory typeFactory = new FlinkTypeFactory(classLoader, FlinkTypeSystem.INSTANCE); + RexNodeToExpressionConverter converter = + new RexNodeToExpressionConverter( + new RexBuilder(typeFactory), + sourceType.getFieldNames().toArray(new String[0]), + funCat, + catMan, + TimeZone.getTimeZone(tEnv.getConfig().getLocalTimeZone())); + + RexNodeExpression rexExp = + (RexNodeExpression) tbImpl.getParser().parseSqlExpression(sqlExp, sourceType, null); + ResolvedExpression resolvedExp = + rexExp + .getRexNode() + .accept(converter) + .getOrElse( + () -> { + throw new IllegalArgumentException( + "Cannot convert " + + rexExp.getRexNode() + + " to Expression, this likely " + + "means you used some function(s) not " + + "supported with this setup."); + }); + ExpressionResolver resolver = + ExpressionResolver.resolverFor( + tEnv.getConfig(), + classLoader, + name -> Optional.empty(), + funCat.asLookup( + str -> { + throw new TableException( + "We should not need to lookup any expressions at this point"); + }), + catMan.getDataTypeFactory(), + (sqlExpression, inputRowType, outputType) -> { + throw new TableException( + "SQL expression parsing is not supported at this location."); + }) + .build(); + return resolver.resolve(Collections.singletonList(resolvedExp)); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateExpressionVisitor.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateExpressionVisitor.java new file mode 100644 index 0000000000..e69689dbbd --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateExpressionVisitor.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.lookup.filter; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.catalog.Column; +import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.expressions.ResolvedExpression; +import org.apache.flink.table.types.DataType; +import org.junit.Before; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** This class contains unit tests for the {@link RowDataPredicateExpressionVisitor} class. */ +public class TestRowDataPredicateExpressionVisitor extends TestRowDataPredicateBase { + + RowDataPredicateExpressionVisitor visitor; + final Map fieldIndexMap = new HashMap<>(); + final Map fieldDataTypeMap = new HashMap<>(); + List columns = new ArrayList<>(); + ResolvedSchema schema; + + @Before + public void setUp() { + columns.add(0, Column.physical("id", DataTypes.INT())); + columns.add(1, Column.physical("name", DataTypes.STRING())); + columns.add(2, Column.physical("age", DataTypes.INT())); + schema = new ResolvedSchema(columns, Collections.emptyList(), null); + for (int i = 0; i < columns.size(); i++) { + Column column = columns.get(i); + fieldDataTypeMap.put(column.getName(), column.getDataType()); + fieldIndexMap.put(column.getName(), i); + } + + visitor = new RowDataPredicateExpressionVisitor(fieldIndexMap, fieldDataTypeMap); + } + + @Test + public void testVisitCallExpressionEquals() { + String equalExpr = "id = NULL"; + List resolved = resolveSQLFilterToExpression(equalExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); + 
assertTrue(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 6))); + } + + @Test + public void testVisitCallExpressionNotEquals() { + String notEqualExpr = "id <> 1"; + List resolved = resolveSQLFilterToExpression(notEqualExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); + assertTrue(rowDataPredicate.test(GenericRowData.of(2, StringData.fromString("2"), 6))); + assertFalse(rowDataPredicate.test(GenericRowData.of(1, StringData.fromString("2"), 6))); + } + + @Test + public void testVisitCallExpressionGreaterThanOrEqual() { + String greaterThanOrEqualExpr = "age >= 5"; + List resolved = + resolveSQLFilterToExpression(greaterThanOrEqualExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); + assertTrue(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 5))); + assertFalse(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 4))); + } + + @Test + public void testVisitCallExpressionGreaterThan() { + String greaterThanExpr = "age > 5"; + List resolved = resolveSQLFilterToExpression(greaterThanExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); + assertFalse(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 5))); + assertTrue(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 6))); + } + + @Test + public void testVisitCallExpressionLessThanOrEqual() { + String lessThanOrEqualExpr = "age <= 5"; + List resolved = resolveSQLFilterToExpression(lessThanOrEqualExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); + assertTrue(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 5))); + assertFalse(rowDataPredicate.test(GenericRowData.of(null, 
StringData.fromString("1"), 6))); + } + + @Test + public void testVisitCallExpressionLessThan() { + String lessThanExpr = "age < 5"; + List resolved = resolveSQLFilterToExpression(lessThanExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); + assertFalse(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 5))); + assertTrue(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 4))); + } + + @Test + public void testVisitCallExpressionIsNotNull() { + String isNotNullExpr = "id is not NULL"; + List resolved = resolveSQLFilterToExpression(isNotNullExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); + assertTrue(rowDataPredicate.test(GenericRowData.of(1, StringData.fromString("1"), 6))); + assertFalse(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("2"), 6))); + } + + @Test + public void testVisitCallExpressionIsNull() { + String isNullExpr = "id is NULL"; + List resolved = resolveSQLFilterToExpression(isNullExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); + assertTrue(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 6))); + } + + @Test + public void testVisitCallExpressionEqualsAndGreaterThan() { + String andExpr = "id = NULL AND age > 5"; + List resolved = resolveSQLFilterToExpression(andExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); + assertTrue(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 6))); + } + + @Test + public void testVisitCallExpressionEqualsOrLessThan() { + String orExpr = "id = NULL OR age < 5"; + List resolved = resolveSQLFilterToExpression(orExpr, schema); + assertEquals(1, resolved.size()); + RowDataPredicate rowDataPredicate = 
resolved.get(0).accept(visitor).get(); + assertTrue(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 6))); + assertFalse(rowDataPredicate.test(GenericRowData.of(1, StringData.fromString("2"), 5))); + assertTrue(rowDataPredicate.test(GenericRowData.of(1, StringData.fromString("2"), 4))); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestFlinkSource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestFlinkSource.java new file mode 100644 index 0000000000..ef5d08a68b --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestFlinkSource.java @@ -0,0 +1,304 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read; + +import static org.apache.amoro.flink.write.TestMixedFormatFileWriter.TARGET_FILE_SIZE; +import static org.apache.amoro.flink.write.TestMixedFormatFileWriter.createUnkeyedTaskWriter; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.flink.table.FlinkSource; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.flink.util.DataUtil; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.streaming.api.CheckpointingMode; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamUtils; +import org.apache.flink.streaming.api.environment.CheckpointConfig; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.operators.collect.ClientAndIterator; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.junit.Assert; +import org.junit.Test; + +import java.io.IOException; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Locale; +import java.util.Optional; +import java.util.Set; + +public class TestFlinkSource extends 
FlinkTestBase { + + protected static final FileFormat FILE_FORMAT = + FileFormat.valueOf("parquet".toUpperCase(Locale.ENGLISH)); + + public TestFlinkSource() { + super( + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(false, true)); + } + + protected static void commit(WriteResult result, Table table) { + AppendFiles append = table.newAppend(); + Arrays.stream(result.dataFiles()).forEach(append::appendFile); + append.commit(); + } + + protected static void write(Collection data, Table table, RowType rowType) + throws IOException { + try (TaskWriter taskWriter = + createUnkeyedTaskWriter(table, TARGET_FILE_SIZE, FILE_FORMAT, rowType)) { + data.forEach( + d -> { + try { + taskWriter.write(DataUtil.toRowData(d)); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + taskWriter.close(); + + commit(taskWriter.complete(), table); + } + } + + @Test + public void testUnkeyedTableDataStream() throws Exception { + Configuration conf = new Configuration(); + final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(conf); + + env.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE); + env.setParallelism(1); + env.getCheckpointConfig() + .enableExternalizedCheckpoints( + CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); + + List data = new LinkedList<>(); + LocalDateTime localDateTime = LocalDateTime.parse("2022-06-18T10:10:11.0"); + long timestamp = localDateTime.toInstant(ZoneOffset.UTC).toEpochMilli(); + data.add(new Object[] {1000004, "a", timestamp, localDateTime}); + data.add(new Object[] {1000015, "b", timestamp, localDateTime}); + data.add(new Object[] {1000011, "c", timestamp, localDateTime}); + data.add(new Object[] {1000014, "d", timestamp, localDateTime}); + data.add(new Object[] {1000021, "d", timestamp, localDateTime}); + data.add(new Object[] {1000015, "e", timestamp, localDateTime}); + + Collection expectedRecords = DataUtil.toRowData(data); + 
write(data, getMixedTable().asUnkeyedTable(), FLINK_ROW_TYPE); + + final CloseableIterator resultIterator = + FlinkSource.forRowData() + .env(env) + .context(Optional::of) + .project(FLINK_SCHEMA) + .tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) + .flinkConf(conf) + .properties( + new HashMap() { + { + put("streaming", "false"); + } + }) + .build() + .executeAndCollect(); + + Set rowData = new HashSet<>(); + resultIterator.forEachRemaining( + o -> + rowData.add( + GenericRowData.of( + o.getInt(0), o.getString(1), o.getLong(2), o.getTimestamp(3, 6)))); + + Assert.assertEquals(new HashSet<>(expectedRecords), rowData); + } + + @Test + public void testUnkeyedStreamingRead() throws Exception { + Configuration conf = new Configuration(); + final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(conf); + + env.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE); + env.setParallelism(1); + env.getCheckpointConfig() + .enableExternalizedCheckpoints( + CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); + + List data = new LinkedList<>(); + LocalDateTime localDateTime = LocalDateTime.parse("2022-06-18T10:10:11.0"); + long timestamp = localDateTime.toInstant(ZoneOffset.UTC).toEpochMilli(); + data.add(new Object[] {1000004, "a", timestamp, localDateTime}); + data.add(new Object[] {1000015, "b", timestamp, localDateTime}); + data.add(new Object[] {1000011, "c", timestamp, localDateTime}); + data.add(new Object[] {1000014, "d", timestamp, localDateTime}); + data.add(new Object[] {1000021, "d", timestamp, localDateTime}); + data.add(new Object[] {1000015, "e", timestamp, localDateTime}); + + Collection expectedRecords = DataUtil.toRowData(data); + write(data, getMixedTable().asUnkeyedTable(), FLINK_ROW_TYPE); + + DataStream ds = + FlinkSource.forRowData() + .env(env) + .context(Optional::of) + .project(FLINK_SCHEMA) + 
.tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) + .flinkConf(conf) + .build(); + + ClientAndIterator clientAndIterator = + DataStreamUtils.collectWithClient(ds, this.getClass().getName()); + + JobClient jobClient = clientAndIterator.client; + CloseableIterator iterator = clientAndIterator.iterator; + + Set rowData = new HashSet<>(); + while (iterator.hasNext()) { + RowData o = iterator.next(); + rowData.add( + GenericRowData.of(o.getInt(0), o.getString(1), o.getLong(2), o.getTimestamp(3, 6))); + if (rowData.size() == expectedRecords.size()) { + break; + } + } + jobClient.cancel(); + + Assert.assertEquals(new HashSet<>(expectedRecords), rowData); + } + + @Test + public void testUnkeyedSnapshotRead() throws Exception { + Configuration conf = new Configuration(); + final Table testTable = getMixedTable().asUnkeyedTable(); + final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(conf); + + env.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE); + env.setParallelism(1); + env.getCheckpointConfig() + .enableExternalizedCheckpoints( + CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); + + List s1 = new LinkedList<>(); + LocalDateTime localDateTime1 = LocalDateTime.parse("2022-06-18T10:10:11.0"); + long timestamp1 = localDateTime1.toInstant(ZoneOffset.UTC).toEpochMilli(); + s1.add(new Object[] {1000004, "a", timestamp1, localDateTime1}); + s1.add(new Object[] {1000015, "b", timestamp1, localDateTime1}); + s1.add(new Object[] {1000011, "c", timestamp1, localDateTime1}); + s1.add(new Object[] {1000014, "d", timestamp1, localDateTime1}); + s1.add(new Object[] {1000021, "d", timestamp1, localDateTime1}); + s1.add(new Object[] {1000015, "e", timestamp1, localDateTime1}); + + write(s1, testTable, FLINK_ROW_TYPE); + + List s2 = new LinkedList<>(); + LocalDateTime localDateTime2 = LocalDateTime.parse("2022-06-19T10:10:11.0"); + long timestamp2 = 
localDateTime2.toInstant(ZoneOffset.UTC).toEpochMilli(); + s2.add(new Object[] {12, "ac", timestamp2, localDateTime2}); + s2.add(new Object[] {52, "ad", timestamp2, localDateTime2}); + s2.add(new Object[] {15, "ad", timestamp2, localDateTime2}); + s2.add(new Object[] {26, "ae", timestamp2, localDateTime2}); + + Collection expectedRecords = DataUtil.toRowData(s2); + write(s2, testTable, FLINK_ROW_TYPE); + + testTable.refresh(); + Snapshot s = testTable.snapshots().iterator().next(); + + DataStream ds = + FlinkSource.forRowData() + .env(env) + .context(Optional::of) + .project(FLINK_SCHEMA) + .tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) + .flinkConf(conf) + .properties( + new HashMap() { + { + put("streaming", "true"); + put("start-snapshot-id", String.valueOf(s.snapshotId())); + } + }) + .build(); + + ClientAndIterator clientAndIterator = + DataStreamUtils.collectWithClient(ds, this.getClass().getName()); + + JobClient jobClient = clientAndIterator.client; + CloseableIterator iterator = clientAndIterator.iterator; + + Set rowData = new HashSet<>(); + while (iterator.hasNext()) { + RowData o = iterator.next(); + rowData.add( + GenericRowData.of(o.getInt(0), o.getString(1), o.getLong(2), o.getTimestamp(3, 6))); + if (rowData.size() == expectedRecords.size()) { + break; + } + } + jobClient.cancel(); + + Assert.assertEquals(new HashSet<>(expectedRecords), rowData); + + CloseableIterator resultIterator = + FlinkSource.forRowData() + .env(env) + .context(Optional::of) + .project(FLINK_SCHEMA) + .tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) + .flinkConf(conf) + .properties( + new HashMap() { + { + put("streaming", "false"); + put("snapshot-id", String.valueOf(s.snapshotId())); + } + }) + .build() + .executeAndCollect(); + + rowData.clear(); + resultIterator.forEachRemaining( + o -> + rowData.add( + GenericRowData.of( + o.getInt(0), o.getString(1), o.getLong(2), o.getTimestamp(3, 6)))); + + 
expectedRecords = DataUtil.toRowData(s1); + Assert.assertEquals(new HashSet<>(expectedRecords), rowData); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestFlinkSplitPlanner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestFlinkSplitPlanner.java new file mode 100644 index 0000000000..859776b94c --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestFlinkSplitPlanner.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read; + +import org.apache.amoro.flink.read.hybrid.reader.TestRowDataReaderFunction; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.scan.ChangeTableIncrementalScan; +import org.apache.iceberg.Snapshot; +import org.junit.Assert; +import org.junit.Test; + +import java.io.IOException; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +public class TestFlinkSplitPlanner extends TestRowDataReaderFunction { + + @Test + public void testPlanSplitFromKeyedTable() { + testKeyedTable.baseTable().refresh(); + testKeyedTable.changeTable().refresh(); + List splitList = + FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger()); + Assert.assertEquals(7, splitList.size()); + } + + @Test + public void testIncrementalChangelog() throws IOException { + testKeyedTable.baseTable().refresh(); + testKeyedTable.changeTable().refresh(); + List splitList = + FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger()); + + Assert.assertEquals(7, splitList.size()); + + long startSnapshotId = testKeyedTable.changeTable().currentSnapshot().snapshotId(); + writeUpdate(); + testKeyedTable.changeTable().refresh(); + + Snapshot snapshot = testKeyedTable.changeTable().snapshot(startSnapshotId); + long fromSequence = snapshot.sequenceNumber(); + + long nowSnapshotId = testKeyedTable.changeTable().currentSnapshot().snapshotId(); + ChangeTableIncrementalScan changeTableScan = + testKeyedTable + .changeTable() + .newScan() + .useSnapshot(nowSnapshotId) + .fromSequence(fromSequence); + + List changeSplits = + FlinkSplitPlanner.planChangeTable(changeTableScan, new AtomicInteger()); + + Assert.assertEquals(1, changeSplits.size()); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestMixedFormatSource.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestMixedFormatSource.java new file mode 100644 index 0000000000..ecdf4c47e9 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestMixedFormatSource.java @@ -0,0 +1,1128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read; + +import static org.apache.amoro.MockAmoroManagementServer.TEST_CATALOG_NAME; +import static org.apache.amoro.MockAmoroManagementServer.TEST_DB_NAME; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE_EARLIEST; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE_LATEST; +import static org.apache.flink.util.Preconditions.checkArgument; +import static org.apache.flink.util.Preconditions.checkNotNull; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.flink.read.hybrid.reader.ReaderFunction; +import org.apache.amoro.flink.read.hybrid.reader.RowDataReaderFunction; +import org.apache.amoro.flink.read.hybrid.reader.TestRowDataReaderFunction; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.source.DataIterator; +import org.apache.amoro.flink.read.source.MixedFormatScanContext; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.flink.util.MixedFormatUtils; +import org.apache.amoro.flink.write.FlinkSink; +import org.apache.amoro.mixed.MixedFormatCatalog; +import org.apache.amoro.table.KeyedTable; +import org.apache.amoro.table.MixedTable; +import org.apache.amoro.table.TableIdentifier; +import org.apache.amoro.table.UnkeyedTable; +import org.apache.amoro.utils.TableFileUtil; +import org.apache.flink.api.common.JobID; +import org.apache.flink.api.common.JobStatus; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.restartstrategy.RestartStrategies; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.runtime.highavailability.nonha.embedded.HaLeadershipControl; +import 
org.apache.flink.runtime.minicluster.MiniCluster; +import org.apache.flink.runtime.minicluster.RpcServiceSharing; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSource; +import org.apache.flink.streaming.api.datastream.DataStreamUtils; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.ChainingStrategy; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.operators.collect.ClientAndIterator; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.test.util.MiniClusterWithClientResource; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.CloseableIterator; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFiles; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.types.Types; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.Serializable; +import java.time.Duration; +import 
java.time.ZoneOffset; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.atomic.AtomicInteger; + +public class TestMixedFormatSource extends TestRowDataReaderFunction implements Serializable { + private static final Logger LOG = LoggerFactory.getLogger(TestMixedFormatSource.class); + private static final long serialVersionUID = 7418812854449034756L; + private static final int PARALLELISM = 1; + + @Rule + public final MiniClusterWithClientResource miniClusterResource = + new MiniClusterWithClientResource( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(1) + .setNumberSlotsPerTaskManager(PARALLELISM) + .setRpcServiceSharing(RpcServiceSharing.DEDICATED) + .withHaLeadershipControl() + .build()); + + protected KeyedTable testFailoverTable; + protected static final String SINK_TABLE_NAME = "test_sink_exactly_once"; + protected static final TableIdentifier FAIL_TABLE_ID = + TableIdentifier.of( + TableTestHelper.TEST_CATALOG_NAME, TableTestHelper.TEST_DB_NAME, SINK_TABLE_NAME); + + @Before + public void testSetup() throws IOException { + MixedFormatCatalog testCatalog = getMixedFormatCatalog(); + + String db = FAIL_TABLE_ID.getDatabase(); + if (!testCatalog.listDatabases().contains(db)) { + testCatalog.createDatabase(db); + } + + if (!testCatalog.tableExists(FAIL_TABLE_ID)) { + testFailoverTable = + testCatalog + .newTableBuilder(FAIL_TABLE_ID, TABLE_SCHEMA) + .withPartitionSpec(BasicTableTestHelper.SPEC) + .withPrimaryKeySpec(BasicTableTestHelper.PRIMARY_KEY_SPEC) + .create() + .asKeyedTable(); + } + } + + @After + public void dropTable() { + miniClusterResource.cancelAllJobs(); + getMixedFormatCatalog().dropTable(FAIL_TABLE_ID, true); + 
getMixedFormatCatalog().dropTable(TableTestHelper.TEST_TABLE_ID, true); + getMixedFormatCatalog().dropDatabase(TableTestHelper.TEST_DB_NAME); + } + + @Test + public void testMixedFormatSourceStatic() throws Exception { + MixedFormatSource mixedFormatSource = initMixedFormatSource(false); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + // enable checkpoint + env.enableCheckpointing(3000); + // set the source parallelism to 4 + final CloseableIterator resultIterator = + env.fromSource( + mixedFormatSource, WatermarkStrategy.noWatermarks(), "MixedFormatParallelSource") + .setParallelism(PARALLELISM) + .executeAndCollect(); + + List actualResult = new ArrayList<>(); + + resultIterator.forEachRemaining( + row -> { + GenericRowData rowData = convert(row); + actualResult.add(rowData); + }); + RowData[] expected = expectedAfterMOR(); + assertArrayEquals(expected, actualResult); + } + + @Test + public void testMixedFormatSourceStaticJobManagerFailover() throws Exception { + testMixedFormatSource(FailoverType.JM); + } + + @Test + public void testMixedFormatSourceStaticTaskManagerFailover() throws Exception { + testMixedFormatSource(FailoverType.TM); + } + + public void testMixedFormatSource(FailoverType failoverType) throws Exception { + List expected = new ArrayList<>(expectedCollection()); + List updated = updateRecords(); + writeUpdate(updated); + List records = generateRecords(2, 1); + writeUpdate(records); + expected.addAll(updated); + expected.addAll(records); + + MixedFormatSource mixedFormatSource = initMixedFormatSource(false); + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + // enable checkpoint + env.enableCheckpointing(1000); + env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0)); + + DataStream input = + env.fromSource( + mixedFormatSource, WatermarkStrategy.noWatermarks(), "MixedFormatParallelSource") + .setParallelism(PARALLELISM); + + List expectedAfterMoR = 
new ArrayList<>(mor(expected)); + DataStream streamFailingInTheMiddleOfReading = + RecordCounterToFail.wrapWithFailureAfter(input, expectedAfterMoR.size() / 2); + + FlinkSink.forRowData(streamFailingInTheMiddleOfReading) + .context(Optional::of) + .table(testFailoverTable) + .tableLoader(MixedFormatTableLoader.of(FAIL_TABLE_ID, catalogBuilder)) + .flinkSchema(FLINK_SCHEMA) + .build(); + + JobClient jobClient = env.executeAsync("Bounded Mixed-Format Source Failover Test"); + JobID jobId = jobClient.getJobID(); + + RecordCounterToFail.waitToFail(); + triggerFailover( + failoverType, + jobId, + RecordCounterToFail::continueProcessing, + miniClusterResource.getMiniCluster()); + + assertRecords(testFailoverTable, expectedAfterMoR, Duration.ofMillis(10), 12000); + } + + @Test + public void testDimTaskManagerFailover() throws Exception { + List updated = updateRecords(); + writeUpdate(updated); + List records = generateRecords(2, 1); + writeUpdate(records); + + MixedFormatSource mixedFormatSource = initMixedFormatDimSource(true); + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + // enable checkpoint + env.enableCheckpointing(1000); + env.setRestartStrategy(RestartStrategies.fixedDelayRestart(10, 0)); + + DataStream input = + env.fromSource( + mixedFormatSource, WatermarkStrategy.noWatermarks(), "MixedFormatParallelSource") + .setParallelism(PARALLELISM); + + WatermarkAwareFailWrapper.wrapWithFailureAfter(input); + + JobClient jobClient = env.executeAsync("Dim Mixed-Format Source Failover Test"); + JobID jobId = jobClient.getJobID(); + + WatermarkAwareFailWrapper.waitToFail(); + triggerFailover( + FailoverType.TM, + jobId, + WatermarkAwareFailWrapper::continueProcessing, + miniClusterResource.getMiniCluster()); + + while (WatermarkAwareFailWrapper.watermarkCounter.get() != PARALLELISM) { + Thread.sleep(1000); + LOG.info("wait for watermark after failover"); + } + Assert.assertEquals(Long.MAX_VALUE, 
WatermarkAwareFailWrapper.getWatermarkAfterFailover()); + } + + @Test + public void testMixedFormatContinuousSource() throws Exception { + MixedFormatSource mixedFormatSource = initMixedFormatSource(true); + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + // enable checkpoint + env.enableCheckpointing(1000); + ClientAndIterator clientAndIterator = + executeAndCollectWithClient(env, mixedFormatSource); + + JobClient jobClient = clientAndIterator.client; + + List actualResult = + collectRecordsFromUnboundedStream(clientAndIterator, excepts().length); + + assertArrayEquals(excepts(), actualResult); + + LOG.info( + "begin write update_before update_after data and commit new snapshot to change table."); + writeUpdate(); + + actualResult = collectRecordsFromUnboundedStream(clientAndIterator, excepts2().length); + + assertArrayEquals(excepts2(), actualResult); + jobClient.cancel(); + } + + @Test + public void testMixedFormatContinuousSourceWithEmptyChangeInInit() throws Exception { + TableIdentifier tableId = + TableIdentifier.of(TEST_CATALOG_NAME, TEST_DB_NAME, "test_empty_change"); + KeyedTable table = + getMixedFormatCatalog() + .newTableBuilder(tableId, TABLE_SCHEMA) + .withPartitionSpec(BasicTableTestHelper.SPEC) + .withPrimaryKeySpec(BasicTableTestHelper.PRIMARY_KEY_SPEC) + .create() + .asKeyedTable(); + + TaskWriter taskWriter = createTaskWriter(true); + List baseData = + new ArrayList() { + { + add( + GenericRowData.ofKind( + RowKind.INSERT, + 1, + StringData.fromString("john"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + add( + GenericRowData.ofKind( + RowKind.INSERT, + 2, + StringData.fromString("lily"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + add( + GenericRowData.ofKind( + RowKind.INSERT, + 3, + StringData.fromString("jake"), + LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT.plusDays(1)))); + add( + 
GenericRowData.ofKind( + RowKind.INSERT, + 4, + StringData.fromString("sam"), + LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT.plusDays(1)))); + } + }; + for (RowData record : baseData) { + taskWriter.write(record); + } + commit(table, taskWriter.complete(), true); + + MixedFormatSource mixedFormatSource = + initMixedFormatSource(true, SCAN_STARTUP_MODE_EARLIEST, tableId); + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + // enable checkpoint + env.enableCheckpointing(1000); + ClientAndIterator clientAndIterator = + executeAndCollectWithClient(env, mixedFormatSource); + + JobClient jobClient = clientAndIterator.client; + + List actualResult = + collectRecordsFromUnboundedStream(clientAndIterator, baseData.size()); + + Assert.assertEquals(new HashSet<>(baseData), new HashSet<>(actualResult)); + + LOG.info( + "begin write update_before update_after data and commit new snapshot to change table."); + writeUpdate(updateRecords(), table); + writeUpdate(updateRecords(), table); + + actualResult = collectRecordsFromUnboundedStream(clientAndIterator, excepts2().length * 2); + jobClient.cancel(); + + Assert.assertEquals(new HashSet<>(updateRecords()), new HashSet<>(actualResult)); + getMixedFormatCatalog().dropTable(tableId, true); + } + + @Test + public void testMixedFormatSourceEnumeratorWithChangeExpired() throws Exception { + final String maxContinuousEmptyCommits = "flink.max-continuous-empty-commits"; + TableIdentifier tableId = TableIdentifier.of(TEST_CATALOG_NAME, TEST_DB_NAME, "test_keyed_tb"); + KeyedTable table = + getMixedFormatCatalog() + .newTableBuilder(tableId, TABLE_SCHEMA) + .withProperty(maxContinuousEmptyCommits, "1") + .withPrimaryKeySpec(BasicTableTestHelper.PRIMARY_KEY_SPEC) + .create() + .asKeyedTable(); + + TaskWriter taskWriter = createTaskWriter(table, false); + List changeData = + new ArrayList() { + { + add( + GenericRowData.ofKind( + RowKind.INSERT, + 1, + 
StringData.fromString("john"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + add( + GenericRowData.ofKind( + RowKind.INSERT, + 2, + StringData.fromString("lily"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + add( + GenericRowData.ofKind( + RowKind.INSERT, + 3, + StringData.fromString("jake"), + LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT.plusDays(1)))); + add( + GenericRowData.ofKind( + RowKind.INSERT, + 4, + StringData.fromString("sam"), + LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT.plusDays(1)))); + } + }; + for (RowData record : changeData) { + taskWriter.write(record); + } + + List changeDataFiles = new ArrayList<>(); + WriteResult result = taskWriter.complete(); + changeDataFiles.addAll(Arrays.asList(result.dataFiles())); + commit(table, result, false); + + for (DataFile dataFile : changeDataFiles) { + Assert.assertTrue(table.io().exists(dataFile.path().toString())); + } + + final Duration monitorInterval = Duration.ofSeconds(1); + MixedFormatSource mixedFormatSource = + initMixedFormatSourceWithMonitorInterval( + true, SCAN_STARTUP_MODE_EARLIEST, tableId, monitorInterval); + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + // enable checkpoint + env.enableCheckpointing(1000); + ClientAndIterator clientAndIterator = + executeAndCollectWithClient(env, mixedFormatSource); + + JobClient jobClient = clientAndIterator.client; + + List actualResult = + collectRecordsFromUnboundedStream(clientAndIterator, changeData.size()); + Assert.assertEquals(new HashSet<>(changeData), new HashSet<>(actualResult)); + + // expire changeTable snapshots + DeleteFiles deleteFiles = table.changeTable().newDelete(); + for (DataFile dataFile : changeDataFiles) { + Assert.assertTrue(table.io().exists(dataFile.path().toString())); + deleteFiles.deleteFile(dataFile); + } + deleteFiles.commit(); + + 
LOG.info("commit empty snapshot"); + AppendFiles changeAppend = table.changeTable().newAppend(); + changeAppend.commit(); + + final long timeWait = (monitorInterval.toMillis() * 2); + LOG.info("try sleep {}, wait snapshot expired and scan the empty snapshot.", timeWait); + Thread.sleep(timeWait); + + expireSnapshots(table.changeTable(), System.currentTimeMillis(), new HashSet<>()); + + writeUpdate(updateRecords(), table); + writeUpdate(updateRecords(), table); + + actualResult = collectRecordsFromUnboundedStream(clientAndIterator, excepts2().length * 2); + jobClient.cancel(); + + Assert.assertEquals(new HashSet<>(updateRecords()), new HashSet<>(actualResult)); + getMixedFormatCatalog().dropTable(tableId, true); + } + + @Test + public void testMixedFormatSourceEnumeratorWithBaseExpired() throws Exception { + final String maxContinuousEmptyCommits = "flink.max-continuous-empty-commits"; + TableIdentifier tableId = TableIdentifier.of(TEST_CATALOG_NAME, TEST_DB_NAME, "test_keyed_tb"); + KeyedTable table = + getMixedFormatCatalog() + .newTableBuilder(tableId, TABLE_SCHEMA) + .withProperty(maxContinuousEmptyCommits, "1") + .withPrimaryKeySpec(BasicTableTestHelper.PRIMARY_KEY_SPEC) + .create() + .asKeyedTable(); + + TaskWriter taskWriter = createTaskWriter(table, true); + List baseData = + new ArrayList() { + { + add( + GenericRowData.ofKind( + RowKind.INSERT, + 1, + StringData.fromString("john"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + add( + GenericRowData.ofKind( + RowKind.INSERT, + 2, + StringData.fromString("lily"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + add( + GenericRowData.ofKind( + RowKind.INSERT, + 3, + StringData.fromString("jake"), + LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT.plusDays(1)))); + add( + GenericRowData.ofKind( + RowKind.INSERT, + 4, + StringData.fromString("sam"), + LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), + 
TimestampData.fromLocalDateTime(LDT.plusDays(1)))); + } + }; + for (RowData record : baseData) { + taskWriter.write(record); + } + + List baseDataFiles = new ArrayList<>(); + WriteResult result = taskWriter.complete(); + baseDataFiles.addAll(Arrays.asList(result.dataFiles())); + commit(table, result, true); + + for (DataFile dataFile : baseDataFiles) { + Assert.assertTrue(table.io().exists(dataFile.path().toString())); + } + + final Duration monitorInterval = Duration.ofSeconds(1); + MixedFormatSource mixedFormatSource = + initMixedFormatSourceWithMonitorInterval( + true, SCAN_STARTUP_MODE_EARLIEST, tableId, monitorInterval); + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + // enable checkpoint + env.enableCheckpointing(1000); + ClientAndIterator clientAndIterator = + executeAndCollectWithClient(env, mixedFormatSource); + + JobClient jobClient = clientAndIterator.client; + + List actualResult = + collectRecordsFromUnboundedStream(clientAndIterator, baseData.size()); + Assert.assertEquals(new HashSet<>(baseData), new HashSet<>(actualResult)); + + // expire baseTable snapshots + DeleteFiles deleteFiles = table.baseTable().newDelete(); + for (DataFile dataFile : baseDataFiles) { + Assert.assertTrue(table.io().exists(dataFile.path().toString())); + deleteFiles.deleteFile(dataFile); + } + deleteFiles.commit(); + + LOG.info("commit empty snapshot"); + AppendFiles changeAppend = table.changeTable().newAppend(); + changeAppend.commit(); + + final long timeWait = (monitorInterval.toMillis() * 2); + LOG.info("try sleep {}, wait snapshot expired and scan the empty snapshot.", timeWait); + Thread.sleep(timeWait); + + expireSnapshots(table.baseTable(), System.currentTimeMillis(), new HashSet<>()); + + writeUpdate(updateRecords(), table); + writeUpdate(updateRecords(), table); + + actualResult = collectRecordsFromUnboundedStream(clientAndIterator, excepts2().length * 2); + jobClient.cancel(); + + Assert.assertEquals(new 
HashSet<>(updateRecords()), new HashSet<>(actualResult)); + getMixedFormatCatalog().dropTable(tableId, true); + } + + @Test + public void testLatestStartupMode() throws Exception { + MixedFormatSource mixedFormatSource = initMixedFormatSourceWithLatest(); + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + // enable checkpoint + env.enableCheckpointing(1000); + + ClientAndIterator clientAndIterator = + executeAndCollectWithClient(env, mixedFormatSource); + + JobClient jobClient = clientAndIterator.client; + + while (true) { + if (JobStatus.RUNNING == jobClient.getJobStatus().get()) { + Thread.sleep(500); + LOG.info( + "begin write update_before update_after data and commit new snapshot to change table."); + writeUpdate(); + break; + } + Thread.sleep(100); + } + + List actualResult = + collectRecordsFromUnboundedStream(clientAndIterator, excepts2().length); + + assertArrayEquals(excepts2(), actualResult); + jobClient.cancel(); + } + + @Test + public void testMixedFormatContinuousSourceJobManagerFailover() throws Exception { + testMixedFormatContinuousSource(FailoverType.JM); + } + + @Test + public void testMixedFormatContinuousSourceTaskManagerFailover() throws Exception { + testMixedFormatContinuousSource(FailoverType.TM); + } + + public void testMixedFormatContinuousSource(final FailoverType failoverType) throws Exception { + List expected = new ArrayList<>(Arrays.asList(excepts())); + writeUpdate(); + expected.addAll(Arrays.asList(excepts2())); + + MixedFormatSource mixedFormatSource = initMixedFormatSource(true); + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + // enable checkpoint + env.enableCheckpointing(1000); + // env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0)); + + DataStream input = + env.fromSource( + mixedFormatSource, WatermarkStrategy.noWatermarks(), "MixedFormatParallelSource") + .setParallelism(PARALLELISM); + + FlinkSink.forRowData(input) + 
.context(Optional::of) + .table(testFailoverTable) + .tableLoader(MixedFormatTableLoader.of(FAIL_TABLE_ID, catalogBuilder)) + .flinkSchema(FLINK_SCHEMA) + .build(); + + JobClient jobClient = env.executeAsync("Unbounded Mixed-Format Source Failover Test"); + JobID jobId = jobClient.getJobID(); + + for (int i = 1; i < 5; i++) { + Thread.sleep(10); + List records = generateRecords(2, i); + expected.addAll(records); + writeUpdate(records); + if (i == 2) { + triggerFailover(failoverType, jobId, () -> {}, miniClusterResource.getMiniCluster()); + } + } + + // wait longer for continuous source to reduce flakiness + // because CI servers tend to be overloaded. + assertRecords(testFailoverTable, expected, Duration.ofMillis(10), 12000); + jobClient.cancel(); + } + + private void assertRecords( + KeyedTable testFailoverTable, + List expected, + Duration checkInterval, + int maxCheckCount) + throws InterruptedException { + for (int i = 0; i < maxCheckCount; ++i) { + if (equalsRecords(expected, tableRecords(testFailoverTable), testFailoverTable.schema())) { + break; + } else { + Thread.sleep(checkInterval.toMillis()); + } + } + // success or failure, assert on the latest table state + equalsRecords(expected, tableRecords(testFailoverTable), testFailoverTable.schema()); + } + + private boolean equalsRecords(List expected, List tableRecords, Schema schema) { + try { + RowData[] expectedArray = sortRowDataCollection(expected); + RowData[] actualArray = sortRowDataCollection(tableRecords); + Assert.assertArrayEquals(expectedArray, actualArray); + return true; + } catch (Throwable e) { + return false; + } + } + + public static List tableRecords(final KeyedTable keyedTable) { + keyedTable.refresh(); + List mixedFormatSplits = + FlinkSplitPlanner.planFullTable(keyedTable, new AtomicInteger(0)); + + RowDataReaderFunction rowDataReaderFunction = + new RowDataReaderFunction( + new Configuration(), + keyedTable.schema(), + keyedTable.schema(), + keyedTable.primaryKeySpec(), + null, + true, 
+ keyedTable.io()); + + List actual = new ArrayList<>(); + mixedFormatSplits.forEach( + split -> { + LOG.info("Mixed format split: {}.", split); + DataIterator dataIterator = rowDataReaderFunction.createDataIterator(split); + while (dataIterator.hasNext()) { + RowData rowData = dataIterator.next(); + LOG.info("{}", rowData); + actual.add(rowData); + } + }); + return actual; + } + + private List generateRecords(int numRecords, int index) { + int pk = 100; + List records = new ArrayList<>(numRecords); + for (int i = index; i < numRecords + index; i++) { + records.add( + GenericRowData.ofKind( + RowKind.INSERT, + pk + index, + StringData.fromString("jo" + index + i), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + records.add( + GenericRowData.ofKind( + RowKind.DELETE, + pk + index, + StringData.fromString("jo" + index + i), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + } + return records; + } + + // ------------------------------------------------------------------------ + // test utilities + // ------------------------------------------------------------------------ + + private enum FailoverType { + NONE, + TM, + JM + } + + private static void triggerFailover( + FailoverType type, JobID jobId, Runnable afterFailAction, MiniCluster miniCluster) + throws Exception { + switch (type) { + case NONE: + afterFailAction.run(); + break; + case TM: + restartTaskManager(afterFailAction, miniCluster); + break; + case JM: + triggerJobManagerFailover(jobId, afterFailAction, miniCluster); + break; + } + } + + private static void triggerJobManagerFailover( + JobID jobId, Runnable afterFailAction, MiniCluster miniCluster) throws Exception { + final HaLeadershipControl haLeadershipControl = miniCluster.getHaLeadershipControl().get(); + haLeadershipControl.revokeJobMasterLeadership(jobId).get(); + afterFailAction.run(); + haLeadershipControl.grantJobMasterLeadership(jobId).get(); + } + + private static void 
restartTaskManager(Runnable afterFailAction, MiniCluster miniCluster) + throws Exception { + miniCluster.terminateTaskManager(0).get(); + afterFailAction.run(); + miniCluster.startTaskManager(); + } + + private List collectRecordsFromUnboundedStream( + final ClientAndIterator client, final int numElements) { + + checkNotNull(client, "client"); + checkArgument(numElements > 0, "numElement must be > 0"); + + final ArrayList result = new ArrayList<>(numElements); + final Iterator iterator = client.iterator; + + CollectTask collectTask = new CollectTask(result, iterator, numElements); + new Thread(collectTask).start(); + + long start = System.currentTimeMillis(); + final long timeout = 60 * 1000; + long intervalOneSecond = 1; + while (collectTask.running) { + // TODO a more proper timeout strategy? + long timeFlies = System.currentTimeMillis() - start; + if (timeFlies / 1000 >= intervalOneSecond) { + LOG.info("Time flies: {} ms.", timeFlies); + intervalOneSecond++; + } + if (System.currentTimeMillis() - start > timeout) { + LOG.error( + "This task [{}] try to collect records from unbounded stream but timeout {}. As of now, collect result:{}.", + client.client.getJobID().toString(), + timeout, + result.toArray()); + break; + } + } + + Assert.assertEquals( + String.format( + "The stream ended before reaching the requested %d records. 
Only %d records were received, received list:%s.", + numElements, result.size(), Arrays.toString(result.toArray())), + numElements, + result.size()); + + return result; + } + + private static class CollectTask implements Runnable { + final ArrayList result; + final Iterator iterator; + final int limit; + + boolean running = true; + + public CollectTask(ArrayList result, Iterator iterator, int limit) { + this.result = result; + this.iterator = iterator; + this.limit = limit; + } + + @Override + public void run() { + while (iterator.hasNext()) { + result.add(convert(iterator.next())); + if (result.size() == limit) { + running = false; + return; + } + } + } + } + + private ClientAndIterator executeAndCollectWithClient( + StreamExecutionEnvironment env, MixedFormatSource mixedFormatSource) + throws Exception { + final DataStreamSource source = + env.fromSource( + mixedFormatSource, WatermarkStrategy.noWatermarks(), "MixedFormatParallelSource") + .setParallelism(PARALLELISM); + return DataStreamUtils.collectWithClient(source, "job_" + name.getMethodName()); + } + + private static GenericRowData convert(RowData row) { + GenericRowData rowData = new GenericRowData(row.getRowKind(), row.getArity()); + rowData.setField(0, row.getInt(0)); + rowData.setField(1, row.getString(1)); + rowData.setField(2, row.getLong(2)); + rowData.setField(3, row.getTimestamp(3, 6)); + return rowData; + } + + private static void expireSnapshots( + UnkeyedTable tableStore, long olderThan, Set exclude) { + LOG.debug("start expire snapshots, the exclude is {}", exclude); + final AtomicInteger toDeleteFiles = new AtomicInteger(0); + final AtomicInteger deleteFiles = new AtomicInteger(0); + Set parentDirectory = new HashSet<>(); + tableStore + .expireSnapshots() + .retainLast(1) + .expireOlderThan(olderThan) + .deleteWith( + file -> { + try { + if (!exclude.contains(file) + && !exclude.contains(new Path(file).getParent().toString())) { + tableStore.io().deleteFile(file); + } + parentDirectory.add(new 
                Path(file).getParent().toString());
                deleteFiles.incrementAndGet();
              } catch (Throwable t) {
                // Best-effort deletion: log and keep expiring the remaining files.
                LOG.warn("failed to delete file {}", file, t);
              } finally {
                toDeleteFiles.incrementAndGet();
              }
            })
        .cleanExpiredFiles(true)
        .commit();
    // Remove directories left empty by the file deletions above.
    parentDirectory.forEach(
        parent -> TableFileUtil.deleteEmptyDirectory(tableStore.io(), parent, exclude));
    LOG.info("to delete {} files, success delete {} files", toDeleteFiles.get(), deleteFiles.get());
  }

  /** Builds a source over the shared test table starting from the EARLIEST snapshot. */
  private MixedFormatSource initMixedFormatSource(boolean isStreaming) {
    return initMixedFormatSource(isStreaming, SCAN_STARTUP_MODE_EARLIEST);
  }

  /** Builds a streaming source that only reads data committed after the job starts. */
  private MixedFormatSource initMixedFormatSourceWithLatest() {
    return initMixedFormatSource(true, SCAN_STARTUP_MODE_LATEST);
  }

  /** Builds a source over {@code testKeyedTable} with the given startup mode. */
  private MixedFormatSource initMixedFormatSource(
      boolean isStreaming, String scanStartupMode) {
    MixedFormatTableLoader tableLoader = initLoader();
    MixedFormatScanContext mixedFormatScanContext =
        initMixedFormatScanContext(isStreaming, scanStartupMode);
    ReaderFunction rowDataReaderFunction = initRowDataReadFunction();
    TypeInformation typeInformation =
        InternalTypeInfo.of(FlinkSchemaUtil.convert(testKeyedTable.schema()));

    return new MixedFormatSource<>(
        tableLoader,
        mixedFormatScanContext,
        rowDataReaderFunction,
        typeInformation,
        testKeyedTable.name(),
        false); // not a dim table: no watermark column appended
  }

  /** Builds a source over an arbitrary table with an explicit change-log monitor interval. */
  private MixedFormatSource initMixedFormatSourceWithMonitorInterval(
      boolean isStreaming,
      String scanStartupMode,
      TableIdentifier tableIdentifier,
      Duration monitorInterval) {
    MixedFormatTableLoader tableLoader = MixedFormatTableLoader.of(tableIdentifier, catalogBuilder);
    MixedFormatScanContext mixedFormatScanContext =
        initMixedFormatScanContext(isStreaming, scanStartupMode, monitorInterval);
    MixedTable table = MixedFormatUtils.loadMixedTable(tableLoader);
    ReaderFunction rowDataReaderFunction = initRowDataReadFunction(table.asKeyedTable());
    TypeInformation typeInformation =
        InternalTypeInfo.of(FlinkSchemaUtil.convert(table.schema()));

    return new MixedFormatSource<>(
        tableLoader,
        mixedFormatScanContext,
        rowDataReaderFunction,
        typeInformation,
        table.name(),
        false);
  }

  /** Convenience overload using the default 500 ms monitor interval. */
  private MixedFormatSource initMixedFormatSource(
      boolean isStreaming, String scanStartupMode, TableIdentifier tableIdentifier) {
    return initMixedFormatSourceWithMonitorInterval(
        isStreaming, scanStartupMode, tableIdentifier, Duration.ofMillis(500));
  }

  /**
   * Builds a dim-table source: the read schema is joined with an extra "opt" timestamp column so
   * the source can emit watermarks for dimension-table joins.
   */
  private MixedFormatSource initMixedFormatDimSource(boolean isStreaming) {
    MixedFormatTableLoader tableLoader = initLoader();
    MixedFormatScanContext mixedFormatScanContext =
        initMixedFormatScanContext(isStreaming, SCAN_STARTUP_MODE_EARLIEST);
    ReaderFunction rowDataReaderFunction = initRowDataReadFunction();
    Schema schema = testKeyedTable.schema();
    Schema schemaWithWm =
        TypeUtil.join(
            schema,
            new Schema(Types.NestedField.of(-1, true, "opt", Types.TimestampType.withoutZone())));
    TypeInformation typeInformation =
        InternalTypeInfo.of(FlinkSchemaUtil.convert(schemaWithWm));

    return new MixedFormatSource<>(
        tableLoader,
        mixedFormatScanContext,
        rowDataReaderFunction,
        typeInformation,
        testKeyedTable.name(),
        true); // dim table: watermark column appended
  }

  private RowDataReaderFunction initRowDataReadFunction() {
    return initRowDataReadFunction(testKeyedTable);
  }

  /** Creates a merge-on-read reader over the table's own schema (no projection, no filter). */
  private RowDataReaderFunction initRowDataReadFunction(KeyedTable keyedTable) {
    return new RowDataReaderFunction(
        new Configuration(),
        keyedTable.schema(),
        keyedTable.schema(),
        keyedTable.primaryKeySpec(),
        null,
        true,
        keyedTable.io());
  }

  private MixedFormatScanContext initMixedFormatScanContext(
      boolean isStreaming, String scanStartupMode, Duration monitorInterval) {
    return MixedFormatScanContext.contextBuilder()
        .streaming(isStreaming)
        .scanStartupMode(scanStartupMode)
        .monitorInterval(monitorInterval)
        .build();
  }

  /** Overload with the default 500 ms monitor interval. */
  private MixedFormatScanContext initMixedFormatScanContext(
      boolean isStreaming,
      String scanStartupMode) {
    return MixedFormatScanContext.contextBuilder()
        .streaming(isStreaming)
        .scanStartupMode(scanStartupMode)
        .monitorInterval(Duration.ofMillis(500))
        .build();
  }

  private MixedFormatTableLoader initLoader() {
    return MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder);
  }

  // ------------------------------------------------------------------------
  //  mini cluster failover utilities
  // ------------------------------------------------------------------------

  /**
   * Wraps a stream so it signals after {@code failAfter} records, then blocks until the test
   * triggers the failover and calls {@link #continueProcessing()}. State is static because the
   * mapper is serialized into the job graph and runs in another JVM thread.
   */
  private static class RecordCounterToFail {

    private static AtomicInteger records;
    private static CompletableFuture fail;
    private static CompletableFuture continueProcessing;

    private static DataStream wrapWithFailureAfter(DataStream stream, int failAfter) {

      records = new AtomicInteger();
      fail = new CompletableFuture<>();
      continueProcessing = new CompletableFuture<>();
      return stream.map(
          record -> {
            final boolean halfOfInputIsRead = records.incrementAndGet() > failAfter;
            final boolean notFailedYet = !fail.isDone();
            if (notFailedYet && halfOfInputIsRead) {
              // Signal the test thread, then park until it has triggered the failover.
              fail.complete(null);
              continueProcessing.get();
            }
            return record;
          });
    }

    private static void waitToFail() throws ExecutionException, InterruptedException {
      fail.get();
    }

    private static void continueProcessing() {
      continueProcessing.complete(null);
    }
  }

  /**
   * Watermark-aware variant of the failure wrapper: records the first watermark observed after
   * failover so the test can assert the source re-emits Long.MAX_VALUE.
   */
  private static class WatermarkAwareFailWrapper {

    private static WatermarkFailoverTestOperator op;
    // First (minimum) watermark timestamp seen after the failover; -1 until one arrives.
    private static long watermarkAfterFailover = -1;
    // Number of subtasks that reported a post-failover watermark.
    private static final AtomicInteger watermarkCounter = new AtomicInteger(0);

    public static long getWatermarkAfterFailover() {
      return watermarkAfterFailover;
    }

    private static DataStream wrapWithFailureAfter(DataStream stream) {
      op = new WatermarkFailoverTestOperator();
      return stream.transform("watermark failover", TypeInformation.of(RowData.class), op);
    }

    private static void waitToFail() throws InterruptedException {
+ op.waitToFail(); + } + + private static void continueProcessing() { + op.continueProcessing(); + } + + static class WatermarkFailoverTestOperator extends AbstractStreamOperator + implements OneInputStreamOperator { + + private static final long serialVersionUID = 1L; + private static boolean fail = false; + private static boolean failoverHappened = false; + + public WatermarkFailoverTestOperator() { + super(); + chainingStrategy = ChainingStrategy.ALWAYS; + } + + private void waitToFail() throws InterruptedException { + while (!fail) { + LOG.info("Waiting to fail"); + Thread.sleep(1000); + } + } + + private void continueProcessing() { + failoverHappened = true; + LOG.info("failover happened"); + } + + @Override + public void open() throws Exception { + super.open(); + } + + @Override + public void processElement(StreamRecord element) throws Exception { + output.collect(element); + } + + @Override + public void processWatermark(Watermark mark) throws Exception { + LOG.info("processWatermark: {}", mark); + if (!failoverHappened && mark.getTimestamp() > 0) { + fail = true; + } + if (failoverHappened) { + LOG.info("failover happened, watermark: {}", mark); + Assert.assertEquals(Long.MAX_VALUE, mark.getTimestamp()); + if (watermarkAfterFailover == -1) { + watermarkAfterFailover = mark.getTimestamp(); + } else { + watermarkAfterFailover = Math.min(watermarkAfterFailover, mark.getTimestamp()); + } + watermarkCounter.incrementAndGet(); + } + super.processWatermark(mark); + } + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestKafkaConsumer.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestKafkaConsumer.java new file mode 100644 index 0000000000..029ebf3e75 --- /dev/null +++ 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestKafkaConsumer.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hidden.kafka; + +import static org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate.getProperties; +import static org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate.getPropertiesWithByteArray; +import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.KAFKA_CONTAINER; +import static org.apache.kafka.clients.CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG; +import static org.apache.kafka.clients.producer.ProducerConfig.TRANSACTIONAL_ID_CONFIG; +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate; +import org.apache.amoro.flink.kafka.testutils.KafkaContainerTest; +import org.apache.amoro.flink.write.hidden.kafka.TestBaseLog; +import org.apache.amoro.flink.write.hidden.kafka.TestHiddenLogOperators; +import org.apache.flink.streaming.connectors.kafka.internals.FlinkKafkaInternalProducer; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import 
org.apache.kafka.clients.consumer.ConsumerRecords; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.TopicPartition; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.jupiter.api.Assertions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; + +public class TestKafkaConsumer extends TestBaseLog { + private static final Logger LOG = LoggerFactory.getLogger(TestKafkaConsumer.class); + + @BeforeClass + public static void prepare() throws Exception { + KAFKA_CONTAINER.start(); + } + + @AfterClass + public static void shutdown() throws Exception { + KAFKA_CONTAINER.close(); + } + + @Test + public void testTransactionalConsume() { + final String topic = "test-offset-flip"; + FlinkKafkaInternalProducer reuse = null; + final String transactionalIdPrefix = UUID.randomUUID().toString(); + try { + int numCount = 20; + Properties properties = new Properties(); + properties.put(BOOTSTRAP_SERVERS_CONFIG, KAFKA_CONTAINER.getBootstrapServers()); + properties = getProperties(KafkaConfigGenerate.getStandardProperties(properties)); + properties.put(TRANSACTIONAL_ID_CONFIG, transactionalIdPrefix + "flip"); + reuse = new FlinkKafkaInternalProducer<>(properties); + reuse.initTransactions(); + reuse.beginTransaction(); + for (int i = 1; i <= numCount; i++) { + reuse.send(new ProducerRecord<>(topic, "test-value-" + i)); + } + reuse.commitTransaction(); + int count = KafkaContainerTest.countAllRecords(topic, properties); + LOG.info("consumption = {}", count); + assertThat(count).isEqualTo(numCount); + } catch (Throwable e) { + LOG.error("error:", e); + if (reuse != null) { + reuse.abortTransaction(); + } + } finally 
{ + assert reuse != null; + reuse.close(Duration.ofMillis(1000)); + } + } + + @Test + public void testResetOffset() { + final int countNum = 20; + String topicIntern = TestHiddenLogOperators.TOPIC; + Properties properties = new Properties(); + properties.put(BOOTSTRAP_SERVERS_CONFIG, KAFKA_CONTAINER.getBootstrapServers()); + properties = getPropertiesWithByteArray(KafkaConfigGenerate.getStandardProperties(properties)); + // send + properties.put(TRANSACTIONAL_ID_CONFIG, "transactionalId1"); + FlinkKafkaInternalProducer reuse = new FlinkKafkaInternalProducer<>(properties); + reuse.initTransactions(); + reuse.beginTransaction(); + String[] expects = new String[countNum]; + for (int i = 0; i < countNum; i++) { + expects[i] = "test-value-" + i; + reuse.send(new ProducerRecord<>(TestHiddenLogOperators.TOPIC, expects[i].getBytes())); + } + reuse.commitTransaction(); + reuse.close(Duration.ofMillis(1000)); + + // read all + properties.put(ConsumerConfig.ISOLATION_LEVEL_CONFIG, "read_committed"); + KafkaConsumer consumer = new KafkaConsumer<>(properties); + Set topicPartitionList = + consumer.partitionsFor(topicIntern).stream() + .map(partitionInfo -> new TopicPartition(topicIntern, partitionInfo.partition())) + .collect(Collectors.toSet()); + TopicPartition partition0 = topicPartitionList.stream().iterator().next(); + consumer.assign(topicPartitionList); + consumer.seekToBeginning(consumer.assignment()); + ConsumerRecords consumerRecords = consumer.poll(Duration.ofMillis(1000)); + + int count = consumerRecords.count(); + assertThat(count).isEqualTo(countNum); + List actual = new ArrayList<>(); + consumerRecords.forEach(consumerRecord -> actual.add(new String(consumerRecord.value()))); + Assertions.assertArrayEquals(expects, actual.toArray(new String[0])); + + // seek + long seekOffset = 1L; + consumer.seek(partition0, seekOffset); + + consumerRecords = consumer.poll(Duration.ofMillis(1000)); + + count = consumerRecords.count(); + assertThat(count).isEqualTo(countNum - 
seekOffset); + List actualSeek = new ArrayList<>(); + consumerRecords.forEach(consumerRecord -> actualSeek.add(new String(consumerRecord.value()))); + String[] expect = Arrays.copyOfRange(expects, (int) seekOffset, countNum); + Assertions.assertArrayEquals(expect, actualSeek.toArray(new String[0])); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestKafkaSourceReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestKafkaSourceReader.java new file mode 100644 index 0000000000..2747d9ec59 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestKafkaSourceReader.java @@ -0,0 +1,266 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read.hidden.kafka; + +import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.KAFKA_CONTAINER; +import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.getPropertiesByTopic; +import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.readRecordsBytes; +import static org.apache.amoro.flink.shuffle.RowKindUtil.transformFromFlinkRowKind; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE; +import static org.junit.Assert.assertEquals; + +import org.apache.amoro.flink.kafka.testutils.KafkaContainerTest; +import org.apache.amoro.flink.read.source.log.kafka.LogKafkaPartitionSplit; +import org.apache.amoro.flink.read.source.log.kafka.LogKafkaPartitionSplitState; +import org.apache.amoro.flink.read.source.log.kafka.LogKafkaSource; +import org.apache.amoro.flink.read.source.log.kafka.LogKafkaSourceReader; +import org.apache.amoro.flink.shuffle.LogRecordV1; +import org.apache.amoro.flink.util.TestUtil; +import org.apache.amoro.flink.write.hidden.kafka.TestBaseLog; +import org.apache.amoro.flink.write.hidden.kafka.TestHiddenLogOperators; +import org.apache.amoro.log.FormatVersion; +import org.apache.amoro.log.LogData; +import org.apache.amoro.log.LogDataJsonDeserialization; +import org.apache.amoro.log.LogDataJsonSerialization; +import org.apache.amoro.utils.IdGenerator; +import org.apache.flink.api.common.eventtime.Watermark; +import org.apache.flink.api.connector.source.ReaderOutput; +import org.apache.flink.api.connector.source.SourceOutput; +import org.apache.flink.api.connector.source.SourceReader; +import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer; +import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; +import org.apache.flink.connector.testutils.source.reader.TestingReaderContext; +import org.apache.flink.table.data.RowData; +import 
org.apache.kafka.clients.consumer.ConsumerRecords; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.TopicPartition; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +public class TestKafkaSourceReader { + private static final Logger LOG = LoggerFactory.getLogger(TestKafkaSourceReader.class); + private static String topic; + private static final int KAFKA_PARTITION_NUMS = 1; + private static final int NUM_SPLITS = 1; + private static final int NUM_RECORDS_PER_SPLIT = 10; + private static final int TOTAL_NUM_RECORDS = NUM_RECORDS_PER_SPLIT * NUM_SPLITS; + + @Rule public TestName testName = new TestName(); + + private static final byte[] JOB_ID = IdGenerator.generateUpstreamId(); + + @BeforeClass + public static void prepare() throws Exception { + KAFKA_CONTAINER.start(); + } + + @AfterClass + public static void shutdown() throws Exception { + KAFKA_CONTAINER.close(); + } + + @Before + public void initData() throws Exception { + topic = TestUtil.getUtMethodName(testName); + KafkaContainerTest.createTopics(KAFKA_PARTITION_NUMS, 1, topic); + write(topic, TOTAL_NUM_RECORDS); + } + + @Test + public void testSourceReaderFailover() throws Exception { + final String groupId = "testSourceReaderFailover"; + LogKafkaSourceReader reader = (LogKafkaSourceReader) createReader(groupId); + reader.addSplits(getSplits(NUM_SPLITS)); + ValidatingSourceOutput output = new ValidatingSourceOutput(); + List splitList; + long checkpointId = 0; + do { + checkpointId++; + reader.pollNext(output); + // Create a 
checkpoint for each message consumption, but not complete them. + splitList = reader.snapshotState(checkpointId); + } while (output.count() < TOTAL_NUM_RECORDS); + + // The completion of the last checkpoint should subsume all the previous checkpoints. + assertEquals(checkpointId, reader.getOffsetsToCommit().size()); + reader.notifyCheckpointComplete(checkpointId); + + // re-create and restore + reader = (LogKafkaSourceReader) createReader(groupId); + reader.addSplits(splitList); + List currentSplitList = reader.snapshotState(checkpointId); + currentSplitList.forEach(s -> assertEquals(TOTAL_NUM_RECORDS, s.getStartingOffset())); + } + + private ProducerRecord createLogData( + String topic, + int i, + int epicNo, + boolean flip, + LogDataJsonSerialization serialization) { + RowData rowData = TestHiddenLogOperators.createRowData(i); + LogData logData = + new LogRecordV1( + FormatVersion.FORMAT_VERSION_V1, + JOB_ID, + epicNo, + flip, + transformFromFlinkRowKind(rowData.getRowKind()), + rowData); + byte[] message = serialization.serialize(logData); + int partition = 0; + ProducerRecord producerRecord = + new ProducerRecord<>(topic, partition, null, null, message); + return producerRecord; + } + + private void write(String topic, int numRecords) throws Exception { + KafkaProducer producer = KafkaContainerTest.getProducer(); + LogDataJsonSerialization serialization = + new LogDataJsonSerialization<>(TestBaseLog.USER_SCHEMA, LogRecordV1.FIELD_GETTER_FACTORY); + for (int i = 0; i < numRecords; i++) { + producer.send(createLogData(topic, 0, 1, false, serialization)); + } + printDataInTopic(topic); + } + + public static void printDataInTopic(String topic) { + ConsumerRecords consumerRecords = readRecordsBytes(topic); + LogDataJsonDeserialization deserialization = + TestBaseLog.createLogDataDeserialization(); + consumerRecords.forEach( + consumerRecord -> { + try { + LOG.info("data in kafka: {}", deserialization.deserialize(consumerRecord.value())); + } catch (IOException e) { 
+ e.printStackTrace(); + } + }); + } + + private SourceReader createReader(String groupId) { + List topics = new ArrayList<>(); + topics.add(topic); + LogKafkaSource kafkaSource = createKafkaSource(groupId, false, topics); + return kafkaSource.createReader(new TestingReaderContext()); + } + + private LogKafkaSource createKafkaSource(String groupId, boolean retract, List topics) { + Properties properties = getPropertiesByTopic(topic); + properties.put("group.id", groupId); + properties.put("auto.offset.reset", "earliest"); + + Map configuration = new HashMap<>(); + configuration.put(MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.key(), String.valueOf(retract)); + + return LogKafkaSource.builder(TestBaseLog.USER_SCHEMA, configuration) + .setTopics(topics) + .setStartingOffsets(OffsetsInitializer.earliest()) + .setProperties(properties) + .build(); + } + + protected List getSplits(int numRecordsPerSplit) { + List splits = new ArrayList<>(); + for (int i = 0; i < numRecordsPerSplit; i++) { + splits.add(getSplit(i, numRecordsPerSplit)); + } + return splits; + } + + protected LogKafkaPartitionSplit getSplit(int splitId, int numRecords) { + long stoppingOffset = KafkaPartitionSplit.NO_STOPPING_OFFSET; + KafkaPartitionSplit kafkaPartitionSplit = + new KafkaPartitionSplit(new TopicPartition(topic, splitId), 0L, stoppingOffset); + return new LogKafkaPartitionSplit(new LogKafkaPartitionSplitState(kafkaPartitionSplit)); + } + + // ---------------- helper classes ----------------- + + /** A source output that validates the output. 
*/ + public static class ValidatingSourceOutput implements ReaderOutput { + private final Set consumedValues = new HashSet<>(); + private final int max = Integer.MIN_VALUE; + private final int min = Integer.MAX_VALUE; + + private int count = 0; + + @Override + public void collect(RowData rowData) { + count++; + consumedValues.add(rowData); + } + + @Override + public void collect(RowData rowData, long timestamp) { + collect(rowData); + } + + @Override + public void emitWatermark(Watermark watermark) {} + + public void validate() { + assertEquals( + String.format("Should be %d distinct elements in total", TOTAL_NUM_RECORDS), + TOTAL_NUM_RECORDS, + consumedValues.size()); + assertEquals( + String.format("Should be %d elements in total", TOTAL_NUM_RECORDS), + TOTAL_NUM_RECORDS, + count); + assertEquals("The min value should be 0", 0, min); + assertEquals( + "The max value should be " + (TOTAL_NUM_RECORDS - 1), TOTAL_NUM_RECORDS - 1, max); + } + + public int count() { + return count; + } + + @Override + public void markIdle() {} + + @Override + public void markActive() {} + + @Override + public SourceOutput createOutputForSplit(String splitId) { + return this; + } + + @Override + public void releaseOutputForSplit(String splitId) {} + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestLogKafkaPartitionSplitReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestLogKafkaPartitionSplitReader.java new file mode 100644 index 0000000000..1c45d4abb9 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestLogKafkaPartitionSplitReader.java @@ -0,0 +1,306 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hidden.kafka; + +import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.KAFKA_CONTAINER; +import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.readRecordsBytes; +import static org.apache.amoro.flink.shuffle.RowKindUtil.transformFromFlinkRowKind; +import static org.junit.Assert.assertEquals; + +import org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate; +import org.apache.amoro.flink.kafka.testutils.KafkaContainerTest; +import org.apache.amoro.flink.read.source.log.LogSourceHelper; +import org.apache.amoro.flink.read.source.log.kafka.LogKafkaPartitionSplitReader; +import org.apache.amoro.flink.read.source.log.kafka.LogRecordWithRetractInfo; +import org.apache.amoro.flink.shuffle.LogRecordV1; +import org.apache.amoro.flink.write.hidden.kafka.TestBaseLog; +import org.apache.amoro.flink.write.hidden.kafka.TestHiddenLogOperators; +import org.apache.amoro.log.FormatVersion; +import org.apache.amoro.log.LogData; +import org.apache.amoro.log.LogDataJsonDeserialization; +import org.apache.amoro.log.LogDataJsonSerialization; +import org.apache.amoro.utils.IdGenerator; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import 
org.apache.flink.connector.base.source.reader.splitreader.SplitsAddition; +import org.apache.flink.connector.base.source.reader.splitreader.SplitsChange; +import org.apache.flink.connector.kafka.source.metrics.KafkaSourceReaderMetrics; +import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; +import org.apache.flink.connector.testutils.source.reader.TestingReaderContext; +import org.apache.flink.metrics.groups.SourceReaderMetricGroup; +import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.flink.table.data.RowData; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.ConsumerRecords; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.TopicPartition; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +public class TestLogKafkaPartitionSplitReader { + + private static final Logger LOG = LoggerFactory.getLogger(TestLogKafkaPartitionSplitReader.class); + + public static final int TOPIC1_STOP_OFFSET = 16; + public static final int TOPIC2_STOP_OFFSET = 21; + public static final String TOPIC1 = "topic1"; + public static final String TOPIC2 = "topic2"; + private static Map> splitsByOwners; + private static final byte[] JOB_ID = IdGenerator.generateUpstreamId(); + + @BeforeClass + public static void prepare() throws Exception { + KAFKA_CONTAINER.start(); + + Map earliestOffsets = new HashMap<>(); + earliestOffsets.put(new TopicPartition(TOPIC1, 0), 0L); + earliestOffsets.put(new TopicPartition(TOPIC2, 0), 5L); + splitsByOwners = 
getSplitsByOwners(earliestOffsets); + } + + @AfterClass + public static void shutdown() throws Exception { + KAFKA_CONTAINER.close(); + } + + @Before + public void initData() throws Exception { + // |0 1 2 3 4 5 6 7 8 9 Flip 10 11 12 13 14| 15 16 17 18 19 + write(TOPIC1, 0); + // 0 0 0 0 0 |5 6 7 8 9 10 11 12 13 14 Flip 15 16 17 18 19| 20 21 22 23 24 + write(TOPIC2, 5); + } + + @Test + public void testHandleSplitChangesAndFetch() throws IOException { + LogKafkaPartitionSplitReader reader = createReader(new Properties()); + assignSplitsAndFetchUntilFinish(reader, 0, 20); + assignSplitsAndFetchUntilFinish(reader, 1, 20); + } + + private ProducerRecord createLogData( + String topic, + int i, + int epicNo, + boolean flip, + LogDataJsonSerialization serialization) { + RowData rowData = TestHiddenLogOperators.createRowData(i); + LogData logData = + new LogRecordV1( + FormatVersion.FORMAT_VERSION_V1, + JOB_ID, + epicNo, + flip, + transformFromFlinkRowKind(rowData.getRowKind()), + rowData); + byte[] message = serialization.serialize(logData); + int partition = 0; + ProducerRecord producerRecord = + new ProducerRecord<>(topic, partition, null, null, message); + return producerRecord; + } + + private void write(String topic, int offset) throws Exception { + KafkaProducer producer = KafkaContainerTest.getProducer(); + LogDataJsonSerialization serialization = + new LogDataJsonSerialization<>(TestBaseLog.USER_SCHEMA, LogRecordV1.FIELD_GETTER_FACTORY); + for (int j = 0; j < offset; j++) { + producer.send(createLogData(topic, 0, 1, false, serialization)); + } + + int i = offset; + // 0-4 + offset success + for (; i < offset + 5; i++) { + producer.send(createLogData(topic, i, 1, false, serialization)); + } + + // 5-9 + offset fail + for (; i < offset + 10; i++) { + producer.send(createLogData(topic, i, 2, false, serialization)); + } + + producer.send(createLogData(topic, i, 1, true, serialization)); + + // 10-14 + offset success + for (; i < offset + 15; i++) { + 
producer.send(createLogData(topic, i, 2, false, serialization)); + } + + for (; i < offset + 20; i++) { + producer.send(createLogData(topic, i, 3, false, serialization)); + } + printDataInTopic(topic); + } + + public static void printDataInTopic(String topic) { + ConsumerRecords consumerRecords = readRecordsBytes(topic); + LogDataJsonDeserialization deserialization = + TestBaseLog.createLogDataDeserialization(); + consumerRecords.forEach( + consumerRecord -> { + try { + LOG.info("data in kafka: {}", deserialization.deserialize(consumerRecord.value())); + } catch (IOException e) { + e.printStackTrace(); + } + }); + } + + private void assignSplitsAndFetchUntilFinish( + LogKafkaPartitionSplitReader reader, int readerId, int expectedRecordCount) + throws IOException { + Map splits = assignSplits(reader, splitsByOwners.get(readerId)); + + Map numConsumedRecords = new HashMap<>(); + Set finishedSplits = new HashSet<>(); + int flipCount = 0; + while (finishedSplits.size() < splits.size()) { + RecordsWithSplitIds> recordsBySplitIds = reader.fetch(); + String splitId = recordsBySplitIds.nextSplit(); + while (splitId != null) { + // Collect the records in this split. + List> splitFetch = new ArrayList<>(); + ConsumerRecord record; + boolean hasFlip = false; + while ((record = recordsBySplitIds.nextRecordFromSplit()) != null) { + LOG.info( + "read: {}, offset: {}", + ((LogRecordWithRetractInfo) record).getLogData().getActualValue(), + record.offset()); + if (((LogRecordWithRetractInfo) record).isRetracting()) { + hasFlip = true; + } + splitFetch.add((LogRecordWithRetractInfo) record); + } + if (hasFlip) { + flipCount++; + } + // verify the consumed records. + if (verifyConsumed(splits.get(splitId), splitFetch, flipCount)) { + finishedSplits.add(splitId); + } + numConsumedRecords.compute( + splitId, + (ignored, recordCount) -> + recordCount == null ? 
splitFetch.size() : recordCount + splitFetch.size()); + splitId = recordsBySplitIds.nextSplit(); + } + } + + // Verify the number of records consumed from each split. + numConsumedRecords.forEach( + (splitId, recordCount) -> { + assertEquals( + String.format("%s should have %d records.", splits.get(splitId), expectedRecordCount), + expectedRecordCount, + (long) recordCount); + }); + } + + public static Map> getSplitsByOwners( + Map earliestOffsets) { + final Map> splitsByOwners = new HashMap<>(); + splitsByOwners.put( + 0, + new HashMap() { + { + TopicPartition tp = new TopicPartition(TOPIC1, 0); + put( + KafkaPartitionSplit.toSplitId(tp), + new KafkaPartitionSplit(tp, earliestOffsets.get(tp), TOPIC1_STOP_OFFSET)); + } + }); + splitsByOwners.put( + 1, + new HashMap() { + { + TopicPartition tp = new TopicPartition(TOPIC2, 0); + put( + KafkaPartitionSplit.toSplitId(tp), + new KafkaPartitionSplit(tp, earliestOffsets.get(tp), TOPIC2_STOP_OFFSET)); + } + }); + return splitsByOwners; + } + + private Map assignSplits( + LogKafkaPartitionSplitReader reader, Map splits) { + SplitsChange splitsChange = + new SplitsAddition<>(new ArrayList<>(splits.values())); + reader.handleSplitsChanges(splitsChange); + return splits; + } + + private LogKafkaPartitionSplitReader createReader(Properties additionalProperties) { + Properties props = KafkaConfigGenerate.getPropertiesWithByteArray(); + props.put("group.id", "test"); + props.put("auto.offset.reset", "earliest"); + if (!additionalProperties.isEmpty()) { + props.putAll(additionalProperties); + } + SourceReaderMetricGroup sourceReaderMetricGroup = + UnregisteredMetricsGroup.createSourceReaderMetricGroup(); + return new LogKafkaPartitionSplitReader( + props, + new TestingReaderContext(new Configuration(), sourceReaderMetricGroup), + new KafkaSourceReaderMetrics(sourceReaderMetricGroup), + TestBaseLog.USER_SCHEMA, + true, + new LogSourceHelper(), + "all-kinds"); + } + + private boolean verifyConsumed( + final KafkaPartitionSplit 
split, + final Collection> consumed, + final int valueOffsetDiffInOrderedRead) { + long currentOffset = -1; + + for (LogRecordWithRetractInfo record : consumed) { + if (record.isRetracting()) { + assertEquals(record.offset(), record.getActualValue().getInt(1)); + } else { + assertEquals( + record.offset(), record.getActualValue().getInt(1) + valueOffsetDiffInOrderedRead); + } + + currentOffset = Math.max(currentOffset, record.offset()); + } + if (split.getStoppingOffset().isPresent()) { + return currentOffset == split.getStoppingOffset().get() - 1; + } else { + return false; + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestShuffleSplitAssigner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestShuffleSplitAssigner.java new file mode 100644 index 0000000000..eef62b8dbe --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestShuffleSplitAssigner.java @@ -0,0 +1,257 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read.hybrid.assigner; + +import org.apache.amoro.data.DataTreeNode; +import org.apache.amoro.flink.read.FlinkSplitPlanner; +import org.apache.amoro.flink.read.hybrid.reader.RowDataReaderFunction; +import org.apache.amoro.flink.read.hybrid.reader.TestRowDataReaderFunction; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.source.DataIterator; +import org.apache.flink.api.connector.source.ReaderInfo; +import org.apache.flink.api.connector.source.SourceEvent; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.api.connector.source.SplitsAssignment; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.metrics.groups.SplitEnumeratorMetricGroup; +import org.apache.flink.table.data.RowData; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.BiConsumer; +import java.util.stream.Collectors; + +public class TestShuffleSplitAssigner extends TestRowDataReaderFunction { + private static final Logger LOG = LoggerFactory.getLogger(TestShuffleSplitAssigner.class); + + @Test + public void testSingleParallelism() { + ShuffleSplitAssigner shuffleSplitAssigner = instanceSplitAssigner(1); + + List splitList = + FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger()); + shuffleSplitAssigner.onDiscoveredSplits(splitList); + List actual = new ArrayList<>(); + + while (true) { + Split splitOpt = shuffleSplitAssigner.getNext(0); + if (splitOpt.isAvailable()) { + actual.add(splitOpt.split()); + } else { + break; + } + } + + Assert.assertEquals(splitList.size(), actual.size()); + } + + @Test + public void 
testMultiParallelism() { + ShuffleSplitAssigner shuffleSplitAssigner = instanceSplitAssigner(3); + + List splitList = + FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger()); + shuffleSplitAssigner.onDiscoveredSplits(splitList); + List actual = new ArrayList<>(); + + int subtaskId = 2; + while (subtaskId >= 0) { + Split splitOpt = shuffleSplitAssigner.getNext(subtaskId); + if (splitOpt.isAvailable()) { + actual.add(splitOpt.split()); + } else { + LOG.info("Subtask id {}, splits {}.\n {}", subtaskId, actual.size(), actual); + --subtaskId; + } + } + + Assert.assertEquals(splitList.size(), actual.size()); + } + + @Test + public void testTreeNodeMaskUpdate() { + ShuffleSplitAssigner shuffleSplitAssigner = instanceSplitAssigner(3); + long[][] treeNodes = + new long[][] { + {3, 0}, {3, 1}, {3, 2}, {3, 3}, {7, 0}, {7, 1}, {7, 2}, {7, 3}, {7, 4}, {1, 0}, {1, 1}, + {0, 0}, {7, 7}, {15, 15} + }; + long[][] expectNodes = + new long[][] { + {3, 0}, {3, 1}, {3, 2}, {3, 3}, {3, 0}, {3, 1}, {3, 2}, {3, 3}, {3, 0}, {3, 0}, {3, 2}, + {3, 1}, {3, 3}, {3, 0}, {3, 2}, {3, 1}, {3, 3}, {3, 3}, {3, 3} + }; + + List actualNodes = new ArrayList<>(); + + for (long[] node : treeNodes) { + MixedFormatSplit mixedFormatSplit = + new MixedFormatSplit() { + DataTreeNode dataTreeNode = DataTreeNode.of(node[0], node[1]); + + @Override + public Integer taskIndex() { + return null; + } + + @Override + public void updateOffset(Object[] recordOffsets) {} + + @Override + public MixedFormatSplit copy() { + return null; + } + + @Override + public DataTreeNode dataTreeNode() { + return this.dataTreeNode; + } + + @Override + public void modifyTreeNode(DataTreeNode expected) { + this.dataTreeNode = expected; + } + + @Override + public String splitId() { + return null; + } + + @Override + public String toString() { + return dataTreeNode.toString(); + } + }; + List exactTreeNodes = + shuffleSplitAssigner.getExactlyTreeNodes(mixedFormatSplit); + actualNodes.addAll(exactTreeNodes); + } + long[][] 
result = + actualNodes.stream() + .map(treeNode -> new long[] {treeNode.mask(), treeNode.index()}) + .toArray(value -> new long[actualNodes.size()][]); + + Assert.assertArrayEquals(expectNodes, result); + } + + @Test + public void testNodeUpMoved() throws IOException { + writeUpdateWithSpecifiedMaskOne(); + List mixedFormatSplits = + FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger(0)); + int totalParallelism = 3; + ShuffleSplitAssigner assigner = instanceSplitAssigner(totalParallelism); + assigner.onDiscoveredSplits(mixedFormatSplits); + RowDataReaderFunction rowDataReaderFunction = + new RowDataReaderFunction( + new Configuration(), + testKeyedTable.schema(), + testKeyedTable.schema(), + testKeyedTable.primaryKeySpec(), + null, + true, + testKeyedTable.io()); + int subtaskId = 0; + Split split; + List actual = new ArrayList<>(); + LOG.info("subtaskId={}...", subtaskId); + do { + split = assigner.getNext(subtaskId); + if (split.isAvailable()) { + DataIterator dataIterator = + rowDataReaderFunction.createDataIterator(split.split()); + while (dataIterator.hasNext()) { + RowData rowData = dataIterator.next(); + LOG.info("{}", rowData); + actual.add(rowData); + } + } else { + subtaskId = subtaskId + 1; + LOG.info("subtaskId={}...", subtaskId); + } + } while (subtaskId < totalParallelism); + + List excepts = expectedCollection(); + excepts.addAll(generateRecords()); + RowData[] array = + excepts.stream() + .sorted(Comparator.comparing(RowData::toString)) + .collect(Collectors.toList()) + .toArray(new RowData[excepts.size()]); + assertArrayEquals(array, actual); + } + + protected ShuffleSplitAssigner instanceSplitAssigner(int parallelism) { + SplitEnumeratorContext splitEnumeratorContext = + new InternalSplitEnumeratorContext(parallelism); + return new ShuffleSplitAssigner(splitEnumeratorContext); + } + + protected static class InternalSplitEnumeratorContext + implements SplitEnumeratorContext { + private final int parallelism; + + public 
InternalSplitEnumeratorContext(int parallelism) { + this.parallelism = parallelism; + } + + @Override + public SplitEnumeratorMetricGroup metricGroup() { + return null; + } + + @Override + public void sendEventToSourceReader(int subtaskId, SourceEvent event) {} + + @Override + public int currentParallelism() { + return parallelism; + } + + @Override + public Map registeredReaders() { + return null; + } + + @Override + public void assignSplits(SplitsAssignment newSplitAssignments) {} + + @Override + public void signalNoMoreSplits(int subtask) {} + + @Override + public void callAsync(Callable callable, BiConsumer handler) {} + + @Override + public void callAsync( + Callable callable, BiConsumer handler, long initialDelay, long period) {} + + @Override + public void runInCoordinatorThread(Runnable runnable) {} + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestSplitAssignerAwaiting.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestSplitAssignerAwaiting.java new file mode 100644 index 0000000000..71c85609ae --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestSplitAssignerAwaiting.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.assigner; + +import org.apache.amoro.flink.read.FlinkSplitPlanner; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; +import org.junit.Assert; +import org.junit.Test; + +import java.util.Collection; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +public class TestSplitAssignerAwaiting extends TestShuffleSplitAssigner { + + @Test + public void testEmpty() { + ShuffleSplitAssigner splitAssigner = instanceSplitAssigner(1); + Split split = splitAssigner.getNext(0); + Assert.assertNotNull(split); + Assert.assertEquals(Split.Status.UNAVAILABLE, split.status()); + } + + @Test + public void testStaticAssign() { + ShuffleSplitAssigner splitAssigner = instanceSplitAssigner(1); + List splitList = + FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger()); + + splitAssigner.onDiscoveredSplits(splitList); + assertSnapshot(splitAssigner, 7); + assertAllSplits(splitAssigner, 7); + + splitAssigner.onUnassignedSplits(splitList.subList(0, 6)); + assertSnapshot(splitAssigner, 6); + assertAllSplits(splitAssigner, 6); + } + + @Test + public void testContinueAssign() { + ShuffleSplitAssigner assigner = instanceSplitAssigner(1); + assertGetNext(assigner, Split.Status.UNAVAILABLE); + + List splitList = + FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger()); + List splits1 = 
splitList.subList(0, 1); + assertAvailableFuture(assigner, () -> assigner.onDiscoveredSplits(splits1)); + List splits2 = splitList.subList(1, 2); + assertAvailableFuture(assigner, () -> assigner.onUnassignedSplits(splits2)); + + assigner.onDiscoveredSplits(splitList.subList(2, 4)); + assertSnapshot(assigner, 2); + assertAllSplits(assigner, 2); + assertSnapshot(assigner, 0); + } + + private void assertAllSplits(ShuffleSplitAssigner splitAssigner, int splitCount) { + for (int i = 0; i < splitCount + 2; i++) { + if (i < splitCount) { + assertGetNext(splitAssigner, Split.Status.AVAILABLE); + } else { + assertGetNext(splitAssigner, Split.Status.UNAVAILABLE); + } + } + } + + private void assertAvailableFuture(ShuffleSplitAssigner assigner, Runnable addSplitsRunnable) { + // register callback + AtomicBoolean futureCompleted = new AtomicBoolean(); + CompletableFuture future = assigner.isAvailable(); + future.thenAccept(ignored -> futureCompleted.set(true)); + // calling isAvailable again should return the same object reference + // note that thenAccept will return a new future. 
+ // we want to assert the same instance on the assigner returned future + Assert.assertSame(future, assigner.isAvailable()); + + // now add some splits + addSplitsRunnable.run(); + Assert.assertTrue(futureCompleted.get()); + + for (int i = 0; i < 1; ++i) { + assertGetNext(assigner, Split.Status.AVAILABLE); + } + assertGetNext(assigner, Split.Status.UNAVAILABLE); + assertSnapshot(assigner, 0); + } + + private void assertGetNext(ShuffleSplitAssigner assigner, Split.Status expectedStatus) { + Split result = assigner.getNext(0); + Assert.assertEquals(expectedStatus, result.status()); + switch (expectedStatus) { + case AVAILABLE: + Assert.assertNotNull(result.split()); + break; + case UNAVAILABLE: + Assert.assertNull(result.split()); + break; + default: + Assert.fail("Unknown status: " + expectedStatus); + } + } + + private void assertSnapshot(ShuffleSplitAssigner assigner, int splitCount) { + Collection stateBeforeGet = assigner.state(); + Assert.assertEquals(splitCount, stateBeforeGet.size()); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestStaticSplitAssigner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestStaticSplitAssigner.java new file mode 100644 index 0000000000..3c614e921d --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestStaticSplitAssigner.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.assigner; + +import org.apache.amoro.flink.read.FlinkSplitPlanner; +import org.apache.amoro.flink.read.hybrid.reader.TestRowDataReaderFunction; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +public class TestStaticSplitAssigner extends TestRowDataReaderFunction { + private static final Logger LOG = LoggerFactory.getLogger(TestStaticSplitAssigner.class); + + @Test + public void testSingleParallelism() throws IOException { + try (StaticSplitAssigner staticSplitAssigner = instanceStaticSplitAssigner()) { + List splitList = + FlinkSplitPlanner.mergeOnReadPlan( + testKeyedTable, Collections.emptyList(), new AtomicInteger()); + staticSplitAssigner.onDiscoveredSplits(splitList); + List actual = new ArrayList<>(); + + while (true) { + Split splitOpt = staticSplitAssigner.getNext(0); + if (splitOpt.isAvailable()) { + actual.add(splitOpt.split()); + } else { + break; + } + } + + Assert.assertEquals(splitList.size(), actual.size()); + } + } + + @Test + public void testMultiParallelism() throws IOException { + try (StaticSplitAssigner staticSplitAssigner = instanceStaticSplitAssigner()) { + List splitList = + FlinkSplitPlanner.mergeOnReadPlan( + testKeyedTable, Collections.emptyList(), new AtomicInteger()); + 
staticSplitAssigner.onDiscoveredSplits(splitList); + List actual = new ArrayList<>(); + + int subtaskId = 2; + while (subtaskId >= 0) { + Split splitOpt = staticSplitAssigner.getNext(subtaskId); + if (splitOpt.isAvailable()) { + actual.add(splitOpt.split()); + } else { + LOG.info("Subtask id {}, splits {}.\n {}", subtaskId, actual.size(), actual); + --subtaskId; + } + } + + Assert.assertEquals(splitList.size(), actual.size()); + } + } + + protected StaticSplitAssigner instanceStaticSplitAssigner() { + return new StaticSplitAssigner(null); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestContinuousSplitPlannerImpl.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestContinuousSplitPlannerImpl.java new file mode 100644 index 0000000000..ff41fb5319 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestContinuousSplitPlannerImpl.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read.hybrid.enumerator; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.catalog.CatalogTestHelper; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.table.KeyedTable; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.io.TaskWriter; +import org.junit.Before; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.ZoneOffset; +import java.util.ArrayList; +import java.util.List; + +public class TestContinuousSplitPlannerImpl extends FlinkTestBase { + private static final Logger LOG = LoggerFactory.getLogger(TestContinuousSplitPlannerImpl.class); + protected static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TABLE_SCHEMA); + protected KeyedTable testKeyedTable; + + protected static final LocalDateTime LDT = + LocalDateTime.of(LocalDate.of(2022, 1, 1), LocalTime.of(0, 0, 0, 0)); + + public TestContinuousSplitPlannerImpl( + CatalogTestHelper catalogTestHelper, TableTestHelper tableTestHelper) { + super( + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, true)); + } + + @Before + public void init() throws IOException { + testKeyedTable = getMixedTable().asKeyedTable(); + // write base + { + TaskWriter taskWriter = createTaskWriter(true); + List baseData = + new ArrayList() { + { + add( + GenericRowData.ofKind( + RowKind.INSERT, + 1, + 
StringData.fromString("john"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + add( + GenericRowData.ofKind( + RowKind.INSERT, + 2, + StringData.fromString("lily"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + add( + GenericRowData.ofKind( + RowKind.INSERT, + 3, + StringData.fromString("jake"), + LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT.plusDays(1)))); + add( + GenericRowData.ofKind( + RowKind.INSERT, + 4, + StringData.fromString("sam"), + LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT.plusDays(1)))); + } + }; + for (RowData record : baseData) { + taskWriter.write(record); + } + commit(testKeyedTable, taskWriter.complete(), true); + } + + // write change insert + { + TaskWriter taskWriter = createTaskWriter(false); + List insert = + new ArrayList() { + { + add( + GenericRowData.ofKind( + RowKind.INSERT, + 5, + StringData.fromString("mary"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + add( + GenericRowData.ofKind( + RowKind.INSERT, + 6, + StringData.fromString("mack"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + } + }; + for (RowData record : insert) { + taskWriter.write(record); + } + commit(testKeyedTable, taskWriter.complete(), true); + } + + // write change delete + { + TaskWriter taskWriter = createTaskWriter(false); + List update = + new ArrayList() { + { + add( + GenericRowData.ofKind( + RowKind.DELETE, + 5, + StringData.fromString("mary"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + add( + GenericRowData.ofKind( + RowKind.INSERT, + 5, + StringData.fromString("lind"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + } + }; + + for (RowData record : update) { + taskWriter.write(record); + } + commit(testKeyedTable, taskWriter.complete(), false); + } + } + + protected 
TaskWriter createTaskWriter(boolean base) { + return createKeyedTaskWriter(testKeyedTable, ROW_TYPE, base); + } + + protected TaskWriter createTaskWriter(KeyedTable keyedTable, boolean base) { + return createKeyedTaskWriter(keyedTable, ROW_TYPE, base); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestMixedFormatSourceEnumStateSerializer.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestMixedFormatSourceEnumStateSerializer.java new file mode 100644 index 0000000000..1c1f52616f --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestMixedFormatSourceEnumStateSerializer.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read.hybrid.enumerator; + +import org.apache.amoro.flink.read.FlinkSplitPlanner; +import org.apache.amoro.flink.read.hybrid.assigner.ShuffleSplitAssigner; +import org.apache.amoro.flink.read.hybrid.assigner.Split; +import org.apache.amoro.flink.read.hybrid.assigner.TestShuffleSplitAssigner; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.hybrid.split.TemporalJoinSplits; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.concurrent.atomic.AtomicInteger; + +public class TestMixedFormatSourceEnumStateSerializer extends TestShuffleSplitAssigner { + private static final Logger LOG = + LoggerFactory.getLogger(TestMixedFormatSourceEnumStateSerializer.class); + + @Test + public void testMixedFormatEnumState() throws IOException { + ShuffleSplitAssigner shuffleSplitAssigner = instanceSplitAssigner(3); + + List splitList = + FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger()); + shuffleSplitAssigner.onDiscoveredSplits(splitList); + TemporalJoinSplits splits = new TemporalJoinSplits(splitList, null); + + MixedFormatSourceEnumState expect = + new MixedFormatSourceEnumState( + shuffleSplitAssigner.state(), + null, + shuffleSplitAssigner.serializePartitionIndex(), + splits); + + MixedFormatSourceEnumStateSerializer mixedFormatSourceEnumStateSerializer = + new MixedFormatSourceEnumStateSerializer(); + byte[] ser = mixedFormatSourceEnumStateSerializer.serialize(expect); + + Assert.assertNotNull(ser); + + MixedFormatSourceEnumState actual = mixedFormatSourceEnumStateSerializer.deserialize(1, ser); + + Assert.assertEquals(expect.pendingSplits().size(), actual.pendingSplits().size()); + Assert.assertEquals( + 
Objects.requireNonNull(expect.shuffleSplitRelation()).length, + Objects.requireNonNull(actual.shuffleSplitRelation()).length); + + SplitEnumeratorContext splitEnumeratorContext = + new InternalSplitEnumeratorContext(3); + try (ShuffleSplitAssigner actualAssigner = + new ShuffleSplitAssigner(splitEnumeratorContext, getMixedTable().name(), actual)) { + List actualSplits = new ArrayList<>(); + + int subtaskId = 2; + while (subtaskId >= 0) { + Split splitOpt = actualAssigner.getNext(subtaskId); + if (splitOpt.isAvailable()) { + actualSplits.add(splitOpt.split()); + } else { + LOG.info("subtask id {}, splits {}.\n {}", subtaskId, actualSplits.size(), actualSplits); + --subtaskId; + } + } + + Assert.assertEquals(splitList.size(), actualSplits.size()); + + TemporalJoinSplits temporalJoinSplits = actual.temporalJoinSplits(); + Assert.assertEquals(expect.temporalJoinSplits(), temporalJoinSplits); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestMixedFormatSourceEnumerator.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestMixedFormatSourceEnumerator.java new file mode 100644 index 0000000000..2849bf3ca3 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestMixedFormatSourceEnumerator.java @@ -0,0 +1,295 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.enumerator; + +import static org.apache.flink.util.Preconditions.checkState; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.flink.read.FlinkSplitPlanner; +import org.apache.amoro.flink.read.hybrid.assigner.ShuffleSplitAssigner; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; +import org.apache.amoro.flink.read.hybrid.split.SplitRequestEvent; +import org.apache.amoro.flink.read.source.MixedFormatScanContext; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.table.KeyedTable; +import org.apache.flink.api.connector.source.ReaderInfo; +import org.apache.flink.api.connector.source.SourceEvent; +import org.apache.flink.api.connector.source.SplitEnumeratorContext; +import org.apache.flink.api.connector.source.SplitsAssignment; +import org.apache.flink.metrics.groups.SplitEnumeratorMetricGroup; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.io.TaskWriter; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import 
java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.ZoneOffset; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.BiConsumer; + +public class TestMixedFormatSourceEnumerator extends FlinkTestBase { + + public TestMixedFormatSourceEnumerator() { + super( + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, true)); + } + + private final int splitCount = 4; + private final int parallelism = 5; + private KeyedTable testKeyedTable; + + public static final String SCAN_STARTUP_MODE_EARLIEST = "earliest"; + + protected static final LocalDateTime LDT = + LocalDateTime.of(LocalDate.of(2022, 1, 1), LocalTime.of(0, 0, 0, 0)); + + @Before + public void init() throws IOException { + testKeyedTable = getMixedTable().asKeyedTable(); + // write change insert + { + TaskWriter taskWriter = createKeyedTaskWriter(testKeyedTable, FLINK_ROW_TYPE, false); + List insert = + new ArrayList() { + { + add( + GenericRowData.ofKind( + RowKind.INSERT, + 1, + StringData.fromString("john"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + add( + GenericRowData.ofKind( + RowKind.INSERT, + 2, + StringData.fromString("lily"), + LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT.plusDays(1)))); + add( + GenericRowData.ofKind( + RowKind.INSERT, + 3, + StringData.fromString("jake"), + LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT.plusDays(1)))); + add( + GenericRowData.ofKind( + RowKind.INSERT, + 4, + StringData.fromString("sam"), + LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT.plusDays(1)))); + } + }; + for (RowData record : insert) { + taskWriter.write(record); + } + 
commit(testKeyedTable, taskWriter.complete(), false); + } + } + + @Test + public void testReadersNumGreaterThanSplits() throws Exception { + TestingSplitEnumeratorContext splitEnumeratorContext = + instanceSplitEnumeratorContext(parallelism); + ShuffleSplitAssigner shuffleSplitAssigner = instanceSplitAssigner(splitEnumeratorContext); + MixedFormatScanContext scanContext = + MixedFormatScanContext.contextBuilder() + .streaming(true) + .scanStartupMode(SCAN_STARTUP_MODE_EARLIEST) + .build(); + + List splitList = + FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger()); + shuffleSplitAssigner.onDiscoveredSplits(splitList); + assertSnapshot(shuffleSplitAssigner, splitCount); + + MixedFormatSourceEnumerator enumerator = + new MixedFormatSourceEnumerator( + splitEnumeratorContext, + shuffleSplitAssigner, + MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder), + scanContext, + null, + false); + + Collection pendingSplitsEmpty = + enumerator.snapshotState(1).pendingSplits(); + Assert.assertEquals(splitCount, pendingSplitsEmpty.size()); + + // register readers, and let them request a split + // 4 split, 5 subtask, one or more subtask will fetch empty split + // subtask 0 + splitEnumeratorContext.registerReader(0, "host0"); + enumerator.addReader(0); + enumerator.handleSourceEvent(0, new SplitRequestEvent()); + // subtask 1 + splitEnumeratorContext.registerReader(1, "host1"); + enumerator.addReader(1); + enumerator.handleSourceEvent(1, new SplitRequestEvent()); + // subtask 2 + splitEnumeratorContext.registerReader(2, "host2"); + enumerator.addReader(2); + enumerator.handleSourceEvent(2, new SplitRequestEvent()); + // subtask 3 + splitEnumeratorContext.registerReader(3, "host3"); + enumerator.addReader(3); + enumerator.handleSourceEvent(3, new SplitRequestEvent()); + // subtask 4 + splitEnumeratorContext.registerReader(4, "host4"); + enumerator.addReader(4); + enumerator.handleSourceEvent(4, new SplitRequestEvent()); + + 
Assert.assertEquals(parallelism - splitCount, enumerator.getReadersAwaitingSplit().size()); + Assert.assertTrue(enumerator.snapshotState(2).pendingSplits().isEmpty()); + } + + private void assertSnapshot(ShuffleSplitAssigner assigner, int splitCount) { + Collection stateBeforeGet = assigner.state(); + Assert.assertEquals(splitCount, stateBeforeGet.size()); + } + + private ShuffleSplitAssigner instanceSplitAssigner( + TestingSplitEnumeratorContext splitEnumeratorContext) { + return new ShuffleSplitAssigner(splitEnumeratorContext); + } + + private TestingSplitEnumeratorContext instanceSplitEnumeratorContext(int parallelism) { + return new TestingSplitEnumeratorContext(parallelism); + } + + protected static class TestingSplitEnumeratorContext + implements SplitEnumeratorContext { + private final int parallelism; + + private final HashMap> splitAssignments = + new HashMap<>(); + + private final HashMap> events = new HashMap<>(); + + private final HashMap registeredReaders = new HashMap<>(); + + public Map> getSplitAssignments() { + return splitAssignments; + } + + public Map> getSentEvents() { + return events; + } + + public void registerReader(int subtask, String hostname) { + checkState(!registeredReaders.containsKey(subtask), "Reader already registered"); + registeredReaders.put(subtask, new ReaderInfo(subtask, hostname)); + } + + public TestingSplitEnumeratorContext(int parallelism) { + this.parallelism = parallelism; + } + + @Override + public SplitEnumeratorMetricGroup metricGroup() { + return null; + } + + @Override + public void sendEventToSourceReader(int subtaskId, SourceEvent event) { + final List eventsForSubTask = + events.computeIfAbsent(subtaskId, (key) -> new ArrayList<>()); + eventsForSubTask.add(event); + } + + @Override + public int currentParallelism() { + return parallelism; + } + + @Override + public Map registeredReaders() { + return registeredReaders; + } + + @Override + public void assignSplits(SplitsAssignment newSplitAssignments) { + for 
(final Map.Entry> entry : + newSplitAssignments.assignment().entrySet()) { + final SplitAssignmentState assignment = + splitAssignments.computeIfAbsent(entry.getKey(), (key) -> new SplitAssignmentState<>()); + + assignment.getAssignedSplits().addAll(entry.getValue()); + } + } + + @Override + public void assignSplit(MixedFormatSplit split, int subtask) { + SplitEnumeratorContext.super.assignSplit(split, subtask); + } + + @Override + public void signalNoMoreSplits(int subtask) { + final SplitAssignmentState assignment = + splitAssignments.computeIfAbsent(subtask, (key) -> new SplitAssignmentState<>()); + assignment.noMoreSplits = true; + } + + @Override + public void callAsync( + Callable callable, BiConsumer handler, long initialDelay, long period) {} + + @Override + public void callAsync(Callable callable, BiConsumer handler) {} + + @Override + public void runInCoordinatorThread(Runnable runnable) {} + } + + public static final class SplitAssignmentState { + + final List splits = new ArrayList<>(); + boolean noMoreSplits; + + public List getAssignedSplits() { + return splits; + } + + public boolean hasReceivedNoMoreSplitsSignal() { + return noMoreSplits; + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestTemporalJoinSplitsThreadSafe.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestTemporalJoinSplitsThreadSafe.java new file mode 100644 index 0000000000..ba378491d1 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestTemporalJoinSplitsThreadSafe.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.enumerator; + +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.hybrid.split.TemporalJoinSplits; +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.CompletableFuture; +import java.util.stream.Collectors; + +public class TestTemporalJoinSplitsThreadSafe { + + @Test + public void testTemporalJoinSplits() { + List allSplit = new LinkedList<>(); + for (int i = 0; i < 100; i++) { + allSplit.add(UUID.randomUUID().toString()); + } + + Collection mixedFormatSplits = + allSplit.stream().map(TestMixedFormatSplit::of).collect(Collectors.toList()); + + for (int i = 0; i < 2; i++) { + round(allSplit, mixedFormatSplits); + } + } + + public void round(List allSplit, Collection mixedFormatSplits) { + TemporalJoinSplits temporalJoinSplits = new TemporalJoinSplits(mixedFormatSplits, null); + int n = allSplit.size(); + + List s1 = new ArrayList<>(allSplit.subList(0, (int) (2.0 / 3 * n))), + s2 = new ArrayList<>(allSplit.subList((int) (1.0 / 3 * n), n)); + Collections.shuffle(s1); + 
Collections.shuffle(s2); + + List as = new ArrayList<>(mixedFormatSplits); + Collections.shuffle(as); + int an = as.size(); + List as1 = new ArrayList<>(as.subList(0, (int) (2.0 / 3 * an))); + List as2 = new ArrayList<>(as.subList((int) (1.0 / 3 * an), an)); + CompletableFuture f1 = + CompletableFuture.runAsync(() -> temporalJoinSplits.removeAndReturnIfAllFinished(s1)); + CompletableFuture f2 = + CompletableFuture.runAsync(() -> temporalJoinSplits.addSplitsBack(as1)); + CompletableFuture f3 = + CompletableFuture.runAsync(() -> temporalJoinSplits.removeAndReturnIfAllFinished(s2)); + CompletableFuture f4 = + CompletableFuture.runAsync(() -> temporalJoinSplits.addSplitsBack(as2)); + CompletableFuture.allOf(f1, f2, f3, f4).join(); + Assert.assertTrue(temporalJoinSplits.removeAndReturnIfAllFinished(allSplit)); + } + + static class TestMixedFormatSplit extends MixedFormatSplit { + private final String splitId; + + public TestMixedFormatSplit(String splitId) { + this.splitId = splitId; + } + + public static TestMixedFormatSplit of(String splitId) { + return new TestMixedFormatSplit(splitId); + } + + @Override + public Integer taskIndex() { + return null; + } + + @Override + public void updateOffset(Object[] recordOffsets) {} + + @Override + public MixedFormatSplit copy() { + return new TestMixedFormatSplit(splitId); + } + + @Override + public String splitId() { + return splitId; + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/reader/MixedIncrementalLoaderTest.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/reader/MixedIncrementalLoaderTest.java new file mode 100644 index 0000000000..61da6d3e71 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/reader/MixedIncrementalLoaderTest.java @@ -0,0 +1,172 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.reader; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.catalog.TableTestBase; +import org.apache.amoro.flink.read.MixedIncrementalLoader; +import org.apache.amoro.flink.read.hybrid.enumerator.ContinuousSplitPlanner; +import org.apache.amoro.flink.read.hybrid.enumerator.MergeOnReadIncrementalPlanner; +import org.apache.amoro.flink.read.source.FlinkKeyedMORDataReader; +import org.apache.amoro.flink.util.DataUtil; +import org.apache.amoro.flink.write.FlinkTaskWriterBaseTest; +import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; +import org.apache.amoro.table.KeyedTable; +import org.apache.amoro.table.MixedTable; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import 
org.apache.iceberg.flink.data.RowDataUtil; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.io.TaskWriter; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.io.IOException; +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.List; + +@RunWith(value = Parameterized.class) +public class MixedIncrementalLoaderTest extends TableTestBase implements FlinkTaskWriterBaseTest { + + public MixedIncrementalLoaderTest(boolean partitionedTable) { + super( + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, partitionedTable)); + } + + @Parameterized.Parameters(name = "partitionedTable = {0}") + public static Object[][] parameters() { + // todo mix hive test + return new Object[][] {{true}, {false}}; + } + + @Before + public void before() throws IOException { + MixedTable mixedTable = getMixedTable(); + TableSchema flinkPartialSchema = + TableSchema.builder() + .field("id", DataTypes.INT()) + .field("name", DataTypes.STRING()) + .field("ts", DataTypes.BIGINT()) + .field("op_time", DataTypes.TIMESTAMP()) + .build(); + RowType rowType = (RowType) flinkPartialSchema.toRowDataType().getLogicalType(); + + List expected = + Lists.newArrayList( + DataUtil.toRowData(1000011, "a", 1010L, LocalDateTime.parse("2022-06-18T10:10:11.0")), + DataUtil.toRowData(1000012, "b", 1011L, LocalDateTime.parse("2022-06-18T10:10:11.0")), + DataUtil.toRowData(1000013, "c", 1012L, LocalDateTime.parse("2022-06-18T10:10:11.0")), + DataUtil.toRowData(1000014, "d", 1013L, LocalDateTime.parse("2022-06-21T10:10:11.0")), + DataUtil.toRowData(1000015, "e", 1014L, LocalDateTime.parse("2022-06-21T10:10:11.0"))); + for (RowData rowData : expected) { + try (TaskWriter taskWriter = createBaseTaskWriter(mixedTable, rowType)) { + writeAndCommit(rowData, taskWriter, mixedTable); + } + } + + expected = + 
Lists.newArrayList( + DataUtil.toRowDataWithKind( + RowKind.DELETE, 1000015, "e", 1014L, LocalDateTime.parse("2022-06-21T10:10:11.0")), + DataUtil.toRowData(1000021, "a", 1020L, LocalDateTime.parse("2022-06-28T10:10:11.0")), + DataUtil.toRowData(1000022, "b", 1021L, LocalDateTime.parse("2022-06-28T10:10:11.0")), + DataUtil.toRowData(1000023, "c", 1022L, LocalDateTime.parse("2022-06-28T10:10:11.0")), + DataUtil.toRowData(1000024, "d", 1023L, LocalDateTime.parse("2022-06-28T10:10:11.0")), + DataUtil.toRowData(1000025, "e", 1024L, LocalDateTime.parse("2022-06-28T10:10:11.0"))); + for (RowData rowData : expected) { + try (TaskWriter taskWriter = createTaskWriter(mixedTable, rowType)) { + writeAndCommit(rowData, taskWriter, mixedTable); + } + } + } + + @Test + public void testMOR() { + KeyedTable keyedTable = getMixedTable().asKeyedTable(); + List expressions = + Lists.newArrayList(Expressions.greaterThan("op_time", "2022-06-20T10:10:11.0")); + ContinuousSplitPlanner morPlanner = + new MergeOnReadIncrementalPlanner( + getTableLoader(getCatalogName(), getMetastoreUri(), keyedTable)); + + FlinkKeyedMORDataReader flinkKeyedMORDataReader = + new FlinkKeyedMORDataReader( + keyedTable.io(), + keyedTable.schema(), + keyedTable.schema(), + keyedTable.primaryKeySpec(), + null, + true, + RowDataUtil::convertConstant, + true); + + MixedIncrementalLoader incrementalLoader = + new MixedIncrementalLoader<>( + morPlanner, + flinkKeyedMORDataReader, + new RowDataReaderFunction( + new Configuration(), + keyedTable.schema(), + keyedTable.schema(), + keyedTable.asKeyedTable().primaryKeySpec(), + null, + true, + keyedTable.io(), + true), + expressions); + + List actuals = new ArrayList<>(); + while (incrementalLoader.hasNext()) { + CloseableIterator iterator = incrementalLoader.next(); + while (iterator.hasNext()) { + RowData rowData = iterator.next(); + System.out.println(rowData); + actuals.add(rowData); + } + } + if (isPartitionedTable()) { + Assert.assertEquals(6, actuals.size()); + } 
else { + Assert.assertEquals(9, actuals.size()); + } + } + + @Override + public String getMetastoreUri() { + return getCatalogUri(); + } + + @Override + public String getCatalogName() { + return getMixedFormatCatalog().name(); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/reader/TestRowDataReaderFunction.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/reader/TestRowDataReaderFunction.java new file mode 100644 index 0000000000..4bad7e47fe --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/reader/TestRowDataReaderFunction.java @@ -0,0 +1,391 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.read.hybrid.reader; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.data.DataFileType; +import org.apache.amoro.flink.read.FlinkSplitPlanner; +import org.apache.amoro.flink.read.hybrid.enumerator.TestContinuousSplitPlannerImpl; +import org.apache.amoro.flink.read.hybrid.split.ChangelogSplit; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.source.DataIterator; +import org.apache.amoro.scan.ChangeTableIncrementalScan; +import org.apache.amoro.scan.MixedFileScanTask; +import org.apache.amoro.table.KeyedTable; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.shaded.guava30.com.google.common.collect.Maps; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.TaskWriter; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.time.ZoneOffset; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +public class TestRowDataReaderFunction extends TestContinuousSplitPlannerImpl { + private static final Logger LOG = LoggerFactory.getLogger(TestRowDataReaderFunction.class); + private static final AtomicInteger splitCount = new AtomicInteger(); + + public TestRowDataReaderFunction() { + super( 
+ new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, true)); + } + + @Test + public void testReadChangelog() throws IOException { + + List mixedFormatSplits = + FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger(0)); + + RowDataReaderFunction rowDataReaderFunction = + new RowDataReaderFunction( + new Configuration(), + testKeyedTable.schema(), + testKeyedTable.schema(), + testKeyedTable.primaryKeySpec(), + null, + true, + testKeyedTable.io()); + + List actual = new ArrayList<>(); + mixedFormatSplits.forEach( + split -> { + LOG.info("Mixed format split: {}.", split); + DataIterator dataIterator = rowDataReaderFunction.createDataIterator(split); + while (dataIterator.hasNext()) { + RowData rowData = dataIterator.next(); + LOG.info("{}", rowData); + actual.add(rowData); + } + }); + + assertArrayEquals(excepts(), actual); + + long snapshotId = testKeyedTable.changeTable().currentSnapshot().snapshotId(); + writeUpdate(); + + testKeyedTable.changeTable().refresh(); + long nowSnapshotId = testKeyedTable.changeTable().currentSnapshot().snapshotId(); + ChangeTableIncrementalScan changeTableScan = + testKeyedTable.changeTable().newScan().useSnapshot(nowSnapshotId); + + Snapshot snapshot = testKeyedTable.changeTable().snapshot(snapshotId); + long fromSequence = snapshot.sequenceNumber(); + + Set appendLogTasks = new HashSet<>(); + Set deleteLogTasks = new HashSet<>(); + try (CloseableIterable tasks = changeTableScan.planFiles()) { + for (FileScanTask fileScanTask : tasks) { + if (fileScanTask.file().dataSequenceNumber() <= fromSequence) { + continue; + } + MixedFileScanTask mixedFileScanTask = (MixedFileScanTask) fileScanTask; + if (mixedFileScanTask.fileType().equals(DataFileType.INSERT_FILE)) { + appendLogTasks.add(mixedFileScanTask); + } else if (mixedFileScanTask.fileType().equals(DataFileType.EQ_DELETE_FILE)) { + deleteLogTasks.add(mixedFileScanTask); + } else { + throw new IllegalArgumentException( + String.format( + 
"DataFileType %s is not supported during change log reading period.", + mixedFileScanTask.fileType())); + } + } + } + ChangelogSplit changelogSplit = + new ChangelogSplit(appendLogTasks, deleteLogTasks, splitCount.incrementAndGet()); + actual.clear(); + DataIterator dataIterator = rowDataReaderFunction.createDataIterator(changelogSplit); + while (dataIterator.hasNext()) { + RowData rowData = dataIterator.next(); + actual.add(rowData); + } + assertArrayEquals(excepts2(), actual); + } + + @Test + public void testReadNodesUpMoved() throws IOException { + writeUpdateWithSpecifiedMaskOne(); + List mixedFormatSplits = + FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger(0)); + + RowDataReaderFunction rowDataReaderFunction = + new RowDataReaderFunction( + new Configuration(), + testKeyedTable.schema(), + testKeyedTable.schema(), + testKeyedTable.primaryKeySpec(), + null, + true, + testKeyedTable.io()); + + List actual = new ArrayList<>(); + mixedFormatSplits.forEach( + split -> { + LOG.info("Mixed format split: {}.", split); + DataIterator dataIterator = rowDataReaderFunction.createDataIterator(split); + while (dataIterator.hasNext()) { + RowData rowData = dataIterator.next(); + LOG.info("{}", rowData); + actual.add(rowData); + } + }); + + List excepts = expectedCollection(); + excepts.addAll(generateRecords()); + RowData[] array = + excepts.stream() + .sorted(Comparator.comparing(RowData::toString)) + .collect(Collectors.toList()) + .toArray(new RowData[excepts.size()]); + assertArrayEquals(array, actual); + } + + protected void assertArrayEquals(RowData[] excepts, List actual) { + Assert.assertArrayEquals(excepts, sortRowDataCollection(actual)); + } + + protected RowData[] sortRowDataCollection(Collection records) { + return records.stream() + .sorted(Comparator.comparing(RowData::toString)) + .collect(Collectors.toList()) + .toArray(new RowData[records.size()]); + } + + protected void writeUpdate() throws IOException { + // write change update + 
writeUpdate(updateRecords()); + } + + protected void writeUpdate(List input) throws IOException { + writeUpdate(input, testKeyedTable); + } + + protected void writeUpdateWithSpecifiedMaskOne() throws IOException { + List excepts = generateRecords(); + + writeUpdateWithSpecifiedMask(excepts, testKeyedTable, 1); + } + + protected void writeUpdateWithSpecifiedMask(List input, KeyedTable table, long mask) + throws IOException { + // write change update + TaskWriter taskWriter = createKeyedTaskWriter(table, ROW_TYPE, false, mask); + + for (RowData record : input) { + taskWriter.write(record); + } + commit(table, taskWriter.complete(), false); + } + + protected void writeUpdate(List input, KeyedTable table) throws IOException { + // write change update + TaskWriter taskWriter = createKeyedTaskWriter(table, ROW_TYPE, false); + + for (RowData record : input) { + taskWriter.write(record); + } + commit(table, taskWriter.complete(), false); + } + + protected List generateRecords() { + List excepts = new ArrayList<>(); + excepts.add( + GenericRowData.ofKind( + RowKind.INSERT, + 7, + StringData.fromString("syan"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + excepts.add( + GenericRowData.ofKind( + RowKind.UPDATE_BEFORE, + 2, + StringData.fromString("lily"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + excepts.add( + GenericRowData.ofKind( + RowKind.UPDATE_AFTER, + 2, + StringData.fromString("daniel"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + excepts.add( + GenericRowData.ofKind( + RowKind.UPDATE_BEFORE, + 7, + StringData.fromString("syan"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + excepts.add( + GenericRowData.ofKind( + RowKind.UPDATE_AFTER, + 7, + StringData.fromString("syan2"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + return excepts; + } + + protected List updateRecords() { + List 
excepts = new ArrayList<>(); + excepts.add( + GenericRowData.ofKind( + RowKind.UPDATE_BEFORE, + 5, + StringData.fromString("lind"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + excepts.add( + GenericRowData.ofKind( + RowKind.UPDATE_AFTER, + 5, + StringData.fromString("lina"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + return excepts; + } + + protected RowData[] excepts2() { + List excepts = updateRecords(); + + return updateRecords().stream() + .sorted(Comparator.comparing(RowData::toString)) + .collect(Collectors.toList()) + .toArray(new RowData[excepts.size()]); + } + + protected RowData[] excepts() { + List excepts = expectedCollection(); + + return excepts.stream() + .sorted(Comparator.comparing(RowData::toString)) + .collect(Collectors.toList()) + .toArray(new RowData[excepts.size()]); + } + + protected RowData[] expectedAfterMOR() { + List expected = expectedCollection(); + return mor(expected).stream() + .sorted(Comparator.comparing(RowData::toString)) + .toArray(RowData[]::new); + } + + protected Collection mor(final Collection changelog) { + Map map = Maps.newHashMap(); + + changelog.forEach( + rowData -> { + int key = rowData.getInt(0); + RowKind kind = rowData.getRowKind(); + + if ((kind == RowKind.INSERT || kind == RowKind.UPDATE_AFTER) && !map.containsKey(key)) { + rowData.setRowKind(RowKind.INSERT); + map.put(key, rowData); + } else if ((kind == RowKind.DELETE || kind == RowKind.UPDATE_BEFORE)) { + map.remove(key); + } + }); + + return map.values(); + } + + protected List expectedCollection() { + List excepts = new ArrayList<>(); + excepts.add( + GenericRowData.ofKind( + RowKind.INSERT, + 1, + StringData.fromString("john"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + excepts.add( + GenericRowData.ofKind( + RowKind.INSERT, + 2, + StringData.fromString("lily"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + 
excepts.add( + GenericRowData.ofKind( + RowKind.INSERT, + 3, + StringData.fromString("jake"), + LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT.plusDays(1)))); + excepts.add( + GenericRowData.ofKind( + RowKind.INSERT, + 4, + StringData.fromString("sam"), + LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT.plusDays(1)))); + excepts.add( + GenericRowData.ofKind( + RowKind.INSERT, + 5, + StringData.fromString("mary"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + excepts.add( + GenericRowData.ofKind( + RowKind.INSERT, + 6, + StringData.fromString("mack"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + excepts.add( + GenericRowData.ofKind( + RowKind.DELETE, + 5, + StringData.fromString("mary"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + excepts.add( + GenericRowData.ofKind( + RowKind.INSERT, + 5, + StringData.fromString("lind"), + LDT.toEpochSecond(ZoneOffset.UTC), + TimestampData.fromLocalDateTime(LDT))); + return excepts; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/split/TestMixedFormatSplitSerializer.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/split/TestMixedFormatSplitSerializer.java new file mode 100644 index 0000000000..f1a6d115c3 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/split/TestMixedFormatSplitSerializer.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.read.hybrid.split; + +import org.apache.amoro.flink.read.FlinkSplitPlanner; +import org.apache.amoro.flink.read.hybrid.reader.TestRowDataReaderFunction; +import org.apache.flink.util.FlinkRuntimeException; +import org.junit.Assert; +import org.junit.Test; + +import java.io.IOException; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +public class TestMixedFormatSplitSerializer extends TestRowDataReaderFunction { + + @Test + public void testSerAndDes() { + List mixedFormatSplits = + FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger(0)); + assertSerializedSplitEquals(mixedFormatSplits); + } + + @Test + public void testSerAndDesMoRSplit() { + List mixedFormatSplits = + FlinkSplitPlanner.mergeOnReadPlan(testKeyedTable, null, new AtomicInteger(0)); + assertSerializedSplitEquals(mixedFormatSplits); + } + + private void assertSerializedSplitEquals(List expected) { + MixedFormatSplitSerializer serializer = new MixedFormatSplitSerializer(); + List contents = + expected.stream() + .map( + split -> { + try { + return serializer.serialize(split); + } catch (IOException e) { + e.printStackTrace(); + return new byte[0]; + } + }) + .collect(Collectors.toList()); + + Assert.assertArrayEquals( + expected.toArray(new MixedFormatSplit[0]), + contents.stream() + .map( + data -> { + if 
(data.length == 0) { + throw new FlinkRuntimeException("failed cause data length is 0."); + } + try { + return serializer.deserialize(1, data); + } catch (IOException e) { + throw new FlinkRuntimeException(e); + } + }) + .toArray(MixedFormatSplit[]::new)); + } + + @Test + public void testNullableSplit() throws IOException { + MixedFormatSplitSerializer serializer = new MixedFormatSplitSerializer(); + byte[] ser = serializer.serialize(null); + + MixedFormatSplit actual = serializer.deserialize(1, ser); + + Assert.assertNull(actual); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/shuffle/TestLogRecordV1.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/shuffle/TestLogRecordV1.java new file mode 100644 index 0000000000..bafb9c28cc --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/shuffle/TestLogRecordV1.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.shuffle; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; + +import org.apache.amoro.data.ChangeAction; +import org.apache.amoro.log.Bytes; +import org.apache.amoro.log.FormatTestBase; +import org.apache.amoro.log.FormatVersion; +import org.apache.amoro.log.LogData; +import org.apache.amoro.log.LogDataJsonDeserialization; +import org.apache.amoro.log.LogDataJsonSerialization; +import org.apache.amoro.utils.IdGenerator; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Types; +import org.junit.Assert; +import org.junit.Test; + +import java.io.IOException; +import java.util.ArrayList; + +/** This is a {@link LogRecordV1} log data test, include all data types. */ +public class TestLogRecordV1 extends FormatTestBase { + + public final Schema userSchema = + new Schema( + new ArrayList() { + { + add(Types.NestedField.optional(0, "f_boolean", Types.BooleanType.get())); + add(Types.NestedField.optional(1, "f_int", Types.IntegerType.get())); + add(Types.NestedField.optional(2, "f_long", Types.LongType.get())); + add( + Types.NestedField.optional( + 3, "f_list_string", Types.ListType.ofOptional(4, Types.StringType.get()))); + } + }); + + @Test + public void testLogDataSerialize() throws IOException { + + LogDataJsonSerialization logDataJsonSerialization = + new LogDataJsonSerialization<>(userSchema, LogRecordV1.FIELD_GETTER_FACTORY); + GenericRowData rowData = new GenericRowData(4); + rowData.setField(0, true); + rowData.setField(1, 1); + rowData.setField(2, 123456789L); + rowData.setField( + 3, + new GenericArrayData( + new StringData[] { + null, StringData.fromString("b"), null, StringData.fromString("c"), null + })); + LogData logData = + new LogRecordV1( + 
FormatVersion.FORMAT_VERSION_V1, + IdGenerator.generateUpstreamId(), + 123455L, + false, + ChangeAction.INSERT, + rowData); + + byte[] bytes = logDataJsonSerialization.serialize(logData); + + Assert.assertNotNull(bytes); + String actualJson = new String(Bytes.subByte(bytes, 18, bytes.length - 18)); + String expected = + "{\"f_boolean\":true,\"f_int\":1,\"f_long\":123456789,\"f_list_string\":[null,\"b\",null,\"c\",null]}"; + assertEquals(expected, actualJson); + + LogDataJsonDeserialization logDataJsonDeserialization = + new LogDataJsonDeserialization<>( + userSchema, LogRecordV1.factory, LogRecordV1.arrayFactory, LogRecordV1.mapFactory); + LogData result = logDataJsonDeserialization.deserialize(bytes); + Assert.assertNotNull(result); + check(logData, result); + } + + @Test + public void testLogDataSerializeNullList() throws IOException { + + LogDataJsonSerialization logDataJsonSerialization = + new LogDataJsonSerialization<>(userSchema, LogRecordV1.FIELD_GETTER_FACTORY); + GenericRowData rowData = new GenericRowData(4); + rowData.setField(0, true); + rowData.setField(1, 1); + rowData.setField(2, 123456789L); + rowData.setField(3, new GenericArrayData(new StringData[] {null, null, null})); + LogData logData = + new LogRecordV1( + FormatVersion.FORMAT_VERSION_V1, + IdGenerator.generateUpstreamId(), + 123455L, + false, + ChangeAction.INSERT, + rowData); + + byte[] bytes = logDataJsonSerialization.serialize(logData); + + Assert.assertNotNull(bytes); + String actualJson = new String(Bytes.subByte(bytes, 18, bytes.length - 18)); + String expected = + "{\"f_boolean\":true,\"f_int\":1,\"f_long\":123456789,\"f_list_string\":[null,null,null]}"; + assertEquals(expected, actualJson); + + LogDataJsonDeserialization logDataJsonDeserialization = + new LogDataJsonDeserialization<>( + userSchema, LogRecordV1.factory, LogRecordV1.arrayFactory, LogRecordV1.mapFactory); + LogData result = logDataJsonDeserialization.deserialize(bytes); + Assert.assertNotNull(result); + check(logData, 
result); + } + + private void check(LogData expected, LogData actual) { + assertArrayEquals(expected.getVersionBytes(), actual.getVersionBytes()); + assertArrayEquals(expected.getUpstreamIdBytes(), actual.getUpstreamIdBytes()); + assertEquals(expected.getEpicNo(), actual.getEpicNo()); + assertEquals(expected.getFlip(), actual.getFlip()); + assertEquals(expected.getChangeActionByte(), actual.getChangeActionByte()); + assertEquals(expected.getActualValue().toString(), actual.getActualValue().toString()); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/shuffle/TestRoundRobinShuffleRulePolicy.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/shuffle/TestRoundRobinShuffleRulePolicy.java new file mode 100644 index 0000000000..c870eb2287 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/shuffle/TestRoundRobinShuffleRulePolicy.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.shuffle; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.data.DataTreeNode; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.shade.guava32.com.google.common.collect.Sets; +import org.apache.flink.api.common.functions.Partitioner; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.table.data.RowData; +import org.junit.Assert; +import org.junit.Assume; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.util.Map; +import java.util.Set; + +@RunWith(Parameterized.class) +public class TestRoundRobinShuffleRulePolicy extends FlinkTestBase { + + public TestRoundRobinShuffleRulePolicy(boolean keyedTable, boolean partitionedTable) { + super( + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(keyedTable, partitionedTable)); + } + + @Parameterized.Parameters(name = "keyedTable = {0}, partitionedTable = {1}") + public static Object[][] parameters() { + return new Object[][] { + {true, true}, + {true, false}, + {false, true}, + {false, false} + }; + } + + @Test + public void testPrimaryKeyPartitionedTable() throws Exception { + Assume.assumeTrue(isKeyedTable()); + Assume.assumeTrue(isPartitionedTable()); + ShuffleHelper helper = + ShuffleHelper.build(getMixedTable(), getMixedTable().schema(), FLINK_ROW_TYPE); + RoundRobinShuffleRulePolicy policy = new RoundRobinShuffleRulePolicy(helper, 5, 2); + Map> subTaskTreeNodes = policy.getSubtaskTreeNodes(); + Assert.assertEquals(subTaskTreeNodes.size(), 5); + subTaskTreeNodes + .values() + .forEach( + nodes -> { + Assert.assertEquals(nodes.size(), 2); + Assert.assertTrue(nodes.contains(DataTreeNode.of(1, 0))); + Assert.assertTrue(nodes.contains(DataTreeNode.of(1, 1))); + }); + + KeySelector keySelector = policy.generateKeySelector(); 
+ Partitioner partitioner = policy.generatePartitioner(); + Assert.assertEquals( + partitioner.partition( + keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), + partitioner.partition( + keySelector.getKey(createRowData(1, "hello2", "2022-10-11T10:10:11.0")), 5)); + + Assert.assertNotEquals( + partitioner.partition( + keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), + partitioner.partition( + keySelector.getKey(createRowData(1, "hello2", "2022-10-12T10:10:11.0")), 5)); + + Assert.assertNotEquals( + partitioner.partition( + keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), + partitioner.partition( + keySelector.getKey(createRowData(2, "hello2", "2022-10-11T10:10:11.0")), 5)); + } + + @Test + public void testPrimaryKeyTableWithoutPartition() throws Exception { + Assume.assumeTrue(isKeyedTable()); + Assume.assumeFalse(isPartitionedTable()); + ShuffleHelper helper = + ShuffleHelper.build(getMixedTable(), getMixedTable().schema(), FLINK_ROW_TYPE); + RoundRobinShuffleRulePolicy policy = new RoundRobinShuffleRulePolicy(helper, 5, 2); + Map> subTaskTreeNodes = policy.getSubtaskTreeNodes(); + Assert.assertEquals(subTaskTreeNodes.size(), 5); + Assert.assertEquals( + subTaskTreeNodes.get(0), Sets.newHashSet(DataTreeNode.of(7, 0), DataTreeNode.of(7, 5))); + Assert.assertEquals( + subTaskTreeNodes.get(1), Sets.newHashSet(DataTreeNode.of(7, 1), DataTreeNode.of(7, 6))); + Assert.assertEquals( + subTaskTreeNodes.get(2), Sets.newHashSet(DataTreeNode.of(7, 2), DataTreeNode.of(7, 7))); + Assert.assertEquals(subTaskTreeNodes.get(3), Sets.newHashSet(DataTreeNode.of(7, 3))); + Assert.assertEquals(subTaskTreeNodes.get(4), Sets.newHashSet(DataTreeNode.of(7, 4))); + + KeySelector keySelector = policy.generateKeySelector(); + Partitioner partitioner = policy.generatePartitioner(); + Assert.assertEquals( + partitioner.partition( + keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), + 
partitioner.partition( + keySelector.getKey(createRowData(1, "hello2", "2022-10-11T10:10:11.0")), 5)); + + Assert.assertEquals( + partitioner.partition( + keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), + partitioner.partition( + keySelector.getKey(createRowData(1, "hello2", "2022-10-12T10:10:11.0")), 5)); + + Assert.assertNotEquals( + partitioner.partition( + keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), + partitioner.partition( + keySelector.getKey(createRowData(2, "hello2", "2022-10-11T10:10:11.0")), 5)); + } + + @Test + public void testPartitionedTableWithoutPrimaryKey() throws Exception { + Assume.assumeFalse(isKeyedTable()); + Assume.assumeTrue(isPartitionedTable()); + ShuffleHelper helper = + ShuffleHelper.build(getMixedTable(), getMixedTable().schema(), FLINK_ROW_TYPE); + RoundRobinShuffleRulePolicy policy = new RoundRobinShuffleRulePolicy(helper, 5, 2); + Map> subTaskTreeNodes = policy.getSubtaskTreeNodes(); + Assert.assertEquals(subTaskTreeNodes.size(), 5); + subTaskTreeNodes + .values() + .forEach( + nodes -> { + Assert.assertEquals(nodes.size(), 1); + Assert.assertTrue(nodes.contains(DataTreeNode.of(0, 0))); + }); + + KeySelector keySelector = policy.generateKeySelector(); + Partitioner partitioner = policy.generatePartitioner(); + Assert.assertEquals( + partitioner.partition( + keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), + partitioner.partition( + keySelector.getKey(createRowData(1, "hello2", "2022-10-11T10:10:11.0")), 5)); + + Assert.assertEquals( + partitioner.partition( + keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), + partitioner.partition( + keySelector.getKey(createRowData(2, "hello2", "2022-10-11T10:10:11.0")), 5)); + + Assert.assertNotEquals( + partitioner.partition( + keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), + partitioner.partition( + keySelector.getKey(createRowData(1, "hello2", 
"2022-10-12T10:10:11.0")), 5)); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/AmoroCatalogITCaseBase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/AmoroCatalogITCaseBase.java new file mode 100644 index 0000000000..7b5d723636 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/AmoroCatalogITCaseBase.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.table; + +import static org.apache.flink.table.api.config.TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED; + +import org.apache.amoro.TestAms; +import org.apache.amoro.formats.AmoroCatalogTestBase; +import org.apache.amoro.formats.AmoroCatalogTestHelper; +import org.apache.amoro.hive.TestHMS; +import org.apache.flink.api.common.restartstrategy.RestartStrategies; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.state.StateBackend; +import org.apache.flink.runtime.state.filesystem.FsStateBackend; +import org.apache.flink.streaming.api.CheckpointingMode; +import org.apache.flink.streaming.api.environment.CheckpointConfig; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.test.util.MiniClusterWithClientResource; +import org.apache.iceberg.flink.MiniClusterResource; +import org.junit.ClassRule; + +import java.io.IOException; + +public class AmoroCatalogITCaseBase extends AmoroCatalogTestBase { + static final TestHMS TEST_HMS = new TestHMS(); + public static final String TEST_DB_NAME = "test_db"; + public static final String TEST_TABLE_NAME = "test_table"; + + @ClassRule + public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = + MiniClusterResource.createWithClassloaderCheckDisabled(); + + @ClassRule public static TestAms TEST_AMS = new TestAms(); + + private volatile StreamTableEnvironment tEnv = null; + private volatile StreamExecutionEnvironment env = null; + + public AmoroCatalogITCaseBase(AmoroCatalogTestHelper catalogTestHelper) { + super(catalogTestHelper); + } + + @Override + public void setupCatalog() throws IOException { + super.setupCatalog(); + 
catalogTestHelper.initHiveConf(TEST_HMS.getHiveConf()); + TEST_AMS.getAmsHandler().createCatalog(catalogTestHelper.getCatalogMeta()); + } + + protected String getCatalogUrl() { + return TEST_AMS.getServerUrl() + "/" + catalogTestHelper.getCatalogMeta().getCatalogName(); + } + + protected TableResult exec(String query, Object... args) { + return exec(getTableEnv(), query, args); + } + + protected static TableResult exec(TableEnvironment env, String query, Object... args) { + return env.executeSql(String.format(query, args)); + } + + protected StreamTableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + if (tEnv == null) { + this.tEnv = + StreamTableEnvironment.create( + getEnv(), EnvironmentSettings.newInstance().inStreamingMode().build()); + Configuration configuration = tEnv.getConfig().getConfiguration(); + // set low-level key-value options + configuration.setString(TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED.key(), "true"); + } + } + } + return tEnv; + } + + protected StreamExecutionEnvironment getEnv() { + if (env == null) { + synchronized (this) { + if (env == null) { + StateBackend backend = + new FsStateBackend( + "file:///" + System.getProperty("java.io.tmpdir") + "/flink/backend"); + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + env.setParallelism(defaultParallelism()); + env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE); + env.getCheckpointConfig().setCheckpointInterval(300); + env.getCheckpointConfig() + .enableExternalizedCheckpoints( + CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); + env.setStateBackend(backend); + env.setRestartStrategy(RestartStrategies.noRestart()); + } + } + } + return env; + } + + protected int defaultParallelism() { + return 1; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/CatalogITCaseBase.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/CatalogITCaseBase.java new file mode 100644 index 0000000000..2e10a280cb --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/CatalogITCaseBase.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.table; + +import static org.apache.flink.table.api.config.TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED; + +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.CatalogTestHelper; +import org.apache.amoro.catalog.TableTestBase; +import org.apache.flink.api.common.restartstrategy.RestartStrategies; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.state.StateBackend; +import org.apache.flink.runtime.state.filesystem.FsStateBackend; +import org.apache.flink.streaming.api.CheckpointingMode; +import org.apache.flink.streaming.api.environment.CheckpointConfig; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.EnvironmentSettings; +import org.apache.flink.table.api.TableEnvironment; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.test.util.MiniClusterWithClientResource; +import org.apache.iceberg.flink.MiniClusterResource; +import org.junit.ClassRule; + +public abstract class CatalogITCaseBase extends TableTestBase { + + @ClassRule + public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = + MiniClusterResource.createWithClassloaderCheckDisabled(); + + private volatile StreamTableEnvironment tEnv = null; + private volatile StreamExecutionEnvironment env = null; + + public CatalogITCaseBase(CatalogTestHelper catalogTestHelper, TableTestHelper tableTestHelper) { + super(catalogTestHelper, tableTestHelper); + } + + protected TableResult exec(String query, Object... args) { + return exec(getTableEnv(), query, args); + } + + protected static TableResult exec(TableEnvironment env, String query, Object... 
args) { + return env.executeSql(String.format(query, args)); + } + + protected StreamTableEnvironment getTableEnv() { + if (tEnv == null) { + synchronized (this) { + if (tEnv == null) { + this.tEnv = + StreamTableEnvironment.create( + getEnv(), EnvironmentSettings.newInstance().inStreamingMode().build()); + Configuration configuration = tEnv.getConfig().getConfiguration(); + // set low-level key-value options + configuration.setString(TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED.key(), "true"); + } + } + } + return tEnv; + } + + protected StreamExecutionEnvironment getEnv() { + if (env == null) { + synchronized (this) { + if (env == null) { + StateBackend backend = + new FsStateBackend( + "file:///" + System.getProperty("java.io.tmpdir") + "/flink/backend"); + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + env.setParallelism(defaultParallelism()); + env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE); + env.getCheckpointConfig().setCheckpointInterval(300); + env.getCheckpointConfig() + .enableExternalizedCheckpoints( + CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); + env.setStateBackend(backend); + env.setRestartStrategy(RestartStrategies.noRestart()); + } + } + } + return env; + } + + protected int defaultParallelism() { + return 1; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/LookupITCase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/LookupITCase.java new file mode 100644 index 0000000000..9a1a21e09d --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/LookupITCase.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.table; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.flink.util.DataUtil; +import org.apache.amoro.flink.write.FlinkTaskWriterBaseTest; +import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; +import org.apache.amoro.table.MixedTable; +import org.apache.amoro.table.TableIdentifier; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.io.TaskWriter; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; +import java.util.concurrent.TimeUnit; + +public class LookupITCase extends CatalogITCaseBase implements FlinkTaskWriterBaseTest { + private String db; + + public LookupITCase() { + super( + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, 
false)); + } + + @Before + public void setup() throws IOException { + List dbs = getMixedFormatCatalog().listDatabases(); + if (dbs.isEmpty()) { + db = "test_db"; + getMixedFormatCatalog().createDatabase(db); + } else { + db = dbs.get(0); + } + exec( + "create catalog mixed_catalog with ('type'='arctic', 'metastore.url'='%s')", + getCatalogUri()); + exec( + "create table mixed_catalog.%s.L (id int) " + + "with ('scan.startup.mode'='earliest', 'monitor-interval'='1 s','streaming'='true')", + db); + exec( + "create table mixed_catalog.%s.DIM (id int, name string, primary key(id) not enforced) " + + "with ('write.upsert.enabled'='true', 'lookup.reloading.interval'='1 s')", + db); + exec("create view vi as select *, PROCTIME() as proc from mixed_catalog.%s.L", db); + + writeAndCommit( + TableIdentifier.of(getCatalogName(), db, "DIM"), + Lists.newArrayList(DataUtil.toRowData(1, "a"), DataUtil.toRowData(2, "b"))); + writeAndCommit( + TableIdentifier.of(getCatalogName(), db, "L"), Lists.newArrayList(DataUtil.toRowData(1))); + } + + @After + public void drop() { + exec("drop table mixed_catalog.%s.L", db); + exec("drop table mixed_catalog.%s.DIM", db); + } + + @Test() + public void testLookup() throws Exception { + TableResult tableResult = + exec( + "select L.id, D.name from vi L LEFT JOIN mixed_catalog.%s.DIM " + + "for system_time as of L.proc AS D ON L.id = D.id", + db); + + tableResult.await(1, TimeUnit.MINUTES); // wait for the first row. 
+ + writeToChangeAndCommit( + TableIdentifier.of(getCatalogName(), db, "DIM"), + Lists.newArrayList( + DataUtil.toRowData(2, "c"), + DataUtil.toRowData(3, "d"), + DataUtil.toRowData(4, "e"), + DataUtil.toRowData(5, "f")), + true); + Thread.sleep(2000); // wait dim table commit and reload + + writeToChangeAndCommit( + TableIdentifier.of(getCatalogName(), db, "L"), + Lists.newArrayList( + DataUtil.toRowData(2), + DataUtil.toRowData(3), + DataUtil.toRowData(4), + DataUtil.toRowData(5), + DataUtil.toRowData(6)), + false); + + int expected = 6, count = 0; + Set actual = new HashSet<>(); + try (CloseableIterator rows = tableResult.collect()) { + while (count < expected && rows.hasNext()) { + Row row = rows.next(); + actual.add(row); + count++; + } + } + + Assert.assertEquals(expected, actual.size()); + List expects = new LinkedList<>(); + expects.add(new Object[] {1, "a"}); + expects.add(new Object[] {2, "c"}); + expects.add(new Object[] {3, "d"}); + expects.add(new Object[] {4, "e"}); + expects.add(new Object[] {5, "f"}); + expects.add(new Object[] {6, null}); + Assert.assertEquals(DataUtil.toRowSet(expects), actual); + } + + @Override + public String getMetastoreUri() { + return getCatalogUri(); + } + + @Override + public String getCatalogName() { + return getMixedFormatCatalog().name(); + } + + @Override + public boolean upsertEnabled() { + return true; + } + + private void writeAndCommit(TableIdentifier table, List expected) throws IOException { + writeAndCommit(table, expected, true, false); + } + + private void writeToChangeAndCommit( + TableIdentifier table, List expected, boolean upsertEnabled) throws IOException { + writeAndCommit(table, expected, false, upsertEnabled); + } + + private void writeAndCommit( + TableIdentifier table, + List expected, + boolean writeToBaseStore, + boolean upsertEnabled) + throws IOException { + MixedTable mixedTable = getMixedFormatCatalog().loadTable(table); + Assert.assertNotNull(mixedTable); + RowType rowType = 
FlinkSchemaUtil.convert(mixedTable.schema()); + for (RowData rowData : expected) { + try (TaskWriter taskWriter = + writeToBaseStore + ? createBaseTaskWriter(mixedTable, rowType) + : createTaskWriter(mixedTable, rowType)) { + if (writeToBaseStore) { + writeAndCommit(rowData, taskWriter, mixedTable); + } else { + writeAndCommit(rowData, taskWriter, mixedTable, upsertEnabled); + } + } + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestJoin.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestJoin.java new file mode 100644 index 0000000000..da5347a5e5 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestJoin.java @@ -0,0 +1,367 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.table; + +import static org.apache.flink.table.planner.factories.TestValuesTableFactory.registerData; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.flink.util.DataUtil; +import org.apache.amoro.flink.util.MixedFormatUtils; +import org.apache.amoro.flink.util.TestUtil; +import org.apache.amoro.table.KeyedTable; +import org.apache.amoro.table.TableIdentifier; +import org.apache.flink.api.common.JobStatus; +import org.apache.flink.runtime.testutils.CommonTestUtils; +import org.apache.flink.shaded.guava30.com.google.common.collect.Lists; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.planner.factories.TestValuesTableFactory; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.io.TaskWriter; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +public class TestJoin extends FlinkTestBase { + + public static final Logger LOG = LoggerFactory.getLogger(TestJoin.class); + + @Rule public 
TemporaryFolder tempFolder = new TemporaryFolder(); + + private static final String DB = TableTestHelper.TEST_DB_NAME; + private static final String TABLE = "test_keyed"; + private static final TableIdentifier TABLE_ID = + TableIdentifier.of(TableTestHelper.TEST_CATALOG_NAME, TableTestHelper.TEST_DB_NAME, TABLE); + + public TestJoin() { + super( + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(false, false)); + } + + @Before + public void before() throws Exception { + super.before(); + super.config(); + } + + @After + public void after() { + getMixedFormatCatalog().dropTable(TABLE_ID, true); + } + + @Test + public void testRightEmptyLookupJoin() throws Exception { + getEnv().getCheckpointConfig().disableCheckpointing(); + List data = new LinkedList<>(); + data.add(new Object[] {RowKind.INSERT, 1000004L, "a", LocalDateTime.now()}); + data.add(new Object[] {RowKind.INSERT, 1000015L, "b", LocalDateTime.now()}); + data.add(new Object[] {RowKind.INSERT, 1000011L, "c", LocalDateTime.now()}); + data.add(new Object[] {RowKind.INSERT, 1000022L, "d", LocalDateTime.now()}); + data.add(new Object[] {RowKind.INSERT, 1000021L, "e", LocalDateTime.now()}); + data.add(new Object[] {RowKind.INSERT, 1000016L, "e", LocalDateTime.now()}); + String id = TestValuesTableFactory.registerData(DataUtil.toRowList(data)); + sql( + "CREATE TABLE `user` (id bigint, name string, op_time timestamp(3), watermark for op_time as op_time) " + + "with (" + + " 'connector' = 'values'," + + " 'bounded' = 'false'," + + " 'data-id' = '" + + id + + "' " + + " )"); + + sql(String.format("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props))); + Map tableProperties = new HashMap<>(); + String table = String.format("mixed_catalog.%s.%s", DB, TABLE); + + String sql = + String.format( + "CREATE TABLE IF NOT EXISTS %s (" + + " info int, id bigint, name STRING" + + ", PRIMARY KEY (id) NOT ENFORCED) WITH %s", + table, toWithClause(tableProperties)); + sql(sql); + + 
sql("create table d (op_time timestamp(3), watermark for op_time as op_time) like %s", table); + + TableResult result = + exec( + "select u.name, u.id, dim.info, dim.name dname from `user` as u left join d " + + "/*+OPTIONS('streaming'='true', 'dim-table.enabled'='true')*/ for system_time as of u.op_time as dim" + + " on u.id = dim.id"); + + CommonTestUtils.waitForJobStatus( + result.getJobClient().get(), Lists.newArrayList(JobStatus.RUNNING)); + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (Object[] datum : data) { + Row row = iterator.next(); + actual.add(row); + } + } + result.getJobClient().ifPresent(TestUtil::cancelJob); + + List expected = new LinkedList<>(); + expected.add(new Object[] {"a", 1000004L, null, null}); + expected.add(new Object[] {"b", 1000015L, null, null}); + expected.add(new Object[] {"c", 1000011L, null, null}); + expected.add(new Object[] {"d", 1000022L, null, null}); + expected.add(new Object[] {"e", 1000021L, null, null}); + expected.add(new Object[] {"e", 1000016L, null, null}); + Assert.assertEquals(DataUtil.toRowSet(expected), actual); + } + + @Test + public void testLookupJoin() throws Exception { + getEnv().getCheckpointConfig().disableCheckpointing(); + List data = new LinkedList<>(); + data.add(new Object[] {RowKind.INSERT, 1L, "a", LocalDateTime.now().minusDays(3)}); + data.add(new Object[] {RowKind.INSERT, 2L, "b", LocalDateTime.now()}); + data.add(new Object[] {RowKind.INSERT, 3L, "c", LocalDateTime.now()}); + data.add(new Object[] {RowKind.INSERT, 4L, "d", LocalDateTime.now().plusDays(3)}); + data.add(new Object[] {RowKind.INSERT, 5L, "e", LocalDateTime.now().plusDays(3)}); + data.add(new Object[] {RowKind.INSERT, 3L, "e", LocalDateTime.now()}); + data.add(new Object[] {RowKind.INSERT, 6L, "f", LocalDateTime.now()}); + data.add(new Object[] {RowKind.INSERT, 8L, "g", LocalDateTime.now()}); + data.add(new Object[] {RowKind.INSERT, 9L, "h", LocalDateTime.now()}); + String id = 
registerData(DataUtil.toRowList(data)); + sql( + "CREATE TABLE `user` (id bigint, name string, op_time timestamp(3), watermark for op_time as op_time) " + + "with (" + + " 'connector' = 'values'," + + " 'bounded' = 'false'," + + " 'data-id' = '" + + id + + "' " + + " )"); + + sql(String.format("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props))); + Map tableProperties = new HashMap<>(); + String table = String.format("mixed_catalog.%s.%s", DB, TABLE); + + String sql = + String.format( + "CREATE TABLE IF NOT EXISTS %s (" + + " info int, id bigint, name STRING" + + ", PRIMARY KEY (id) NOT ENFORCED) WITH %s", + table, toWithClause(tableProperties)); + sql(sql); + + TableSchema flinkSchema = + TableSchema.builder() + .field("info", DataTypes.INT()) + .field("id", DataTypes.BIGINT()) + .field("name", DataTypes.STRING()) + .build(); + RowType rowType = (RowType) flinkSchema.toRowDataType().getLogicalType(); + KeyedTable keyedTable = + (KeyedTable) + MixedFormatUtils.loadMixedTable(MixedFormatTableLoader.of(TABLE_ID, catalogBuilder)); + TaskWriter taskWriter = createKeyedTaskWriter(keyedTable, rowType, true); + List baseData = + new ArrayList() { + { + add(GenericRowData.ofKind(RowKind.INSERT, 123, 1L, StringData.fromString("a"))); + add(GenericRowData.ofKind(RowKind.INSERT, 324, 2L, StringData.fromString("b"))); + add(GenericRowData.ofKind(RowKind.INSERT, 456, 3L, StringData.fromString("c"))); + add(GenericRowData.ofKind(RowKind.INSERT, 463, 4L, StringData.fromString("d"))); + } + }; + for (RowData record : baseData) { + taskWriter.write(record); + } + commit(keyedTable, taskWriter.complete(), true); + + writeChange(keyedTable, rowType); + + sql("create table d (op_time timestamp(3), watermark for op_time as op_time) like %s", table); + + TableResult result = + exec( + "select u.name, u.id, dim.info, dim.name dname from `user` as u left join d " + + "/*+OPTIONS('streaming'='true', 'dim-table.enabled'='true')*/ for system_time as of u.op_time as dim" + + " on 
u.id = dim.id"); + + CommonTestUtils.waitForJobStatus( + result.getJobClient().get(), Lists.newArrayList(JobStatus.RUNNING)); + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (Object[] datum : data) { + Row row = iterator.next(); + actual.add(row); + } + } + result.getJobClient().ifPresent(TestUtil::cancelJob); + + List expected = new LinkedList<>(); + expected.add(new Object[] {"a", 1L, 123, "a"}); + expected.add(new Object[] {"b", 2L, 324, "b"}); + expected.add(new Object[] {"c", 3L, null, null}); + expected.add(new Object[] {"d", 4L, 463, "d"}); + expected.add(new Object[] {"e", 5L, 324, "john"}); + expected.add(new Object[] {"e", 3L, null, null}); + expected.add(new Object[] {"f", 6L, 324, "lily"}); + expected.add(new Object[] {"g", 8L, null, null}); + expected.add(new Object[] {"h", 9L, null, null}); + Assert.assertEquals(DataUtil.toRowSet(expected), actual); + } + + @Test + public void testLookupJoinWithPartialFields() throws Exception { + getEnv().getCheckpointConfig().disableCheckpointing(); + List data = new LinkedList<>(); + data.add(new Object[] {RowKind.INSERT, 1L, "a", LocalDateTime.now().minusDays(3)}); + data.add(new Object[] {RowKind.INSERT, 2L, "b", LocalDateTime.now()}); + data.add(new Object[] {RowKind.INSERT, 3L, "c", LocalDateTime.now()}); + data.add(new Object[] {RowKind.INSERT, 4L, "d", LocalDateTime.now().plusDays(3)}); + data.add(new Object[] {RowKind.INSERT, 5L, "e", LocalDateTime.now().plusDays(3)}); + data.add(new Object[] {RowKind.INSERT, 3L, "e", LocalDateTime.now()}); + data.add(new Object[] {RowKind.INSERT, 6L, "f", LocalDateTime.now()}); + data.add(new Object[] {RowKind.INSERT, 8L, "g", LocalDateTime.now()}); + data.add(new Object[] {RowKind.INSERT, 9L, "h", LocalDateTime.now()}); + String id = registerData(DataUtil.toRowList(data)); + sql( + "CREATE TABLE `user` (id bigint, name string, op_time timestamp(3), watermark for op_time as op_time) " + + "with (" + + " 'connector' = 'values'," 
+ + " 'bounded' = 'false'," + + " 'data-id' = '" + + id + + "' " + + " )"); + + sql(String.format("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props))); + Map tableProperties = new HashMap<>(); + String table = String.format("mixed_catalog.%s.%s", DB, TABLE); + + String sql = + String.format( + "CREATE TABLE IF NOT EXISTS %s (" + + " info int, id bigint, name STRING" + + ", PRIMARY KEY (id) NOT ENFORCED) WITH %s", + table, toWithClause(tableProperties)); + sql(sql); + + TableSchema flinkSchema = + TableSchema.builder() + .field("info", DataTypes.INT()) + .field("id", DataTypes.BIGINT()) + .field("name", DataTypes.STRING()) + .build(); + RowType rowType = (RowType) flinkSchema.toRowDataType().getLogicalType(); + KeyedTable keyedTable = + (KeyedTable) + MixedFormatUtils.loadMixedTable(MixedFormatTableLoader.of(TABLE_ID, catalogBuilder)); + TaskWriter taskWriter = createKeyedTaskWriter(keyedTable, rowType, true); + List baseData = + new ArrayList() { + { + add(GenericRowData.ofKind(RowKind.INSERT, 123, 1L, StringData.fromString("a"))); + add(GenericRowData.ofKind(RowKind.INSERT, 324, 2L, StringData.fromString("b"))); + add(GenericRowData.ofKind(RowKind.INSERT, 456, 3L, StringData.fromString("c"))); + add(GenericRowData.ofKind(RowKind.INSERT, 463, 4L, StringData.fromString("d"))); + } + }; + for (RowData record : baseData) { + taskWriter.write(record); + } + commit(keyedTable, taskWriter.complete(), true); + + writeChange(keyedTable, rowType); + + sql("create table d (op_time timestamp(3), watermark for op_time as op_time) like %s", table); + + // schema fields:[info, id, name], now only use [id, name] + TableResult result = + exec( + "select u.name, u.id, dim.name dname from `user` as u left join d " + + "/*+OPTIONS('streaming'='true', 'dim-table.enabled'='true')*/ for system_time as of u.op_time as dim" + + " on u.id = dim.id"); + + CommonTestUtils.waitForJobStatus( + result.getJobClient().get(), Lists.newArrayList(JobStatus.RUNNING)); + Set actual = new 
HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (Object[] datum : data) { + Row row = iterator.next(); + actual.add(row); + } + } + result.getJobClient().ifPresent(TestUtil::cancelJob); + + List expected = new LinkedList<>(); + expected.add(new Object[] {"a", 1L, "a"}); + expected.add(new Object[] {"b", 2L, "b"}); + expected.add(new Object[] {"c", 3L, null}); + expected.add(new Object[] {"d", 4L, "d"}); + expected.add(new Object[] {"e", 5L, "john"}); + expected.add(new Object[] {"e", 3L, null}); + expected.add(new Object[] {"f", 6L, "lily"}); + expected.add(new Object[] {"g", 8L, null}); + expected.add(new Object[] {"h", 9L, null}); + Assert.assertEquals(DataUtil.toRowSet(expected), actual); + } + + private void writeChange(KeyedTable keyedTable, RowType rowType) { + TaskWriter taskWriter = createKeyedTaskWriter(keyedTable, rowType, false); + List data = + new ArrayList() { + { + add(GenericRowData.ofKind(RowKind.INSERT, 324, 5L, StringData.fromString("john"))); + add(GenericRowData.ofKind(RowKind.INSERT, 324, 6L, StringData.fromString("lily"))); + add(GenericRowData.ofKind(RowKind.DELETE, 324, 3L, StringData.fromString("jake1"))); + } + }; + try { + for (RowData record : data) { + taskWriter.write(record); + } + commit(keyedTable, taskWriter.complete(), false); + } catch (IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestKeyed.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestKeyed.java new file mode 100644 index 0000000000..05fc24eb23 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestKeyed.java @@ -0,0 +1,1164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.table; + +import static org.apache.amoro.table.TableProperties.ENABLE_LOG_STORE; +import static org.apache.amoro.table.TableProperties.LOG_STORE_ADDRESS; +import static org.apache.amoro.table.TableProperties.LOG_STORE_MESSAGE_TOPIC; +import static org.apache.amoro.table.TableProperties.LOG_STORE_STORAGE_TYPE_KAFKA; +import static org.apache.amoro.table.TableProperties.LOG_STORE_TYPE; +import static org.apache.flink.table.api.Expressions.$; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.catalog.CatalogTestHelper; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.flink.kafka.testutils.KafkaContainerTest; +import org.apache.amoro.flink.util.DataUtil; +import org.apache.amoro.flink.util.TestUtil; +import org.apache.amoro.hive.TestHMS; +import org.apache.amoro.hive.catalog.HiveCatalogTestHelper; +import org.apache.amoro.hive.catalog.HiveTableTestHelper; +import org.apache.amoro.table.TableProperties; +import org.apache.commons.collections.CollectionUtils; +import org.apache.flink.streaming.api.datastream.DataStream; +import 
org.apache.flink.table.api.ApiExpression; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.CloseableIterator; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.rules.TestName; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +@RunWith(Parameterized.class) +public class TestKeyed extends FlinkTestBase { + + public static final Logger LOG = LoggerFactory.getLogger(TestKeyed.class); + + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); + @Rule public TestName testName = new TestName(); + @ClassRule public static TestHMS TEST_HMS = new TestHMS(); + + private static final String DB = TableTestHelper.TEST_TABLE_ID.getDatabase(); + private static final String TABLE = "test_keyed"; + + private String catalog; + private String db; + private String topic; + private final Map tableProperties = new HashMap<>(); + public boolean isHive; + + public TestKeyed( + CatalogTestHelper catalogTestHelper, TableTestHelper tableTestHelper, boolean isHive) { + super(catalogTestHelper, tableTestHelper); + this.isHive = isHive; + } + + @Parameterized.Parameters(name = "{0}, {1}, 
{2}") + public static Collection parameters() { + return Arrays.asList( + new Object[][] { + { + new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), + new HiveTableTestHelper(true, true), + true + }, + { + new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), + new HiveTableTestHelper(true, true), + true + }, + { + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, true), + false + }, + { + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, true), + false + } + }); + } + + @BeforeClass + public static void beforeClass() throws Exception { + FlinkTestBase.prepare(); + } + + @AfterClass + public static void afterClass() throws Exception { + FlinkTestBase.shutdown(); + } + + @Before + public void before() throws Exception { + if (isHive) { + db = HiveTableTestHelper.TEST_DB_NAME; + } else { + db = DB; + } + super.before(); + prepareLog(); + super.config(); + } + + @After + public void after() { + sql("DROP TABLE IF EXISTS mixed_catalog." + db + "." 
+ TABLE); + } + + private void prepareLog() { + topic = TestUtil.getUtMethodName(testName) + isHive; + tableProperties.clear(); + tableProperties.put(ENABLE_LOG_STORE, "true"); + tableProperties.put(LOG_STORE_MESSAGE_TOPIC, topic); + KafkaContainerTest.createTopics(KAFKA_PARTITION_NUMS, 1, topic); + tableProperties.put(LOG_STORE_TYPE, LOG_STORE_STORAGE_TYPE_KAFKA); + tableProperties.put( + LOG_STORE_ADDRESS, KafkaContainerTest.KAFKA_CONTAINER.getBootstrapServers()); + } + + @Test + public void testSinkSourceFile() throws IOException { + + List data = new LinkedList<>(); + data.add( + new Object[] { + RowKind.INSERT, + 1000004, + "a", + LocalDateTime.parse("2022-06-17T10:10:11.0"), + LocalDateTime.parse("2022-06-17T10:10:11.0").atZone(ZoneId.systemDefault()).toInstant() + }); + data.add( + new Object[] { + RowKind.DELETE, + 1000015, + "b", + LocalDateTime.parse("2022-06-17T10:08:11.0"), + LocalDateTime.parse("2022-06-17T10:08:11.0").atZone(ZoneId.systemDefault()).toInstant() + }); + data.add( + new Object[] { + RowKind.DELETE, + 1000011, + "c", + LocalDateTime.parse("2022-06-18T10:10:11.0"), + LocalDateTime.parse("2022-06-18T10:10:11.0").atZone(ZoneId.systemDefault()).toInstant() + }); + data.add( + new Object[] { + RowKind.UPDATE_BEFORE, + 1000021, + "d", + LocalDateTime.parse("2022-06-17T10:11:11.0"), + LocalDateTime.parse("2022-06-17T10:11:11.0").atZone(ZoneId.systemDefault()).toInstant() + }); + data.add( + new Object[] { + RowKind.UPDATE_AFTER, + 1000021, + "e", + LocalDateTime.parse("2022-06-17T10:11:11.0"), + LocalDateTime.parse("2022-06-17T10:11:11.0").atZone(ZoneId.systemDefault()).toInstant() + }); + data.add( + new Object[] { + RowKind.INSERT, + 1000015, + "e", + LocalDateTime.parse("2022-06-17T10:10:11.0"), + LocalDateTime.parse("2022-06-17T10:10:11.0").atZone(ZoneId.systemDefault()).toInstant() + }); + + DataStream source = + getEnv() + .fromCollection( + DataUtil.toRowData(data), + InternalTypeInfo.ofFields( + DataTypes.INT().getLogicalType(), + 
DataTypes.VARCHAR(100).getLogicalType(), + DataTypes.TIMESTAMP().getLogicalType(), + DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE().getLogicalType())); + + Table input = + getTableEnv().fromDataStream(source, $("id"), $("name"), $("op_time"), $("op_time_tz")); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + sql( + "CREATE TABLE mixed_catalog." + + db + + "." + + TABLE + + " (" + + " id INT," + + " name STRING," + + " op_time_tz TIMESTAMP WITH LOCAL TIME ZONE," + + " op_time TIMESTAMP," + + " PRIMARY KEY (id) NOT ENFORCED " + + ") PARTITIONED BY(op_time) " + + " WITH (" + + " 'connector' = 'arctic'" + + ")"); + + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + "/*+ OPTIONS(" + + "'mixed-format.emit.mode'='file'" + + ")*/ select id, name, op_time_tz, op_time from input"); + + List actual = + sql( + "select id, op_time, op_time_tz from mixed_catalog." + + db + + "." + + TABLE + + "/*+ OPTIONS(" + + "'mixed-format.read.mode'='file'" + + ", 'streaming'='false'" + + ", 'source.parallelism'='2'" + + ")*/"); + + List expected = new LinkedList<>(); + expected.add( + new Object[] { + RowKind.INSERT, + 1000004, + LocalDateTime.parse("2022-06-17T10:10:11.0"), + LocalDateTime.parse("2022-06-17T10:10:11.0").atZone(ZoneId.systemDefault()).toInstant() + }); + expected.add( + new Object[] { + RowKind.INSERT, + 1000021, + LocalDateTime.parse("2022-06-17T10:11:11.0"), + LocalDateTime.parse("2022-06-17T10:11:11.0").atZone(ZoneId.systemDefault()).toInstant() + }); + expected.add( + new Object[] { + RowKind.INSERT, + 1000015, + LocalDateTime.parse("2022-06-17T10:10:11.0"), + LocalDateTime.parse("2022-06-17T10:10:11.0").atZone(ZoneId.systemDefault()).toInstant() + }); + + Assert.assertTrue(CollectionUtils.isEqualCollection(DataUtil.toRowList(expected), actual)); + } + + @Test + public void testUnpartitionLogSinkSource() throws Exception { + List data = new LinkedList<>(); + data.add(new Object[] 
{1000004, "a"}); + data.add(new Object[] {1000015, "b"}); + data.add(new Object[] {1000011, "c"}); + data.add(new Object[] {1000014, "d"}); + data.add(new Object[] {1000021, "d"}); + data.add(new Object[] {1000007, "e"}); + + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, PRIMARY KEY (id) NOT ENFORCED) WITH %s", + toWithClause(tableProperties)); + + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'mixed-format.emit.mode'='log'" + + ", 'log.version'='v1'" + + ") */" + + " select * from input"); + + TableResult result = + exec( + "select * from mixed_catalog." + + db + + "." 
+ + TABLE + + "/*+ OPTIONS(" + + "'mixed-format.read.mode'='log'" + + ", 'scan.startup.mode'='earliest'" + + ", 'source.parallelism'='2'" + + ")*/"); + + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (Object[] datum : data) { + Row row = iterator.next(); + actual.add(row); + } + } + Assert.assertEquals(DataUtil.toRowSet(data), actual); + result.getJobClient().ifPresent(TestUtil::cancelJob); + } + + @Test + public void testUnpartitionLogSinkSourceWithSelectedFields() throws Exception { + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000015, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED) WITH %s", + toWithClause(tableProperties)); + + sql( + "insert into mixed_catalog." + + db + + "." 
+ + TABLE + + " /*+ OPTIONS(" + + "'mixed-format.emit.mode'='log'" + + ", 'log.version'='v1'" + + ") */" + + " select * from input"); + + TableResult result = + exec( + "select id, op_time from mixed_catalog." + + db + + "." + + TABLE + + "/*+ OPTIONS(" + + "'mixed-format.read.mode'='log'" + + ", 'scan.startup.mode'='earliest'" + + ")*/"); + + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (Object[] datum : data) { + Row row = iterator.next(); + actual.add(row); + } + } + + List expected = new LinkedList<>(); + expected.add(new Object[] {1000004, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add(new Object[] {1000015, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add(new Object[] {1000011, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add(new Object[] {1000014, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + expected.add(new Object[] {1000015, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + expected.add(new Object[] {1000007, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + expected.add(new Object[] {1000007, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + + Assert.assertEquals(DataUtil.toRowSet(expected), actual); + result.getJobClient().ifPresent(TestUtil::cancelJob); + } + + @Test + public void testUnPartitionDoubleSink() throws Exception { + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a"}); + data.add(new Object[] {1000015, "b"}); + data.add(new Object[] {1000011, "c"}); + data.add(new Object[] {1000014, "d"}); + data.add(new Object[] {1000021, "d"}); + data.add(new Object[] {1000007, "e"}); + + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING())), + rows); + getTableEnv().createTemporaryView("input", input); + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + sql( + "CREATE TABLE IF NOT EXISTS 
mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, PRIMARY KEY (id) NOT ENFORCED) WITH %s", + toWithClause(tableProperties)); + + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'mixed-format.emit.mode'='file, log'" + + ") */" + + "select id, name from input"); + + Assert.assertEquals( + DataUtil.toRowSet(data), + new HashSet<>( + sql( + "select * from mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS('streaming'='false') */"))); + + TableResult result = + exec( + "select * from mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS('mixed-format.read.mode'='log', 'scan.startup.mode'='earliest') */"); + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (Object[] datum : data) { + actual.add(iterator.next()); + } + } + Assert.assertEquals(DataUtil.toRowSet(data), actual); + result.getJobClient().ifPresent(TestUtil::cancelJob); + } + + @Test + public void testPartitionSinkFile() throws IOException { + + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000015, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE 
CATALOG mixed_catalog WITH %s", toWithClause(props)); + + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED " + + ") PARTITIONED BY(op_time) WITH ('connector' = 'arctic')"); + + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + "/*+ OPTIONS(" + + "'mixed-format.emit.mode'='file'" + + ")*/" + + " select * from input"); + + Assert.assertEquals( + DataUtil.toRowSet(data), + new HashSet<>( + sql( + "select * from mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'streaming'='false'" + + ") */"))); + } + + @Test + public void testSinkSourceFileWithoutSelectPK() throws Exception { + + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000015, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED " + + ") WITH ('connector' = 'arctic')"); + + sql( + "insert into mixed_catalog." 
+ + db + + "." + + TABLE + + "/*+ OPTIONS(" + + "'mixed-format.emit.mode'='file'" + + ")*/" + + " select * from input"); + + TableResult result = + exec( + "select name, op_time from mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'streaming'='false'" + + ") */"); + LinkedList actual = new LinkedList<>(); + try (CloseableIterator iterator = result.collect()) { + while (iterator.hasNext()) { + Row row = iterator.next(); + actual.add(row); + } + } + + List expected = new LinkedList<>(); + expected.add(new Object[] {"a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add(new Object[] {"b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add(new Object[] {"c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add(new Object[] {"d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + expected.add(new Object[] {"d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + expected.add(new Object[] {"e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + + Assert.assertEquals(DataUtil.toRowSet(expected), new HashSet<>(actual)); + } + + @Test + public void testFileUpsert() { + + List data = new LinkedList<>(); + data.add( + new Object[] {RowKind.INSERT, 1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add( + new Object[] {RowKind.DELETE, 1000015, "b", LocalDateTime.parse("2022-06-17T10:08:11.0")}); + data.add( + new Object[] {RowKind.DELETE, 1000011, "c", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add( + new Object[] { + RowKind.UPDATE_BEFORE, 1000021, "d", LocalDateTime.parse("2022-06-17T10:11:11.0") + }); + data.add( + new Object[] { + RowKind.UPDATE_AFTER, 1000021, "e", LocalDateTime.parse("2022-06-17T10:11:11.0") + }); + data.add( + new Object[] { + RowKind.UPDATE_AFTER, 1000021, "e", LocalDateTime.parse("2022-06-17T10:11:11.0") + }); + data.add( + new Object[] {RowKind.INSERT, 1000015, "e", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add( + new Object[] {RowKind.INSERT, 1000021, "e", 
LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add( + new Object[] { + RowKind.UPDATE_BEFORE, 1000021, "e", LocalDateTime.parse("2022-06-17T10:10:11.0") + }); + data.add( + new Object[] { + RowKind.UPDATE_AFTER, 1000021, "d", LocalDateTime.parse("2022-06-17T10:10:11.0") + }); + data.add( + new Object[] { + RowKind.UPDATE_BEFORE, 1000015, "e", LocalDateTime.parse("2022-06-17T10:10:11.0") + }); + data.add( + new Object[] { + RowKind.UPDATE_AFTER, 1000021, "f", LocalDateTime.parse("2022-06-17T10:10:11.0") + }); + DataStream source = + getEnv() + .fromCollection( + DataUtil.toRowData(data), + InternalTypeInfo.ofFields( + DataTypes.INT().getLogicalType(), + DataTypes.VARCHAR(100).getLogicalType(), + DataTypes.TIMESTAMP().getLogicalType())); + + Table input = getTableEnv().fromDataStream(source, $("id"), $("name"), $("op_time")); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + Map tableProperties = new HashMap<>(); + tableProperties.put(TableProperties.UPSERT_ENABLED, "true"); + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED " + + ") PARTITIONED BY(op_time) WITH %s", + toWithClause(tableProperties)); + + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + "/*+ OPTIONS(" + + "'mixed-format.emit.mode'='file'" + + ")*/" + + " select * from input"); + + List expected = new LinkedList<>(); + expected.add( + new Object[] {RowKind.INSERT, 1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + // key = 1000021 locate in two partitions. + expected.add( + new Object[] {RowKind.INSERT, 1000021, "e", LocalDateTime.parse("2022-06-17T10:11:11.0")}); + expected.add( + new Object[] {RowKind.INSERT, 1000021, "f", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + Assert.assertEquals( + DataUtil.toRowSet(expected), + new HashSet<>( + sql( + "select * from mixed_catalog." 
+ + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'streaming'='false'" + + ") */"))); + } + + @Test + public void testFileCDC() { + + List data = new LinkedList<>(); + data.add( + new Object[] {RowKind.INSERT, 1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add( + new Object[] {RowKind.DELETE, 1000015, "b", LocalDateTime.parse("2022-06-17T10:08:11.0")}); + data.add( + new Object[] {RowKind.DELETE, 1000011, "c", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add( + new Object[] { + RowKind.UPDATE_BEFORE, 1000021, "d", LocalDateTime.parse("2022-06-17T10:11:11.0") + }); + data.add( + new Object[] { + RowKind.UPDATE_AFTER, 1000021, "e", LocalDateTime.parse("2022-06-17T10:11:11.0") + }); + data.add( + new Object[] { + RowKind.UPDATE_AFTER, 1000021, "e", LocalDateTime.parse("2022-06-17T10:11:11.0") + }); + data.add( + new Object[] {RowKind.INSERT, 1000015, "e", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add( + new Object[] {RowKind.INSERT, 1000021, "e", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add( + new Object[] { + RowKind.UPDATE_BEFORE, 1000021, "e", LocalDateTime.parse("2022-06-17T10:10:11.0") + }); + data.add( + new Object[] { + RowKind.UPDATE_AFTER, 1000021, "d", LocalDateTime.parse("2022-06-17T10:10:11.0") + }); + data.add( + new Object[] { + RowKind.UPDATE_BEFORE, 1000015, "e", LocalDateTime.parse("2022-06-17T10:10:11.0") + }); + data.add( + new Object[] { + RowKind.UPDATE_AFTER, 1000021, "f", LocalDateTime.parse("2022-06-17T10:10:11.0") + }); + data.add( + new Object[] {RowKind.INSERT, 1000031, "g", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add( + new Object[] {RowKind.INSERT, 1000032, "h", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add( + new Object[] { + RowKind.UPDATE_BEFORE, 1000031, "g", LocalDateTime.parse("2022-06-17T10:10:11.0") + }); + data.add( + new Object[] { + RowKind.UPDATE_BEFORE, 1000032, "h", LocalDateTime.parse("2022-06-17T10:10:11.0") + }); + data.add( + new 
Object[] { + RowKind.UPDATE_AFTER, 1000031, "f", LocalDateTime.parse("2022-06-17T10:10:11.0") + }); + data.add( + new Object[] { + RowKind.UPDATE_AFTER, 1000032, "e", LocalDateTime.parse("2022-06-17T10:10:11.0") + }); + DataStream source = + getEnv() + .fromCollection( + DataUtil.toRowData(data), + InternalTypeInfo.ofFields( + DataTypes.INT().getLogicalType(), + DataTypes.VARCHAR(100).getLogicalType(), + DataTypes.TIMESTAMP().getLogicalType())); + + Table input = getTableEnv().fromDataStream(source, $("id"), $("name"), $("op_time")); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + Map tableProperties = new HashMap<>(); + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED " + + ") PARTITIONED BY(op_time) WITH %s", + toWithClause(tableProperties)); + + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + "/*+ OPTIONS(" + + "'mixed-format.emit.mode'='file'" + + ")*/" + + " select * from input"); + + List expected = new LinkedList<>(); + // upsert is disEnabled, key=1000021 locate in two diff partitions. + expected.add( + new Object[] {RowKind.INSERT, 1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add( + new Object[] {RowKind.INSERT, 1000021, "e", LocalDateTime.parse("2022-06-17T10:11:11.0")}); + expected.add( + new Object[] {RowKind.INSERT, 1000021, "d", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add( + new Object[] {RowKind.INSERT, 1000021, "f", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add( + new Object[] {RowKind.INSERT, 1000031, "f", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add( + new Object[] {RowKind.INSERT, 1000032, "e", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + Assert.assertEquals( + DataUtil.toRowSet(expected), + new HashSet<>( + sql( + "select * from mixed_catalog." 
+ + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'streaming'='false'" + + ") */"))); + } + + @Test + public void testFileUpsertWithSamePrimaryKey() throws Exception { + + List data = new LinkedList<>(); + data.add( + new Object[] {RowKind.INSERT, 1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add( + new Object[] {RowKind.INSERT, 1000004, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add( + new Object[] {RowKind.INSERT, 1000011, "e", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add( + new Object[] {RowKind.INSERT, 1000011, "f", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + DataStream source = + getEnv() + .fromCollection( + DataUtil.toRowData(data), + InternalTypeInfo.ofFields( + DataTypes.INT().getLogicalType(), + DataTypes.VARCHAR(100).getLogicalType(), + DataTypes.TIMESTAMP().getLogicalType())); + + getEnv().setParallelism(4); + Table input = getTableEnv().fromDataStream(source, $("id"), $("name"), $("op_time")); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + Map tableProperties = new HashMap<>(); + tableProperties.put(TableProperties.UPSERT_ENABLED, "true"); + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED " + + ") PARTITIONED BY(op_time) WITH %s", + toWithClause(tableProperties)); + + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + "/*+ OPTIONS(" + + "'mixed-format.emit.mode'='file'" + + ")*/" + + " select * from input"); + + TableResult result = + exec( + "select * from mixed_catalog." + + db + + "." 
+ + TABLE + + " /*+ OPTIONS(" + + "'streaming'='false'" + + ") */"); + LinkedList actual = new LinkedList<>(); + try (CloseableIterator iterator = result.collect()) { + while (iterator.hasNext()) { + Row row = iterator.next(); + actual.add(row); + } + } + + LinkedList expected = new LinkedList<>(); + + expected.add( + new Object[] {RowKind.INSERT, 1000004, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add( + new Object[] {RowKind.INSERT, 1000011, "f", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + + Map> actualMap = DataUtil.groupByPrimaryKey(actual, 0); + Map> expectedMap = + DataUtil.groupByPrimaryKey(DataUtil.toRowList(expected), 0); + + for (Object key : actualMap.keySet()) { + Assert.assertTrue( + CollectionUtils.isEqualCollection(actualMap.get(key), expectedMap.get(key))); + } + } + + @Test + public void testPartitionLogSinkSource() throws Exception { + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000015, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." 
+ + TABLE + + "(" + + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED " + + ") PARTITIONED BY(op_time) WITH %s", + toWithClause(tableProperties)); + + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'mixed-format.emit.mode'='log'" + + ", 'log.version'='v1'" + + ") */" + + " select * from input"); + + TableResult result = + exec( + "select * from mixed_catalog." + + db + + "." + + TABLE + + "/*+ OPTIONS(" + + "'mixed-format.read.mode'='log'" + + ", 'scan.startup.mode'='earliest'" + + ")*/"); + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (Object[] datum : data) { + Row row = iterator.next(); + actual.add(row); + } + } + Assert.assertEquals(DataUtil.toRowSet(data), actual); + result.getJobClient().ifPresent(TestUtil::cancelJob); + } + + @Test + public void testPartitionLogSinkSourceWithSelectedFields() throws Exception { + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000015, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + sql( + "CREATE TABLE IF NOT 
EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED " + + ") PARTITIONED BY(op_time) WITH %s", + toWithClause(tableProperties)); + + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'mixed-format.emit.mode'='log'" + + ", 'log.version'='v1'" + + ") */" + + " select * from input"); + + TableResult result = + exec( + "select id, op_time from mixed_catalog." + + db + + "." + + TABLE + + "/*+ OPTIONS(" + + "'mixed-format.read.mode'='log'" + + ", 'scan.startup.mode'='earliest'" + + ")*/"); + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (Object[] datum : data) { + Row row = iterator.next(); + actual.add(row); + } + } + + List expected = new LinkedList<>(); + expected.add(new Object[] {1000004, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add(new Object[] {1000015, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add(new Object[] {1000011, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add(new Object[] {1000014, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + expected.add(new Object[] {1000015, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + expected.add(new Object[] {1000007, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + expected.add(new Object[] {1000007, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + + Assert.assertEquals(DataUtil.toRowSet(expected), actual); + result.getJobClient().ifPresent(TestUtil::cancelJob); + } + + @Test + public void testPartitionDoubleSink() throws Exception { + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + 
data.add(new Object[] {1000015, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), + rows); + getTableEnv().createTemporaryView("input", input); + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED " + + ") PARTITIONED BY(op_time) WITH %s", + toWithClause(tableProperties)); + + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'mixed-format.emit.mode'='file, log'" + + ", 'log.version'='v1'" + + ") */" + + "select * from input"); + + Assert.assertEquals( + DataUtil.toRowSet(data), + new HashSet<>( + sql( + "select * from mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'streaming'='false'" + + ") */"))); + TableResult result = + exec( + "select * from mixed_catalog." + + db + + "." 
+ + TABLE + + " /*+ OPTIONS('mixed-format.read.mode'='log', 'scan.startup.mode'='earliest') */"); + + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (Object[] datum : data) { + Row row = iterator.next(); + actual.add(row); + } + } + Assert.assertEquals(DataUtil.toRowSet(data), actual); + + result.getJobClient().ifPresent(TestUtil::cancelJob); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestLookupSecondary.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestLookupSecondary.java new file mode 100644 index 0000000000..659f4e955f --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestLookupSecondary.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.table; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.flink.util.DataUtil; +import org.apache.amoro.flink.write.FlinkTaskWriterBaseTest; +import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; +import org.apache.amoro.table.MixedTable; +import org.apache.amoro.table.TableIdentifier; +import org.apache.commons.lang3.ArrayUtils; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.io.TaskWriter; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +public class TestLookupSecondary extends CatalogITCaseBase implements FlinkTaskWriterBaseTest { + private String db; + + public TestLookupSecondary() { + super( + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, false)); + } + + @Before + public void setup() throws IOException { + List dbs = getMixedFormatCatalog().listDatabases(); + if (dbs.isEmpty()) { + db = "test_db"; + getMixedFormatCatalog().createDatabase(db); + } else { + db = dbs.get(0); + } + exec( + "create catalog mixed_catalog with ('type'='mixed_iceberg', 'ams.uri'='%s')", + getCatalogUri()); + exec( + "create table mixed_catalog.%s.L (id int) " + + "with ('scan.startup.mode'='earliest', 'monitor-interval'='1 s')", + db); + exec( + "create table mixed_catalog.%s.DIM_2 (id int, name string, 
cls bigint, primary key(id, name) not enforced) " + + "with ('write.upsert.enabled'='true', 'lookup.reloading.interval'='1 s')", + db); + exec("create view vi as select *, PROCTIME() as proc from mixed_catalog.%s.L", db); + + writeAndCommit( + TableIdentifier.of(getCatalogName(), db, "L"), + Lists.newArrayList( + DataUtil.toRowData(1), + DataUtil.toRowData(2), + DataUtil.toRowData(3), + DataUtil.toRowData(4))); + writeToChangeAndCommit( + TableIdentifier.of(getCatalogName(), db, "DIM_2"), + Lists.newArrayList( + DataUtil.toRowData(1, "a", 1L), + DataUtil.toRowData(1, "b", 1L), + DataUtil.toRowData(2, "c", 2L), + DataUtil.toRowData(3, "d", 3L)), + true); + } + + @After + public void drop() { + exec("drop table mixed_catalog.%s.L", db); + exec("drop table mixed_catalog.%s.DIM_2", db); + } + + @Test() + public void testLookup() throws Exception { + TableResult tableResult = + exec( + "select L.id, D.cls from vi L LEFT JOIN mixed_catalog.%s.DIM_2 " + + "for system_time as of L.proc AS D ON L.id = D.id", + db); + + tableResult.await(1, TimeUnit.MINUTES); // wait for the first row. + + List expects = new LinkedList<>(); + expects.add(new Object[] {1, 1L}); + expects.add(new Object[] {1, 1L}); + expects.add(new Object[] {2, 2L}); + expects.add(new Object[] {3, 3L}); + expects.add(new Object[] {4, null}); + int expected = expects.size(), count = 0; + List actual = new ArrayList<>(); + try (CloseableIterator rows = tableResult.collect()) { + while (count < expected && rows.hasNext()) { + Row row = rows.next(); + actual.add(row); + count++; + } + } + + Assert.assertEquals(expected, actual.size()); + List rows = + expects.stream() + .map( + r -> + r[0] instanceof RowKind + ? 
Row.ofKind((RowKind) r[0], ArrayUtils.subarray(r, 1, r.length)) + : Row.of(r)) + .collect(Collectors.toList()); + Assert.assertEquals( + rows.stream().sorted(Comparator.comparing(Row::toString)).collect(Collectors.toList()), + actual.stream().sorted(Comparator.comparing(Row::toString)).collect(Collectors.toList())); + } + + @Override + public String getMetastoreUri() { + return getCatalogUri(); + } + + @Override + public String getCatalogName() { + return getMixedFormatCatalog().name(); + } + + @Override + public boolean upsertEnabled() { + return true; + } + + private void writeAndCommit(TableIdentifier table, List expected) throws IOException { + writeAndCommit(table, expected, true, false); + } + + private void writeToChangeAndCommit( + TableIdentifier table, List expected, boolean upsertEnabled) throws IOException { + writeAndCommit(table, expected, false, upsertEnabled); + } + + private void writeAndCommit( + TableIdentifier table, + List expected, + boolean writeToBaseStore, + boolean upsertEnabled) + throws IOException { + MixedTable mixedTable = getMixedFormatCatalog().loadTable(table); + Assert.assertNotNull(mixedTable); + RowType rowType = FlinkSchemaUtil.convert(mixedTable.schema()); + for (RowData rowData : expected) { + try (TaskWriter taskWriter = + writeToBaseStore + ? 
createBaseTaskWriter(mixedTable, rowType) + : createTaskWriter(mixedTable, rowType)) { + if (writeToBaseStore) { + writeAndCommit(rowData, taskWriter, mixedTable); + } else { + writeAndCommit(rowData, taskWriter, mixedTable, upsertEnabled); + } + } + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestTableRefresh.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestTableRefresh.java new file mode 100644 index 0000000000..5bbc2412e2 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestTableRefresh.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.table; + +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.LOG_STORE_CATCH_UP; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.LOG_STORE_CATCH_UP_TIMESTAMP; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.catalog.CatalogTestHelper; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.hive.TestHMS; +import org.apache.amoro.hive.catalog.HiveCatalogTestHelper; +import org.apache.amoro.hive.catalog.HiveTableTestHelper; +import org.apache.amoro.table.MixedTable; +import org.apache.iceberg.UpdateProperties; +import org.junit.Assert; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Map; + +@RunWith(Parameterized.class) +public class TestTableRefresh extends FlinkTestBase { + @ClassRule public static TestHMS TEST_HMS = new TestHMS(); + + public TestTableRefresh(CatalogTestHelper catalogTestHelper, TableTestHelper tableTestHelper) { + super(catalogTestHelper, tableTestHelper); + } + + @Parameterized.Parameters(name = "{0}, {1}") + public static Collection parameters() { + return Arrays.asList( + new Object[][] { + { + new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), + new HiveTableTestHelper(true, true) + }, + { + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, true) + } + }); + } + + @Test + public void testRefresh() { + MixedFormatTableLoader tableLoader = + MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); + + tableLoader.open(); + MixedTable mixedTable = tableLoader.loadMixedFormatTable(); + boolean catchUp = true; + String catchUpTs = "1"; + + 
UpdateProperties updateProperties = mixedTable.updateProperties(); + updateProperties.set(LOG_STORE_CATCH_UP.key(), String.valueOf(catchUp)); + updateProperties.set(LOG_STORE_CATCH_UP_TIMESTAMP.key(), catchUpTs); + updateProperties.commit(); + + mixedTable.refresh(); + Map properties = mixedTable.properties(); + Assert.assertEquals(String.valueOf(catchUp), properties.get(LOG_STORE_CATCH_UP.key())); + Assert.assertEquals(catchUpTs, properties.get(LOG_STORE_CATCH_UP_TIMESTAMP.key())); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestUnkeyed.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestUnkeyed.java new file mode 100644 index 0000000000..ccab05a761 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestUnkeyed.java @@ -0,0 +1,1052 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.table; + +import static org.apache.amoro.MockAmoroManagementServer.TEST_CATALOG_NAME; +import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.KAFKA_CONTAINER; +import static org.apache.amoro.table.TableProperties.ENABLE_LOG_STORE; +import static org.apache.amoro.table.TableProperties.LOG_STORE_ADDRESS; +import static org.apache.amoro.table.TableProperties.LOG_STORE_MESSAGE_TOPIC; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.catalog.CatalogTestHelper; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.flink.kafka.testutils.KafkaContainerTest; +import org.apache.amoro.flink.util.DataUtil; +import org.apache.amoro.flink.util.TestUtil; +import org.apache.amoro.hive.TestHMS; +import org.apache.amoro.hive.catalog.HiveCatalogTestHelper; +import org.apache.amoro.hive.catalog.HiveTableTestHelper; +import org.apache.amoro.mixed.MixedFormatCatalog; +import org.apache.amoro.table.MixedTable; +import org.apache.amoro.table.TableIdentifier; +import org.apache.flink.table.api.ApiExpression; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.types.Types; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.io.IOException; +import 
java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +@RunWith(Parameterized.class) +public class TestUnkeyed extends FlinkTestBase { + + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); + + private static final String TABLE = "test_unkeyed"; + private static final String DB = TableTestHelper.TEST_TABLE_ID.getDatabase(); + + private String catalog; + private MixedFormatCatalog mixedFormatCatalog; + private String db; + private String topic; + + @ClassRule public static TestHMS TEST_HMS = new TestHMS(); + public boolean isHive; + + public TestUnkeyed( + CatalogTestHelper catalogTestHelper, TableTestHelper tableTestHelper, boolean isHive) { + super(catalogTestHelper, tableTestHelper); + this.isHive = isHive; + } + + @Parameterized.Parameters(name = "{0}, {1}, {2}") + public static Collection parameters() { + return Arrays.asList( + new Object[][] { + { + new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), + new HiveTableTestHelper(true, true), + true + }, + { + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, true), + false + }, + { + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, true), + false + } + }); + } + + @BeforeClass + public static void beforeClass() throws Exception { + KAFKA_CONTAINER.start(); + } + + @AfterClass + public static void afterClass() throws Exception { + KAFKA_CONTAINER.close(); + } + + @Before + public void before() throws Exception { + if (isHive) { + catalog = HiveTableTestHelper.TEST_CATALOG_NAME; + db = HiveTableTestHelper.TEST_DB_NAME; + } else { + catalog = TEST_CATALOG_NAME; + db = DB; + } + super.before(); + mixedFormatCatalog = getMixedFormatCatalog(); + topic = String.join(".", catalog, db, TABLE); + 
super.config(); + } + + @After + public void after() { + sql("DROP TABLE IF EXISTS mixed_catalog." + db + "." + TABLE); + } + + @Test + public void testUnPartitionDDL() throws IOException { + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, age SMALLINT, sex TINYINT, score BIGINT, height FLOAT, speed DOUBLE, ts TIMESTAMP)"); + + MixedTable table = + mixedFormatCatalog.loadTable(TableIdentifier.of(catalog, db, TestUnkeyed.TABLE)); + + Schema required = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "name", Types.StringType.get()), + Types.NestedField.optional(3, "age", Types.IntegerType.get()), + Types.NestedField.optional(4, "sex", Types.IntegerType.get()), + Types.NestedField.optional(5, "score", Types.LongType.get()), + Types.NestedField.optional(6, "height", Types.FloatType.get()), + Types.NestedField.optional(7, "speed", Types.DoubleType.get()), + Types.NestedField.optional(8, "ts", Types.TimestampType.withoutZone())); + Assert.assertEquals(required.asStruct(), table.schema().asStruct()); + } + + @Test + public void testPartitionDDL() throws IOException { + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." 
+ + TABLE + + "(" + + " id INT, name STRING, age SMALLINT, sex TINYINT, score BIGINT, height FLOAT, speed DOUBLE, ts TIMESTAMP)" + + " PARTITIONED BY (ts)"); + + Schema required = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "name", Types.StringType.get()), + Types.NestedField.optional(3, "age", Types.IntegerType.get()), + Types.NestedField.optional(4, "sex", Types.IntegerType.get()), + Types.NestedField.optional(5, "score", Types.LongType.get()), + Types.NestedField.optional(6, "height", Types.FloatType.get()), + Types.NestedField.optional(7, "speed", Types.DoubleType.get()), + Types.NestedField.optional(8, "ts", Types.TimestampType.withoutZone())); + MixedTable table = mixedFormatCatalog.loadTable(TableIdentifier.of(catalog, db, TABLE)); + Assert.assertEquals(required.asStruct(), table.schema().asStruct()); + + PartitionSpec requiredSpec = PartitionSpec.builderFor(required).identity("ts").build(); + Assert.assertEquals(requiredSpec, table.spec()); + } + + @Test + public void testUnkeyedWatermarkSet() throws Exception { + List data = new LinkedList<>(); + + data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:08:11.0")}); + data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-17T10:11:11.0")}); + data.add(new Object[] {1000021, "d", LocalDateTime.parse("2022-06-17T16:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("ts", DataTypes.TIMESTAMP())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG 
mixed_catalog WITH %s", toWithClause(props)); + + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, ts TIMESTAMP)"); + + sql( + "create table user_tb (" + + " rtime as cast(ts as timestamp(3))," + + " WATERMARK FOR rtime as rtime" + + " ) LIKE mixed_catalog." + + db + + "." + + TABLE); + + sql("insert into mixed_catalog." + db + "." + TABLE + " select * from input"); + + TableResult result = + exec( + "select id, name, ts from user_tb" + + "/*+ OPTIONS(" + + "'mixed-format.read.mode'='file'" + + ", 'scan.startup.mode'='earliest'" + + ")*/"); + + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (Object[] datum : data) { + actual.add(iterator.next()); + } + } + Assert.assertEquals(DataUtil.toRowSet(data), actual); + + result.getJobClient().ifPresent(TestUtil::cancelJob); + } + + @Test + public void testSinkBatchRead() throws IOException { + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:08:11.0")}); + data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-17T10:11:11.0")}); + data.add(new Object[] {1000021, "d", LocalDateTime.parse("2022-06-17T16:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." 
+ + TABLE + + "(" + + " id INT, name STRING, op_time TIMESTAMP)"); + + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + "/*+ OPTIONS('mixed-format.emit.mode'='file')*/ select * from input"); + + MixedTable table = mixedFormatCatalog.loadTable(TableIdentifier.of(catalog, db, TABLE)); + Iterable snapshots = table.asUnkeyedTable().snapshots(); + Snapshot s = snapshots.iterator().next(); + + Assert.assertEquals( + DataUtil.toRowSet(data), + new HashSet<>( + sql( + "select * from mixed_catalog." + + db + + "." + + TABLE + + "/*+ OPTIONS(" + + "'mixed-format.read.mode'='file'" + + ", 'streaming'='false'" + + ", 'snapshot-id'='" + + s.snapshotId() + + "'" + + ")*/"))); + } + + @Test + public void testSinkStreamRead() throws Exception { + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a"}); + data.add(new Object[] {1000015, "b"}); + data.add(new Object[] {1000011, "c"}); + data.add(new Object[] {1000014, "d"}); + data.add(new Object[] {1000021, "d"}); + data.add(new Object[] {1000007, "e"}); + + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + sql("CREATE TABLE IF NOT EXISTS mixed_catalog." + db + "." + TABLE + "(id INT, name STRING)"); + + sql("insert into mixed_catalog." + db + "." + TABLE + " select * from input"); + + // verify in earliest scan-startup-mode file read + TableResult resultWithEarliestPosition = + exec( + "select * from mixed_catalog." + + db + + "." 
+ + TABLE + + "/*+ OPTIONS(" + + "'streaming'='true'" + + ", 'mixed-format.read.mode'='file'" + + ", 'scan.startup.mode'='earliest'" + + ", 'monitor-interval'='1s'" + + ")*/"); + + Set actual = new HashSet<>(); + try (CloseableIterator iterator = resultWithEarliestPosition.collect()) { + for (int i = 0; i < data.size(); i++) { + actual.add(iterator.next()); + } + } + resultWithEarliestPosition.getJobClient().ifPresent(TestUtil::cancelJob); + Assert.assertEquals(DataUtil.toRowSet(data), actual); + + // verify in latest scan-startup-mode file read + TableResult resultWithLatestPosition = + exec( + "select * from mixed_catalog." + + db + + "." + + TABLE + + "/*+ OPTIONS(" + + "'streaming'='true'" + + ", 'mixed-format.read.mode'='file'" + + ", 'scan.startup.mode'='latest'" + + ", 'monitor-interval'='1s'" + + ")*/"); + + List appendData = new LinkedList<>(); + appendData.add(new Object[] {2000004, "a"}); + appendData.add(new Object[] {2000015, "b"}); + appendData.add(new Object[] {2000011, "c"}); + appendData.add(new Object[] {2000014, "d"}); + appendData.add(new Object[] {2000021, "d"}); + appendData.add(new Object[] {2000007, "e"}); + + List appendRows = DataUtil.toRows(appendData); + + Table appendInput = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING())), + appendRows); + getTableEnv().createTemporaryView("appendInput", appendInput); + + actual.clear(); + try (CloseableIterator iterator = resultWithLatestPosition.collect()) { + sql("insert into mixed_catalog." + db + "." 
+ TABLE + " select * from appendInput"); + for (int i = 0; i < appendData.size(); i++) { + Assert.assertTrue("Should have more records", iterator.hasNext()); + actual.add(iterator.next()); + } + } + resultWithLatestPosition.getJobClient().ifPresent(TestUtil::cancelJob); + Assert.assertEquals(DataUtil.toRowSet(appendData), actual); + } + + @Test + public void testLogSinkSource() throws Exception { + String topic = this.topic + "testLogSinkSource"; + KafkaContainerTest.createTopics(KAFKA_PARTITION_NUMS, 1, topic); + + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a"}); + data.add(new Object[] {1000015, "b"}); + data.add(new Object[] {1000011, "c"}); + data.add(new Object[] {1000014, "d"}); + data.add(new Object[] {1000021, "d"}); + data.add(new Object[] {1000007, "e"}); + + List rows = DataUtil.toRows(data); + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + Map tableProperties = new HashMap<>(); + tableProperties.put(ENABLE_LOG_STORE, "true"); + tableProperties.put( + LOG_STORE_ADDRESS, KafkaContainerTest.KAFKA_CONTAINER.getBootstrapServers()); + tableProperties.put(LOG_STORE_MESSAGE_TOPIC, topic); + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING) WITH %s", + toWithClause(tableProperties)); + + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'mixed-format.emit.mode'='log'" + + ", 'log.version'='v1'" + + ") */" + + " select * from input"); + + TableResult result = + exec( + "select * from mixed_catalog." + + db + + "." 
+ + TABLE + + "/*+ OPTIONS(" + + "'mixed-format.read.mode'='log'" + + ", 'scan.startup.mode'='earliest'" + + ")*/"); + + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (Object[] datum : data) { + actual.add(iterator.next()); + } + } + Assert.assertEquals(DataUtil.toRowSet(data), actual); + + result.getJobClient().ifPresent(TestUtil::cancelJob); + KafkaContainerTest.deleteTopics(topic); + } + + @Test + public void testUnpartitionLogSinkSourceWithSelectedFields() throws Exception { + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000015, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + + List rows = DataUtil.toRows(data); + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + Map tableProperties = new HashMap<>(); + tableProperties.put(ENABLE_LOG_STORE, "true"); + tableProperties.put( + LOG_STORE_ADDRESS, KafkaContainerTest.KAFKA_CONTAINER.getBootstrapServers()); + tableProperties.put(LOG_STORE_MESSAGE_TOPIC, topic); + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." 
+ + TABLE + + "(" + + " id INT, name STRING, op_time TIMESTAMP) WITH %s", + toWithClause(tableProperties)); + + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'mixed-format.emit.mode'='log'" + + ", 'log.version'='v1'" + + ") */" + + " select * from input"); + + TableResult result = + exec( + "select id, op_time from mixed_catalog." + + db + + "." + + TABLE + + "/*+ OPTIONS(" + + "'mixed-format.read.mode'='log'" + + ", 'scan.startup.mode'='earliest'" + + ")*/"); + + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (Object[] datum : data) { + actual.add(iterator.next()); + } + } + + List expected = new LinkedList<>(); + expected.add(new Object[] {1000004, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add(new Object[] {1000015, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add(new Object[] {1000011, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add(new Object[] {1000014, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + expected.add(new Object[] {1000015, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + expected.add(new Object[] {1000007, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + expected.add(new Object[] {1000007, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + + Assert.assertEquals(DataUtil.toRowSet(expected), actual); + + result.getJobClient().ifPresent(TestUtil::cancelJob); + } + + @Test + public void testUnPartitionDoubleSink() throws Exception { + String topic = this.topic + "testUnPartitionDoubleSink"; + KafkaContainerTest.createTopics(KAFKA_PARTITION_NUMS, 1, topic); + + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a"}); + data.add(new Object[] {1000015, "b"}); + data.add(new Object[] {1000011, "c"}); + data.add(new Object[] {1000014, "d"}); + data.add(new Object[] {1000021, "d"}); + data.add(new Object[] {1000007, "e"}); + + List rows = DataUtil.toRows(data); + Table input = + getTableEnv() + .fromValues( + 
DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING())), + rows); + getTableEnv().createTemporaryView("input", input); + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + Map tableProperties = new HashMap<>(); + tableProperties.put(ENABLE_LOG_STORE, "true"); + tableProperties.put( + LOG_STORE_ADDRESS, KafkaContainerTest.KAFKA_CONTAINER.getBootstrapServers()); + tableProperties.put(LOG_STORE_MESSAGE_TOPIC, topic); + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING) WITH %s", + toWithClause(tableProperties)); + + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'mixed-format.emit.mode'='file, log'" + + ", 'log.version'='v1'" + + ") */" + + "select id, name from input"); + + Assert.assertEquals( + DataUtil.toRowSet(data), + sqlSet( + "select * from mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS('mixed-format.read.mode'='file', 'streaming'='false') */")); + + TableResult result = + exec( + "select * from mixed_catalog." + + db + + "." 
+ + TABLE + + " /*+ OPTIONS('mixed-format.read.mode'='log', 'scan.startup.mode'='earliest') */"); + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (Object[] datum : data) { + actual.add(iterator.next()); + } + } + Assert.assertEquals(DataUtil.toRowSet(data), actual); + result.getJobClient().ifPresent(TestUtil::cancelJob); + KafkaContainerTest.deleteTopics(topic); + } + + @Test + public void testPartitionSinkBatchRead() throws IOException { + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a", "2022-05-17"}); + data.add(new Object[] {1000015, "b", "2022-05-17"}); + data.add(new Object[] {1000011, "c", "2022-05-17"}); + data.add(new Object[] {1000014, "d", "2022-05-18"}); + data.add(new Object[] {1000021, "d", "2022-05-18"}); + data.add(new Object[] {1000007, "e", "2022-05-18"}); + + List expected = new LinkedList<>(); + expected.add(new Object[] {1000014, "d", "2022-05-18"}); + expected.add(new Object[] {1000021, "d", "2022-05-18"}); + expected.add(new Object[] {1000007, "e", "2022-05-18"}); + + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("dt", DataTypes.STRING())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, dt STRING)" + + " PARTITIONED BY (dt)"); + + sql( + "insert into mixed_catalog." + + db + + "." 
+ + TABLE + + " PARTITION (dt='2022-05-18') select id, name from input" + + " where dt='2022-05-18' "); + + TableIdentifier identifier = TableIdentifier.of(catalog, db, TABLE); + MixedTable table = mixedFormatCatalog.loadTable(identifier); + Iterable snapshots = table.asUnkeyedTable().snapshots(); + Snapshot s = snapshots.iterator().next(); + + Assert.assertEquals( + DataUtil.toRowSet(expected), + sqlSet( + "select * from mixed_catalog." + + db + + "." + + TestUnkeyed.TABLE + + "/*+ OPTIONS(" + + "'mixed-format.read.mode'='file'" + + ", 'snapshot-id'='" + + s.snapshotId() + + "'" + + ", 'streaming'='false'" + + ")*/")); + Assert.assertEquals( + DataUtil.toRowSet(expected), + sqlSet( + "select * from mixed_catalog." + + db + + "." + + TestUnkeyed.TABLE + + "/*+ OPTIONS(" + + "'mixed-format.read.mode'='file'" + + ", 'as-of-timestamp'='" + + s.timestampMillis() + + "'" + + ", 'streaming'='false'" + + ")*/")); + } + + @Test + public void testPartitionSinkStreamRead() throws Exception { + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a", "2022-05-17"}); + data.add(new Object[] {1000015, "b", "2022-05-17"}); + data.add(new Object[] {1000011, "c", "2022-05-17"}); + data.add(new Object[] {1000014, "d", "2022-05-18"}); + data.add(new Object[] {1000021, "d", "2022-05-18"}); + data.add(new Object[] {1000007, "e", "2022-05-18"}); + + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("dt", DataTypes.STRING())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, dt STRING)" + + " PARTITIONED BY (dt)"); + + sql( + "insert into mixed_catalog." + + db + + "." 
+ + TABLE + + " PARTITION (dt='2022-05-18') select id, name from input" + + " where dt='2022-05-18' "); + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + " PARTITION (dt='2022-05-18') select id, name from input" + + " where dt='2022-05-18' "); + + TableIdentifier identifier = TableIdentifier.of(catalog, db, TABLE); + MixedTable table = mixedFormatCatalog.loadTable(identifier); + Iterable snapshots = table.asUnkeyedTable().snapshots(); + Snapshot s = snapshots.iterator().next(); + + TableResult result = + exec( + "select * from mixed_catalog." + + db + + "." + + TestUnkeyed.TABLE + + "/*+ OPTIONS(" + + "'mixed-format.read.mode'='file'" + + ", 'start-snapshot-id'='" + + s.snapshotId() + + "'" + + ")*/"); + + List expected = + new ArrayList() { + { + add(Row.of(1000014, "d", "2022-05-18")); + add(Row.of(1000021, "d", "2022-05-18")); + add(Row.of(1000007, "e", "2022-05-18")); + } + }; + + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (int i = 0; i < expected.size(); i++) { + actual.add(iterator.next()); + } + } + result.getJobClient().ifPresent(TestUtil::cancelJob); + Assert.assertEquals(new HashSet<>(expected), actual); + } + + @Test + public void testPartitionLogSinkSource() throws Exception { + String topic = this.topic + "testUnKeyedPartitionLogSinkSource"; + KafkaContainerTest.createTopics(KAFKA_PARTITION_NUMS, 1, topic); + + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a", "2022-05-17"}); + data.add(new Object[] {1000015, "b", "2022-05-17"}); + data.add(new Object[] {1000011, "c", "2022-05-17"}); + data.add(new Object[] {1000014, "d", "2022-05-18"}); + data.add(new Object[] {1000021, "d", "2022-05-18"}); + data.add(new Object[] {1000007, "e", "2022-05-18"}); + + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("dt", 
DataTypes.STRING())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + Map tableProperties = new HashMap<>(); + tableProperties.put(ENABLE_LOG_STORE, "true"); + tableProperties.put(LOG_STORE_ADDRESS, KAFKA_CONTAINER.getBootstrapServers()); + tableProperties.put(LOG_STORE_MESSAGE_TOPIC, topic); + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, dt STRING) PARTITIONED BY (dt) WITH %s", + toWithClause(tableProperties)); + + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'mixed-format.emit.mode'='log'" + + ", 'log.version'='v1'" + + ") */" + + " select * from input"); + + TableResult result = + exec( + "select * from mixed_catalog." + + db + + "." + + TABLE + + "/*+ OPTIONS(" + + "'mixed-format.read.mode'='log'" + + ", 'scan.startup.mode'='earliest'" + + ")*/"); + + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (Object[] datum : data) { + actual.add(iterator.next()); + } + } + Assert.assertEquals(DataUtil.toRowSet(data), actual); + + result.getJobClient().ifPresent(TestUtil::cancelJob); + KafkaContainerTest.deleteTopics(topic); + } + + @Test + public void testPartitionLogSinkSourceWithSelectedFields() throws Exception { + String topic = this.topic + "testPartitionLogSinkSourceWithSelectedFields"; + KafkaContainerTest.createTopics(KAFKA_PARTITION_NUMS, 1, topic); + + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000015, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new 
Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); + + List rows = DataUtil.toRows(data); + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + Map tableProperties = new HashMap<>(); + tableProperties.put(ENABLE_LOG_STORE, "true"); + tableProperties.put(LOG_STORE_ADDRESS, KAFKA_CONTAINER.getBootstrapServers()); + tableProperties.put(LOG_STORE_MESSAGE_TOPIC, topic); + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, op_time TIMESTAMP) PARTITIONED BY (op_time) WITH %s", + toWithClause(tableProperties)); + + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'mixed-format.emit.mode'='log'" + + ", 'log.version'='v1'" + + ") */" + + " select * from input"); + + TableResult result = + exec( + "select id, op_time from mixed_catalog." + + db + + "." 
+ + TABLE + + "/*+ OPTIONS(" + + "'mixed-format.read.mode'='log'" + + ", 'scan.startup.mode'='earliest'" + + ")*/"); + + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (Object[] datum : data) { + actual.add(iterator.next()); + } + } + + List expected = new LinkedList<>(); + expected.add(new Object[] {1000004, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add(new Object[] {1000015, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add(new Object[] {1000011, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + expected.add(new Object[] {1000014, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + expected.add(new Object[] {1000015, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + expected.add(new Object[] {1000007, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + expected.add(new Object[] {1000007, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + + Assert.assertEquals(DataUtil.toRowSet(expected), actual); + + result.getJobClient().ifPresent(TestUtil::cancelJob); + KafkaContainerTest.deleteTopics(topic); + } + + @Test + public void testPartitionDoubleSink() throws Exception { + String topic = this.topic + "testUnkeyedPartitionDoubleSink"; + KafkaContainerTest.createTopics(KAFKA_PARTITION_NUMS, 1, topic); + + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a", "2022-05-17"}); + data.add(new Object[] {1000015, "b", "2022-05-17"}); + data.add(new Object[] {1000011, "c", "2022-05-17"}); + data.add(new Object[] {1000014, "d", "2022-05-18"}); + data.add(new Object[] {1000021, "d", "2022-05-18"}); + data.add(new Object[] {1000007, "e", "2022-05-18"}); + + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("dt", DataTypes.STRING())), + rows); + getTableEnv().createTemporaryView("input", input); + sql("CREATE CATALOG mixed_catalog WITH %s", 
toWithClause(props)); + + Map tableProperties = new HashMap<>(); + tableProperties.put(ENABLE_LOG_STORE, "true"); + tableProperties.put(LOG_STORE_ADDRESS, KAFKA_CONTAINER.getBootstrapServers()); + tableProperties.put(LOG_STORE_MESSAGE_TOPIC, topic); + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, dt STRING) PARTITIONED BY (dt) WITH %s", + toWithClause(tableProperties)); + sql( + "insert into mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'mixed-format.emit.mode'='file, log'" + + ", 'log.version'='v1'" + + ") */" + + "select * from input"); + + Assert.assertEquals( + DataUtil.toRowSet(data), + sqlSet( + "select * from mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS('mixed-format.read.mode'='file', 'streaming'='false') */")); + TableResult result = + exec( + "select * from mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS('mixed-format.read.mode'='log', 'scan.startup.mode'='earliest') */"); + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + for (Object[] datum : data) { + actual.add(iterator.next()); + } + } + Assert.assertEquals(DataUtil.toRowSet(data), actual); + result.getJobClient().ifPresent(TestUtil::cancelJob); + KafkaContainerTest.deleteTopics(topic); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestUnkeyedOverwrite.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestUnkeyedOverwrite.java new file mode 100644 index 0000000000..fcb092e3d6 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestUnkeyedOverwrite.java @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.table; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.catalog.CatalogTestHelper; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.flink.util.DataUtil; +import org.apache.amoro.hive.TestHMS; +import org.apache.amoro.hive.catalog.HiveCatalogTestHelper; +import org.apache.amoro.hive.catalog.HiveTableTestHelper; +import org.apache.flink.table.api.ApiExpression; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.Table; +import org.apache.flink.test.util.MiniClusterWithClientResource; +import org.apache.iceberg.flink.MiniClusterResource; +import org.junit.After; +import org.junit.Assert; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; + +@RunWith(Parameterized.class) +public class TestUnkeyedOverwrite extends FlinkTestBase { + + private static final 
Logger LOGGER = LoggerFactory.getLogger(TestUnkeyedOverwrite.class); + + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); + + @ClassRule + public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = + MiniClusterResource.createWithClassloaderCheckDisabled(); + + private static final String TABLE = "test_unkeyed"; + private static final String DB = TableTestHelper.TEST_TABLE_ID.getDatabase(); + + private String db; + public boolean isHive; + @ClassRule public static TestHMS TEST_HMS = new TestHMS(); + + public TestUnkeyedOverwrite( + CatalogTestHelper catalogTestHelper, TableTestHelper tableTestHelper, boolean isHive) { + super(catalogTestHelper, tableTestHelper); + this.isHive = isHive; + } + + @Parameterized.Parameters(name = "{0}, {1}, {2}") + public static Object[] parameters() { + return new Object[][] { + { + new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), + new HiveTableTestHelper(true, true), + true + }, + { + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, true), + false + } + }; + } + + public void before() throws Exception { + if (isHive) { + db = HiveTableTestHelper.TEST_DB_NAME; + } else { + db = DB; + } + super.before(); + super.config(); + } + + @After + public void after() { + sql("DROP TABLE IF EXISTS mixed_catalog." + db + "." 
+ TABLE); + } + + @Test + public void testInsertOverwrite() throws IOException { + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a"}); + data.add(new Object[] {1000015, "b"}); + data.add(new Object[] {1000011, "c"}); + data.add(new Object[] {1000014, "d"}); + data.add(new Object[] {1000021, "d"}); + data.add(new Object[] {1000007, "e"}); + + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING())), + rows); + getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING)"); + + sql("insert overwrite mixed_catalog." + db + "." + TABLE + " select * from input"); + + Assert.assertEquals( + DataUtil.toRowSet(data), + sqlSet( + "select * from mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'streaming'='false'" + + ") */")); + } + + @Test + public void testPartitionInsertOverwrite() throws IOException { + List data = new LinkedList<>(); + data.add(new Object[] {1000004, "a", "2022-05-17"}); + data.add(new Object[] {1000015, "b", "2022-05-17"}); + data.add(new Object[] {1000011, "c", "2022-05-17"}); + data.add(new Object[] {1000014, "d", "2022-05-18"}); + data.add(new Object[] {1000021, "d", "2022-05-18"}); + data.add(new Object[] {1000007, "e", "2022-05-18"}); + + List expected = new LinkedList<>(); + expected.add(new Object[] {11, "d", "2022-05-19"}); + expected.add(new Object[] {21, "d", "2022-05-19"}); + expected.add(new Object[] {35, "e", "2022-05-19"}); + + data.addAll(expected); + List rows = DataUtil.toRows(data); + + Table input = + getTableEnv() + .fromValues( + DataTypes.ROW( + DataTypes.FIELD("id", DataTypes.INT()), + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("dt", DataTypes.STRING())), + rows); + 
getTableEnv().createTemporaryView("input", input); + + sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); + + sql( + "CREATE TABLE IF NOT EXISTS mixed_catalog." + + db + + "." + + TABLE + + "(" + + " id INT, name STRING, dt STRING) PARTITIONED BY (dt)"); + + sql("insert into mixed_catalog." + db + "." + TABLE + " select * from input"); + sql( + "insert overwrite mixed_catalog." + + db + + "." + + TABLE + + " PARTITION (dt='2022-05-18') select id, name from input where dt = '2022-05-19'"); + + Assert.assertEquals( + DataUtil.toRowSet(expected), + sqlSet( + "select id, name, '2022-05-19' from mixed_catalog." + + db + + "." + + TABLE + + " /*+ OPTIONS(" + + "'streaming'='false'" + + ") */" + + " where dt='2022-05-18'")); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestWatermark.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestWatermark.java new file mode 100644 index 0000000000..ae0dbe8c77 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestWatermark.java @@ -0,0 +1,259 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.table; + +import static org.apache.amoro.MockAmoroManagementServer.TEST_CATALOG_NAME; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.flink.util.DataUtil; +import org.apache.amoro.flink.util.MixedFormatUtils; +import org.apache.amoro.flink.util.TestUtil; +import org.apache.amoro.table.KeyedTable; +import org.apache.amoro.table.TableIdentifier; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.runtime.testutils.CommonTestUtils; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.ChainingStrategy; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.TableResult; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.CloseableIterator; +import org.apache.iceberg.io.TaskWriter; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import 
org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; + +public class TestWatermark extends FlinkTestBase { + public static final Logger LOG = LoggerFactory.getLogger(TestWatermark.class); + + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); + + private static final String DB = TableTestHelper.TEST_TABLE_ID.getDatabase(); + private static final String TABLE = "test_keyed"; + + public TestWatermark() { + super( + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, true)); + } + + @Before + public void before() throws Exception { + super.before(); + super.config(); + } + + @After + public void after() { + sql("DROP TABLE IF EXISTS mixed_catalog." + DB + "." 
+ TABLE); + } + + @Test + public void testWatermark() throws Exception { + sql(String.format("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props))); + Map tableProperties = new HashMap<>(); + String table = String.format("mixed_catalog.%s.%s", DB, TABLE); + + sql( + "CREATE TABLE IF NOT EXISTS %s (" + + " id bigint, user_id int, name STRING, category string, op_time timestamp, is_true boolean" + + ", PRIMARY KEY (id, user_id) NOT ENFORCED) PARTITIONED BY(category, name) WITH %s", + table, toWithClause(tableProperties)); + + TableSchema flinkSchema = + TableSchema.builder() + .field("id", DataTypes.BIGINT()) + .field("user_id", DataTypes.INT()) + .field("name", DataTypes.STRING()) + .field("category", DataTypes.STRING()) + .field("op_time", DataTypes.TIMESTAMP(3)) + .field("is_true", DataTypes.BOOLEAN()) + .build(); + RowType rowType = (RowType) flinkSchema.toRowDataType().getLogicalType(); + KeyedTable keyedTable = + (KeyedTable) + MixedFormatUtils.loadMixedTable( + MixedFormatTableLoader.of( + TableIdentifier.of(TEST_CATALOG_NAME, DB, TABLE), catalogBuilder)); + TaskWriter taskWriter = createKeyedTaskWriter(keyedTable, rowType, true); + List baseData = + new ArrayList() { + { + add( + GenericRowData.ofKind( + RowKind.INSERT, + 2L, + 123, + StringData.fromString("a"), + StringData.fromString("a"), + TimestampData.fromLocalDateTime(LocalDateTime.now().minusMinutes(1)), + true)); + } + }; + for (RowData record : baseData) { + taskWriter.write(record); + } + commit(keyedTable, taskWriter.complete(), true); + + sql( + "create table d (tt as cast(op_time as timestamp(3)), watermark for tt as tt) like %s", + table); + + Table source = getTableEnv().sqlQuery("select is_true from d"); + + WatermarkTestOperator op = new WatermarkTestOperator(); + getTableEnv() + .toRetractStream(source, RowData.class) + .transform("test watermark", TypeInformation.of(RowData.class), op); + getEnv().executeAsync("test watermark"); + + op.waitWatermark(); + + 
Assert.assertTrue(op.watermark > Long.MIN_VALUE); + } + + @Test + public void testSelectWatermarkField() throws Exception { + sql(String.format("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props))); + Map tableProperties = new HashMap<>(); + String table = String.format("mixed_catalog.%s.%s", DB, TABLE); + + sql( + "CREATE TABLE IF NOT EXISTS %s (" + + " id bigint, user_id int, name STRING, category string, op_time timestamp, is_true boolean" + + ", PRIMARY KEY (id, user_id) NOT ENFORCED) PARTITIONED BY(category, name) WITH %s", + table, toWithClause(tableProperties)); + + TableSchema flinkSchema = + TableSchema.builder() + .field("id", DataTypes.BIGINT()) + .field("user_id", DataTypes.INT()) + .field("name", DataTypes.STRING()) + .field("category", DataTypes.STRING()) + .field("op_time", DataTypes.TIMESTAMP(3)) + .field("is_true", DataTypes.BOOLEAN()) + .build(); + RowType rowType = (RowType) flinkSchema.toRowDataType().getLogicalType(); + KeyedTable keyedTable = + (KeyedTable) + MixedFormatUtils.loadMixedTable( + MixedFormatTableLoader.of( + TableIdentifier.of(TEST_CATALOG_NAME, DB, TABLE), catalogBuilder)); + TaskWriter taskWriter = createKeyedTaskWriter(keyedTable, rowType, true); + List baseData = + new ArrayList() { + { + add( + GenericRowData.ofKind( + RowKind.INSERT, + 2L, + 123, + StringData.fromString("a"), + StringData.fromString("a"), + TimestampData.fromLocalDateTime(LocalDateTime.parse("2022-06-17T10:08:11.0")), + true)); + } + }; + for (RowData record : baseData) { + taskWriter.write(record); + } + commit(keyedTable, taskWriter.complete(), true); + + sql( + "create table d (tt as cast(op_time as timestamp(3)), watermark for tt as tt) like %s", + table); + + TableResult result = exec("select is_true, tt from d"); + + CommonTestUtils.waitUntilJobManagerIsInitialized( + () -> result.getJobClient().get().getJobStatus().get()); + Set actual = new HashSet<>(); + try (CloseableIterator iterator = result.collect()) { + Row row = iterator.next(); + 
actual.add(row); + } + result.getJobClient().ifPresent(TestUtil::cancelJob); + + List expected = new LinkedList<>(); + expected.add(new Object[] {true, LocalDateTime.parse("2022-06-17T10:08:11")}); + Assert.assertEquals(DataUtil.toRowSet(expected), actual); + } + + public static class WatermarkTestOperator extends AbstractStreamOperator + implements OneInputStreamOperator, RowData> { + + private static final long serialVersionUID = 1L; + public long watermark; + private static final CompletableFuture waitWatermark = new CompletableFuture<>(); + + public WatermarkTestOperator() { + super(); + chainingStrategy = ChainingStrategy.ALWAYS; + } + + private void waitWatermark() throws InterruptedException, ExecutionException { + waitWatermark.get(); + } + + @Override + public void processElement(StreamRecord> element) throws Exception { + output.collect(element.asRecord()); + } + + @Override + public void processWatermark(Watermark mark) throws Exception { + LOG.info("processWatermark: {}", mark); + watermark = mark.getTimestamp(); + waitWatermark.complete(null); + super.processWatermark(mark); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/ClassLoaderUtils.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/ClassLoaderUtils.java new file mode 100644 index 0000000000..7b90f8a179 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/ClassLoaderUtils.java @@ -0,0 +1,293 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.util; + +import javax.annotation.Nullable; +import javax.tools.JavaCompiler; +import javax.tools.ToolProvider; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.Serializable; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLClassLoader; +import java.nio.file.FileVisitResult; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.SimpleFileVisitor; +import java.nio.file.attribute.BasicFileAttributes; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; + +/** Utilities to create class loaders. 
*/ +public class ClassLoaderUtils { + public static URLClassLoader compileAndLoadJava(File root, String filename, String source) + throws IOException { + return withRoot(root).addClass(filename.replaceAll("\\.java", ""), source).build(); + } + + private static URLClassLoader createClassLoader(File root) throws MalformedURLException { + return new URLClassLoader( + new URL[] {root.toURI().toURL()}, Thread.currentThread().getContextClassLoader()); + } + + private static void writeAndCompile(File root, String filename, String source) + throws IOException { + File file = writeSourceFile(root, filename, source); + + compileClass(file); + } + + private static File writeSourceFile(File root, String filename, String source) + throws IOException { + File file = new File(root, filename); + file.getParentFile().mkdirs(); + FileWriter fileWriter = new FileWriter(file); + + fileWriter.write(source); + fileWriter.close(); + + return file; + } + + public static ClassLoaderBuilder withRoot(File root) { + return new ClassLoaderBuilder(root); + } + + private static int compileClass(File sourceFile) { + JavaCompiler compiler = ToolProvider.getSystemJavaCompiler(); + return compiler.run(null, null, null, "-proc:none", sourceFile.getPath()); + } + + public static URL[] getClasspathURLs() { + final String[] cp = System.getProperty("java.class.path").split(File.pathSeparator); + + return Arrays.stream(cp) + .filter(str -> !str.isEmpty()) + .map(ClassLoaderUtils::parse) + .toArray(URL[]::new); + } + + private static URL parse(String fileName) { + try { + return new File(fileName).toURI().toURL(); + } catch (MalformedURLException e) { + throw new RuntimeException(e); + } + } + + public static class ClassLoaderBuilder { + + private final File root; + private final Map classes; + private final Map resources; + + private ClassLoaderBuilder(File root) { + this.root = root; + this.classes = new HashMap<>(); + this.resources = new HashMap<>(); + } + + public ClassLoaderBuilder addResource(String 
targetPath, String resource) { + String oldValue = resources.putIfAbsent(targetPath, resource); + + if (oldValue != null) { + throw new RuntimeException( + String.format("Resource with path %s already registered.", resource)); + } + + return this; + } + + public ClassLoaderBuilder addClass(String className, String source) { + String oldValue = classes.putIfAbsent(className, source); + + if (oldValue != null) { + throw new RuntimeException( + String.format("Class with name %s already registered.", className)); + } + + return this; + } + + public URLClassLoader build() throws IOException { + for (Map.Entry classInfo : classes.entrySet()) { + writeAndCompile(root, createFileName(classInfo.getKey()), classInfo.getValue()); + } + + for (Map.Entry resource : resources.entrySet()) { + writeSourceFile(root, resource.getKey(), resource.getValue()); + } + + return createClassLoader(root); + } + + private String createFileName(String className) { + return className + ".java"; + } + } + // ------------------------------------------------------------------------ + // Testing of objects not in the application class loader + // ------------------------------------------------------------------------ + + /** + * A new object and the corresponding ClassLoader for that object, as returned by {@link + * #createSerializableObjectFromNewClassLoader()} or {@link + * #createExceptionObjectFromNewClassLoader()}. + */ + public static final class ObjectAndClassLoader { + + private final T object; + private final ClassLoader classLoader; + + private ObjectAndClassLoader(T object, ClassLoader classLoader) { + this.object = object; + this.classLoader = classLoader; + } + + public ClassLoader getClassLoader() { + return classLoader; + } + + public T getObject() { + return object; + } + } + + /** + * Creates a new ClassLoader and a new {@link Serializable} class inside that ClassLoader. 
This is + * useful when unit testing the class loading behavior of code, and needing a class that is + * outside the system class path. + * + *

NOTE: Even though this method may throw IOExceptions, we do not declare those and rather + * wrap them in Runtime Exceptions. While this is generally discouraged, we do this here because + * it is merely a test utility and not production code, and it makes it easier to use this method + * during the initialization of variables and especially static variables. + */ + public static ObjectAndClassLoader createSerializableObjectFromNewClassLoader() { + + final String classSource = + "import java.io.Serializable;" + + "import java.util.Random;" + + "public class TestSerializable implements Serializable {" + + " private static final long serialVersionUID = -3L;" + + " private final long random;" + + " public TestSerializable() {" + + " random = new Random().nextLong();" + + " }" + + " public boolean equals(Object o) {" + + " if (this == o) { return true; }" + + " if ((o == null) || (getClass() != o.getClass())) { return false; }" + + " TestSerializable that = (TestSerializable) o;" + + " return random == that.random;" + + " }" + + " public int hashCode() {" + + " return (int)(random ^ random >>> 32);" + + " }" + + " public String toString() {" + + " return \"TestSerializable{random=\" + random + '}';" + + " }" + + "}"; + + return createObjectFromNewClassLoader("TestSerializable", Serializable.class, classSource); + } + + /** + * Creates a new ClassLoader and a new {@link Exception} class inside that ClassLoader. This is + * useful when unit testing the class loading behavior of code, and needing a class that is + * outside the system class path. + * + *

NOTE: Even though this method may throw IOExceptions, we do not declare those and rather + * wrap them in Runtime Exceptions. While this is generally discouraged, we do this here because + * it is merely a test utility and not production code, and it makes it easier to use this method + * during the initialization of variables and especially static variables. + */ + public static ObjectAndClassLoader createExceptionObjectFromNewClassLoader() { + + return createObjectFromNewClassLoader( + "TestExceptionForSerialization", + Exception.class, + "public class TestExceptionForSerialization extends java.lang.Exception {}"); + } + + private static ObjectAndClassLoader createObjectFromNewClassLoader( + String testClassName, Class testClass, String source) { + final Path classDirPath = + new File(System.getProperty("java.io.tmpdir"), UUID.randomUUID().toString()).toPath(); + + URLClassLoader classLoader = null; + try { + Files.createDirectories(classDirPath); + classLoader = compileAndLoadJava(classDirPath.toFile(), testClassName, source); + + final Class clazz = classLoader.loadClass(testClassName); + final T object = clazz.asSubclass(testClass).getDeclaredConstructor().newInstance(); + + return new ObjectAndClassLoader<>(object, classLoader); + } catch (Exception e) { + throw new RuntimeException("Cannot create test class outside system class path", e); + } finally { + // we clean up eagerly, because it is fine to delete the class file once the class is + // loaded + // and we have no later life cycle hook here to do the cleanup + tryClose(classLoader); + tryDeleteDirectoryRecursively(classDirPath); + } + } + + // ------------------------------------------------------------------------ + // miscellaneous utils + // ------------------------------------------------------------------------ + + private static void tryClose(@Nullable AutoCloseable closeable) { + if (closeable != null) { + try { + closeable.close(); + } catch (Exception ignored) { + } + } + } + + private static 
void tryDeleteDirectoryRecursively(Path directory) { + final SimpleFileVisitor deletingVisitor = + new SimpleFileVisitor() { + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) + throws IOException { + Files.delete(file); + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { + Files.delete(dir); + return FileVisitResult.CONTINUE; + } + }; + + try { + Files.walkFileTree(directory, deletingVisitor); + } catch (Exception ignored) { + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/DataUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/DataUtil.java new file mode 100644 index 0000000000..f71476ad31 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/DataUtil.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.util; + +import static org.apache.flink.table.api.Expressions.row; + +import org.apache.commons.lang3.ArrayUtils; +import org.apache.flink.table.api.ApiExpression; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.CollectionUtil; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.IcebergGenerics; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.io.CloseableIterable; +import org.junit.Assert; + +import java.time.Instant; +import java.time.LocalDateTime; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +public class DataUtil { + + public static List toRows(Collection data) { + return data.stream() + .map( + i -> { + int size = i.length; + return size == 1 ? row(i[0]) : row(i[0], ArrayUtils.subarray(i, 1, size)); + }) + .collect(Collectors.toList()); + } + + public static Set toRowSet(Collection data) { + return data.stream() + .map( + r -> + r[0] instanceof RowKind + ? Row.ofKind((RowKind) r[0], ArrayUtils.subarray(r, 1, r.length)) + : Row.of(r)) + .collect(Collectors.toSet()); + } + + public static List toRowList(Collection data) { + return data.stream() + .map( + r -> + r[0] instanceof RowKind + ? 
Row.ofKind((RowKind) r[0], ArrayUtils.subarray(r, 1, r.length)) + : Row.of(r)) + .collect(Collectors.toList()); + } + + public static void assertEqual(Collection expected, Collection actual) { + Assert.assertEquals( + CollectionUtil.isNullOrEmpty(expected), CollectionUtil.isNullOrEmpty(actual)); + if (expected == null) { + return; + } + Assert.assertEquals(expected.size(), actual.size()); + for (Iterator i1 = expected.iterator(), i2 = actual.iterator(); i1.hasNext(); ) { + Object[] actualRow = i2.next(); + System.out.println(ArrayUtils.toString(actualRow)); + Assert.assertArrayEquals(i1.next(), actualRow); + } + } + + private static Object[] convertData(Object... values) { + Object[] row = new Object[values.length]; + for (int i = 0; i < values.length; i++) { + if (values[i] instanceof String) { + row[i] = StringData.fromString((String) values[i]); + } else if (values[i] instanceof LocalDateTime) { + row[i] = TimestampData.fromLocalDateTime(((LocalDateTime) values[i])); + } else if (values[i] instanceof Instant) { + row[i] = TimestampData.fromInstant((Instant) values[i]); + } else { + row[i] = values[i]; + } + } + return row; + } + + public static Collection toRowData(List data) { + return data.stream() + .map( + d -> + d[0] instanceof RowKind + ? toRowDataWithKind((RowKind) d[0], ArrayUtils.subarray(d, 1, d.length)) + : toRowData(d)) + .collect(Collectors.toList()); + } + + public static RowData toRowData(Object... values) { + return GenericRowData.of(convertData(values)); + } + + public static RowData toRowDataWithKind(RowKind rowKind, Object... 
values) { + return GenericRowData.ofKind(rowKind, convertData(values)); + } + + public static Set read(Table table) { + table.refresh(); + + Set records = new HashSet<>(); + + try (CloseableIterable iterable = IcebergGenerics.read(table).build()) { + for (Record record : iterable) { + records.add(record); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + return records; + } + + public static Map> groupByPrimaryKey(List rowList, int pkIdx) { + Map> result = new HashMap<>(); + for (Row row : rowList) { + Object pk = row.getField(pkIdx); + List list = result.getOrDefault(pk, new LinkedList<>()); + list.add(row); + result.put(pk, list); + } + return result; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/MixedFormatMockEnvironment.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/MixedFormatMockEnvironment.java new file mode 100644 index 0000000000..18f5e3ea8f --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/MixedFormatMockEnvironment.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.util; + +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.api.common.JobID; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.checkpoint.channel.ChannelStateWriteRequestExecutorFactory; +import org.apache.flink.runtime.externalresource.ExternalResourceInfoProvider; +import org.apache.flink.runtime.io.disk.iomanager.IOManager; +import org.apache.flink.runtime.jobgraph.JobVertexID; +import org.apache.flink.runtime.memory.MemoryManager; +import org.apache.flink.runtime.metrics.groups.TaskMetricGroup; +import org.apache.flink.runtime.operators.testutils.MockEnvironment; +import org.apache.flink.runtime.operators.testutils.MockInputSplitProvider; +import org.apache.flink.runtime.state.TaskStateManager; +import org.apache.flink.runtime.taskexecutor.GlobalAggregateManager; +import org.apache.flink.runtime.taskmanager.TaskManagerRuntimeInfo; +import org.apache.flink.util.UserCodeClassLoader; + +public class MixedFormatMockEnvironment extends MockEnvironment { + + protected MixedFormatMockEnvironment( + JobID jobID, + JobVertexID jobVertexID, + String taskName, + MockInputSplitProvider inputSplitProvider, + int bufferSize, + Configuration taskConfiguration, + ExecutionConfig executionConfig, + IOManager ioManager, + TaskStateManager taskStateManager, + GlobalAggregateManager aggregateManager, + int maxParallelism, + int parallelism, + int subtaskIndex, + UserCodeClassLoader userCodeClassLoader, + TaskMetricGroup taskMetricGroup, + TaskManagerRuntimeInfo taskManagerRuntimeInfo, + MemoryManager memManager, + ExternalResourceInfoProvider externalResourceInfoProvider, + ChannelStateWriteRequestExecutorFactory channelStateExecutorFactory) { + super( + jobID, + jobVertexID, + taskName, + inputSplitProvider, + bufferSize, + taskConfiguration, + executionConfig, + ioManager, 
+ taskStateManager, + aggregateManager, + maxParallelism, + parallelism, + subtaskIndex, + userCodeClassLoader, + taskMetricGroup, + taskManagerRuntimeInfo, + memManager, + externalResourceInfoProvider, + channelStateExecutorFactory); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/MockEnvironmentBuilder.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/MockEnvironmentBuilder.java new file mode 100644 index 0000000000..0738c98d52 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/MockEnvironmentBuilder.java @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.util; + +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.api.common.JobID; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.checkpoint.channel.ChannelStateWriteRequestExecutorFactory; +import org.apache.flink.runtime.externalresource.ExternalResourceInfoProvider; +import org.apache.flink.runtime.io.disk.iomanager.IOManager; +import org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync; +import org.apache.flink.runtime.jobgraph.JobVertexID; +import org.apache.flink.runtime.memory.MemoryManager; +import org.apache.flink.runtime.memory.MemoryManagerBuilder; +import org.apache.flink.runtime.metrics.groups.TaskMetricGroup; +import org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups; +import org.apache.flink.runtime.operators.testutils.MockEnvironment; +import org.apache.flink.runtime.operators.testutils.MockInputSplitProvider; +import org.apache.flink.runtime.state.TaskStateManager; +import org.apache.flink.runtime.state.TestTaskStateManager; +import org.apache.flink.runtime.taskexecutor.GlobalAggregateManager; +import org.apache.flink.runtime.taskmanager.TaskManagerRuntimeInfo; +import org.apache.flink.runtime.util.TestingTaskManagerRuntimeInfo; +import org.apache.flink.runtime.util.TestingUserCodeClassLoader; +import org.apache.flink.util.UserCodeClassLoader; + +public class MockEnvironmentBuilder { + private String taskName = "mock-task"; + private MockInputSplitProvider inputSplitProvider = null; + private int bufferSize = 16; + private TaskStateManager taskStateManager = new TestTaskStateManager(); + private GlobalAggregateManager aggregateManager = new TestGlobalAggregateManager(); + private Configuration taskConfiguration = new Configuration(); + private ExecutionConfig executionConfig = new ExecutionConfig(); + private int maxParallelism = 1; + private int parallelism = 1; + private int subtaskIndex = 0; + private UserCodeClassLoader 
userCodeClassLoader = TestingUserCodeClassLoader.newBuilder().build(); + private JobID jobID = new JobID(); + private JobVertexID jobVertexID = new JobVertexID(); + private TaskMetricGroup taskMetricGroup = + UnregisteredMetricGroups.createUnregisteredTaskMetricGroup(); + private TaskManagerRuntimeInfo taskManagerRuntimeInfo = new TestingTaskManagerRuntimeInfo(); + private IOManager ioManager; + private MemoryManager memoryManager = this.buildMemoryManager(33554432L); + private ExternalResourceInfoProvider externalResourceInfoProvider; + private ChannelStateWriteRequestExecutorFactory channelStateExecutorFactory = + new ChannelStateWriteRequestExecutorFactory(this.jobID); + + public MockEnvironmentBuilder() { + this.externalResourceInfoProvider = ExternalResourceInfoProvider.NO_EXTERNAL_RESOURCES; + } + + private MemoryManager buildMemoryManager(long memorySize) { + return MemoryManagerBuilder.newBuilder().setMemorySize(memorySize).build(); + } + + public MockEnvironmentBuilder setTaskName(String taskName) { + this.taskName = taskName; + return this; + } + + public MockEnvironmentBuilder setManagedMemorySize(long managedMemorySize) { + this.memoryManager = this.buildMemoryManager(managedMemorySize); + return this; + } + + public MockEnvironmentBuilder setInputSplitProvider(MockInputSplitProvider inputSplitProvider) { + this.inputSplitProvider = inputSplitProvider; + return this; + } + + public MockEnvironmentBuilder setBufferSize(int bufferSize) { + this.bufferSize = bufferSize; + return this; + } + + public MockEnvironmentBuilder setTaskStateManager(TaskStateManager taskStateManager) { + this.taskStateManager = taskStateManager; + return this; + } + + public MockEnvironmentBuilder setAggregateManager(GlobalAggregateManager aggregateManager) { + this.aggregateManager = aggregateManager; + return this; + } + + public MockEnvironmentBuilder setTaskConfiguration(Configuration taskConfiguration) { + this.taskConfiguration = taskConfiguration; + return this; + } + + 
public MockEnvironmentBuilder setExecutionConfig(ExecutionConfig executionConfig) { + this.executionConfig = executionConfig; + return this; + } + + public MockEnvironmentBuilder setTaskManagerRuntimeInfo( + TaskManagerRuntimeInfo taskManagerRuntimeInfo) { + this.taskManagerRuntimeInfo = taskManagerRuntimeInfo; + return this; + } + + public MockEnvironmentBuilder setMaxParallelism(int maxParallelism) { + this.maxParallelism = maxParallelism; + return this; + } + + public MockEnvironmentBuilder setParallelism(int parallelism) { + this.parallelism = parallelism; + return this; + } + + public MockEnvironmentBuilder setSubtaskIndex(int subtaskIndex) { + this.subtaskIndex = subtaskIndex; + return this; + } + + public MockEnvironmentBuilder setUserCodeClassLoader(ClassLoader userCodeClassLoader) { + this.userCodeClassLoader = + TestingUserCodeClassLoader.newBuilder().setClassLoader(userCodeClassLoader).build(); + return this; + } + + public MockEnvironmentBuilder setJobID(JobID jobID) { + this.jobID = jobID; + return this; + } + + public MockEnvironmentBuilder setJobVertexID(JobVertexID jobVertexID) { + this.jobVertexID = jobVertexID; + return this; + } + + public MockEnvironmentBuilder setMetricGroup(TaskMetricGroup taskMetricGroup) { + this.taskMetricGroup = taskMetricGroup; + return this; + } + + public MockEnvironmentBuilder setIOManager(IOManager ioManager) { + this.ioManager = ioManager; + return this; + } + + public MockEnvironmentBuilder setMemoryManager(MemoryManager memoryManager) { + this.memoryManager = memoryManager; + return this; + } + + public MockEnvironmentBuilder setExternalResourceInfoProvider( + ExternalResourceInfoProvider externalResourceInfoProvider) { + this.externalResourceInfoProvider = externalResourceInfoProvider; + return this; + } + + public MockEnvironmentBuilder setGlobalAggregateManager( + GlobalAggregateManager globalAggregateManager) { + this.aggregateManager = globalAggregateManager; + return this; + } + + public void 
setChannelStateExecutorFactory( + ChannelStateWriteRequestExecutorFactory channelStateExecutorFactory) { + this.channelStateExecutorFactory = channelStateExecutorFactory; + } + + public MockEnvironment build() { + if (this.ioManager == null) { + this.ioManager = new IOManagerAsync(); + } + + return new MixedFormatMockEnvironment( + this.jobID, + this.jobVertexID, + this.taskName, + this.inputSplitProvider, + this.bufferSize, + this.taskConfiguration, + this.executionConfig, + this.ioManager, + this.taskStateManager, + this.aggregateManager, + this.maxParallelism, + this.parallelism, + this.subtaskIndex, + this.userCodeClassLoader, + this.taskMetricGroup, + this.taskManagerRuntimeInfo, + this.memoryManager, + this.externalResourceInfoProvider, + this.channelStateExecutorFactory); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestCompatibleFlinkPropertyUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestCompatibleFlinkPropertyUtil.java new file mode 100644 index 0000000000..dfe12e2f82 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestCompatibleFlinkPropertyUtil.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.util; + +import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; +import org.apache.flink.configuration.Configuration; +import org.junit.Assert; +import org.junit.Test; + +public class TestCompatibleFlinkPropertyUtil { + @Test + public void testGetNewProperty() { + Configuration config = new Configuration(); + Assert.assertEquals( + MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.defaultValue(), + CompatibleFlinkPropertyUtil.propertyAsBoolean( + config, MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE)); + + config.setBoolean(MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE, true); + Assert.assertTrue( + CompatibleFlinkPropertyUtil.propertyAsBoolean( + config, MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE)); + + config.setBoolean( + MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE_LEGACY, false); + Assert.assertTrue( + CompatibleFlinkPropertyUtil.propertyAsBoolean( + config, MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE)); + } + + @Test + public void testGetLegacyProperty() { + Configuration config = new Configuration(); + config.setBoolean( + MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE_LEGACY, true); + Assert.assertTrue( + CompatibleFlinkPropertyUtil.propertyAsBoolean( + config, MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE)); + } +} diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestGlobalAggregateManager.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestGlobalAggregateManager.java new file mode 100644 index 0000000000..0162cd04e1 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestGlobalAggregateManager.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.util; + +import org.apache.flink.api.common.functions.AggregateFunction; +import org.apache.flink.runtime.taskexecutor.GlobalAggregateManager; +import org.apache.flink.runtime.taskexecutor.rpc.RpcGlobalAggregateManager; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +/** + * An util class of global aggregate manager that simulates action as {@link + * RpcGlobalAggregateManager} in the jobMaster. 
+ */ +public class TestGlobalAggregateManager implements GlobalAggregateManager { + private final Map accumulators = new HashMap<>(); + + @Override + public OUT updateGlobalAggregate( + String aggregateName, Object aggregand, AggregateFunction aggregateFunction) + throws IOException { + + Object accumulator = accumulators.get(aggregateName); + if (null == accumulator) { + accumulator = aggregateFunction.createAccumulator(); + } + + accumulator = aggregateFunction.add((IN) aggregand, (ACC) accumulator); + accumulators.put(aggregateName, accumulator); + return aggregateFunction.getResult((ACC) accumulator); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestOneInputStreamOperatorIntern.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestOneInputStreamOperatorIntern.java new file mode 100644 index 0000000000..e77d4b2e48 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestOneInputStreamOperatorIntern.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.util; + +import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; +import org.apache.flink.runtime.checkpoint.PrioritizedOperatorSubtaskState; +import org.apache.flink.runtime.checkpoint.TaskStateSnapshot; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.operators.testutils.MockInputSplitProvider; +import org.apache.flink.runtime.state.TestTaskStateManager; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; + +import javax.annotation.Nonnull; + +import java.util.Collections; +import java.util.List; + +public class TestOneInputStreamOperatorIntern + extends OneInputStreamOperatorTestHarness { + public TestOneInputStreamOperatorIntern( + OneInputStreamOperator operator, + int maxParallelism, + int parallelism, + int subtaskIndex, + Long restoredCheckpointId, + TestGlobalAggregateManager testGlobalAggregateManager) + throws Exception { + super( + operator, + (new MockEnvironmentBuilder()) + .setTaskName("MockTask") + .setManagedMemorySize(3145728L) + .setInputSplitProvider(new MockInputSplitProvider()) + .setBufferSize(1024) + .setTaskStateManager(new TestTaskStateManagerIntern(restoredCheckpointId)) + .setAggregateManager(testGlobalAggregateManager) + .setMaxParallelism(maxParallelism) + .setParallelism(parallelism) + .setSubtaskIndex(subtaskIndex) + .build()); + } + + public void notifyOfAbortedCheckpoint(long checkpointId) throws Exception { + this.operator.notifyCheckpointAborted(checkpointId); + } + + static class TestTaskStateManagerIntern extends TestTaskStateManager { + private long reportedCheckpointId = -1L; + private boolean restored = false; + + public TestTaskStateManagerIntern(Long reportedCheckpointId) { + super(); + if (reportedCheckpointId != null) { + this.reportedCheckpointId = reportedCheckpointId; + this.restored = true; + } + } + + @Nonnull + public 
PrioritizedOperatorSubtaskState prioritizedOperatorState(OperatorID operatorID) { + TaskStateSnapshot jmTaskStateSnapshot = this.getLastJobManagerTaskStateSnapshot(); + TaskStateSnapshot tmTaskStateSnapshot = this.getLastTaskManagerTaskStateSnapshot(); + if (jmTaskStateSnapshot == null) { + return PrioritizedOperatorSubtaskState.emptyNotRestored(); + } else { + OperatorSubtaskState jmOpState = + jmTaskStateSnapshot.getSubtaskStateByOperatorID(operatorID); + if (jmOpState == null) { + return PrioritizedOperatorSubtaskState.emptyNotRestored(); + } else { + List tmStateCollection = Collections.emptyList(); + if (tmTaskStateSnapshot != null) { + OperatorSubtaskState tmOpState = + tmTaskStateSnapshot.getSubtaskStateByOperatorID(operatorID); + if (tmOpState != null) { + tmStateCollection = Collections.singletonList(tmOpState); + } + } + + PrioritizedOperatorSubtaskState.Builder builder = + new PrioritizedOperatorSubtaskState.Builder( + jmOpState, tmStateCollection, this.reportedCheckpointId); + return builder.build(); + } + } + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestProjection.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestProjection.java new file mode 100644 index 0000000000..69431378bf --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestProjection.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.util; + +import static org.apache.flink.table.api.DataTypes.BIGINT; +import static org.apache.flink.table.api.DataTypes.BOOLEAN; +import static org.apache.flink.table.api.DataTypes.DOUBLE; +import static org.apache.flink.table.api.DataTypes.FIELD; +import static org.apache.flink.table.api.DataTypes.INT; +import static org.apache.flink.table.api.DataTypes.ROW; +import static org.apache.flink.table.api.DataTypes.STRING; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import org.apache.flink.table.types.DataType; +import org.junit.jupiter.api.Test; + +class TestProjection { + + @Test + void testTopLevelProject() { + assertThat( + Projection.of(new int[] {2, 1}) + .project(ROW(FIELD("f0", BIGINT()), FIELD("f1", STRING()), FIELD("f2", INT())))) + .isEqualTo(ROW(FIELD("f2", INT()), FIELD("f1", STRING()))); + } + + @Test + void testNestedProject() { + final DataType thirdLevelRow = + ROW(FIELD("c0", BOOLEAN()), FIELD("c1", DOUBLE()), FIELD("c2", INT())); + final DataType secondLevelRow = + ROW(FIELD("b0", BOOLEAN()), FIELD("b1", thirdLevelRow), FIELD("b2", INT())); + final DataType topLevelRow = + ROW(FIELD("a0", INT()), FIELD("a1", secondLevelRow), FIELD("a1_b1_c0", INT())); + + assertThat(Projection.of(new int[][] {{0}, {1, 1, 0}}).project(topLevelRow)) + .isEqualTo(ROW(FIELD("a0", INT()), FIELD("a1_b1_c0", BOOLEAN()))); + assertThat(Projection.of(new int[][] {{1, 1}, {0}}).project(topLevelRow)) + .isEqualTo(ROW(FIELD("a1_b1", 
thirdLevelRow), FIELD("a0", INT()))); + assertThat(Projection.of(new int[][] {{1, 1, 2}, {1, 1, 1}, {1, 1, 0}}).project(topLevelRow)) + .isEqualTo( + ROW( + FIELD("a1_b1_c2", INT()), + FIELD("a1_b1_c1", DOUBLE()), + FIELD("a1_b1_c0", BOOLEAN()))); + assertThat(Projection.of(new int[][] {{1, 1, 0}, {2}}).project(topLevelRow)) + .isEqualTo(ROW(FIELD("a1_b1_c0", BOOLEAN()), FIELD("a1_b1_c0_$0", INT()))); + } + + @Test + void testIsNested() { + assertThat(Projection.of(new int[] {2, 1}).isNested()).isFalse(); + assertThat(Projection.of(new int[][] {new int[] {1}, new int[] {3}}).isNested()).isFalse(); + assertThat( + Projection.of(new int[][] {new int[] {1}, new int[] {1, 2}, new int[] {3}}).isNested()) + .isTrue(); + } + + @Test + void testDifference() { + assertThat(Projection.of(new int[] {4, 1, 0, 3, 2}).difference(Projection.of(new int[] {4, 2}))) + .isEqualTo(Projection.of(new int[] {1, 0, 2})); + + assertThat( + Projection.of( + new int[][] { + new int[] {4}, + new int[] {1, 3}, + new int[] {0}, + new int[] {3, 1}, + new int[] {2} + }) + .difference(Projection.of(new int[] {4, 2}))) + .isEqualTo(Projection.of(new int[][] {new int[] {1, 3}, new int[] {0}, new int[] {2, 1}})); + + assertThatThrownBy( + () -> + Projection.of(new int[] {1, 2, 3, 4}) + .difference(Projection.of(new int[][] {new int[] {2}, new int[] {3, 4}}))) + .isInstanceOf(IllegalArgumentException.class); + } + + @Test + void testComplement() { + assertThat(Projection.of(new int[] {4, 1, 2}).complement(5)) + .isEqualTo(Projection.of(new int[] {0, 3})); + + assertThat( + Projection.of(new int[][] {new int[] {4}, new int[] {1}, new int[] {2}}).complement(5)) + .isEqualTo(Projection.of(new int[] {0, 3})); + + assertThatThrownBy( + () -> + Projection.of(new int[][] {new int[] {4}, new int[] {1, 3}, new int[] {2}}) + .complement(10)) + .isInstanceOf(IllegalStateException.class); + } + + @Test + void testToTopLevelIndexes() { + assertThat(Projection.of(new int[] {1, 2, 3, 4}).toTopLevelIndexes()) + 
.isEqualTo(new int[] {1, 2, 3, 4}); + + assertThat( + Projection.of(new int[][] {new int[] {4}, new int[] {1}, new int[] {2}}) + .toTopLevelIndexes()) + .isEqualTo(new int[] {4, 1, 2}); + + assertThatThrownBy( + () -> + Projection.of(new int[][] {new int[] {4}, new int[] {1, 3}, new int[] {2}}) + .toTopLevelIndexes()) + .isInstanceOf(IllegalStateException.class); + } + + @Test + void testToNestedIndexes() { + assertThat(Projection.of(new int[] {1, 2, 3, 4}).toNestedIndexes()) + .isEqualTo(new int[][] {new int[] {1}, new int[] {2}, new int[] {3}, new int[] {4}}); + assertThat( + Projection.of(new int[][] {new int[] {4}, new int[] {1, 3}, new int[] {2}}) + .toNestedIndexes()) + .isEqualTo(new int[][] {new int[] {4}, new int[] {1, 3}, new int[] {2}}); + } + + @Test + void testEquals() { + assertThat(Projection.of(new int[][] {new int[] {1}, new int[] {2}, new int[] {3}})) + .isEqualTo(Projection.of(new int[] {1, 2, 3})); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestUtil.java new file mode 100644 index 0000000000..6888add512 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestUtil.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.util; + +import org.apache.flink.api.common.JobStatus; +import org.apache.flink.core.execution.JobClient; +import org.junit.rules.TestName; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class TestUtil { + + public static final Logger LOG = LoggerFactory.getLogger(TestUtil.class); + + /** get ut method name without parameters. */ + public static String getUtMethodName(TestName testName) { + int i = testName.getMethodName().indexOf("["); + if (i == -1) { + return testName.getMethodName(); + } + return testName.getMethodName().substring(0, i); + } + + public static void cancelJob(JobClient jobClient) { + if (isJobTerminated(jobClient)) { + return; + } + try { + jobClient.cancel(); + } catch (Exception e) { + LOG.warn("cancel job exception.", e); + } + } + + public static boolean isJobTerminated(JobClient jobClient) { + try { + JobStatus status = jobClient.getJobStatus().get(); + return status.isGloballyTerminalState(); + } catch (Exception e) { + // TODO + // This is sort of hack. + // Currently different execution environment will have different behaviors + // when fetching a finished job status. + // For example, standalone session cluster will return a normal FINISHED, + // while mini cluster will throw IllegalStateException, + // and yarn per job will throw ApplicationNotFoundException. + // We have to assume that job has finished in this case. + // Change this when these behaviors are unified. + LOG.warn( + "Failed to get job status so we assume that the job has terminated. 
Some data might be lost.", + e); + return true; + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/FlinkTaskWriterBaseTest.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/FlinkTaskWriterBaseTest.java new file mode 100644 index 0000000000..610a7854b0 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/FlinkTaskWriterBaseTest.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.write; + +import static org.apache.amoro.BasicTableTestHelper.PRIMARY_KEY_SPEC; + +import org.apache.amoro.flink.FlinkTableTestBase; +import org.apache.amoro.flink.read.FlinkSplitPlanner; +import org.apache.amoro.flink.read.hybrid.reader.RowDataReaderFunction; +import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; +import org.apache.amoro.flink.read.source.DataIterator; +import org.apache.amoro.io.AuthenticatedFileIO; +import org.apache.amoro.table.KeyedTable; +import org.apache.amoro.table.MixedTable; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TableLoader; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.source.FlinkInputFormat; +import org.apache.iceberg.flink.source.FlinkSource; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.types.TypeUtil; +import org.junit.Assert; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +public interface FlinkTaskWriterBaseTest extends FlinkTableTestBase { + Logger LOG = LoggerFactory.getLogger(FlinkTaskWriterBaseTest.class); + + default void testWriteAndReadMixedFormatTable( + MixedTable mixedTable, TableSchema flinkTableSchema, RowData expected) { + + // This is a partial-write schema from Flink engine view. 
+ RowType rowType = (RowType) flinkTableSchema.toRowDataType().getLogicalType(); + + try (TaskWriter taskWriter = createTaskWriter(mixedTable, rowType)) { + Assert.assertNotNull(taskWriter); + + writeAndCommit(expected, taskWriter, mixedTable); + + mixedTable.refresh(); + + // This is a partial-read schema from Flink engine view, should reassign schema id to + // selected-schema + Schema selectedSchema = + TypeUtil.reassignIds(FlinkSchemaUtil.convert(flinkTableSchema), mixedTable.schema()); + + assertRecords(mixedTable.schema(), selectedSchema, mixedTable, expected, flinkTableSchema); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + default void assertRecords( + Schema tableSchema, + Schema selectedSchema, + MixedTable mixedTable, + RowData expected, + TableSchema flinkTableSchema) + throws IOException { + List records; + if (mixedTable.isKeyedTable()) { + records = + recordsOfKeyedTable( + mixedTable.asKeyedTable(), tableSchema, selectedSchema, mixedTable.io()); + } else { + records = + recordsOfUnkeyedTable( + getTableLoader(getCatalogName(), getMetastoreUri(), mixedTable), + selectedSchema, + flinkTableSchema); + } + Assert.assertEquals(1, records.size()); + Assert.assertEquals(expected, records.get(0)); + } + + /** For asserting unkeyed table records. */ + String getMetastoreUri(); + + /** For asserting unkeyed table records. */ + String getCatalogName(); + + default void writeAndCommit( + RowData expected, TaskWriter taskWriter, MixedTable mixedTable) throws IOException { + writeAndCommit(expected, taskWriter, mixedTable, false); + } + + default void writeAndCommit( + RowData expected, + TaskWriter taskWriter, + MixedTable mixedTable, + boolean upsertEnabled) + throws IOException { + taskWriter.write(expected); + WriteResult writerResult = taskWriter.complete(); + boolean writeToBase = mixedTable.isUnkeyedTable(); + commit(mixedTable, writerResult, writeToBase); + Assert.assertEquals(upsertEnabled ? 
2 : 1, writerResult.dataFiles().length); + } + + default boolean upsertEnabled() { + return false; + } + + default List recordsOfUnkeyedTable( + TableLoader tableLoader, Schema projectedSchema, TableSchema flinkTableSchema) + throws IOException { + FlinkInputFormat inputFormat = + FlinkSource.forRowData().tableLoader(tableLoader).project(flinkTableSchema).buildFormat(); + return runFormat(inputFormat, FlinkSchemaUtil.convert(projectedSchema)); + } + + default List recordsOfKeyedTable( + KeyedTable table, Schema tableSchema, Schema projectedSchema, AuthenticatedFileIO io) { + List mixedFormatSplits = + FlinkSplitPlanner.planFullTable(table, new AtomicInteger(0)); + + RowDataReaderFunction rowDataReaderFunction = + new RowDataReaderFunction( + new Configuration(), tableSchema, projectedSchema, PRIMARY_KEY_SPEC, null, true, io); + + List actual = new ArrayList<>(); + mixedFormatSplits.forEach( + split -> { + LOG.info("Mixed-format split: {}.", split); + DataIterator dataIterator = rowDataReaderFunction.createDataIterator(split); + while (dataIterator.hasNext()) { + RowData rowData = dataIterator.next(); + LOG.info("{}", rowData); + actual.add(rowData); + } + }); + + return actual; + } + + default List runFormat(FlinkInputFormat inputFormat, RowType readRowType) + throws IOException { + return TestHelpers.readRowData(inputFormat, readRowType); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/MixedFormatFileWriterITCase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/MixedFormatFileWriterITCase.java new file mode 100644 index 0000000000..133d9acbb6 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/MixedFormatFileWriterITCase.java @@ -0,0 +1,311 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.write; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.data.FileNameRules; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.flink.read.TestMixedFormatSource; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.flink.util.MixedFormatUtils; +import org.apache.amoro.table.KeyedTable; +import org.apache.amoro.table.MixedTable; +import org.apache.flink.api.common.RuntimeExecutionMode; +import org.apache.flink.api.common.restartstrategy.RestartStrategies; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.time.Time; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.ExecutionOptions; +import org.apache.flink.configuration.RestOptions; +import org.apache.flink.runtime.jobgraph.JobGraph; +import org.apache.flink.runtime.minicluster.MiniCluster; +import org.apache.flink.runtime.minicluster.MiniClusterConfiguration; +import org.apache.flink.runtime.state.CheckpointListener; +import 
org.apache.flink.runtime.state.FunctionInitializationContext; +import org.apache.flink.runtime.state.FunctionSnapshotContext; +import org.apache.flink.streaming.api.CheckpointingMode; +import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; +import org.apache.flink.streaming.api.datastream.DataStreamSource; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction; +import org.apache.flink.streaming.api.graph.StreamGraph; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.Snapshot; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.util.Collections; +import java.util.HashSet; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.Stack; +import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CountDownLatch; + +public class MixedFormatFileWriterITCase extends FlinkTestBase { + + public static final Logger LOG = LoggerFactory.getLogger(MixedFormatFileWriterITCase.class); + + private static final Map LATCH_MAP = new ConcurrentHashMap<>(); + public MixedFormatTableLoader tableLoader; + private String latchId; + private final int NUM_SOURCES = 4; + private final int NUM_RECORDS = 10000; + + public MixedFormatFileWriterITCase() { + super( + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, true)); + } + + @Before + public void setup() { + this.latchId = UUID.randomUUID().toString(); + // We wait for two successful checkpoints 
in sources before shutting down. This ensures that + // the sink can commit its data. + // We need to keep a "static" latch here because all sources need to be kept running + // while we're waiting for the required number of checkpoints. Otherwise, we would lock up + // because we can only do checkpoints while all operators are running. + LATCH_MAP.put(latchId, new CountDownLatch(NUM_SOURCES * 2)); + } + + protected static final double FAILOVER_RATIO = 0.4; + + private static class StreamingExecutionTestSource extends RichParallelSourceFunction + implements CheckpointListener, CheckpointedFunction { + + private final String latchId; + + private final int numberOfRecords; + + /** + * Whether the test is executing in a scenario that induces a failover. This doesn't mean that + * this source induces the failover. + */ + private final boolean isFailoverScenario; + + private ListState nextValueState; + + private int nextValue; + + private volatile boolean isCanceled; + + private volatile boolean snapshottedAfterAllRecordsOutput; + + private volatile boolean isWaitingCheckpointComplete; + + private volatile boolean hasCompletedCheckpoint; + + public StreamingExecutionTestSource( + String latchId, int numberOfRecords, boolean isFailoverScenario) { + this.latchId = latchId; + this.numberOfRecords = numberOfRecords; + this.isFailoverScenario = isFailoverScenario; + } + + @Override + public void initializeState(FunctionInitializationContext context) throws Exception { + nextValueState = + context + .getOperatorStateStore() + .getListState(new ListStateDescriptor<>("nextValue", Integer.class)); + + if (nextValueState.get() != null && nextValueState.get().iterator().hasNext()) { + nextValue = nextValueState.get().iterator().next(); + } + } + + @Override + public void run(SourceContext ctx) throws Exception { + if (isFailoverScenario && getRuntimeContext().getAttemptNumber() == 0) { + // In the first execution, we first send a part of record... 
+ sendRecordsUntil((int) (numberOfRecords * FAILOVER_RATIO * 0.5), ctx); + + // Wait till the first part of data is committed. + while (!hasCompletedCheckpoint) { + Thread.sleep(50); + } + + // Then we write the second part of data... + sendRecordsUntil((int) (numberOfRecords * FAILOVER_RATIO), ctx); + + // And then trigger the failover. + if (getRuntimeContext().getIndexOfThisSubtask() == 0) { + throw new RuntimeException("Designated Exception"); + } else { + while (true) { + Thread.sleep(50); + } + } + } else { + // If we are not going to trigger failover or we have already triggered failover, + // run until finished. + sendRecordsUntil(numberOfRecords, ctx); + + // Wait the last checkpoint to commit all the pending records. + isWaitingCheckpointComplete = true; + CountDownLatch latch = LATCH_MAP.get(latchId); + latch.await(); + } + } + + private void sendRecordsUntil(int targetNumber, SourceContext ctx) { + while (!isCanceled && nextValue < targetNumber) { + synchronized (ctx.getCheckpointLock()) { + ctx.collect( + GenericRowData.of( + nextValue++, + StringData.fromString(""), + LocalDateTime.now().toInstant(ZoneOffset.UTC).toEpochMilli(), + TimestampData.fromLocalDateTime(LocalDateTime.now()))); + } + } + } + + @Override + public void snapshotState(FunctionSnapshotContext context) throws Exception { + nextValueState.update(Collections.singletonList(nextValue)); + + if (isWaitingCheckpointComplete) { + snapshottedAfterAllRecordsOutput = true; + } + } + + @Override + public void notifyCheckpointComplete(long checkpointId) throws Exception { + if (isWaitingCheckpointComplete && snapshottedAfterAllRecordsOutput) { + CountDownLatch latch = LATCH_MAP.get(latchId); + latch.countDown(); + } + + hasCompletedCheckpoint = true; + } + + @Override + public void cancel() { + isCanceled = true; + } + } + + protected JobGraph createJobGraph( + MixedFormatTableLoader tableLoader, TableSchema tableSchema, boolean triggerFailover) { + StreamExecutionEnvironment env = 
StreamExecutionEnvironment.getExecutionEnvironment(); + Configuration config = new Configuration(); + config.set(ExecutionOptions.RUNTIME_MODE, RuntimeExecutionMode.STREAMING); + env.configure(config, getClass().getClassLoader()); + + env.enableCheckpointing(10, CheckpointingMode.EXACTLY_ONCE); + + if (triggerFailover) { + env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, Time.milliseconds(100))); + } else { + env.setRestartStrategy(RestartStrategies.noRestart()); + } + + DataStreamSource source = + env.addSource(new StreamingExecutionTestSource(latchId, NUM_RECORDS, triggerFailover)) + .setParallelism(4); + MixedTable table = MixedFormatUtils.loadMixedTable(tableLoader); + FlinkSink.forRowData(source) + .context(Optional::of) + .table(table) + .tableLoader(tableLoader) + .flinkSchema(tableSchema) + .build(); + + StreamGraph streamGraph = env.getStreamGraph(); + return streamGraph.getJobGraph(); + } + + @Test + public void testWrite() throws Exception { + tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); + + JobGraph jobGraph = createJobGraph(tableLoader, FLINK_SCHEMA, true); + final Configuration config = new Configuration(); + config.setString(RestOptions.BIND_PORT, "18081-19000"); + final MiniClusterConfiguration cfg = + new MiniClusterConfiguration.Builder() + .setNumTaskManagers(1) + .setNumSlotsPerTaskManager(NUM_SOURCES) + .setConfiguration(config) + .build(); + + try (MiniCluster miniCluster = new MiniCluster(cfg)) { + miniCluster.start(); + miniCluster.executeJobBlocking(jobGraph); + } + + KeyedTable keyedTable = tableLoader.loadMixedFormatTable().asKeyedTable(); + checkResult(keyedTable, NUM_RECORDS * NUM_SOURCES); + } + + public static void checkResult(KeyedTable keyedTable, int exceptedSize) { + keyedTable.refresh(); + Snapshot crt = keyedTable.changeTable().currentSnapshot(); + + Stack snapshots = new Stack<>(); + while (crt != null) { + snapshots.push(crt); + if (crt.parentId() == null) { + break; + } 
+ crt = keyedTable.changeTable().snapshot(crt.parentId()); + } + + Set paths = new HashSet<>(); + long maxTxId = -1; + while (!snapshots.isEmpty()) { + Snapshot snapshot = snapshots.pop(); + long minTxIdInSnapshot = Integer.MAX_VALUE; + long maxTxIdInSnapshot = -1; + for (DataFile addedFile : snapshot.addedDataFiles(keyedTable.io())) { + String path = addedFile.path().toString(); + Assert.assertFalse(paths.contains(path)); + paths.add(path); + LOG.info("add file: {}", addedFile.path()); + + long txId = FileNameRules.parseChange(path, snapshot.sequenceNumber()).transactionId(); + minTxIdInSnapshot = Math.min(minTxIdInSnapshot, txId); + maxTxIdInSnapshot = Math.max(maxTxIdInSnapshot, txId); + } + Assert.assertTrue(maxTxId <= minTxIdInSnapshot); + + maxTxId = maxTxIdInSnapshot; + } + + Assert.assertEquals(exceptedSize, TestMixedFormatSource.tableRecords(keyedTable).size()); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAdaptHiveWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAdaptHiveWriter.java new file mode 100644 index 0000000000..057765df1c --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAdaptHiveWriter.java @@ -0,0 +1,330 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.write; + +import static org.apache.amoro.table.TableProperties.FILE_FORMAT_ORC; + +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.CatalogTestHelper; +import org.apache.amoro.catalog.TableTestBase; +import org.apache.amoro.flink.read.AdaptHiveFlinkParquetReaders; +import org.apache.amoro.hive.TestHMS; +import org.apache.amoro.hive.catalog.HiveCatalogTestHelper; +import org.apache.amoro.hive.catalog.HiveTableTestHelper; +import org.apache.amoro.hive.table.HiveLocationKind; +import org.apache.amoro.shade.guava32.com.google.common.collect.Iterators; +import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; +import org.apache.amoro.table.BaseLocationKind; +import org.apache.amoro.table.ChangeLocationKind; +import org.apache.amoro.table.LocationKind; +import org.apache.amoro.table.MixedTable; +import org.apache.amoro.table.WriteOperationKind; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.iceberg.Files; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.data.FlinkOrcReader; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.orc.ORC; +import 
org.apache.iceberg.parquet.AdaptHiveParquet; +import org.junit.Assert; +import org.junit.Assume; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.io.IOException; +import java.math.BigDecimal; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +@RunWith(Parameterized.class) +public class TestAdaptHiveWriter extends TableTestBase { + + @ClassRule public static TestHMS TEST_HMS = new TestHMS(); + + public TestAdaptHiveWriter(CatalogTestHelper catalogTestHelper, TableTestHelper tableTestHelper) { + super(catalogTestHelper, tableTestHelper); + } + + @Parameterized.Parameters(name = "{0}, {1}") + public static Object[] parameters() { + return new Object[][] { + { + new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), + new HiveTableTestHelper(true, true) + }, + { + new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), + new HiveTableTestHelper(true, false) + }, + { + new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), + new HiveTableTestHelper(false, true) + }, + { + new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), + new HiveTableTestHelper(false, false) + }, + { + new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), + new HiveTableTestHelper(true, true, FILE_FORMAT_ORC) + }, + { + new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), + new HiveTableTestHelper(true, false, FILE_FORMAT_ORC) + }, + { + new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), + new HiveTableTestHelper(false, true, FILE_FORMAT_ORC) + }, + { + new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), + new HiveTableTestHelper(false, false, 
FILE_FORMAT_ORC) + } + }; + } + + @Test + public void testKeyedTableWriteTypeFromOperateKind() { + Assume.assumeTrue(isKeyedTable()); + MixedTable testKeyedHiveTable = getMixedTable(); + FlinkTaskWriterBuilder builder = + FlinkTaskWriterBuilder.buildFor(testKeyedHiveTable) + .withFlinkSchema(FlinkSchemaUtil.convert(testKeyedHiveTable.schema())); + + Assert.assertTrue( + builder.buildWriter(ChangeLocationKind.INSTANT) instanceof FlinkChangeTaskWriter); + Assert.assertTrue(builder.buildWriter(BaseLocationKind.INSTANT) instanceof FlinkBaseTaskWriter); + Assert.assertTrue(builder.buildWriter(HiveLocationKind.INSTANT) instanceof FlinkBaseTaskWriter); + + Assert.assertTrue( + builder.buildWriter(WriteOperationKind.APPEND) instanceof FlinkChangeTaskWriter); + Assert.assertTrue( + builder.buildWriter(WriteOperationKind.OVERWRITE) instanceof FlinkBaseTaskWriter); + Assert.assertTrue( + builder.buildWriter(WriteOperationKind.MINOR_OPTIMIZE) instanceof FlinkBaseTaskWriter); + Assert.assertTrue( + builder.buildWriter(WriteOperationKind.MAJOR_OPTIMIZE) instanceof FlinkBaseTaskWriter); + Assert.assertTrue( + builder.buildWriter(WriteOperationKind.FULL_OPTIMIZE) instanceof FlinkBaseTaskWriter); + } + + @Test + public void testUnKeyedTableWriteTypeFromOperateKind() { + Assume.assumeFalse(isKeyedTable()); + MixedTable testHiveTable = getMixedTable(); + FlinkTaskWriterBuilder builder = + FlinkTaskWriterBuilder.buildFor(testHiveTable) + .withFlinkSchema(FlinkSchemaUtil.convert(testHiveTable.schema())); + + Assert.assertTrue(builder.buildWriter(BaseLocationKind.INSTANT) instanceof FlinkBaseTaskWriter); + Assert.assertTrue(builder.buildWriter(HiveLocationKind.INSTANT) instanceof FlinkBaseTaskWriter); + + Assert.assertTrue( + builder.buildWriter(WriteOperationKind.APPEND) instanceof FlinkBaseTaskWriter); + Assert.assertTrue( + builder.buildWriter(WriteOperationKind.OVERWRITE) instanceof FlinkBaseTaskWriter); + Assert.assertTrue( + builder.buildWriter(WriteOperationKind.MAJOR_OPTIMIZE) 
instanceof FlinkBaseTaskWriter); + Assert.assertTrue( + builder.buildWriter(WriteOperationKind.FULL_OPTIMIZE) instanceof FlinkBaseTaskWriter); + } + + @Test + public void testKeyedTableChangeWriteByLocationKind() throws IOException { + Assume.assumeTrue(isKeyedTable()); + Assume.assumeTrue(isPartitionedTable()); + testWrite(getMixedTable(), ChangeLocationKind.INSTANT, geneRowData(), "change"); + } + + @Test + public void testKeyedTableBaseWriteByLocationKind() throws IOException { + Assume.assumeTrue(isKeyedTable()); + Assume.assumeTrue(isPartitionedTable()); + testWrite(getMixedTable(), BaseLocationKind.INSTANT, geneRowData(), "base"); + } + + @Test + public void testKeyedTableHiveWriteByLocationKind() throws IOException { + Assume.assumeTrue(isKeyedTable()); + Assume.assumeTrue(isPartitionedTable()); + testWrite(getMixedTable(), HiveLocationKind.INSTANT, geneRowData(), "hive"); + } + + @Test + public void testUnPartitionKeyedTableChangeWriteByLocationKind() throws IOException { + Assume.assumeTrue(isKeyedTable()); + Assume.assumeFalse(isPartitionedTable()); + testWrite(getMixedTable(), ChangeLocationKind.INSTANT, geneRowData(), "change"); + } + + @Test + public void testUnPartitionKeyedTableBaseWriteByLocationKind() throws IOException { + Assume.assumeTrue(isKeyedTable()); + Assume.assumeFalse(isPartitionedTable()); + testWrite(getMixedTable(), BaseLocationKind.INSTANT, geneRowData(), "base"); + } + + @Test + public void testUnPartitionKeyedTableHiveWriteByLocationKind() throws IOException { + Assume.assumeTrue(isKeyedTable()); + Assume.assumeFalse(isPartitionedTable()); + testWrite(getMixedTable(), HiveLocationKind.INSTANT, geneRowData(), "hive"); + } + + @Test + public void testUnKeyedTableChangeWriteByLocationKind() throws IOException { + Assume.assumeFalse(isKeyedTable()); + Assume.assumeTrue(isPartitionedTable()); + try { + testWrite(getMixedTable(), ChangeLocationKind.INSTANT, geneRowData(), "change"); + } catch (Exception e) { + Assert.assertTrue(e 
instanceof IllegalArgumentException); + } + } + + @Test + public void testUnKeyedTableBaseWriteByLocationKind() throws IOException { + Assume.assumeFalse(isKeyedTable()); + Assume.assumeTrue(isPartitionedTable()); + testWrite(getMixedTable(), BaseLocationKind.INSTANT, geneRowData(), "base"); + } + + @Test + public void testUnKeyedTableHiveWriteByLocationKind() throws IOException { + Assume.assumeFalse(isKeyedTable()); + Assume.assumeTrue(isPartitionedTable()); + testWrite(getMixedTable(), HiveLocationKind.INSTANT, geneRowData(), "hive"); + } + + @Test + public void testUnPartitionUnKeyedTableChangeWriteByLocationKind() throws IOException { + Assume.assumeFalse(isKeyedTable()); + Assume.assumeFalse(isPartitionedTable()); + try { + testWrite(getMixedTable(), ChangeLocationKind.INSTANT, geneRowData(), "change"); + } catch (Exception e) { + Assert.assertTrue(e instanceof IllegalArgumentException); + } + } + + @Test + public void testUnPartitionUnKeyedTableBaseWriteByLocationKind() throws IOException { + Assume.assumeFalse(isKeyedTable()); + Assume.assumeFalse(isPartitionedTable()); + testWrite(getMixedTable(), BaseLocationKind.INSTANT, geneRowData(), "base"); + } + + @Test + public void testUnPartitionUnKeyedTableHiveWriteByLocationKind() throws IOException { + Assume.assumeFalse(isKeyedTable()); + Assume.assumeFalse(isPartitionedTable()); + testWrite(getMixedTable(), HiveLocationKind.INSTANT, geneRowData(), "hive"); + } + + public void testWrite( + MixedTable table, LocationKind locationKind, List records, String pathFeature) + throws IOException { + FlinkTaskWriterBuilder builder = + FlinkTaskWriterBuilder.buildFor(table) + .withFlinkSchema(FlinkSchemaUtil.convert(table.schema())); + + TaskWriter changeWrite = builder.buildWriter(locationKind); + for (RowData record : records) { + changeWrite.write(record); + } + WriteResult complete = changeWrite.complete(); + Arrays.stream(complete.dataFiles()) + .forEach(s -> 
Assert.assertTrue(s.path().toString().contains(pathFeature))); + CloseableIterable concat = + CloseableIterable.concat( + Arrays.stream(complete.dataFiles()) + .map( + s -> { + switch (s.format()) { + case PARQUET: + return readParquet(table.schema(), s.path().toString()); + case ORC: + return readOrc(table.schema(), s.path().toString()); + default: + throw new UnsupportedOperationException( + "Cannot read unknown format: " + s.format()); + } + }) + .collect(Collectors.toList())); + Set result = new HashSet<>(); + Iterators.addAll(result, concat.iterator()); + Assert.assertEquals(result, records.stream().collect(Collectors.toSet())); + } + + private CloseableIterable readParquet(Schema schema, String path) { + AdaptHiveParquet.ReadBuilder builder = + AdaptHiveParquet.read(Files.localInput(path)) + .project(schema) + .createReaderFunc( + fileSchema -> + AdaptHiveFlinkParquetReaders.buildReader(schema, fileSchema, new HashMap<>())) + .caseSensitive(false); + + CloseableIterable iterable = builder.build(); + return iterable; + } + + private CloseableIterable readOrc(Schema schema, String path) { + ORC.ReadBuilder builder = + ORC.read(Files.localInput(path)) + .project(schema) + .createReaderFunc(fileSchema -> new FlinkOrcReader(schema, fileSchema, new HashMap<>())) + .caseSensitive(false); + + CloseableIterable iterable = builder.build(); + return iterable; + } + + private List geneRowData() { + return Lists.newArrayList(geneRowData(1, "lily", 0, "2022-01-02T12:00:00")); + } + + private RowData geneRowData(int id, String name, long ts, String timestamp) { + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss"); + return GenericRowData.of( + id, + StringData.fromString(name), + ts, + TimestampData.fromLocalDateTime(LocalDateTime.parse(timestamp, formatter)), + TimestampData.fromLocalDateTime(LocalDateTime.parse(timestamp, formatter)), + DecimalData.fromBigDecimal(new BigDecimal("0"), 10, 0), + StringData.fromString(timestamp.substring(0, 
10))); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAutomaticDoubleWriteStatus.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAutomaticDoubleWriteStatus.java new file mode 100644 index 0000000000..fbdf6a4b7e --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAutomaticDoubleWriteStatus.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.write; + +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.AUTO_EMIT_LOGSTORE_WATERMARK_GAP; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.LOG_STORE_CATCH_UP; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.table.MixedTable; +import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.iceberg.UpdateProperties; +import org.junit.Assert; +import org.junit.Test; + +import java.time.Duration; + +public class TestAutomaticDoubleWriteStatus extends FlinkTestBase { + public MixedFormatTableLoader tableLoader; + + public TestAutomaticDoubleWriteStatus() { + super( + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, true)); + } + + @Test + public void testTableProperties() { + tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); + tableLoader.open(); + MixedTable mixedTable = tableLoader.loadMixedFormatTable(); + UpdateProperties up = mixedTable.updateProperties(); + up.set(AUTO_EMIT_LOGSTORE_WATERMARK_GAP.key(), "10"); + up.commit(); + AutomaticDoubleWriteStatus status = + new AutomaticDoubleWriteStatus(tableLoader, Duration.ofSeconds(10)); + status.open(); + + Assert.assertFalse(status.isDoubleWrite()); + status.processWatermark(new Watermark(System.currentTimeMillis() - 11 * 1000)); + Assert.assertFalse(status.isDoubleWrite()); + Assert.assertFalse(Boolean.parseBoolean(mixedTable.properties().get(LOG_STORE_CATCH_UP.key()))); + status.processWatermark(new Watermark(System.currentTimeMillis() - 9 * 1000)); + Assert.assertTrue(status.isDoubleWrite()); + Assert.assertTrue(status.isDoubleWrite()); + + mixedTable.refresh(); + 
Assert.assertTrue(Boolean.parseBoolean(mixedTable.properties().get(LOG_STORE_CATCH_UP.key()))); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAutomaticLogWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAutomaticLogWriter.java new file mode 100644 index 0000000000..37150e3563 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAutomaticLogWriter.java @@ -0,0 +1,429 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.write; + +import static org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate.getPropertiesWithByteArray; +import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.KAFKA_CONTAINER; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.AUTO_EMIT_LOGSTORE_WATERMARK_GAP; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.LOG_STORE_CATCH_UP; +import static org.apache.amoro.table.TableProperties.ENABLE_LOG_STORE; +import static org.apache.amoro.table.TableProperties.LOG_STORE_ADDRESS; +import static org.apache.amoro.table.TableProperties.LOG_STORE_MESSAGE_TOPIC; +import static org.apache.kafka.clients.CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate; +import org.apache.amoro.flink.kafka.testutils.KafkaContainerTest; +import org.apache.amoro.flink.metric.MetricsGenerator; +import org.apache.amoro.flink.shuffle.LogRecordV1; +import org.apache.amoro.flink.shuffle.ShuffleHelper; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.flink.util.DataUtil; +import org.apache.amoro.flink.util.MixedFormatUtils; +import org.apache.amoro.flink.util.TestGlobalAggregateManager; +import org.apache.amoro.flink.util.TestOneInputStreamOperatorIntern; +import org.apache.amoro.flink.write.hidden.kafka.HiddenKafkaFactory; +import org.apache.amoro.io.MixedDataTestHelpers; +import org.apache.amoro.log.LogDataJsonDeserialization; +import org.apache.amoro.table.KeyedTable; +import org.apache.amoro.utils.IdGenerator; +import org.apache.flink.streaming.api.CheckpointingMode; +import org.apache.flink.streaming.api.TimeCharacteristic; +import 
org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.CheckpointConfig; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.Schema; +import org.apache.iceberg.UpdateProperties; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.io.WriteResult; +import org.apache.iceberg.types.TypeUtil; +import org.apache.kafka.clients.consumer.ConsumerRecords; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.jupiter.api.Assertions; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.time.Duration; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Optional; +import java.util.Properties; +import java.util.Set; +import java.util.stream.Collectors; + +@RunWith(Parameterized.class) +public class TestAutomaticLogWriter extends FlinkTestBase { + private static final Logger LOG = LoggerFactory.getLogger(TestAutomaticLogWriter.class); + public MixedFormatTableLoader tableLoader; + public static final TestGlobalAggregateManager GLOBAL_AGGREGATE_MANGER = + new TestGlobalAggregateManager(); + + private final boolean isGapNone; + private final boolean logstoreEnabled; + + public 
TestAutomaticLogWriter(boolean isGapNone, boolean logstoreEnabled) { + super( + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, true)); + this.isGapNone = isGapNone; + this.logstoreEnabled = logstoreEnabled; + } + + @Parameterized.Parameters(name = "isGapNone={0}, logstoreEnabled={1}") + public static Object[][] parameters() { + return new Object[][] { + {true, true}, + {false, false}, + {false, true}, + {true, false} + }; + } + + @BeforeClass + public static void prepare() throws Exception { + KAFKA_CONTAINER.start(); + } + + @AfterClass + public static void shutdown() throws Exception { + KAFKA_CONTAINER.close(); + } + + @Before + public void init() { + tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); + tableLoader.open(); + } + + @Test + public void testHasCaughtUp() throws Exception { + String topic = + Thread.currentThread().getStackTrace()[1].getMethodName() + isGapNone + logstoreEnabled; + + final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + env.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE); + env.getCheckpointConfig() + .enableExternalizedCheckpoints( + CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); + env.getConfig().setAutoWatermarkInterval(10); + + List expects = new LinkedList<>(); + DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + expects.add( + new Object[] { + 1000004, + "a", + LocalDateTime.parse("2022-06-17 10:10:11", dtf).toEpochSecond(ZoneOffset.UTC), + LocalDateTime.parse("2022-06-17 10:10:11", dtf) + }); + expects.add( + new Object[] { + 1000015, + "b", + LocalDateTime.parse("2022-06-17 10:08:11", dtf).toEpochSecond(ZoneOffset.UTC), + LocalDateTime.parse("2022-06-17 10:08:11", dtf) + }); + expects.add( + new Object[] { + 1000011, + "c", + LocalDateTime.parse("2022-06-18 10:10:11", dtf).toEpochSecond(ZoneOffset.UTC), + LocalDateTime.parse("2022-06-18 
10:10:11", dtf) + }); + List catchUpExpects = new LinkedList<>(); + catchUpExpects.add( + new Object[] { + 1000014, + "d", + LocalDateTime.now().minusSeconds(3).toEpochSecond(ZoneOffset.UTC), + LocalDateTime.now().minusSeconds(3) + }); + catchUpExpects.add( + new Object[] { + 1000021, + "d", + LocalDateTime.now().minusSeconds(2).toEpochSecond(ZoneOffset.UTC), + LocalDateTime.now().minusSeconds(2) + }); + catchUpExpects.add( + new Object[] { + 1000015, + "e", + LocalDateTime.now().minusSeconds(1).toEpochSecond(ZoneOffset.UTC), + LocalDateTime.now().minusSeconds(1) + }); + expects.addAll(catchUpExpects); + + DataStream input = + env.fromElements(expects.stream().map(DataUtil::toRowData).toArray(RowData[]::new)); + + KeyedTable testKeyedTable = getMixedTable().asKeyedTable(); + UpdateProperties up = testKeyedTable.updateProperties(); + up.set(LOG_STORE_ADDRESS, KAFKA_CONTAINER.getBootstrapServers()); + up.set(LOG_STORE_MESSAGE_TOPIC, topic); + if (logstoreEnabled) { + up.set(ENABLE_LOG_STORE, "true"); + } else { + up.set(ENABLE_LOG_STORE, "false"); + } + up.set(LOG_STORE_CATCH_UP.key(), "true"); + up.commit(); + + FlinkSink.forRowData(input) + .context(Optional::of) + .table(testKeyedTable) + .tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) + .flinkSchema(FLINK_SCHEMA) + .producerConfig(getPropertiesByTopic(topic)) + .topic(topic) + .build(); + + env.execute(); + + testKeyedTable.changeTable().refresh(); + List actual = MixedDataTestHelpers.readKeyedTable(testKeyedTable, null); + + Set expected = toRecords(DataUtil.toRowSet(expects)); + Assert.assertEquals(expected, new HashSet<>(actual)); + if (logstoreEnabled) { + checkLogstoreDataAccuracy(topic, expects); + } else { + checkLogstoreDataAccuracy(topic, new ArrayList<>()); + } + } + + @Test + public void testHasNotCaughtUp() throws Exception { + String topic = + Thread.currentThread().getStackTrace()[1].getMethodName() + isGapNone + logstoreEnabled; + byte[] jobId = 
IdGenerator.generateUpstreamId(); + Duration gap; + KeyedTable testKeyedTable = getMixedTable().asKeyedTable(); + UpdateProperties up = testKeyedTable.updateProperties(); + up.set(LOG_STORE_ADDRESS, KAFKA_CONTAINER.getBootstrapServers()); + up.set(LOG_STORE_MESSAGE_TOPIC, topic); + up.set(ENABLE_LOG_STORE, "true"); + if (!isGapNone) { + up.set(AUTO_EMIT_LOGSTORE_WATERMARK_GAP.key(), "20"); + } + up.commit(); + + if (isGapNone) { + gap = null; + } else { + gap = Duration.ofSeconds(20); + } + + List expects = new LinkedList<>(); + List results; + testKeyedTable.refresh(); + Assert.assertFalse( + Boolean.parseBoolean( + testKeyedTable.properties().getOrDefault(LOG_STORE_CATCH_UP.key(), "false"))); + try (TestOneInputStreamOperatorIntern harness = + createSingleProducer(1, jobId, topic, gap)) { + DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + expects.add( + new Object[] { + 1000004, + "a", + LocalDateTime.parse("2022-06-17 10:10:11", dtf).toEpochSecond(ZoneOffset.UTC), + LocalDateTime.parse("2022-06-17 10:10:11", dtf) + }); + expects.add( + new Object[] { + 1000015, + "b", + LocalDateTime.parse("2022-06-17 10:18:11", dtf).toEpochSecond(ZoneOffset.UTC), + LocalDateTime.parse("2022-06-17 10:18:11", dtf) + }); + expects.add( + new Object[] { + 1000011, + "c", + LocalDateTime.parse("2022-06-18 10:10:11", dtf).toEpochSecond(ZoneOffset.UTC), + LocalDateTime.parse("2022-06-18 10:10:11", dtf) + }); + long checkpoint = 0; + + harness.setup(); + harness.initializeEmptyState(); + harness.open(); + harness.processElement(new StreamRecord<>(createRowData(RowKind.INSERT, expects.get(0)))); + harness.processWatermark(1); + harness.prepareSnapshotPreBarrier(++checkpoint); + harness.snapshot(1, 1); + harness.notifyOfCompletedCheckpoint(checkpoint); + harness.processElement(new StreamRecord<>(createRowData(RowKind.INSERT, expects.get(1)))); + harness.processWatermark(System.currentTimeMillis() - 1000); + harness.prepareSnapshotPreBarrier(++checkpoint); + 
harness.snapshot(2, 1); + harness.notifyOfCompletedCheckpoint(checkpoint); + harness.processElement(new StreamRecord<>(createRowData(RowKind.INSERT, expects.get(2)))); + harness.processWatermark(System.currentTimeMillis()); + harness.prepareSnapshotPreBarrier(++checkpoint); + harness.snapshot(3, 1); + harness.notifyOfCompletedCheckpoint(checkpoint); + + results = harness.extractOutputValues(); + } catch (Throwable e) { + LOG.error("", e); + throw e; + } + + // check expects accuracy. + Assert.assertEquals(3, results.size()); + results.forEach(result -> Assert.assertEquals(1, result.dataFiles().length)); + List expected = isGapNone ? expects : expects.subList(2, expects.size()); + checkLogstoreDataAccuracy(topic, expected); + testKeyedTable.refresh(); + if (!isGapNone) { + Assert.assertTrue( + Boolean.parseBoolean(testKeyedTable.properties().get(LOG_STORE_CATCH_UP.key()))); + } + } + + private void checkLogstoreDataAccuracy(String topic, List expects) { + LogDataJsonDeserialization logDataJsonDeserialization = + new LogDataJsonDeserialization<>( + TABLE_SCHEMA, LogRecordV1.factory, LogRecordV1.arrayFactory, LogRecordV1.mapFactory); + ConsumerRecords consumerRecords = KafkaContainerTest.readRecordsBytes(topic); + Assertions.assertEquals(expects.size(), consumerRecords.count()); + List actual = new ArrayList<>(); + consumerRecords.forEach( + consumerRecord -> { + try { + actual.add( + logDataJsonDeserialization.deserialize(consumerRecord.value()).getActualValue()); + } catch (IOException e) { + e.printStackTrace(); + } + }); + Collection expected = DataUtil.toRowData(expects); + Assertions.assertEquals( + expected.stream() + .sorted(Comparator.comparing(RowData::toString)) + .collect(Collectors.toList()), + actual.stream() + .sorted(Comparator.comparing(RowData::toString)) + .collect(Collectors.toList())); + } + + public TestOneInputStreamOperatorIntern createSingleProducer( + int maxParallelism, byte[] jobId, String topic, Duration writeLogstoreWatermarkGap) + throws 
Exception { + return createProducer( + maxParallelism, + maxParallelism, + 0, + null, + jobId, + GLOBAL_AGGREGATE_MANGER, + topic, + writeLogstoreWatermarkGap); + } + + private TestOneInputStreamOperatorIntern createProducer( + int maxParallelism, + int parallelism, + int subTaskId, + Long restoredCheckpointId, + byte[] jobId, + TestGlobalAggregateManager testGlobalAggregateManager, + String topic, + Duration writeLogstoreWatermarkGap) + throws Exception { + AutomaticLogWriter automaticLogWriter = + new AutomaticLogWriter( + TABLE_SCHEMA, + getPropertiesByTopic(topic), + topic, + new HiddenKafkaFactory<>(), + LogRecordV1.FIELD_GETTER_FACTORY, + jobId, + ShuffleHelper.EMPTY, + tableLoader, + writeLogstoreWatermarkGap); + + KeyedTable testKeyedTable = getMixedTable().asKeyedTable(); + RowType flinkSchemaRowType = (RowType) FLINK_SCHEMA.toRowDataType().getLogicalType(); + Schema writeSchema = + TypeUtil.reassignIds(FlinkSchemaUtil.convert(FLINK_SCHEMA), testKeyedTable.schema()); + MetricsGenerator metricsGenerator = + MixedFormatUtils.getMetricsGenerator( + false, false, testKeyedTable, flinkSchemaRowType, writeSchema); + + MixedFormatFileWriter streamWriter = + FlinkSink.createFileWriter( + testKeyedTable, + null, + false, + (RowType) FLINK_SCHEMA.toRowDataType().getLogicalType(), + tableLoader); + + MixedFormatWriter mixedFormatWriter = + new MixedFormatWriter<>(automaticLogWriter, streamWriter, metricsGenerator); + + TestOneInputStreamOperatorIntern harness = + new TestOneInputStreamOperatorIntern<>( + mixedFormatWriter, + maxParallelism, + parallelism, + subTaskId, + restoredCheckpointId, + testGlobalAggregateManager); + harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime); + return harness; + } + + private static Properties getPropertiesByTopic(String topic) { + Properties properties = new Properties(); + properties.put(BOOTSTRAP_SERVERS_CONFIG, KAFKA_CONTAINER.getBootstrapServers()); + properties = 
getPropertiesWithByteArray(KafkaConfigGenerate.getStandardProperties(properties)); + properties.put(LOG_STORE_MESSAGE_TOPIC, topic); + properties.put(ProducerConfig.ACKS_CONFIG, "all"); + properties.put(ProducerConfig.BATCH_SIZE_CONFIG, "0"); + return properties; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestFlinkSink.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestFlinkSink.java new file mode 100644 index 0000000000..226d721155 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestFlinkSink.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.write; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.flink.util.DataUtil; +import org.apache.amoro.io.MixedDataTestHelpers; +import org.apache.amoro.table.KeyedTable; +import org.apache.amoro.table.UnkeyedTable; +import org.apache.flink.streaming.api.CheckpointingMode; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.CheckpointConfig; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.data.Record; +import org.junit.Assert; +import org.junit.Assume; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Optional; +import java.util.Set; + +@RunWith(Parameterized.class) +public class TestFlinkSink extends FlinkTestBase { + + public TestFlinkSink(boolean isKeyed) { + super( + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(isKeyed, false)); + } + + @Parameterized.Parameters(name = "{0}") + public static Collection parameters() { + return Arrays.asList(new Object[][] {{true}, {false}}); + } + + @Test + public void testKeyedSink() throws Exception { + Assume.assumeTrue(isKeyedTable()); + final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + KeyedTable testKeyedTable = getMixedTable().asKeyedTable(); + + env.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE); + 
env.getCheckpointConfig() + .enableExternalizedCheckpoints( + CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); + + List data = new LinkedList<>(); + data.add( + new Object[] { + 1000004, + "a", + LocalDateTime.parse("2022-06-17T10:10:11.0").toEpochSecond(ZoneOffset.UTC), + LocalDateTime.parse("2022-06-17T10:10:11.0") + }); + data.add( + new Object[] { + 1000015, + "b", + LocalDateTime.parse("2022-06-17T10:08:11.0").toEpochSecond(ZoneOffset.UTC), + LocalDateTime.parse("2022-06-17T10:08:11.0") + }); + data.add( + new Object[] { + 1000011, + "c", + LocalDateTime.parse("2022-06-18T10:10:11.0").toEpochSecond(ZoneOffset.UTC), + LocalDateTime.parse("2022-06-18T10:10:11.0") + }); + data.add( + new Object[] { + 1000014, + "d", + LocalDateTime.parse("2022-06-17T10:11:11.0").toEpochSecond(ZoneOffset.UTC), + LocalDateTime.parse("2022-06-17T10:11:11.0") + }); + data.add( + new Object[] { + 1000021, + "d", + LocalDateTime.parse("2022-06-17T16:10:11.0").toEpochSecond(ZoneOffset.UTC), + LocalDateTime.parse("2022-06-17T16:10:11.0") + }); + data.add( + new Object[] { + 1000015, + "e", + LocalDateTime.parse("2022-06-17T10:10:11.0").toEpochSecond(ZoneOffset.UTC), + LocalDateTime.parse("2022-06-17T10:10:11.0") + }); + + DataStream input = + env.fromElements(data.stream().map(DataUtil::toRowData).toArray(RowData[]::new)); + + FlinkSink.forRowData(input) + .context(Optional::of) + .table(testKeyedTable) + .tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) + .flinkSchema(FLINK_SCHEMA) + .build(); + + env.execute(); + + testKeyedTable.changeTable().refresh(); + List actual = MixedDataTestHelpers.readKeyedTable(testKeyedTable, null); + + Set expected = toRecords(DataUtil.toRowSet(data)); + Assert.assertEquals(expected, new HashSet<>(actual)); + } + + @Test + public void testUnkeyedSink() throws Exception { + Assume.assumeFalse(isKeyedTable()); + final StreamExecutionEnvironment env = 
StreamExecutionEnvironment.getExecutionEnvironment(); + UnkeyedTable testTable = getMixedTable().asUnkeyedTable(); + + env.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE); + env.getCheckpointConfig() + .enableExternalizedCheckpoints( + CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); + + List data = new LinkedList<>(); + data.add( + new Object[] {1000004, "a", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add( + new Object[] {1000015, "b", 1655513411000L, LocalDateTime.parse("2022-06-17T10:08:11.0")}); + data.add( + new Object[] {1000011, "c", 1655599811000L, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add( + new Object[] {1000014, "d", 1655513411000L, LocalDateTime.parse("2022-06-17T10:11:11.0")}); + data.add( + new Object[] {1000021, "d", 1655513411000L, LocalDateTime.parse("2022-06-17T16:10:11.0")}); + data.add( + new Object[] {1000015, "e", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + + DataStream input = + env.fromElements(data.stream().map(DataUtil::toRowData).toArray(RowData[]::new)); + + FlinkSink.forRowData(input) + .context(Optional::of) + .table(testTable) + .tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) + .flinkSchema(FLINK_SCHEMA) + .build(); + + env.execute(); + testTable.refresh(); + Set actual = DataUtil.read(testTable); + + Set expected = toRecords(DataUtil.toRowSet(data)); + Assert.assertEquals(expected, actual); + } + + @Test + public void testUnkeyedOverwrite() throws Exception { + Assume.assumeFalse(isKeyedTable()); + final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + UnkeyedTable testTable = getMixedTable().asUnkeyedTable(); + + env.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE); + env.getCheckpointConfig() + .enableExternalizedCheckpoints( + CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); + + List data = new LinkedList<>(); + data.add( + 
new Object[] {1000004, "a", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add( + new Object[] {1000015, "b", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add( + new Object[] {1000011, "c", 1655599811000L, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add( + new Object[] {1000014, "d", 1655599811000L, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add( + new Object[] {1000021, "d", 1655599811000L, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add( + new Object[] {1000015, "e", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + + DataStream input = + env.fromElements(data.stream().map(DataUtil::toRowData).toArray(RowData[]::new)); + + FlinkSink.forRowData(input) + .context(Optional::of) + .table(testTable) + .tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) + .flinkSchema(FLINK_SCHEMA) + .build(); + env.execute(); + + data.clear(); + data.add(new Object[] {12, "d", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {11, "a", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {15, "c", 1655599811000L, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {21, "k", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + data.add(new Object[] {91, "l", 1655599811000L, LocalDateTime.parse("2022-06-18T10:10:11.0")}); + data.add(new Object[] {74, "m", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); + + DataStream overwrite = + env.fromElements(data.stream().map(DataUtil::toRowData).toArray(RowData[]::new)); + + FlinkSink.forRowData(overwrite) + .context(Optional::of) + .table(testTable) + .tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) + .overwrite(true) + .flinkSchema(FLINK_SCHEMA) + .build(); + + env.execute(); + testTable.refresh(); + Set actual = DataUtil.read(testTable); + + Set expected 
= toRecords(DataUtil.toRowSet(data)); + Assert.assertEquals(expected, actual); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestMixedFormatFileCommitter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestMixedFormatFileCommitter.java new file mode 100644 index 0000000000..6b3b572e33 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestMixedFormatFileCommitter.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.write; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.flink.util.MixedFormatUtils; +import org.apache.amoro.table.KeyedTable; +import org.apache.amoro.table.MixedTable; +import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.TableScan; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.WriteResult; +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; + +public class TestMixedFormatFileCommitter extends FlinkTestBase { + public MixedFormatTableLoader tableLoader; + + public TestMixedFormatFileCommitter() { + super( + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(true, true)); + } + + public OneInputStreamOperatorTestHarness createMixedFormatFileCommitter( + MixedFormatTableLoader tableLoader, + MixedTable table, + OperatorSubtaskState operatorSubtaskState) + throws Exception { + OneInputStreamOperator committer = + FlinkSink.createFileCommitter( + table, tableLoader, false, SnapshotRef.MAIN_BRANCH, table.spec()); + OneInputStreamOperatorTestHarness harness = + new OneInputStreamOperatorTestHarness<>(committer, 1, 1, 0); + + harness.setup(); + if (operatorSubtaskState == null) { + harness.initializeEmptyState(); + } else { + 
harness.initializeState(operatorSubtaskState); + } + harness.open(); + + return harness; + } + + public void checkChangeFiles(int fileCnt, int recordCnt, KeyedTable table) { + table.changeTable().refresh(); + TableScan tableScan = table.changeTable().newScan(); + CloseableIterable fileScanTasks = tableScan.planFiles(); + int actualFileCnt = 0; + int actualRecordCnt = 0; + for (FileScanTask fileScanTask : fileScanTasks) { + actualFileCnt++; + actualRecordCnt += fileScanTask.file().recordCount(); + } + Assert.assertEquals(fileCnt, actualFileCnt); + Assert.assertEquals(recordCnt, actualRecordCnt); + } + + @Test + public void testCommit() throws Exception { + tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); + KeyedTable table = MixedFormatUtils.loadMixedTable(tableLoader).asKeyedTable(); + + List completedFiles = prepareChangeFiles(); + OperatorSubtaskState snapshot; + long checkpoint = 1; + try (OneInputStreamOperatorTestHarness testHarness = + createMixedFormatFileCommitter(tableLoader, table, null)) { + + for (WriteResult completedFile : completedFiles) { + testHarness.processElement(new StreamRecord<>(completedFile)); + } + snapshot = testHarness.snapshot(checkpoint, System.currentTimeMillis()); + } + + try (OneInputStreamOperatorTestHarness testHarness = + createMixedFormatFileCommitter(tableLoader, table, snapshot)) { + testHarness.notifyOfCompletedCheckpoint(checkpoint); + } + + checkChangeFiles(7, 9, table); + } + + private List prepareChangeFiles() throws Exception { + List changeFiles; + long checkpointId = 1L; + try (OneInputStreamOperatorTestHarness testHarness = + TestMixedFormatFileWriter.createMixedFormatStreamWriter(tableLoader)) { + // The first checkpoint + testHarness.processElement(createRowData(1, "hello", "2020-10-11T10:10:11.0"), 1); + testHarness.processElement(createRowData(2, "hello", "2020-10-12T10:10:11.0"), 1); + testHarness.processElement(createRowData(3, "hello", "2020-10-13T10:10:11.0"), 1); + + 
testHarness.prepareSnapshotPreBarrier(checkpointId); + Assert.assertEquals(1, testHarness.extractOutputValues().size()); + Assert.assertEquals(3, testHarness.extractOutputValues().get(0).dataFiles().length); + + checkpointId = checkpointId + 1; + + // The second checkpoint + testHarness.processElement(createRowData(1, "hello", "2020-10-12T10:10:11.0"), 1); + testHarness.processElement( + createRowData(2, "hello", "2020-10-12T10:10:11.0", RowKind.UPDATE_BEFORE), 1); + testHarness.processElement( + createRowData(2, "hello0", "2020-10-12T10:10:11.0", RowKind.UPDATE_AFTER), 1); + testHarness.processElement( + createRowData(3, "hello", "2020-10-12T10:10:11.0", RowKind.DELETE), 1); + testHarness.processElement(createRowData(5, "hello", "2020-10-12T10:10:11.0"), 1); + testHarness.processElement(createRowData(6, "hello", "2020-10-12T10:10:11.0"), 1); + + testHarness.prepareSnapshotPreBarrier(checkpointId); + // testHarness.extractOutputValues() compute the sum + Assert.assertEquals(2, testHarness.extractOutputValues().size()); + Assert.assertEquals(4, testHarness.extractOutputValues().get(1).dataFiles().length); + changeFiles = testHarness.extractOutputValues(); + } + return changeFiles; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestMixedFormatFileWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestMixedFormatFileWriter.java new file mode 100644 index 0000000000..889fd74e1b --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestMixedFormatFileWriter.java @@ -0,0 +1,327 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.write; + +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SUBMIT_EMPTY_SNAPSHOTS; + +import org.apache.amoro.BasicTableTestHelper; +import org.apache.amoro.TableFormat; +import org.apache.amoro.TableTestHelper; +import org.apache.amoro.catalog.BasicCatalogTestHelper; +import org.apache.amoro.flink.FlinkTestBase; +import org.apache.amoro.flink.table.MixedFormatTableLoader; +import org.apache.amoro.flink.util.TestGlobalAggregateManager; +import org.apache.amoro.flink.util.TestOneInputStreamOperatorIntern; +import org.apache.amoro.table.MixedTable; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.RowKind; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.SerializableTable; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory; +import org.apache.iceberg.flink.sink.TaskWriterFactory; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.WriteResult; +import org.junit.Assert; +import org.junit.Assume; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.util.HashMap; +import java.util.List; + +@RunWith(Parameterized.class) +public class 
TestMixedFormatFileWriter extends FlinkTestBase { + + public static final long TARGET_FILE_SIZE = 128 * 1024 * 1024; + public MixedFormatTableLoader tableLoader; + private final boolean submitEmptySnapshots; + + @Parameterized.Parameters(name = "{0}, {1}") + public static Object[][] parameters() { + return new Object[][] { + {true, false}, + {true, true}, + {false, false}, + {false, true} + }; + } + + public TestMixedFormatFileWriter(boolean isKeyed, boolean submitEmptySnapshots) { + super( + new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), + new BasicTableTestHelper(isKeyed, true)); + this.submitEmptySnapshots = submitEmptySnapshots; + } + + public static OneInputStreamOperatorTestHarness + createMixedFormatStreamWriter(MixedFormatTableLoader tableLoader) throws Exception { + return createMixedFormatStreamWriter(tableLoader, true, null); + } + + public static OneInputStreamOperatorTestHarness + createMixedFormatStreamWriter( + MixedFormatTableLoader tableLoader, + boolean submitEmptySnapshots, + Long restoredCheckpointId) + throws Exception { + OneInputStreamOperatorTestHarness harness = + doCreateMixedFormatStreamWriter(tableLoader, submitEmptySnapshots, restoredCheckpointId); + + harness.setup(); + harness.open(); + + return harness; + } + + public static OneInputStreamOperatorTestHarness + doCreateMixedFormatStreamWriter( + MixedFormatTableLoader tableLoader, + boolean submitEmptySnapshots, + Long restoredCheckpointId) + throws Exception { + tableLoader.open(); + MixedTable mixedTable = tableLoader.loadMixedFormatTable(); + mixedTable.properties().put(SUBMIT_EMPTY_SNAPSHOTS.key(), String.valueOf(submitEmptySnapshots)); + + MixedFormatFileWriter streamWriter = + FlinkSink.createFileWriter( + mixedTable, + null, + false, + (RowType) FLINK_SCHEMA.toRowDataType().getLogicalType(), + tableLoader); + TestOneInputStreamOperatorIntern harness = + new TestOneInputStreamOperatorIntern<>( + streamWriter, 1, 1, 0, restoredCheckpointId, new 
TestGlobalAggregateManager()); + + return harness; + } + + public static TaskWriter createUnkeyedTaskWriter( + Table table, long targetFileSize, FileFormat format, RowType rowType) { + TaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), + rowType, + targetFileSize, + format, + new HashMap<>(), + null, + false); + taskWriterFactory.initialize(1, 1); + return taskWriterFactory.create(); + } + + @Test + public void testInsertWrite() throws Exception { + Assume.assumeTrue(isKeyedTable()); + tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); + long checkpointId = 1L; + try (OneInputStreamOperatorTestHarness testHarness = + createMixedFormatStreamWriter(tableLoader)) { + MixedFormatFileWriter fileWriter = (MixedFormatFileWriter) testHarness.getOneInputOperator(); + Assert.assertNotNull(fileWriter.getWriter()); + // The first checkpoint + testHarness.processElement(createRowData(1, "hello", "2020-10-11T10:10:11.0"), 1); + testHarness.processElement(createRowData(2, "hello", "2020-10-12T10:10:11.0"), 1); + testHarness.processElement(createRowData(3, "hello", "2020-10-13T10:10:11.0"), 1); + + testHarness.prepareSnapshotPreBarrier(checkpointId); + Assert.assertNull(fileWriter.getWriter()); + Assert.assertEquals(1, testHarness.extractOutputValues().size()); + Assert.assertEquals(3, testHarness.extractOutputValues().get(0).dataFiles().length); + + checkpointId = checkpointId + 1; + + // The second checkpoint + testHarness.processElement(createRowData(1, "hello", "2020-10-12T10:10:11.0"), 1); + Assert.assertNotNull(fileWriter.getWriter()); + testHarness.processElement(createRowData(2, "hello", "2020-10-12T10:10:11.0"), 1); + testHarness.processElement(createRowData(3, "hello", "2020-10-12T10:10:11.0"), 1); + + testHarness.prepareSnapshotPreBarrier(checkpointId); + // testHarness.extractOutputValues() calculates the cumulative value + List completedFiles = testHarness.extractOutputValues(); 
+ Assert.assertEquals(2, completedFiles.size()); + Assert.assertEquals(3, completedFiles.get(1).dataFiles().length); + } + } + + @Test + public void testSnapshotMultipleTimes() throws Exception { + long checkpointId = 1; + long timestamp = 1; + + tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); + try (OneInputStreamOperatorTestHarness testHarness = + createMixedFormatStreamWriter(tableLoader)) { + testHarness.processElement(createRowData(1, "hello", "2020-10-11T10:10:11.0"), timestamp++); + testHarness.processElement(createRowData(2, "hello", "2020-10-12T10:10:11.0"), timestamp); + testHarness.processElement(createRowData(3, "hello", "2020-10-13T10:10:11.0"), timestamp); + + testHarness.prepareSnapshotPreBarrier(checkpointId++); + long expectedDataFiles = 3; + WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); + Assert.assertEquals(0, result.deleteFiles().length); + Assert.assertEquals(expectedDataFiles, result.dataFiles().length); + + // snapshot again immediately. 
+ for (int i = 0; i < 5; i++) { + testHarness.prepareSnapshotPreBarrier(checkpointId++); + + result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); + Assert.assertEquals(0, result.deleteFiles().length); + Assert.assertEquals(expectedDataFiles, result.dataFiles().length); + } + } + } + + @Test + public void testInsertWriteWithoutPk() throws Exception { + Assume.assumeFalse(isKeyedTable()); + tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); + long checkpointId = 1L; + try (OneInputStreamOperatorTestHarness testHarness = + createMixedFormatStreamWriter(tableLoader)) { + // The first checkpoint + testHarness.processElement(createRowData(1, "hello", "2020-10-11T10:10:11.0"), 1); + testHarness.processElement(createRowData(2, "hello", "2020-10-12T10:10:11.0"), 1); + testHarness.processElement(createRowData(3, "hello", "2020-10-13T10:10:11.0"), 1); + + testHarness.prepareSnapshotPreBarrier(checkpointId); + Assert.assertEquals(1, testHarness.extractOutputValues().size()); + Assert.assertEquals(3, testHarness.extractOutputValues().get(0).dataFiles().length); + + checkpointId = checkpointId + 1; + + // The second checkpoint + testHarness.processElement(createRowData(1, "hello", "2020-10-12T10:10:11.0"), 1); + testHarness.processElement(createRowData(2, "hello", "2020-10-12T10:10:11.0"), 1); + testHarness.processElement(createRowData(3, "hello", "2020-10-12T10:10:11.0"), 1); + + testHarness.prepareSnapshotPreBarrier(checkpointId); + // testHarness.extractOutputValues() calculates the cumulative value + List completedFiles = testHarness.extractOutputValues(); + Assert.assertEquals(2, completedFiles.size()); + Assert.assertEquals(1, completedFiles.get(1).dataFiles().length); + } + } + + @Test + public void testDeleteWrite() throws Exception { + Assume.assumeTrue(isKeyedTable()); + tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); + long checkpointId = 1L; + try 
(OneInputStreamOperatorTestHarness testHarness = + createMixedFormatStreamWriter(tableLoader)) { + // The first checkpoint + testHarness.processElement( + createRowData(1, "hello", "2020-10-11T10:10:11.0", RowKind.INSERT), 1); + testHarness.processElement( + createRowData(2, "hello", "2020-10-12T10:10:11.0", RowKind.INSERT), 1); + testHarness.processElement( + createRowData(1, "hello", "2020-10-11T10:10:11.0", RowKind.DELETE), 1); + testHarness.processElement( + createRowData(1, "hello", "2020-10-11T10:10:11.0", RowKind.DELETE), 1); + + testHarness.prepareSnapshotPreBarrier(checkpointId); + Assert.assertEquals(1, testHarness.extractOutputValues().size()); + Assert.assertEquals(3, testHarness.extractOutputValues().get(0).dataFiles().length); + + checkpointId = checkpointId + 1; + + // The second checkpoint + testHarness.processElement( + createRowData(1, "hello", "2020-10-12T10:10:11.0", RowKind.INSERT), 1); + testHarness.processElement( + createRowData(2, "hello", "2020-10-12T10:10:11.0", RowKind.DELETE), 1); + testHarness.processElement( + createRowData(3, "hello", "2020-10-12T10:10:11.0", RowKind.DELETE), 1); + + testHarness.prepareSnapshotPreBarrier(checkpointId); + // testHarness.extractOutputValues() calculates the cumulative value + Assert.assertEquals(2, testHarness.extractOutputValues().size()); + Assert.assertEquals(3, testHarness.extractOutputValues().get(1).dataFiles().length); + } + } + + @Test + public void testUpdateWrite() throws Exception { + Assume.assumeTrue(isKeyedTable()); + tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); + long checkpointId = 1L; + try (OneInputStreamOperatorTestHarness testHarness = + createMixedFormatStreamWriter(tableLoader)) { + // The first checkpoint + testHarness.processElement( + createRowData(1, "hello", "2020-10-11T10:10:11.0", RowKind.INSERT), 1); + testHarness.processElement( + createRowData(1, "hello", "2020-10-11T10:10:11.0", RowKind.UPDATE_BEFORE), 1); + 
testHarness.processElement( + createRowData(1, "hi", "2020-10-11T10:10:11.0", RowKind.UPDATE_AFTER), 1); + testHarness.processElement( + createRowData(1, "hello", "2020-10-13T10:10:11.0", RowKind.UPDATE_AFTER), 1); + + testHarness.prepareSnapshotPreBarrier(checkpointId); + Assert.assertEquals(1, testHarness.extractOutputValues().size()); + Assert.assertEquals(3, testHarness.extractOutputValues().get(0).dataFiles().length); + + checkpointId = checkpointId + 1; + + // The second checkpoint + testHarness.processElement( + createRowData(1, "hello", "2020-10-12T10:10:11.0", RowKind.UPDATE_AFTER), 1); + testHarness.processElement(createRowData(2, "h", "2020-10-12T10:10:11.0"), 1); + testHarness.processElement( + createRowData(2, "hello", "2020-10-12T10:10:11.0", RowKind.UPDATE_AFTER), 1); + testHarness.processElement( + createRowData(2, "hello", "2020-10-12T10:10:11.0", RowKind.DELETE), 1); + + testHarness.prepareSnapshotPreBarrier(checkpointId); + // testHarness.extractOutputValues() calculates the cumulative value + Assert.assertEquals(2, testHarness.extractOutputValues().size()); + Assert.assertEquals(3, testHarness.extractOutputValues().get(1).dataFiles().length); + } + } + + @Test + public void testEmitEmptyResults() throws Exception { + Assume.assumeTrue(isKeyedTable()); + tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); + long checkpointId = 1L; + long excepted = submitEmptySnapshots ? 
1 : 0; + try (OneInputStreamOperatorTestHarness testHarness = + createMixedFormatStreamWriter(tableLoader, submitEmptySnapshots, null)) { + // The first checkpoint + + testHarness.prepareSnapshotPreBarrier(checkpointId); + Assert.assertEquals(excepted, testHarness.extractOutputValues().size()); + + checkpointId = checkpointId + 1; + + // The second checkpoint + testHarness.prepareSnapshotPreBarrier(checkpointId); + Assert.assertEquals(excepted, testHarness.extractOutputValues().size()); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestBaseLog.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestBaseLog.java new file mode 100644 index 0000000000..76fc10446d --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestBaseLog.java @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.amoro.flink.write.hidden.kafka; + +import static org.apache.amoro.flink.shuffle.LogRecordV1.arrayFactory; +import static org.apache.amoro.flink.shuffle.LogRecordV1.mapFactory; + +import org.apache.amoro.data.ChangeAction; +import org.apache.amoro.flink.shuffle.LogRecordV1; +import org.apache.amoro.log.FormatVersion; +import org.apache.amoro.log.LogData; +import org.apache.amoro.log.LogDataJsonDeserialization; +import org.apache.amoro.table.PrimaryKeySpec; +import org.apache.amoro.utils.IdGenerator; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.types.Types; + +import java.util.ArrayList; + +public class TestBaseLog { + public static final Schema USER_SCHEMA = + new Schema( + new ArrayList() { + { + add(Types.NestedField.optional(0, "f_boolean", Types.BooleanType.get())); + add(Types.NestedField.optional(1, "f_int", Types.IntegerType.get())); + add(Types.NestedField.optional(2, "f_long", Types.LongType.get())); + add( + Types.NestedField.optional( + 3, + "f_struct", + Types.StructType.of( + Types.NestedField.optional(4, "f_sub_boolean", Types.BooleanType.get()), + Types.NestedField.optional(5, "f_sub_int", Types.IntegerType.get()), + Types.NestedField.optional(6, "f_sub_long", Types.LongType.get()), + Types.NestedField.optional(7, "f_sub_string", Types.StringType.get()), + Types.NestedField.optional(8, "f_sub_time", Types.TimeType.get()), + Types.NestedField.optional( + 9, "f_sub_decimal", Types.DecimalType.of(38, 18)), + Types.NestedField.optional(10, "f_sub_float", Types.FloatType.get()), + Types.NestedField.optional(11, "f_sub_double", Types.DoubleType.get()), + Types.NestedField.optional(12, "f_sub_date", Types.DateType.get()), + Types.NestedField.optional( + 13, "f_sub_timestamp_local", Types.TimestampType.withoutZone()), 
+ Types.NestedField.optional( + 14, "f_sub_timestamp_tz", Types.TimestampType.withZone()), + Types.NestedField.optional(15, "f_sub_uuid", Types.UUIDType.get()), + Types.NestedField.optional( + 16, "f_sub_fixed", Types.FixedType.ofLength(18)), + Types.NestedField.optional(17, "f_sub_binary", Types.BinaryType.get()), + Types.NestedField.optional( + 18, + "f_sub_list", + Types.ListType.ofOptional(19, Types.LongType.get())), + Types.NestedField.optional( + 20, + "f_list2", + Types.ListType.ofOptional(21, Types.IntegerType.get())), + Types.NestedField.optional( + 22, + "f_list3", + Types.ListType.ofOptional( + 23, + Types.StructType.of( + Types.NestedField.optional( + 24, "f_sub_boolean", Types.BooleanType.get()), + Types.NestedField.optional( + 25, "f_sub_int", Types.IntegerType.get()), + Types.NestedField.optional( + 26, "f_sub_long", Types.LongType.get())))), + Types.NestedField.optional( + 27, + "f_map", + Types.MapType.ofOptional( + 28, 29, Types.StringType.get(), Types.StringType.get()))))); + } + }); + + public static final Schema USER_SCHEMA_WITH_ALL_DATA_TYPE = + new Schema( + new ArrayList() { + { + add(Types.NestedField.optional(0, "f_boolean", Types.BooleanType.get())); + add(Types.NestedField.optional(1, "f_int", Types.IntegerType.get())); + add(Types.NestedField.optional(2, "f_date", Types.DateType.get())); + add(Types.NestedField.optional(3, "f_long", Types.LongType.get())); + add(Types.NestedField.optional(4, "f_time", Types.TimeType.get())); + add(Types.NestedField.optional(5, "f_float", Types.FloatType.get())); + add(Types.NestedField.optional(6, "f_double", Types.DoubleType.get())); + add( + Types.NestedField.optional( + 7, "f_timestamp_local", Types.TimestampType.withoutZone())); + add(Types.NestedField.optional(8, "f_timestamp_tz", Types.TimestampType.withZone())); + add(Types.NestedField.optional(9, "f_string", Types.StringType.get())); + add(Types.NestedField.optional(10, "f_uuid", Types.UUIDType.get())); + add(Types.NestedField.optional(11, 
"f_fixed", Types.FixedType.ofLength(18))); + add(Types.NestedField.optional(12, "f_binary", Types.BinaryType.get())); + add(Types.NestedField.optional(13, "f_decimal", Types.DecimalType.of(38, 18))); + add( + Types.NestedField.optional( + 14, "f_list", Types.ListType.ofOptional(15, Types.LongType.get()))); + add( + Types.NestedField.optional( + 16, + "f_map", + Types.MapType.ofOptional( + 17, 18, Types.StringType.get(), Types.StringType.get()))); + add( + Types.NestedField.optional( + 19, + "f_struct", + Types.StructType.of( + Types.NestedField.optional(20, "f_sub_boolean", Types.BooleanType.get()), + Types.NestedField.optional(21, "f_sub_int", Types.IntegerType.get()), + Types.NestedField.optional(22, "f_sub_long", Types.LongType.get()), + Types.NestedField.optional(23, "f_sub_string", Types.StringType.get()), + Types.NestedField.optional(24, "f_sub_time", Types.TimeType.get()), + Types.NestedField.optional( + 25, "f_sub_decimal", Types.DecimalType.of(36, 18)), + Types.NestedField.optional(26, "f_sub_float", Types.FloatType.get()), + Types.NestedField.optional(27, "f_sub_double", Types.DoubleType.get()), + Types.NestedField.optional(28, "f_sub_date", Types.DateType.get()), + Types.NestedField.optional( + 29, "f_sub_timestamp_local", Types.TimestampType.withoutZone()), + Types.NestedField.optional( + 30, "f_sub_timestamp_tz", Types.TimestampType.withZone()), + Types.NestedField.optional(31, "f_sub_uuid", Types.UUIDType.get()), + Types.NestedField.optional( + 32, "f_sub_fixed", Types.FixedType.ofLength(18)), + Types.NestedField.optional(33, "f_sub_binary", Types.BinaryType.get()), + Types.NestedField.optional( + 34, + "f_sub_list", + Types.ListType.ofOptional(35, Types.LongType.get())), + Types.NestedField.optional( + 36, + "f_list2", + Types.ListType.ofOptional(37, Types.IntegerType.get())), + Types.NestedField.optional( + 38, + "f_list3", + Types.ListType.ofOptional( + 39, + Types.StructType.of( + Types.NestedField.optional( + 40, "f_sub_boolean", 
Types.BooleanType.get()), + Types.NestedField.optional( + 41, "f_sub_int", Types.IntegerType.get()), + Types.NestedField.optional( + 42, "f_sub_long", Types.LongType.get())))), + Types.NestedField.optional( + 43, + "f_map", + Types.MapType.ofOptional( + 44, 45, Types.StringType.get(), Types.StringType.get()))))); + } + }); + + private final PrimaryKeySpec primaryKeySpec = + PrimaryKeySpec.builderFor(USER_SCHEMA).addColumn(1).build(); + + public final RowType flinkUserSchema = FlinkSchemaUtil.convert(USER_SCHEMA); + + public final LogData FLIP_LOG = + new LogRecordV1( + FormatVersion.FORMAT_VERSION_V1, + IdGenerator.generateUpstreamId(), + 1L, + true, + ChangeAction.INSERT, + new GenericRowData(0)); + + public static LogDataJsonDeserialization createLogDataDeserialization() { + return new LogDataJsonDeserialization<>( + USER_SCHEMA, LogRecordV1.factory, arrayFactory, mapFactory); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestHiddenKafkaProducer.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestHiddenKafkaProducer.java new file mode 100644 index 0000000000..845b79eb16 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestHiddenKafkaProducer.java @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.write.hidden.kafka; + +import static org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate.getProperties; +import static org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate.getPropertiesWithByteArray; +import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.KAFKA_CONTAINER; +import static org.apache.amoro.shade.guava32.com.google.common.base.Preconditions.checkNotNull; +import static org.apache.kafka.clients.CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG; +import static org.apache.kafka.clients.producer.ProducerConfig.TRANSACTIONAL_ID_CONFIG; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; + +import org.apache.amoro.data.ChangeAction; +import org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate; +import org.apache.amoro.flink.kafka.testutils.KafkaContainerTest; +import org.apache.amoro.flink.shuffle.LogRecordV1; +import org.apache.amoro.flink.write.hidden.LogMsgFactory; +import org.apache.amoro.log.Bytes; +import org.apache.amoro.log.FormatVersion; +import org.apache.amoro.log.LogData; +import org.apache.amoro.log.LogDataJsonDeserialization; +import org.apache.amoro.log.LogDataJsonSerialization; +import org.apache.amoro.utils.IdGenerator; +import org.apache.flink.streaming.connectors.kafka.internals.FlinkKafkaInternalProducer; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.types.RowKind; +import org.apache.flink.util.InstantiationUtil; +import 
org.apache.kafka.clients.producer.ProducerRecord; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.time.Duration; +import java.util.Properties; +import java.util.UUID; + +public class TestHiddenKafkaProducer extends TestBaseLog { + private static final Logger LOG = LoggerFactory.getLogger(TestHiddenKafkaProducer.class); + + @BeforeClass + public static void prepare() throws Exception { + KAFKA_CONTAINER.start(); + } + + @AfterClass + public static void shutdown() throws Exception { + KAFKA_CONTAINER.close(); + } + + @Test + public void testInitTransactionId() { + final String topic = "test-init-transactions"; + KafkaContainerTest.createTopics(1, 1, topic); + FlinkKafkaInternalProducer reuse = null; + final String transactionalIdPrefix = UUID.randomUUID().toString(); + try { + int numTransactions = 20; + for (int i = 1; i <= numTransactions; i++) { + Properties properties = new Properties(); + properties.put(BOOTSTRAP_SERVERS_CONFIG, KAFKA_CONTAINER.getBootstrapServers()); + properties = getProperties(KafkaConfigGenerate.getStandardProperties(properties)); + properties.put(TRANSACTIONAL_ID_CONFIG, transactionalIdPrefix + i); + reuse = new FlinkKafkaInternalProducer<>(properties); + reuse.initTransactions(); + reuse.beginTransaction(); + reuse.send(new ProducerRecord<>(topic, "test-value-" + i)); + if (i % 2 == 0) { + reuse.commitTransaction(); + } else { + reuse.flush(); + reuse.abortTransaction(); + } + int count = KafkaContainerTest.countAllRecords(topic, properties); + LOG.info("consumption = {}", count); + assertThat(count).isEqualTo(i / 2); + } + } catch (Throwable e) { + LOG.error("error:", e); + if (reuse != null) { + reuse.abortTransaction(); + } + } finally { + assert reuse != null; + reuse.close(Duration.ofMillis(1000)); + } + } + + @Test + public void testLogProducerSendFlip() throws Exception 
{ + final String topic = "test-recover-transactions"; + int numPartitions = 3; + KafkaContainerTest.createTopics(numPartitions, 1, topic); + LogData.FieldGetterFactory fieldGetterFactory = LogRecordV1.FIELD_GETTER_FACTORY; + LogDataJsonSerialization logDataJsonSerialization = + new LogDataJsonSerialization<>(checkNotNull(USER_SCHEMA), checkNotNull(fieldGetterFactory)); + Properties properties = new Properties(); + properties.put(BOOTSTRAP_SERVERS_CONFIG, KAFKA_CONTAINER.getBootstrapServers()); + properties = getPropertiesWithByteArray(KafkaConfigGenerate.getStandardProperties(properties)); + LogMsgFactory.Producer producer = + new HiddenKafkaFactory() + .createProducer(properties, topic, logDataJsonSerialization, null); + producer.open(); + + int recoverNum = 3; + for (int i = 0; i < recoverNum; i++) { + producer.sendToAllPartitions(FLIP_LOG); + } + producer.close(); + int count = KafkaContainerTest.countAllRecords(topic, properties); + assertThat(count).isEqualTo(numPartitions * recoverNum); + } + + @Test + public void testLogDataNullValueSerialize() throws IOException { + + LogDataJsonSerialization logDataJsonSerialization = + new LogDataJsonSerialization<>( + USER_SCHEMA_WITH_ALL_DATA_TYPE, LogRecordV1.FIELD_GETTER_FACTORY); + + GenericRowData rowData = new GenericRowData(17); + rowData.setRowKind(RowKind.INSERT); + rowData.setField(0, null); + rowData.setField(1, null); + rowData.setField(2, null); + rowData.setField(3, null); + rowData.setField(4, null); + rowData.setField(5, null); + rowData.setField(6, null); + rowData.setField(7, null); + rowData.setField(8, null); + rowData.setField(9, null); + rowData.setField(10, null); + rowData.setField(11, null); + rowData.setField(12, null); + rowData.setField(13, null); + rowData.setField(14, null); + rowData.setField(15, null); + rowData.setField(16, null); + + LogData logData = + new LogRecordV1( + FormatVersion.FORMAT_VERSION_V1, + IdGenerator.generateUpstreamId(), + 1L, + false, + ChangeAction.INSERT, + 
rowData); + + byte[] bytes = logDataJsonSerialization.serialize(logData); + + Assert.assertNotNull(bytes); + String actualJson = new String(Bytes.subByte(bytes, 18, bytes.length - 18)); + + String expected = + "{\"f_boolean\":null,\"f_int\":null,\"f_date\":null,\"f_long\":null,\"f_time\":null,\"f_float\":null,\"f_double\":null,\"f_timestamp_local\":null,\"f_timestamp_tz\":null,\"f_string\":null,\"f_uuid\":null,\"f_fixed\":null,\"f_binary\":null,\"f_decimal\":null,\"f_list\":null,\"f_map\":null,\"f_struct\":null}"; + assertEquals(expected, actualJson); + + LogDataJsonDeserialization logDataDeserialization = createLogDataDeserialization(); + LogData result = logDataDeserialization.deserialize(bytes); + Assert.assertNotNull(result); + } + + @Test + public void testLogDataJsonSerializationClassSerialize() + throws IOException, ClassNotFoundException { + LogDataJsonSerialization actual = + new LogDataJsonSerialization<>(USER_SCHEMA, LogRecordV1.FIELD_GETTER_FACTORY); + byte[] bytes = InstantiationUtil.serializeObject(actual); + LogDataJsonSerialization result = + InstantiationUtil.deserializeObject(bytes, actual.getClass().getClassLoader()); + Assert.assertNotNull(result); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestHiddenLogOperators.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestHiddenLogOperators.java new file mode 100644 index 0000000000..9a42fd37fb --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestHiddenLogOperators.java @@ -0,0 +1,475 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.amoro.flink.write.hidden.kafka; + +import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.KAFKA_CONTAINER; +import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.getPropertiesByTopic; +import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.readRecordsBytes; +import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE; +import static org.apache.amoro.flink.write.hidden.kafka.TestBaseLog.USER_SCHEMA; +import static org.apache.amoro.flink.write.hidden.kafka.TestBaseLog.createLogDataDeserialization; + +import org.apache.amoro.flink.read.source.log.kafka.LogKafkaSource; +import org.apache.amoro.flink.shuffle.LogRecordV1; +import org.apache.amoro.flink.shuffle.ShuffleHelper; +import org.apache.amoro.flink.util.TestGlobalAggregateManager; +import org.apache.amoro.flink.util.TestOneInputStreamOperatorIntern; +import org.apache.amoro.flink.write.hidden.HiddenLogWriter; +import org.apache.amoro.log.LogDataJsonDeserialization; +import org.apache.amoro.utils.IdGenerator; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.common.restartstrategy.RestartStrategies; +import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer; +import org.apache.flink.core.execution.JobClient; +import 
org.apache.flink.runtime.checkpoint.OperatorSubtaskState; +import org.apache.flink.streaming.api.TimeCharacteristic; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamUtils; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.operators.collect.ClientAndIterator; +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.util.CloseableIterator; +import org.apache.kafka.clients.consumer.ConsumerRecords; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.jupiter.api.Assertions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.math.BigDecimal; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; + +/** Hidden log operator tests. 
*/ +public class TestHiddenLogOperators { + private static final Logger LOG = LoggerFactory.getLogger(TestHiddenLogOperators.class); + public static final String TOPIC = "produce-consume-topic"; + public static final TestGlobalAggregateManager GLOBAL_AGGREGATE_MANGER = + new TestGlobalAggregateManager(); + + @BeforeClass + public static void prepare() throws Exception { + KAFKA_CONTAINER.start(); + } + + @AfterClass + public static void shutdown() throws Exception { + KAFKA_CONTAINER.close(); + } + + @Test + public void testProduceAndConsume() throws Exception { + String topic = "testProduceAndConsume"; + final int count = 20; + + String[] expect = new String[count]; + try (OneInputStreamOperatorTestHarness harness = + createProducer(null, topic)) { + harness.setup(); + harness.initializeEmptyState(); + harness.open(); + for (int i = 0; i < count; i++) { + RowData row = createRowData(i); + expect[i] = row.toString(); + harness.processElement(row, 0); + } + harness.snapshot(1, 1); + harness.notifyOfCompletedCheckpoint(1); + List output = collect(harness); + Assertions.assertEquals(count, output.size()); + Assertions.assertArrayEquals(expect, output.toArray(new String[0])); + + createConsumerWithoutRetract(true, count, "test-gid", topic); + } catch (Exception e) { + e.printStackTrace(); + throw e; + } + } + + @Test + public void testProducerFailoverWithoutRetract() throws Exception { + String topic = "testProducerFailoverWithoutRetract"; + OperatorSubtaskState state; + try { + OneInputStreamOperatorTestHarness harness = createProducer(null, topic); + harness.setup(); + harness.initializeEmptyState(); + harness.open(); + harness.processElement(createRowData(1), 0); + harness.processElement(createRowData(2), 0); + harness.processElement(createRowData(3), 0); + state = harness.snapshot(1, 1); + harness.processElement(createRowData(4), 0); + harness.processElement(createRowData(5), 0); + harness.notifyOfCompletedCheckpoint(1); + List output = collect(harness); + 
Assertions.assertEquals(5, output.size()); + } catch (Exception e) { + e.printStackTrace(); + throw e; + } + + // failover happen 1 time + try { + OneInputStreamOperatorTestHarness harness = createProducer(1L, topic); + harness.setup(); + harness.initializeState(state); + harness.open(); + harness.processElement(createRowData(4), 0); + harness.processElement(createRowData(5), 0); + harness.processElement(createRowData(6), 0); + harness.snapshot(2, 1); + harness.processElement(createRowData(7), 0); + harness.processElement(createRowData(8), 0); + harness.notifyOfCompletedCheckpoint(2); + List output = collect(harness); + Assertions.assertEquals(5, output.size()); + } catch (Exception e) { + e.printStackTrace(); + throw e; + } + + createConsumerWithoutRetract(true, 10, "test-gid", topic); + } + + @Test + public void testMultiParallelismFailoverConsistencyRead() throws Exception { + String topic = "testMultiParallelismFailoverConsistencyRead"; + OperatorSubtaskState state0; + OperatorSubtaskState state1; + OperatorSubtaskState state2; + byte[] jobId = IdGenerator.generateUpstreamId(); + try (TestOneInputStreamOperatorIntern harness0 = + createProducer(3, 0, jobId, topic); + TestOneInputStreamOperatorIntern harness1 = + createProducer(3, 1, jobId, topic); + TestOneInputStreamOperatorIntern harness2 = + createProducer(3, 2, jobId, topic)) { + harness0.setup(); + harness0.initializeEmptyState(); + harness0.open(); + harness1.setup(); + harness1.initializeEmptyState(); + harness1.open(); + harness2.setup(); + harness2.initializeEmptyState(); + harness2.open(); + + harness0.processElement(createRowData(1), 0); + + state0 = harness0.snapshot(1, 1); + + harness1.processElement(createRowData(11), 0); + harness2.processElement(createRowData(21), 0); + + // chp-1 success. 
+ state1 = harness1.snapshot(1, 1); + state2 = harness2.snapshot(1, 1); + + harness0.processElement(createRowData(2), 0); + harness1.processElement(createRowData(12), 0); + harness2.processElement(createRowData(22), 0); + harness0.notifyOfCompletedCheckpoint(1); + harness1.notifyOfCompletedCheckpoint(1); + harness2.notifyOfCompletedCheckpoint(1); + harness0.processElement(createRowData(3), 0); + // after 3, harness0 happen timeout + harness1.processElement(createRowData(13), 0); + harness2.processElement(createRowData(23), 0); + + // harness0 snapshot chp-2 failed. + harness1.snapshot(2, 1); + harness2.snapshot(2, 1); + + harness1.processElement(createRowData(14), 0); + harness2.processElement(createRowData(24), 0); + // notify chp-2 aborted + harness1.notifyOfAbortedCheckpoint(2); + harness2.notifyOfAbortedCheckpoint(2); + + List output = collect(harness0); + output.addAll(collect(harness1)); + output.addAll(collect(harness2)); + Assertions.assertEquals(11, output.size()); + ConsumerRecords consumerRecords = readRecordsBytes(topic); + Assertions.assertEquals(11, consumerRecords.count()); + LogDataJsonDeserialization deserialization = createLogDataDeserialization(); + consumerRecords.forEach( + consumerRecord -> { + try { + System.out.println(deserialization.deserialize(consumerRecord.value())); + } catch (IOException e) { + e.printStackTrace(); + } + }); + } catch (Exception e) { + e.printStackTrace(); + throw e; + } + + // failover restore from chp-1 + try (TestOneInputStreamOperatorIntern harness0 = + createProducer(3, 0, jobId, 1L, topic); + TestOneInputStreamOperatorIntern harness1 = + createProducer(3, 1, jobId, 1L, topic); + TestOneInputStreamOperatorIntern harness2 = + createProducer(3, 2, jobId, 1L, topic)) { + harness0.setup(); + harness0.initializeState(state0); + harness0.open(); + harness1.setup(); + harness1.initializeState(state1); + harness1.open(); + harness2.setup(); + harness2.initializeState(state2); + harness2.open(); + + 
harness0.processElement(createRowData(2), 0); + harness1.processElement(createRowData(12), 0); + harness2.processElement(createRowData(22), 0); + // chp-2 + state1 = harness1.snapshot(3, 1); + state2 = harness2.snapshot(3, 1); + + harness0.processElement(createRowData(3), 0); + // after 3, harness0 happen timeout + harness1.processElement(createRowData(13), 0); + harness2.processElement(createRowData(23), 0); + + harness1.processElement(createRowData(14), 0); + harness2.processElement(createRowData(24), 0); + + harness1.notifyOfAbortedCheckpoint(2); + harness2.notifyOfAbortedCheckpoint(2); + + List output = collect(harness0); + output.addAll(collect(harness1)); + output.addAll(collect(harness2)); + Assertions.assertEquals(8, output.size()); + ConsumerRecords consumerRecords = readRecordsBytes(topic); + LogDataJsonDeserialization deserialization = createLogDataDeserialization(); + consumerRecords.forEach( + consumerRecord -> { + try { + System.out.println(deserialization.deserialize(consumerRecord.value())); + } catch (IOException e) { + e.printStackTrace(); + } + }); + Assertions.assertEquals(20, consumerRecords.count()); + } catch (Exception e) { + e.printStackTrace(); + throw e; + } + createConsumerWithoutRetract(true, 19, "test-gid", topic); + createConsumerWithRetract(true, 27, "test-gid-2", topic); + } + + public static RowData createRowData(int i) { + GenericRowData rowData = new GenericRowData(USER_SCHEMA.columns().size()); + rowData.setField(0, true); + rowData.setField(1, i); + rowData.setField(2, 1L); + GenericRowData sub = new GenericRowData(18); + sub.setField(0, true); + sub.setField(1, 1); + sub.setField(2, 1L); + sub.setField(3, StringData.fromString("sssss")); + sub.setField(4, LocalTime.of(13, 23, 23, 98766545).toNanoOfDay()); + sub.setField( + 5, DecimalData.fromBigDecimal(new BigDecimal("123456789.123456789123456789"), 30, 18)); + sub.setField(6, 123.12345f); + sub.setField(7, 123.12345d); + sub.setField(8, (int) LocalDate.of(2022, 5, 
5).toEpochDay()); + sub.setField( + 9, TimestampData.fromLocalDateTime(LocalDateTime.of(2022, 12, 12, 13, 14, 14, 987654234))); + sub.setField(10, TimestampData.fromInstant(Instant.parse("2022-12-13T13:33:44.98765432Z"))); + sub.setField(11, new byte[] {1}); + sub.setField(12, new byte[] {'1'}); + sub.setField(13, new byte[] {2}); + + GenericArrayData fSubList = new GenericArrayData(new long[] {112L, 123L}); + sub.setField(14, fSubList); + + GenericArrayData fSubList2 = new GenericArrayData(new int[] {112, 123}); + sub.setField(15, fSubList2); + + GenericRowData subStruct = new GenericRowData(3); + subStruct.setField(0, false); + subStruct.setField(1, 112); + subStruct.setField(2, 123L); + GenericArrayData structList = new GenericArrayData(new GenericRowData[] {subStruct}); + sub.setField(16, structList); + + GenericMapData map = + new GenericMapData( + new HashMap() { + { + put(StringData.fromString("Key_123"), StringData.fromString("Str_123")); + put(StringData.fromString("Key_124"), StringData.fromString("Str_123")); + put(StringData.fromString("Key_125"), StringData.fromString("Str_123")); + } + }); + sub.setField(17, map); + + rowData.setField(3, sub); + return rowData; + } + + private static List collect(OneInputStreamOperatorTestHarness harness) { + List parts = new ArrayList<>(); + harness.extractOutputValues().forEach(m -> parts.add(m.toString())); + return parts; + } + + private void createConsumerWithRetract( + boolean print, int count, final String groupId, String topic) throws Exception { + createConsumer(print, count, groupId, true, topic); + } + + private void createConsumerWithoutRetract( + boolean print, int count, final String groupId, String topic) throws Exception { + createConsumer(print, count, groupId, false, topic); + } + + private void createConsumer( + boolean print, int count, final String groupId, boolean retract, String topic) + throws Exception { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + 
env.setParallelism(1); + env.enableCheckpointing(10000); + env.getConfig().setRestartStrategy(RestartStrategies.noRestart()); + List topics = new ArrayList<>(); + topics.add(topic); + Properties properties = getPropertiesByTopic(topic); + properties.put("group.id", groupId); + properties.put("auto.offset.reset", "earliest"); + + Map configuration = new HashMap<>(); + configuration.put(MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.key(), String.valueOf(retract)); + + DataStream streamWithTimestamps = + env.fromSource( + LogKafkaSource.builder(USER_SCHEMA, configuration) + .setTopics(topics) + .setStartingOffsets(OffsetsInitializer.earliest()) + .setProperties(properties) + .build(), + WatermarkStrategy.noWatermarks(), + "Log Source"); + if (print) { + streamWithTimestamps.print("log-hidden"); + } + + ClientAndIterator clientAndIterator = + DataStreamUtils.collectWithClient(streamWithTimestamps, "testLog"); + + JobClient jobClient = clientAndIterator.client; + CloseableIterator iterator = clientAndIterator.iterator; + + List actualResult = new ArrayList<>(); + + while (iterator.hasNext()) { + RowData row = iterator.next(); + actualResult.add(row); + LOG.info("size {}, {}, {}.", actualResult.size(), row.getRowKind(), row.getInt(1)); + if (actualResult.size() == count) { + break; + } + } + } + + public static OneInputStreamOperatorTestHarness createProducer( + Long restoredCheckpoint, String topic) throws Exception { + return createProducer( + 1, + 1, + 0, + restoredCheckpoint, + IdGenerator.generateUpstreamId(), + new TestGlobalAggregateManager(), + topic); + } + + public static TestOneInputStreamOperatorIntern createProducer( + int maxParallelism, int subTaskId, byte[] jobId, Long restoredCheckpointId, String topic) + throws Exception { + return createProducer( + maxParallelism, + maxParallelism, + subTaskId, + restoredCheckpointId, + jobId, + GLOBAL_AGGREGATE_MANGER, + topic); + } + + public static TestOneInputStreamOperatorIntern createProducer( + int 
maxParallelism, int subTaskId, byte[] jobId, String topic) throws Exception { + return createProducer( + maxParallelism, maxParallelism, subTaskId, null, jobId, GLOBAL_AGGREGATE_MANGER, topic); + } + + private static TestOneInputStreamOperatorIntern createProducer( + int maxParallelism, + int parallelism, + int subTaskId, + Long restoredCheckpointId, + byte[] jobId, + TestGlobalAggregateManager testGlobalAggregateManager, + String topic) + throws Exception { + HiddenLogWriter writer = + new HiddenLogWriter( + USER_SCHEMA, + getPropertiesByTopic(topic), + topic, + new HiddenKafkaFactory<>(), + LogRecordV1.FIELD_GETTER_FACTORY, + jobId, + ShuffleHelper.EMPTY); + + TestOneInputStreamOperatorIntern harness = + new TestOneInputStreamOperatorIntern<>( + writer, + maxParallelism, + parallelism, + subTaskId, + restoredCheckpointId, + testGlobalAggregateManager); + harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime); + return harness; + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java new file mode 100644 index 0000000000..5a1e3d85b9 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.flink; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.test.util.MiniClusterWithClientResource; + +/** + * Compatibility shim for tests that previously used Iceberg's removed MiniClusterResource helper. + */ +public class MiniClusterResource { + private static final int DEFAULT_TM_NUM = 1; + private static final int DEFAULT_PARALLELISM = 4; + + public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = + new Configuration().set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); + + private MiniClusterResource() {} + + public static MiniClusterWithClientResource createWithClassloaderCheckDisabled() { + return new MiniClusterWithClientResource( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(DEFAULT_TM_NUM) + .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) + .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) + .build()); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/pom.xml b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/pom.xml new file mode 100644 index 0000000000..182ca4d1c8 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/pom.xml @@ -0,0 +1,349 @@ + + + + 4.0.0 + + org.apache.amoro + amoro-mixed-flink + 0.9-SNAPSHOT + ../pom.xml + + + 
amoro-format-mixed-flink-common-iceberg-bridge-1.17 + Amoro Project Mixed Format Flink Iceberg Bridge + https://amoro.apache.org + + + 3.21.0 + 1.17.2 + 1.17.2 + 3.0.2-1.17 + 1.6.1 + + + + + + org.apache.amoro + amoro-format-iceberg + + + org.ow2.asm + asm + + + + + + org.apache.amoro + amoro-mixed-hive + ${project.version} + + + + org.apache.iceberg + iceberg-flink-1.17 + ${iceberg.version} + provided + + + org.slf4j + slf4j-api + + + org.apache.parquet + parquet-column + + + org.apache.parquet + parquet-avro + + + + + + cglib + cglib + + + + com.google.code.gson + gson + ${gson.version} + + + + + org.apache.flink + flink-connector-files + ${flink.version} + provided + + + + org.apache.flink + flink-connector-kafka + ${flink-kafka.version} + provided + + + + org.apache.flink + flink-json + ${flink.version} + provided + + + + org.apache.flink + flink-hadoop-compatibility_${flink.scala.binary.version} + ${flink.version} + provided + + + + org.apache.flink + flink-table-api-java-bridge + ${flink.version} + provided + + + org.slf4j + slf4j-api + + + + + + + org.apache.flink + flink-orc + ${flink.version} + provided + + + org.slf4j + slf4j-api + + + + + + org.apache.flink + flink-parquet + ${flink.version} + provided + + + org.apache.parquet + parquet-hadoop + + + + + + org.apache.flink + flink-table-runtime + ${flink.version} + provided + + + org.slf4j + slf4j-api + + + + + org.apache.flink + flink-table-planner_${flink.scala.binary.version} + ${flink.version} + provided + + + org.slf4j + slf4j-api + + + + + + org.apache.iceberg + iceberg-flink-1.17 + ${iceberg.version} + tests + test + + + org.slf4j + slf4j-api + + + + + + org.apache.flink + flink-runtime + ${flink.version} + tests + test + + + org.slf4j + slf4j-api + + + + + + + org.apache.flink + flink-streaming-java + ${flink.version} + tests + test + + + org.slf4j + slf4j-api + + + + + + org.apache.flink + flink-clients + ${flink.version} + test + + + org.slf4j + slf4j-api + + + + + + org.apache.flink + 
flink-test-utils + ${flink.version} + test + + + org.apache.logging.log4j + log4j-slf4j-impl + + + org.slf4j + slf4j-api + + + com.google.guava + guava + + + + + org.apache.flink + flink-connector-test-utils + ${flink.version} + test + + + + org.apache.iceberg + iceberg-hive-metastore + ${iceberg.version} + tests + test + + + + + com.fasterxml.jackson.core + jackson-databind + ${jackson.vesion} + provided + + + + org.apache.amoro + amoro-common + ${project.version} + tests + test + + + + org.apache.flink + flink-metrics-jmx + ${flink.version} + test + + + org.apache.flink + flink-runtime-web + ${flink.version} + test + + + + + org.apache.flink + flink-table-planner_${flink.scala.binary.version} + ${flink.version} + test-jar + test + + + org.slf4j + slf4j-api + + + + + + + org.apache.curator + curator-test + 2.12.0 + test + + + + org.testcontainers + kafka + ${testcontainers.version} + test + + + + org.testcontainers + junit-jupiter + ${testcontainers.version} + test + + + org.assertj + assertj-core + ${assertj.version} + test + + + diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetReaders.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetReaders.java new file mode 100644 index 0000000000..6b984b1b5d --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetReaders.java @@ -0,0 +1,873 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.flink.data; + +import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableList; +import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableMap; +import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; +import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RawValueData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.iceberg.MetadataColumns; +import org.apache.iceberg.Schema; +import org.apache.iceberg.parquet.ParquetValueReader; +import org.apache.iceberg.parquet.ParquetValueReaders; +import org.apache.iceberg.parquet.TypeWithSchemaVisitor; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import 
org.apache.parquet.schema.Type; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.time.Instant; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +public class AdaptHiveFlinkParquetReaders { + private AdaptHiveFlinkParquetReaders() {} + + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema) { + return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); + } + + @SuppressWarnings("unchecked") + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema, Map idToConstant) { + return (ParquetValueReader) + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); + } + + private static class ReadBuilder extends TypeWithSchemaVisitor> { + private final MessageType type; + private final Map idToConstant; + + ReadBuilder(MessageType type, Map idToConstant) { + this.type = type; + this.idToConstant = idToConstant; + } + + @Override + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { + return struct(expected, message.asGroupType(), fieldReaders); + } + + @Override + public ParquetValueReader struct( + Types.StructType expected, GroupType struct, List> fieldReaders) { + // match the expected struct's order + Map> readersById = Maps.newHashMap(); + Map typesById = Maps.newHashMap(); + List fields = struct.getFields(); + for (int i = 0; i < fields.size(); i += 1) { + Type fieldType = fields.get(i); + if (fieldReaders.get(i) != null) { + int fieldD = type.getMaxDefinitionLevel(path(fieldType.getName())) - 1; + if (fieldType.getId() != null) { + int id = fieldType.getId().intValue(); + readersById.put(id, ParquetValueReaders.option(fieldType, fieldD, fieldReaders.get(i))); + typesById.put(id, fieldType); + } + } 
+ } + + List expectedFields = + expected != null ? expected.fields() : ImmutableList.of(); + List> reorderedFields = + Lists.newArrayListWithExpectedSize(expectedFields.size()); + List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); + for (Types.NestedField field : expectedFields) { + int id = field.fieldId(); + if (idToConstant.containsKey(id)) { + // containsKey is used because the constant may be null + reorderedFields.add(ParquetValueReaders.constant(idToConstant.get(id))); + types.add(null); + } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { + reorderedFields.add(ParquetValueReaders.position()); + types.add(null); + } else if (id == MetadataColumns.IS_DELETED.fieldId()) { + reorderedFields.add(ParquetValueReaders.constant(false)); + types.add(null); + } else { + ParquetValueReader reader = readersById.get(id); + if (reader != null) { + reorderedFields.add(reader); + types.add(typesById.get(id)); + } else { + reorderedFields.add(ParquetValueReaders.nulls()); + types.add(null); + } + } + } + + return new RowDataReader(types, reorderedFields); + } + + @Override + public ParquetValueReader list( + Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { + if (expectedList == null) { + return null; + } + + GroupType repeated = array.getFields().get(0).asGroupType(); + String[] repeatedPath = currentPath(); + + int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; + int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; + + Type elementType = repeated.getType(0); + int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; + + return new ArrayReader<>( + repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); + } + + @Override + public ParquetValueReader map( + Types.MapType expectedMap, + GroupType map, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { + if (expectedMap == null) { + return null; + } + + GroupType repeatedKeyValue 
= map.getFields().get(0).asGroupType(); + String[] repeatedPath = currentPath(); + + int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; + int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; + + Type keyType = repeatedKeyValue.getType(0); + int keyD = type.getMaxDefinitionLevel(path(keyType.getName())) - 1; + Type valueType = repeatedKeyValue.getType(1); + int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; + + return new MapReader<>( + repeatedD, + repeatedR, + ParquetValueReaders.option(keyType, keyD, keyReader), + ParquetValueReaders.option(valueType, valueD, valueReader)); + } + + @Override + @SuppressWarnings("CyclomaticComplexity") + public ParquetValueReader primitive( + org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { + if (expected == null) { + return null; + } + + ColumnDescriptor desc = type.getColumnDescription(currentPath()); + + if (primitive.getOriginalType() != null) { + switch (primitive.getOriginalType()) { + case ENUM: + case JSON: + case UTF8: + return new StringReader(desc); + case INT_8: + case INT_16: + case INT_32: + if (expected.typeId() == Types.LongType.get().typeId()) { + return new ParquetValueReaders.IntAsLongReader(desc); + } else { + return new ParquetValueReaders.UnboxedReader<>(desc); + } + case TIME_MICROS: + return new LossyMicrosToMillisTimeReader(desc); + case TIME_MILLIS: + return new MillisTimeReader(desc); + case DATE: + case INT_64: + return new ParquetValueReaders.UnboxedReader<>(desc); + case TIMESTAMP_MICROS: + if (((Types.TimestampType) expected).shouldAdjustToUTC()) { + return new MicrosToTimestampTzReader(desc); + } else { + return new MicrosToTimestampReader(desc); + } + case TIMESTAMP_MILLIS: + if (((Types.TimestampType) expected).shouldAdjustToUTC()) { + return new MillisToTimestampTzReader(desc); + } else { + return new MillisToTimestampReader(desc); + } + case DECIMAL: + DecimalLogicalTypeAnnotation decimal = + (DecimalLogicalTypeAnnotation) 
primitive.getLogicalTypeAnnotation(); + switch (primitive.getPrimitiveTypeName()) { + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + return new BinaryDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); + case INT64: + return new LongDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); + case INT32: + return new IntegerDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); + default: + throw new UnsupportedOperationException( + "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); + } + case BSON: + return new ParquetValueReaders.ByteArrayReader(desc); + default: + throw new UnsupportedOperationException( + "Unsupported logical type: " + primitive.getOriginalType()); + } + } + + switch (primitive.getPrimitiveTypeName()) { + case FIXED_LEN_BYTE_ARRAY: + case BINARY: + return new ParquetValueReaders.ByteArrayReader(desc); + case INT32: + if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.LONG) { + return new ParquetValueReaders.IntAsLongReader(desc); + } else { + return new ParquetValueReaders.UnboxedReader<>(desc); + } + case FLOAT: + if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.DOUBLE) { + return new ParquetValueReaders.FloatAsDoubleReader(desc); + } else { + return new ParquetValueReaders.UnboxedReader<>(desc); + } + case BOOLEAN: + case INT64: + case DOUBLE: + return new ParquetValueReaders.UnboxedReader<>(desc); + case INT96: + Types.TimestampType tsMicrosType = (Types.TimestampType) expected; + if (tsMicrosType.shouldAdjustToUTC()) { + return new TimestampIntWithTZ96Reader(desc); + } else { + return new TimestampIntWithOutTZ96Reader(desc); + } + default: + throw new UnsupportedOperationException("Unsupported type: " + primitive); + } + } + } + + private static class TimestampIntWithOutTZ96Reader + extends ParquetValueReaders.PrimitiveReader { + private static final long UNIX_EPOCH_JULIAN = 2_440_588L; + + TimestampIntWithOutTZ96Reader(ColumnDescriptor desc) { + super(desc); + } + + 
@Override + public TimestampData read(TimestampData reuse) { + final ByteBuffer byteBuffer = + column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); + final long timeOfDayNanos = byteBuffer.getLong(); + final int julianDay = byteBuffer.getInt(); + + return TimestampData.fromLocalDateTime( + Instant.ofEpochMilli(TimeUnit.DAYS.toMillis(julianDay - UNIX_EPOCH_JULIAN)) + .plusNanos(timeOfDayNanos) + .atZone(ZoneId.systemDefault()) + .toLocalDateTime()); + } + } + + private static class TimestampIntWithTZ96Reader + extends ParquetValueReaders.PrimitiveReader { + private static final long UNIX_EPOCH_JULIAN = 2_440_588L; + + private TimestampIntWithTZ96Reader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData reuse) { + final ByteBuffer byteBuffer = + column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); + final long timeOfDayNanos = byteBuffer.getLong(); + final int julianDay = byteBuffer.getInt(); + + return TimestampData.fromInstant( + Instant.ofEpochMilli(TimeUnit.DAYS.toMillis(julianDay - UNIX_EPOCH_JULIAN)) + .plusNanos(timeOfDayNanos)); + } + } + + private static class BinaryDecimalReader + extends ParquetValueReaders.PrimitiveReader { + private final int precision; + private final int scale; + + BinaryDecimalReader(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData read(DecimalData ignored) { + Binary binary = column.nextBinary(); + BigDecimal bigDecimal = new BigDecimal(new BigInteger(binary.getBytes()), scale); + // TODO: need a unit test to write-read-validate decimal via FlinkParquetWrite/Reader + return DecimalData.fromBigDecimal(bigDecimal, precision, scale); + } + } + + private static class IntegerDecimalReader + extends ParquetValueReaders.PrimitiveReader { + private final int precision; + private final int scale; + + IntegerDecimalReader(ColumnDescriptor desc, int precision, 
int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData read(DecimalData ignored) { + return DecimalData.fromUnscaledLong(column.nextInteger(), precision, scale); + } + } + + private static class LongDecimalReader extends ParquetValueReaders.PrimitiveReader { + private final int precision; + private final int scale; + + LongDecimalReader(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public DecimalData read(DecimalData ignored) { + return DecimalData.fromUnscaledLong(column.nextLong(), precision, scale); + } + } + + private static class MicrosToTimestampTzReader + extends ParquetValueReaders.UnboxedReader { + MicrosToTimestampTzReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData ignored) { + long value = readLong(); + return TimestampData.fromLocalDateTime( + Instant.ofEpochSecond( + Math.floorDiv(value, 1000_000L), Math.floorMod(value, 1000_000L) * 1000) + .atOffset(ZoneOffset.UTC) + .toLocalDateTime()); + } + + @Override + public long readLong() { + return column.nextLong(); + } + } + + private static class MicrosToTimestampReader + extends ParquetValueReaders.UnboxedReader { + MicrosToTimestampReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData ignored) { + long value = readLong(); + return TimestampData.fromInstant( + Instant.ofEpochSecond( + Math.floorDiv(value, 1000_000L), Math.floorMod(value, 1000_000L) * 1000)); + } + + @Override + public long readLong() { + return column.nextLong(); + } + } + + private static class MillisToTimestampReader + extends ParquetValueReaders.UnboxedReader { + MillisToTimestampReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData ignored) { + long millis = readLong(); + return TimestampData.fromEpochMillis(millis); + } 
+ + @Override + public long readLong() { + return column.nextLong(); + } + } + + private static class MillisToTimestampTzReader + extends ParquetValueReaders.UnboxedReader { + MillisToTimestampTzReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public TimestampData read(TimestampData ignored) { + long millis = readLong(); + return TimestampData.fromLocalDateTime( + Instant.ofEpochMilli(millis).atOffset(ZoneOffset.UTC).toLocalDateTime()); + } + + @Override + public long readLong() { + return column.nextLong(); + } + } + + private static class StringReader extends ParquetValueReaders.PrimitiveReader { + StringReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public StringData read(StringData ignored) { + Binary binary = column.nextBinary(); + ByteBuffer buffer = binary.toByteBuffer(); + if (buffer.hasArray()) { + return StringData.fromBytes( + buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining()); + } else { + return StringData.fromBytes(binary.getBytes()); + } + } + } + + private static class LossyMicrosToMillisTimeReader + extends ParquetValueReaders.PrimitiveReader { + LossyMicrosToMillisTimeReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public Integer read(Integer reuse) { + // Discard microseconds since Flink uses millisecond unit for TIME type. 
+ return (int) Math.floorDiv(column.nextLong(), 1000L); + } + } + + private static class MillisTimeReader extends ParquetValueReaders.PrimitiveReader { + MillisTimeReader(ColumnDescriptor desc) { + super(desc); + } + + @Override + public Integer read(Integer reuse) { + return (int) column.nextLong(); + } + } + + private static class ArrayReader + extends ParquetValueReaders.RepeatedReader { + private int readPos = 0; + private int writePos = 0; + + ArrayReader(int definitionLevel, int repetitionLevel, ParquetValueReader reader) { + super(definitionLevel, repetitionLevel, reader); + } + + @Override + protected ReusableArrayData newListData(ArrayData reuse) { + this.readPos = 0; + this.writePos = 0; + + if (reuse instanceof ReusableArrayData) { + return (ReusableArrayData) reuse; + } else { + return new ReusableArrayData(); + } + } + + @Override + @SuppressWarnings("unchecked") + protected E getElement(ReusableArrayData list) { + E value = null; + if (readPos < list.capacity()) { + value = (E) list.values[readPos]; + } + + readPos += 1; + + return value; + } + + @Override + protected void addElement(ReusableArrayData reused, E element) { + if (writePos >= reused.capacity()) { + reused.grow(); + } + + reused.values[writePos] = element; + + writePos += 1; + } + + @Override + protected ArrayData buildList(ReusableArrayData list) { + list.setNumElements(writePos); + return list; + } + } + + private static class MapReader + extends ParquetValueReaders.RepeatedKeyValueReader { + private int readPos = 0; + private int writePos = 0; + + private final ParquetValueReaders.ReusableEntry entry = + new ParquetValueReaders.ReusableEntry<>(); + private final ParquetValueReaders.ReusableEntry nullEntry = + new ParquetValueReaders.ReusableEntry<>(); + + MapReader( + int definitionLevel, + int repetitionLevel, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { + super(definitionLevel, repetitionLevel, keyReader, valueReader); + } + + @Override + protected 
ReusableMapData newMapData(MapData reuse) { + this.readPos = 0; + this.writePos = 0; + + if (reuse instanceof ReusableMapData) { + return (ReusableMapData) reuse; + } else { + return new ReusableMapData(); + } + } + + @Override + @SuppressWarnings("unchecked") + protected Map.Entry getPair(ReusableMapData map) { + Map.Entry kv = nullEntry; + if (readPos < map.capacity()) { + entry.set((K) map.keys.values[readPos], (V) map.values.values[readPos]); + kv = entry; + } + + readPos += 1; + + return kv; + } + + @Override + protected void addPair(ReusableMapData map, K key, V value) { + if (writePos >= map.capacity()) { + map.grow(); + } + + map.keys.values[writePos] = key; + map.values.values[writePos] = value; + + writePos += 1; + } + + @Override + protected MapData buildMap(ReusableMapData map) { + map.setNumElements(writePos); + return map; + } + } + + private static class RowDataReader + extends ParquetValueReaders.StructReader { + private final int numFields; + + RowDataReader(List types, List> readers) { + super(types, readers); + this.numFields = readers.size(); + } + + @Override + protected GenericRowData newStructData(RowData reuse) { + if (reuse instanceof GenericRowData) { + return (GenericRowData) reuse; + } else { + return new GenericRowData(numFields); + } + } + + @Override + protected Object getField(GenericRowData intermediate, int pos) { + return intermediate.getField(pos); + } + + @Override + protected RowData buildStruct(GenericRowData struct) { + return struct; + } + + @Override + protected void set(GenericRowData row, int pos, Object value) { + row.setField(pos, value); + } + + @Override + protected void setNull(GenericRowData row, int pos) { + row.setField(pos, null); + } + + @Override + protected void setBoolean(GenericRowData row, int pos, boolean value) { + row.setField(pos, value); + } + + @Override + protected void setInteger(GenericRowData row, int pos, int value) { + row.setField(pos, value); + } + + @Override + protected void 
setLong(GenericRowData row, int pos, long value) { + row.setField(pos, value); + } + + @Override + protected void setFloat(GenericRowData row, int pos, float value) { + row.setField(pos, value); + } + + @Override + protected void setDouble(GenericRowData row, int pos, double value) { + row.setField(pos, value); + } + } + + private static class ReusableMapData implements MapData { + private final ReusableArrayData keys; + private final ReusableArrayData values; + + private int numElements; + + private ReusableMapData() { + this.keys = new ReusableArrayData(); + this.values = new ReusableArrayData(); + } + + private void grow() { + keys.grow(); + values.grow(); + } + + private int capacity() { + return keys.capacity(); + } + + public void setNumElements(int numElements) { + this.numElements = numElements; + keys.setNumElements(numElements); + values.setNumElements(numElements); + } + + @Override + public int size() { + return numElements; + } + + @Override + public ReusableArrayData keyArray() { + return keys; + } + + @Override + public ReusableArrayData valueArray() { + return values; + } + } + + private static class ReusableArrayData implements ArrayData { + private static final Object[] EMPTY = new Object[0]; + + private Object[] values = EMPTY; + private int numElements = 0; + + private void grow() { + if (values.length == 0) { + this.values = new Object[20]; + } else { + Object[] old = values; + this.values = new Object[old.length << 1]; + // copy the old array in case it has values that can be reused + System.arraycopy(old, 0, values, 0, old.length); + } + } + + private int capacity() { + return values.length; + } + + public void setNumElements(int numElements) { + this.numElements = numElements; + } + + @Override + public int size() { + return numElements; + } + + @Override + public boolean isNullAt(int ordinal) { + return null == values[ordinal]; + } + + @Override + public boolean getBoolean(int ordinal) { + return (boolean) values[ordinal]; + } + + @Override 
+ public byte getByte(int ordinal) { + return (byte) values[ordinal]; + } + + @Override + public short getShort(int ordinal) { + return (short) values[ordinal]; + } + + @Override + public int getInt(int ordinal) { + return (int) values[ordinal]; + } + + @Override + public long getLong(int ordinal) { + return (long) values[ordinal]; + } + + @Override + public float getFloat(int ordinal) { + return (float) values[ordinal]; + } + + @Override + public double getDouble(int ordinal) { + return (double) values[ordinal]; + } + + @Override + public StringData getString(int pos) { + return (StringData) values[pos]; + } + + @Override + public DecimalData getDecimal(int pos, int precision, int scale) { + return (DecimalData) values[pos]; + } + + @Override + public TimestampData getTimestamp(int pos, int precision) { + return (TimestampData) values[pos]; + } + + @SuppressWarnings("unchecked") + @Override + public RawValueData getRawValue(int pos) { + return (RawValueData) values[pos]; + } + + @Override + public byte[] getBinary(int ordinal) { + return (byte[]) values[ordinal]; + } + + @Override + public ArrayData getArray(int ordinal) { + return (ArrayData) values[ordinal]; + } + + @Override + public MapData getMap(int ordinal) { + return (MapData) values[ordinal]; + } + + @Override + public RowData getRow(int pos, int numFields) { + return (RowData) values[pos]; + } + + @Override + public boolean[] toBooleanArray() { + return ArrayUtil.toPrimitive((Boolean[]) values); + } + + @Override + public byte[] toByteArray() { + return ArrayUtil.toPrimitive((Byte[]) values); + } + + @Override + public short[] toShortArray() { + return ArrayUtil.toPrimitive((Short[]) values); + } + + @Override + public int[] toIntArray() { + return ArrayUtil.toPrimitive((Integer[]) values); + } + + @Override + public long[] toLongArray() { + return ArrayUtil.toPrimitive((Long[]) values); + } + + @Override + public float[] toFloatArray() { + return ArrayUtil.toPrimitive((Float[]) values); + } + + 
@Override + public double[] toDoubleArray() { + return ArrayUtil.toPrimitive((Double[]) values); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetWriters.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetWriters.java new file mode 100644 index 0000000000..6407265d89 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetWriters.java @@ -0,0 +1,599 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.iceberg.flink.data; + +import static org.apache.flink.table.types.logical.LogicalTypeRoot.TIMESTAMP_WITHOUT_TIME_ZONE; + +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; +import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeRoot; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.RowType.RowField; +import org.apache.flink.table.types.logical.SmallIntType; +import org.apache.flink.table.types.logical.TinyIntType; +import org.apache.iceberg.parquet.AdaptHivePrimitiveWriter; +import org.apache.iceberg.parquet.ParquetValueReaders; +import org.apache.iceberg.parquet.ParquetValueWriter; +import org.apache.iceberg.parquet.ParquetValueWriters; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.DecimalUtil; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.time.Instant; +import java.time.ZoneId; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.concurrent.TimeUnit; + +/** + 
* Copy from iceberg {@link FlinkParquetWriters} to support int96 type and use {@link + * AdaptHiveParquetWithFlinkSchemaVisitor}. + */ +public class AdaptHiveFlinkParquetWriters { + private AdaptHiveFlinkParquetWriters() {} + + @SuppressWarnings("unchecked") + public static ParquetValueWriter buildWriter(LogicalType schema, MessageType type) { + return (ParquetValueWriter) + AdaptHiveParquetWithFlinkSchemaVisitor.visit(schema, type, new WriteBuilder(type)); + } + + private static class WriteBuilder + extends AdaptHiveParquetWithFlinkSchemaVisitor> { + private final MessageType type; + + WriteBuilder(MessageType type) { + this.type = type; + } + + @Override + public ParquetValueWriter message( + RowType rowType, MessageType message, List> fields) { + return struct(rowType, message.asGroupType(), fields); + } + + @Override + public ParquetValueWriter struct( + RowType rowType, GroupType struct, List> fieldWriters) { + List fields = struct.getFields(); + List flinkFields = rowType.getFields(); + List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); + List flinkTypes = Lists.newArrayList(); + for (int i = 0; i < fields.size(); i += 1) { + writers.add(newOption(struct.getType(i), fieldWriters.get(i))); + flinkTypes.add(flinkFields.get(i).getType()); + } + + return new RowDataWriter(writers, flinkTypes); + } + + @Override + public ParquetValueWriter list( + ArrayType arrayType, GroupType array, ParquetValueWriter elementWriter) { + GroupType repeated = array.getFields().get(0).asGroupType(); + String[] repeatedPath = currentPath(); + + int repeatedD = type.getMaxDefinitionLevel(repeatedPath); + int repeatedR = type.getMaxRepetitionLevel(repeatedPath); + + return new ArrayDataWriter<>( + repeatedD, + repeatedR, + newOption(repeated.getType(0), elementWriter), + arrayType.getElementType()); + } + + @Override + public ParquetValueWriter map( + MapType mapType, + GroupType map, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter) { + GroupType 
repeatedKeyValue = map.getFields().get(0).asGroupType(); + String[] repeatedPath = currentPath(); + + int repeatedD = type.getMaxDefinitionLevel(repeatedPath); + int repeatedR = type.getMaxRepetitionLevel(repeatedPath); + + return new MapDataWriter<>( + repeatedD, + repeatedR, + newOption(repeatedKeyValue.getType(0), keyWriter), + newOption(repeatedKeyValue.getType(1), valueWriter), + mapType.getKeyType(), + mapType.getValueType()); + } + + private ParquetValueWriter newOption(Type fieldType, ParquetValueWriter writer) { + int maxD = type.getMaxDefinitionLevel(path(fieldType.getName())); + return ParquetValueWriters.option(fieldType, maxD, writer); + } + + @Override + public ParquetValueWriter primitive(LogicalType logicalType, PrimitiveType primitive) { + ColumnDescriptor desc = type.getColumnDescription(currentPath()); + + if (primitive.getOriginalType() != null) { + switch (primitive.getOriginalType()) { + case ENUM: + case JSON: + case UTF8: + return strings(desc); + case DATE: + case INT_8: + case INT_16: + case INT_32: + return ints(logicalType, desc); + case INT_64: + return ParquetValueWriters.longs(desc); + case TIME_MICROS: + return timeMicros(desc); + case TIMESTAMP_MICROS: + return timestamps(desc); + case DECIMAL: + DecimalLogicalTypeAnnotation decimal = + (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); + switch (primitive.getPrimitiveTypeName()) { + case INT32: + return decimalAsInteger(desc, decimal.getPrecision(), decimal.getScale()); + case INT64: + return decimalAsLong(desc, decimal.getPrecision(), decimal.getScale()); + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + return decimalAsFixed(desc, decimal.getPrecision(), decimal.getScale()); + default: + throw new UnsupportedOperationException( + "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); + } + case BSON: + return byteArrays(desc); + default: + throw new UnsupportedOperationException( + "Unsupported logical type: " + primitive.getOriginalType()); + } 
+ } + + switch (primitive.getPrimitiveTypeName()) { + case FIXED_LEN_BYTE_ARRAY: + case BINARY: + return byteArrays(desc); + case BOOLEAN: + return ParquetValueWriters.booleans(desc); + case INT32: + return ints(logicalType, desc); + case INT64: + return ParquetValueWriters.longs(desc); + case INT96: + LogicalTypeRoot typeRoot = logicalType.getTypeRoot(); + if (typeRoot == TIMESTAMP_WITHOUT_TIME_ZONE) { + return new TimestampInt96Writer(desc); + } else { + return new TimestampTZInt96Writer(desc); + } + case FLOAT: + return ParquetValueWriters.floats(desc); + case DOUBLE: + return ParquetValueWriters.doubles(desc); + default: + throw new UnsupportedOperationException("Unsupported type: " + primitive); + } + } + } + + private static class TimestampTZInt96Writer extends AdaptHivePrimitiveWriter { + + private static final long JULIAN_DAY_OF_EPOCH = 2440588L; + private static final long MICROS_PER_DAY = 86400000000L; + private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1); + private static final long NANOS_PER_MILLISECOND = TimeUnit.MILLISECONDS.toNanos(1); + + public TimestampTZInt96Writer(ColumnDescriptor descriptor) { + super(descriptor); + } + + /** Writes nano timestamps to parquet int96 */ + void writeBinary(int repetitionLevel, int julianDay, long nanosOfDay) { + ByteBuffer buf = ByteBuffer.allocate(12); + buf.order(ByteOrder.LITTLE_ENDIAN); + buf.putLong(nanosOfDay); + buf.putInt(julianDay); + buf.flip(); + column.writeBinary(repetitionLevel, Binary.fromConstantByteBuffer(buf)); + } + + void writeInstant(int repetitionLevel, Instant instant) { + long timestamp = instant.toEpochMilli(); + int julianDay = (int) (timestamp / MILLIS_IN_DAY + 2440588L); + long nanosOfDay = + timestamp % MILLIS_IN_DAY * NANOS_PER_MILLISECOND + + instant.getNano() % NANOS_PER_MILLISECOND; + writeBinary(repetitionLevel, julianDay, nanosOfDay); + } + + @Override + public void write(int repetitionLevel, TimestampData value) { + writeInstant(repetitionLevel, 
value.toInstant()); + } + } + + private static class TimestampInt96Writer extends AdaptHivePrimitiveWriter { + + private static final long JULIAN_DAY_OF_EPOCH = 2440588L; + private static final long MICROS_PER_DAY = 86400000000L; + private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1); + private static final long NANOS_PER_MILLISECOND = TimeUnit.MILLISECONDS.toNanos(1); + + public TimestampInt96Writer(ColumnDescriptor descriptor) { + super(descriptor); + } + + /** Writes nano timestamps to parquet int96 */ + void writeBinary(int repetitionLevel, int julianDay, long nanosOfDay) { + ByteBuffer buf = ByteBuffer.allocate(12); + buf.order(ByteOrder.LITTLE_ENDIAN); + buf.putLong(nanosOfDay); + buf.putInt(julianDay); + buf.flip(); + column.writeBinary(repetitionLevel, Binary.fromConstantByteBuffer(buf)); + } + + void writeInstant(int repetitionLevel, Instant instant) { + long timestamp = instant.toEpochMilli(); + int julianDay = (int) (timestamp / MILLIS_IN_DAY + 2440588L); + long nanosOfDay = + timestamp % MILLIS_IN_DAY * NANOS_PER_MILLISECOND + + instant.getNano() % NANOS_PER_MILLISECOND; + writeBinary(repetitionLevel, julianDay, nanosOfDay); + } + + @Override + public void write(int repetitionLevel, TimestampData value) { + writeInstant( + repetitionLevel, value.toLocalDateTime().atZone(ZoneId.systemDefault()).toInstant()); + } + } + + private static ParquetValueWriters.PrimitiveWriter ints( + LogicalType type, ColumnDescriptor desc) { + if (type instanceof TinyIntType) { + return ParquetValueWriters.tinyints(desc); + } else if (type instanceof SmallIntType) { + return ParquetValueWriters.shorts(desc); + } + return ParquetValueWriters.ints(desc); + } + + private static ParquetValueWriters.PrimitiveWriter strings(ColumnDescriptor desc) { + return new StringDataWriter(desc); + } + + private static ParquetValueWriters.PrimitiveWriter timeMicros(ColumnDescriptor desc) { + return new TimeMicrosWriter(desc); + } + + private static 
ParquetValueWriters.PrimitiveWriter decimalAsInteger( + ColumnDescriptor desc, int precision, int scale) { + Preconditions.checkArgument( + precision <= 9, + "Cannot write decimal value as integer with precision larger than 9," + + " wrong precision %s", + precision); + return new IntegerDecimalWriter(desc, precision, scale); + } + + private static ParquetValueWriters.PrimitiveWriter decimalAsLong( + ColumnDescriptor desc, int precision, int scale) { + Preconditions.checkArgument( + precision <= 18, + "Cannot write decimal value as long with precision larger than 18, " + + " wrong precision %s", + precision); + return new LongDecimalWriter(desc, precision, scale); + } + + private static ParquetValueWriters.PrimitiveWriter decimalAsFixed( + ColumnDescriptor desc, int precision, int scale) { + return new FixedDecimalWriter(desc, precision, scale); + } + + private static ParquetValueWriters.PrimitiveWriter timestamps( + ColumnDescriptor desc) { + return new TimestampDataWriter(desc); + } + + private static ParquetValueWriters.PrimitiveWriter byteArrays(ColumnDescriptor desc) { + return new ByteArrayWriter(desc); + } + + private static class StringDataWriter extends ParquetValueWriters.PrimitiveWriter { + private StringDataWriter(ColumnDescriptor desc) { + super(desc); + } + + @Override + public void write(int repetitionLevel, StringData value) { + column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(value.toBytes())); + } + } + + private static class TimeMicrosWriter extends ParquetValueWriters.PrimitiveWriter { + private TimeMicrosWriter(ColumnDescriptor desc) { + super(desc); + } + + @Override + public void write(int repetitionLevel, Integer value) { + long micros = value.longValue() * 1000; + column.writeLong(repetitionLevel, micros); + } + } + + private static class IntegerDecimalWriter + extends ParquetValueWriters.PrimitiveWriter { + private final int precision; + private final int scale; + + private IntegerDecimalWriter(ColumnDescriptor desc, int 
precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public void write(int repetitionLevel, DecimalData decimal) { + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); + + column.writeInteger(repetitionLevel, (int) decimal.toUnscaledLong()); + } + } + + private static class LongDecimalWriter extends ParquetValueWriters.PrimitiveWriter { + private final int precision; + private final int scale; + + private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + } + + @Override + public void write(int repetitionLevel, DecimalData decimal) { + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); + + column.writeLong(repetitionLevel, decimal.toUnscaledLong()); + } + } + + private static class FixedDecimalWriter extends ParquetValueWriters.PrimitiveWriter { + private final int precision; + private final int scale; + private final ThreadLocal bytes; + + private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { + super(desc); + this.precision = precision; + this.scale = scale; + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + } + + @Override + public void write(int repetitionLevel, DecimalData decimal) { + byte[] binary = + DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toBigDecimal(), bytes.get()); + column.writeBinary(repetitionLevel, 
Binary.fromReusedByteArray(binary)); + } + } + + private static class TimestampDataWriter + extends ParquetValueWriters.PrimitiveWriter { + private TimestampDataWriter(ColumnDescriptor desc) { + super(desc); + } + + @Override + public void write(int repetitionLevel, TimestampData value) { + column.writeLong( + repetitionLevel, value.getMillisecond() * 1000 + value.getNanoOfMillisecond() / 1000); + } + } + + private static class ByteArrayWriter extends ParquetValueWriters.PrimitiveWriter { + private ByteArrayWriter(ColumnDescriptor desc) { + super(desc); + } + + @Override + public void write(int repetitionLevel, byte[] bytes) { + column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(bytes)); + } + } + + private static class ArrayDataWriter extends ParquetValueWriters.RepeatedWriter { + private final LogicalType elementType; + + private ArrayDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter writer, + LogicalType elementType) { + super(definitionLevel, repetitionLevel, writer); + this.elementType = elementType; + } + + @Override + protected Iterator elements(ArrayData list) { + return new ElementIterator<>(list); + } + + private class ElementIterator implements Iterator { + private final int size; + private final ArrayData list; + private final ArrayData.ElementGetter getter; + private int index; + + private ElementIterator(ArrayData list) { + this.list = list; + size = list.size(); + getter = ArrayData.createElementGetter(elementType); + index = 0; + } + + @Override + public boolean hasNext() { + return index != size; + } + + @Override + @SuppressWarnings("unchecked") + public E next() { + if (index >= size) { + throw new NoSuchElementException(); + } + + E element = (E) getter.getElementOrNull(list, index); + index += 1; + + return element; + } + } + } + + private static class MapDataWriter + extends ParquetValueWriters.RepeatedKeyValueWriter { + private final LogicalType keyType; + private final LogicalType valueType; + + 
private MapDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter, + LogicalType keyType, + LogicalType valueType) { + super(definitionLevel, repetitionLevel, keyWriter, valueWriter); + this.keyType = keyType; + this.valueType = valueType; + } + + @Override + protected Iterator> pairs(MapData map) { + return new EntryIterator<>(map); + } + + private class EntryIterator implements Iterator> { + private final int size; + private final ArrayData keys; + private final ArrayData values; + private final ParquetValueReaders.ReusableEntry entry; + private final ArrayData.ElementGetter keyGetter; + private final ArrayData.ElementGetter valueGetter; + private int index; + + private EntryIterator(MapData map) { + size = map.size(); + keys = map.keyArray(); + values = map.valueArray(); + entry = new ParquetValueReaders.ReusableEntry<>(); + keyGetter = ArrayData.createElementGetter(keyType); + valueGetter = ArrayData.createElementGetter(valueType); + index = 0; + } + + @Override + public boolean hasNext() { + return index != size; + } + + @Override + @SuppressWarnings("unchecked") + public Map.Entry next() { + if (index >= size) { + throw new NoSuchElementException(); + } + + entry.set( + (K) keyGetter.getElementOrNull(keys, index), + (V) valueGetter.getElementOrNull(values, index)); + index += 1; + + return entry; + } + } + } + + private static class RowDataWriter extends ParquetValueWriters.StructWriter { + private final RowData.FieldGetter[] fieldGetter; + + RowDataWriter(List> writers, List types) { + super(writers); + fieldGetter = new RowData.FieldGetter[types.size()]; + for (int i = 0; i < types.size(); i += 1) { + fieldGetter[i] = RowData.createFieldGetter(types.get(i), i); + } + } + + @Override + protected Object get(RowData struct, int index) { + return fieldGetter[index].getFieldOrNull(struct); + } + } +} diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveParquetWithFlinkSchemaVisitor.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveParquetWithFlinkSchemaVisitor.java new file mode 100644 index 0000000000..34099c47d3 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveParquetWithFlinkSchemaVisitor.java @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.iceberg.flink.data; + +import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; +import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.RowType.RowField; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +import java.util.Deque; +import java.util.List; + +/** + * Copy from iceberg {@link ParquetWithFlinkSchemaVisitor}. see annotation "Change For mixed-format" + */ +public class AdaptHiveParquetWithFlinkSchemaVisitor { + private final Deque fieldNames = Lists.newLinkedList(); + + public static T visit( + LogicalType sType, Type type, AdaptHiveParquetWithFlinkSchemaVisitor visitor) { + Preconditions.checkArgument(sType != null, "Invalid DataType: null"); + if (type instanceof MessageType) { + Preconditions.checkArgument( + sType instanceof RowType, "Invalid struct: %s is not a struct", sType); + RowType struct = (RowType) sType; + return visitor.message( + struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); + } else if (type.isPrimitive()) { + return visitor.primitive(sType, type.asPrimitiveType()); + } else { + // if not a primitive, the typeId must be a group + GroupType group = type.asGroupType(); + OriginalType annotation = group.getOriginalType(); + if (annotation != null) { + switch (annotation) { + case LIST: + Preconditions.checkArgument( + !group.isRepetition(Type.Repetition.REPEATED), + "Invalid list: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid list: does not 
contain single repeated field: %s", + group); + + GroupType repeatedElement = group.getFields().get(0).asGroupType(); + Preconditions.checkArgument( + repeatedElement.isRepetition(Type.Repetition.REPEATED), + "Invalid list: inner group is not repeated"); + Preconditions.checkArgument( + repeatedElement.getFieldCount() <= 1, + "Invalid list: repeated group is not a single field: %s", + group); + + Preconditions.checkArgument( + sType instanceof ArrayType, "Invalid list: %s is not an array", sType); + ArrayType array = (ArrayType) sType; + RowField element = + new RowField( + "element", array.getElementType(), "element of " + array.asSummaryString()); + + visitor.fieldNames.push(repeatedElement.getName()); + try { + T elementResult = null; + if (repeatedElement.getFieldCount() > 0) { + elementResult = visitField(element, repeatedElement.getType(0), visitor); + } + + return visitor.list(array, group, elementResult); + + } finally { + visitor.fieldNames.pop(); + } + + case MAP: + Preconditions.checkArgument( + !group.isRepetition(Type.Repetition.REPEATED), + "Invalid map: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid map: does not contain single repeated field: %s", + group); + + GroupType repeatedKeyValue = group.getType(0).asGroupType(); + Preconditions.checkArgument( + repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), + "Invalid map: inner group is not repeated"); + Preconditions.checkArgument( + repeatedKeyValue.getFieldCount() <= 2, + "Invalid map: repeated group does not have 2 fields"); + + Preconditions.checkArgument( + sType instanceof MapType, "Invalid map: %s is not a map", sType); + MapType map = (MapType) sType; + RowField keyField = + new RowField("key", map.getKeyType(), "key of " + map.asSummaryString()); + RowField valueField = + new RowField("value", map.getValueType(), "value of " + map.asSummaryString()); + + visitor.fieldNames.push(repeatedKeyValue.getName()); + try { + T 
keyResult = null; + T valueResult = null; + switch (repeatedKeyValue.getFieldCount()) { + case 2: + // if there are 2 fields, both key and value are projected + keyResult = visitField(keyField, repeatedKeyValue.getType(0), visitor); + valueResult = visitField(valueField, repeatedKeyValue.getType(1), visitor); + break; + case 1: + // if there is just one, use the name to determine what it is + Type keyOrValue = repeatedKeyValue.getType(0); + if (keyOrValue.getName().equalsIgnoreCase("key")) { + keyResult = visitField(keyField, keyOrValue, visitor); + // value result remains null + } else { + valueResult = visitField(valueField, keyOrValue, visitor); + // key result remains null + } + break; + default: + // both results will remain null + } + + return visitor.map(map, group, keyResult, valueResult); + + } finally { + visitor.fieldNames.pop(); + } + + default: + } + } + Preconditions.checkArgument( + sType instanceof RowType, "Invalid struct: %s is not a struct", sType); + RowType struct = (RowType) sType; + return visitor.struct(struct, group, visitFields(struct, group, visitor)); + } + } + + private static T visitField( + RowField sField, Type field, AdaptHiveParquetWithFlinkSchemaVisitor visitor) { + visitor.fieldNames.push(field.getName()); + try { + return visit(sField.getType(), field, visitor); + } finally { + visitor.fieldNames.pop(); + } + } + + private static List visitFields( + RowType struct, GroupType group, AdaptHiveParquetWithFlinkSchemaVisitor visitor) { + List sFields = struct.getFields(); + Preconditions.checkArgument( + sFields.size() == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); + List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); + for (int i = 0; i < sFields.size(); i += 1) { + Type field = group.getFields().get(i); + RowField sField = sFields.get(i); + + // Change for mixed-format table ⬇ + // 
Preconditions.checkArgument(field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.getName())), + // "Structs do not match: field %s != %s", field.getName(), sField.getName()); + Preconditions.checkArgument( + field.getName().equals(sField.getName()), + "Structs do not match: field %s != %s", + field.getName(), + sField.getName()); + // Change for mixed-format table ⬆ + + results.add(visitField(sField, field, visitor)); + } + + return results; + } + + public T message(RowType sStruct, MessageType message, List fields) { + return null; + } + + public T struct(RowType sStruct, GroupType struct, List fields) { + return null; + } + + public T list(ArrayType sArray, GroupType array, T element) { + return null; + } + + public T map(MapType sMap, GroupType map, T key, T value) { + return null; + } + + public T primitive(LogicalType sPrimitive, PrimitiveType primitive) { + return null; + } + + protected String[] currentPath() { + return Lists.newArrayList(fieldNames.descendingIterator()).toArray(new String[0]); + } + + protected String[] path(String name) { + List list = Lists.newArrayList(fieldNames.descendingIterator()); + list.add(name); + return list.toArray(new String[0]); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java new file mode 100644 index 0000000000..599d4cbb2a --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java @@ -0,0 +1,247 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.flink.source; + +import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableMap; +import org.apache.amoro.shade.guava32.com.google.common.collect.Sets; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.MetadataColumns; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.data.DeleteFilter; +import org.apache.iceberg.encryption.InputFilesDecryptor; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.FlinkSourceFilter; +import org.apache.iceberg.flink.RowDataWrapper; +import org.apache.iceberg.flink.data.AdaptHiveFlinkParquetReaders; +import org.apache.iceberg.flink.data.FlinkAvroReader; +import org.apache.iceberg.flink.data.FlinkOrcReader; +import org.apache.iceberg.flink.data.RowDataProjection; +import org.apache.iceberg.flink.data.RowDataUtil; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.CloseableIterator; +import 
org.apache.iceberg.io.InputFile; +import org.apache.iceberg.mapping.NameMappingParser; +import org.apache.iceberg.orc.ORC; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.PartitionUtil; + +import java.util.List; +import java.util.Map; + +/** Copy from iceberg. Adopt AdaptHiveFlinkParquetReaders to adapt hive flink parquet readers. */ +@Internal +public class RowDataFileScanTaskReader implements FileScanTaskReader { + + private final Schema tableSchema; + private final Schema projectedSchema; + private final String nameMapping; + private final boolean caseSensitive; + private final FlinkSourceFilter rowFilter; + + public RowDataFileScanTaskReader( + Schema tableSchema, + Schema projectedSchema, + String nameMapping, + boolean caseSensitive, + List filters) { + this.tableSchema = tableSchema; + this.projectedSchema = projectedSchema; + this.nameMapping = nameMapping; + this.caseSensitive = caseSensitive; + + if (filters != null && !filters.isEmpty()) { + Expression combinedExpression = + filters.stream().reduce(Expressions.alwaysTrue(), Expressions::and); + this.rowFilter = + new FlinkSourceFilter(this.projectedSchema, combinedExpression, this.caseSensitive); + } else { + this.rowFilter = null; + } + } + + @Override + public CloseableIterator open( + FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { + Schema partitionSchema = TypeUtil.select(projectedSchema, task.spec().identitySourceIds()); + + Map idToConstant = + partitionSchema.columns().isEmpty() + ? ImmutableMap.of() + : PartitionUtil.constantsMap(task, RowDataUtil::convertConstant); + + FlinkDeleteFilter deletes = + new FlinkDeleteFilter(task, tableSchema, projectedSchema, inputFilesDecryptor); + CloseableIterable iterable = + deletes.filter( + newIterable(task, deletes.requiredSchema(), idToConstant, inputFilesDecryptor)); + + // Project the RowData to remove the extra meta columns. 
+ if (!projectedSchema.sameSchema(deletes.requiredSchema())) { + RowDataProjection rowDataProjection = + RowDataProjection.create( + deletes.requiredRowType(), + deletes.requiredSchema().asStruct(), + projectedSchema.asStruct()); + iterable = CloseableIterable.transform(iterable, rowDataProjection::wrap); + } + + return iterable.iterator(); + } + + private CloseableIterable newIterable( + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + CloseableIterable iter; + if (task.isDataTask()) { + throw new UnsupportedOperationException("Cannot read data task."); + } else { + switch (task.file().format()) { + case PARQUET: + iter = newParquetIterable(task, schema, idToConstant, inputFilesDecryptor); + break; + + case AVRO: + iter = newAvroIterable(task, schema, idToConstant, inputFilesDecryptor); + break; + + case ORC: + iter = newOrcIterable(task, schema, idToConstant, inputFilesDecryptor); + break; + + default: + throw new UnsupportedOperationException( + "Cannot read unknown format: " + task.file().format()); + } + } + + if (rowFilter != null) { + return CloseableIterable.filter(iter, rowFilter::filter); + } + return iter; + } + + private CloseableIterable newAvroIterable( + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Avro.ReadBuilder builder = + Avro.read(inputFilesDecryptor.getInputFile(task)) + .reuseContainers() + .project(schema) + .split(task.start(), task.length()) + .createReaderFunc(readSchema -> new FlinkAvroReader(schema, readSchema, idToConstant)); + + if (nameMapping != null) { + builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); + } + + return builder.build(); + } + + private CloseableIterable newParquetIterable( + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Parquet.ReadBuilder builder = + Parquet.read(inputFilesDecryptor.getInputFile(task)) + .split(task.start(), 
task.length()) + .project(schema) + // Change for mixed-format table + .createReaderFunc( + fileSchema -> + AdaptHiveFlinkParquetReaders.buildReader(schema, fileSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive) + .reuseContainers(); + + if (nameMapping != null) { + builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); + } + + return builder.build(); + } + + private CloseableIterable newOrcIterable( + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Schema readSchemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot( + schema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); + + ORC.ReadBuilder builder = + ORC.read(inputFilesDecryptor.getInputFile(task)) + .project(readSchemaWithoutConstantAndMetadataFields) + .split(task.start(), task.length()) + .createReaderFunc( + readOrcSchema -> new FlinkOrcReader(schema, readOrcSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive); + + if (nameMapping != null) { + builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); + } + + return builder.build(); + } + + private static class FlinkDeleteFilter extends DeleteFilter { + private final RowType requiredRowType; + private final RowDataWrapper asStructLike; + private final InputFilesDecryptor inputFilesDecryptor; + + FlinkDeleteFilter( + FileScanTask task, + Schema tableSchema, + Schema requestedSchema, + InputFilesDecryptor inputFilesDecryptor) { + super(task.file().path().toString(), task.deletes(), tableSchema, requestedSchema); + this.requiredRowType = FlinkSchemaUtil.convert(requiredSchema()); + this.asStructLike = new RowDataWrapper(requiredRowType, requiredSchema().asStruct()); + this.inputFilesDecryptor = inputFilesDecryptor; + } + + public RowType requiredRowType() { + return requiredRowType; + } + + @Override + protected StructLike asStructLike(RowData row) { + return asStructLike.wrap(row); + } + + @Override 
+ protected InputFile getInputFile(String location) { + return inputFilesDecryptor.getInputFile(location); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/source/ScanContext.java new file mode 100644 index 0000000000..c3ff9bea62 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/source/ScanContext.java @@ -0,0 +1,707 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.iceberg.flink.source; + +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.TimeUtils; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.FlinkReadConf; +import org.apache.iceberg.flink.FlinkReadOptions; + +import java.io.Serializable; +import java.time.Duration; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +/** + * Copy from Iceberg {@link ScanContext}. only change line 115 and expand the modifier. Context + * object with optional arguments for a Flink Scan. + */ +public class ScanContext implements Serializable { + + private static final long serialVersionUID = 1L; + + public static final ConfigOption SNAPSHOT_ID = + ConfigOptions.key("snapshot-id") + .longType() + .defaultValue(null) + .withDescription( + "Retrieve the full data of the specified snapshot by ID, used for batch scan mode"); + + public static final ConfigOption TAG = + ConfigOptions.key("tag").stringType().defaultValue(null); + + public static final ConfigOption BRANCH = + ConfigOptions.key("branch").stringType().defaultValue(null); + + public static final ConfigOption START_TAG = + ConfigOptions.key("start-tag").stringType().defaultValue(null); + + public static final ConfigOption END_TAG = + ConfigOptions.key("end-tag").stringType().defaultValue(null); + + public static final ConfigOption CASE_SENSITIVE = + ConfigOptions.key("case-sensitive") + .booleanType() + .defaultValue(false) + .withDescription("Set if column names are case-sensitive"); + + public static final ConfigOption AS_OF_TIMESTAMP = + ConfigOptions.key("as-of-timestamp") + .longType() + 
.defaultValue(null) + .withDescription( + "Retrieve the full data of the specified snapshot at the given timestamp, " + + "used for batch scan mode"); + + public static final ConfigOption STARTING_STRATEGY = + ConfigOptions.key("starting-strategy") + .enumType(StreamingStartingStrategy.class) + .defaultValue(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .withDescription("Specific the starting strategy for streaming execution"); + + public static final ConfigOption START_SNAPSHOT_TIMESTAMP = + ConfigOptions.key("start-snapshot-timestamp") + .longType() + .defaultValue(null) + .withDescription("Specific the snapshot timestamp that streaming job starts from"); + + public static final ConfigOption START_SNAPSHOT_ID = + ConfigOptions.key("start-snapshot-id") + .longType() + .defaultValue(null) + .withDescription("Specific the snapshot id that streaming job starts from"); + + public static final ConfigOption END_SNAPSHOT_ID = + ConfigOptions.key("end-snapshot-id") + .longType() + .defaultValue(null) + .withDescription("Specific the snapshot id that streaming job to end"); + + public static final ConfigOption SPLIT_SIZE = + ConfigOptions.key("split-size") + .longType() + .defaultValue(null) + .withDescription("Specific the target size when combining data input splits"); + + public static final ConfigOption SPLIT_LOOKBACK = + ConfigOptions.key("split-lookback") + .intType() + .defaultValue(null) + .withDescription("Specify the number of bins to consider when combining input splits"); + + public static final ConfigOption SPLIT_FILE_OPEN_COST = + ConfigOptions.key("split-file-open-cost") + .longType() + .defaultValue(null) + .withDescription( + "The estimated cost to open a file, used as a minimum weight when combining splits"); + + public static final ConfigOption STREAMING = + ConfigOptions.key("streaming") + .booleanType() + .defaultValue(true) + .withDescription("Set if job is bounded or unbounded"); + + public static final ConfigOption MONITOR_INTERVAL = 
+ ConfigOptions.key("monitor-interval") + .durationType() + .defaultValue(Duration.ofSeconds(10)) + .withDescription( + "Specify the time interval for consecutively monitoring newly committed data files"); + + public static final ConfigOption INCLUDE_COLUMN_STATS = + ConfigOptions.key("include-column-stats") + .booleanType() + .defaultValue(false) + .withDescription("Set if loads the column stats with each file"); + + public static final ConfigOption MAX_PLANNING_SNAPSHOT_COUNT = + ConfigOptions.key("max-planning-snapshot-count") + .intType() + .defaultValue(Integer.MAX_VALUE) + .withDescription("Specify the max planning snapshot count"); + + public static final ConfigOption LIMIT_OPTION = + ConfigOptions.key("limit").longType().defaultValue(-1L); + + public static final ConfigOption MAX_ALLOWED_PLANNING_FAILURES_OPTION = + ConfigOptions.key("max-allowed-planning-failures").intType().defaultValue(3); + + protected final boolean caseSensitive; + protected final boolean exposeLocality; + protected final Long snapshotId; + protected final String branch; + protected final String tag; + protected final StreamingStartingStrategy startingStrategy; + protected final Long startSnapshotId; + protected final Long startSnapshotTimestamp; + protected final Long endSnapshotId; + protected final Long asOfTimestamp; + protected final String startTag; + protected final String endTag; + protected final Long splitSize; + protected final Integer splitLookback; + protected final Long splitOpenFileCost; + protected final boolean isStreaming; + protected final Duration monitorInterval; + + protected final String nameMapping; + protected final Schema schema; + protected final List filters; + protected final long limit; + protected final boolean includeColumnStats; + protected final Collection includeStatsForColumns; + protected final Integer planParallelism; + protected final int maxPlanningSnapshotCount; + protected final int maxAllowedPlanningFailures; + protected final String 
watermarkColumn; + protected final TimeUnit watermarkColumnTimeUnit; + + protected ScanContext( + boolean caseSensitive, + Long snapshotId, + StreamingStartingStrategy startingStrategy, + Long startSnapshotTimestamp, + Long startSnapshotId, + Long endSnapshotId, + Long asOfTimestamp, + Long splitSize, + Integer splitLookback, + Long splitOpenFileCost, + boolean isStreaming, + Duration monitorInterval, + String nameMapping, + Schema schema, + List filters, + long limit, + boolean includeColumnStats, + Collection includeStatsForColumns, + boolean exposeLocality, + Integer planParallelism, + int maxPlanningSnapshotCount, + int maxAllowedPlanningFailures, + String watermarkColumn, + TimeUnit watermarkColumnTimeUnit, + String branch, + String tag, + String startTag, + String endTag) { + this.caseSensitive = caseSensitive; + this.snapshotId = snapshotId; + this.tag = tag; + this.branch = branch; + this.startingStrategy = startingStrategy; + this.startSnapshotTimestamp = startSnapshotTimestamp; + this.startSnapshotId = startSnapshotId; + this.endSnapshotId = endSnapshotId; + this.asOfTimestamp = asOfTimestamp; + this.startTag = startTag; + this.endTag = endTag; + this.splitSize = splitSize; + this.splitLookback = splitLookback; + this.splitOpenFileCost = splitOpenFileCost; + this.isStreaming = isStreaming; + this.monitorInterval = monitorInterval; + + this.nameMapping = nameMapping; + this.schema = schema; + this.filters = filters; + this.limit = limit; + this.includeColumnStats = includeColumnStats; + this.includeStatsForColumns = includeStatsForColumns; + this.exposeLocality = exposeLocality; + this.planParallelism = planParallelism; + this.maxPlanningSnapshotCount = maxPlanningSnapshotCount; + this.maxAllowedPlanningFailures = maxAllowedPlanningFailures; + this.watermarkColumn = watermarkColumn; + this.watermarkColumnTimeUnit = watermarkColumnTimeUnit; + } + + void validate() { + if (isStreaming) { + if (startingStrategy == 
StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) { + Preconditions.checkArgument( + startSnapshotId != null, + "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + Preconditions.checkArgument( + startSnapshotTimestamp == null, + "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + } + if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) { + Preconditions.checkArgument( + startSnapshotTimestamp != null, + "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + Preconditions.checkArgument( + startSnapshotId == null, + "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + } + + Preconditions.checkArgument( + tag == null, + String.format("Cannot scan table using ref %s configured for streaming reader", tag)); + Preconditions.checkArgument( + snapshotId == null, "Cannot set snapshot-id option for streaming reader"); + Preconditions.checkArgument( + asOfTimestamp == null, "Cannot set as-of-timestamp option for streaming reader"); + Preconditions.checkArgument( + endSnapshotId == null, "Cannot set end-snapshot-id option for streaming reader"); + Preconditions.checkArgument(endTag == null, "Cannot set end-tag option for streaming reader"); + } + Preconditions.checkArgument( + !(startTag != null && startSnapshotId() != null), + "START_SNAPSHOT_ID and START_TAG cannot both be set."); + + Preconditions.checkArgument( + !(endTag != null && endSnapshotId() != null), + "END_SNAPSHOT_ID and END_TAG cannot both be set."); + + Preconditions.checkArgument( + maxAllowedPlanningFailures >= -1, + "Cannot set maxAllowedPlanningFailures to a negative number other than -1."); + } + + public boolean caseSensitive() { + return caseSensitive; + } + + public Long snapshotId() { + return snapshotId; + } + + public String branch() { + return branch; + } + + public String tag() { + return tag; + } + + public String 
startTag() { + return startTag; + } + + public String endTag() { + return endTag; + } + + public StreamingStartingStrategy streamingStartingStrategy() { + return startingStrategy; + } + + public Long startSnapshotTimestamp() { + return startSnapshotTimestamp; + } + + public Long startSnapshotId() { + return startSnapshotId; + } + + public Long endSnapshotId() { + return endSnapshotId; + } + + public Long asOfTimestamp() { + return asOfTimestamp; + } + + public Long splitSize() { + return splitSize; + } + + public Integer splitLookback() { + return splitLookback; + } + + public Long splitOpenFileCost() { + return splitOpenFileCost; + } + + public boolean isStreaming() { + return isStreaming; + } + + public Duration monitorInterval() { + return monitorInterval; + } + + public String nameMapping() { + return nameMapping; + } + + public Schema project() { + return schema; + } + + public List filters() { + return filters; + } + + public long limit() { + return limit; + } + + public boolean includeColumnStats() { + return includeColumnStats; + } + + public Collection includeStatsForColumns() { + return includeStatsForColumns; + } + + public boolean exposeLocality() { + return exposeLocality; + } + + public Integer planParallelism() { + return planParallelism; + } + + public int maxPlanningSnapshotCount() { + return maxPlanningSnapshotCount; + } + + public int maxAllowedPlanningFailures() { + return maxAllowedPlanningFailures; + } + + public String watermarkColumn() { + return watermarkColumn; + } + + public TimeUnit watermarkColumnTimeUnit() { + return watermarkColumnTimeUnit; + } + + public ScanContext copyWithAppendsBetween(Long newStartSnapshotId, long newEndSnapshotId) { + return ScanContext.builder() + .caseSensitive(caseSensitive) + .useSnapshotId(null) + .useBranch(branch) + .useTag(null) + .startSnapshotId(newStartSnapshotId) + .endSnapshotId(newEndSnapshotId) + .startTag(null) + .endTag(null) + .asOfTimestamp(null) + .splitSize(splitSize) + 
.splitLookback(splitLookback) + .splitOpenFileCost(splitOpenFileCost) + .streaming(isStreaming) + .monitorInterval(monitorInterval) + .nameMapping(nameMapping) + .project(schema) + .filters(filters) + .limit(limit) + .includeColumnStats(includeColumnStats) + .includeColumnStats(includeStatsForColumns) + .exposeLocality(exposeLocality) + .planParallelism(planParallelism) + .maxPlanningSnapshotCount(maxPlanningSnapshotCount) + .maxAllowedPlanningFailures(maxAllowedPlanningFailures) + .watermarkColumn(watermarkColumn) + .watermarkColumnTimeUnit(watermarkColumnTimeUnit) + .build(); + } + + public ScanContext copyWithSnapshotId(long newSnapshotId) { + return ScanContext.builder() + .caseSensitive(caseSensitive) + .useSnapshotId(newSnapshotId) + .useBranch(branch) + .useTag(tag) + .startSnapshotId(null) + .endSnapshotId(null) + .startTag(null) + .endTag(null) + .asOfTimestamp(null) + .splitSize(splitSize) + .splitLookback(splitLookback) + .splitOpenFileCost(splitOpenFileCost) + .streaming(isStreaming) + .monitorInterval(monitorInterval) + .nameMapping(nameMapping) + .project(schema) + .filters(filters) + .limit(limit) + .includeColumnStats(includeColumnStats) + .includeColumnStats(includeStatsForColumns) + .exposeLocality(exposeLocality) + .planParallelism(planParallelism) + .maxPlanningSnapshotCount(maxPlanningSnapshotCount) + .maxAllowedPlanningFailures(maxAllowedPlanningFailures) + .watermarkColumn(watermarkColumn) + .watermarkColumnTimeUnit(watermarkColumnTimeUnit) + .build(); + } + + public static Builder builder() { + return new Builder(); + } + + public static class Builder { + private boolean caseSensitive = FlinkReadOptions.CASE_SENSITIVE_OPTION.defaultValue(); + private Long snapshotId = FlinkReadOptions.SNAPSHOT_ID.defaultValue(); + private String branch = FlinkReadOptions.BRANCH.defaultValue(); + private String tag = FlinkReadOptions.TAG.defaultValue(); + private String startTag = FlinkReadOptions.START_TAG.defaultValue(); + private String endTag = 
FlinkReadOptions.END_TAG.defaultValue(); + private StreamingStartingStrategy startingStrategy = + FlinkReadOptions.STARTING_STRATEGY_OPTION.defaultValue(); + private Long startSnapshotTimestamp = FlinkReadOptions.START_SNAPSHOT_TIMESTAMP.defaultValue(); + private Long startSnapshotId = FlinkReadOptions.START_SNAPSHOT_ID.defaultValue(); + private Long endSnapshotId = FlinkReadOptions.END_SNAPSHOT_ID.defaultValue(); + private Long asOfTimestamp = FlinkReadOptions.AS_OF_TIMESTAMP.defaultValue(); + private Long splitSize = FlinkReadOptions.SPLIT_SIZE_OPTION.defaultValue(); + private Integer splitLookback = FlinkReadOptions.SPLIT_LOOKBACK_OPTION.defaultValue(); + private Long splitOpenFileCost = FlinkReadOptions.SPLIT_FILE_OPEN_COST_OPTION.defaultValue(); + private boolean isStreaming = FlinkReadOptions.STREAMING_OPTION.defaultValue(); + private Duration monitorInterval = + TimeUtils.parseDuration(FlinkReadOptions.MONITOR_INTERVAL_OPTION.defaultValue()); + private String nameMapping; + private Schema projectedSchema; + private List filters; + private long limit = FlinkReadOptions.LIMIT_OPTION.defaultValue(); + private boolean includeColumnStats = + FlinkReadOptions.INCLUDE_COLUMN_STATS_OPTION.defaultValue(); + private Collection includeStatsForColumns = null; + private boolean exposeLocality; + private Integer planParallelism = + FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue(); + private int maxPlanningSnapshotCount = + FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT_OPTION.defaultValue(); + private int maxAllowedPlanningFailures = + FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION.defaultValue(); + private String watermarkColumn = FlinkReadOptions.WATERMARK_COLUMN_OPTION.defaultValue(); + private TimeUnit watermarkColumnTimeUnit = + FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT_OPTION.defaultValue(); + + private Builder() {} + + public Builder caseSensitive(boolean newCaseSensitive) { + this.caseSensitive = newCaseSensitive; + return this; + } + + 
public Builder useSnapshotId(Long newSnapshotId) { + this.snapshotId = newSnapshotId; + return this; + } + + public Builder useTag(String newTag) { + this.tag = newTag; + return this; + } + + public Builder useBranch(String newBranch) { + this.branch = newBranch; + return this; + } + + public Builder startingStrategy(StreamingStartingStrategy newStartingStrategy) { + this.startingStrategy = newStartingStrategy; + return this; + } + + public Builder startSnapshotTimestamp(Long newStartSnapshotTimestamp) { + this.startSnapshotTimestamp = newStartSnapshotTimestamp; + return this; + } + + public Builder startSnapshotId(Long newStartSnapshotId) { + this.startSnapshotId = newStartSnapshotId; + return this; + } + + public Builder endSnapshotId(Long newEndSnapshotId) { + this.endSnapshotId = newEndSnapshotId; + return this; + } + + public Builder startTag(String newStartTag) { + this.startTag = newStartTag; + return this; + } + + public Builder endTag(String newEndTag) { + this.endTag = newEndTag; + return this; + } + + public Builder asOfTimestamp(Long newAsOfTimestamp) { + this.asOfTimestamp = newAsOfTimestamp; + return this; + } + + public Builder splitSize(Long newSplitSize) { + this.splitSize = newSplitSize; + return this; + } + + public Builder splitLookback(Integer newSplitLookback) { + this.splitLookback = newSplitLookback; + return this; + } + + public Builder splitOpenFileCost(Long newSplitOpenFileCost) { + this.splitOpenFileCost = newSplitOpenFileCost; + return this; + } + + public Builder streaming(boolean streaming) { + this.isStreaming = streaming; + return this; + } + + public Builder monitorInterval(Duration newMonitorInterval) { + this.monitorInterval = newMonitorInterval; + return this; + } + + public Builder nameMapping(String newNameMapping) { + this.nameMapping = newNameMapping; + return this; + } + + public Builder project(Schema newProjectedSchema) { + this.projectedSchema = newProjectedSchema; + return this; + } + + public Builder filters(List 
newFilters) { + this.filters = newFilters; + return this; + } + + public Builder limit(long newLimit) { + this.limit = newLimit; + return this; + } + + public Builder includeColumnStats(boolean newIncludeColumnStats) { + this.includeColumnStats = newIncludeColumnStats; + return this; + } + + public Builder includeColumnStats(Collection newIncludeStatsForColumns) { + this.includeStatsForColumns = newIncludeStatsForColumns; + return this; + } + + public Builder exposeLocality(boolean newExposeLocality) { + this.exposeLocality = newExposeLocality; + return this; + } + + public Builder planParallelism(Integer parallelism) { + this.planParallelism = parallelism; + return this; + } + + public Builder maxPlanningSnapshotCount(int newMaxPlanningSnapshotCount) { + this.maxPlanningSnapshotCount = newMaxPlanningSnapshotCount; + return this; + } + + public Builder maxAllowedPlanningFailures(int newMaxAllowedPlanningFailures) { + this.maxAllowedPlanningFailures = newMaxAllowedPlanningFailures; + return this; + } + + public Builder watermarkColumn(String newWatermarkColumn) { + this.watermarkColumn = newWatermarkColumn; + return this; + } + + public Builder watermarkColumnTimeUnit(TimeUnit newWatermarkTimeUnit) { + this.watermarkColumnTimeUnit = newWatermarkTimeUnit; + return this; + } + + public Builder resolveConfig( + Table table, Map readOptions, ReadableConfig readableConfig) { + FlinkReadConf flinkReadConf = new FlinkReadConf(table, readOptions, readableConfig); + + return this.useSnapshotId(flinkReadConf.snapshotId()) + .useTag(flinkReadConf.tag()) + .useBranch(flinkReadConf.branch()) + .startTag(flinkReadConf.startTag()) + .endTag(flinkReadConf.endTag()) + .caseSensitive(flinkReadConf.caseSensitive()) + .asOfTimestamp(flinkReadConf.asOfTimestamp()) + .startingStrategy(flinkReadConf.startingStrategy()) + .startSnapshotTimestamp(flinkReadConf.startSnapshotTimestamp()) + .startSnapshotId(flinkReadConf.startSnapshotId()) + .endSnapshotId(flinkReadConf.endSnapshotId()) + 
.splitSize(flinkReadConf.splitSize()) + .splitLookback(flinkReadConf.splitLookback()) + .splitOpenFileCost(flinkReadConf.splitFileOpenCost()) + .streaming(flinkReadConf.streaming()) + .monitorInterval(flinkReadConf.monitorInterval()) + .nameMapping(flinkReadConf.nameMapping()) + .limit(flinkReadConf.limit()) + .planParallelism(flinkReadConf.workerPoolSize()) + .includeColumnStats(flinkReadConf.includeColumnStats()) + .maxPlanningSnapshotCount(flinkReadConf.maxPlanningSnapshotCount()) + .maxAllowedPlanningFailures(maxAllowedPlanningFailures) + .watermarkColumn(flinkReadConf.watermarkColumn()) + .watermarkColumnTimeUnit(flinkReadConf.watermarkColumnTimeUnit()); + } + + public ScanContext build() { + return new ScanContext( + caseSensitive, + snapshotId, + startingStrategy, + startSnapshotTimestamp, + startSnapshotId, + endSnapshotId, + asOfTimestamp, + splitSize, + splitLookback, + splitOpenFileCost, + isStreaming, + monitorInterval, + nameMapping, + projectedSchema, + filters, + limit, + includeColumnStats, + includeStatsForColumns, + exposeLocality, + planParallelism, + maxPlanningSnapshotCount, + maxAllowedPlanningFailures, + watermarkColumn, + watermarkColumnTimeUnit, + branch, + tag, + startTag, + endTag); + } + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetReaders.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetReaders.java index 6b984b1b5d..ab0b36a2c2 100644 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetReaders.java +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetReaders.java @@ -135,7 +135,7 @@ public ParquetValueReader struct( } } - return new 
RowDataReader(types, reorderedFields); + return new RowDataReader(reorderedFields); } @Override @@ -622,8 +622,8 @@ private static class RowDataReader extends ParquetValueReaders.StructReader { private final int numFields; - RowDataReader(List types, List> readers) { - super(types, readers); + RowDataReader(List> readers) { + super(readers); this.numFields = readers.size(); } diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java index 599d4cbb2a..8b99a33621 100644 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java @@ -36,8 +36,8 @@ import org.apache.iceberg.flink.FlinkSourceFilter; import org.apache.iceberg.flink.RowDataWrapper; import org.apache.iceberg.flink.data.AdaptHiveFlinkParquetReaders; -import org.apache.iceberg.flink.data.FlinkAvroReader; import org.apache.iceberg.flink.data.FlinkOrcReader; +import org.apache.iceberg.flink.data.FlinkPlannedAvroReader; import org.apache.iceberg.flink.data.RowDataProjection; import org.apache.iceberg.flink.data.RowDataUtil; import org.apache.iceberg.io.CloseableIterable; @@ -156,7 +156,7 @@ private CloseableIterable newAvroIterable( .reuseContainers() .project(schema) .split(task.start(), task.length()) - .createReaderFunc(readSchema -> new FlinkAvroReader(schema, readSchema, idToConstant)); + .createReaderFunc(ignore -> FlinkPlannedAvroReader.create(schema, idToConstant)); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/main/java/org/apache/amoro/flink/read/AdaptHiveFlinkParquetReaders.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/main/java/org/apache/amoro/flink/read/AdaptHiveFlinkParquetReaders.java index 155bda30ad..a1cf6f5741 100644 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/main/java/org/apache/amoro/flink/read/AdaptHiveFlinkParquetReaders.java +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/main/java/org/apache/amoro/flink/read/AdaptHiveFlinkParquetReaders.java @@ -135,7 +135,7 @@ public ParquetValueReader struct( } } - return new RowDataReader(types, reorderedFields); + return new RowDataReader(reorderedFields); } @Override @@ -622,8 +622,8 @@ private static class RowDataReader extends ParquetValueReaders.StructReader { private final int numFields; - RowDataReader(List types, List> readers) { - super(types, readers); + RowDataReader(List> readers) { + super(readers); this.numFields = readers.size(); } diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataRecordFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataRecordFactory.java index 024c65f80d..0e922e50dc 100644 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataRecordFactory.java +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataRecordFactory.java @@ -29,10 +29,12 @@ class RowDataRecordFactory implements RecordFactory { private final RowType rowType; private final TypeSerializer[] fieldSerializers; + private final RowData.FieldGetter[] fieldGetters; RowDataRecordFactory(RowType rowType) { this.rowType = rowType; this.fieldSerializers = 
createFieldSerializers(rowType); + this.fieldGetters = createFieldGetters(rowType); } static TypeSerializer[] createFieldSerializers(RowType rowType) { @@ -41,6 +43,14 @@ static TypeSerializer[] createFieldSerializers(RowType rowType) { .toArray(TypeSerializer[]::new); } + static RowData.FieldGetter[] createFieldGetters(RowType rowType) { + RowData.FieldGetter[] getters = new RowData.FieldGetter[rowType.getFieldCount()]; + for (int i = 0; i < rowType.getFieldCount(); i++) { + getters[i] = RowData.createFieldGetter(rowType.getTypeAt(i), i); + } + return getters; + } + @Override public RowData[] createBatch(int batchSize) { RowData[] arr = new RowData[batchSize]; @@ -57,6 +67,7 @@ public void clone(RowData from, RowData[] batch, int position) { // Clone method will allocate a new GenericRowData object // if the target object is NOT a GenericRowData. // So we should always set the clone return value back to the array. - batch[position] = RowDataUtil.clone(from, batch[position], rowType, fieldSerializers); + batch[position] = + RowDataUtil.clone(from, batch[position], rowType, fieldSerializers, fieldGetters); } } diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java new file mode 100644 index 0000000000..5a1e3d85b9 --- /dev/null +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.flink; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.CoreOptions; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.test.util.MiniClusterWithClientResource; + +/** + * Compatibility shim for tests that previously used Iceberg's removed MiniClusterResource helper. + */ +public class MiniClusterResource { + private static final int DEFAULT_TM_NUM = 1; + private static final int DEFAULT_PARALLELISM = 4; + + public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = + new Configuration().set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); + + private MiniClusterResource() {} + + public static MiniClusterWithClientResource createWithClassloaderCheckDisabled() { + return new MiniClusterWithClientResource( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(DEFAULT_TM_NUM) + .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) + .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) + .build()); + } +} diff --git a/amoro-format-mixed/amoro-mixed-flink/pom.xml b/amoro-format-mixed/amoro-mixed-flink/pom.xml index 4216ef937a..e4d98a6534 100644 --- a/amoro-format-mixed/amoro-mixed-flink/pom.xml +++ b/amoro-format-mixed/amoro-mixed-flink/pom.xml @@ -34,8 +34,10 @@ amoro-mixed-flink-common + amoro-mixed-flink-common-1.17 amoro-mixed-flink-common-format amoro-mixed-flink-common-iceberg-bridge + amoro-mixed-flink-common-iceberg-bridge-1.17 v1.16/amoro-mixed-flink-1.16 v1.16/amoro-mixed-flink-runtime-1.16 
v1.17/amoro-mixed-flink-1.17 @@ -47,6 +49,7 @@ 1.18.1 + 1.9.2 3.2.0-1.18 3.4.0 2.9.0 diff --git a/amoro-format-mixed/amoro-mixed-flink/v1.17/amoro-mixed-flink-1.17/pom.xml b/amoro-format-mixed/amoro-mixed-flink/v1.17/amoro-mixed-flink-1.17/pom.xml index 2e349a5a5d..7d7e935233 100644 --- a/amoro-format-mixed/amoro-mixed-flink/v1.17/amoro-mixed-flink-1.17/pom.xml +++ b/amoro-format-mixed/amoro-mixed-flink/v1.17/amoro-mixed-flink-1.17/pom.xml @@ -33,6 +33,7 @@ https://amoro.apache.org + 1.6.1 3.2.3 3.21.0 1.17.2 @@ -41,7 +42,7 @@ org.apache.amoro - amoro-mixed-flink-common + amoro-mixed-flink-common-1.17 ${project.parent.version} @@ -87,7 +88,7 @@ - org.apache.amoro:amoro-format-mixed-flink-common + org.apache.amoro:amoro-mixed-flink-common-1.17 diff --git a/amoro-format-mixed/amoro-mixed-flink/v1.17/amoro-mixed-flink-runtime-1.17/pom.xml b/amoro-format-mixed/amoro-mixed-flink/v1.17/amoro-mixed-flink-runtime-1.17/pom.xml index 4d5186dc36..4169e0660d 100644 --- a/amoro-format-mixed/amoro-mixed-flink/v1.17/amoro-mixed-flink-runtime-1.17/pom.xml +++ b/amoro-format-mixed/amoro-mixed-flink/v1.17/amoro-mixed-flink-runtime-1.17/pom.xml @@ -31,6 +31,7 @@ https://amoro.apache.org + 1.6.1 1.17.2 diff --git a/amoro-format-mixed/amoro-mixed-flink/v1.18/amoro-mixed-flink-1.18/pom.xml b/amoro-format-mixed/amoro-mixed-flink/v1.18/amoro-mixed-flink-1.18/pom.xml index 69d90fc205..95f82a7e1d 100644 --- a/amoro-format-mixed/amoro-mixed-flink/v1.18/amoro-mixed-flink-1.18/pom.xml +++ b/amoro-format-mixed/amoro-mixed-flink/v1.18/amoro-mixed-flink-1.18/pom.xml @@ -33,6 +33,7 @@ https://amoro.apache.org + 1.9.2 3.2.3 3.21.0 1.18.1 diff --git a/amoro-format-mixed/amoro-mixed-flink/v1.18/amoro-mixed-flink-runtime-1.18/pom.xml b/amoro-format-mixed/amoro-mixed-flink/v1.18/amoro-mixed-flink-runtime-1.18/pom.xml index d2e7155222..64c3b2e116 100644 --- a/amoro-format-mixed/amoro-mixed-flink/v1.18/amoro-mixed-flink-runtime-1.18/pom.xml +++ 
b/amoro-format-mixed/amoro-mixed-flink/v1.18/amoro-mixed-flink-runtime-1.18/pom.xml @@ -31,6 +31,7 @@ https://amoro.apache.org + 1.9.2 1.18.1 3.2.0-1.18 diff --git a/amoro-format-mixed/amoro-mixed-hive/src/main/java/org/apache/iceberg/data/parquet/AdaptHiveGenericParquetReaders.java b/amoro-format-mixed/amoro-mixed-hive/src/main/java/org/apache/iceberg/data/parquet/AdaptHiveGenericParquetReaders.java index ef42d99fa1..76e0b45d07 100644 --- a/amoro-format-mixed/amoro-mixed-hive/src/main/java/org/apache/iceberg/data/parquet/AdaptHiveGenericParquetReaders.java +++ b/amoro-format-mixed/amoro-mixed-hive/src/main/java/org/apache/iceberg/data/parquet/AdaptHiveGenericParquetReaders.java @@ -50,14 +50,14 @@ public static ParquetValueReader buildReader( @Override protected ParquetValueReader createStructReader( List types, List> fieldReaders, StructType structType) { - return new RecordReader(types, fieldReaders, structType); + return new RecordReader(fieldReaders, structType); } private static class RecordReader extends StructReader { private final StructType structType; - RecordReader(List types, List> readers, StructType struct) { - super(types, readers); + RecordReader(List> readers, StructType struct) { + super(readers); this.structType = struct; } diff --git a/amoro-format-mixed/amoro-mixed-hive/src/main/java/org/apache/iceberg/parquet/AdaptHiveParquetReader.java b/amoro-format-mixed/amoro-mixed-hive/src/main/java/org/apache/iceberg/parquet/AdaptHiveParquetReader.java index f1dd2da9de..5f6e087cc4 100644 --- a/amoro-format-mixed/amoro-mixed-hive/src/main/java/org/apache/iceberg/parquet/AdaptHiveParquetReader.java +++ b/amoro-format-mixed/amoro-mixed-hive/src/main/java/org/apache/iceberg/parquet/AdaptHiveParquetReader.java @@ -106,8 +106,6 @@ private static class FileIterator implements CloseableIterator { private final ParquetValueReader model; private final long totalValues; private final boolean reuseContainers; - private final long[] rowGroupsStartRowPos; - private 
int nextRowGroup = 0; private long nextRowGroupStart = 0; private long valuesRead = 0; @@ -119,7 +117,6 @@ private static class FileIterator implements CloseableIterator { this.model = conf.model(); this.totalValues = conf.totalValues(); this.reuseContainers = conf.reuseContainers(); - this.rowGroupsStartRowPos = conf.startRowPositions(); } @Override @@ -156,11 +153,10 @@ private void advance() { throw new RuntimeIOException(e); } - long rowPosition = rowGroupsStartRowPos[nextRowGroup]; nextRowGroupStart += pages.getRowCount(); nextRowGroup += 1; - model.setPageSource(pages, rowPosition); + model.setPageSource(pages); } @Override diff --git a/amoro-format-mixed/amoro-mixed-hive/src/main/java/org/apache/iceberg/parquet/AdaptHiveParquetSchemaUtil.java b/amoro-format-mixed/amoro-mixed-hive/src/main/java/org/apache/iceberg/parquet/AdaptHiveParquetSchemaUtil.java index 9ef8c1cad4..a3f378e5f8 100644 --- a/amoro-format-mixed/amoro-mixed-hive/src/main/java/org/apache/iceberg/parquet/AdaptHiveParquetSchemaUtil.java +++ b/amoro-format-mixed/amoro-mixed-hive/src/main/java/org/apache/iceberg/parquet/AdaptHiveParquetSchemaUtil.java @@ -79,7 +79,9 @@ private static Schema convertInternal( public static MessageType pruneColumns(MessageType fileSchema, Schema expectedSchema) { // column order must match the incoming type, so it doesn't matter that the ids are unordered Set selectedIds = TypeUtil.getProjectedIds(expectedSchema); - return (MessageType) ParquetTypeVisitor.visit(fileSchema, new PruneColumns(selectedIds)); + return (MessageType) + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), fileSchema, new PruneColumns(selectedIds)); } /** diff --git a/amoro-format-mixed/amoro-mixed-spark/v3.3/amoro-mixed-spark-3.3/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java b/amoro-format-mixed/amoro-mixed-spark/v3.3/amoro-mixed-spark-3.3/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java index 46f3e6f4fa..ef33a3d301 100644 --- 
a/amoro-format-mixed/amoro-mixed-spark/v3.3/amoro-mixed-spark-3.3/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java +++ b/amoro-format-mixed/amoro-mixed-spark/v3.3/amoro-mixed-spark-3.3/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java @@ -104,7 +104,7 @@ public ParquetValueReader struct( types.add(fieldType); } - return new InternalRowReader(types, newFields); + return new InternalRowReader(newFields); } } @@ -169,7 +169,7 @@ public ParquetValueReader struct( } } - return new InternalRowReader(types, reorderedFields); + return new InternalRowReader(reorderedFields); } @Override @@ -526,8 +526,8 @@ private static class InternalRowReader extends ParquetValueReaders.StructReader { private final int numFields; - InternalRowReader(List types, List> readers) { - super(types, readers); + InternalRowReader(List> readers) { + super(readers); this.numFields = readers.size(); } diff --git a/amoro-format-mixed/amoro-mixed-spark/v3.4/amoro-mixed-spark-3.4/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java b/amoro-format-mixed/amoro-mixed-spark/v3.4/amoro-mixed-spark-3.4/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java index 46f3e6f4fa..ef33a3d301 100644 --- a/amoro-format-mixed/amoro-mixed-spark/v3.4/amoro-mixed-spark-3.4/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java +++ b/amoro-format-mixed/amoro-mixed-spark/v3.4/amoro-mixed-spark-3.4/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java @@ -104,7 +104,7 @@ public ParquetValueReader struct( types.add(fieldType); } - return new InternalRowReader(types, newFields); + return new InternalRowReader(newFields); } } @@ -169,7 +169,7 @@ public ParquetValueReader struct( } } - return new InternalRowReader(types, reorderedFields); + return new InternalRowReader(reorderedFields); } @Override @@ -526,8 +526,8 @@ private static class InternalRowReader extends ParquetValueReaders.StructReader { private final int numFields; - 
InternalRowReader(List types, List> readers) { - super(types, readers); + InternalRowReader(List> readers) { + super(readers); this.numFields = readers.size(); } diff --git a/amoro-format-mixed/amoro-mixed-spark/v3.5/amoro-mixed-spark-3.5/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java b/amoro-format-mixed/amoro-mixed-spark/v3.5/amoro-mixed-spark-3.5/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java index 46f3e6f4fa..ef33a3d301 100644 --- a/amoro-format-mixed/amoro-mixed-spark/v3.5/amoro-mixed-spark-3.5/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java +++ b/amoro-format-mixed/amoro-mixed-spark/v3.5/amoro-mixed-spark-3.5/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java @@ -104,7 +104,7 @@ public ParquetValueReader struct( types.add(fieldType); } - return new InternalRowReader(types, newFields); + return new InternalRowReader(newFields); } } @@ -169,7 +169,7 @@ public ParquetValueReader struct( } } - return new InternalRowReader(types, reorderedFields); + return new InternalRowReader(reorderedFields); } @Override @@ -526,8 +526,8 @@ private static class InternalRowReader extends ParquetValueReaders.StructReader { private final int numFields; - InternalRowReader(List types, List> readers) { - super(types, readers); + InternalRowReader(List> readers) { + super(readers); this.numFields = readers.size(); } diff --git a/pom.xml b/pom.xml index 070c9c0f1d..c16d2b11ca 100644 --- a/pom.xml +++ b/pom.xml @@ -102,7 +102,7 @@ 3.3.2 3.3.1 - 1.6.1 + 1.10.1 1.2.0 3.1.3 3.4.2 @@ -126,8 +126,8 @@ 5.7.0 4.11.0 1.21.4 - 1.13.1 - 1.15.2 + 1.16.0 + 1.16.0 8.0.33 1.9.7 2.24.12 @@ -397,7 +397,6 @@ ${parquet-avro.version} - org.apache.parquet parquet-jackson @@ -1840,6 +1839,9 @@ 2.3.8 2.10.2 + 1.8.1 + 1.15.2 + 1.15.2 3.3.4 3.3 hadoop-client @@ -1874,6 +1876,7 @@ 3.3.4 3.3 + 1.8.1 From 491cefb6e07400ad89614c715cc4c303d947f9a2 Mon Sep 17 00:00:00 2001 From: xuba Date: Fri, 6 Mar 2026 01:06:15 +0800 Subject: 
[PATCH 2/7] chore: trigger CI From c61f3eee790bcb11cc92dab14bf5e800d5b1e556 Mon Sep 17 00:00:00 2001 From: xuba Date: Fri, 6 Mar 2026 10:37:12 +0800 Subject: [PATCH 3/7] [AMORO-4110] Align Spark mixed module with Avro 1.12.0 for Iceberg 1.10.x --- .../amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/amoro-format-mixed/amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml b/amoro-format-mixed/amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml index ad5668c54d..0032a96ad0 100644 --- a/amoro-format-mixed/amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml +++ b/amoro-format-mixed/amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml @@ -45,6 +45,13 @@ ${project.version} + + + org.apache.avro + avro + 1.12.0 + + org.apache.spark spark-sql_${scala.binary.version} From 8a522114d6b69be8f244795b247e6423f9f91a1b Mon Sep 17 00:00:00 2001 From: xuba Date: Fri, 6 Mar 2026 12:02:52 +0800 Subject: [PATCH 4/7] Upgrade Iceberg version to 1.10.x and implement deleteFiles method for compatibility with new interface --- .../apache/amoro/op/MixedOverwriteFiles.java | 24 +++++++++++++++++++ .../amoro/hive/op/OverwriteHiveFiles.java | 23 ++++++++++++++++++ .../amoro-mixed-spark-3-common/pom.xml | 2 +- 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/amoro-format-iceberg/src/main/java/org/apache/amoro/op/MixedOverwriteFiles.java b/amoro-format-iceberg/src/main/java/org/apache/amoro/op/MixedOverwriteFiles.java index 8555938791..11974c9be9 100644 --- a/amoro-format-iceberg/src/main/java/org/apache/amoro/op/MixedOverwriteFiles.java +++ b/amoro-format-iceberg/src/main/java/org/apache/amoro/op/MixedOverwriteFiles.java @@ -24,6 +24,8 @@ import org.apache.iceberg.Table; import org.apache.iceberg.Transaction; import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.util.DataFileSet; +import org.apache.iceberg.util.DeleteFileSet; import java.util.function.Supplier; @@ -69,6 +71,28 @@ public 
OverwriteFiles deleteFile(DataFile file) { return this; } + // Note: Do not add @Override here. This method implements the default method + // OverwriteFiles.deleteFiles(DataFileSet, DeleteFileSet) introduced in Iceberg 1.10.x. + // The @Override annotation would fail compilation against Iceberg 1.8.x (hadoop2/spark-3.3 + // profiles) where this interface method does not exist. At runtime on 1.10.x+, this method + // is still resolved correctly via dynamic dispatch. + @SuppressWarnings("unused") + public OverwriteFiles deleteFiles( + DataFileSet dataFilesToDelete, DeleteFileSet deleteFilesToDelete) { + try { + overwriteFiles + .getClass() + .getMethod("deleteFiles", DataFileSet.class, DeleteFileSet.class) + .invoke(overwriteFiles, dataFilesToDelete, deleteFilesToDelete); + } catch (ReflectiveOperationException e) { + throw new UnsupportedOperationException( + "Deleting data and delete files is not supported by the underlying implementation", e); + } + dataFilesToDelete.forEach(this::deleteIcebergDataFile); + deleteFilesToDelete.forEach(this::deleteIcebergDeleteFile); + return this; + } + @Override public OverwriteFiles validateAddedFilesMatchOverwriteFilter() { overwriteFiles.validateAddedFilesMatchOverwriteFilter(); diff --git a/amoro-format-mixed/amoro-mixed-hive/src/main/java/org/apache/amoro/hive/op/OverwriteHiveFiles.java b/amoro-format-mixed/amoro-mixed-hive/src/main/java/org/apache/amoro/hive/op/OverwriteHiveFiles.java index a5595197ce..295e22c2a2 100644 --- a/amoro-format-mixed/amoro-mixed-hive/src/main/java/org/apache/amoro/hive/op/OverwriteHiveFiles.java +++ b/amoro-format-mixed/amoro-mixed-hive/src/main/java/org/apache/amoro/hive/op/OverwriteHiveFiles.java @@ -26,6 +26,8 @@ import org.apache.iceberg.Transaction; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.util.DataFileSet; +import org.apache.iceberg.util.DeleteFileSet; import java.util.List; @@ -82,6 +84,27 @@ public 
OverwriteFiles deleteFile(DataFile file) { return this; } + // Note: Do not add @Override here. This method implements the default method + // OverwriteFiles.deleteFiles(DataFileSet, DeleteFileSet) introduced in Iceberg 1.10.x. + // The @Override annotation would fail compilation against Iceberg 1.8.x (hadoop2/spark-3.3 + // profiles) where this interface method does not exist. At runtime on 1.10.x+, this method + // is still resolved correctly via dynamic dispatch. + @SuppressWarnings("unused") + public OverwriteFiles deleteFiles( + DataFileSet dataFilesToDelete, DeleteFileSet deleteFilesToDelete) { + try { + delegate + .getClass() + .getMethod("deleteFiles", DataFileSet.class, DeleteFileSet.class) + .invoke(delegate, dataFilesToDelete, deleteFilesToDelete); + } catch (ReflectiveOperationException e) { + throw new UnsupportedOperationException( + "Deleting data and delete files is not supported by the underlying implementation", e); + } + dataFilesToDelete.stream().filter(this::isHiveDataFile).forEach(this.deleteFiles::add); + return this; + } + @Override public OverwriteFiles validateAddedFilesMatchOverwriteFilter() { delegate.validateAddedFilesMatchOverwriteFilter(); diff --git a/amoro-format-mixed/amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml b/amoro-format-mixed/amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml index 0032a96ad0..72c681e001 100644 --- a/amoro-format-mixed/amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml +++ b/amoro-format-mixed/amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml @@ -49,7 +49,7 @@ org.apache.avro avro - 1.12.0 + 1.12.1 From 186ad77878764593eeba208142792175c3ce6047 Mon Sep 17 00:00:00 2001 From: xuba Date: Fri, 6 Mar 2026 12:09:12 +0800 Subject: [PATCH 5/7] Upgrade Iceberg dependencies to version 1.8.1 and 1.10.1, and update Avro and Parquet libraries --- dev/deps/dependencies-hadoop-2-spark-3.3 | 37 ++++++++--------- dev/deps/dependencies-hadoop-3-spark-3.5 | 51 +++++++++++++----------- 2 files changed, 46 
insertions(+), 42 deletions(-) diff --git a/dev/deps/dependencies-hadoop-2-spark-3.3 b/dev/deps/dependencies-hadoop-2-spark-3.3 index cd86dfc6f8..8e6f46a0cd 100644 --- a/dev/deps/dependencies-hadoop-2-spark-3.3 +++ b/dev/deps/dependencies-hadoop-2-spark-3.3 @@ -30,7 +30,7 @@ async-profiler/2.9//async-profiler-2.9.jar auth/2.24.12//auth-2.24.12.jar avro-ipc/1.11.0//avro-ipc-1.11.0.jar avro-mapred/1.11.0//avro-mapred-1.11.0.jar -avro/1.11.3//avro-1.11.3.jar +avro/1.12.0//avro-1.12.0.jar aws-core/2.24.12//aws-core-2.24.12.jar aws-json-protocol/2.24.12//aws-json-protocol-2.24.12.jar aws-query-protocol/2.24.12//aws-query-protocol-2.24.12.jar @@ -80,6 +80,7 @@ ehcache/3.3.1//ehcache-3.3.1.jar endpoints-spi/2.24.12//endpoints-spi-2.24.12.jar error_prone_annotations/2.18.0//error_prone_annotations-2.18.0.jar eventstream/1.0.1//eventstream-1.0.1.jar +failsafe/3.3.2//failsafe-3.3.2.jar failureaccess/1.0.1//failureaccess-1.0.1.jar flatbuffers-java/23.5.26//flatbuffers-java-23.5.26.jar flink-annotations/1.20.3//flink-annotations-1.20.3.jar @@ -143,23 +144,23 @@ http-auth-spi/2.24.12//http-auth-spi-2.24.12.jar http-auth/2.24.12//http-auth-2.24.12.jar http-client-spi/2.24.12//http-client-spi-2.24.12.jar httpclient/4.5.13//httpclient-4.5.13.jar -httpclient5/5.3.1//httpclient5-5.3.1.jar +httpclient5/5.4.1//httpclient5-5.4.1.jar httpcore/4.4.13//httpcore-4.4.13.jar -httpcore5-h2/5.2.4//httpcore5-h2-5.2.4.jar -httpcore5/5.2.4//httpcore5-5.2.4.jar -iceberg-aliyun/1.6.1//iceberg-aliyun-1.6.1.jar -iceberg-api/1.6.1//iceberg-api-1.6.1.jar -iceberg-arrow/1.6.1//iceberg-arrow-1.6.1.jar -iceberg-aws/1.6.1//iceberg-aws-1.6.1.jar -iceberg-bundled-guava/1.6.1//iceberg-bundled-guava-1.6.1.jar -iceberg-common/1.6.1//iceberg-common-1.6.1.jar -iceberg-core/1.6.1//iceberg-core-1.6.1.jar -iceberg-data/1.6.1//iceberg-data-1.6.1.jar -iceberg-hive-metastore/1.6.1//iceberg-hive-metastore-1.6.1.jar -iceberg-orc/1.6.1//iceberg-orc-1.6.1.jar -iceberg-parquet/1.6.1//iceberg-parquet-1.6.1.jar 
-iceberg-spark-3.3_2.12/1.6.1//iceberg-spark-3.3_2.12-1.6.1.jar -iceberg-spark-extensions-3.3_2.12/1.6.1//iceberg-spark-extensions-3.3_2.12-1.6.1.jar +httpcore5-h2/5.3.1//httpcore5-h2-5.3.1.jar +httpcore5/5.3.1//httpcore5-5.3.1.jar +iceberg-aliyun/1.8.1//iceberg-aliyun-1.8.1.jar +iceberg-api/1.8.1//iceberg-api-1.8.1.jar +iceberg-arrow/1.8.1//iceberg-arrow-1.8.1.jar +iceberg-aws/1.8.1//iceberg-aws-1.8.1.jar +iceberg-bundled-guava/1.8.1//iceberg-bundled-guava-1.8.1.jar +iceberg-common/1.8.1//iceberg-common-1.8.1.jar +iceberg-core/1.8.1//iceberg-core-1.8.1.jar +iceberg-data/1.8.1//iceberg-data-1.8.1.jar +iceberg-hive-metastore/1.8.1//iceberg-hive-metastore-1.8.1.jar +iceberg-orc/1.8.1//iceberg-orc-1.8.1.jar +iceberg-parquet/1.8.1//iceberg-parquet-1.8.1.jar +iceberg-spark-3.3_2.12/1.8.1//iceberg-spark-3.3_2.12-1.8.1.jar +iceberg-spark-extensions-3.3_2.12/1.8.1//iceberg-spark-extensions-3.3_2.12-1.8.1.jar icu4j/69.1//icu4j-69.1.jar identity-spi/2.24.12//identity-spi-2.24.12.jar ivy/2.5.1//ivy-2.5.1.jar @@ -358,7 +359,7 @@ parquet-common/1.15.2//parquet-common-1.15.2.jar parquet-encoding/1.15.2//parquet-encoding-1.15.2.jar parquet-format-structures/1.15.2//parquet-format-structures-1.15.2.jar parquet-hadoop/1.15.2//parquet-hadoop-1.15.2.jar -parquet-jackson/1.13.1//parquet-jackson-1.13.1.jar +parquet-jackson/1.15.2//parquet-jackson-1.15.2.jar pickle/1.2//pickle-1.2.jar postgresql/42.7.2//postgresql-42.7.2.jar profiles/2.24.12//profiles-2.24.12.jar diff --git a/dev/deps/dependencies-hadoop-3-spark-3.5 b/dev/deps/dependencies-hadoop-3-spark-3.5 index 2facef79f9..24a4eb5064 100644 --- a/dev/deps/dependencies-hadoop-3-spark-3.5 +++ b/dev/deps/dependencies-hadoop-3-spark-3.5 @@ -25,7 +25,7 @@ async-profiler/2.9//async-profiler-2.9.jar auth/2.24.12//auth-2.24.12.jar avro-ipc/1.11.4//avro-ipc-1.11.4.jar avro-mapred/1.11.4//avro-mapred-1.11.4.jar -avro/1.11.3//avro-1.11.3.jar +avro/1.12.0//avro-1.12.0.jar aws-core/2.24.12//aws-core-2.24.12.jar 
aws-json-protocol/2.24.12//aws-json-protocol-2.24.12.jar aws-query-protocol/2.24.12//aws-query-protocol-2.24.12.jar @@ -73,6 +73,7 @@ eclipse-collections/11.1.0//eclipse-collections-11.1.0.jar endpoints-spi/2.24.12//endpoints-spi-2.24.12.jar error_prone_annotations/2.18.0//error_prone_annotations-2.18.0.jar eventstream/1.0.1//eventstream-1.0.1.jar +failsafe/3.3.2//failsafe-3.3.2.jar failureaccess/1.0.1//failureaccess-1.0.1.jar flatbuffers-java/23.5.26//flatbuffers-java-23.5.26.jar flatbuffers/1.2.0-3f79e055//flatbuffers-1.2.0-3f79e055.jar @@ -126,23 +127,23 @@ http-auth-spi/2.24.12//http-auth-spi-2.24.12.jar http-auth/2.24.12//http-auth-2.24.12.jar http-client-spi/2.24.12//http-client-spi-2.24.12.jar httpclient/4.5.14//httpclient-4.5.14.jar -httpclient5/5.3.1//httpclient5-5.3.1.jar +httpclient5/5.5//httpclient5-5.5.jar httpcore/4.4.16//httpcore-4.4.16.jar -httpcore5-h2/5.2.4//httpcore5-h2-5.2.4.jar -httpcore5/5.2.4//httpcore5-5.2.4.jar -iceberg-aliyun/1.6.1//iceberg-aliyun-1.6.1.jar -iceberg-api/1.6.1//iceberg-api-1.6.1.jar -iceberg-arrow/1.6.1//iceberg-arrow-1.6.1.jar -iceberg-aws/1.6.1//iceberg-aws-1.6.1.jar -iceberg-bundled-guava/1.6.1//iceberg-bundled-guava-1.6.1.jar -iceberg-common/1.6.1//iceberg-common-1.6.1.jar -iceberg-core/1.6.1//iceberg-core-1.6.1.jar -iceberg-data/1.6.1//iceberg-data-1.6.1.jar -iceberg-hive-metastore/1.6.1//iceberg-hive-metastore-1.6.1.jar -iceberg-orc/1.6.1//iceberg-orc-1.6.1.jar -iceberg-parquet/1.6.1//iceberg-parquet-1.6.1.jar -iceberg-spark-3.5_2.12/1.6.1//iceberg-spark-3.5_2.12-1.6.1.jar -iceberg-spark-extensions-3.5_2.12/1.6.1//iceberg-spark-extensions-3.5_2.12-1.6.1.jar +httpcore5-h2/5.3.4//httpcore5-h2-5.3.4.jar +httpcore5/5.3.4//httpcore5-5.3.4.jar +iceberg-aliyun/1.10.1//iceberg-aliyun-1.10.1.jar +iceberg-api/1.10.1//iceberg-api-1.10.1.jar +iceberg-arrow/1.10.1//iceberg-arrow-1.10.1.jar +iceberg-aws/1.10.1//iceberg-aws-1.10.1.jar +iceberg-bundled-guava/1.10.1//iceberg-bundled-guava-1.10.1.jar 
+iceberg-common/1.10.1//iceberg-common-1.10.1.jar +iceberg-core/1.10.1//iceberg-core-1.10.1.jar +iceberg-data/1.10.1//iceberg-data-1.10.1.jar +iceberg-hive-metastore/1.10.1//iceberg-hive-metastore-1.10.1.jar +iceberg-orc/1.10.1//iceberg-orc-1.10.1.jar +iceberg-parquet/1.10.1//iceberg-parquet-1.10.1.jar +iceberg-spark-3.5_2.12/1.10.1//iceberg-spark-3.5_2.12-1.10.1.jar +iceberg-spark-extensions-3.5_2.12/1.10.1//iceberg-spark-extensions-3.5_2.12-1.10.1.jar icu4j/69.1//icu4j-69.1.jar identity-spi/2.24.12//identity-spi-2.24.12.jar ivy/2.5.1//ivy-2.5.1.jar @@ -197,6 +198,7 @@ json4s-scalap_2.12/3.7.0-M11//json4s-scalap_2.12-3.7.0-M11.jar jsqlparser/4.7//jsqlparser-4.7.jar jsr305/3.0.0//jsr305-3.0.0.jar jta/1.1//jta-1.1.jar +jts-core/1.20.0//jts-core-1.20.0.jar junit-jupiter-api/5.9.1//junit-jupiter-api-5.9.1.jar junit-jupiter-engine/5.9.1//junit-jupiter-engine-5.9.1.jar junit-jupiter-params/5.9.1//junit-jupiter-params-5.9.1.jar @@ -320,13 +322,14 @@ oro/2.0.8//oro-2.0.8.jar osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar pagehelper/6.1.0//pagehelper-6.1.0.jar paranamer/2.8//paranamer-2.8.jar -parquet-avro/1.15.2//parquet-avro-1.15.2.jar -parquet-column/1.15.2//parquet-column-1.15.2.jar -parquet-common/1.15.2//parquet-common-1.15.2.jar -parquet-encoding/1.15.2//parquet-encoding-1.15.2.jar -parquet-format-structures/1.15.2//parquet-format-structures-1.15.2.jar -parquet-hadoop/1.15.2//parquet-hadoop-1.15.2.jar -parquet-jackson/1.13.1//parquet-jackson-1.13.1.jar +parquet-avro/1.16.0//parquet-avro-1.16.0.jar +parquet-column/1.16.0//parquet-column-1.16.0.jar +parquet-common/1.16.0//parquet-common-1.16.0.jar +parquet-encoding/1.16.0//parquet-encoding-1.16.0.jar +parquet-format-structures/1.16.0//parquet-format-structures-1.16.0.jar +parquet-hadoop/1.16.0//parquet-hadoop-1.16.0.jar +parquet-jackson/1.16.0//parquet-jackson-1.16.0.jar +parquet-variant/1.16.0//parquet-variant-1.16.0.jar pickle/1.3//pickle-1.3.jar postgresql/42.7.2//postgresql-42.7.2.jar 
profiles/2.24.12//profiles-2.24.12.jar From 1e4e9910ba60d69cb471fa904b4c13458fc4e738 Mon Sep 17 00:00:00 2001 From: xuba Date: Fri, 6 Mar 2026 13:51:29 +0800 Subject: [PATCH 6/7] Refactor mixed-format operation rules for Iceberg 1.10.x compatibility --- .../amoro-mixed-spark-3-common/pom.xml | 2 +- .../amoro/spark/MixedFormatSparkExtensions.scala | 10 ++++++++-- .../amoro/spark/MixedFormatSparkExtensions.scala | 10 ++++++++-- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/amoro-format-mixed/amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml b/amoro-format-mixed/amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml index 72c681e001..0032a96ad0 100644 --- a/amoro-format-mixed/amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml +++ b/amoro-format-mixed/amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml @@ -49,7 +49,7 @@ org.apache.avro avro - 1.12.1 + 1.12.0 diff --git a/amoro-format-mixed/amoro-mixed-spark/v3.4/amoro-mixed-spark-3.4/src/main/scala/org/apache/amoro/spark/MixedFormatSparkExtensions.scala b/amoro-format-mixed/amoro-mixed-spark/v3.4/amoro-mixed-spark-3.4/src/main/scala/org/apache/amoro/spark/MixedFormatSparkExtensions.scala index acf956f241..2db295c827 100644 --- a/amoro-format-mixed/amoro-mixed-spark/v3.4/amoro-mixed-spark-3.4/src/main/scala/org/apache/amoro/spark/MixedFormatSparkExtensions.scala +++ b/amoro-format-mixed/amoro-mixed-spark/v3.4/amoro-mixed-spark-3.4/src/main/scala/org/apache/amoro/spark/MixedFormatSparkExtensions.scala @@ -43,11 +43,17 @@ class MixedFormatSparkExtensions extends (SparkSessionExtensions => Unit) { extensions.injectPostHocResolutionRule(spark => RewriteMixedFormatCommand(spark)) + // mixed-format row-level operation rewrite rules + // These must be resolution rules (not optimizer rules) so they run BEFORE Iceberg 1.10.x's + // RewriteUpdateTableForRowLineage and RewriteMergeIntoTableForRowLineage rules. 
Those Iceberg + // rules do pattern matching on the table and throw scala.MatchError for non-SparkTable types + // (i.e., MixedSparkTable). + extensions.injectResolutionRule { spark => RewriteUpdateMixedFormatTable(spark) } + extensions.injectResolutionRule { spark => RewriteDeleteFromMixedFormatTable(spark) } + // mixed-format optimizer rules extensions.injectPostHocResolutionRule { spark => QueryWithConstraintCheck(spark) } extensions.injectOptimizerRule { spark => RewriteAppendMixedFormatTable(spark) } - extensions.injectOptimizerRule { spark => RewriteDeleteFromMixedFormatTable(spark) } - extensions.injectOptimizerRule { spark => RewriteUpdateMixedFormatTable(spark) } // planner extensions extensions.injectPlannerStrategy { spark => MixedFormatExtendedDataSourceV2Strategy(spark) } diff --git a/amoro-format-mixed/amoro-mixed-spark/v3.5/amoro-mixed-spark-3.5/src/main/scala/org/apache/amoro/spark/MixedFormatSparkExtensions.scala b/amoro-format-mixed/amoro-mixed-spark/v3.5/amoro-mixed-spark-3.5/src/main/scala/org/apache/amoro/spark/MixedFormatSparkExtensions.scala index acf956f241..2db295c827 100644 --- a/amoro-format-mixed/amoro-mixed-spark/v3.5/amoro-mixed-spark-3.5/src/main/scala/org/apache/amoro/spark/MixedFormatSparkExtensions.scala +++ b/amoro-format-mixed/amoro-mixed-spark/v3.5/amoro-mixed-spark-3.5/src/main/scala/org/apache/amoro/spark/MixedFormatSparkExtensions.scala @@ -43,11 +43,17 @@ class MixedFormatSparkExtensions extends (SparkSessionExtensions => Unit) { extensions.injectPostHocResolutionRule(spark => RewriteMixedFormatCommand(spark)) + // mixed-format row-level operation rewrite rules + // These must be resolution rules (not optimizer rules) so they run BEFORE Iceberg 1.10.x's + // RewriteUpdateTableForRowLineage and RewriteMergeIntoTableForRowLineage rules. Those Iceberg + // rules do pattern matching on the table and throw scala.MatchError for non-SparkTable types + // (i.e., MixedSparkTable). 
+ extensions.injectResolutionRule { spark => RewriteUpdateMixedFormatTable(spark) } + extensions.injectResolutionRule { spark => RewriteDeleteFromMixedFormatTable(spark) } + // mixed-format optimizer rules extensions.injectPostHocResolutionRule { spark => QueryWithConstraintCheck(spark) } extensions.injectOptimizerRule { spark => RewriteAppendMixedFormatTable(spark) } - extensions.injectOptimizerRule { spark => RewriteDeleteFromMixedFormatTable(spark) } - extensions.injectOptimizerRule { spark => RewriteUpdateMixedFormatTable(spark) } // planner extensions extensions.injectPlannerStrategy { spark => MixedFormatExtendedDataSourceV2Strategy(spark) } From d8c42a1484d075d214339e584419a1e7ea4ad83c Mon Sep 17 00:00:00 2001 From: xuba Date: Fri, 13 Mar 2026 16:39:58 +0800 Subject: [PATCH 7/7] [AMORO-4110] Split Spark and Flink Iceberg upgrades from current PR --- amoro-format-iceberg/pom.xml | 6 + .../amoro-mixed-flink-common-1.17/pom.xml | 453 ----- .../apache/amoro/flink/FlinkSchemaUtil.java | 438 ---- .../amoro/flink/InternalCatalogBuilder.java | 189 -- .../flink/catalog/FlinkUnifiedCatalog.java | 550 ----- .../amoro/flink/catalog/MixedCatalog.java | 792 -------- .../factories/CatalogFactoryOptions.java | 45 - .../factories/FlinkUnifiedCatalogFactory.java | 125 -- .../iceberg/IcebergFlinkCatalogFactory.java | 39 - .../mixed/MixedHiveCatalogFactory.java | 34 - .../mixed/MixedIcebergCatalogFactory.java | 74 - .../paimon/PaimonFlinkCatalogFactory.java | 55 - ...FlinkTablePropertiesInvocationHandler.java | 86 - .../interceptor/KerberosInterceptor.java | 57 - .../KerberosInvocationHandler.java | 70 - .../amoro/flink/interceptor/ProxyFactory.java | 48 - .../flink/lookup/BasicLookupFunction.java | 263 --- .../BinaryRowDataSerializerWrapper.java | 81 - .../flink/lookup/ByteArraySetSerializer.java | 89 - .../amoro/flink/lookup/ByteArrayWrapper.java | 246 --- .../apache/amoro/flink/lookup/KVTable.java | 80 - .../amoro/flink/lookup/KVTableFactory.java | 83 - 
.../apache/amoro/flink/lookup/KeyRowData.java | 133 -- .../amoro/flink/lookup/LookupMetrics.java | 27 - .../amoro/flink/lookup/LookupOptions.java | 133 -- .../amoro/flink/lookup/LookupRecord.java | 53 - .../MixedFormatRowDataLookupFunction.java | 81 - .../amoro/flink/lookup/RocksDBCacheState.java | 342 ---- .../flink/lookup/RocksDBRecordState.java | 156 -- .../flink/lookup/RocksDBSetSpilledState.java | 230 --- .../amoro/flink/lookup/RocksDBSetState.java | 137 -- .../flink/lookup/RowDataStateFactory.java | 98 - .../flink/lookup/SecondaryIndexTable.java | 170 -- .../amoro/flink/lookup/TableFactory.java | 36 - .../amoro/flink/lookup/UniqueIndexTable.java | 153 -- .../flink/lookup/filter/RowDataPredicate.java | 307 --- .../RowDataPredicateExpressionVisitor.java | 287 --- .../amoro/flink/metric/MetricConstant.java | 36 - .../amoro/flink/metric/MetricsGenerator.java | 128 -- .../planner/calcite/FlinkTypeSystem.java | 215 -- .../read/AdaptHiveFlinkParquetReaders.java | 873 -------- .../amoro/flink/read/FlinkSplitPlanner.java | 288 --- .../amoro/flink/read/MixedFormatSource.java | 132 -- .../flink/read/MixedIncrementalLoader.java | 119 -- .../flink/read/PartitionAndNodeGroup.java | 119 -- .../hybrid/assigner/ShuffleSplitAssigner.java | 342 ---- .../flink/read/hybrid/assigner/Split.java | 83 - .../read/hybrid/assigner/SplitAssigner.java | 66 - .../hybrid/assigner/StaticSplitAssigner.java | 134 -- .../AbstractMixedFormatEnumerator.java | 183 -- .../ContinuousEnumerationResult.java | 82 - .../enumerator/ContinuousSplitPlanner.java | 43 - .../ContinuousSplitPlannerImpl.java | 127 -- .../InitializationFinishedEvent.java | 29 - .../MergeOnReadIncrementalPlanner.java | 67 - .../enumerator/MergeOnReadPlannerImpl.java | 83 - .../MixedFormatEnumeratorOffset.java | 94 - ...MixedFormatEnumeratorOffsetSerializer.java | 91 - .../MixedFormatSourceEnumState.java | 67 - .../MixedFormatSourceEnumStateSerializer.java | 160 -- .../MixedFormatSourceEnumerator.java | 264 --- 
.../StaticMixedFormatSourceEnumerator.java | 94 - .../read/hybrid/reader/ArrayBatchRecords.java | 206 -- .../reader/ArrayPoolDataIteratorBatcher.java | 138 -- .../hybrid/reader/DataIteratorBatcher.java | 37 - .../reader/DataIteratorReaderFunction.java | 59 - .../read/hybrid/reader/HybridSplitReader.java | 132 -- .../reader/MixedFormatRecordEmitter.java | 76 - .../reader/MixedFormatRecordWithOffset.java | 66 - .../reader/MixedFormatSourceReader.java | 193 -- .../read/hybrid/reader/ReaderFunction.java | 37 - .../hybrid/reader/ReaderStartedEvent.java | 28 - .../read/hybrid/reader/RecordFactory.java | 34 - .../read/hybrid/reader/RecordPosition.java | 61 - .../hybrid/reader/RowDataReaderFunction.java | 217 -- .../hybrid/reader/RowDataRecordFactory.java | 73 - .../read/hybrid/split/ChangelogSplit.java | 141 -- .../read/hybrid/split/MergeOnReadSplit.java | 96 - .../read/hybrid/split/MixedFormatSplit.java | 85 - .../split/MixedFormatSplitSerializer.java | 98 - .../hybrid/split/MixedFormatSplitState.java | 75 - .../read/hybrid/split/SnapshotSplit.java | 113 -- .../read/hybrid/split/SplitRequestEvent.java | 55 - .../read/hybrid/split/TemporalJoinSplits.java | 154 -- .../internals/KafkaPartitionSplitReader.java | 499 ----- .../flink/read/internals/KafkaSource.java | 214 -- .../internals/KafkaSourceFetcherManager.java | 107 - .../read/internals/KafkaSourceReader.java | 181 -- .../metrics/KafkaConsumerMetricConstants.java | 33 - .../read/source/ChangeLogDataIterator.java | 235 --- .../amoro/flink/read/source/DataIterator.java | 199 -- .../flink/read/source/FileScanTaskReader.java | 35 - .../read/source/FlinkKeyedMORDataReader.java | 84 - .../read/source/FlinkUnkyedDataReader.java | 128 -- .../read/source/MergeOnReadDataIterator.java | 132 -- .../read/source/MixedFormatScanContext.java | 378 ---- .../read/source/log/LogSourceHelper.java | 250 --- .../log/kafka/LogKafkaPartitionSplit.java | 85 - .../kafka/LogKafkaPartitionSplitReader.java | 443 ---- 
.../kafka/LogKafkaPartitionSplitState.java | 118 -- .../log/kafka/LogKafkaRecordEmitter.java | 44 - .../read/source/log/kafka/LogKafkaSource.java | 161 -- .../log/kafka/LogKafkaSourceBuilder.java | 578 ------ .../log/kafka/LogKafkaSourceReader.java | 77 - .../log/kafka/LogRecordWithRetractInfo.java | 115 -- .../flink/shuffle/ReadShuffleRulePolicy.java | 120 -- .../shuffle/RoundRobinShuffleRulePolicy.java | 235 --- .../amoro/flink/shuffle/ShuffleHelper.java | 155 -- .../amoro/flink/shuffle/ShuffleKey.java | 33 - .../flink/shuffle/ShuffleRulePolicy.java | 62 - .../apache/amoro/flink/table/FlinkSource.java | 316 --- .../amoro/flink/table/LogDynamicSource.java | 230 --- .../flink/table/MixedDynamicTableFactory.java | 265 --- .../flink/table/MixedFormatDynamicSink.java | 113 -- .../flink/table/MixedFormatDynamicSource.java | 384 ---- .../flink/table/MixedFormatFileSource.java | 244 --- .../flink/table/MixedFormatTableLoader.java | 152 -- .../apache/amoro/flink/table/OptionsUtil.java | 64 - .../table/UnifiedDynamicTableFactory.java | 124 -- .../UnkeyedInputFormatOperatorFactory.java | 67 - .../UnkeyedInputFormatSourceFunction.java | 191 -- .../descriptors/MixedFormatValidator.java | 349 ---- .../util/CompatibleFlinkPropertyUtil.java | 158 -- .../amoro/flink/util/DateTimeUtils.java | 1797 ----------------- .../apache/amoro/flink/util/FilterUtil.java | 45 - .../flink/util/FlinkClassReflectionUtil.java | 65 - .../flink/util/IcebergAndFlinkFilters.java | 49 - .../amoro/flink/util/IcebergClassUtil.java | 214 -- .../apache/amoro/flink/util/LookupUtil.java | 36 - .../amoro/flink/util/MixedFormatUtils.java | 276 --- .../apache/amoro/flink/util/Projection.java | 430 ---- .../apache/amoro/flink/util/ProxyUtil.java | 67 - .../amoro/flink/util/ReflectionUtil.java | 56 - .../amoro/flink/util/ThreadLocalCache.java | 90 - .../write/AdaptHiveFlinkAppenderFactory.java | 276 --- .../write/AutomaticDoubleWriteStatus.java | 96 - .../amoro/flink/write/AutomaticLogWriter.java | 142 -- 
.../write/AutomaticWriteSpecification.java | 76 - .../flink/write/FlinkBaseTaskWriter.java | 72 - .../flink/write/FlinkChangeTaskWriter.java | 136 -- .../apache/amoro/flink/write/FlinkSink.java | 444 ---- .../flink/write/FlinkTaskWriterBuilder.java | 289 --- .../flink/write/MixedFormatFileWriter.java | 231 --- .../flink/write/MixedFormatLogWriter.java | 28 - .../MixedFormatRowDataTaskWriterFactory.java | 77 - .../amoro/flink/write/MixedFormatWriter.java | 218 -- .../write/hidden/AbstractHiddenLogWriter.java | 240 --- .../write/hidden/GlobalFlipCommitter.java | 272 --- .../flink/write/hidden/HiddenLogWriter.java | 70 - .../flink/write/hidden/LogMsgFactory.java | 61 - .../hidden/MixedFormatLogPartitioner.java | 63 - .../hidden/kafka/HiddenKafkaFactory.java | 49 - .../hidden/kafka/HiddenKafkaProducer.java | 194 -- .../flink/DynamicTableSourceTestBase.java | 96 - .../amoro/flink/FlinkTableTestBase.java | 108 - .../org/apache/amoro/flink/FlinkTestBase.java | 324 --- .../amoro/flink/TestFlinkSchemaUtil.java | 60 - .../catalog/FlinkAmoroCatalogITCase.java | 154 -- .../flink/catalog/FlinkCatalogContext.java | 133 -- .../catalog/FlinkUnifiedCatalogITCase.java | 138 -- .../catalog/TestFlinkUnifiedCatalogs.java | 169 -- .../amoro/flink/catalog/TestMixedCatalog.java | 589 ------ .../TestMixedCatalogTablePartitions.java | 223 -- .../kafka/testutils/KafkaConfigGenerate.java | 81 - .../kafka/testutils/KafkaContainerTest.java | 137 -- .../flink/kafka/testutils/KafkaUtil.java | 186 -- .../kafka/testutils/SuccessException.java | 24 - .../lookup/ByteArraySetSerializerTest.java | 87 - .../amoro/flink/lookup/TestKVTable.java | 584 ------ .../TestRowDataPredicateAllFieldTypes.java | 258 --- .../filter/TestRowDataPredicateBase.java | 111 - ...TestRowDataPredicateExpressionVisitor.java | 163 -- .../amoro/flink/read/TestFlinkSource.java | 304 --- .../flink/read/TestFlinkSplitPlanner.java | 72 - .../flink/read/TestMixedFormatSource.java | 1128 ----------- 
.../read/hidden/kafka/TestKafkaConsumer.java | 150 -- .../hidden/kafka/TestKafkaSourceReader.java | 266 --- .../TestLogKafkaPartitionSplitReader.java | 306 --- .../assigner/TestShuffleSplitAssigner.java | 257 --- .../assigner/TestSplitAssignerAwaiting.java | 126 -- .../assigner/TestStaticSplitAssigner.java | 87 - .../TestContinuousSplitPlannerImpl.java | 173 -- ...tMixedFormatSourceEnumStateSerializer.java | 95 - .../TestMixedFormatSourceEnumerator.java | 295 --- .../TestTemporalJoinSplitsThreadSafe.java | 107 - .../reader/MixedIncrementalLoaderTest.java | 172 -- .../reader/TestRowDataReaderFunction.java | 391 ---- .../split/TestMixedFormatSplitSerializer.java | 89 - .../amoro/flink/shuffle/TestLogRecordV1.java | 143 -- .../TestRoundRobinShuffleRulePolicy.java | 173 -- .../flink/table/AmoroCatalogITCaseBase.java | 124 -- .../amoro/flink/table/CatalogITCaseBase.java | 105 - .../amoro/flink/table/LookupITCase.java | 189 -- .../apache/amoro/flink/table/TestJoin.java | 367 ---- .../apache/amoro/flink/table/TestKeyed.java | 1164 ----------- .../flink/table/TestLookupSecondary.java | 191 -- .../amoro/flink/table/TestTableRefresh.java | 88 - .../apache/amoro/flink/table/TestUnkeyed.java | 1052 ---------- .../flink/table/TestUnkeyedOverwrite.java | 208 -- .../amoro/flink/table/TestWatermark.java | 259 --- .../amoro/flink/util/ClassLoaderUtils.java | 293 --- .../org/apache/amoro/flink/util/DataUtil.java | 155 -- .../util/MixedFormatMockEnvironment.java | 80 - .../flink/util/MockEnvironmentBuilder.java | 209 -- .../util/TestCompatibleFlinkPropertyUtil.java | 56 - .../util/TestGlobalAggregateManager.java | 50 - .../TestOneInputStreamOperatorIntern.java | 105 - .../amoro/flink/util/TestProjection.java | 148 -- .../org/apache/amoro/flink/util/TestUtil.java | 71 - .../flink/write/FlinkTaskWriterBaseTest.java | 167 -- .../write/MixedFormatFileWriterITCase.java | 311 --- .../flink/write/TestAdaptHiveWriter.java | 330 --- .../write/TestAutomaticDoubleWriteStatus.java | 70 - 
.../flink/write/TestAutomaticLogWriter.java | 429 ---- .../amoro/flink/write/TestFlinkSink.java | 246 --- .../write/TestMixedFormatFileCommitter.java | 151 -- .../write/TestMixedFormatFileWriter.java | 327 --- .../flink/write/hidden/kafka/TestBaseLog.java | 197 -- .../hidden/kafka/TestHiddenKafkaProducer.java | 195 -- .../hidden/kafka/TestHiddenLogOperators.java | 475 ----- .../iceberg/flink/MiniClusterResource.java | 46 - .../pom.xml | 349 ---- .../data/AdaptHiveFlinkParquetReaders.java | 873 -------- .../data/AdaptHiveFlinkParquetWriters.java | 599 ------ ...daptHiveParquetWithFlinkSchemaVisitor.java | 231 --- .../source/RowDataFileScanTaskReader.java | 247 --- .../iceberg/flink/source/ScanContext.java | 707 ------- .../data/AdaptHiveFlinkParquetReaders.java | 6 +- .../source/RowDataFileScanTaskReader.java | 4 +- .../read/AdaptHiveFlinkParquetReaders.java | 6 +- .../hybrid/reader/RowDataRecordFactory.java | 13 +- .../iceberg/flink/MiniClusterResource.java | 46 - amoro-format-mixed/amoro-mixed-flink/pom.xml | 3 - .../v1.17/amoro-mixed-flink-1.17/pom.xml | 5 +- .../amoro-mixed-flink-runtime-1.17/pom.xml | 1 - .../v1.18/amoro-mixed-flink-1.18/pom.xml | 1 - .../amoro-mixed-flink-runtime-1.18/pom.xml | 1 - amoro-format-mixed/amoro-mixed-hive/pom.xml | 6 + .../amoro-mixed-spark-3-common/pom.xml | 7 - .../spark/reader/SparkParquetReaders.java | 8 +- .../spark/reader/SparkParquetReaders.java | 8 +- .../spark/MixedFormatSparkExtensions.scala | 10 +- .../spark/reader/SparkParquetReaders.java | 8 +- .../spark/MixedFormatSparkExtensions.scala | 10 +- dev/deps/dependencies-hadoop-2-spark-3.3 | 37 +- dev/deps/dependencies-hadoop-3-spark-3.5 | 51 +- pom.xml | 12 +- 247 files changed, 85 insertions(+), 44837 deletions(-) delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/pom.xml delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/FlinkSchemaUtil.java delete mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/InternalCatalogBuilder.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/FlinkUnifiedCatalog.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/MixedCatalog.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/CatalogFactoryOptions.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/FlinkUnifiedCatalogFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/iceberg/IcebergFlinkCatalogFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/mixed/MixedHiveCatalogFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/mixed/MixedIcebergCatalogFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/paimon/PaimonFlinkCatalogFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/FlinkTablePropertiesInvocationHandler.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/KerberosInterceptor.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/KerberosInvocationHandler.java 
delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/ProxyFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/BasicLookupFunction.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/BinaryRowDataSerializerWrapper.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/ByteArraySetSerializer.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/ByteArrayWrapper.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KVTable.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KVTableFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KeyRowData.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupMetrics.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupOptions.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupRecord.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/MixedFormatRowDataLookupFunction.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBCacheState.java delete 
mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBRecordState.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBSetSpilledState.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBSetState.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RowDataStateFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/SecondaryIndexTable.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/TableFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/UniqueIndexTable.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/filter/RowDataPredicate.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/filter/RowDataPredicateExpressionVisitor.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/metric/MetricConstant.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/metric/MetricsGenerator.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/planner/calcite/FlinkTypeSystem.java delete mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/AdaptHiveFlinkParquetReaders.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/FlinkSplitPlanner.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/MixedFormatSource.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/MixedIncrementalLoader.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/PartitionAndNodeGroup.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/ShuffleSplitAssigner.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/Split.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/SplitAssigner.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/StaticSplitAssigner.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/AbstractMixedFormatEnumerator.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousEnumerationResult.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousSplitPlanner.java delete mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousSplitPlannerImpl.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/InitializationFinishedEvent.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MergeOnReadIncrementalPlanner.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MergeOnReadPlannerImpl.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatEnumeratorOffset.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatEnumeratorOffsetSerializer.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumState.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumStateSerializer.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumerator.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/StaticMixedFormatSourceEnumerator.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ArrayBatchRecords.java delete mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ArrayPoolDataIteratorBatcher.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/DataIteratorBatcher.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/DataIteratorReaderFunction.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/HybridSplitReader.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatRecordEmitter.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatRecordWithOffset.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatSourceReader.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ReaderFunction.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ReaderStartedEvent.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RecordFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RecordPosition.java delete mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataReaderFunction.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataRecordFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/ChangelogSplit.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MergeOnReadSplit.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplit.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplitSerializer.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplitState.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/SnapshotSplit.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/SplitRequestEvent.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/TemporalJoinSplits.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaPartitionSplitReader.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSource.java delete mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSourceFetcherManager.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSourceReader.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/metrics/KafkaConsumerMetricConstants.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/ChangeLogDataIterator.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/DataIterator.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/FileScanTaskReader.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/FlinkKeyedMORDataReader.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/FlinkUnkyedDataReader.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/MergeOnReadDataIterator.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/MixedFormatScanContext.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/LogSourceHelper.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplit.java delete mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplitReader.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplitState.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaRecordEmitter.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSource.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSourceBuilder.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSourceReader.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogRecordWithRetractInfo.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ReadShuffleRulePolicy.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/RoundRobinShuffleRulePolicy.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleHelper.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleKey.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleRulePolicy.java delete mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/FlinkSource.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/LogDynamicSource.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedDynamicTableFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatDynamicSink.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatDynamicSource.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatFileSource.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatTableLoader.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/OptionsUtil.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnifiedDynamicTableFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnkeyedInputFormatOperatorFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnkeyedInputFormatSourceFunction.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/descriptors/MixedFormatValidator.java delete mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/CompatibleFlinkPropertyUtil.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/DateTimeUtils.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/FilterUtil.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/FlinkClassReflectionUtil.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/IcebergAndFlinkFilters.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/IcebergClassUtil.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/LookupUtil.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/MixedFormatUtils.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/Projection.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ProxyUtil.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ReflectionUtil.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ThreadLocalCache.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AdaptHiveFlinkAppenderFactory.java delete mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticDoubleWriteStatus.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticLogWriter.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticWriteSpecification.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkBaseTaskWriter.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkChangeTaskWriter.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkSink.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkTaskWriterBuilder.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatFileWriter.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatLogWriter.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatRowDataTaskWriterFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatWriter.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/AbstractHiddenLogWriter.java delete mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/GlobalFlipCommitter.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/HiddenLogWriter.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/LogMsgFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/MixedFormatLogPartitioner.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/kafka/HiddenKafkaFactory.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/kafka/HiddenKafkaProducer.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/DynamicTableSourceTestBase.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/FlinkTableTestBase.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/FlinkTestBase.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/TestFlinkSchemaUtil.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkAmoroCatalogITCase.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkCatalogContext.java delete mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkUnifiedCatalogITCase.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestFlinkUnifiedCatalogs.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestMixedCatalog.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestMixedCatalogTablePartitions.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaConfigGenerate.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaContainerTest.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaUtil.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/SuccessException.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/ByteArraySetSerializerTest.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/TestKVTable.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateAllFieldTypes.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateBase.java delete mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateExpressionVisitor.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestFlinkSource.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestFlinkSplitPlanner.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestMixedFormatSource.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestKafkaConsumer.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestKafkaSourceReader.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestLogKafkaPartitionSplitReader.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestShuffleSplitAssigner.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestSplitAssignerAwaiting.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestStaticSplitAssigner.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestContinuousSplitPlannerImpl.java delete mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestMixedFormatSourceEnumStateSerializer.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestMixedFormatSourceEnumerator.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestTemporalJoinSplitsThreadSafe.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/reader/MixedIncrementalLoaderTest.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/reader/TestRowDataReaderFunction.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/split/TestMixedFormatSplitSerializer.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/shuffle/TestLogRecordV1.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/shuffle/TestRoundRobinShuffleRulePolicy.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/AmoroCatalogITCaseBase.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/CatalogITCaseBase.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/LookupITCase.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestJoin.java delete mode 
100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestKeyed.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestLookupSecondary.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestTableRefresh.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestUnkeyed.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestUnkeyedOverwrite.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestWatermark.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/ClassLoaderUtils.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/DataUtil.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/MixedFormatMockEnvironment.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/MockEnvironmentBuilder.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestCompatibleFlinkPropertyUtil.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestGlobalAggregateManager.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestOneInputStreamOperatorIntern.java delete mode 
100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestProjection.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestUtil.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/FlinkTaskWriterBaseTest.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/MixedFormatFileWriterITCase.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAdaptHiveWriter.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAutomaticDoubleWriteStatus.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAutomaticLogWriter.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestFlinkSink.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestMixedFormatFileCommitter.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestMixedFormatFileWriter.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestBaseLog.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestHiddenKafkaProducer.java delete mode 100644 
amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestHiddenLogOperators.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/pom.xml delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetReaders.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetWriters.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveParquetWithFlinkSchemaVisitor.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/source/ScanContext.java delete mode 100644 amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java diff --git a/amoro-format-iceberg/pom.xml b/amoro-format-iceberg/pom.xml index 2931d90d8a..2f2283bc03 100644 --- a/amoro-format-iceberg/pom.xml +++ b/amoro-format-iceberg/pom.xml @@ -29,6 +29,12 @@ Amoro Project Iceberg Format https://amoro.apache.org + + 1.10.1 + 1.16.0 + 1.16.0 + + org.apache.amoro diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/pom.xml b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/pom.xml deleted file mode 100644 index 7e2dc9859b..0000000000 --- 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/pom.xml +++ /dev/null @@ -1,453 +0,0 @@ - - - - 4.0.0 - - org.apache.amoro - amoro-mixed-flink - 0.9-SNAPSHOT - ../pom.xml - - - amoro-mixed-flink-common-1.17 - - jar - Amoro Project Mixed Format Flink Common - https://amoro.apache.org - - - 3.21.0 - 1.17.2 - 1.17.2 - 3.0.2-1.17 - 1.6.1 - - - - - - org.apache.amoro - amoro-format-iceberg - - - org.ow2.asm - asm - - - - - - org.apache.amoro - amoro-mixed-hive - - - - org.apache.amoro - amoro-format-mixed-flink-common-iceberg-bridge-1.17 - ${project.version} - - - - org.apache.iceberg - iceberg-flink-1.17 - ${iceberg.version} - provided - - - org.slf4j - slf4j-api - - - org.apache.parquet - parquet-column - - - org.apache.parquet - parquet-avro - - - - - - org.apache.paimon - paimon-flink-1.17 - ${paimon.version} - provided - - - - org.apache.amoro - amoro-format-mixed-flink-common-format - ${project.parent.version} - - - - cglib - cglib - - - - com.google.code.gson - gson - ${gson.version} - - - - - org.apache.flink - flink-connector-files - ${flink.version} - provided - - - - org.apache.flink - flink-connector-kafka - ${flink-kafka.version} - provided - - - - org.apache.flink - flink-json - ${flink.version} - provided - - - - org.apache.flink - flink-hadoop-compatibility_${flink.scala.binary.version} - ${flink.version} - provided - - - - org.apache.flink - flink-table-api-java-bridge - ${flink.version} - provided - - - org.slf4j - slf4j-api - - - - - org.apache.flink - flink-metrics-dropwizard - ${flink.version} - provided - - - - - org.apache.flink - flink-orc - ${flink.version} - provided - - - org.slf4j - slf4j-api - - - - - - org.apache.flink - flink-parquet - ${flink.version} - provided - - - org.apache.parquet - parquet-hadoop - - - - - - org.apache.flink - flink-table-runtime - ${flink.version} - provided - - - org.slf4j - slf4j-api - - - - - - org.apache.flink - flink-table-planner_${flink.scala.binary.version} - ${flink.version} - 
provided - - - org.slf4j - slf4j-api - - - - - - - com.fasterxml.jackson.core - jackson-databind - ${jackson.vesion} - provided - - - - - - org.apache.iceberg - iceberg-flink-1.17 - ${iceberg.version} - tests - test - - - org.slf4j - slf4j-api - - - - - - org.apache.flink - flink-runtime - ${flink.version} - tests - test - - - org.slf4j - slf4j-api - - - - - - org.apache.flink - flink-streaming-java - ${flink.version} - tests - test - - - org.slf4j - slf4j-api - - - - - - org.apache.flink - flink-clients - ${flink.version} - test - - - org.slf4j - slf4j-api - - - - - - org.apache.flink - flink-test-utils - ${flink.version} - test - - - org.apache.logging.log4j - log4j-slf4j-impl - - - org.slf4j - slf4j-api - - - com.google.guava - guava - - - - - - org.apache.flink - flink-connector-test-utils - ${flink.version} - test - - - - org.apache.iceberg - iceberg-hive-metastore - ${iceberg.version} - tests - test - - - - org.apache.amoro - amoro-common - ${project.version} - tests - test - - - - org.apache.amoro - amoro-format-iceberg - ${project.version} - test-jar - test - - - - org.apache.amoro - amoro-mixed-hive - ${project.version} - tests - test - - - - org.apache.amoro - amoro-format-paimon - ${project.version} - tests - test - - - - org.apache.amoro - amoro-format-paimon - ${project.version} - test - - - - org.apache.flink - flink-metrics-jmx - ${flink.version} - test - - - - org.apache.flink - flink-runtime-web - ${flink.version} - test - - - - - org.apache.flink - flink-table-planner_${flink.scala.binary.version} - ${flink.version} - test-jar - test - - - org.slf4j - slf4j-api - - - - - - - org.apache.curator - curator-test - 2.12.0 - test - - - com.google.guava - guava - - - - - - org.testcontainers - kafka - ${testcontainers.version} - test - - - - org.testcontainers - junit-jupiter - ${testcontainers.version} - test - - - - org.assertj - assertj-core - ${assertj.version} - test - - - - - - - org.apache.maven.plugins - maven-surefire-plugin - - - - listener - 
org.apache.amoro.listener.AmoroRunListener - - - -verbose:class - - - - org.apache.maven.plugins - maven-jar-plugin - - - - test-jar - - - - - - org.jacoco - jacoco-maven-plugin - - ${jacoco.flink.skip} - - - - - diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/FlinkSchemaUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/FlinkSchemaUtil.java deleted file mode 100644 index bb5bd93111..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/FlinkSchemaUtil.java +++ /dev/null @@ -1,438 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink; - -import static org.apache.flink.table.descriptors.DescriptorProperties.DATA_TYPE; -import static org.apache.flink.table.descriptors.DescriptorProperties.EXPR; -import static org.apache.flink.table.descriptors.DescriptorProperties.NAME; -import static org.apache.flink.table.descriptors.DescriptorProperties.WATERMARK_ROWTIME; -import static org.apache.flink.table.descriptors.DescriptorProperties.WATERMARK_STRATEGY_DATA_TYPE; -import static org.apache.flink.table.descriptors.DescriptorProperties.WATERMARK_STRATEGY_EXPR; -import static org.apache.flink.table.descriptors.Schema.SCHEMA_PROCTIME; - -import org.apache.amoro.flink.table.FlinkSource; -import org.apache.amoro.flink.table.MixedFormatDynamicSource; -import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.PrimaryKeySpec; -import org.apache.commons.collections.CollectionUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.math.NumberUtils; -import org.apache.flink.table.api.TableColumn; -import org.apache.flink.table.api.TableColumn.ComputedColumn; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.api.ValidationException; -import org.apache.flink.table.api.WatermarkSpec; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.utils.LogicalTypeParser; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.iceberg.Schema; -import org.apache.iceberg.types.Types; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; 
-import java.util.function.Function; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -/** An util that converts flink table schema. */ -public class FlinkSchemaUtil { - - private static final Logger LOG = LoggerFactory.getLogger(FlinkSchemaUtil.class); - public static final String FLINK_PREFIX = "flink"; - - public static final String COMPUTED_COLUMNS = "computed-column"; - - public static final String WATERMARK = "watermark"; - public static final String PROCTIME_FUNCTION = SCHEMA_PROCTIME + "()"; - public static final Pattern COMPUTE_PATTERN = - Pattern.compile("flink\\.computed-column\\.(\\d+)\\.name"); - - /** - * Convert iceberg Schema to flink TableSchema. - * - * @param icebergSchema - * @param tableProperties - * @return Flink TableSchema - */ - public static TableSchema toSchema( - Schema icebergSchema, List primaryKeys, Map tableProperties) { - TableSchema.Builder builder = TableSchema.builder(); - RowType rowType = org.apache.iceberg.flink.FlinkSchemaUtil.convert(icebergSchema); - - // add physical columns. 
- for (RowType.RowField field : rowType.getFields()) { - builder.field(field.getName(), TypeConversions.fromLogicalToDataType(field.getType())); - } - - // add primary key - if (CollectionUtils.isNotEmpty(primaryKeys)) { - builder.primaryKey(primaryKeys.toArray(new String[0])); - } - - Set computeIndex = getComputeIndex(tableProperties); - List fieldNames = rowType.getFieldNames(); - - // add computed columns - for (int index : computeIndex) { - builder.add(deserializeComputeColumn(tableProperties, index, fieldNames)); - fieldNames.add(tableProperties.get(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, index, NAME))); - } - - // add watermark - if (isWatermarkValid(tableProperties)) { - builder.watermark(deserializeWatermarkSpec(tableProperties, fieldNames)); - } - return builder.build(); - } - - /** - * Add watermark info to help {@link FlinkSource} and {@link MixedFormatDynamicSource} distinguish - * the watermark field. For now, it only be used in the case of mixed-format table as dim-table. - */ - public static TableSchema getPhysicalSchemaForDimTable(TableSchema tableSchema) { - TableSchema.Builder builder = filter(tableSchema, TableColumn::isPhysical); - tableSchema.getWatermarkSpecs().forEach(builder::watermark); - return builder.build(); - } - - /** - * filter watermark due to watermark is a virtual field for now, not in mixed-format physical - * table. - */ - public static TableSchema filterWatermark(TableSchema tableSchema) { - List watermarkSpecs = tableSchema.getWatermarkSpecs(); - if (watermarkSpecs.isEmpty()) { - return tableSchema; - } - - Function filter = - (tableColumn) -> { - boolean isWatermark = false; - for (WatermarkSpec spec : watermarkSpecs) { - if (spec.getRowtimeAttribute().equals(tableColumn.getName())) { - isWatermark = true; - break; - } - } - return !isWatermark; - }; - return filter(tableSchema, filter).build(); - } - - /** If filter result is true, keep the column; otherwise, remove the column. 
*/ - public static TableSchema.Builder filter( - TableSchema tableSchema, Function filter) { - TableSchema.Builder builder = TableSchema.builder(); - - tableSchema - .getTableColumns() - .forEach( - tableColumn -> { - if (!filter.apply(tableColumn)) { - return; - } - builder.field(tableColumn.getName(), tableColumn.getType()); - }); - tableSchema - .getPrimaryKey() - .ifPresent( - uniqueConstraint -> - builder.primaryKey( - uniqueConstraint.getName(), - uniqueConstraint.getColumns().toArray(new String[0]))); - return builder; - } - - public static RowType toRowType(TableSchema tableSchema) { - LogicalType[] fields = new LogicalType[tableSchema.getFieldCount()]; - - for (int i = 0; i < fields.length; i++) { - TableColumn tableColumn = tableSchema.getTableColumn(i).get(); - fields[i] = tableColumn.getType().getLogicalType(); - } - return RowType.of(fields); - } - - /** - * Primary keys are the required fields to guarantee that readers can read keyed table in right - * order, due to the automatic scaling in/out of nodes. The required fields should be added even - * though projection push down - */ - @Deprecated - public static List addPrimaryKey( - List projectedColumns, MixedTable table) { - List primaryKeys = - table.isUnkeyedTable() - ? Collections.EMPTY_LIST - : table.asKeyedTable().primaryKeySpec().fields().stream() - .map(PrimaryKeySpec.PrimaryKeyField::fieldName) - .collect(Collectors.toList()); - - List columns = new ArrayList<>(projectedColumns); - Set projectedNames = new HashSet<>(); - - projectedColumns.forEach(c -> projectedNames.add(c.name())); - - primaryKeys.forEach( - pk -> { - if (!projectedNames.contains(pk)) { - columns.add(table.schema().findField(pk)); - } - }); - - LOG.info("Projected Columns after addPrimaryKey, columns:{}", columns); - return columns; - } - - /** - * Primary keys are the required fields to guarantee that readers can read keyed table in right - * order, due to the automatic scaling in/out of nodes. 
The required fields should be added even - * though projection push down - */ - @Deprecated - public static void addPrimaryKey( - TableSchema.Builder builder, - MixedTable table, - TableSchema tableSchema, - String[] projectedColumns) { - Set projectedNames = Arrays.stream(projectedColumns).collect(Collectors.toSet()); - - if (!table.isKeyedTable()) { - return; - } - - List pks = table.asKeyedTable().primaryKeySpec().fieldNames(); - pks.forEach( - pk -> { - if (projectedNames.contains(pk)) { - return; - } - builder.field( - pk, - tableSchema - .getFieldDataType(pk) - .orElseThrow( - () -> - new ValidationException( - "Mixed-format table primary key should be declared in table"))); - }); - } - - /** - * Generate table properties for watermark and computed columns from flink TableSchema. - * - * @param schema Flink TableSchema. - * @return tableProperties. - */ - public static Map generateExtraOptionsFrom(TableSchema schema) { - Map properties = Maps.newHashMap(); - - // add properties for computeColumns - Map computeColumnProperties = serializeComputeColumn(schema); - properties.putAll(computeColumnProperties); - - // add properties for watermark,only support one watermark now - List watermarkSpecs = schema.getWatermarkSpecs(); - if (!watermarkSpecs.isEmpty()) { - if (watermarkSpecs.size() > 1) { - throw new IllegalStateException("Multiple watermark definition is not supported yet."); - } - properties.putAll(serializeWatermarkSpec(watermarkSpecs.get(0))); - } - - return properties; - } - - /** Serialize compute columns into properties. 
*/ - private static Map serializeComputeColumn(TableSchema schema) { - Map serialized = new HashMap<>(); - List tableColumns = schema.getTableColumns(); - // index in compute Column, starting from 1 - int computeIndex = 1; - for (TableColumn column : tableColumns) { - if (column instanceof ComputedColumn) { - serialized.put( - compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, computeIndex, NAME), column.getName()); - serialized.put( - compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, computeIndex, DATA_TYPE), - column.getType().getLogicalType().asSerializableString()); - serialized.put( - compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, computeIndex, EXPR), - ((TableColumn.ComputedColumn) column).getExpression()); - computeIndex++; - } - } - return serialized; - } - - /** Deserialize compute columns from properties. */ - private static TableColumn deserializeComputeColumn( - Map tableProperties, int index, List fieldNames) { - String expr = tableProperties.get(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, index, EXPR)); - if (!isExprContainField(expr, fieldNames)) { - throw new IllegalStateException( - "expression " + expr + " does not match any columns in the table. 
"); - } - DataType dataType = - TypeConversions.fromLogicalToDataType( - LogicalTypeParser.parse( - tableProperties.get( - compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, index, DATA_TYPE)))); - TableColumn column = - TableColumn.computed( - tableProperties.get(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, index, NAME)), - dataType, - expr); - return column; - } - - private static boolean isExprContainField(String expr, List fieldNames) { - if (expr.equalsIgnoreCase(PROCTIME_FUNCTION)) { - return true; - } - for (String fieldName : fieldNames) { - if (expr.contains("`" + fieldName + "`")) { - return true; - } - } - return false; - } - - private static boolean isComputeValid(Map tableProperties, int index) { - // check if properties for computeColumn is valid and complete - if (StringUtils.isNotBlank( - tableProperties.get(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, index, NAME))) - && StringUtils.isNotBlank( - tableProperties.get(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, index, DATA_TYPE))) - && StringUtils.isNotBlank( - tableProperties.get(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, index, EXPR)))) { - return true; - } - LOG.warn( - "properties for computeColumn {} is incomplete, It should contain {}, {}, {}. skip to convert it into computeColumn ", - index, - NAME, - DATA_TYPE, - EXPR); - return false; - } - - private static Set getComputeIndex(Map tableProperties) { - Set computedIndex = new TreeSet<>(); - tableProperties - .keySet() - .forEach( - k -> { - Matcher matcher = COMPUTE_PATTERN.matcher(k); - if (matcher.find()) { - int indexId = NumberUtils.toInt(matcher.group(1)); - if (indexId > 0 && isComputeValid(tableProperties, indexId)) { - computedIndex.add(indexId); - } - } - }); - return computedIndex; - } - - /** Serialize watermarkSpec into properties. 
*/ - private static Map serializeWatermarkSpec(WatermarkSpec watermarkSpec) { - Map serializedWatermarkSpec = new HashMap<>(); - serializedWatermarkSpec.put( - compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_ROWTIME), - watermarkSpec.getRowtimeAttribute()); - serializedWatermarkSpec.put( - compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_STRATEGY_EXPR), - watermarkSpec.getWatermarkExpr()); - serializedWatermarkSpec.put( - compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_STRATEGY_DATA_TYPE), - watermarkSpec.getWatermarkExprOutputType().getLogicalType().asSerializableString()); - - return serializedWatermarkSpec; - } - - /** Deserialize watermarkSpec from properties. */ - private static WatermarkSpec deserializeWatermarkSpec( - Map tableProperties, List fieldNames) { - String rowtimeAttribute = - tableProperties.get(compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_ROWTIME)); - if (!fieldNames.contains(rowtimeAttribute)) { - throw new IllegalStateException( - "Watermark rowtime attribute '" - + rowtimeAttribute - + " does not match any columns in the table. 
"); - } - DataType watermarkExprOutputType = - TypeConversions.fromLogicalToDataType( - LogicalTypeParser.parse( - tableProperties.get( - compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_STRATEGY_DATA_TYPE)))); - return new WatermarkSpec( - rowtimeAttribute, - tableProperties.get(compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_STRATEGY_EXPR)), - watermarkExprOutputType); - } - - private static boolean isWatermarkValid(Map tableProperties) { - // check if properties for watermark is valid and complete - if (StringUtils.isNotBlank( - tableProperties.get(compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_ROWTIME))) - && StringUtils.isNotBlank( - tableProperties.get(compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_STRATEGY_EXPR))) - && StringUtils.isNotBlank( - tableProperties.get( - compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_STRATEGY_DATA_TYPE)))) { - return true; - } - LOG.warn( - "properties for watermark is incomplete, It should contain {}, {}, {}. skip to convert it into watermark strategy ", - WATERMARK_ROWTIME, - WATERMARK_STRATEGY_EXPR, - WATERMARK_STRATEGY_DATA_TYPE); - return false; - } - - private static String compoundKey(Object... 
components) { - return Stream.of(components).map(Object::toString).collect(Collectors.joining(".")); - } - - /** - * get physical tableSchema - * - * @param tableSchema Flink TableSchema - * @return Flink tableSchema - */ - public static TableSchema getPhysicalSchema(TableSchema tableSchema) { - TableSchema.Builder builder = filter(tableSchema, TableColumn::isPhysical); - return builder.build(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/InternalCatalogBuilder.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/InternalCatalogBuilder.java deleted file mode 100644 index e31bc6605f..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/InternalCatalogBuilder.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink; - -import static org.apache.iceberg.CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE; -import static org.apache.iceberg.flink.FlinkCatalogFactory.HADOOP_CONF_DIR; -import static org.apache.iceberg.flink.FlinkCatalogFactory.HIVE_CONF_DIR; - -import org.apache.amoro.mixed.CatalogLoader; -import org.apache.amoro.mixed.MixedFormatCatalog; -import org.apache.amoro.properties.CatalogMetaProperties; -import org.apache.amoro.shade.guava32.com.google.common.base.Strings; -import org.apache.amoro.table.TableMetaStore; -import org.apache.amoro.utils.ConfigurationFileUtil; -import org.apache.flink.configuration.GlobalConfiguration; -import org.apache.flink.runtime.util.HadoopUtils; -import org.apache.flink.table.catalog.exceptions.CatalogException; -import org.apache.flink.util.Preconditions; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.flink.FlinkCatalogFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.io.Serializable; -import java.net.URL; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.HashMap; -import java.util.Map; - -/** Build {@link MixedFormatCatalog}. 
*/ -public class InternalCatalogBuilder implements Serializable { - private static final Logger LOG = LoggerFactory.getLogger(InternalCatalogBuilder.class); - - private String amsUri; - private Map properties = new HashMap<>(0); - private String catalogName; - - private MixedFormatCatalog createMixedFormatCatalog() { - if (amsUri != null) { - return CatalogLoader.load(amsUri, properties); - } else { - Preconditions.checkArgument(catalogName != null, "Catalog name cannot be empty"); - String metastoreType = properties.get(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE); - Preconditions.checkArgument(metastoreType != null, "Catalog type cannot be empty"); - TableMetaStore tableMetaStore = - TableMetaStore.builder() - .withConfiguration(clusterHadoopConf(metastoreType, properties)) - .build(); - return CatalogLoader.createCatalog(catalogName, metastoreType, properties, tableMetaStore); - } - } - - public static Configuration clusterHadoopConf( - String metastoreType, Map properties) { - Configuration configuration = - HadoopUtils.getHadoopConfiguration(GlobalConfiguration.loadConfiguration()); - if (ICEBERG_CATALOG_TYPE_HIVE.equals(metastoreType)) { - String hiveConfDir = properties.get(HIVE_CONF_DIR); - String hadoopConfDir = properties.get(HADOOP_CONF_DIR); - configuration = mergeHiveConf(configuration, hiveConfDir, hadoopConfDir); - } - return configuration; - } - - private static Configuration mergeHiveConf( - Configuration hadoopConf, String hiveConfDir, String hadoopConfDir) { - Configuration newConf = new Configuration(hadoopConf); - if (!Strings.isNullOrEmpty(hiveConfDir)) { - Preconditions.checkState( - Files.exists(Paths.get(hiveConfDir, "hive-site.xml")), - "There should be a hive-site.xml file under the directory %s", - hiveConfDir); - newConf.addResource(new Path(hiveConfDir, "hive-site.xml")); - } else { - // If don't provide the hive-site.xml path explicitly, it will try to load resource from - // classpath. 
If still - // couldn't load the configuration file, then it will throw exception in HiveCatalog. - URL configFile = InternalCatalogBuilder.class.getClassLoader().getResource("hive-site.xml"); - if (configFile != null) { - newConf.addResource(configFile); - } - } - - if (!Strings.isNullOrEmpty(hadoopConfDir)) { - java.nio.file.Path hdfsSiteFile = Paths.get(hadoopConfDir, "hdfs-site.xml"); - Preconditions.checkState( - Files.exists(hdfsSiteFile), - "Failed to load Hadoop configuration: missing %s", - hdfsSiteFile); - newConf.addResource(new Path(hadoopConfDir, "hdfs-site.xml")); - java.nio.file.Path coreSiteFile = Paths.get(hadoopConfDir, "core-site.xml"); - Preconditions.checkState( - Files.exists(coreSiteFile), - "Failed to load Hadoop configuration: missing %s", - coreSiteFile); - newConf.addResource(new Path(hadoopConfDir, "core-site.xml")); - } - - return newConf; - } - - public String getAmsUri() { - return amsUri; - } - - public Map getProperties() { - return properties; - } - - public InternalCatalogBuilder() {} - - public static InternalCatalogBuilder builder() { - return new InternalCatalogBuilder(); - } - - public MixedFormatCatalog build() { - return createMixedFormatCatalog(); - } - - public InternalCatalogBuilder amsUri(String amsUri) { - this.amsUri = amsUri; - return this; - } - - public InternalCatalogBuilder properties(Map properties) { - Map finalProperties = new HashMap<>(); - for (Map.Entry property : properties.entrySet()) { - String key = property.getKey(); - String value = property.getValue(); - switch (key) { - case CatalogMetaProperties.AUTH_CONFIGS_KEY_KEYTAB_PATH: - try { - finalProperties.put( - CatalogMetaProperties.AUTH_CONFIGS_KEY_KEYTAB, - ConfigurationFileUtil.encodeConfigurationFileWithBase64(value)); - } catch (IOException e) { - LOG.error("encode keytab file failed", e); - throw new CatalogException("encode keytab file failed", e); - } - break; - case CatalogMetaProperties.AUTH_CONFIGS_KEY_KEYTAB_ENCODE: - 
finalProperties.put(CatalogMetaProperties.AUTH_CONFIGS_KEY_KEYTAB, value); - break; - case CatalogMetaProperties.AUTH_CONFIGS_KEY_KRB_PATH: - try { - finalProperties.put( - CatalogMetaProperties.AUTH_CONFIGS_KEY_KRB5, - ConfigurationFileUtil.encodeConfigurationFileWithBase64(value)); - } catch (IOException e) { - LOG.error("encode krb5 file failed", e); - throw new CatalogException("encode krb5 file failed", e); - } - break; - case CatalogMetaProperties.AUTH_CONFIGS_KEY_KRB_ENCODE: - finalProperties.put(CatalogMetaProperties.AUTH_CONFIGS_KEY_KRB5, value); - break; - default: - finalProperties.put(key, value); - break; - } - } - this.properties = finalProperties; - return this; - } - - public InternalCatalogBuilder catalogName(String catalogName) { - this.catalogName = catalogName; - return this; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/FlinkUnifiedCatalog.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/FlinkUnifiedCatalog.java deleted file mode 100644 index c56bb6c94b..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/FlinkUnifiedCatalog.java +++ /dev/null @@ -1,550 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.catalog; - -import static org.apache.amoro.Constants.THRIFT_TABLE_SERVICE_NAME; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.TABLE_FORMAT; - -import org.apache.amoro.AlreadyExistsException; -import org.apache.amoro.AmoroTable; -import org.apache.amoro.NoSuchDatabaseException; -import org.apache.amoro.NoSuchTableException; -import org.apache.amoro.TableFormat; -import org.apache.amoro.UnifiedCatalog; -import org.apache.amoro.client.AmsThriftUrl; -import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions; -import org.apache.amoro.flink.catalog.factories.FlinkUnifiedCatalogFactory; -import org.apache.amoro.flink.catalog.factories.iceberg.IcebergFlinkCatalogFactory; -import org.apache.amoro.flink.catalog.factories.mixed.MixedHiveCatalogFactory; -import org.apache.amoro.flink.catalog.factories.mixed.MixedIcebergCatalogFactory; -import org.apache.amoro.flink.catalog.factories.paimon.PaimonFlinkCatalogFactory; -import org.apache.amoro.flink.table.UnifiedDynamicTableFactory; -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; -import org.apache.amoro.table.TableIdentifier; -import org.apache.amoro.table.TableMetaStore; -import org.apache.amoro.utils.CatalogUtil; -import org.apache.amoro.utils.MixedFormatCatalogUtil; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.catalog.AbstractCatalog; -import org.apache.flink.table.catalog.CatalogBaseTable; 
-import org.apache.flink.table.catalog.CatalogDatabase; -import org.apache.flink.table.catalog.CatalogFunction; -import org.apache.flink.table.catalog.CatalogPartition; -import org.apache.flink.table.catalog.CatalogPartitionSpec; -import org.apache.flink.table.catalog.CatalogTable; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.exceptions.CatalogException; -import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; -import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; -import org.apache.flink.table.catalog.exceptions.FunctionNotExistException; -import org.apache.flink.table.catalog.exceptions.PartitionAlreadyExistsException; -import org.apache.flink.table.catalog.exceptions.PartitionNotExistException; -import org.apache.flink.table.catalog.exceptions.PartitionSpecInvalidException; -import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; -import org.apache.flink.table.catalog.exceptions.TableNotExistException; -import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; -import org.apache.flink.table.catalog.exceptions.TablePartitionedException; -import org.apache.flink.table.catalog.stats.CatalogColumnStatistics; -import org.apache.flink.table.catalog.stats.CatalogTableStatistics; -import org.apache.flink.table.expressions.Expression; -import org.apache.flink.table.factories.CatalogFactory; -import org.apache.flink.table.factories.Factory; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Optional; -import java.util.Set; - -/** This is a Flink catalog wrap a unified catalog. */ -public class FlinkUnifiedCatalog extends AbstractCatalog { - - private final UnifiedCatalog unifiedCatalog; - private final String amsUri; - private final String amoroCatalogName; - /** - * Available Flink catalogs for Unified Catalog. - * - *

May include: Iceberg, Mixed and Paimon Catalogs, etc. - */ - private Map availableCatalogs; - - private final CatalogFactory.Context context; - private final org.apache.hadoop.conf.Configuration hadoopConf; - - public FlinkUnifiedCatalog( - String amsUri, - String defaultDatabase, - UnifiedCatalog unifiedCatalog, - CatalogFactory.Context context, - org.apache.hadoop.conf.Configuration hadoopConf) { - super(context.getName(), defaultDatabase); - this.amsUri = amsUri; - this.amoroCatalogName = AmsThriftUrl.parse(amsUri, THRIFT_TABLE_SERVICE_NAME).catalogName(); - this.unifiedCatalog = unifiedCatalog; - this.context = context; - this.hadoopConf = hadoopConf; - } - - @Override - public void open() throws CatalogException { - availableCatalogs = Maps.newHashMap(); - } - - @Override - public void close() throws CatalogException { - if (availableCatalogs != null) { - availableCatalogs.forEach((tableFormat, catalog) -> catalog.close()); - } - } - - @Override - public List listDatabases() { - return unifiedCatalog.listDatabases(); - } - - @Override - public CatalogDatabase getDatabase(String databaseName) { - throw new UnsupportedOperationException("Unsupported operation: get database."); - } - - @Override - public boolean databaseExists(String databaseName) { - return unifiedCatalog.databaseExists(databaseName); - } - - @Override - public void createDatabase(String name, CatalogDatabase database, boolean ignoreIfExists) - throws DatabaseAlreadyExistException { - try { - unifiedCatalog.createDatabase(name); - } catch (AlreadyExistsException e) { - if (!ignoreIfExists) { - throw new DatabaseAlreadyExistException(getName(), name); - } - } - } - - @Override - public void dropDatabase(String name, boolean ignoreIfNotExists, boolean cascade) - throws DatabaseNotExistException { - try { - unifiedCatalog.dropDatabase(name); - } catch (NoSuchDatabaseException e) { - if (!ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name); - } - } - } - - @Override - public 
void alterDatabase(String name, CatalogDatabase newDatabase, boolean ignoreIfNotExists) { - throw new UnsupportedOperationException("Unsupported operation: alter database."); - } - - @Override - public List listTables(String databaseName) { - return unifiedCatalog.listTables(databaseName).stream() - .map(table -> table.getIdentifier().getTableName()) - .collect(java.util.stream.Collectors.toList()); - } - - @Override - public List listViews(String databaseName) { - return Collections.emptyList(); - } - - @Override - public CatalogBaseTable getTable(ObjectPath tablePath) - throws TableNotExistException, CatalogException { - TableIdentifier tableIdentifier = - TableIdentifier.of( - this.amoroCatalogName, tablePath.getDatabaseName(), tablePath.getObjectName()); - Set formats = - CatalogUtil.tableFormats(unifiedCatalog.metastoreType(), unifiedCatalog.properties()); - - TableMetaStore tableMetaStore = unifiedCatalog.authenticationContext(); - return formats.stream() - .map( - f -> { - try { - AbstractCatalog catalog = - getOriginalCatalog(f) - .orElseGet(() -> createOriginalCatalog(tableIdentifier, f)); - CatalogTable catalogTable = - (CatalogTable) tableMetaStore.doAs(() -> catalog.getTable(tablePath)); - final Map flinkProperties = - Maps.newHashMap(catalogTable.getOptions()); - flinkProperties.put(TABLE_FORMAT.key(), f.toString()); - return CatalogTable.of( - catalogTable.getUnresolvedSchema(), - catalogTable.getComment(), - catalogTable.getPartitionKeys(), - flinkProperties); - } catch (RuntimeException e) { - // only handle no such table case - if (e.getCause() instanceof TableNotExistException - || e.getCause() instanceof NoSuchTableException) { - return null; - } else { - throw e; - } - } - }) - .filter(Objects::nonNull) - .findFirst() - .orElseThrow(() -> new TableNotExistException(getName(), tablePath)); - } - - @Override - public boolean tableExists(ObjectPath tablePath) { - try { - return unifiedCatalog.tableExists(tablePath.getDatabaseName(), 
tablePath.getObjectName()); - } catch (NoSuchDatabaseException | NoSuchTableException e) { - return false; - } - } - - @Override - public void dropTable(ObjectPath tablePath, boolean ignoreIfNotExists) - throws TableNotExistException { - try { - unifiedCatalog.dropTable(tablePath.getDatabaseName(), tablePath.getObjectName(), true); - } catch (NoSuchTableException e) { - if (!ignoreIfNotExists) { - throw new TableNotExistException(getName(), tablePath); - } - } - } - - @Override - public void renameTable(ObjectPath tablePath, String newTableName, boolean ignoreIfNotExists) - throws TableNotExistException, TableAlreadyExistException, CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - catalog.renameTable(tablePath, newTableName, ignoreIfNotExists); - } - - @Override - public void createTable(ObjectPath tablePath, CatalogBaseTable table, boolean ignoreIfExists) - throws TableAlreadyExistException, DatabaseNotExistException, CatalogException { - Configuration configuration = new Configuration(); - table.getOptions().forEach(configuration::setString); - unifiedCatalog.refresh(); - table - .getOptions() - .putAll( - MixedFormatCatalogUtil.mergePersistedCatalogPropertiesToTable( - table.getOptions(), unifiedCatalog.properties())); - TableFormat format = TableFormat.valueOf(configuration.get(TABLE_FORMAT)); - TableIdentifier tableIdentifier = - TableIdentifier.of( - unifiedCatalog.name(), tablePath.getDatabaseName(), tablePath.getObjectName()); - String errorMessage = - String.format( - "Can't decide table format of table %s, Please specify 'table.format' " - + "in table properties", - tableIdentifier); - - Preconditions.checkNotNull(format, errorMessage); - try { - unifiedCatalog.loadTable(tableIdentifier.getDatabase(), tableIdentifier.getTableName()); - if (!ignoreIfExists) { - throw new TableAlreadyExistException(getName(), tablePath); - } - return; - } catch (NoSuchTableException e) { - // do nothing - } - - final TableFormat catalogFormat = 
format; - AbstractCatalog catalog = - getOriginalCatalog(format) - .orElseGet(() -> createOriginalCatalog(tableIdentifier, catalogFormat)); - catalog.createTable(tablePath, table, ignoreIfExists); - } - - @Override - public void alterTable(ObjectPath tablePath, CatalogBaseTable newTable, boolean ignoreIfNotExists) - throws TableNotExistException, CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - catalog.alterTable(tablePath, newTable, ignoreIfNotExists); - } - - @Override - public List listPartitions(ObjectPath tablePath) - throws TableNotExistException, TableNotPartitionedException, CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - return catalog.listPartitions(tablePath); - } - - @Override - public List listPartitions( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws TableNotExistException, TableNotPartitionedException, PartitionSpecInvalidException, - CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - return catalog.listPartitions(tablePath, partitionSpec); - } - - @Override - public List listPartitionsByFilter( - ObjectPath tablePath, List filters) - throws TableNotExistException, TableNotPartitionedException, CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - return catalog.listPartitionsByFilter(tablePath, filters); - } - - @Override - public CatalogPartition getPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws PartitionNotExistException, CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - return catalog.getPartition(tablePath, partitionSpec); - } - - @Override - public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - return catalog.partitionExists(tablePath, partitionSpec); - } - - @Override - public void createPartition( - ObjectPath tablePath, - 
CatalogPartitionSpec partitionSpec, - CatalogPartition partition, - boolean ignoreIfExists) - throws TableNotExistException, TableNotPartitionedException, PartitionSpecInvalidException, - PartitionAlreadyExistsException, CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - catalog.createPartition(tablePath, partitionSpec, partition, ignoreIfExists); - } - - @Override - public void dropPartition( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) - throws PartitionNotExistException, CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - catalog.dropPartition(tablePath, partitionSpec, ignoreIfNotExists); - } - - @Override - public void alterPartition( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogPartition newPartition, - boolean ignoreIfNotExists) - throws PartitionNotExistException, CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - catalog.alterPartition(tablePath, partitionSpec, newPartition, ignoreIfNotExists); - } - - @Override - public Optional getFactory() { - return Optional.of(new UnifiedDynamicTableFactory(availableCatalogs)); - } - - @Override - public List listFunctions(String dbName) { - return Collections.emptyList(); - } - - @Override - public CatalogFunction getFunction(ObjectPath functionPath) throws FunctionNotExistException { - throw new FunctionNotExistException(getName(), functionPath); - } - - @Override - public boolean functionExists(ObjectPath functionPath) { - return false; - } - - @Override - public void createFunction( - ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) { - throw new UnsupportedOperationException("Unsupported operation: create function."); - } - - @Override - public void alterFunction( - ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) { - throw new UnsupportedOperationException("Unsupported operation: alter function."); - } - - 
@Override - public void dropFunction(ObjectPath functionPath, boolean ignoreIfNotExists) { - throw new UnsupportedOperationException("Unsupported operation: drop function."); - } - - @Override - public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) - throws TableNotExistException, CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - return catalog.getTableStatistics(tablePath); - } - - @Override - public CatalogColumnStatistics getTableColumnStatistics(ObjectPath tablePath) - throws TableNotExistException, CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - return catalog.getTableColumnStatistics(tablePath); - } - - @Override - public CatalogTableStatistics getPartitionStatistics( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws PartitionNotExistException, CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - return catalog.getPartitionStatistics(tablePath, partitionSpec); - } - - @Override - public CatalogColumnStatistics getPartitionColumnStatistics( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws PartitionNotExistException, CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - return catalog.getPartitionColumnStatistics(tablePath, partitionSpec); - } - - @Override - public void alterTableStatistics( - ObjectPath tablePath, CatalogTableStatistics tableStatistics, boolean ignoreIfNotExists) - throws TableNotExistException, CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - catalog.alterTableStatistics(tablePath, tableStatistics, ignoreIfNotExists); - } - - @Override - public void alterTableColumnStatistics( - ObjectPath tablePath, CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) - throws TableNotExistException, CatalogException, TablePartitionedException { - AbstractCatalog catalog = originalCatalog(tablePath); - catalog.alterTableColumnStatistics(tablePath, 
columnStatistics, ignoreIfNotExists); - } - - @Override - public void alterPartitionStatistics( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogTableStatistics partitionStatistics, - boolean ignoreIfNotExists) - throws PartitionNotExistException, CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - catalog.alterPartitionStatistics( - tablePath, partitionSpec, partitionStatistics, ignoreIfNotExists); - } - - @Override - public void alterPartitionColumnStatistics( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogColumnStatistics columnStatistics, - boolean ignoreIfNotExists) - throws PartitionNotExistException, CatalogException { - AbstractCatalog catalog = originalCatalog(tablePath); - catalog.alterPartitionColumnStatistics( - tablePath, partitionSpec, columnStatistics, ignoreIfNotExists); - } - - /** - * Get the original flink catalog for the given table, if the flink catalog is not exists in the - * cache, would create a new original flink catalog for this table format. 
- * - * @param amoroTable amoroTable - * @return original Flink catalog - */ - private AbstractCatalog originalCatalog(AmoroTable amoroTable) { - TableFormat format = amoroTable.format(); - TableIdentifier tableIdentifier = amoroTable.id(); - return getOriginalCatalog(format) - .orElseGet(() -> createOriginalCatalog(tableIdentifier, format)); - } - - private AbstractCatalog originalCatalog(ObjectPath tablePath) { - AmoroTable amoroTable = loadAmoroTable(tablePath); - return originalCatalog(amoroTable); - } - - private Optional getOriginalCatalog(TableFormat format) { - return Optional.ofNullable(availableCatalogs.get(format)); - } - - private AmoroTable loadAmoroTable(ObjectPath tablePath) { - return unifiedCatalog.loadTable(tablePath.getDatabaseName(), tablePath.getObjectName()); - } - - private AbstractCatalog createOriginalCatalog( - TableIdentifier tableIdentifier, TableFormat tableFormat) { - CatalogFactory catalogFactory; - if (tableFormat.equals(TableFormat.MIXED_ICEBERG)) { - catalogFactory = new MixedIcebergCatalogFactory(); - } else if (tableFormat.equals(TableFormat.MIXED_HIVE)) { - catalogFactory = new MixedHiveCatalogFactory(); - } else if (tableFormat.equals(TableFormat.ICEBERG)) { - catalogFactory = new IcebergFlinkCatalogFactory(hadoopConf); - } else if (tableFormat.equals(TableFormat.PAIMON)) { - catalogFactory = - new PaimonFlinkCatalogFactory( - unifiedCatalog.properties(), unifiedCatalog.metastoreType()); - } else { - throw new UnsupportedOperationException( - String.format( - "Unsupported table format: [%s] in the unified catalog, table identifier is [%s], the supported table formats are [%s].", - tableFormat, tableIdentifier, FlinkUnifiedCatalogFactory.SUPPORTED_FORMATS)); - } - - AbstractCatalog originalCatalog; - try { - context.getOptions().put(CatalogFactoryOptions.FLINK_TABLE_FORMATS.key(), tableFormat.name()); - originalCatalog = (AbstractCatalog) catalogFactory.createCatalog(context); - } catch (CatalogException e) { - if 
(e.getMessage().contains("must implement createCatalog(Context)")) { - originalCatalog = - (AbstractCatalog) catalogFactory.createCatalog(context.getName(), context.getOptions()); - } else { - throw e; - } - } - originalCatalog.open(); - availableCatalogs.put(tableFormat, originalCatalog); - return originalCatalog; - } - - @Override - public String toString() { - return "FlinkUnifiedCatalog{" - + "name='" - + getName() - + '\'' - + ", defaultDatabase='" - + getDefaultDatabase() - + '\'' - + ", amsUri='" - + amsUri - + '\'' - + ", amoroCatalogName='" - + amoroCatalogName - + '\'' - + ", availableCatalogs size=" - + availableCatalogs.size() - + "}"; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/MixedCatalog.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/MixedCatalog.java deleted file mode 100644 index 3026f608bd..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/MixedCatalog.java +++ /dev/null @@ -1,792 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.catalog; - -import static org.apache.amoro.flink.FlinkSchemaUtil.generateExtraOptionsFrom; -import static org.apache.amoro.flink.FlinkSchemaUtil.getPhysicalSchema; -import static org.apache.amoro.flink.FlinkSchemaUtil.toSchema; -import static org.apache.flink.table.factories.FactoryUtil.CONNECTOR; -import static org.apache.flink.util.Preconditions.checkNotNull; - -import org.apache.amoro.NoSuchDatabaseException; -import org.apache.amoro.flink.InternalCatalogBuilder; -import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions; -import org.apache.amoro.flink.table.MixedDynamicTableFactory; -import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.mixed.MixedFormatCatalog; -import org.apache.amoro.scan.CombinedScanTask; -import org.apache.amoro.scan.KeyedTableScanTask; -import org.apache.amoro.scan.MixedFileScanTask; -import org.apache.amoro.shade.guava32.com.google.common.base.Objects; -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; -import org.apache.amoro.shade.guava32.com.google.common.collect.Sets; -import org.apache.amoro.table.KeyedTable; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.PrimaryKeySpec; -import org.apache.amoro.table.TableBuilder; -import org.apache.amoro.table.TableIdentifier; -import org.apache.amoro.table.TableProperties; -import org.apache.amoro.table.UnkeyedTable; -import org.apache.amoro.utils.CompatiblePropertyUtil; -import org.apache.flink.table.api.TableColumn; -import org.apache.flink.table.api.TableColumn.ComputedColumn; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.AbstractCatalog; -import org.apache.flink.table.catalog.CatalogBaseTable; -import 
org.apache.flink.table.catalog.CatalogDatabase; -import org.apache.flink.table.catalog.CatalogFunction; -import org.apache.flink.table.catalog.CatalogPartition; -import org.apache.flink.table.catalog.CatalogPartitionSpec; -import org.apache.flink.table.catalog.CatalogTable; -import org.apache.flink.table.catalog.CatalogTableImpl; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.exceptions.CatalogException; -import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; -import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; -import org.apache.flink.table.catalog.exceptions.FunctionNotExistException; -import org.apache.flink.table.catalog.exceptions.PartitionSpecInvalidException; -import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; -import org.apache.flink.table.catalog.exceptions.TableNotExistException; -import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; -import org.apache.flink.table.catalog.stats.CatalogColumnStatistics; -import org.apache.flink.table.catalog.stats.CatalogTableStatistics; -import org.apache.flink.table.expressions.Expression; -import org.apache.flink.table.factories.Factory; -import org.apache.flink.table.factories.FactoryUtil; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.PartitionField; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.UpdateProperties; -import org.apache.iceberg.exceptions.AlreadyExistsException; -import org.apache.iceberg.exceptions.NoSuchTableException; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.FlinkFilters; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.util.FlinkAlterTableUtil; -import org.apache.iceberg.io.CloseableIterable; - -import java.io.IOException; -import 
java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.function.Function; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -/** Catalogs for mixed table format(include mixed-iceberg and mixed-hive). */ -public class MixedCatalog extends AbstractCatalog { - public static final String DEFAULT_DB = "default"; - - /** - * To distinguish 'CREATE TABLE LIKE' by checking stack - * org.apache.flink.table.planner.operations.SqlCreateTableConverter#lookupLikeSourceTable - */ - public static final String SQL_LIKE_METHOD = "lookupLikeSourceTable"; - - public static final String LOCATION = "location"; - - public static final String CHERRY_PICK_SNAPSHOT_ID = "cherry-pick-snapshot-id"; - - public static final String CURRENT_SNAPSHOT_ID = "current-snapshot-id"; - - private final InternalCatalogBuilder catalogBuilder; - - private MixedFormatCatalog internalCatalog; - - public MixedCatalog(String name, String defaultDatabase, InternalCatalogBuilder catalogBuilder) { - super(name, defaultDatabase); - this.catalogBuilder = catalogBuilder; - } - - public MixedCatalog(MixedCatalog copy) { - this(copy.getName(), copy.getDefaultDatabase(), copy.catalogBuilder); - } - - @Override - public void open() throws CatalogException { - internalCatalog = catalogBuilder.build(); - } - - @Override - public void close() throws CatalogException {} - - @Override - public List listDatabases() throws CatalogException { - return internalCatalog.listDatabases(); - } - - @Override - public CatalogDatabase getDatabase(String databaseName) throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public boolean databaseExists(String databaseName) throws CatalogException { - return listDatabases().stream().anyMatch(db -> db.equalsIgnoreCase(databaseName)); - } - - @Override - public void createDatabase(String name, CatalogDatabase database, boolean ignoreIfExists) - throws 
CatalogException, DatabaseAlreadyExistException { - try { - internalCatalog.createDatabase(name); - } catch (AlreadyExistsException e) { - if (!ignoreIfExists) { - throw new DatabaseAlreadyExistException(getName(), name, e); - } - } - } - - @Override - public void dropDatabase(String name, boolean ignoreIfNotExists, boolean cascade) - throws CatalogException, DatabaseNotExistException { - try { - internalCatalog.dropDatabase(name); - } catch (NoSuchDatabaseException e) { - if (!ignoreIfNotExists) { - throw new DatabaseNotExistException(getName(), name); - } - } - } - - @Override - public void alterDatabase(String name, CatalogDatabase newDatabase, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public List listTables(String databaseName) throws CatalogException { - return internalCatalog.listTables(databaseName).stream() - .map(TableIdentifier::getTableName) - .collect(Collectors.toList()); - } - - @Override - public List listViews(String databaseName) throws CatalogException { - return Collections.emptyList(); - } - - @Override - public CatalogBaseTable getTable(ObjectPath tablePath) - throws TableNotExistException, CatalogException { - TableIdentifier tableIdentifier = getTableIdentifier(tablePath); - try { - MixedTable table = internalCatalog.loadTable(tableIdentifier); - Schema mixedTableSchema = table.schema(); - - Map mixedTableProperties = Maps.newHashMap(table.properties()); - fillTableProperties(mixedTableProperties); - fillTableMetaPropertiesIfLookupLike(mixedTableProperties, tableIdentifier); - - List partitionKeys = toPartitionKeys(table.spec(), table.schema()); - return CatalogTable.of( - toSchema(mixedTableSchema, MixedFormatUtils.getPrimaryKeys(table), mixedTableProperties) - .toSchema(), - null, - partitionKeys, - mixedTableProperties); - } catch (NoSuchTableException e) { - throw new TableNotExistException(this.getName(), tablePath); - } - } - - /** - * For now, 'CREATE TABLE 
LIKE' would be treated as the case which users want to add watermark in - * temporal join, as an alternative of lookup join, and use mixed-format table as build table, - * i.e. right table. So the properties those required in temporal join will be put automatically. - * - *

If you don't want the properties, 'EXCLUDING ALL' is what you need. More details @see LIKE - */ - private void fillTableMetaPropertiesIfLookupLike( - Map properties, TableIdentifier tableIdentifier) { - StackTraceElement[] stackTraceElements = Thread.currentThread().getStackTrace(); - boolean isLookupLike = false; - for (StackTraceElement stackTraceElement : stackTraceElements) { - if (Objects.equal(SQL_LIKE_METHOD, stackTraceElement.getMethodName())) { - isLookupLike = true; - break; - } - } - - if (!isLookupLike) { - return; - } - - properties.put(CONNECTOR.key(), MixedDynamicTableFactory.IDENTIFIER); - properties.put(MixedFormatValidator.MIXED_FORMAT_CATALOG.key(), tableIdentifier.getCatalog()); - properties.put(MixedFormatValidator.MIXED_FORMAT_TABLE.key(), tableIdentifier.getTableName()); - properties.put(MixedFormatValidator.MIXED_FORMAT_DATABASE.key(), tableIdentifier.getDatabase()); - properties.put(CatalogFactoryOptions.AMS_URI.key(), catalogBuilder.getAmsUri()); - } - - private static List toPartitionKeys(PartitionSpec spec, Schema icebergSchema) { - List partitionKeys = Lists.newArrayList(); - for (PartitionField field : spec.fields()) { - if (field.transform().isIdentity()) { - partitionKeys.add(icebergSchema.findColumnName(field.sourceId())); - } else { - // Not created by Flink SQL. - // For compatibility with iceberg tables, return empty. - // TODO modify this after Flink support partition transform. 
- return Collections.emptyList(); - } - } - return partitionKeys; - } - - private void fillTableProperties(Map tableProperties) { - boolean enableStream = - CompatiblePropertyUtil.propertyAsBoolean( - tableProperties, - TableProperties.ENABLE_LOG_STORE, - TableProperties.ENABLE_LOG_STORE_DEFAULT); - if (enableStream) { - tableProperties.putIfAbsent( - FactoryUtil.FORMAT.key(), - tableProperties.getOrDefault( - TableProperties.LOG_STORE_DATA_FORMAT, - TableProperties.LOG_STORE_DATA_FORMAT_DEFAULT)); - } - } - - private TableIdentifier getTableIdentifier(ObjectPath tablePath) { - return TableIdentifier.of( - internalCatalog.name(), tablePath.getDatabaseName(), tablePath.getObjectName()); - } - - @Override - public boolean tableExists(ObjectPath tablePath) throws CatalogException { - return internalCatalog.tableExists(getTableIdentifier(tablePath)); - } - - @Override - public void dropTable(ObjectPath tablePath, boolean ignoreIfNotExists) throws CatalogException { - internalCatalog.dropTable(getTableIdentifier(tablePath), true); - } - - @Override - public void renameTable(ObjectPath tablePath, String newTableName, boolean ignoreIfNotExists) - throws CatalogException { - internalCatalog.renameTable(getTableIdentifier(tablePath), newTableName); - } - - @Override - public void createTable(ObjectPath tablePath, CatalogBaseTable table, boolean ignoreIfExists) - throws CatalogException, TableAlreadyExistException { - validateFlinkTable(table); - validateColumnOrder(table); - createAmoroTable(tablePath, table, ignoreIfExists); - } - - private void createAmoroTable( - ObjectPath tablePath, CatalogBaseTable table, boolean ignoreIfExists) - throws CatalogException, TableAlreadyExistException { - TableSchema tableSchema = table.getSchema(); - // get PhysicalColumn for TableSchema - TableSchema physicalSchema = getPhysicalSchema(tableSchema); - Schema icebergSchema = FlinkSchemaUtil.convert(physicalSchema); - TableBuilder tableBuilder = - 
internalCatalog.newTableBuilder(getTableIdentifier(tablePath), icebergSchema); - - tableSchema - .getPrimaryKey() - .ifPresent( - k -> { - PrimaryKeySpec.Builder builder = PrimaryKeySpec.builderFor(icebergSchema); - k.getColumns().forEach(builder::addColumn); - tableBuilder.withPrimaryKeySpec(builder.build()); - }); - - PartitionSpec spec = toPartitionSpec(((CatalogTable) table).getPartitionKeys(), icebergSchema); - tableBuilder.withPartitionSpec(spec); - - Map properties = table.getOptions(); - // update computed columns and watermark to properties - Map extraOptions = generateExtraOptionsFrom(tableSchema); - properties.putAll(extraOptions); - - tableBuilder.withProperties(properties); - - try { - tableBuilder.create(); - } catch (AlreadyExistsException e) { - if (!ignoreIfExists) { - throw new TableAlreadyExistException(getName(), tablePath, e); - } - } - } - - private static PartitionSpec toPartitionSpec(List partitionKeys, Schema icebergSchema) { - PartitionSpec.Builder builder = PartitionSpec.builderFor(icebergSchema); - partitionKeys.forEach(builder::identity); - return builder.build(); - } - - private static void validateFlinkTable(CatalogBaseTable table) { - Preconditions.checkArgument( - table instanceof CatalogTable, "The Table should be a CatalogTable."); - } - - @Override - public void alterTable(ObjectPath tablePath, CatalogBaseTable newTable, boolean ignoreIfNotExists) - throws CatalogException, TableNotExistException { - validateFlinkTable(newTable); - - TableIdentifier tableIdentifier = getTableIdentifier(tablePath); - MixedTable mixedTable; - try { - mixedTable = internalCatalog.loadTable(tableIdentifier); - } catch (NoSuchTableException e) { - if (!ignoreIfNotExists) { - throw new TableNotExistException(internalCatalog.name(), tablePath, e); - } else { - return; - } - } - - // Currently, Flink SQL only support altering table properties. 
- validateTableSchemaAndPartition( - toCatalogTable(mixedTable, tableIdentifier), (CatalogTable) newTable); - - if (mixedTable.isUnkeyedTable()) { - alterUnKeyedTable(mixedTable.asUnkeyedTable(), newTable); - } else if (mixedTable.isKeyedTable()) { - alterKeyedTable(mixedTable.asKeyedTable(), newTable); - } else { - throw new UnsupportedOperationException("Unsupported alter table"); - } - } - - @Override - public List listPartitions(ObjectPath tablePath) - throws CatalogException, TableNotPartitionedException { - return listPartitionsByFilter(tablePath, Collections.emptyList()); - } - - @Override - public List listPartitions( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException, TableNotPartitionedException, PartitionSpecInvalidException { - checkNotNull(tablePath, "Table path cannot be null"); - checkNotNull(partitionSpec, "CatalogPartitionSpec cannot be null"); - TableIdentifier tableIdentifier = getTableIdentifier(tablePath); - checkValidPartitionSpec( - partitionSpec, internalCatalog.loadTable(tableIdentifier).spec(), tablePath); - List catalogPartitionSpecs = listPartitions(tablePath); - return catalogPartitionSpecs.stream() - .filter(spec -> spec.equals(partitionSpec)) - .collect(Collectors.toList()); - } - - @Override - public List listPartitionsByFilter( - ObjectPath tablePath, List filters) - throws CatalogException, TableNotPartitionedException { - TableIdentifier tableIdentifier = getTableIdentifier(tablePath); - MixedTable mixedTable = internalCatalog.loadTable(tableIdentifier); - - org.apache.iceberg.expressions.Expression filter; - List expressions = - filters.stream() - .map(FlinkFilters::convert) - .filter(Optional::isPresent) - .map(Optional::get) - .collect(Collectors.toList()); - - filter = - expressions.isEmpty() - ? 
Expressions.alwaysTrue() - : expressions.stream().reduce(Expressions::and).orElse(Expressions.alwaysTrue()); - - if (mixedTable.spec().isUnpartitioned()) { - throw new TableNotPartitionedException(internalCatalog.name(), tablePath); - } - Set set = Sets.newHashSet(); - if (mixedTable.isKeyedTable()) { - KeyedTable table = mixedTable.asKeyedTable(); - try (CloseableIterable combinedScanTasks = - table.newScan().filter(filter).planTasks()) { - for (CombinedScanTask combinedScanTask : combinedScanTasks) { - combinedScanTask.tasks().stream() - .flatMap( - (Function>) - keyedTableScanTask -> - Stream.of( - keyedTableScanTask.dataTasks(), - keyedTableScanTask.mixedEquityDeletes()) - .flatMap(List::stream)) - .forEach( - mixedFileScanTask -> { - Map map = Maps.newHashMap(); - StructLike structLike = mixedFileScanTask.partition(); - PartitionSpec spec = table.spec(); - for (int i = 0; i < structLike.size(); i++) { - map.put( - spec.fields().get(i).name(), - String.valueOf(structLike.get(i, Object.class))); - } - set.add(new CatalogPartitionSpec(map)); - }); - } - } catch (IOException e) { - throw new CatalogException( - String.format("Failed to list partitions of table %s", tablePath), e); - } - } else { - UnkeyedTable table = mixedTable.asUnkeyedTable(); - try (CloseableIterable tasks = table.newScan().filter(filter).planFiles()) { - for (DataFile dataFile : CloseableIterable.transform(tasks, FileScanTask::file)) { - Map map = Maps.newHashMap(); - StructLike structLike = dataFile.partition(); - PartitionSpec spec = table.specs().get(dataFile.specId()); - for (int i = 0; i < structLike.size(); i++) { - map.put(spec.fields().get(i).name(), String.valueOf(structLike.get(i, Object.class))); - } - set.add(new CatalogPartitionSpec(map)); - } - } catch (IOException e) { - throw new CatalogException( - String.format("Failed to list partitions of table %s", tablePath), e); - } - } - return Lists.newArrayList(set); - } - - @Override - public CatalogPartition getPartition(ObjectPath 
tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void createPartition( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogPartition partition, - boolean ignoreIfExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void dropPartition( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterPartition( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogPartition newPartition, - boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public List listFunctions(String dbName) throws CatalogException { - return Collections.emptyList(); - } - - @Override - public CatalogFunction getFunction(ObjectPath functionPath) - throws FunctionNotExistException, CatalogException { - throw new FunctionNotExistException(getName(), functionPath); - } - - @Override - public boolean functionExists(ObjectPath functionPath) throws CatalogException { - return false; - } - - @Override - public void createFunction( - ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterFunction( - ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void dropFunction(ObjectPath functionPath, boolean ignoreIfNotExists) - throws CatalogException { - throw new 
UnsupportedOperationException(); - } - - @Override - public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) throws CatalogException { - return CatalogTableStatistics.UNKNOWN; - } - - @Override - public CatalogColumnStatistics getTableColumnStatistics(ObjectPath tablePath) - throws CatalogException { - return CatalogColumnStatistics.UNKNOWN; - } - - @Override - public CatalogTableStatistics getPartitionStatistics( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { - return CatalogTableStatistics.UNKNOWN; - } - - @Override - public CatalogColumnStatistics getPartitionColumnStatistics( - ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { - return CatalogColumnStatistics.UNKNOWN; - } - - @Override - public void alterTableStatistics( - ObjectPath tablePath, CatalogTableStatistics tableStatistics, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterTableColumnStatistics( - ObjectPath tablePath, CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterPartitionStatistics( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogTableStatistics partitionStatistics, - boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public void alterPartitionColumnStatistics( - ObjectPath tablePath, - CatalogPartitionSpec partitionSpec, - CatalogColumnStatistics columnStatistics, - boolean ignoreIfNotExists) - throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public Optional getFactory() { - return Optional.of(new MixedDynamicTableFactory(this)); - } - - public InternalCatalogBuilder catalogBuilder() { - return catalogBuilder; - } - - public String amsCatalogName() { - return 
internalCatalog.name(); - } - - /** - * Check whether a list of partition values are valid based on the given list of partition keys. - * - * @param partitionSpec a partition spec. - * @param mixedTablePartitionSpec mixedTablePartitionSpec - * @param tablePath tablePath - * @throws PartitionSpecInvalidException thrown if any key in partitionSpec doesn't exist in - * partitionKeys. - */ - private void checkValidPartitionSpec( - CatalogPartitionSpec partitionSpec, - PartitionSpec mixedTablePartitionSpec, - ObjectPath tablePath) - throws PartitionSpecInvalidException { - List partitionKeys = - mixedTablePartitionSpec.fields().stream() - .map(PartitionField::name) - .collect(Collectors.toList()); - for (String key : partitionSpec.getPartitionSpec().keySet()) { - if (!partitionKeys.contains(key)) { - throw new PartitionSpecInvalidException(getName(), partitionKeys, tablePath, partitionSpec); - } - } - } - - private void validateColumnOrder(CatalogBaseTable table) { - TableSchema schema = table.getSchema(); - List tableColumns = schema.getTableColumns(); - - boolean foundComputeColumn = false; - for (TableColumn tableColumn : tableColumns) { - if (tableColumn instanceof ComputedColumn) { - foundComputeColumn = true; - } else if (foundComputeColumn) { - throw new IllegalStateException( - "compute column must be listed after all physical columns. 
"); - } - } - } - - /** - * copy from - * https://github.com/apache/iceberg/blob/main/flink/v1.16/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java#L425C23-L425C54 - * - * @param ct1 CatalogTable before - * @param ct2 CatalogTable after - */ - private static void validateTableSchemaAndPartition(CatalogTable ct1, CatalogTable ct2) { - TableSchema ts1 = ct1.getSchema(); - TableSchema ts2 = ct2.getSchema(); - boolean equalsPrimary = false; - - if (ts1.getPrimaryKey().isPresent() && ts2.getPrimaryKey().isPresent()) { - equalsPrimary = - Objects.equal(ts1.getPrimaryKey().get().getType(), ts2.getPrimaryKey().get().getType()) - && Objects.equal( - ts1.getPrimaryKey().get().getColumns(), ts2.getPrimaryKey().get().getColumns()); - } else if (!ts1.getPrimaryKey().isPresent() && !ts2.getPrimaryKey().isPresent()) { - equalsPrimary = true; - } - - if (!(Objects.equal(ts1.getTableColumns(), ts2.getTableColumns()) - && Objects.equal(ts1.getWatermarkSpecs(), ts2.getWatermarkSpecs()) - && equalsPrimary)) { - throw new UnsupportedOperationException("Altering schema is not supported yet."); - } - - if (!ct1.getPartitionKeys().equals(ct2.getPartitionKeys())) { - throw new UnsupportedOperationException("Altering partition keys is not supported yet."); - } - } - - private void alterUnKeyedTable(UnkeyedTable table, CatalogBaseTable newTable) { - Map oldProperties = table.properties(); - Map setProperties = Maps.newHashMap(); - - String setLocation = null; - String setSnapshotId = null; - String pickSnapshotId = null; - - for (Map.Entry entry : newTable.getOptions().entrySet()) { - String key = entry.getKey(); - String value = entry.getValue(); - - if (Objects.equal(value, oldProperties.get(key))) { - continue; - } - - if (LOCATION.equalsIgnoreCase(key)) { - setLocation = value; - } else if (CURRENT_SNAPSHOT_ID.equalsIgnoreCase(key)) { - setSnapshotId = value; - } else if (CHERRY_PICK_SNAPSHOT_ID.equalsIgnoreCase(key)) { - pickSnapshotId = value; - } else { - 
setProperties.put(key, value); - } - } - - oldProperties - .keySet() - .forEach( - k -> { - if (!newTable.getOptions().containsKey(k)) { - setProperties.put(k, null); - } - }); - - FlinkAlterTableUtil.commitChanges( - table, setLocation, setSnapshotId, pickSnapshotId, setProperties); - } - - private CatalogTable toCatalogTable(MixedTable table, TableIdentifier tableIdentifier) { - Schema mixedTableSchema = table.schema(); - - Map mixedTableProperties = Maps.newHashMap(table.properties()); - fillTableProperties(mixedTableProperties); - fillTableMetaPropertiesIfLookupLike(mixedTableProperties, tableIdentifier); - - List partitionKeys = toPartitionKeys(table.spec(), table.schema()); - return new CatalogTableImpl( - toSchema(mixedTableSchema, MixedFormatUtils.getPrimaryKeys(table), mixedTableProperties), - partitionKeys, - mixedTableProperties, - null); - } - - private void alterKeyedTable(KeyedTable table, CatalogBaseTable newTable) { - Map oldProperties = table.properties(); - Map setProperties = Maps.newHashMap(); - for (Map.Entry entry : newTable.getOptions().entrySet()) { - String key = entry.getKey(); - String value = entry.getValue(); - if (!Objects.equal(value, oldProperties.get(key))) { - setProperties.put(key, value); - } - } - oldProperties - .keySet() - .forEach( - k -> { - if (!newTable.getOptions().containsKey(k)) { - setProperties.put(k, null); - } - }); - commitKeyedChanges(table, setProperties); - } - - private void commitKeyedChanges(KeyedTable table, Map setProperties) { - if (!setProperties.isEmpty()) { - updateTransactionKey(table.updateProperties(), setProperties); - } - } - - private void updateTransactionKey( - UpdateProperties updateProperties, Map setProperties) { - setProperties.forEach( - (k, v) -> { - if (v == null) { - updateProperties.remove(k); - } else { - updateProperties.set(k, v); - } - }); - updateProperties.commit(); - } -} diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/CatalogFactoryOptions.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/CatalogFactoryOptions.java deleted file mode 100644 index 95e5888e79..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/CatalogFactoryOptions.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.catalog.factories; - -import static org.apache.amoro.properties.CatalogMetaProperties.TABLE_FORMATS; - -import org.apache.amoro.flink.catalog.FlinkUnifiedCatalog; -import org.apache.amoro.flink.catalog.MixedCatalog; -import org.apache.amoro.properties.CatalogMetaProperties; -import org.apache.flink.annotation.Internal; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; - -/** {@link ConfigOption}s for {@link MixedCatalog} and {@link FlinkUnifiedCatalog}. 
*/ -@Internal -public class CatalogFactoryOptions { - public static final String MIXED_ICEBERG_IDENTIFIER = "mixed_iceberg"; - public static final String MIXED_HIVE_IDENTIFIER = "mixed_hive"; - public static final String UNIFIED_IDENTIFIER = "unified"; - - public static final ConfigOption AMS_URI = - ConfigOptions.key(CatalogMetaProperties.AMS_URI).stringType().noDefaultValue(); - - public static final ConfigOption FLINK_TABLE_FORMATS = - ConfigOptions.key(TABLE_FORMATS) - .stringType() - .noDefaultValue() - .withDescription("This illustrates the table format contained in the catalog."); -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/FlinkUnifiedCatalogFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/FlinkUnifiedCatalogFactory.java deleted file mode 100644 index 063666b17d..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/FlinkUnifiedCatalogFactory.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.catalog.factories; - -import static org.apache.amoro.Constants.THRIFT_TABLE_SERVICE_NAME; -import static org.apache.amoro.flink.table.OptionsUtil.getCatalogProperties; -import static org.apache.amoro.properties.CatalogMetaProperties.TABLE_FORMATS; - -import org.apache.amoro.CommonUnifiedCatalog; -import org.apache.amoro.TableFormat; -import org.apache.amoro.UnifiedCatalog; -import org.apache.amoro.UnifiedCatalogLoader; -import org.apache.amoro.client.AmsThriftUrl; -import org.apache.amoro.flink.InternalCatalogBuilder; -import org.apache.amoro.flink.catalog.FlinkUnifiedCatalog; -import org.apache.amoro.flink.catalog.MixedCatalog; -import org.apache.amoro.shade.guava32.com.google.common.collect.Sets; -import org.apache.amoro.table.TableMetaStore; -import org.apache.amoro.utils.CatalogUtil; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.table.catalog.Catalog; -import org.apache.flink.table.catalog.CommonCatalogOptions; -import org.apache.flink.table.factories.CatalogFactory; -import org.apache.flink.util.Preconditions; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.flink.FlinkCatalogFactory; - -import java.util.Collections; -import java.util.Map; -import java.util.Set; - -/** Factory for {@link FlinkUnifiedCatalog}. 
*/ -public class FlinkUnifiedCatalogFactory implements CatalogFactory { - - public static final Set SUPPORTED_FORMATS = - Sets.newHashSet( - TableFormat.MIXED_ICEBERG, - TableFormat.MIXED_HIVE, - TableFormat.ICEBERG, - TableFormat.PAIMON); - - @Override - public String factoryIdentifier() { - return CatalogFactoryOptions.UNIFIED_IDENTIFIER; - } - - @Override - public Set> requiredOptions() { - return Collections.emptySet(); - } - - @Override - public Set> optionalOptions() { - return Collections.emptySet(); - } - - @Override - public Catalog createCatalog(Context context) { - - final String defaultDatabase = - context - .getOptions() - .getOrDefault(CommonCatalogOptions.DEFAULT_DATABASE_KEY, MixedCatalog.DEFAULT_DB); - final String metastoreUri = context.getOptions().get(CatalogFactoryOptions.AMS_URI.key()); - final Map catalogProperties = getCatalogProperties(context.getOptions()); - - UnifiedCatalog unifiedCatalog; - if (metastoreUri != null) { - String amoroCatalogName = - AmsThriftUrl.parse(metastoreUri, THRIFT_TABLE_SERVICE_NAME).catalogName(); - unifiedCatalog = - UnifiedCatalogLoader.loadUnifiedCatalog( - metastoreUri, amoroCatalogName, catalogProperties); - } else { - String metastoreType = catalogProperties.get(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE); - Preconditions.checkArgument(metastoreType != null, "Catalog type cannot be empty"); - TableMetaStore tableMetaStore = - TableMetaStore.builder() - .withConfiguration( - InternalCatalogBuilder.clusterHadoopConf(metastoreType, catalogProperties)) - .build(); - unifiedCatalog = - new CommonUnifiedCatalog( - context.getName(), metastoreType, catalogProperties, tableMetaStore); - } - Configuration hadoopConf = unifiedCatalog.authenticationContext().getConfiguration(); - Set tableFormats = - CatalogUtil.tableFormats(unifiedCatalog.metastoreType(), unifiedCatalog.properties()); - validate(tableFormats); - - return new FlinkUnifiedCatalog( - metastoreUri, defaultDatabase, unifiedCatalog, context, hadoopConf); - } 
- - private void validate(Set expectedFormats) { - if (expectedFormats.isEmpty()) { - throw new IllegalArgumentException( - String.format( - "The table formats must be specified in the catalog properties: [%s]", - TABLE_FORMATS)); - } - if (!SUPPORTED_FORMATS.containsAll(expectedFormats)) { - throw new IllegalArgumentException( - String.format( - "The table formats [%s] are not supported in the unified catalog, the supported table formats are [%s].", - expectedFormats, SUPPORTED_FORMATS)); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/iceberg/IcebergFlinkCatalogFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/iceberg/IcebergFlinkCatalogFactory.java deleted file mode 100644 index 44530bf1d1..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/iceberg/IcebergFlinkCatalogFactory.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.catalog.factories.iceberg; - -import org.apache.flink.table.catalog.Catalog; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.flink.FlinkCatalogFactory; - -import java.util.Map; - -/** Creating Iceberg Catalog by the hadoop configuration which stored in the AMS. */ -public class IcebergFlinkCatalogFactory extends FlinkCatalogFactory { - private final Configuration hadoopConf; - - public IcebergFlinkCatalogFactory(Configuration hadoopConf) { - this.hadoopConf = hadoopConf; - } - - @Override - public Catalog createCatalog(String name, Map properties) { - return super.createCatalog(name, properties, hadoopConf); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/mixed/MixedHiveCatalogFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/mixed/MixedHiveCatalogFactory.java deleted file mode 100644 index d55eff1a21..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/mixed/MixedHiveCatalogFactory.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.catalog.factories.mixed; - -import org.apache.amoro.flink.catalog.MixedCatalog; -import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions; - -/** - * The factory to create {@link MixedCatalog} with {@link - * CatalogFactoryOptions#MIXED_HIVE_IDENTIFIER} identifier. - */ -public class MixedHiveCatalogFactory extends MixedIcebergCatalogFactory { - - @Override - public String factoryIdentifier() { - return CatalogFactoryOptions.MIXED_HIVE_IDENTIFIER; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/mixed/MixedIcebergCatalogFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/mixed/MixedIcebergCatalogFactory.java deleted file mode 100644 index b394e1eaa4..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/mixed/MixedIcebergCatalogFactory.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.catalog.factories.mixed; - -import static org.apache.amoro.flink.table.OptionsUtil.getCatalogProperties; - -import org.apache.amoro.flink.InternalCatalogBuilder; -import org.apache.amoro.flink.catalog.MixedCatalog; -import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.table.catalog.Catalog; -import org.apache.flink.table.catalog.CommonCatalogOptions; -import org.apache.flink.table.factories.CatalogFactory; - -import java.util.Collections; -import java.util.Map; -import java.util.Set; - -/** - * The factory to create {@link MixedCatalog} with {@link - * CatalogFactoryOptions#MIXED_ICEBERG_IDENTIFIER} identifier. - */ -public class MixedIcebergCatalogFactory implements CatalogFactory { - - @Override - public String factoryIdentifier() { - return CatalogFactoryOptions.MIXED_ICEBERG_IDENTIFIER; - } - - @Override - public Catalog createCatalog(Context context) { - - final String defaultDatabase = - context - .getOptions() - .getOrDefault(CommonCatalogOptions.DEFAULT_DATABASE_KEY, MixedCatalog.DEFAULT_DB); - final String amsUri = context.getOptions().get(CatalogFactoryOptions.AMS_URI.key()); - final Map catalogProperties = getCatalogProperties(context.getOptions()); - - final InternalCatalogBuilder catalogBuilder = - InternalCatalogBuilder.builder() - .amsUri(amsUri) - .catalogName(context.getName()) - .properties(catalogProperties); - - return new MixedCatalog(context.getName(), defaultDatabase, catalogBuilder); - } - - @Override - public Set> requiredOptions() { - return Collections.emptySet(); - } - - @Override - public Set> optionalOptions() { - return Collections.emptySet(); - } -} diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/paimon/PaimonFlinkCatalogFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/paimon/PaimonFlinkCatalogFactory.java deleted file mode 100644 index fd0b6ae937..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/catalog/factories/paimon/PaimonFlinkCatalogFactory.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.catalog.factories.paimon; - -import org.apache.amoro.properties.CatalogMetaProperties; -import org.apache.paimon.catalog.FileSystemCatalogFactory; -import org.apache.paimon.flink.FlinkCatalog; -import org.apache.paimon.flink.FlinkCatalogFactory; -import org.apache.paimon.options.CatalogOptions; - -import java.util.Map; - -/** Creating Paimon FlinkCatalogFactory with properties which stored in the AMS */ -public class PaimonFlinkCatalogFactory extends FlinkCatalogFactory { - private final Map options; - private final String metastoreType; - - public PaimonFlinkCatalogFactory(Map options, String metastoreType) { - this.options = options; - this.metastoreType = metastoreType; - } - - @Override - public FlinkCatalog createCatalog(Context context) { - context.getOptions().putAll(options); - addMetastoreType(context); - return super.createCatalog(context); - } - - private void addMetastoreType(Context context) { - String type; - if (CatalogMetaProperties.CATALOG_TYPE_HADOOP.equalsIgnoreCase(metastoreType)) { - type = FileSystemCatalogFactory.IDENTIFIER; - } else { - type = metastoreType; - } - context.getOptions().put(CatalogOptions.METASTORE.key(), type); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/FlinkTablePropertiesInvocationHandler.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/FlinkTablePropertiesInvocationHandler.java deleted file mode 100644 index e0d1ec1b7d..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/FlinkTablePropertiesInvocationHandler.java +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.interceptor; - -import org.apache.amoro.flink.util.ReflectionUtil; -import org.apache.amoro.table.MixedTable; - -import java.io.Serializable; -import java.lang.reflect.InvocationHandler; -import java.lang.reflect.Method; -import java.lang.reflect.Proxy; -import java.util.HashMap; -import java.util.Map; - -/** Integrate flinkTable properties */ -public class FlinkTablePropertiesInvocationHandler implements InvocationHandler, Serializable { - - private final MixedTable mixedTable; - private final Map flinkTableProperties = new HashMap<>(); - protected Map tablePropertiesCombined = new HashMap<>(); - - public FlinkTablePropertiesInvocationHandler( - Map flinkTableProperties, MixedTable mixedTable) { - this.tablePropertiesCombined.putAll(mixedTable.properties()); - this.mixedTable = mixedTable; - if (flinkTableProperties == null) { - return; - } - this.flinkTableProperties.putAll(flinkTableProperties); - this.tablePropertiesCombined.putAll(flinkTableProperties); - } - - public Object getProxy() { - return Proxy.newProxyInstance( - mixedTable.getClass().getClassLoader(), - ReflectionUtil.getAllInterface(mixedTable.getClass()), - this); - } - - @Override - public Object invoke(Object proxy, Method method, Object[] args) throws Throwable { - if 
("properties".equals(method.getName())) { - return tablePropertiesCombined; - } else if ("asKeyedTable".equals(method.getName())) { - return proxy; - } - Object result = method.invoke(mixedTable, args); - // rewrite the properties as of the mixed-format table properties may be updated. - if ("refresh".equals(method.getName())) { - rewriteProperties(); - } - return result; - } - - private void rewriteProperties() { - Map refreshedProperties = mixedTable.properties(); - // iterate through the properties of the mixed-format table and update the properties of the - // tablePropertiesCombined. - for (Map.Entry entry : refreshedProperties.entrySet()) { - if (flinkTableProperties.containsKey(entry.getKey())) { - // Don't update the properties of the tablePropertiesCombined - continue; - } - if (!tablePropertiesCombined.containsKey(entry.getKey()) - || !tablePropertiesCombined.get(entry.getKey()).equals(entry.getValue())) { - tablePropertiesCombined.put(entry.getKey(), entry.getValue()); - } - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/KerberosInterceptor.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/KerberosInterceptor.java deleted file mode 100644 index 14de1dabd6..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/KerberosInterceptor.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.interceptor; - -import net.sf.cglib.proxy.MethodInterceptor; -import net.sf.cglib.proxy.MethodProxy; -import org.apache.amoro.io.AuthenticatedFileIO; - -import java.io.Serializable; -import java.lang.reflect.Method; - -/** Using cglib proxy to avoid proxy object having different class */ -public class KerberosInterceptor implements MethodInterceptor, Serializable { - - private static final long serialVersionUID = 1L; - private final AuthenticatedFileIO authenticatedFileIO; - - public KerberosInterceptor(AuthenticatedFileIO authenticatedFileIO) { - this.authenticatedFileIO = authenticatedFileIO; - } - - @Override - public Object intercept(Object o, Method method, Object[] args, MethodProxy proxy) - throws Throwable { - Object res; - try { - res = - authenticatedFileIO.doAs( - () -> { - try { - return proxy.invokeSuper(o, args); - } catch (Throwable e) { - throw new RuntimeException(e); - } - }); - } catch (RuntimeException e) { - throw e.getCause(); - } - return res; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/KerberosInvocationHandler.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/KerberosInvocationHandler.java deleted file mode 100644 index 25dce7fb0d..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/KerberosInvocationHandler.java +++ /dev/null @@ -1,70 +0,0 @@ -/* 
- * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.interceptor; - -import org.apache.amoro.flink.util.ReflectionUtil; -import org.apache.amoro.io.AuthenticatedFileIO; - -import java.io.Serializable; -import java.lang.reflect.InvocationHandler; -import java.lang.reflect.Method; -import java.lang.reflect.Proxy; - -/** - * Proxy for iceberg-flink class. To support kerberos. Using jdk proxy can surrogate an instance - * which already exists. 
- * - * @param proxy class type - */ -public class KerberosInvocationHandler implements InvocationHandler, Serializable { - - private static final long serialVersionUID = 1L; - private final AuthenticatedFileIO authenticatedFileIO; - private T obj; - - public KerberosInvocationHandler(AuthenticatedFileIO authenticatedFileIO) { - this.authenticatedFileIO = authenticatedFileIO; - } - - public Object getProxy(T obj) { - this.obj = obj; - return Proxy.newProxyInstance( - obj.getClass().getClassLoader(), ReflectionUtil.getAllInterface(obj.getClass()), this); - } - - @Override - public Object invoke(Object proxy, Method method, Object[] args) throws Throwable { - Object res; - try { - res = - authenticatedFileIO.doAs( - () -> { - try { - method.setAccessible(true); - return method.invoke(obj, args); - } catch (Throwable e) { - throw new RuntimeException(e); - } - }); - } catch (RuntimeException e) { - throw e.getCause(); - } - return res; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/ProxyFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/ProxyFactory.java deleted file mode 100644 index 0e0341bac6..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/interceptor/ProxyFactory.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.interceptor; - -import org.apache.amoro.flink.util.ProxyUtil; - -import java.io.Serializable; - -/** - * Create proxy in runtime to avoid 'ClassNotFoundException: $$EnhancerByCglib' - * - * @param - */ -public class ProxyFactory implements Serializable { - private static final long serialVersionUID = 1L; - private final Class clazz; - private final KerberosInterceptor interceptor; - private final Class[] argumentTypes; - private final Object[] arguments; - - public ProxyFactory( - Class clazz, KerberosInterceptor interceptor, Class[] argumentTypes, Object[] arguments) { - this.clazz = clazz; - this.interceptor = interceptor; - this.argumentTypes = argumentTypes; - this.arguments = arguments; - } - - public T getInstance() { - return ProxyUtil.getProxy(clazz, interceptor, argumentTypes, arguments); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/BasicLookupFunction.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/BasicLookupFunction.java deleted file mode 100644 index 114245de93..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/BasicLookupFunction.java +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.LOOKUP_RELOADING_INTERVAL; -import static org.apache.amoro.flink.util.MixedFormatUtils.loadMixedTable; -import static org.apache.flink.util.Preconditions.checkArgument; - -import org.apache.amoro.flink.read.MixedIncrementalLoader; -import org.apache.amoro.flink.read.hybrid.enumerator.MergeOnReadIncrementalPlanner; -import org.apache.amoro.flink.read.hybrid.reader.DataIteratorReaderFunction; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.hive.io.reader.AbstractAdaptHiveKeyedDataReader; -import org.apache.amoro.table.MixedTable; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.functions.FunctionContext; -import org.apache.flink.util.FlinkRuntimeException; -import org.apache.flink.util.Preconditions; -import org.apache.flink.util.concurrent.ExecutorThreadFactory; -import org.apache.iceberg.Schema; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.io.CloseableIterator; -import org.slf4j.Logger; 
-import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.IOException; -import java.io.Serializable; -import java.lang.reflect.Field; -import java.util.List; -import java.util.UUID; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.ThreadLocalRandom; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.atomic.AtomicReference; -import java.util.function.Predicate; - -/** This is a basic lookup function for an mixed-format table. */ -public class BasicLookupFunction implements Serializable { - private static final Logger LOG = LoggerFactory.getLogger(BasicLookupFunction.class); - private static final long serialVersionUID = 1671720424494168710L; - private MixedTable mixedTable; - private KVTable kvTable; - private final List joinKeys; - private final Schema projectSchema; - private final List filters; - private final MixedFormatTableLoader loader; - private long nextLoadTime = Long.MIN_VALUE; - private final long reloadIntervalSeconds; - private MixedIncrementalLoader incrementalLoader; - private final Configuration config; - private transient AtomicLong lookupLoadingTimeMs; - private final Predicate predicate; - private final TableFactory kvTableFactory; - private final AbstractAdaptHiveKeyedDataReader flinkMORDataReader; - private final DataIteratorReaderFunction readerFunction; - - private transient ScheduledExecutorService executor; - private final AtomicReference failureThrowable = new AtomicReference<>(); - - public BasicLookupFunction( - TableFactory tableFactory, - MixedTable mixedTable, - List joinKeys, - Schema projectSchema, - List filters, - MixedFormatTableLoader tableLoader, - Configuration config, - Predicate predicate, - AbstractAdaptHiveKeyedDataReader adaptHiveKeyedDataReader, - DataIteratorReaderFunction readerFunction) { - checkArgument( - mixedTable.isKeyedTable(), - String.format( - "Only keyed 
mixed-format table support lookup join, this table [%s] is an unkeyed table.", - mixedTable.name())); - Preconditions.checkNotNull(tableFactory, "kvTableFactory cannot be null"); - this.kvTableFactory = tableFactory; - this.joinKeys = joinKeys; - this.projectSchema = projectSchema; - this.filters = filters; - this.loader = tableLoader; - this.config = config; - this.reloadIntervalSeconds = config.get(LOOKUP_RELOADING_INTERVAL).getSeconds(); - this.predicate = predicate; - this.flinkMORDataReader = adaptHiveKeyedDataReader; - this.readerFunction = readerFunction; - } - - /** - * Open the lookup function, e.g.: create {@link KVTable} kvTable, and load data. - * - * @throws IOException If serialize or deserialize failed - */ - public void open(FunctionContext context) throws IOException { - init(context); - start(); - } - - /** - * Initialize the mixed-format table, kvTable and incrementalLoader. - * - * @param context - */ - public void init(FunctionContext context) { - LOG.info("lookup function row data predicate: {}.", predicate); - MetricGroup metricGroup = context.getMetricGroup().addGroup(LookupMetrics.GROUP_NAME_LOOKUP); - if (mixedTable == null) { - mixedTable = loadMixedTable(loader).asKeyedTable(); - } - mixedTable.refresh(); - - lookupLoadingTimeMs = new AtomicLong(); - metricGroup.gauge(LookupMetrics.LOADING_TIME_MS, () -> lookupLoadingTimeMs.get()); - - LOG.info("projected schema {}.\n table schema {}.", projectSchema, mixedTable.schema()); - kvTable = - kvTableFactory.create( - new RowDataStateFactory(generateRocksDBPath(context, mixedTable.name()), metricGroup), - mixedTable.asKeyedTable().primaryKeySpec().fieldNames(), - joinKeys, - projectSchema, - config, - predicate); - kvTable.open(); - - this.incrementalLoader = - new MixedIncrementalLoader<>( - new MergeOnReadIncrementalPlanner(loader), flinkMORDataReader, readerFunction, filters); - } - - public void start() { - // Keep the first-time synchronized loading to avoid a mass of null-match records 
during - // initialization - checkAndLoad(); - - this.executor = - Executors.newScheduledThreadPool( - 1, new ExecutorThreadFactory("Mixed-format-lookup-scheduled-loader")); - this.executor.scheduleWithFixedDelay( - () -> { - try { - checkAndLoad(); - } catch (Exception e) { - // fail the lookup and skip the rest of the items - // if the failure handler decides to throw an exception - failureThrowable.compareAndSet(null, e); - } - }, - 0, - reloadIntervalSeconds, - TimeUnit.MILLISECONDS); - } - - public List lookup(RowData lookupKey) { - checkErrorAndRethrow(); - try { - return kvTable.get(lookupKey); - } catch (Exception e) { - throw new FlinkRuntimeException(e); - } - } - - /** - * Check whether it is time to periodically load data to kvTable. Support to use {@link - * Expression} filters to filter the data. - */ - private synchronized void checkAndLoad() { - if (nextLoadTime > System.currentTimeMillis()) { - return; - } - nextLoadTime = System.currentTimeMillis() + 1000 * reloadIntervalSeconds; - - long batchStart = System.currentTimeMillis(); - while (incrementalLoader.hasNext()) { - long start = System.currentTimeMillis(); - mixedTable - .io() - .doAs( - () -> { - try (CloseableIterator iterator = incrementalLoader.next()) { - if (kvTable.initialized()) { - kvTable.upsert(iterator); - } else { - LOG.info( - "This table {} is still under initialization progress.", mixedTable.name()); - kvTable.initialize(iterator); - } - } - return null; - }); - LOG.info("Split task fetched, cost {}ms.", System.currentTimeMillis() - start); - } - if (!kvTable.initialized()) { - kvTable.waitInitializationCompleted(); - } - lookupLoadingTimeMs.set(System.currentTimeMillis() - batchStart); - - LOG.info( - "{} table lookup loading, these batch tasks completed, cost {}ms.", - mixedTable.name(), - lookupLoadingTimeMs.get()); - } - - public KVTable getKVTable() { - return kvTable; - } - - public void close() throws Exception { - if (kvTable != null) { - kvTable.close(); - } - if 
(executor != null) { - executor.shutdownNow(); - } - } - - private void checkErrorAndRethrow() { - Throwable cause = failureThrowable.get(); - if (cause != null) { - throw new RuntimeException("An error occurred in MixedFormatLookupFunction.", cause); - } - } - - private String generateRocksDBPath(FunctionContext context, String tableName) { - String tmpPath = getTmpDirectoryFromTMContainer(context); - File db = new File(tmpPath, tableName + "-lookup-" + UUID.randomUUID()); - return db.toString(); - } - - private static String getTmpDirectoryFromTMContainer(FunctionContext context) { - try { - Field field = context.getClass().getDeclaredField("context"); - field.setAccessible(true); - StreamingRuntimeContext runtimeContext = (StreamingRuntimeContext) field.get(context); - String[] tmpDirectories = runtimeContext.getTaskManagerRuntimeInfo().getTmpDirectories(); - return tmpDirectories[ThreadLocalRandom.current().nextInt(tmpDirectories.length)]; - } catch (NoSuchFieldException | IllegalAccessException e) { - throw new RuntimeException(e); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/BinaryRowDataSerializerWrapper.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/BinaryRowDataSerializerWrapper.java deleted file mode 100644 index e6bae12c04..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/BinaryRowDataSerializerWrapper.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.binary.BinaryRowData; -import org.apache.flink.table.runtime.typeutils.BinaryRowDataSerializer; -import org.apache.flink.table.runtime.typeutils.RowDataSerializer; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; - -import java.io.IOException; -import java.io.Serializable; - -/** - * This is a wrapper for {@link BinaryRowDataSerializer}. It is used to serialize and deserialize - * RowData. And serialize and deserialize operations are not thread-safe. 
- */ -public class BinaryRowDataSerializerWrapper implements Serializable, Cloneable { - - private static final long serialVersionUID = 1L; - protected BinaryRowDataSerializer serializer; - private RowDataSerializer rowDataSerializer; - private DataOutputSerializer outputView; - private DataInputDeserializer inputView; - private final Schema schema; - - public BinaryRowDataSerializerWrapper(Schema schema) { - this.serializer = new BinaryRowDataSerializer(schema.asStruct().fields().size()); - this.schema = schema; - } - - public byte[] serialize(RowData rowData) throws IOException { - if (rowDataSerializer == null) { - RowType rowType = FlinkSchemaUtil.convert(schema); - rowDataSerializer = new RowDataSerializer(rowType); - } - BinaryRowData binaryRowData = rowDataSerializer.toBinaryRow(rowData); - if (outputView == null) { - outputView = new DataOutputSerializer(32); - } - outputView.clear(); - serializer.serialize(binaryRowData, outputView); - return outputView.getCopyOfBuffer(); - } - - public RowData deserialize(byte[] recordBytes) throws IOException { - if (recordBytes == null) { - return null; - } - if (inputView == null) { - inputView = new DataInputDeserializer(); - } - inputView.setBuffer(recordBytes); - return serializer.deserialize(inputView); - } - - @Override - public BinaryRowDataSerializerWrapper clone() { - return new BinaryRowDataSerializerWrapper(schema); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/ByteArraySetSerializer.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/ByteArraySetSerializer.java deleted file mode 100644 index 2d54d65944..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/ByteArraySetSerializer.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or 
more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -import org.apache.commons.collections.CollectionUtils; - -import java.nio.ByteBuffer; -import java.util.Collections; -import java.util.HashSet; -import java.util.Set; - -/** Utility class for serializing and deserializing a set of ByteArrayWrapper objects. */ -public class ByteArraySetSerializer { - - /** - * Deserializes a byte array into a set of ByteArrayWrapper objects. - * - * @param byteArray the byte array to deserialize - * @return the deserialized set of ByteArrayWrapper objects - */ - public static Set deserialize(byte[] byteArray) { - if (byteArray == null) { - return Collections.emptySet(); - } - - Set set = new HashSet<>(); - - ByteBuffer buffer = ByteBuffer.wrap(byteArray); - int setSize = buffer.getInt(); // Read the size of the set - - for (int i = 0; i < setSize; i++) { - int elementSize = buffer.getInt(); // Read the size of the element - byte[] element = new byte[elementSize]; - buffer.get(element); // Read the element bytes - ByteArrayWrapper baw = new ByteArrayWrapper(element, elementSize); - set.add(baw); - } - - return set; - } - - /** - * Serializes a set of ByteArrayWrapper objects into a byte array. 
- * - * @param set the set of ByteArrayWrapper objects to serialize - * @return the serialized byte array - */ - public static byte[] serialize(Set set) { - if (CollectionUtils.isEmpty(set)) { - return null; - } - - // Calculate the total size of the resulting byte array - // The first 4 bytes represent the size of the set - int totalSize = 4; - for (ByteArrayWrapper record : set) { - // Each element consists of 4 bytes representing the size of the element - totalSize += 4; - totalSize += record.size; - } - - // Create a new byte array with the total size - ByteBuffer buffer = ByteBuffer.allocate(totalSize); - buffer.putInt(set.size()); // Write the size of the set - - for (ByteArrayWrapper record : set) { - buffer.putInt(record.size); // Write the size of the element - buffer.put(record.bytes); // Write the element bytes - } - - return buffer.array(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/ByteArrayWrapper.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/ByteArrayWrapper.java deleted file mode 100644 index 264a853a41..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/ByteArrayWrapper.java +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -import java.io.Serializable; -import java.nio.ByteBuffer; -import java.util.Locale; - -/** This byte array wrapper utility class. copied from com.ibm.icu.util.ByteArrayWrapper. */ -public class ByteArrayWrapper implements Comparable, Serializable { - private static final long serialVersionUID = -6697944376117365645L; - public byte[] bytes; - - /** - * Size of the internal byte array used. Different from bytes.length, size will be <= - * bytes.length. Semantics of size is similar to java.util.Vector.size(). - */ - public int size; - - /** - * Construct a new ByteArrayWrapper from a byte array and size. - * - * @param bytesToAdopt the byte array to adopt - * @param size the length of valid data in the byte array - * @throws IndexOutOfBoundsException if bytesToAdopt == null and size != 0, or size < 0, or - * size > bytesToAdopt.length. - */ - public ByteArrayWrapper(byte[] bytesToAdopt, int size) { - if ((bytesToAdopt == null && size != 0) - || size < 0 - || (bytesToAdopt != null && size > bytesToAdopt.length)) { - throw new IndexOutOfBoundsException("illegal size: " + size); - } - this.bytes = bytesToAdopt; - this.size = size; - } - - /** - * Construct a new ByteArrayWrapper from the contents of a ByteBuffer. - * - * @param source the ByteBuffer from which to get the data. - */ - public ByteArrayWrapper(ByteBuffer source) { - size = source.limit(); - bytes = new byte[size]; - source.get(bytes, 0, size); - } - - /** - * Ensure that the internal byte array is at least of length capacity. 
If the byte array is null - * or its length is less than capacity, a new byte array of length capacity will be allocated. The - * contents of the array (between 0 and size) remain unchanged. - * - * @param capacity minimum length of internal byte array. - * @return this ByteArrayWrapper - */ - public ByteArrayWrapper ensureCapacity(int capacity) { - if (bytes == null || bytes.length < capacity) { - byte[] newBytes = new byte[capacity]; - if (bytes != null) { - copyBytes(bytes, 0, newBytes, 0, size); - } - bytes = newBytes; - } - return this; - } - - /** - * Set the internal byte array from offset 0 to (limit - start) with the contents of src from - * offset start to limit. If the byte array is null or its length is less than capacity, a new - * byte array of length (limit - start) will be allocated. This resets the size of the internal - * byte array to (limit - start). - * - * @param src source byte array to copy from - * @param start start offset of src to copy from - * @param limit end + 1 offset of src to copy from - * @return this ByteArrayWrapper - */ - public final ByteArrayWrapper set(byte[] src, int start, int limit) { - size = 0; - append(src, start, limit); - return this; - } - - /** - * Appends the internal byte array from offset size with the contents of src from offset start to - * limit. This increases the size of the internal byte array to (size + limit - start). - * - * @param src source byte array to copy from - * @param start start offset of src to copy from - * @param limit end + 1 offset of src to copy from - * @return this ByteArrayWrapper - */ - public final ByteArrayWrapper append(byte[] src, int start, int limit) { - int len = limit - start; - ensureCapacity(size + len); - copyBytes(src, start, bytes, size, len); - size += len; - return this; - } - - /** - * Releases the internal byte array to the caller, resets the internal byte array to null and its - * size to 0. - * - * @return internal byte array. 
- */ - public final byte[] releaseBytes() { - byte[] result = bytes; - bytes = null; - size = 0; - return result; - } - - /** Returns string value for debugging. */ - @Override - public String toString() { - StringBuilder result = new StringBuilder(); - for (int i = 0; i < size; ++i) { - if (i != 0) { - result.append(" "); - } - result.append(hex(bytes[i] & 0xFF)); - } - return result.toString(); - } - - private static String hex(long i) { - if (i == Long.MIN_VALUE) { - return "-8000000000000000"; - } else { - boolean negative = i < 0L; - if (negative) { - i = -i; - } - - String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH); - if (result.length() < 2) { - result = "0000000000000000".substring(result.length(), 2) + result; - } - - return negative ? '-' + result : result; - } - } - - /** - * Return true if the bytes in each wrapper are equal. - * - * @param other the object to compare to. - * @return true if the two objects are equal. - */ - @Override - public boolean equals(Object other) { - if (this == other) { - return true; - } - if (other == null) { - return false; - } - if (!(other instanceof ByteArrayWrapper)) { - return false; - } - - ByteArrayWrapper that = (ByteArrayWrapper) other; - if (size != that.size) { - return false; - } - for (int i = 0; i < size; ++i) { - if (bytes[i] != that.bytes[i]) { - return false; - } - } - return true; - } - - /** - * Return the hashcode. - * - * @return the hashcode. - */ - @Override - public int hashCode() { - int result = size; - for (int i = 0; i < size; ++i) { - result = 37 * result + bytes[i]; - } - return result; - } - - /** - * Compare this object to another ByteArrayWrapper, which must not be null. - * - * @param other the object to compare to. - * @return a value <0, 0, or >0 as this compares less than, equal to, or greater than other. 
- * @throws ClassCastException if the other object is not a ByteArrayWrapper - */ - @Override - public int compareTo(ByteArrayWrapper other) { - if (this == other) { - return 0; - } - int minSize = Math.min(size, other.size); - for (int i = 0; i < minSize; ++i) { - if (bytes[i] != other.bytes[i]) { - return (bytes[i] & 0xFF) - (other.bytes[i] & 0xFF); - } - } - return size - other.size; - } - - /** - * Copies the contents of src byte array from offset srcOff to the target of target byte array at - * the offset targetOff. - * - * @param src source byte array to copy from - * @param srcOff start offset of src to copy from - * @param target target byte array to copy to - * @param targetOff start offset of target to copy to - * @param length size of contents to copy - */ - private static void copyBytes(byte[] src, int srcOff, byte[] target, int targetOff, int length) { - if (length < 64) { - for (int i = srcOff, n = targetOff; --length >= 0; ++i, ++n) { - target[n] = src[i]; - } - } else { - System.arraycopy(src, srcOff, target, targetOff, length); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KVTable.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KVTable.java deleted file mode 100644 index 596647eafa..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KVTable.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -import org.apache.flink.table.data.RowData; - -import java.io.Closeable; -import java.io.IOException; -import java.io.Serializable; -import java.util.Iterator; -import java.util.List; - -/** - * The KVTable interface is used for lookup join in mixed-format table on Flink. It includes methods - * for initializing and updating the lookup table, as well as getting results by key and cleaning up - * the cache. - */ -public interface KVTable extends Serializable, Closeable { - /** Initialize the lookup table */ - void open(); - - /** - * Get the result by the key. - * - * @throws IOException Serialize the rowData failed. - */ - List get(RowData key) throws IOException; - - /** - * Upsert the {@link KVTable} by the Change table dataStream. - * - * @throws IOException Serialize the rowData failed. - */ - void upsert(Iterator dataStream) throws IOException; - - /** - * Initial the {@link KVTable} by the MoR dataStream. - * - * @param dataStream the data stream for loading into the {@link KVTable}. - * @throws IOException Serialize the rowData failed. - */ - void initialize(Iterator dataStream) throws IOException; - - /** @return if the rowData is filtered, return true. */ - boolean filter(T value); - - /** @return if initialization is completed, return true. */ - boolean initialized(); - - /** - * Waiting for the initialization completed, and enable auto compaction at the end of the - * initialization. 
- */ - void waitInitializationCompleted(); - - /** - * Try to clean up the cache manually, due to the lookup_cache.ttl-after-write configuration. - * - *

lookup_cache.ttl-after-writ Only works in SecondaryIndexTable. - */ - default void cleanUp() {} - - void close(); -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KVTableFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KVTableFactory.java deleted file mode 100644 index 746e7eed37..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KVTableFactory.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.lookup; - -import static org.apache.amoro.flink.util.LookupUtil.convertLookupOptions; - -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.types.Types; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.Serializable; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.function.Predicate; -import java.util.stream.Collectors; - -public class KVTableFactory implements TableFactory, Serializable { - private static final Logger LOG = LoggerFactory.getLogger(KVTableFactory.class); - private static final long serialVersionUID = 8090117643055858494L; - public static final KVTableFactory INSTANCE = new KVTableFactory(); - - public KVTable create( - RowDataStateFactory rowDataStateFactory, - List primaryKeys, - List joinKeys, - Schema projectSchema, - Configuration config, - Predicate rowDataPredicate) { - Set joinKeySet = new HashSet<>(joinKeys); - Set primaryKeySet = new HashSet<>(primaryKeys); - // keep the primary keys order with projected schema fields. 
- primaryKeys = - projectSchema.asStruct().fields().stream() - .map(Types.NestedField::name) - .filter(primaryKeySet::contains) - .collect(Collectors.toList()); - - if (primaryKeySet.equals(joinKeySet)) { - LOG.info( - "create unique index table, unique keys are {}, lookup keys are {}.", - primaryKeys.toArray(), - joinKeys.toArray()); - return new UniqueIndexTable( - rowDataStateFactory, - primaryKeys, - projectSchema, - convertLookupOptions(config), - rowDataPredicate); - } else { - LOG.info( - "create secondary index table, unique keys are {}, lookup keys are {}.", - primaryKeys.toArray(), - joinKeys.toArray()); - return new SecondaryIndexTable( - rowDataStateFactory, - primaryKeys, - joinKeys, - projectSchema, - convertLookupOptions(config), - rowDataPredicate); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KeyRowData.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KeyRowData.java deleted file mode 100644 index 121fbbd080..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/KeyRowData.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RawValueData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.types.RowKind; - -public class KeyRowData implements RowData { - private final int[] keyIndexMapping; - private final RowData rowData; - - public KeyRowData(int[] keyIndexMapping, RowData rowData) { - this.keyIndexMapping = keyIndexMapping; - this.rowData = rowData; - } - - @Override - public int getArity() { - return keyIndexMapping.length; - } - - @Override - public RowKind getRowKind() { - return rowData.getRowKind(); - } - - @Override - public void setRowKind(RowKind kind) { - rowData.setRowKind(kind); - } - - @Override - public boolean isNullAt(int pos) { - return rowData.isNullAt(keyIndexMapping[pos]); - } - - @Override - public boolean getBoolean(int pos) { - return rowData.getBoolean(keyIndexMapping[pos]); - } - - @Override - public byte getByte(int pos) { - return rowData.getByte(keyIndexMapping[pos]); - } - - @Override - public short getShort(int pos) { - return rowData.getShort(keyIndexMapping[pos]); - } - - @Override - public int getInt(int pos) { - return rowData.getInt(keyIndexMapping[pos]); - } - - @Override - public long getLong(int pos) { - return rowData.getLong(keyIndexMapping[pos]); - } - - @Override - public float getFloat(int pos) { - return rowData.getFloat(keyIndexMapping[pos]); - } - - @Override - public double getDouble(int pos) { - return rowData.getDouble(keyIndexMapping[pos]); - } - - @Override - public StringData getString(int pos) { - return rowData.getString(keyIndexMapping[pos]); - } - - @Override - public DecimalData 
getDecimal(int pos, int precision, int scale) { - return rowData.getDecimal(keyIndexMapping[pos], precision, scale); - } - - @Override - public TimestampData getTimestamp(int pos, int precision) { - return rowData.getTimestamp(keyIndexMapping[pos], precision); - } - - @Override - public RawValueData getRawValue(int pos) { - return rowData.getRawValue(keyIndexMapping[pos]); - } - - @Override - public byte[] getBinary(int pos) { - return rowData.getBinary(keyIndexMapping[pos]); - } - - @Override - public ArrayData getArray(int pos) { - return rowData.getArray(keyIndexMapping[pos]); - } - - @Override - public MapData getMap(int pos) { - return rowData.getMap(keyIndexMapping[pos]); - } - - @Override - public RowData getRow(int pos, int numFields) { - return rowData.getRow(keyIndexMapping[pos], numFields); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupMetrics.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupMetrics.java deleted file mode 100644 index 44f7b0eeb2..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupMetrics.java +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -public class LookupMetrics { - - public static final String GROUP_NAME_LOOKUP = "mixed_format_lookup"; - public static final String LOADING_TIME_MS = "lookup_loading_cost_ms"; - public static final String UNIQUE_CACHE_SIZE = "lookup_unique_index_cache_size"; - public static final String SECONDARY_CACHE_SIZE = "lookup_secondary_index_cache_size"; -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupOptions.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupOptions.java deleted file mode 100644 index de5da31dd5..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupOptions.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -import org.apache.flink.util.Preconditions; - -import java.io.Serializable; -import java.time.Duration; - -/** This class is used to configure lookup options. */ -public class LookupOptions implements Serializable { - private static final long serialVersionUID = -1L; - - private final long lruMaximumSize; - private final int writeRecordThreadNum; - private final Duration ttlAfterWrite; - private final long blockCacheCapacity; - private final int blockCacheNumShardBits; - - private LookupOptions(Builder builder) { - this.lruMaximumSize = builder.lruMaximumSize; - this.writeRecordThreadNum = builder.writeRecordThreadNum; - this.ttlAfterWrite = builder.ttlAfterWrite; - this.blockCacheCapacity = builder.blockCacheCapacity; - this.blockCacheNumShardBits = builder.blockCacheNumShardBits; - } - - public long lruMaximumSize() { - return lruMaximumSize; - } - - public int writeRecordThreadNum() { - return writeRecordThreadNum; - } - - public Duration ttlAfterWrite() { - return ttlAfterWrite; - } - - public boolean isTTLAfterWriteValidated() { - return ttlAfterWrite.compareTo(Duration.ZERO) > 0; - } - - public long blockCacheCapacity() { - return blockCacheCapacity; - } - - public int numShardBits() { - return blockCacheNumShardBits; - } - - @Override - public String toString() { - return "LookupOptions{" - + "lruMaximumSize=" - + lruMaximumSize - + ", writeRecordThreadNum=" - + writeRecordThreadNum - + ", ttlAfterWrite=" - + ttlAfterWrite - + ", blockCacheCapacity=" - + blockCacheCapacity - + ", 
blockCacheNumShardBits=" - + blockCacheNumShardBits - + "}"; - } - - public static class Builder { - private long lruMaximumSize; - private int writeRecordThreadNum; - private Duration ttlAfterWrite; - private long blockCacheCapacity; - private int blockCacheNumShardBits; - - /** LRU cache max size. */ - public Builder lruMaximumSize(long lruMaximumSize) { - Preconditions.checkArgument(lruMaximumSize >= 0, "lruMaximumSize must not be negative"); - this.lruMaximumSize = lruMaximumSize; - return this; - } - - /** Write record thread num. */ - public Builder writeRecordThreadNum(int writeRecordThreadNum) { - Preconditions.checkArgument( - writeRecordThreadNum > 0, "writeRecordThreadNum must be greater than 0"); - this.writeRecordThreadNum = writeRecordThreadNum; - return this; - } - - /** Clean expired records after write. */ - public Builder ttlAfterWrite(Duration ttlAfterWrite) { - Preconditions.checkArgument( - !ttlAfterWrite.isNegative(), "ttlAfterWrite must not be negative"); - this.ttlAfterWrite = ttlAfterWrite; - return this; - } - - public Builder blockCacheCapacity(long blockCacheCapacity) { - Preconditions.checkArgument( - blockCacheCapacity > 0, "blockCacheCapacity must be greater than 0"); - this.blockCacheCapacity = blockCacheCapacity; - return this; - } - - public Builder blockCacheNumShardBits(int blockCacheNumShardBits) { - Preconditions.checkArgument( - blockCacheNumShardBits >= -1, - "blockCacheNumShardBits must be greater than or equal to -1"); - this.blockCacheNumShardBits = blockCacheNumShardBits; - return this; - } - - public LookupOptions build() { - return new LookupOptions(this); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupRecord.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupRecord.java deleted file mode 100644 index 0c5ddfe97c..0000000000 --- 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/LookupRecord.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -public class LookupRecord { - private final byte[] keyBytes; - private final byte[] valueBytes; - - private final OpType opType; - - private LookupRecord(OpType opType, byte[] keyBytes, byte[] valueBytes) { - this.keyBytes = keyBytes; - this.valueBytes = valueBytes; - this.opType = opType; - } - - public static LookupRecord of(OpType opType, byte[] keyBytes, byte[] valueBytes) { - return new LookupRecord(opType, keyBytes, valueBytes); - } - - public byte[] keyBytes() { - return keyBytes; - } - - public byte[] valueBytes() { - return valueBytes; - } - - public OpType opType() { - return opType; - } - - enum OpType { - PUT_BYTES, - DELETE_BYTES - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/MixedFormatRowDataLookupFunction.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/MixedFormatRowDataLookupFunction.java 
deleted file mode 100644 index ee50800585..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/MixedFormatRowDataLookupFunction.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -import org.apache.amoro.flink.read.hybrid.reader.DataIteratorReaderFunction; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.hive.io.reader.AbstractAdaptHiveKeyedDataReader; -import org.apache.amoro.table.MixedTable; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.functions.FunctionContext; -import org.apache.flink.table.functions.LookupFunction; -import org.apache.iceberg.Schema; -import org.apache.iceberg.expressions.Expression; - -import java.io.IOException; -import java.util.Collection; -import java.util.List; -import java.util.function.Predicate; - -/** A lookup function for {@link RowData} type. 
*/ -public class MixedFormatRowDataLookupFunction extends LookupFunction { - private static final long serialVersionUID = -7694050999266540499L; - private final BasicLookupFunction basicLookupFunction; - - public MixedFormatRowDataLookupFunction( - TableFactory tableFactory, - MixedTable mixedTable, - List joinKeys, - Schema projectSchema, - List filters, - MixedFormatTableLoader tableLoader, - Configuration config, - Predicate predicate, - AbstractAdaptHiveKeyedDataReader flinkMORDataReader, - DataIteratorReaderFunction readerFunction) { - this.basicLookupFunction = - new BasicLookupFunction<>( - tableFactory, - mixedTable, - joinKeys, - projectSchema, - filters, - tableLoader, - config, - predicate, - flinkMORDataReader, - readerFunction); - } - - @Override - public void open(FunctionContext context) throws IOException { - basicLookupFunction.open(context); - } - - @Override - public Collection lookup(RowData keyRow) throws IOException { - return basicLookupFunction.lookup(keyRow); - } - - @Override - public void close() throws Exception { - basicLookupFunction.close(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBCacheState.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBCacheState.java deleted file mode 100644 index a82bff54db..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBCacheState.java +++ /dev/null @@ -1,342 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -import org.apache.amoro.AmoroIOException; -import org.apache.amoro.utils.map.RocksDBBackend; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.metrics.Gauge; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.shaded.guava30.com.google.common.cache.Cache; -import org.apache.flink.shaded.guava30.com.google.common.cache.CacheBuilder; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.FlinkRuntimeException; -import org.apache.flink.util.Preconditions; -import org.rocksdb.ColumnFamilyHandle; -import org.rocksdb.MutableColumnFamilyOptions; -import org.rocksdb.RocksDBException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Queue; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicReference; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -/** - * This is an abstract state backed by RocksDB and Guava cache for storing and retrieving key-value - * pairs of byte arrays. 
- * - * @param the type of the cache's values, which are not permitted to be null - */ -public abstract class RocksDBCacheState { - private static final Logger LOG = LoggerFactory.getLogger(RocksDBCacheState.class); - protected RocksDBBackend rocksDB; - protected final boolean secondaryIndexMemoryMapEnabled; - - protected Cache guavaCache; - - protected final String columnFamilyName; - protected final ColumnFamilyHandle columnFamilyHandle; - protected ThreadLocal keySerializerThreadLocal = - new ThreadLocal<>(); - - protected ThreadLocal valueSerializerThreadLocal = - new ThreadLocal<>(); - - protected final BinaryRowDataSerializerWrapper keySerializer; - - protected final BinaryRowDataSerializerWrapper valueSerializer; - private ExecutorService writeRocksDBService; - private final AtomicBoolean initialized = new AtomicBoolean(false); - private final AtomicBoolean closed = new AtomicBoolean(false); - protected Queue lookupRecordsQueue; - - private final int writeRocksDBThreadNum; - private List> writeRocksDBThreadFutures; - private final AtomicReference writingThreadException = new AtomicReference<>(); - protected final MetricGroup metricGroup; - private final LookupOptions lookupOptions; - - public RocksDBCacheState( - RocksDBBackend rocksDB, - String columnFamilyName, - BinaryRowDataSerializerWrapper keySerializer, - BinaryRowDataSerializerWrapper valueSerializer, - MetricGroup metricGroup, - LookupOptions lookupOptions, - boolean secondaryIndexMemoryMapEnabled) { - this.rocksDB = rocksDB; - this.columnFamilyName = columnFamilyName; - this.keySerializer = keySerializer; - this.valueSerializer = valueSerializer; - this.columnFamilyHandle = rocksDB.getColumnFamilyHandle(columnFamilyName); - this.writeRocksDBThreadNum = lookupOptions.writeRecordThreadNum(); - this.secondaryIndexMemoryMapEnabled = secondaryIndexMemoryMapEnabled; - this.metricGroup = metricGroup; - this.lookupOptions = lookupOptions; - } - - public void open() { - writeRocksDBService = 
Executors.newFixedThreadPool(writeRocksDBThreadNum); - - if (secondaryIndexMemoryMapEnabled) { - CacheBuilder cacheBuilder = CacheBuilder.newBuilder(); - if (lookupOptions.isTTLAfterWriteValidated()) { - cacheBuilder.expireAfterWrite(lookupOptions.ttlAfterWrite()); - } - } - guavaCache = CacheBuilder.newBuilder().maximumSize(lookupOptions.lruMaximumSize()).build(); - - addGauge(columnFamilyName + "_queue_size", () -> lookupRecordsQueue.size()); - - lookupRecordsQueue = new ConcurrentLinkedQueue<>(); - writeRocksDBThreadFutures = - IntStream.range(0, writeRocksDBThreadNum) - .mapToObj( - value -> - writeRocksDBService.submit( - new WriteRocksDBTask( - String.format( - "writing-rocksDB-cf_%s-thread-%d", columnFamilyName, value), - secondaryIndexMemoryMapEnabled))) - .collect(Collectors.toList()); - } - - @VisibleForTesting - public byte[] serializeKey(RowData key) throws IOException { - if (keySerializerThreadLocal.get() == null) { - keySerializerThreadLocal.set(keySerializer.clone()); - } - return serializeKey(keySerializerThreadLocal.get(), key); - } - - @VisibleForTesting - public byte[] serializeKey(BinaryRowDataSerializerWrapper keySerializer, RowData key) - throws IOException { - // key has a different RowKind would serialize different byte[], so unify the RowKind as INSERT. - byte[] result; - if (key.getRowKind() != RowKind.INSERT) { - RowKind rowKind = key.getRowKind(); - key.setRowKind(RowKind.INSERT); - result = keySerializer.serialize(key); - key.setRowKind(rowKind); - return result; - } - key.setRowKind(RowKind.INSERT); - return keySerializer.serialize(key); - } - - protected ByteArrayWrapper wrap(byte[] bytes) { - return new ByteArrayWrapper(bytes, bytes.length); - } - - protected void putIntoQueue(LookupRecord lookupRecord) { - Preconditions.checkNotNull(lookupRecord); - lookupRecordsQueue.add(lookupRecord); - } - - /** Waiting for the writing threads completed. 
*/ - public void waitWriteRocksDBDone() { - long every5SecondsPrint = Long.MIN_VALUE; - - while (true) { - if (lookupRecordsQueue.isEmpty()) { - initialized.set(true); - break; - } else if (every5SecondsPrint < System.currentTimeMillis()) { - LOG.info("Currently rocksDB queue size is {}.", lookupRecordsQueue.size()); - every5SecondsPrint = System.currentTimeMillis() + 5000; - } - } - // Wait for all threads to finish - for (Future future : writeRocksDBThreadFutures) { - try { - // wait for the task to complete, with a timeout of 5 seconds - future.get(5, TimeUnit.SECONDS); - } catch (TimeoutException e) { - // task took too long, interrupt the thread and terminate the task - future.cancel(true); - } catch (InterruptedException | ExecutionException e) { - // handle other exceptions - throw new FlinkRuntimeException(e); - } - } - } - - public boolean initialized() { - return initialized.get(); - } - - protected LookupRecord.OpType convertToOpType(RowKind rowKind) { - switch (rowKind) { - case INSERT: - case UPDATE_AFTER: - return LookupRecord.OpType.PUT_BYTES; - case DELETE: - case UPDATE_BEFORE: - return LookupRecord.OpType.DELETE_BYTES; - default: - throw new IllegalArgumentException(String.format("Not support this rowKind %s", rowKind)); - } - } - - /** - * Closes the RocksDB instance and cleans up the Guava cache. - * - *

Additionally, it shuts down the write-service and clears the RocksDB record queue if they - * exist. - */ - public void close() { - rocksDB.close(); - guavaCache.cleanUp(); - if (writeRocksDBService != null) { - writeRocksDBService.shutdown(); - writeRocksDBService = null; - } - closed.set(true); - if (lookupRecordsQueue != null) { - lookupRecordsQueue.clear(); - lookupRecordsQueue = null; - } - } - - public void initializationCompleted() { - try { - rocksDB.getDB().enableAutoCompaction(Collections.singletonList(columnFamilyHandle)); - MutableColumnFamilyOptions mutableColumnFamilyOptions = - MutableColumnFamilyOptions.builder().setDisableAutoCompactions(false).build(); - rocksDB.setOptions(columnFamilyHandle, mutableColumnFamilyOptions); - } catch (RocksDBException e) { - throw new AmoroIOException(e); - } - - LOG.info("set db options[disable_auto_compactions={}]", false); - } - - public void addGauge(String metricName, Gauge gauge) { - metricGroup.gauge(metricName, gauge); - } - - protected void checkConcurrentFailed() { - if (writingThreadException.get() != null) { - LOG.error("Check concurrent writing threads.", writingThreadException.get()); - throw new FlinkRuntimeException(writingThreadException.get()); - } - } - - /** - * This task is running during the initialization phase to write data{@link LookupRecord} to - * RocksDB. - * - *

During the initialization phase, the Merge-on-Read approach is used to retrieve data, which - * will only return INSERT data. When there are multiple entries with the same primary key, only - * one entry will be returned. - * - *

During the initialization phase, the incremental pull approach is also used to retrieve data - * that include four {@link RowKind} rowKinds, -D, +I, -U, and +U. - */ - class WriteRocksDBTask implements Runnable { - - private final String name; - private final boolean secondaryIndexMemoryMapEnabled; - - public WriteRocksDBTask(String name, boolean secondaryIndexMemoryMapEnabled) { - this.name = name; - this.secondaryIndexMemoryMapEnabled = secondaryIndexMemoryMapEnabled; - } - - @Override - public void run() { - LOG.info("{} starting.", name); - try { - while (!closed.get() && !initialized.get()) { - LookupRecord record = lookupRecordsQueue.poll(); - if (record != null) { - switch (record.opType()) { - case PUT_BYTES: - put(record); - break; - case DELETE_BYTES: - delete(record); - break; - default: - throw new IllegalArgumentException( - String.format("Not support this OpType %s", record.opType())); - } - } - } - } catch (Throwable e) { - LOG.error("writing failed:", e); - writingThreadException.set(e); - } - LOG.info("{} stopping.", name); - } - - private void delete(LookupRecord record) { - if (secondaryIndexMemoryMapEnabled) { - deleteSecondaryCache(record.keyBytes(), record.valueBytes()); - } else { - rocksDB.delete(columnFamilyName, record.keyBytes()); - // manually clear the record - record = null; - } - } - - private void put(LookupRecord record) { - if (secondaryIndexMemoryMapEnabled) { - putSecondaryCache(record.keyBytes(), record.valueBytes()); - } else { - rocksDB.put(columnFamilyHandle, record.keyBytes(), record.valueBytes()); - // manually clear the record - record = null; - } - } - } - - void putSecondaryCache(byte[] key, byte[] value) { - ByteArrayWrapper keyWrap = wrap(key); - ByteArrayWrapper valueWrap = wrap(value); - putCacheValue(guavaCache, keyWrap, valueWrap); - } - - void deleteSecondaryCache(byte[] key, byte[] value) { - ByteArrayWrapper keyWrap = wrap(key); - ByteArrayWrapper valueWrap = wrap(value); - removeValue(guavaCache, keyWrap, 
valueWrap); - } - - void putCacheValue( - Cache cache, ByteArrayWrapper keyWrap, ByteArrayWrapper valueWrap) {} - - void removeValue( - Cache cache, ByteArrayWrapper keyWrap, ByteArrayWrapper valueWrap) {} -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBRecordState.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBRecordState.java deleted file mode 100644 index 51bce5ab78..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBRecordState.java +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -import org.apache.amoro.utils.map.RocksDBBackend; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.Preconditions; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Optional; - -/** A class used to store the state of a lookup record. 
For {@link UniqueIndexTable}. */ -public class RocksDBRecordState extends RocksDBCacheState { - private static final Logger LOG = LoggerFactory.getLogger(RocksDBRecordState.class); - - public RocksDBRecordState( - RocksDBBackend rocksDB, - String columnFamilyName, - BinaryRowDataSerializerWrapper keySerializer, - BinaryRowDataSerializerWrapper valueSerializer, - MetricGroup metricGroup, - LookupOptions lookupOptions) { - super( - rocksDB, - columnFamilyName, - keySerializer, - valueSerializer, - metricGroup, - lookupOptions, - false); - } - - /** - * Writes a key-value pair to the sst file. - * - * @param key The key of the pair. - * @param value The value of the pair. - */ - public void asyncWrite(RowData key, RowData value) throws IOException { - byte[] keyBytes = serializeKey(key); - asyncWrite(key.getRowKind(), keyBytes, value); - } - - public void asyncWrite(RowKind rowKind, byte[] keyBytes, RowData value) throws IOException { - byte[] valueBytes = serializeValue(value); - LookupRecord.OpType opType = convertToOpType(rowKind); - putIntoQueue(LookupRecord.of(opType, keyBytes, valueBytes)); - } - - /** - * Retrieve the RowData from guava cache firstly, if value is null, fetch the value from the - * rocksDB. - * - * @param key try to find the record via this key. - * @throws IOException if serialize the RowData variable key failed. - */ - public Optional get(RowData key) throws IOException { - byte[] keyBytes = serializeKey(key); - return get(keyBytes); - } - - public Optional get(byte[] keyBytes) throws IOException { - ByteArrayWrapper key = wrap(keyBytes); - byte[] recordBytes = guavaCache.getIfPresent(key); - if (recordBytes == null) { - recordBytes = rocksDB.get(columnFamilyHandle, key.bytes); - if (recordBytes != null) { - guavaCache.put(key, recordBytes); - } - } - return Optional.ofNullable(deserializeValue(recordBytes)); - } - - /** - * Putting the serialized RowData key and value into the rocksDB and cache. 
- * - * @throws IOException if serialize the RowData variable key and value failed. - */ - public void put(RowData key, RowData value) throws IOException { - byte[] keyBytes = serializeKey(key); - put(keyBytes, value); - } - - public void put(byte[] keyBytes, RowData value) throws IOException { - Preconditions.checkNotNull(value); - - byte[] valueBytes = serializeValue(value); - rocksDB.put(columnFamilyHandle, keyBytes, valueBytes); - - // Speed up the initialization process of Lookup Join Function - ByteArrayWrapper key = wrap(keyBytes); - if (guavaCache.getIfPresent(wrap(keyBytes)) != null) { - guavaCache.put(key, valueBytes); - } - } - - /** - * Deleting the record in the rocksDB and cache if it exists. - * - * @throws IOException if serialize the RowData variable key failed. - */ - public void delete(RowData key) throws IOException { - byte[] keyBytes = serializeKey(key); - delete(keyBytes); - } - - public void delete(byte[] keyBytes) { - if (contain(wrap(keyBytes))) { - rocksDB.delete(columnFamilyName, keyBytes); - guavaCache.invalidate(wrap(keyBytes)); - } - } - - private boolean contain(ByteArrayWrapper byteArrayWrapper) { - byte[] recordBytes = guavaCache.getIfPresent(byteArrayWrapper); - if (recordBytes == null) { - recordBytes = rocksDB.get(columnFamilyName, byteArrayWrapper.bytes); - } - return recordBytes != null; - } - - private byte[] serializeValue(RowData value) throws IOException { - return valueSerializer().serialize(value); - } - - private RowData deserializeValue(byte[] recordBytes) throws IOException { - return valueSerializer().deserialize(recordBytes); - } - - private BinaryRowDataSerializerWrapper valueSerializer() { - if (valueSerializerThreadLocal.get() == null) { - valueSerializerThreadLocal.set(valueSerializer.clone()); - } - return valueSerializerThreadLocal.get(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBSetSpilledState.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBSetSpilledState.java deleted file mode 100644 index 5215ed812c..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBSetSpilledState.java +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -import org.apache.amoro.shade.guava32.com.google.common.collect.Sets; -import org.apache.amoro.utils.map.RocksDBBackend; -import org.apache.commons.collections.CollectionUtils; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.shaded.guava30.com.google.common.cache.Cache; -import org.apache.flink.table.data.RowData; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Collection; -import java.util.Collections; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; - -/** - * A class that stores the secondary index in the cache. For {@link SecondaryIndexTable}. - * - *

Support update the secondary index in the cache. - */ -public class RocksDBSetSpilledState extends RocksDBCacheState> { - private static final Logger LOG = LoggerFactory.getLogger(RocksDBSetSpilledState.class); - protected ThreadLocal joinKeySerializerThreadLocal = - new ThreadLocal<>(); - private final BinaryRowDataSerializerWrapper joinKeySerializer; - /** Multi-threads would put and delete the joinKeys and Set in the rocksdb. */ - private final Object rocksDBLock = new Object(); - - private final Map> tmpInitializationMap = - new ConcurrentHashMap<>(); - - public RocksDBSetSpilledState( - RocksDBBackend rocksDB, - String columnFamilyName, - BinaryRowDataSerializerWrapper joinKeySerializer, - BinaryRowDataSerializerWrapper uniqueKeySerialization, - BinaryRowDataSerializerWrapper valueSerializer, - MetricGroup metricGroup, - LookupOptions lookupOptions) { - super( - rocksDB, - columnFamilyName, - uniqueKeySerialization, - valueSerializer, - metricGroup, - lookupOptions, - true); - this.joinKeySerializer = joinKeySerializer; - } - - public void asyncWrite(RowData joinKey, byte[] uniqueKeyBytes) throws IOException { - byte[] joinKeyBytes = serializeKey(joinKey); - LookupRecord.OpType opType = convertToOpType(joinKey.getRowKind()); - putIntoQueue(LookupRecord.of(opType, joinKeyBytes, uniqueKeyBytes)); - } - - @Override - public byte[] serializeKey(RowData key) throws IOException { - if (joinKeySerializerThreadLocal.get() == null) { - joinKeySerializerThreadLocal.set(joinKeySerializer.clone()); - } - return serializeKey(joinKeySerializerThreadLocal.get(), key); - } - - /** - * Serialize join key to bytes and put the join key bytes and unique key bytes in the cache. - * - * @param joinKey the join key - * @param uniqueKeyBytes the unique key bytes - * @throws IOException if serialize the RowData variable failed. 
- */ - public void put(RowData joinKey, byte[] uniqueKeyBytes) throws IOException { - byte[] joinKeyBytes = serializeKey(joinKey); - putSecondaryCache(joinKeyBytes, uniqueKeyBytes); - } - - /** - * Delete the secondary index in the cache. - * - * @param joinKey the join key - * @param uniqueKeyBytes the unique key bytes - * @throws IOException if serialize the RowData variable failed. - */ - public void delete(RowData joinKey, byte[] uniqueKeyBytes) throws IOException { - final byte[] joinKeyBytes = serializeKey(joinKey); - deleteSecondaryCache(joinKeyBytes, uniqueKeyBytes); - } - - /** - * Retrieve the elements of the key. - * - *

Fetch the Collection from guava cache, if not present, fetch the result from RocksDB. if - * present, just return the result. - * - * @return not null, but may be empty. - */ - public Collection get(RowData key) throws IOException { - final byte[] joinKeyBytes = serializeKey(key); - ByteArrayWrapper joinKeyWrap = wrap(joinKeyBytes); - Set result = guavaCache.getIfPresent(joinKeyWrap); - if (result == null) { - byte[] uniqueKeysDeserialized = rocksDB.get(columnFamilyHandle, joinKeyBytes); - if (uniqueKeysDeserialized != null) { - result = ByteArraySetSerializer.deserialize(uniqueKeysDeserialized); - } - - if (CollectionUtils.isNotEmpty(result)) { - guavaCache.put(joinKeyWrap, result); - return result; - } - return Collections.emptyList(); - } - return result; - } - - @Override - public void putCacheValue( - Cache> cache, - ByteArrayWrapper keyWrap, - ByteArrayWrapper valueWrap) { - if (initialized()) { - byte[] joinKeyBytes = keyWrap.bytes; - synchronized (rocksDBLock) { - byte[] uniqueKeysDeserialized = rocksDB.get(columnFamilyHandle, joinKeyBytes); - if (uniqueKeysDeserialized != null) { - Set set = ByteArraySetSerializer.deserialize(uniqueKeysDeserialized); - if (!set.contains(valueWrap)) { - set.add(valueWrap); - uniqueKeysDeserialized = ByteArraySetSerializer.serialize(set); - rocksDB.put(columnFamilyHandle, joinKeyBytes, uniqueKeysDeserialized); - } - } else { - Set set = new HashSet<>(); - set.add(valueWrap); - uniqueKeysDeserialized = ByteArraySetSerializer.serialize(set); - rocksDB.put(columnFamilyHandle, joinKeyBytes, uniqueKeysDeserialized); - } - } - return; - } - tmpInitializationMap.compute( - keyWrap, - (keyWrapper, oldSet) -> { - if (oldSet == null) { - oldSet = Sets.newHashSet(); - } - oldSet.add(valueWrap); - return oldSet; - }); - } - - @Override - public void removeValue( - Cache> cache, - ByteArrayWrapper keyWrap, - ByteArrayWrapper valueWrap) { - if (initialized()) { - byte[] joinKeyBytes = keyWrap.bytes; - synchronized (rocksDBLock) { - byte[] 
uniqueKeysDeserialized = rocksDB.get(columnFamilyHandle, joinKeyBytes); - if (uniqueKeysDeserialized == null) { - return; - } - Set set = ByteArraySetSerializer.deserialize(uniqueKeysDeserialized); - if (set.contains(valueWrap)) { - set.remove(valueWrap); - if (!set.isEmpty()) { - uniqueKeysDeserialized = ByteArraySetSerializer.serialize(set); - rocksDB.put(columnFamilyHandle, joinKeyBytes, uniqueKeysDeserialized); - } - } - } - return; - } - tmpInitializationMap.compute( - keyWrap, - (keyWrapper, oldSet) -> { - if (oldSet == null) { - return null; - } - oldSet.remove(valueWrap); - if (oldSet.isEmpty()) { - return null; - } - return oldSet; - }); - } - - public void bulkIntoRocksDB() { - LOG.info("Total size={} in the tmp map, try to bulk into rocksdb", tmpInitializationMap.size()); - int[] count = {0}; - long start = System.currentTimeMillis(); - - tmpInitializationMap.forEach( - (byteArrayWrapper, set) -> { - rocksDB.put( - columnFamilyHandle, byteArrayWrapper.bytes, ByteArraySetSerializer.serialize(set)); - set = null; - count[0] = count[0] + 1; - if (count[0] % 100000 == 0) { - LOG.info("Ingested {} into rocksdb.", count[0]); - } - }); - tmpInitializationMap.clear(); - - LOG.info("Ingested {} completely, cost:{} ms.", count, System.currentTimeMillis() - start); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBSetState.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBSetState.java deleted file mode 100644 index e1f63ab4aa..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RocksDBSetState.java +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -import org.apache.amoro.log.Bytes; -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.amoro.utils.map.RocksDBBackend; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.table.data.RowData; - -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -/** - * Guava cache structure: key -> list, the elements of this list are rocksdb keys. RocksDB - * structure: element -> empty. - */ -public class RocksDBSetState extends RocksDBCacheState> { - - protected BinaryRowDataSerializerWrapper joinKeySerializer; - - private static final byte[] EMPTY = new byte[0]; - - public RocksDBSetState( - RocksDBBackend rocksDB, - String columnFamilyName, - BinaryRowDataSerializerWrapper keySerialization, - BinaryRowDataSerializerWrapper elementSerialization, - BinaryRowDataSerializerWrapper valueSerializer, - MetricGroup metricGroup, - LookupOptions lookupOptions) { - super( - rocksDB, - columnFamilyName, - elementSerialization, - valueSerializer, - metricGroup, - lookupOptions, - false); - this.joinKeySerializer = keySerialization; - } - - /** - * Retrieve the elements of the key. - * - *

Fetch the Collection from guava cache, if not present, fetch from rocksDB continuously, via - * prefix key scanning the rocksDB; if present, just return the result. - * - * @return not null, but may be empty. - */ - public List get(RowData key) throws IOException { - final byte[] keyBytes = serializeKey(key); - ByteArrayWrapper keyWrap = wrap(keyBytes); - List result = guavaCache.getIfPresent(keyWrap); - if (result == null) { - try (RocksDBBackend.ValueIterator iterator = - (RocksDBBackend.ValueIterator) rocksDB.values(columnFamilyName, keyBytes)) { - result = Lists.newArrayList(); - while (iterator.hasNext()) { - byte[] targetKeyBytes = iterator.key(); - if (isPrefixKey(targetKeyBytes, keyBytes)) { - byte[] value = - Arrays.copyOfRange(targetKeyBytes, keyBytes.length, targetKeyBytes.length); - result.add(value); - } - iterator.next(); - } - if (!result.isEmpty()) { - guavaCache.put(keyWrap, result); - } - } catch (Exception e) { - throw new RuntimeException(e); - } - } - return result; - } - - private boolean isPrefixKey(byte[] targetKeyBytes, byte[] keyBytes) { - for (int i = 0; i < keyBytes.length; i++) { - if (targetKeyBytes[i] != keyBytes[i]) { - return false; - } - } - return true; - } - - /** Merge key and element into guava cache and rocksdb. 
*/ - public void merge(RowData joinKey, byte[] uniqueKeyBytes) throws IOException { - byte[] joinKeyBytes = serializeKey(joinKey); - byte[] joinKeyAndPrimaryKeyBytes = Bytes.mergeByte(joinKeyBytes, uniqueKeyBytes); - ByteArrayWrapper keyWrap = wrap(joinKeyBytes); - if (guavaCache.getIfPresent(keyWrap) != null) { - guavaCache.invalidate(keyWrap); - } - rocksDB.put(columnFamilyName, joinKeyAndPrimaryKeyBytes, EMPTY); - } - - public void delete(RowData joinKey, byte[] elementBytes) throws IOException { - final byte[] joinKeyBytes = serializeKey(joinKey); - ByteArrayWrapper keyWrap = wrap(joinKeyBytes); - if (guavaCache.getIfPresent(keyWrap) != null) { - guavaCache.invalidate(keyWrap); - } - byte[] joinKeyAndPrimaryKeyBytes = Bytes.mergeByte(joinKeyBytes, elementBytes); - if (rocksDB.get(columnFamilyName, joinKeyAndPrimaryKeyBytes) != null) { - rocksDB.delete(columnFamilyName, joinKeyAndPrimaryKeyBytes); - } - } - - public void batchWrite(RowData joinKey, byte[] uniqueKeyBytes) throws IOException { - byte[] joinKeyBytes = serializeKey(joinKey); - byte[] joinKeyAndPrimaryKeyBytes = Bytes.mergeByte(joinKeyBytes, uniqueKeyBytes); - LookupRecord.OpType opType = convertToOpType(joinKey.getRowKind()); - lookupRecordsQueue.add(LookupRecord.of(opType, joinKeyAndPrimaryKeyBytes, EMPTY)); - } - - public byte[] serializeKey(RowData key) throws IOException { - return serializeKey(joinKeySerializer, key); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RowDataStateFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RowDataStateFactory.java deleted file mode 100644 index 57166b9ee8..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/RowDataStateFactory.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one 
- * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -import org.apache.amoro.utils.map.RocksDBBackend; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.util.Preconditions; -import org.rocksdb.BlockBasedTableConfig; -import org.rocksdb.ColumnFamilyOptions; -import org.rocksdb.LRUCache; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class RowDataStateFactory { - private static final Logger LOG = LoggerFactory.getLogger(RowDataStateFactory.class); - - private final String dbPath; - private RocksDBBackend db; - private final MetricGroup metricGroup; - - public RowDataStateFactory(String dbPath, MetricGroup metricGroup) { - Preconditions.checkNotNull(metricGroup); - this.dbPath = dbPath; - this.metricGroup = metricGroup; - } - - public RocksDBRecordState createRecordState( - String columnFamilyName, - BinaryRowDataSerializerWrapper keySerializer, - BinaryRowDataSerializerWrapper valueSerializer, - LookupOptions lookupOptions) { - db = createDB(lookupOptions, columnFamilyName); - - return new RocksDBRecordState( - db, columnFamilyName, keySerializer, valueSerializer, metricGroup, lookupOptions); - } - - public RocksDBSetSpilledState createSetState( - String columnFamilyName, - 
BinaryRowDataSerializerWrapper keySerialization, - BinaryRowDataSerializerWrapper elementSerialization, - BinaryRowDataSerializerWrapper valueSerializer, - LookupOptions lookupOptions) { - db = createDB(lookupOptions, columnFamilyName); - - return new RocksDBSetSpilledState( - db, - columnFamilyName, - keySerialization, - elementSerialization, - valueSerializer, - metricGroup, - lookupOptions); - } - - RocksDBBackend createDB(final LookupOptions lookupOptions, final String columnFamilyName) { - if (lookupOptions.isTTLAfterWriteValidated()) { - db = - RocksDBBackend.getOrCreateInstance( - dbPath, (int) lookupOptions.ttlAfterWrite().getSeconds()); - } else { - db = RocksDBBackend.getOrCreateInstance(dbPath); - } - ColumnFamilyOptions columnFamilyOptions = new ColumnFamilyOptions(); - configColumnFamilyOption(columnFamilyOptions, lookupOptions); - db.addColumnFamily(columnFamilyName, columnFamilyOptions); - return db; - } - - private void configColumnFamilyOption( - ColumnFamilyOptions columnFamilyOptions, LookupOptions lookupOptions) { - columnFamilyOptions.setDisableAutoCompactions(true); - - BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setBlockCache( - new LRUCache(lookupOptions.blockCacheCapacity(), lookupOptions.numShardBits())); - columnFamilyOptions.setTableFormatConfig(blockBasedTableConfig); - - LOG.info("set db options[disable_auto_compactions={}]", true); - LOG.info("{}", lookupOptions); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/SecondaryIndexTable.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/SecondaryIndexTable.java deleted file mode 100644 index 41d7322574..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/SecondaryIndexTable.java +++ /dev/null @@ -1,170 +0,0 @@ -/* 
- * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -import static org.apache.amoro.flink.lookup.LookupMetrics.SECONDARY_CACHE_SIZE; - -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.Schema; -import org.apache.iceberg.types.Types; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import java.util.function.Predicate; -import java.util.stream.Collectors; - -/** - * Use secondary index to lookup. Working for the situation where the join keys don't match the - * mixed-format table's primary keys. - * - *

Example: - * SELECT * FROM t1 JOIN t2 for system_time as of t1.pt as dim ON t1.user_name = dim.user_name - * - * - *

t2 as an mixed-format table with primary keys: user_name, city_name. - */ -public class SecondaryIndexTable extends UniqueIndexTable { - private static final Logger LOG = LoggerFactory.getLogger(SecondaryIndexTable.class); - private static final long serialVersionUID = 8707586070315884365L; - private final int[] secondaryKeyIndexMapping; - private final RocksDBSetSpilledState setState; - - private final LookupOptions lookupOptions; - - public SecondaryIndexTable( - RowDataStateFactory rowDataStateFactory, - List primaryKeys, - List joinKeys, - Schema projectSchema, - LookupOptions lookupOptions, - Predicate rowDataPredicate) { - super(rowDataStateFactory, primaryKeys, projectSchema, lookupOptions, rowDataPredicate); - - this.setState = - rowDataStateFactory.createSetState( - "secondaryIndex", - createKeySerializer(projectSchema, joinKeys), - createKeySerializer(projectSchema, primaryKeys), - createValueSerializer(projectSchema), - lookupOptions); - - List fields = - projectSchema.asStruct().fields().stream() - .map(Types.NestedField::name) - .collect(Collectors.toList()); - secondaryKeyIndexMapping = joinKeys.stream().mapToInt(fields::indexOf).toArray(); - this.lookupOptions = lookupOptions; - } - - @Override - public void open() { - super.open(); - setState.open(); - setState.addGauge(SECONDARY_CACHE_SIZE, () -> setState.guavaCache.size()); - } - - @Override - public List get(RowData key) throws IOException { - Collection uniqueKeys = setState.get(key); - if (!uniqueKeys.isEmpty()) { - List result = new ArrayList<>(uniqueKeys.size()); - for (ByteArrayWrapper uniqueKey : uniqueKeys) { - recordState.get(uniqueKey.bytes).ifPresent(result::add); - } - return result; - } - return Collections.emptyList(); - } - - @Override - public void upsert(Iterator dataStream) throws IOException { - while (dataStream.hasNext()) { - RowData value = dataStream.next(); - if (filter(value)) { - continue; - } - RowData uniqueKey = new KeyRowData(uniqueKeyIndexMapping, value); - RowData 
joinKey = new KeyRowData(secondaryKeyIndexMapping, value); - byte[] uniqueKeyBytes = recordState.serializeKey(uniqueKey); - - if (value.getRowKind() == RowKind.INSERT || value.getRowKind() == RowKind.UPDATE_AFTER) { - recordState.put(uniqueKeyBytes, value); - setState.put(joinKey, uniqueKeyBytes); - } else { - recordState.delete(uniqueKeyBytes); - setState.delete(joinKey, uniqueKeyBytes); - } - } - cleanUp(); - } - - @Override - public void initialize(Iterator dataStream) throws IOException { - while (dataStream.hasNext()) { - RowData value = dataStream.next(); - if (filter(value)) { - continue; - } - RowData uniqueKey = new KeyRowData(uniqueKeyIndexMapping, value); - RowData joinKey = new KeyRowData(secondaryKeyIndexMapping, value); - byte[] uniqueKeyBytes = recordState.serializeKey(uniqueKey); - - recordState.asyncWrite(value.getRowKind(), uniqueKeyBytes, value); - setState.asyncWrite(joinKey, uniqueKeyBytes); - } - recordState.checkConcurrentFailed(); - setState.checkConcurrentFailed(); - } - - @Override - public boolean initialized() { - return recordState.initialized() && setState.initialized(); - } - - @Override - public void cleanUp() { - if (lookupOptions.isTTLAfterWriteValidated()) { - setState.guavaCache.cleanUp(); - } - } - - @Override - public void waitInitializationCompleted() { - super.waitInitializationCompleted(); - LOG.info("Waiting for Set State initialization"); - setState.waitWriteRocksDBDone(); - LOG.info("Queue is empty row, try to bulk tmp map into rocksdb"); - setState.bulkIntoRocksDB(); - LOG.info("The concurrent threads have finished writing data into the Set State."); - setState.initializationCompleted(); - } - - @Override - public void close() { - super.close(); - recordState.close(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/TableFactory.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/TableFactory.java deleted file mode 100644 index 3cfed235fd..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/TableFactory.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.lookup; - -import org.apache.flink.configuration.Configuration; -import org.apache.iceberg.Schema; - -import java.util.List; -import java.util.function.Predicate; - -public interface TableFactory { - - KVTable create( - RowDataStateFactory rowDataStateFactory, - List primaryKeys, - List joinKeys, - Schema projectSchema, - Configuration config, - Predicate predicate); -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/UniqueIndexTable.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/UniqueIndexTable.java deleted file mode 100644 index 8f454f158b..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/UniqueIndexTable.java +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.lookup; - -import static org.apache.amoro.flink.lookup.LookupMetrics.UNIQUE_CACHE_SIZE; - -import org.apache.amoro.utils.SchemaUtil; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.Schema; -import org.apache.iceberg.types.Types; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import java.util.Optional; -import java.util.function.Predicate; -import java.util.stream.Collectors; - -/** - * Use a unique index to lookup. Working for the situation where the join keys include the - * mixed-format table's primary keys. - */ -public class UniqueIndexTable implements KVTable { - private static final Logger LOG = LoggerFactory.getLogger(UniqueIndexTable.class); - private static final long serialVersionUID = -6537777722200330050L; - protected final RocksDBRecordState recordState; - - protected int[] uniqueKeyIndexMapping; - protected final Predicate rowDataPredicate; - - public UniqueIndexTable( - RowDataStateFactory rowDataStateFactory, - List primaryKeys, - Schema projectSchema, - LookupOptions lookupOptions, - Predicate rowDataPredicate) { - - recordState = - rowDataStateFactory.createRecordState( - "uniqueIndex", - createKeySerializer(projectSchema, primaryKeys), - createValueSerializer(projectSchema), - lookupOptions); - List fields = - projectSchema.asStruct().fields().stream() - .map(Types.NestedField::name) - .collect(Collectors.toList()); - this.uniqueKeyIndexMapping = primaryKeys.stream().mapToInt(fields::indexOf).toArray(); - this.rowDataPredicate = rowDataPredicate; - } - - @Override - public void open() { - recordState.open(); - recordState.addGauge(UNIQUE_CACHE_SIZE, () -> recordState.guavaCache.size()); - } - - @Override - public List get(RowData key) throws IOException { - Optional record = recordState.get(key); - return 
record.map(Collections::singletonList).orElse(Collections.emptyList()); - } - - @Override - public void upsert(Iterator dataStream) throws IOException { - while (dataStream.hasNext()) { - RowData value = dataStream.next(); - if (filter(value)) { - continue; - } - RowData key = new KeyRowData(uniqueKeyIndexMapping, value); - - if (value.getRowKind() == RowKind.INSERT || value.getRowKind() == RowKind.UPDATE_AFTER) { - recordState.put(key, value); - } else { - recordState.delete(key); - } - } - } - - @Override - public void initialize(Iterator dataStream) throws IOException { - while (dataStream.hasNext()) { - RowData value = dataStream.next(); - if (filter(value)) { - continue; - } - - RowData key = new KeyRowData(uniqueKeyIndexMapping, value); - recordState.asyncWrite(key, value); - } - recordState.checkConcurrentFailed(); - } - - @Override - public boolean filter(RowData value) { - return predicate(value); - } - - protected boolean predicate(RowData value) { - return Optional.ofNullable(rowDataPredicate) - .map(predicate -> !predicate.test(value)) - .orElse(false); - } - - @Override - public boolean initialized() { - return recordState.initialized(); - } - - @Override - public void waitInitializationCompleted() { - LOG.info("Waiting for Record State initialization"); - recordState.waitWriteRocksDBDone(); - LOG.info("The concurrent threads have finished writing data into the Record State."); - recordState.initializationCompleted(); - } - - protected BinaryRowDataSerializerWrapper createKeySerializer( - Schema mixedTableSchema, List keys) { - Schema keySchema = SchemaUtil.selectInOrder(mixedTableSchema, keys); - return new BinaryRowDataSerializerWrapper(keySchema); - } - - protected BinaryRowDataSerializerWrapper createValueSerializer(Schema projectSchema) { - return new BinaryRowDataSerializerWrapper(projectSchema); - } - - @Override - public void close() { - recordState.close(); - } -} diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/filter/RowDataPredicate.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/filter/RowDataPredicate.java deleted file mode 100644 index 00e2769eab..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/filter/RowDataPredicate.java +++ /dev/null @@ -1,307 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.lookup.filter; - -import static org.apache.flink.table.types.logical.utils.LogicalTypeChecks.getPrecision; - -import org.apache.amoro.shade.guava32.com.google.common.base.MoreObjects; -import org.apache.amoro.shade.guava32.com.google.common.collect.Iterables; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.logical.DecimalType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.util.Preconditions; - -import java.io.Serializable; -import java.util.Arrays; -import java.util.function.Predicate; -import java.util.stream.Collectors; - -/** - * A predicate to be used in a filter operation on a {@link RowData} object. It can be constructed - * from various comparison operators on a field or from boolean operators with other predicates. - * - *

The {@code test} method will apply the predicate to a {@link RowData} object, returning true - * if the predicate is satisfied by the given data. - */ -public class RowDataPredicate implements Predicate, Serializable { - private static final long serialVersionUID = 1L; - private Opt opt; - private final String fieldName; - private final int fieldIndex; - private final DataType dataType; - private Serializable[] parameters; - private final RowDataPredicate[] leftPredicates; - private final RowDataPredicate[] rightPredicates; - - /** Constructor used for testing purposes. */ - public RowDataPredicate( - Opt opt, - String fieldName, - int fieldIndex, - DataType dataType, - Serializable[] parameters, - RowDataPredicate[] leftPredicates, - RowDataPredicate[] rightPredicates) { - this.opt = opt; - this.fieldName = fieldName; - this.fieldIndex = fieldIndex; - this.dataType = dataType; - this.parameters = parameters; - this.leftPredicates = leftPredicates; - this.rightPredicates = rightPredicates; - } - - /** - * Constructor for logical operation, when left and right side of the operation is not a simple - * comparison. - */ - public RowDataPredicate( - Opt opt, RowDataPredicate[] leftPredicates, RowDataPredicate[] rightPredicates) { - this(opt, null, -1, null, null, leftPredicates, rightPredicates); - } - - /** Constructor for simple comparison operator. */ - public RowDataPredicate(String fieldName, int fieldIndex, DataType dataType) { - this(null, fieldName, fieldIndex, dataType, null, null, null); - } - - /** Constructor for comparing value to a fixed value or NULL value. */ - public RowDataPredicate(Serializable[] parameters) { - this(null, null, -1, null, parameters, null, null); - } - - /** Test if the RowData record satisfies this predicate. 
*/ - @Override - public boolean test(RowData rowData) { - boolean result; - Object val; - switch (opt) { - case EQUALS: - val = getter(rowData); - result = compareEquals(val); - break; - case NOT_EQUALS: - val = getter(rowData); - result = !compareEquals(val); - break; - case GREATER_THAN: - val = getter(rowData); - result = compareGreaterThan(val); - break; - case GREATER_THAN_OR_EQUAL: - val = getter(rowData); - result = compareGreaterThanOrEqual(val); - break; - case LESS_THAN: - val = getter(rowData); - result = compareLessThan(val); - break; - case LESS_THAN_OR_EQUAL: - val = getter(rowData); - result = compareLessThanOrEqual(val); - break; - case IS_NOT_NULL: - val = getter(rowData); - result = compareIsNotNull(val); - break; - case IS_NULL: - val = getter(rowData); - result = compareIsNull(val); - break; - case AND: - Preconditions.checkNotNull(leftPredicates); - Preconditions.checkNotNull(rightPredicates); - result = Arrays.stream(leftPredicates).allMatch(p -> p.test(rowData)); - if (!result) { - return false; - } - result = Arrays.stream(rightPredicates).allMatch(p -> p.test(rowData)); - break; - case OR: - Preconditions.checkNotNull(leftPredicates); - Preconditions.checkNotNull(rightPredicates); - result = Arrays.stream(leftPredicates).allMatch(p -> p.test(rowData)); - if (result) { - return true; - } - result = Arrays.stream(rightPredicates).allMatch(p -> p.test(rowData)); - break; - default: - throw new IllegalArgumentException("Unsupported opt: " + opt); - } - - return result; - } - - public Serializable[] parameters() { - return parameters; - } - - /** - * Combines this RowDataPredicate with another using the specified operator. 
- * - * @param operator the operator to use for the combination - * @param that the other RowDataPredicate to combine with this one - * @return the combined RowDataPredicate - */ - public RowDataPredicate combine(Opt operator, RowDataPredicate that) { - this.opt = operator; - if (that == null) { - this.parameters = null; - } else { - this.parameters = that.parameters; - } - return this; - } - - private boolean compareLessThanOrEqual(Object val) { - return compareLiteral(dataType, parameters[0], val) >= 0; - } - - private boolean compareLessThan(Object val) { - return compareLiteral(dataType, parameters[0], val) > 0; - } - - private boolean compareGreaterThanOrEqual(Object val) { - return compareLiteral(dataType, parameters[0], val) <= 0; - } - - private boolean compareIsNotNull(Object val) { - return val != null; - } - - private boolean compareIsNull(Object val) { - return val == null; - } - - private boolean compareGreaterThan(Object val) { - return compareLiteral(dataType, parameters[0], val) < 0; - } - - private boolean compareEquals(Object val) { - if (parameters[0] == null && val == null) { - return true; - } - if (parameters[0] == null || val == null) { - return false; - } - return compareLiteral(dataType, parameters[0], val) == 0; - } - - Object getter(RowData rowData) { - int pos = fieldIndex; - if (rowData.isNullAt(pos)) { - return null; - } - Preconditions.checkNotNull(dataType); - LogicalType logicalType = dataType.getLogicalType(); - switch (logicalType.getTypeRoot()) { - case CHAR: - case VARCHAR: - return rowData.getString(pos).toString(); - case BOOLEAN: - return rowData.getBoolean(pos); - case BINARY: - case VARBINARY: - return rowData.getBinary(pos); - case DECIMAL: - DecimalType decimalType = (DecimalType) logicalType; - return rowData - .getDecimal(pos, decimalType.getPrecision(), decimalType.getScale()) - .toBigDecimal(); - case TINYINT: - return rowData.getByte(pos); - case SMALLINT: - return rowData.getShort(pos); - case INTEGER: - case DATE: 
- case INTERVAL_YEAR_MONTH: - case TIME_WITHOUT_TIME_ZONE: - return rowData.getInt(pos); - case BIGINT: - case INTERVAL_DAY_TIME: - return rowData.getLong(pos); - case FLOAT: - return rowData.getFloat(pos); - case DOUBLE: - return rowData.getDouble(pos); - case TIMESTAMP_WITHOUT_TIME_ZONE: - case TIMESTAMP_WITH_LOCAL_TIME_ZONE: - final int timestampPrecision = getPrecision(logicalType); - return rowData.getTimestamp(pos, timestampPrecision).getMillisecond(); - default: - throw new IllegalArgumentException( - String.format("Not supported datatype: %s, field: %s", dataType, fieldName)); - } - } - - private static int compareLiteral(DataType type, Object v1, Object v2) { - if (v1 instanceof Comparable) { - return ((Comparable) v1).compareTo(v2); - } else { - throw new RuntimeException(String.format("Unsupported type: %s, val: %s", type, v1)); - } - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(RowDataPredicate.class) - .add("\n\topt", opt) - .add("\n\tfieldName", fieldName) - .add("\n\tfieldIndex", fieldIndex) - .add("\n\tdataType", dataType) - .add("\n\tparameters", parameters) - .add( - "\n\tleftPredicates", - leftPredicates == null - ? "[]" - : Iterables.toString( - Arrays.stream(leftPredicates) - .map(predicate -> predicate.toString().replaceAll("\n", "\n\t")) - .collect(Collectors.toList()))) - .add( - "\n\trightPredicates", - rightPredicates == null - ? 
"[]" - : Iterables.toString( - Arrays.stream(rightPredicates) - .map(predicate -> predicate.toString().replaceAll("\n", "\n\t")) - .collect(Collectors.toList()))) - .toString(); - } - - public enum Opt { - AND, - OR, - EQUALS, - GREATER_THAN, - GREATER_THAN_OR_EQUAL, - LESS_THAN, - LESS_THAN_OR_EQUAL, - NOT_EQUALS, - IS_NULL, - IS_NOT_NULL, - TO_TIMESTAMP, - MINUS, - PLUS, - DIVIDE, - TIMES - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/filter/RowDataPredicateExpressionVisitor.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/filter/RowDataPredicateExpressionVisitor.java deleted file mode 100644 index 66d3f7e4d1..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/lookup/filter/RowDataPredicateExpressionVisitor.java +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.lookup.filter; - -import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.AND; -import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.DIVIDE; -import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.EQUALS; -import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.GREATER_THAN; -import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.GREATER_THAN_OR_EQUAL; -import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.IS_NOT_NULL; -import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.IS_NULL; -import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.LESS_THAN; -import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.LESS_THAN_OR_EQUAL; -import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.MINUS; -import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.NOT_EQUALS; -import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.OR; -import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.PLUS; -import static org.apache.amoro.flink.lookup.filter.RowDataPredicate.Opt.TIMES; - -import org.apache.flink.table.expressions.CallExpression; -import org.apache.flink.table.expressions.Expression; -import org.apache.flink.table.expressions.ExpressionDefaultVisitor; -import org.apache.flink.table.expressions.FieldReferenceExpression; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.flink.table.expressions.TypeLiteralExpression; -import org.apache.flink.table.expressions.ValueLiteralExpression; -import org.apache.flink.table.functions.BuiltInFunctionDefinitions; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.util.Preconditions; - -import java.io.Serializable; -import java.math.BigDecimal; -import java.sql.Date; -import 
java.sql.Timestamp; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.util.List; -import java.util.Map; -import java.util.Optional; - -/** - * This class implements the visitor pattern for traversing expressions and building a {@link - * RowDataPredicate} out of them. - * - *

It supports a limited set of built-in functions, such as EQUALS, LESS_THAN, GREATER_THAN, - * NOT_EQUALS, etc. - */ -public class RowDataPredicateExpressionVisitor - extends ExpressionDefaultVisitor> { - - /** - * A map from field names to their respective indices in the input row. - * - *

Start from 0. - */ - private final Map fieldIndexMap; - /** A map from field names to their respective data types */ - private final Map fieldDataTypeMap; - - public RowDataPredicateExpressionVisitor( - Map fieldIndexMap, Map fieldDataTypeMap) { - this.fieldIndexMap = fieldIndexMap; - this.fieldDataTypeMap = fieldDataTypeMap; - } - - /** - * Visits a {@link CallExpression} and renders it as a {@link RowDataPredicate}. - * - * @param call the call expression to visit - * @return an optional {@link RowDataPredicate} - */ - @Override - public Optional visit(CallExpression call) { - if (BuiltInFunctionDefinitions.EQUALS.equals(call.getFunctionDefinition())) { - return renderBinaryOperator(EQUALS, call.getResolvedChildren()); - } - if (BuiltInFunctionDefinitions.LESS_THAN.equals(call.getFunctionDefinition())) { - return renderBinaryOperator(LESS_THAN, call.getResolvedChildren()); - } - if (BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL.equals(call.getFunctionDefinition())) { - return renderBinaryOperator(LESS_THAN_OR_EQUAL, call.getResolvedChildren()); - } - if (BuiltInFunctionDefinitions.GREATER_THAN.equals(call.getFunctionDefinition())) { - return renderBinaryOperator(GREATER_THAN, call.getResolvedChildren()); - } - if (BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL.equals(call.getFunctionDefinition())) { - return renderBinaryOperator(GREATER_THAN_OR_EQUAL, call.getResolvedChildren()); - } - if (BuiltInFunctionDefinitions.NOT_EQUALS.equals(call.getFunctionDefinition())) { - return renderBinaryOperator(NOT_EQUALS, call.getResolvedChildren()); - } - if (BuiltInFunctionDefinitions.OR.equals(call.getFunctionDefinition())) { - return renderBinaryOperator(OR, call.getResolvedChildren()); - } - if (BuiltInFunctionDefinitions.AND.equals(call.getFunctionDefinition())) { - return renderBinaryOperator(AND, call.getResolvedChildren()); - } - if (BuiltInFunctionDefinitions.IS_NULL.equals(call.getFunctionDefinition())) { - return renderUnaryOperator(IS_NULL, 
call.getResolvedChildren().get(0)); - } - if (BuiltInFunctionDefinitions.IS_NOT_NULL.equals(call.getFunctionDefinition())) { - return renderUnaryOperator(IS_NOT_NULL, call.getResolvedChildren().get(0)); - } - if (BuiltInFunctionDefinitions.PLUS.equals(call.getFunctionDefinition())) { - return arithmeticOperator(PLUS, call); - } - if (BuiltInFunctionDefinitions.MINUS.equals(call.getFunctionDefinition())) { - return arithmeticOperator(MINUS, call); - } - if (BuiltInFunctionDefinitions.TIMES.equals(call.getFunctionDefinition())) { - return arithmeticOperator(TIMES, call); - } - if (BuiltInFunctionDefinitions.DIVIDE.equals(call.getFunctionDefinition())) { - return arithmeticOperator(DIVIDE, call); - } - if (BuiltInFunctionDefinitions.CAST.equals(call.getFunctionDefinition())) { - return castOperator(call); - } - throw new IllegalArgumentException( - String.format( - "Not supported build-in function: %s, CallExpression: %s, for RowDataPredicateExpressionVisitor", - call.getFunctionDefinition(), call)); - } - - @Override - public Optional visit(ValueLiteralExpression valueLiteralExpression) { - LogicalType tpe = valueLiteralExpression.getOutputDataType().getLogicalType(); - Serializable[] params = new Serializable[1]; - switch (tpe.getTypeRoot()) { - case CHAR: - case VARCHAR: - params[0] = valueLiteralExpression.getValueAs(String.class).orElse(null); - return Optional.of(new RowDataPredicate(params)); - case BOOLEAN: - params[0] = valueLiteralExpression.getValueAs(Boolean.class).orElse(null); - return Optional.of(new RowDataPredicate(params)); - case DECIMAL: - params[0] = valueLiteralExpression.getValueAs(BigDecimal.class).orElse(null); - return Optional.of(new RowDataPredicate(params)); - case TINYINT: - params[0] = valueLiteralExpression.getValueAs(Byte.class).orElse(null); - return Optional.of(new RowDataPredicate(params)); - case SMALLINT: - params[0] = valueLiteralExpression.getValueAs(Short.class).orElse(null); - return Optional.of(new RowDataPredicate(params)); 
- case INTEGER: - params[0] = valueLiteralExpression.getValueAs(Integer.class).orElse(null); - return Optional.of(new RowDataPredicate(params)); - case BIGINT: - params[0] = valueLiteralExpression.getValueAs(Long.class).orElse(null); - return Optional.of(new RowDataPredicate(params)); - case FLOAT: - params[0] = valueLiteralExpression.getValueAs(Float.class).orElse(null); - return Optional.of(new RowDataPredicate(params)); - case DOUBLE: - params[0] = valueLiteralExpression.getValueAs(Double.class).orElse(null); - return Optional.of(new RowDataPredicate(params)); - case DATE: - params[0] = - valueLiteralExpression.getValueAs(LocalDate.class).map(Date::valueOf).orElse(null); - return Optional.of(new RowDataPredicate(params)); - case TIME_WITHOUT_TIME_ZONE: - params[0] = valueLiteralExpression.getValueAs(java.sql.Time.class).orElse(null); - return Optional.of(new RowDataPredicate(params)); - case TIMESTAMP_WITHOUT_TIME_ZONE: - params[0] = - valueLiteralExpression - .getValueAs(LocalDateTime.class) - .map(Timestamp::valueOf) - .orElse(null); - return Optional.of(new RowDataPredicate(params)); - default: - return Optional.empty(); - } - } - - @Override - public Optional visit(FieldReferenceExpression fieldReferenceExpression) { - String fieldName = fieldReferenceExpression.getName(); - int fieldIndex = fieldIndexMap.get(fieldName); - DataType dataType = fieldDataTypeMap.get(fieldName); - return Optional.of(new RowDataPredicate(fieldName, fieldIndex, dataType)); - } - - @Override - protected Optional defaultMethod(Expression expression) { - return Optional.empty(); - } - - protected Optional arithmeticOperator( - RowDataPredicate.Opt arithmeticOpt, CallExpression call) { - List resolvedChildren = call.getResolvedChildren(); - Optional leftPredicate = resolvedChildren.get(0).accept(this); - Optional rightPredicate = resolvedChildren.get(1).accept(this); - Serializable left = leftPredicate.get().parameters()[0]; - Serializable right = rightPredicate.get().parameters()[0]; 
- if (left instanceof Number && right instanceof Number) { - Serializable result; - switch (arithmeticOpt) { - case MINUS: - result = ((Number) left).longValue() - ((Number) right).longValue(); - break; - case TIMES: - result = ((Number) left).longValue() * ((Number) right).longValue(); - break; - case PLUS: - result = ((Number) left).longValue() + ((Number) right).longValue(); - break; - case DIVIDE: - result = ((Number) left).longValue() / ((Number) right).longValue(); - break; - default: - throw new IllegalArgumentException( - String.format( - "Not supported arithmetic opt: %s, call expression: %s", arithmeticOpt, call)); - } - return Optional.of(new RowDataPredicate(new Serializable[] {result})); - } - throw new IllegalArgumentException( - String.format( - "arithmetic operator: %s only supported numerical parameters, call expression: %s", - arithmeticOpt, call)); - } - - protected Optional castOperator(CallExpression call) { - List resolvedChildren = call.getResolvedChildren(); - Optional leftPredicate = resolvedChildren.get(0).accept(this); - if (resolvedChildren.size() != 2) { - throw new IllegalArgumentException( - String.format( - "cast operator's children expressions should be 2. call expression: %s", call)); - } - if (resolvedChildren.get(1) instanceof TypeLiteralExpression) { - Class type = resolvedChildren.get(1).getOutputDataType().getConversionClass(); - Serializable se = (Serializable) type.cast(leftPredicate.get().parameters()[0]); - return Optional.of(new RowDataPredicate(new Serializable[] {se})); - } - throw new IllegalArgumentException( - String.format( - "cast operator's children expressions should be 2. 
call expression: %s", call)); - } - - protected Optional renderUnaryOperator( - RowDataPredicate.Opt opt, ResolvedExpression resolvedExpression) { - if (resolvedExpression instanceof FieldReferenceExpression) { - Optional leftPredicate = resolvedExpression.accept(this); - return leftPredicate.map(rowDataPredicate -> rowDataPredicate.combine(opt, null)); - } - return Optional.empty(); - } - - protected Optional renderBinaryOperator( - RowDataPredicate.Opt opt, List resolvedExpressions) { - Optional leftPredicate = resolvedExpressions.get(0).accept(this); - - Optional rightPredicate = resolvedExpressions.get(1).accept(this); - - if (AND.equals(opt) || OR.equals(opt)) { - Preconditions.checkArgument(leftPredicate.isPresent()); - Preconditions.checkArgument(rightPredicate.isPresent()); - return Optional.of( - new RowDataPredicate( - opt, - new RowDataPredicate[] {leftPredicate.get()}, - new RowDataPredicate[] {rightPredicate.get()})); - } - - return leftPredicate.flatMap(left -> rightPredicate.map(right -> left.combine(opt, right))); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/metric/MetricConstant.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/metric/MetricConstant.java deleted file mode 100644 index e86ac102f9..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/metric/MetricConstant.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.metric; - -/** metric constant */ -public class MetricConstant { - - /** - * The start time of mixed-format table's initialization when it used as build table in temporal - * join. - */ - public static final String TEMPORAL_TABLE_INITIALIZATION_START_TIMESTAMP = - "temporalTableInitializationStartTimestamp"; - /** - * The end time of mixed-format table's initialization when it used as build table in temporal - * join. - */ - public static final String TEMPORAL_TABLE_INITIALIZATION_END_TIMESTAMP = - "temporalTableInitializationEndTimestamp"; -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/metric/MetricsGenerator.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/metric/MetricsGenerator.java deleted file mode 100644 index d539f1664a..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/metric/MetricsGenerator.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.metric; - -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.Schema; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.Serializable; -import java.time.LocalDateTime; -import java.time.ZoneId; - -/** - * A generator that generates the latency metrics of the writing operators in flink applications. 
- */ -public class MetricsGenerator implements Serializable { - private static final Logger LOG = LoggerFactory.getLogger(MetricsGenerator.class); - private long currentLatency; - private final boolean latencyEnable; - private final boolean metricEnable; - private final Schema schema; - private final RowType flinkSchema; - private RowData.FieldGetter modifyTimeGetter; - private boolean findColumn = false; - - private MetricsGenerator( - boolean latencyEnable, - Schema schema, - RowType flinkSchema, - String modifyTimeColumn, - boolean metricEnable) { - this.latencyEnable = latencyEnable; - this.schema = schema; - this.metricEnable = metricEnable; - this.flinkSchema = flinkSchema; - checkColumnExist(modifyTimeColumn); - } - - private void checkColumnExist(String modifyTimeColumn) { - if (!this.latencyEnable) { - return; - } - if (modifyTimeColumn == null || this.schema.findField(modifyTimeColumn) == null) { - LOG.warn("can't find event time column {}", modifyTimeColumn); - findColumn = false; - } else { - findColumn = true; - int modifyTimeColumnIndex = flinkSchema.getFieldIndex(modifyTimeColumn); - LogicalType type = flinkSchema.getTypeAt(modifyTimeColumnIndex); - LOG.info( - "event latency with column {}, index {}, type {}", - modifyTimeColumn, - modifyTimeColumnIndex, - type); - modifyTimeGetter = RowData.createFieldGetter(type, modifyTimeColumnIndex); - } - } - - public static MetricsGenerator empty(boolean metricEnable) { - return new MetricsGenerator(false, null, null, null, metricEnable); - } - - public static MetricsGenerator newGenerator( - Schema schema, RowType flinkSchema, String modifyTimeColumn, boolean metricEnable) { - return new MetricsGenerator(true, schema, flinkSchema, modifyTimeColumn, metricEnable); - } - - public boolean enable() { - return latencyEnable; - } - - public boolean isMetricEnable() { - return metricEnable; - } - - public void recordLatency(StreamRecord element) { - if (latencyEnable) { - if (findColumn) { - RowData rowData = 
element.getValue(); - if (rowData.getRowKind() == RowKind.UPDATE_BEFORE - || rowData.getRowKind() == RowKind.DELETE) { - return; - } - - Object value = modifyTimeGetter.getFieldOrNull(rowData); - if (value == null) { - return; - } - if (value instanceof LocalDateTime) { - LocalDateTime localDateTime = (LocalDateTime) value; - long eventTime = localDateTime.atZone(ZoneId.systemDefault()).toInstant().toEpochMilli(); - this.currentLatency = System.currentTimeMillis() - eventTime; - } else if (value instanceof Long) { - this.currentLatency = System.currentTimeMillis() - (Long) value; - } else { - LOG.warn("eventTimeColumn is not LocalDateTime/Long, {}", value.getClass()); - } - } else if (element.hasTimestamp()) { - this.currentLatency = System.currentTimeMillis() - element.getTimestamp(); - } - } - } - - public long getCurrentLatency() { - return currentLatency; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/planner/calcite/FlinkTypeSystem.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/planner/calcite/FlinkTypeSystem.java deleted file mode 100644 index deb6639d12..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/planner/calcite/FlinkTypeSystem.java +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.planner.calcite; - -import static org.apache.calcite.sql.type.SqlTypeName.DECIMAL; -import static org.apache.flink.table.planner.utils.ShortcutUtils.unwrapTypeFactory; - -import org.apache.calcite.rel.type.RelDataType; -import org.apache.calcite.rel.type.RelDataTypeFactory; -import org.apache.calcite.rel.type.RelDataTypeFactoryImpl; -import org.apache.calcite.rel.type.RelDataTypeSystemImpl; -import org.apache.calcite.sql.type.SqlTypeName; -import org.apache.calcite.sql.type.SqlTypeUtil; -import org.apache.flink.annotation.Internal; -import org.apache.flink.table.planner.calcite.FlinkTypeFactory; -import org.apache.flink.table.types.logical.DecimalType; -import org.apache.flink.table.types.logical.LocalZonedTimestampType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.TimestampType; -import org.apache.flink.table.types.logical.utils.LogicalTypeMerging; -import org.apache.flink.util.function.QuadFunction; - -import javax.annotation.Nullable; - -/** - * Custom type system for Flink. - * - *

Copied from flink-1.18. - */ -@Internal -public class FlinkTypeSystem extends RelDataTypeSystemImpl { - - public static final FlinkTypeSystem INSTANCE = new FlinkTypeSystem(); - public static final DecimalType DECIMAL_SYSTEM_DEFAULT = - new DecimalType(DecimalType.MAX_PRECISION, 18); - - private FlinkTypeSystem() {} - - @Override - public int getMaxNumericPrecision() { - // set the maximum precision of a NUMERIC or DECIMAL type to DecimalType.MAX_PRECISION. - return DecimalType.MAX_PRECISION; - } - - @Override - public int getMaxNumericScale() { - // the max scale can't be greater than precision - return DecimalType.MAX_PRECISION; - } - - @Override - public int getDefaultPrecision(SqlTypeName typeName) { - switch (typeName) { - case VARCHAR: - case VARBINARY: - // Calcite will limit the length of the VARCHAR field to 65536 - return Integer.MAX_VALUE; - case TIMESTAMP: - // by default we support timestamp with microseconds precision (Timestamp(6)) - return TimestampType.DEFAULT_PRECISION; - case TIMESTAMP_WITH_LOCAL_TIME_ZONE: - // by default we support timestamp with local time zone with microseconds precision - // Timestamp(6) with local time zone - return LocalZonedTimestampType.DEFAULT_PRECISION; - } - return super.getDefaultPrecision(typeName); - } - - @Override - public int getMaxPrecision(SqlTypeName typeName) { - switch (typeName) { - case VARCHAR: - case CHAR: - case VARBINARY: - case BINARY: - return Integer.MAX_VALUE; - - case TIMESTAMP: - // The maximum precision of TIMESTAMP is 3 in Calcite, - // change it to 9 to support nanoseconds precision - return TimestampType.MAX_PRECISION; - - case TIMESTAMP_WITH_LOCAL_TIME_ZONE: - // The maximum precision of TIMESTAMP_WITH_LOCAL_TIME_ZONE is 3 in Calcite, - // change it to 9 to support nanoseconds precision - return LocalZonedTimestampType.MAX_PRECISION; - } - return super.getMaxPrecision(typeName); - } - - @Override - public boolean shouldConvertRaggedUnionTypesToVarying() { - // when union a number of CHAR 
types of different lengths, we should cast to a VARCHAR - // this fixes the problem of CASE WHEN with different length string literals but get wrong - // result with additional space suffix - return true; - } - - @Override - public RelDataType deriveAvgAggType(RelDataTypeFactory typeFactory, RelDataType argRelDataType) { - LogicalType argType = FlinkTypeFactory.toLogicalType(argRelDataType); - LogicalType resultType = LogicalTypeMerging.findAvgAggType(argType); - return unwrapTypeFactory(typeFactory).createFieldTypeFromLogicalType(resultType); - } - - @Override - public RelDataType deriveSumType(RelDataTypeFactory typeFactory, RelDataType argRelDataType) { - LogicalType argType = FlinkTypeFactory.toLogicalType(argRelDataType); - LogicalType resultType = LogicalTypeMerging.findSumAggType(argType); - return unwrapTypeFactory(typeFactory).createFieldTypeFromLogicalType(resultType); - } - - @Override - public RelDataType deriveDecimalPlusType( - RelDataTypeFactory typeFactory, RelDataType type1, RelDataType type2) { - return deriveDecimalType( - typeFactory, type1, type2, LogicalTypeMerging::findAdditionDecimalType); - } - - @Override - public RelDataType deriveDecimalModType( - RelDataTypeFactory typeFactory, RelDataType type1, RelDataType type2) { - return deriveDecimalRelDataType( - typeFactory, - type1, - type2, - (p1, s1, p2, s2) -> { - if (s1 == 0 && s2 == 0) { - return type2; - } - DecimalType result = LogicalTypeMerging.findModuloDecimalType(p1, s1, p2, s2); - return typeFactory.createSqlType(DECIMAL, result.getPrecision(), result.getScale()); - }); - } - - @Override - public RelDataType deriveDecimalDivideType( - RelDataTypeFactory typeFactory, RelDataType type1, RelDataType type2) { - return deriveDecimalType( - typeFactory, type1, type2, LogicalTypeMerging::findDivisionDecimalType); - } - - @Override - public RelDataType deriveDecimalMultiplyType( - RelDataTypeFactory typeFactory, RelDataType type1, RelDataType type2) { - return deriveDecimalType( - 
typeFactory, type1, type2, LogicalTypeMerging::findMultiplicationDecimalType); - } - - /** Use derivation from {@link LogicalTypeMerging} to derive decimal type. */ - private @Nullable RelDataType deriveDecimalType( - RelDataTypeFactory typeFactory, - RelDataType type1, - RelDataType type2, - QuadFunction deriveImpl) { - return deriveDecimalRelDataType( - typeFactory, - type1, - type2, - (p1, s1, p2, s2) -> { - DecimalType result = deriveImpl.apply(p1, s1, p2, s2); - return typeFactory.createSqlType(DECIMAL, result.getPrecision(), result.getScale()); - }); - } - - private @Nullable RelDataType deriveDecimalRelDataType( - RelDataTypeFactory typeFactory, - RelDataType type1, - RelDataType type2, - QuadFunction deriveImpl) { - if (canDeriveDecimal(type1, type2)) { - RelDataType decType1 = adjustType(typeFactory, type1); - RelDataType decType2 = adjustType(typeFactory, type2); - return deriveImpl.apply( - decType1.getPrecision(), - decType1.getScale(), - decType2.getPrecision(), - decType2.getScale()); - } else { - return null; - } - } - - /** - * Java numeric will always have invalid precision/scale, use its default decimal precision/scale - * instead. - */ - private RelDataType adjustType(RelDataTypeFactory typeFactory, RelDataType relDataType) { - return RelDataTypeFactoryImpl.isJavaType(relDataType) - ? 
typeFactory.decimalOf(relDataType) - : relDataType; - } - - private boolean canDeriveDecimal(RelDataType type1, RelDataType type2) { - return SqlTypeUtil.isExactNumeric(type1) - && SqlTypeUtil.isExactNumeric(type2) - && (SqlTypeUtil.isDecimal(type1) || SqlTypeUtil.isDecimal(type2)); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/AdaptHiveFlinkParquetReaders.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/AdaptHiveFlinkParquetReaders.java deleted file mode 100644 index 155bda30ad..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/AdaptHiveFlinkParquetReaders.java +++ /dev/null @@ -1,873 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read; - -import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableList; -import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableMap; -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RawValueData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.Schema; -import org.apache.iceberg.parquet.ParquetValueReader; -import org.apache.iceberg.parquet.ParquetValueReaders; -import org.apache.iceberg.parquet.TypeWithSchemaVisitor; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.io.api.Binary; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; - -import java.math.BigDecimal; -import java.math.BigInteger; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.time.Instant; -import java.time.ZoneId; -import java.time.ZoneOffset; -import java.util.List; -import java.util.Map; -import java.util.concurrent.TimeUnit; - -public class AdaptHiveFlinkParquetReaders { - private AdaptHiveFlinkParquetReaders() {} - - public static ParquetValueReader buildReader( - Schema expectedSchema, MessageType fileSchema) { - return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); - } - - 
@SuppressWarnings("unchecked") - public static ParquetValueReader buildReader( - Schema expectedSchema, MessageType fileSchema, Map idToConstant) { - return (ParquetValueReader) - TypeWithSchemaVisitor.visit( - expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); - } - - private static class ReadBuilder extends TypeWithSchemaVisitor> { - private final MessageType type; - private final Map idToConstant; - - ReadBuilder(MessageType type, Map idToConstant) { - this.type = type; - this.idToConstant = idToConstant; - } - - @Override - public ParquetValueReader message( - Types.StructType expected, MessageType message, List> fieldReaders) { - return struct(expected, message.asGroupType(), fieldReaders); - } - - @Override - public ParquetValueReader struct( - Types.StructType expected, GroupType struct, List> fieldReaders) { - // match the expected struct's order - Map> readersById = Maps.newHashMap(); - Map typesById = Maps.newHashMap(); - List fields = struct.getFields(); - for (int i = 0; i < fields.size(); i += 1) { - Type fieldType = fields.get(i); - if (fieldReaders.get(i) != null) { - int fieldD = type.getMaxDefinitionLevel(path(fieldType.getName())) - 1; - if (fieldType.getId() != null) { - int id = fieldType.getId().intValue(); - readersById.put(id, ParquetValueReaders.option(fieldType, fieldD, fieldReaders.get(i))); - typesById.put(id, fieldType); - } - } - } - - List expectedFields = - expected != null ? 
expected.fields() : ImmutableList.of(); - List> reorderedFields = - Lists.newArrayListWithExpectedSize(expectedFields.size()); - List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); - for (Types.NestedField field : expectedFields) { - int id = field.fieldId(); - if (idToConstant.containsKey(id)) { - // containsKey is used because the constant may be null - reorderedFields.add(ParquetValueReaders.constant(idToConstant.get(id))); - types.add(null); - } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { - reorderedFields.add(ParquetValueReaders.position()); - types.add(null); - } else if (id == MetadataColumns.IS_DELETED.fieldId()) { - reorderedFields.add(ParquetValueReaders.constant(false)); - types.add(null); - } else { - ParquetValueReader reader = readersById.get(id); - if (reader != null) { - reorderedFields.add(reader); - types.add(typesById.get(id)); - } else { - reorderedFields.add(ParquetValueReaders.nulls()); - types.add(null); - } - } - } - - return new RowDataReader(types, reorderedFields); - } - - @Override - public ParquetValueReader list( - Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { - if (expectedList == null) { - return null; - } - - GroupType repeated = array.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; - int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; - - Type elementType = repeated.getType(0); - int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - - return new ArrayReader<>( - repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); - } - - @Override - public ParquetValueReader map( - Types.MapType expectedMap, - GroupType map, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { - if (expectedMap == null) { - return null; - } - - GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); - String[] 
repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; - int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; - - Type keyType = repeatedKeyValue.getType(0); - int keyD = type.getMaxDefinitionLevel(path(keyType.getName())) - 1; - Type valueType = repeatedKeyValue.getType(1); - int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - - return new MapReader<>( - repeatedD, - repeatedR, - ParquetValueReaders.option(keyType, keyD, keyReader), - ParquetValueReaders.option(valueType, valueD, valueReader)); - } - - @Override - @SuppressWarnings("CyclomaticComplexity") - public ParquetValueReader primitive( - org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { - if (expected == null) { - return null; - } - - ColumnDescriptor desc = type.getColumnDescription(currentPath()); - - if (primitive.getOriginalType() != null) { - switch (primitive.getOriginalType()) { - case ENUM: - case JSON: - case UTF8: - return new StringReader(desc); - case INT_8: - case INT_16: - case INT_32: - if (expected.typeId() == Types.LongType.get().typeId()) { - return new ParquetValueReaders.IntAsLongReader(desc); - } else { - return new ParquetValueReaders.UnboxedReader<>(desc); - } - case TIME_MICROS: - return new LossyMicrosToMillisTimeReader(desc); - case TIME_MILLIS: - return new MillisTimeReader(desc); - case DATE: - case INT_64: - return new ParquetValueReaders.UnboxedReader<>(desc); - case TIMESTAMP_MICROS: - if (((Types.TimestampType) expected).shouldAdjustToUTC()) { - return new MicrosToTimestampTzReader(desc); - } else { - return new MicrosToTimestampReader(desc); - } - case TIMESTAMP_MILLIS: - if (((Types.TimestampType) expected).shouldAdjustToUTC()) { - return new MillisToTimestampTzReader(desc); - } else { - return new MillisToTimestampReader(desc); - } - case DECIMAL: - DecimalLogicalTypeAnnotation decimal = - (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); - switch 
(primitive.getPrimitiveTypeName()) { - case BINARY: - case FIXED_LEN_BYTE_ARRAY: - return new BinaryDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); - case INT64: - return new LongDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); - case INT32: - return new IntegerDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); - default: - throw new UnsupportedOperationException( - "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); - } - case BSON: - return new ParquetValueReaders.ByteArrayReader(desc); - default: - throw new UnsupportedOperationException( - "Unsupported logical type: " + primitive.getOriginalType()); - } - } - - switch (primitive.getPrimitiveTypeName()) { - case FIXED_LEN_BYTE_ARRAY: - case BINARY: - return new ParquetValueReaders.ByteArrayReader(desc); - case INT32: - if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.LONG) { - return new ParquetValueReaders.IntAsLongReader(desc); - } else { - return new ParquetValueReaders.UnboxedReader<>(desc); - } - case FLOAT: - if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.DOUBLE) { - return new ParquetValueReaders.FloatAsDoubleReader(desc); - } else { - return new ParquetValueReaders.UnboxedReader<>(desc); - } - case BOOLEAN: - case INT64: - case DOUBLE: - return new ParquetValueReaders.UnboxedReader<>(desc); - case INT96: - Types.TimestampType tsMicrosType = (Types.TimestampType) expected; - if (tsMicrosType.shouldAdjustToUTC()) { - return new TimestampIntWithTZ96Reader(desc); - } else { - return new TimestampIntWithOutTZ96Reader(desc); - } - default: - throw new UnsupportedOperationException("Unsupported type: " + primitive); - } - } - } - - private static class TimestampIntWithOutTZ96Reader - extends ParquetValueReaders.PrimitiveReader { - private static final long UNIX_EPOCH_JULIAN = 2_440_588L; - - TimestampIntWithOutTZ96Reader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData 
read(TimestampData reuse) { - final ByteBuffer byteBuffer = - column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); - final long timeOfDayNanos = byteBuffer.getLong(); - final int julianDay = byteBuffer.getInt(); - - return TimestampData.fromLocalDateTime( - Instant.ofEpochMilli(TimeUnit.DAYS.toMillis(julianDay - UNIX_EPOCH_JULIAN)) - .plusNanos(timeOfDayNanos) - .atZone(ZoneId.systemDefault()) - .toLocalDateTime()); - } - } - - private static class TimestampIntWithTZ96Reader - extends ParquetValueReaders.PrimitiveReader { - private static final long UNIX_EPOCH_JULIAN = 2_440_588L; - - private TimestampIntWithTZ96Reader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData reuse) { - final ByteBuffer byteBuffer = - column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); - final long timeOfDayNanos = byteBuffer.getLong(); - final int julianDay = byteBuffer.getInt(); - - return TimestampData.fromInstant( - Instant.ofEpochMilli(TimeUnit.DAYS.toMillis(julianDay - UNIX_EPOCH_JULIAN)) - .plusNanos(timeOfDayNanos)); - } - } - - private static class BinaryDecimalReader - extends ParquetValueReaders.PrimitiveReader { - private final int precision; - private final int scale; - - BinaryDecimalReader(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(DecimalData ignored) { - Binary binary = column.nextBinary(); - BigDecimal bigDecimal = new BigDecimal(new BigInteger(binary.getBytes()), scale); - // TODO: need a unit test to write-read-validate decimal via FlinkParquetWrite/Reader - return DecimalData.fromBigDecimal(bigDecimal, precision, scale); - } - } - - private static class IntegerDecimalReader - extends ParquetValueReaders.PrimitiveReader { - private final int precision; - private final int scale; - - IntegerDecimalReader(ColumnDescriptor desc, int precision, int scale) { - super(desc); - 
this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(DecimalData ignored) { - return DecimalData.fromUnscaledLong(column.nextInteger(), precision, scale); - } - } - - private static class LongDecimalReader extends ParquetValueReaders.PrimitiveReader { - private final int precision; - private final int scale; - - LongDecimalReader(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(DecimalData ignored) { - return DecimalData.fromUnscaledLong(column.nextLong(), precision, scale); - } - } - - private static class MicrosToTimestampTzReader - extends ParquetValueReaders.UnboxedReader { - MicrosToTimestampTzReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long value = readLong(); - return TimestampData.fromLocalDateTime( - Instant.ofEpochSecond( - Math.floorDiv(value, 1000_000L), Math.floorMod(value, 1000_000L) * 1000) - .atOffset(ZoneOffset.UTC) - .toLocalDateTime()); - } - - @Override - public long readLong() { - return column.nextLong(); - } - } - - private static class MicrosToTimestampReader - extends ParquetValueReaders.UnboxedReader { - MicrosToTimestampReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long value = readLong(); - return TimestampData.fromInstant( - Instant.ofEpochSecond( - Math.floorDiv(value, 1000_000L), Math.floorMod(value, 1000_000L) * 1000)); - } - - @Override - public long readLong() { - return column.nextLong(); - } - } - - private static class MillisToTimestampReader - extends ParquetValueReaders.UnboxedReader { - MillisToTimestampReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long millis = readLong(); - return TimestampData.fromEpochMillis(millis); - } - - @Override - public long 
readLong() { - return column.nextLong(); - } - } - - private static class MillisToTimestampTzReader - extends ParquetValueReaders.UnboxedReader { - MillisToTimestampTzReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long millis = readLong(); - return TimestampData.fromLocalDateTime( - Instant.ofEpochMilli(millis).atOffset(ZoneOffset.UTC).toLocalDateTime()); - } - - @Override - public long readLong() { - return column.nextLong(); - } - } - - private static class StringReader extends ParquetValueReaders.PrimitiveReader { - StringReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public StringData read(StringData ignored) { - Binary binary = column.nextBinary(); - ByteBuffer buffer = binary.toByteBuffer(); - if (buffer.hasArray()) { - return StringData.fromBytes( - buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining()); - } else { - return StringData.fromBytes(binary.getBytes()); - } - } - } - - private static class LossyMicrosToMillisTimeReader - extends ParquetValueReaders.PrimitiveReader { - LossyMicrosToMillisTimeReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public Integer read(Integer reuse) { - // Discard microseconds since Flink uses millisecond unit for TIME type. 
- return (int) Math.floorDiv(column.nextLong(), 1000L); - } - } - - private static class MillisTimeReader extends ParquetValueReaders.PrimitiveReader { - MillisTimeReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public Integer read(Integer reuse) { - return (int) column.nextLong(); - } - } - - private static class ArrayReader - extends ParquetValueReaders.RepeatedReader { - private int readPos = 0; - private int writePos = 0; - - ArrayReader(int definitionLevel, int repetitionLevel, ParquetValueReader reader) { - super(definitionLevel, repetitionLevel, reader); - } - - @Override - protected ReusableArrayData newListData(ArrayData reuse) { - this.readPos = 0; - this.writePos = 0; - - if (reuse instanceof ReusableArrayData) { - return (ReusableArrayData) reuse; - } else { - return new ReusableArrayData(); - } - } - - @Override - @SuppressWarnings("unchecked") - protected E getElement(ReusableArrayData list) { - E value = null; - if (readPos < list.capacity()) { - value = (E) list.values[readPos]; - } - - readPos += 1; - - return value; - } - - @Override - protected void addElement(ReusableArrayData reused, E element) { - if (writePos >= reused.capacity()) { - reused.grow(); - } - - reused.values[writePos] = element; - - writePos += 1; - } - - @Override - protected ArrayData buildList(ReusableArrayData list) { - list.setNumElements(writePos); - return list; - } - } - - private static class MapReader - extends ParquetValueReaders.RepeatedKeyValueReader { - private int readPos = 0; - private int writePos = 0; - - private final ParquetValueReaders.ReusableEntry entry = - new ParquetValueReaders.ReusableEntry<>(); - private final ParquetValueReaders.ReusableEntry nullEntry = - new ParquetValueReaders.ReusableEntry<>(); - - MapReader( - int definitionLevel, - int repetitionLevel, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { - super(definitionLevel, repetitionLevel, keyReader, valueReader); - } - - @Override - protected 
ReusableMapData newMapData(MapData reuse) { - this.readPos = 0; - this.writePos = 0; - - if (reuse instanceof ReusableMapData) { - return (ReusableMapData) reuse; - } else { - return new ReusableMapData(); - } - } - - @Override - @SuppressWarnings("unchecked") - protected Map.Entry getPair(ReusableMapData map) { - Map.Entry kv = nullEntry; - if (readPos < map.capacity()) { - entry.set((K) map.keys.values[readPos], (V) map.values.values[readPos]); - kv = entry; - } - - readPos += 1; - - return kv; - } - - @Override - protected void addPair(ReusableMapData map, K key, V value) { - if (writePos >= map.capacity()) { - map.grow(); - } - - map.keys.values[writePos] = key; - map.values.values[writePos] = value; - - writePos += 1; - } - - @Override - protected MapData buildMap(ReusableMapData map) { - map.setNumElements(writePos); - return map; - } - } - - private static class RowDataReader - extends ParquetValueReaders.StructReader { - private final int numFields; - - RowDataReader(List types, List> readers) { - super(types, readers); - this.numFields = readers.size(); - } - - @Override - protected GenericRowData newStructData(RowData reuse) { - if (reuse instanceof GenericRowData) { - return (GenericRowData) reuse; - } else { - return new GenericRowData(numFields); - } - } - - @Override - protected Object getField(GenericRowData intermediate, int pos) { - return intermediate.getField(pos); - } - - @Override - protected RowData buildStruct(GenericRowData struct) { - return struct; - } - - @Override - protected void set(GenericRowData row, int pos, Object value) { - row.setField(pos, value); - } - - @Override - protected void setNull(GenericRowData row, int pos) { - row.setField(pos, null); - } - - @Override - protected void setBoolean(GenericRowData row, int pos, boolean value) { - row.setField(pos, value); - } - - @Override - protected void setInteger(GenericRowData row, int pos, int value) { - row.setField(pos, value); - } - - @Override - protected void 
setLong(GenericRowData row, int pos, long value) { - row.setField(pos, value); - } - - @Override - protected void setFloat(GenericRowData row, int pos, float value) { - row.setField(pos, value); - } - - @Override - protected void setDouble(GenericRowData row, int pos, double value) { - row.setField(pos, value); - } - } - - private static class ReusableMapData implements MapData { - private final ReusableArrayData keys; - private final ReusableArrayData values; - - private int numElements; - - private ReusableMapData() { - this.keys = new ReusableArrayData(); - this.values = new ReusableArrayData(); - } - - private void grow() { - keys.grow(); - values.grow(); - } - - private int capacity() { - return keys.capacity(); - } - - public void setNumElements(int numElements) { - this.numElements = numElements; - keys.setNumElements(numElements); - values.setNumElements(numElements); - } - - @Override - public int size() { - return numElements; - } - - @Override - public ReusableArrayData keyArray() { - return keys; - } - - @Override - public ReusableArrayData valueArray() { - return values; - } - } - - private static class ReusableArrayData implements ArrayData { - private static final Object[] EMPTY = new Object[0]; - - private Object[] values = EMPTY; - private int numElements = 0; - - private void grow() { - if (values.length == 0) { - this.values = new Object[20]; - } else { - Object[] old = values; - this.values = new Object[old.length << 1]; - // copy the old array in case it has values that can be reused - System.arraycopy(old, 0, values, 0, old.length); - } - } - - private int capacity() { - return values.length; - } - - public void setNumElements(int numElements) { - this.numElements = numElements; - } - - @Override - public int size() { - return numElements; - } - - @Override - public boolean isNullAt(int ordinal) { - return null == values[ordinal]; - } - - @Override - public boolean getBoolean(int ordinal) { - return (boolean) values[ordinal]; - } - - @Override 
- public byte getByte(int ordinal) { - return (byte) values[ordinal]; - } - - @Override - public short getShort(int ordinal) { - return (short) values[ordinal]; - } - - @Override - public int getInt(int ordinal) { - return (int) values[ordinal]; - } - - @Override - public long getLong(int ordinal) { - return (long) values[ordinal]; - } - - @Override - public float getFloat(int ordinal) { - return (float) values[ordinal]; - } - - @Override - public double getDouble(int ordinal) { - return (double) values[ordinal]; - } - - @Override - public StringData getString(int pos) { - return (StringData) values[pos]; - } - - @Override - public DecimalData getDecimal(int pos, int precision, int scale) { - return (DecimalData) values[pos]; - } - - @Override - public TimestampData getTimestamp(int pos, int precision) { - return (TimestampData) values[pos]; - } - - @SuppressWarnings("unchecked") - @Override - public RawValueData getRawValue(int pos) { - return (RawValueData) values[pos]; - } - - @Override - public byte[] getBinary(int ordinal) { - return (byte[]) values[ordinal]; - } - - @Override - public ArrayData getArray(int ordinal) { - return (ArrayData) values[ordinal]; - } - - @Override - public MapData getMap(int ordinal) { - return (MapData) values[ordinal]; - } - - @Override - public RowData getRow(int pos, int numFields) { - return (RowData) values[pos]; - } - - @Override - public boolean[] toBooleanArray() { - return ArrayUtil.toPrimitive((Boolean[]) values); - } - - @Override - public byte[] toByteArray() { - return ArrayUtil.toPrimitive((Byte[]) values); - } - - @Override - public short[] toShortArray() { - return ArrayUtil.toPrimitive((Short[]) values); - } - - @Override - public int[] toIntArray() { - return ArrayUtil.toPrimitive((Integer[]) values); - } - - @Override - public long[] toLongArray() { - return ArrayUtil.toPrimitive((Long[]) values); - } - - @Override - public float[] toFloatArray() { - return ArrayUtil.toPrimitive((Float[]) values); - } - - 
@Override - public double[] toDoubleArray() { - return ArrayUtil.toPrimitive((Double[]) values); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/FlinkSplitPlanner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/FlinkSplitPlanner.java deleted file mode 100644 index f591ebe527..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/FlinkSplitPlanner.java +++ /dev/null @@ -1,288 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read; - -import org.apache.amoro.data.DataFileType; -import org.apache.amoro.flink.read.hybrid.split.ChangelogSplit; -import org.apache.amoro.flink.read.hybrid.split.MergeOnReadSplit; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.hybrid.split.SnapshotSplit; -import org.apache.amoro.scan.ChangeTableIncrementalScan; -import org.apache.amoro.scan.CombinedScanTask; -import org.apache.amoro.scan.KeyedTableScan; -import org.apache.amoro.scan.MixedFileScanTask; -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.amoro.table.KeyedTable; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.CloseableIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.Collectors; - -/** - * An util class that plans mixed-format table(base and change) or just plans change table. invoked - * by mixed-format enumerator. 
- */ -public class FlinkSplitPlanner { - private static final Logger LOG = LoggerFactory.getLogger(FlinkSplitPlanner.class); - - private FlinkSplitPlanner() {} - - public static List planFullTable( - KeyedTable keyedTable, AtomicInteger splitCount) { - CloseableIterable combinedScanTasks = keyedTable.newScan().planTasks(); - BaseAndChangeTask baseAndChangeTask = BaseAndChangeTask.of(combinedScanTasks); - return planFullTable(baseAndChangeTask, splitCount); - } - - /** - * Plans full table scanning for a {@link KeyedTable} with optional filters and a specified split - * count. - * - * @param keyedTable The {@link KeyedTable} to scan. - * @param filters Optional list of filters to apply to the scan. - * @param splitCount The atomic integer to track the split count. - * @return The list of planned {@link MixedFormatSplit} included {@link SnapshotSplit}, {@link - * ChangelogSplit}. - */ - public static List planFullTable( - KeyedTable keyedTable, List filters, AtomicInteger splitCount) { - KeyedTableScan keyedTableScan = keyedTable.newScan(); - if (filters != null) { - filters.forEach(keyedTableScan::filter); - } - CloseableIterable combinedScanTasks = keyedTableScan.planTasks(); - BaseAndChangeTask baseAndChangeTask = BaseAndChangeTask.of(combinedScanTasks); - return planFullTable(baseAndChangeTask, splitCount); - } - - private static List planFullTable( - BaseAndChangeTask baseAndChangeTask, AtomicInteger splitCount) { - Collection baseTasks = baseAndChangeTask.allBaseTasks(); - List allSplits = - baseTasks.stream() - .map( - mixedFileScanTask -> - new SnapshotSplit( - Collections.singleton(mixedFileScanTask), splitCount.incrementAndGet())) - .collect(Collectors.toList()); - - Collection changeTasks = baseAndChangeTask.transactionTasks(); - List changeSplits = planChangeTable(changeTasks, splitCount); - allSplits.addAll(changeSplits); - - return allSplits; - } - - /** - * Plans full table scanning for a {@link KeyedTable} with optional filters and a specified split - 
* count. - * - * @param keyedTable The {@link KeyedTable} to scan. - * @param filters Optional list of filters to apply to the scan. - * @param splitCount The atomic integer to track the split count. - * @return The list of planned {@link MixedFormatSplit} included {@link MergeOnReadSplit}. - */ - public static List mergeOnReadPlan( - KeyedTable keyedTable, List filters, AtomicInteger splitCount) { - KeyedTableScan keyedTableScan = keyedTable.newScan(); - if (filters != null) { - filters.forEach(keyedTableScan::filter); - } - CloseableIterable combinedScanTasks = keyedTableScan.planTasks(); - List morSplits = Lists.newArrayList(); - try (CloseableIterator initTasks = combinedScanTasks.iterator()) { - - while (initTasks.hasNext()) { - CombinedScanTask combinedScanTask = initTasks.next(); - combinedScanTask - .tasks() - .forEach( - keyedTableScanTask -> - morSplits.add(new MergeOnReadSplit(splitCount.get(), keyedTableScanTask))); - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - return morSplits; - } - - public static List planChangeTable( - ChangeTableIncrementalScan tableIncrementalScan, AtomicInteger splitCount) { - CloseableIterable tasks = tableIncrementalScan.planFiles(); - BaseAndChangeTask baseAndChangeTask = BaseAndChangeTask.ofIceberg(tasks); - return planChangeTable(baseAndChangeTask.transactionTasks(), splitCount); - } - - private static List planChangeTable( - Collection transactionTasks, AtomicInteger splitCount) { - List changeTasks = new ArrayList<>(transactionTasks.size()); - transactionTasks.forEach( - transactionTask -> { - PartitionAndNodeGroup partitionAndNodeGroup = - new PartitionAndNodeGroup() - .insertFileScanTask(transactionTask.insertTasks) - .deleteFileScanTask(transactionTask.deleteTasks) - .splitCount(splitCount); - changeTasks.addAll(partitionAndNodeGroup.planSplits()); - }); - return changeTasks; - } - - private static class TransactionTask { - private Set insertTasks; - private Set deleteTasks; - Long 
transactionId; - - public TransactionTask(Long transactionId) { - this.transactionId = transactionId; - } - - public void putInsertTask(MixedFileScanTask insert) { - if (insertTasks == null) { - insertTasks = new HashSet<>(); - } - insertTasks.add(insert); - } - - public void putDeleteTask(MixedFileScanTask delete) { - if (deleteTasks == null) { - deleteTasks = new HashSet<>(); - } - deleteTasks.add(delete); - } - } - - public static class BaseAndChangeTask { - Collection allBaseTasks; - Collection changeTableTasks; - - private BaseAndChangeTask( - Collection allBaseTasks, Map changeTableTaskMap) { - this.allBaseTasks = allBaseTasks; - if (changeTableTaskMap == null || changeTableTaskMap.isEmpty()) { - this.changeTableTasks = Collections.emptyList(); - } else { - this.changeTableTasks = - changeTableTaskMap.values().stream() - .sorted(Comparator.comparing(o -> o.transactionId)) - .collect(Collectors.toList()); - } - } - - public static BaseAndChangeTask ofIceberg(CloseableIterable tasks) { - try (CloseableIterator tasksIterator = tasks.iterator()) { - Map transactionTasks = new HashMap<>(); - long startTime = System.currentTimeMillis(); - int count = 0; - while (tasksIterator.hasNext()) { - count++; - MixedFileScanTask fileScanTask = (MixedFileScanTask) tasksIterator.next(); - if (fileScanTask.file().type().equals(DataFileType.INSERT_FILE)) { - taskMap(Collections.singleton(fileScanTask), true, transactionTasks); - } else if (fileScanTask.file().type().equals(DataFileType.EQ_DELETE_FILE)) { - taskMap(Collections.singleton(fileScanTask), false, transactionTasks); - } else { - throw new IllegalArgumentException( - String.format( - "DataFileType %s is not supported during change log reading period.", - fileScanTask.file().type())); - } - } - LOG.info( - "Read {} change log from {} in {} ms", - count, - tasksIterator.getClass(), - System.currentTimeMillis() - startTime); - return new BaseAndChangeTask(Collections.emptySet(), transactionTasks); - } catch (IOException e) 
{ - throw new UncheckedIOException(e); - } - } - - public static BaseAndChangeTask of(CloseableIterable combinedScanTasks) { - try (CloseableIterator initTasks = combinedScanTasks.iterator()) { - final Set allBaseTasks = new HashSet<>(); - final Map transactionTasks = new HashMap<>(); - - while (initTasks.hasNext()) { - CombinedScanTask combinedScanTask = initTasks.next(); - combinedScanTask - .tasks() - .forEach( - keyedTableScanTask -> { - allBaseTasks.addAll(keyedTableScanTask.baseTasks()); - - taskMap(keyedTableScanTask.insertTasks(), true, transactionTasks); - taskMap(keyedTableScanTask.mixedEquityDeletes(), false, transactionTasks); - }); - } - List baseTasks = - allBaseTasks.stream() - .sorted(Comparator.comparing(t -> t.file().transactionId())) - .collect(Collectors.toList()); - - return new BaseAndChangeTask(baseTasks, transactionTasks); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - private static void taskMap( - Collection tasks, - boolean insert, - Map transactionTaskMap) { - tasks.forEach( - task -> { - long transactionId = task.file().transactionId(); - TransactionTask tasksInSingleTransaction = - transactionTaskMap.getOrDefault(transactionId, new TransactionTask(transactionId)); - if (insert) { - tasksInSingleTransaction.putInsertTask(task); - } else { - tasksInSingleTransaction.putDeleteTask(task); - } - transactionTaskMap.put(transactionId, tasksInSingleTransaction); - }); - } - - public Collection allBaseTasks() { - return allBaseTasks; - } - - public Collection transactionTasks() { - return changeTableTasks; - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/MixedFormatSource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/MixedFormatSource.java deleted file mode 100644 index 5597c0e2ed..0000000000 --- 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/MixedFormatSource.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read; - -import org.apache.amoro.flink.read.hybrid.assigner.ShuffleSplitAssigner; -import org.apache.amoro.flink.read.hybrid.assigner.SplitAssigner; -import org.apache.amoro.flink.read.hybrid.assigner.StaticSplitAssigner; -import org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatSourceEnumState; -import org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatSourceEnumStateSerializer; -import org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatSourceEnumerator; -import org.apache.amoro.flink.read.hybrid.enumerator.StaticMixedFormatSourceEnumerator; -import org.apache.amoro.flink.read.hybrid.reader.MixedFormatSourceReader; -import org.apache.amoro.flink.read.hybrid.reader.ReaderFunction; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitSerializer; -import org.apache.amoro.flink.read.source.MixedFormatScanContext; -import org.apache.amoro.flink.table.MixedFormatTableLoader; 
-import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.connector.source.Boundedness; -import org.apache.flink.api.connector.source.Source; -import org.apache.flink.api.connector.source.SourceReader; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.api.connector.source.SplitEnumerator; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.flink.api.java.typeutils.ResultTypeQueryable; -import org.apache.flink.core.io.SimpleVersionedSerializer; - -/** - * Mixed-format Source based of FLIP-27. - * - *

If MixedFormatSource is used as a build table in lookup join, it will be implemented by - * temporal join. Two source should use processing time as watermark. MixedFormatSource will - * generate watermark after first splits planned by MixedFormatSourceEnumerator having been - * finished. - */ -public class MixedFormatSource - implements Source, ResultTypeQueryable { - private static final long serialVersionUID = 1L; - private final MixedFormatScanContext scanContext; - private final ReaderFunction readerFunction; - private final TypeInformation typeInformation; - private final MixedFormatTableLoader loader; - private final String tableName; - /** - * generate mixed-format watermark. This is only for lookup join mixed-format table, and - * mixed-format table is used as build table, i.e. right table. - */ - private final boolean dimTable; - - public MixedFormatSource( - MixedFormatTableLoader loader, - MixedFormatScanContext scanContext, - ReaderFunction readerFunction, - TypeInformation typeInformation, - String tableName, - boolean dimTable) { - this.loader = loader; - this.scanContext = scanContext; - this.readerFunction = readerFunction; - this.typeInformation = typeInformation; - this.tableName = tableName; - this.dimTable = dimTable; - } - - @Override - public Boundedness getBoundedness() { - return scanContext.isStreaming() ? 
Boundedness.CONTINUOUS_UNBOUNDED : Boundedness.BOUNDED; - } - - @Override - public SourceReader createReader(SourceReaderContext readerContext) { - return new MixedFormatSourceReader<>( - readerFunction, readerContext.getConfiguration(), readerContext, dimTable); - } - - @Override - public SplitEnumerator createEnumerator( - SplitEnumeratorContext enumContext) { - return createEnumerator(enumContext, null); - } - - private SplitEnumerator createEnumerator( - SplitEnumeratorContext enumContext, MixedFormatSourceEnumState enumState) { - SplitAssigner splitAssigner; - if (scanContext.isStreaming()) { - splitAssigner = new ShuffleSplitAssigner(enumContext, tableName, enumState); - return new MixedFormatSourceEnumerator( - enumContext, splitAssigner, loader, scanContext, enumState, dimTable); - } else { - splitAssigner = new StaticSplitAssigner(enumState); - return new StaticMixedFormatSourceEnumerator( - enumContext, splitAssigner, loader, scanContext, null); - } - } - - @Override - public SplitEnumerator restoreEnumerator( - SplitEnumeratorContext enumContext, MixedFormatSourceEnumState checkpoint) { - return createEnumerator(enumContext, checkpoint); - } - - @Override - public SimpleVersionedSerializer getSplitSerializer() { - return new MixedFormatSplitSerializer(); - } - - @Override - public SimpleVersionedSerializer getEnumeratorCheckpointSerializer() { - return new MixedFormatSourceEnumStateSerializer(); - } - - @Override - public TypeInformation getProducedType() { - return typeInformation; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/MixedIncrementalLoader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/MixedIncrementalLoader.java deleted file mode 100644 index 04417f297c..0000000000 --- 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/MixedIncrementalLoader.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read; - -import org.apache.amoro.flink.read.hybrid.enumerator.ContinuousEnumerationResult; -import org.apache.amoro.flink.read.hybrid.enumerator.ContinuousSplitPlanner; -import org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatEnumeratorOffset; -import org.apache.amoro.flink.read.hybrid.reader.DataIteratorReaderFunction; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.hive.io.reader.AbstractAdaptHiveKeyedDataReader; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.io.CloseableIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayDeque; -import java.util.List; -import java.util.Queue; -import java.util.concurrent.atomic.AtomicReference; - -/** - * This is a mixed-format table(mixed iceberg, mixed-hive) incremental loader. - * - *

This loader is used to load data by the merge on read approach first, then by the incremental - * pull approach. - * - *

Merge on read approach only contain INSERT rows. - * - *

Incremental pull approach contains INSERT, DELETE, UPDATE_BEFORE, and UPDATE_AFTER. - * - *

Support projection and filter push-down to speed up the loading process. - */ -public class MixedIncrementalLoader implements AutoCloseable { - private static final Logger LOG = LoggerFactory.getLogger(MixedIncrementalLoader.class); - private final ContinuousSplitPlanner continuousSplitPlanner; - private final DataIteratorReaderFunction readerFunction; - private AbstractAdaptHiveKeyedDataReader flinkMORDataReader; - private final List filters; - private final AtomicReference enumeratorPosition; - private final Queue splitQueue; - - public MixedIncrementalLoader( - ContinuousSplitPlanner continuousSplitPlanner, - AbstractAdaptHiveKeyedDataReader flinkMORDataReader, - DataIteratorReaderFunction readerFunction, - List filters) { - this.continuousSplitPlanner = continuousSplitPlanner; - this.flinkMORDataReader = flinkMORDataReader; - this.readerFunction = readerFunction; - this.filters = filters; - this.enumeratorPosition = new AtomicReference<>(); - this.splitQueue = new ArrayDeque<>(); - } - - public MixedIncrementalLoader( - ContinuousSplitPlanner continuousSplitPlanner, - DataIteratorReaderFunction readerFunction, - List filters) { - this.continuousSplitPlanner = continuousSplitPlanner; - this.readerFunction = readerFunction; - this.filters = filters; - this.enumeratorPosition = new AtomicReference<>(); - this.splitQueue = new ArrayDeque<>(); - } - - public boolean hasNext() { - if (splitQueue.isEmpty()) { - ContinuousEnumerationResult planResult = - continuousSplitPlanner.planSplits(enumeratorPosition.get(), filters); - if (!planResult.isEmpty()) { - planResult.splits().forEach(split -> LOG.info("Putting this split into queue: {}.", split)); - splitQueue.addAll(planResult.splits()); - } - if (!planResult.toOffset().isEmpty()) { - enumeratorPosition.set(planResult.toOffset()); - } - LOG.info( - "Currently, queue contain {} splits, scan position is {}.", - splitQueue.size(), - enumeratorPosition.get()); - return !splitQueue.isEmpty(); - } - return true; - } - - 
public CloseableIterator next() { - MixedFormatSplit split = splitQueue.poll(); - if (split == null) { - throw new IllegalStateException("next() called, but no more valid splits"); - } - - LOG.info("Fetching data by this split:{}.", split); - if (split.isMergeOnReadSplit()) { - return flinkMORDataReader.readData(split.asMergeOnReadSplit().keyedTableScanTask()); - } - return readerFunction.createDataIterator(split); - } - - @Override - public void close() throws Exception { - continuousSplitPlanner.close(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/PartitionAndNodeGroup.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/PartitionAndNodeGroup.java deleted file mode 100644 index a8b1d5ca0c..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/PartitionAndNodeGroup.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read; - -import org.apache.amoro.flink.read.hybrid.split.ChangelogSplit; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.scan.MixedFileScanTask; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.atomic.AtomicInteger; - -/** - * This is a group of the partitions and nodes of the mixed-format table, it can plan different - * nodes and different partitions into different {@link MixedFormatSplit}. - */ -public class PartitionAndNodeGroup { - AtomicInteger splitCount = new AtomicInteger(); - Collection insertTasks; - Collection deleteTasks; - - public PartitionAndNodeGroup insertFileScanTask(Set insertTasks) { - this.insertTasks = insertTasks; - return this; - } - - public PartitionAndNodeGroup deleteFileScanTask(Set deleteTasks) { - this.deleteTasks = deleteTasks; - return this; - } - - public PartitionAndNodeGroup splitCount(AtomicInteger splitCount) { - this.splitCount = splitCount; - return this; - } - - List planSplits() { - Map> nodes = new HashMap<>(); - plan(true, nodes); - plan(false, nodes); - - List splits = new ArrayList<>(); - - nodes - .values() - .forEach( - indexNodes -> - indexNodes - .values() - .forEach( - node -> - splits.add( - new ChangelogSplit( - node.inserts, node.deletes, splitCount.incrementAndGet())))); - return splits; - } - - /** - * Split the collection of {@link MixedFileScanTask} into different groups. - * - * @param insert if plan insert files or not - * @param nodes the key of nodes is partition info which the file located, the value of nodes is - * hashmap of mixed-format tree node id and {@link Node} - */ - private void plan(boolean insert, Map> nodes) { - Collection tasks = insert ? 
insertTasks : deleteTasks; - if (tasks == null) { - return; - } - - tasks.forEach( - task -> { - String partitionKey = task.file().partition().toString(); - Long nodeId = task.file().node().getId(); - Map indexNodes = nodes.getOrDefault(partitionKey, new HashMap<>()); - Node node = indexNodes.getOrDefault(nodeId, new Node()); - if (insert) { - node.addInsert(task); - } else { - node.addDelete(task); - } - indexNodes.put(nodeId, node); - nodes.put(partitionKey, indexNodes); - }); - } - - private static class Node { - List inserts = new ArrayList<>(1); - List deletes = new ArrayList<>(1); - - void addInsert(MixedFileScanTask task) { - inserts.add(task); - } - - void addDelete(MixedFileScanTask task) { - deletes.add(task); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/ShuffleSplitAssigner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/ShuffleSplitAssigner.java deleted file mode 100644 index cd0671fe84..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/ShuffleSplitAssigner.java +++ /dev/null @@ -1,342 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.assigner; - -import org.apache.amoro.data.DataTreeNode; -import org.apache.amoro.data.PrimaryKeyedFile; -import org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatSourceEnumState; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; -import org.apache.amoro.scan.MixedFileScanTask; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.flink.util.FlinkRuntimeException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Optional; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.PriorityBlockingQueue; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicReference; -import java.util.stream.Collectors; - -/** - * According to Mark, Index TreeNodes and subtaskId assigning a split to special subtask to read. 
- */ -public class ShuffleSplitAssigner implements SplitAssigner { - private static final Logger LOG = LoggerFactory.getLogger(ShuffleSplitAssigner.class); - - private static final long POLL_TIMEOUT = 200; - private final SplitEnumeratorContext enumeratorContext; - - private int totalParallelism; - private int totalSplitNum; - private Long currentMaskOfTreeNode; - private final Object lock = new Object(); - - /** - * Key is the partition data and file index of the mixed-format file, Value is flink application - * subtaskId. - */ - private final Map partitionIndexSubtaskMap; - /** Key is subtaskId, Value is the queue of unAssigned mixed-format splits. */ - private final Map> subtaskSplitMap; - - private CompletableFuture availableFuture; - - @VisibleForTesting - public ShuffleSplitAssigner(SplitEnumeratorContext enumeratorContext) { - this.enumeratorContext = enumeratorContext; - this.totalParallelism = enumeratorContext.currentParallelism(); - this.partitionIndexSubtaskMap = new ConcurrentHashMap<>(); - this.subtaskSplitMap = new ConcurrentHashMap<>(); - } - - public ShuffleSplitAssigner( - SplitEnumeratorContext enumeratorContext, - String tableName, - @Nullable MixedFormatSourceEnumState enumState) { - this.enumeratorContext = enumeratorContext; - this.partitionIndexSubtaskMap = new ConcurrentHashMap<>(); - this.subtaskSplitMap = new ConcurrentHashMap<>(); - if (enumState == null) { - this.totalParallelism = enumeratorContext.currentParallelism(); - LOG.info( - "Mixed-format source enumerator current parallelism is {} for table {}", - totalParallelism, - tableName); - } else { - LOG.info( - "Mixed-format source restored {} splits from state for table {}", - enumState.pendingSplits().size(), - tableName); - deserializePartitionIndex( - Objects.requireNonNull( - enumState.shuffleSplitRelation(), - "The partition index and subtask state couldn't be null.")); - enumState - .pendingSplits() - .forEach(state -> 
onDiscoveredSplits(Collections.singleton(state.toSourceSplit()))); - } - } - - @Override - public Split getNext() { - throw new UnsupportedOperationException( - "ShuffleSplitAssigner couldn't support this operation."); - } - - @Override - public Split getNext(int subtaskId) { - return getNextSplit(subtaskId) - .map(Split::of) - .orElseGet(isEmpty() ? Split::unavailable : Split::subtaskUnavailable); - } - - private Optional getNextSplit(int subTaskId) { - int currentParallelism = enumeratorContext.currentParallelism(); - if (totalParallelism != currentParallelism) { - throw new FlinkRuntimeException( - String.format( - "Source parallelism has been changed, before parallelism is %s, now is %s", - totalParallelism, currentParallelism)); - } - if (subtaskSplitMap.containsKey(subTaskId)) { - PriorityBlockingQueue queue = subtaskSplitMap.get(subTaskId); - - MixedFormatSplit mixedFormatSplit = null; - try { - mixedFormatSplit = queue.poll(POLL_TIMEOUT, TimeUnit.MILLISECONDS); - } catch (InterruptedException e) { - LOG.warn("interruptedException", e); - } - if (mixedFormatSplit == null) { - LOG.debug( - "Subtask {}, couldn't retrieve mixed-format source split in the queue.", subTaskId); - return Optional.empty(); - } else { - LOG.info( - "get next mixed-format split taskIndex {}, totalSplitNum {}, mixed-format split {}.", - mixedFormatSplit.taskIndex(), - totalSplitNum, - mixedFormatSplit); - return Optional.of(mixedFormatSplit); - } - } else { - LOG.debug( - "Subtask {}, it's an idle subtask due to the empty queue with this subtask.", subTaskId); - return Optional.empty(); - } - } - - @Override - public void onDiscoveredSplits(Collection splits) { - splits.forEach(this::putSplitIntoQueue); - // only complete pending future if new splits are discovered - completeAvailableFuturesIfNeeded(); - } - - @Override - public void onUnassignedSplits(Collection splits) { - onDiscoveredSplits(splits); - } - - void putSplitIntoQueue(final MixedFormatSplit split) { - List 
exactlyTreeNodes = getExactlyTreeNodes(split); - - PrimaryKeyedFile file = findAnyFileInSplit(split); - - for (DataTreeNode node : exactlyTreeNodes) { - long partitionIndexKey = Math.abs(file.partition().toString().hashCode() + node.index()); - int subtaskId = - partitionIndexSubtaskMap.computeIfAbsent( - partitionIndexKey, key -> (partitionIndexSubtaskMap.size() + 1) % totalParallelism); - LOG.info( - "partition = {}, (mask, index) = ({}, {}), subtaskId = {}", - file.partition().toString(), - node.mask(), - node.index(), - subtaskId); - - PriorityBlockingQueue queue = - subtaskSplitMap.getOrDefault(subtaskId, new PriorityBlockingQueue<>()); - MixedFormatSplit copiedSplit = split.copy(); - copiedSplit.modifyTreeNode(node); - LOG.info("put split into queue: {}", copiedSplit); - queue.add(copiedSplit); - totalSplitNum = totalSplitNum + 1; - subtaskSplitMap.put(subtaskId, queue); - } - } - - @Override - public Collection state() { - List mixedFormatSplitStates = new ArrayList<>(); - subtaskSplitMap.forEach( - (key, value) -> - mixedFormatSplitStates.addAll( - value.stream().map(MixedFormatSplitState::new).collect(Collectors.toList()))); - - return mixedFormatSplitStates; - } - - @Override - public synchronized CompletableFuture isAvailable() { - if (availableFuture == null) { - availableFuture = new CompletableFuture<>(); - } - return availableFuture; - } - - public boolean isEmpty() { - if (subtaskSplitMap.isEmpty()) { - return true; - } - for (Map.Entry> entry : - subtaskSplitMap.entrySet()) { - if (!entry.getValue().isEmpty()) { - return false; - } - } - return true; - } - - @Override - public void close() throws IOException { - subtaskSplitMap.clear(); - partitionIndexSubtaskMap.clear(); - } - - public long[] serializePartitionIndex() { - int prefixParams = 3; - long[] shuffleSplitRelation = new long[partitionIndexSubtaskMap.size() * 2 + prefixParams]; - shuffleSplitRelation[0] = totalParallelism; - shuffleSplitRelation[1] = totalSplitNum; - 
shuffleSplitRelation[2] = currentMaskOfTreeNode == null ? -1 : currentMaskOfTreeNode; - - int i = prefixParams; - for (Map.Entry entry : partitionIndexSubtaskMap.entrySet()) { - shuffleSplitRelation[i++] = entry.getKey(); - shuffleSplitRelation[i++] = entry.getValue(); - } - return shuffleSplitRelation; - } - - void deserializePartitionIndex(long[] shuffleSplitRelation) { - int prefixParams = 3; - this.totalParallelism = (int) shuffleSplitRelation[0]; - this.totalSplitNum = (int) shuffleSplitRelation[1]; - this.currentMaskOfTreeNode = shuffleSplitRelation[2] == -1 ? null : shuffleSplitRelation[2]; - - for (int i = prefixParams; i < shuffleSplitRelation.length; i++) { - partitionIndexSubtaskMap.put(shuffleSplitRelation[i], (int) shuffleSplitRelation[++i]); - } - } - - /** - * Different data files may locate in different layers when multi snapshots are committed, so - * mixed-format source reading should consider emitting the records and keeping ordering. - * According to the dataTreeNode of the mixed-format split and the currentMaskOfTreeNode, return - * the exact tree node list which may move up or go down layers in the mixed-format tree. - * - *

-   * |mask=0          o
-   * |             /     \
-   * |mask=1     o        o
-   * |         /   \    /   \
-   * |mask=3  o     o  o     o
-   * 
- * - * @param mixedFormatSplit Mixed-format split. - * @return The exact tree node list. - */ - public List getExactlyTreeNodes(MixedFormatSplit mixedFormatSplit) { - DataTreeNode dataTreeNode = mixedFormatSplit.dataTreeNode(); - long mask = dataTreeNode.mask(); - - synchronized (lock) { - if (currentMaskOfTreeNode == null) { - currentMaskOfTreeNode = mask; - } - } - - return scanTreeNode(dataTreeNode); - } - - private List scanTreeNode(DataTreeNode dataTreeNode) { - long mask = dataTreeNode.mask(); - if (mask == currentMaskOfTreeNode) { - return Collections.singletonList(dataTreeNode); - } else if (mask > currentMaskOfTreeNode) { - // move up one layer - return scanTreeNode(dataTreeNode.parent()); - } else { - // go down one layer - List allNodes = new ArrayList<>(); - allNodes.addAll(scanTreeNode(dataTreeNode.left())); - allNodes.addAll(scanTreeNode(dataTreeNode.right())); - return allNodes; - } - } - - /** - * In one mixed-format split, the partitions, mask and index of the files are the same. - * - * @param mixedFormatSplit mixed-format source split - * @return anyone primary keyed file in the mixed-format split. 
- */ - private PrimaryKeyedFile findAnyFileInSplit(MixedFormatSplit mixedFormatSplit) { - AtomicReference file = new AtomicReference<>(); - if (mixedFormatSplit.isChangelogSplit()) { - List mixedFileScanTasks = - new ArrayList<>(mixedFormatSplit.asChangelogSplit().insertTasks()); - mixedFileScanTasks.addAll(mixedFormatSplit.asChangelogSplit().deleteTasks()); - mixedFileScanTasks.stream().findFirst().ifPresent(task -> file.set(task.file())); - if (file.get() != null) { - return file.get(); - } - } - - List mixedFileScanTasks = - new ArrayList<>(mixedFormatSplit.asSnapshotSplit().insertTasks()); - mixedFileScanTasks.stream().findFirst().ifPresent(task -> file.set(task.file())); - if (file.get() != null) { - return file.get(); - } - throw new FlinkRuntimeException("Couldn't find a primaryKeyedFile."); - } - - private synchronized void completeAvailableFuturesIfNeeded() { - if (availableFuture != null && !isEmpty()) { - availableFuture.complete(null); - } - availableFuture = null; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/Split.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/Split.java deleted file mode 100644 index 265710ee1c..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/Split.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.assigner; - -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.util.Preconditions; - -/** This is a wrapper Split of {@link MixedFormatSplit} with split status. */ -public class Split { - - public enum Status { - AVAILABLE, - - /** Assigner has pending splits. But current subtask doesn't have pending splits. */ - SUBTASK_UNAVAILABLE, - - /** Assigner doesn't have pending splits. */ - UNAVAILABLE - } - - private final Status status; - private final MixedFormatSplit split; - - private Split(Status status) { - this.status = status; - this.split = null; - } - - private Split(MixedFormatSplit split) { - Preconditions.checkNotNull(split, "Split cannot be null"); - this.status = Status.AVAILABLE; - this.split = split; - } - - @VisibleForTesting - public Status status() { - return status; - } - - public boolean isAvailable() { - return status == Status.AVAILABLE; - } - - public boolean isUnavailable() { - return status == Status.UNAVAILABLE; - } - - public MixedFormatSplit split() { - return split; - } - - private static final Split UNAVAILABLE = new Split(Status.UNAVAILABLE); - private static final Split SUBTASK_UNAVAILABLE = new Split(Status.SUBTASK_UNAVAILABLE); - - public static Split unavailable() { - return UNAVAILABLE; - } - - public static Split subtaskUnavailable() { - return SUBTASK_UNAVAILABLE; - } - - public static Split of(MixedFormatSplit mixedFormatSplit) { - return new Split(mixedFormatSplit); - } -} 
diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/SplitAssigner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/SplitAssigner.java deleted file mode 100644 index 8e7b36a40b..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/SplitAssigner.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.hybrid.assigner; - -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; - -import java.io.Closeable; -import java.util.Collection; -import java.util.concurrent.CompletableFuture; - -/** An interface SplitAssigner for {@link MixedFormatSplit} */ -public interface SplitAssigner extends Closeable { - - default void open() {} - - Split getNext(); - - Split getNext(int subtaskId); - - /** Add new splits discovered by enumerator */ - void onDiscoveredSplits(Collection splits); - - /** Forward addSplitsBack event (for failed reader) to assigner */ - void onUnassignedSplits(Collection splits); - - /** - * Some assigner (like event time alignment) may rack in-progress splits to advance watermark upon - * completed splits - */ - default void onCompletedSplits(Collection completedSplitIds) {} - - Collection state(); - - /** - * Enumerator can get a notification via CompletableFuture when the assigner has more splits - * available later. Enumerator should schedule assignment in the thenAccept action of the future. - * - *

Assigner will return the same future if this method is called again before the previous - * future is completed. - * - *

The future can be completed from other thread, e.g. the coordinator thread from another - * thread for event time alignment. - * - *

If enumerator need to trigger action upon the future completion, it may want to run it in - * the coordinator thread using {@link SplitEnumeratorContext#runInCoordinatorThread(Runnable)}. - */ - CompletableFuture isAvailable(); -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/StaticSplitAssigner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/StaticSplitAssigner.java deleted file mode 100644 index ff39e4124c..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/assigner/StaticSplitAssigner.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.hybrid.assigner; - -import org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatSourceEnumState; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; - -import java.io.IOException; -import java.util.Collection; -import java.util.Collections; -import java.util.Optional; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.PriorityBlockingQueue; -import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; - -/** This is a static split assigner which is used for batch mode. */ -public class StaticSplitAssigner implements SplitAssigner { - private static final Logger LOG = LoggerFactory.getLogger(StaticSplitAssigner.class); - - private static final long POLL_TIMEOUT = 200; - private int totalSplitNum; - - private final PriorityBlockingQueue splitQueue; - - private CompletableFuture availableFuture; - - public StaticSplitAssigner(@Nullable MixedFormatSourceEnumState enumState) { - this.splitQueue = new PriorityBlockingQueue<>(); - if (enumState != null) { - Collection splitStates = enumState.pendingSplits(); - splitStates.forEach( - state -> onDiscoveredSplits(Collections.singleton(state.toSourceSplit()))); - } - } - - @Override - public Split getNext() { - return getNextSplit().map(Split::of).orElseGet(Split::unavailable); - } - - @Override - public Split getNext(int subtaskId) { - return getNext(); - } - - private Optional getNextSplit() { - MixedFormatSplit mixedFormatSplit = null; - try { - mixedFormatSplit = splitQueue.poll(POLL_TIMEOUT, TimeUnit.MILLISECONDS); - } catch (InterruptedException e) { - LOG.warn("Interrupted when polling splits from the split queue", e); - } - if (mixedFormatSplit == null) { - LOG.debug( - "Couldn't retrieve mixed-format source split from the queue, as the queue is 
empty."); - return Optional.empty(); - } else { - LOG.info( - "Assigning the mixed-format split, task index is {}, total number of splits is {}, mixed-format split is {}.", - mixedFormatSplit.taskIndex(), - totalSplitNum, - mixedFormatSplit); - return Optional.of(mixedFormatSplit); - } - } - - @Override - public void onDiscoveredSplits(Collection splits) { - splits.forEach(this::putSplitIntoQueue); - totalSplitNum += splits.size(); - // only complete pending future if new splits are discovered - completeAvailableFuturesIfNeeded(); - } - - @Override - public void onUnassignedSplits(Collection splits) { - onDiscoveredSplits(splits); - } - - void putSplitIntoQueue(final MixedFormatSplit split) { - splitQueue.put(split); - } - - @Override - public Collection state() { - return splitQueue.stream().map(MixedFormatSplitState::new).collect(Collectors.toList()); - } - - @Override - public synchronized CompletableFuture isAvailable() { - if (availableFuture == null) { - availableFuture = new CompletableFuture<>(); - } - return availableFuture; - } - - public boolean isEmpty() { - return splitQueue.isEmpty(); - } - - @Override - public void close() throws IOException { - splitQueue.clear(); - } - - private synchronized void completeAvailableFuturesIfNeeded() { - if (availableFuture != null && !isEmpty()) { - availableFuture.complete(null); - } - availableFuture = null; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/AbstractMixedFormatEnumerator.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/AbstractMixedFormatEnumerator.java deleted file mode 100644 index 49a0f07a3f..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/AbstractMixedFormatEnumerator.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Licensed 
to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import org.apache.amoro.flink.read.hybrid.assigner.Split; -import org.apache.amoro.flink.read.hybrid.assigner.SplitAssigner; -import org.apache.amoro.flink.read.hybrid.reader.ReaderStartedEvent; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.hybrid.split.SplitRequestEvent; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.api.connector.source.SourceEvent; -import org.apache.flink.api.connector.source.SplitEnumerator; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; - -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.atomic.AtomicReference; - -/** The abstract mixed-format source enumerator. 
*/ -public abstract class AbstractMixedFormatEnumerator - implements SplitEnumerator { - private static final Logger LOG = LoggerFactory.getLogger(AbstractMixedFormatEnumerator.class); - private final SplitEnumeratorContext enumeratorContext; - private final SplitAssigner assigner; - private final Map readersAwaitingSplit; - private final AtomicReference> availableFuture; - - AbstractMixedFormatEnumerator( - SplitEnumeratorContext enumeratorContext, SplitAssigner assigner) { - this.enumeratorContext = enumeratorContext; - this.assigner = assigner; - this.readersAwaitingSplit = new ConcurrentHashMap<>(); - this.availableFuture = new AtomicReference<>(); - } - - @Override - public void start() {} - - @Override - public void close() throws IOException { - assigner.close(); - } - - @Override - public void handleSplitRequest(int subtaskId, @Nullable String requesterHostname) { - throw new UnsupportedOperationException( - String.format( - "Received invalid default split request event " - + "from subtask %d as mixed-format source uses custom split request event", - subtaskId)); - } - - @Override - public void handleSourceEvent(int subtaskId, SourceEvent sourceEvent) { - if (sourceEvent instanceof SplitRequestEvent) { - SplitRequestEvent splitRequestEvent = (SplitRequestEvent) sourceEvent; - LOG.info("Received request split event from subtask {}", subtaskId); - assigner.onCompletedSplits(splitRequestEvent.finishedSplitIds()); - readersAwaitingSplit.put(subtaskId, String.valueOf(splitRequestEvent.requesterHostname())); - assignSplits(); - } else if (sourceEvent instanceof ReaderStartedEvent) { - LOG.info("Received ReaderStartEvent from subtask {}", subtaskId); - } else { - throw new IllegalArgumentException( - String.format( - "Received unknown event from subtask %d: %s", - subtaskId, sourceEvent.getClass().getCanonicalName())); - } - } - - @Override - public void addReader(int subtaskId) { - LOG.info("Added reader: {}", subtaskId); - } - - @Override - public void 
addSplitsBack(List splits, int subtaskId) { - LOG.info("addSplitsBack from subtaskId {}, splits {}.", subtaskId, splits); - assigner.onUnassignedSplits(splits); - } - - /** return true if enumerator should wait for splits like in the continuous enumerator case. */ - protected abstract boolean shouldWaitForMoreSplits(); - - protected void assignSplits() { - LOG.info( - "Assign mixed-format splits to {} readers, subtasks:{}.", - readersAwaitingSplit.size(), - readersAwaitingSplit.keySet().toArray()); - final Iterator> awaitingReader = - readersAwaitingSplit.entrySet().iterator(); - while (awaitingReader.hasNext()) { - final Map.Entry nextAwaiting = awaitingReader.next(); - - // if the reader that requested another split has failed in the meantime, remove - // it from the list of waiting readers - if (!enumeratorContext.registeredReaders().containsKey(nextAwaiting.getKey())) { - LOG.info( - "Due to this reader doesn't registered in the enumerator context any more, so remove this subtask reader" - + " [{}] from the awaiting reader map.", - nextAwaiting.getKey()); - awaitingReader.remove(); - continue; - } - - final int awaitingSubtask = nextAwaiting.getKey(); - final Split nextSplit = assigner.getNext(awaitingSubtask); - if (nextSplit.isAvailable()) { - MixedFormatSplit mixedFormatSplit = nextSplit.split(); - LOG.info( - "assign a mixed-format split to subtaskId {}, taskIndex {}, mixed-format split {}.", - awaitingSubtask, - mixedFormatSplit.taskIndex(), - mixedFormatSplit); - enumeratorContext.assignSplit(mixedFormatSplit, awaitingSubtask); - awaitingReader.remove(); - } else if (nextSplit.isUnavailable()) { - if (!shouldWaitForMoreSplits()) { - LOG.info("No more splits available for subtask {}", awaitingSubtask); - enumeratorContext.signalNoMoreSplits(awaitingSubtask); - awaitingReader.remove(); - } else { - fetchAvailableFutureIfNeeded(); - break; - } - } - } - } - - private synchronized void fetchAvailableFutureIfNeeded() { - if (availableFuture.get() != null) { - 
return; - } - - CompletableFuture future = - assigner - .isAvailable() - .thenAccept( - ignore -> - // Must run assignSplits in coordinator thread - // because the future may be completed from other threads. - // E.g., in event time alignment assigner, - // watermark advancement from another source may - // cause the available future to be completed - enumeratorContext.runInCoordinatorThread( - () -> { - LOG.debug("Executing callback of assignSplits"); - availableFuture.set(null); - assignSplits(); - })); - availableFuture.set(future); - LOG.debug("Registered callback for future available splits"); - } - - @VisibleForTesting - public Map getReadersAwaitingSplit() { - return readersAwaitingSplit; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousEnumerationResult.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousEnumerationResult.java deleted file mode 100644 index ff9c610187..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousEnumerationResult.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import org.apache.amoro.flink.read.FlinkSplitPlanner; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.shade.guava32.com.google.common.base.MoreObjects; -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; - -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; - -/** - * The result that contains {@link MixedFormatSplit}s and is generated by {@link FlinkSplitPlanner}. - */ -public class ContinuousEnumerationResult { - public static final ContinuousEnumerationResult EMPTY = - new ContinuousEnumerationResult( - Collections.emptyList(), null, MixedFormatEnumeratorOffset.empty()); - - private final Collection splits; - private final MixedFormatEnumeratorOffset fromOffset; - private final MixedFormatEnumeratorOffset toOffset; - - /** - * @param splits should never be null. But it can be an empty collection - * @param fromOffset can be null - * @param toOffset should never be null. 
But it can have null snapshotId and snapshotTimestampMs - */ - public ContinuousEnumerationResult( - Collection splits, - MixedFormatEnumeratorOffset fromOffset, - MixedFormatEnumeratorOffset toOffset) { - Preconditions.checkArgument(splits != null, "Invalid to splits collection: null"); - Preconditions.checkArgument(toOffset != null, "Invalid end position: null"); - this.splits = splits; - this.fromOffset = fromOffset; - this.toOffset = toOffset; - } - - public Collection splits() { - return splits; - } - - public MixedFormatEnumeratorOffset fromOffset() { - return fromOffset; - } - - public MixedFormatEnumeratorOffset toOffset() { - return toOffset; - } - - public boolean isEmpty() { - return null == splits || splits.isEmpty(); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("splits", Arrays.toString(splits.toArray())) - .add("fromPosition", fromOffset) - .add("toPosition", toOffset) - .toString(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousSplitPlanner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousSplitPlanner.java deleted file mode 100644 index 804ae9db81..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousSplitPlanner.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import org.apache.commons.compress.utils.Lists; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.expressions.Expression; - -import java.io.Closeable; -import java.util.List; - -/** This interface is introduced so that we can plug in a different split planner for unit test */ -@Internal -public interface ContinuousSplitPlanner extends Closeable { - - /** Discover the files appended between {@code lastPosition} and current table snapshot */ - default ContinuousEnumerationResult planSplits(MixedFormatEnumeratorOffset lastPosition) { - return planSplits(lastPosition, Lists.newArrayList()); - } - - /** - * Discover the files appended between {@code lastPosition} and current table snapshot, filter the - * data with expressions. 
- */ - ContinuousEnumerationResult planSplits( - MixedFormatEnumeratorOffset lastPosition, List filters); -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousSplitPlannerImpl.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousSplitPlannerImpl.java deleted file mode 100644 index b513975ffb..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/ContinuousSplitPlannerImpl.java +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import static org.apache.amoro.flink.read.FlinkSplitPlanner.planChangeTable; -import static org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatEnumeratorOffset.EARLIEST_SNAPSHOT_ID; -import static org.apache.amoro.flink.util.MixedFormatUtils.loadMixedTable; - -import org.apache.amoro.flink.read.FlinkSplitPlanner; -import org.apache.amoro.flink.read.hybrid.split.ChangelogSplit; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.hybrid.split.SnapshotSplit; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.scan.ChangeTableIncrementalScan; -import org.apache.amoro.table.KeyedTable; -import org.apache.commons.collections.CollectionUtils; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.expressions.Expression; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; - -/** - * Continuous planning {@link KeyedTable} by {@link MixedFormatEnumeratorOffset} and generate a - * {@link ContinuousEnumerationResult}. - * - *

{@link ContinuousEnumerationResult#splits()} includes the {@link SnapshotSplit}s and {@link - * ChangelogSplit}s. - */ -@Internal -public class ContinuousSplitPlannerImpl implements ContinuousSplitPlanner { - private static final Logger LOG = LoggerFactory.getLogger(ContinuousSplitPlannerImpl.class); - - protected transient KeyedTable table; - protected final MixedFormatTableLoader loader; - protected static final AtomicInteger SPLIT_COUNT = new AtomicInteger(); - - public ContinuousSplitPlannerImpl(MixedFormatTableLoader loader) { - this.loader = loader; - } - - @Override - public void close() throws IOException { - if (loader != null) { - loader.close(); - } - } - - @Override - public ContinuousEnumerationResult planSplits( - MixedFormatEnumeratorOffset lastOffset, List filters) { - if (table == null) { - table = loadMixedTable(loader).asKeyedTable(); - } - table.refresh(); - if (lastOffset != null) { - return discoverIncrementalSplits(lastOffset, filters); - } else { - return discoverInitialSplits(filters); - } - } - - protected ContinuousEnumerationResult discoverIncrementalSplits( - MixedFormatEnumeratorOffset lastPosition, List filters) { - long fromChangeSnapshotId = lastPosition.changeSnapshotId(); - Snapshot changeSnapshot = table.changeTable().currentSnapshot(); - if (changeSnapshot != null && changeSnapshot.snapshotId() != fromChangeSnapshotId) { - long snapshotId = changeSnapshot.snapshotId(); - ChangeTableIncrementalScan changeTableScan = - table.changeTable().newScan().useSnapshot(snapshotId); - if (filters != null) { - for (Expression filter : filters) { - changeTableScan = changeTableScan.filter(filter); - } - } - - if (fromChangeSnapshotId != Long.MIN_VALUE) { - Snapshot snapshot = table.changeTable().snapshot(fromChangeSnapshotId); - changeTableScan = changeTableScan.fromSequence(snapshot.sequenceNumber()); - } - - List changeSplit = planChangeTable(changeTableScan, SPLIT_COUNT); - return new ContinuousEnumerationResult( - changeSplit, 
lastPosition, MixedFormatEnumeratorOffset.of(snapshotId, null)); - } - return ContinuousEnumerationResult.EMPTY; - } - - protected ContinuousEnumerationResult discoverInitialSplits(List filters) { - Snapshot changeSnapshot = table.changeTable().currentSnapshot(); - // todo ShuffleSplitAssigner doesn't support MergeOnReadSplit right now, - // because it doesn't implement the dataTreeNode() method - // fix AMORO-1950 in the future. - List mixedFormatSplits = - FlinkSplitPlanner.planFullTable(table, filters, SPLIT_COUNT); - - long changeStartSnapshotId = - changeSnapshot != null ? changeSnapshot.snapshotId() : EARLIEST_SNAPSHOT_ID; - if (changeSnapshot == null && CollectionUtils.isEmpty(mixedFormatSplits)) { - LOG.info("There have no change snapshot, and no base splits in table: {}.", table); - return ContinuousEnumerationResult.EMPTY; - } - - return new ContinuousEnumerationResult( - mixedFormatSplits, null, MixedFormatEnumeratorOffset.of(changeStartSnapshotId, null)); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/InitializationFinishedEvent.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/InitializationFinishedEvent.java deleted file mode 100644 index b2ad16e819..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/InitializationFinishedEvent.java +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import org.apache.amoro.flink.read.hybrid.reader.MixedFormatSourceReader; -import org.apache.flink.api.connector.source.SourceEvent; - -/** {@link MixedFormatSourceReader} won't set timestamp to RowData until receiving this Event. */ -public class InitializationFinishedEvent implements SourceEvent { - private static final long serialVersionUID = 1L; - - public static final InitializationFinishedEvent INSTANCE = new InitializationFinishedEvent(); -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MergeOnReadIncrementalPlanner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MergeOnReadIncrementalPlanner.java deleted file mode 100644 index 8f669ed914..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MergeOnReadIncrementalPlanner.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import static org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatEnumeratorOffset.EARLIEST_SNAPSHOT_ID; - -import org.apache.amoro.flink.read.FlinkSplitPlanner; -import org.apache.amoro.flink.read.hybrid.split.ChangelogSplit; -import org.apache.amoro.flink.read.hybrid.split.MergeOnReadSplit; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.commons.collections.CollectionUtils; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.expressions.Expression; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.List; - -/** - * A planner for merge-on-read scanning by {@link this#discoverInitialSplits} and incremental - * scanning by {@link this#discoverIncrementalSplits(MixedFormatEnumeratorOffset, List)}. - * - *

{@link ContinuousEnumerationResult#splits()} includes the {@link MergeOnReadSplit}s and {@link - * ChangelogSplit}s. - */ -public class MergeOnReadIncrementalPlanner extends ContinuousSplitPlannerImpl { - private static final Logger LOG = LoggerFactory.getLogger(MergeOnReadIncrementalPlanner.class); - - public MergeOnReadIncrementalPlanner(MixedFormatTableLoader loader) { - super(loader); - } - - @Override - protected ContinuousEnumerationResult discoverInitialSplits(List filters) { - Snapshot changeSnapshot = table.changeTable().currentSnapshot(); - - List mixedFormatSplits = - FlinkSplitPlanner.mergeOnReadPlan(table, filters, SPLIT_COUNT); - - long changeStartSnapshotId = - changeSnapshot != null ? changeSnapshot.snapshotId() : EARLIEST_SNAPSHOT_ID; - if (changeSnapshot == null && CollectionUtils.isEmpty(mixedFormatSplits)) { - LOG.info("There have no change snapshot, and no base splits in table: {}.", table); - return ContinuousEnumerationResult.EMPTY; - } - - return new ContinuousEnumerationResult( - mixedFormatSplits, null, MixedFormatEnumeratorOffset.of(changeStartSnapshotId, null)); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MergeOnReadPlannerImpl.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MergeOnReadPlannerImpl.java deleted file mode 100644 index 84b276ec89..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MergeOnReadPlannerImpl.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import static org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatEnumeratorOffset.EARLIEST_SNAPSHOT_ID; -import static org.apache.amoro.flink.util.MixedFormatUtils.loadMixedTable; - -import org.apache.amoro.flink.read.FlinkSplitPlanner; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.table.KeyedTable; -import org.apache.commons.collections.CollectionUtils; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.expressions.Expression; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; - -/** Used for MergeOnRead, only for the bounded reading and return append stream. 
*/ -public class MergeOnReadPlannerImpl implements ContinuousSplitPlanner { - private static final Logger LOG = LoggerFactory.getLogger(MergeOnReadPlannerImpl.class); - - protected transient KeyedTable table; - protected final MixedFormatTableLoader loader; - protected static final AtomicInteger SPLIT_COUNT = new AtomicInteger(); - - public MergeOnReadPlannerImpl(MixedFormatTableLoader loader) { - this.loader = loader; - } - - @Override - public ContinuousEnumerationResult planSplits( - MixedFormatEnumeratorOffset ignored, List filters) { - // todo support mor the table from the specific offset in the future - if (table == null) { - table = loadMixedTable(loader).asKeyedTable(); - } - table.refresh(); - return discoverInitialSplits(filters); - } - - protected ContinuousEnumerationResult discoverInitialSplits(List filters) { - Snapshot changeSnapshot = table.changeTable().currentSnapshot(); - List mixedFormatSplits = - FlinkSplitPlanner.mergeOnReadPlan(table, filters, SPLIT_COUNT); - - long changeStartSnapshotId = - changeSnapshot != null ? 
changeSnapshot.snapshotId() : EARLIEST_SNAPSHOT_ID; - if (changeSnapshot == null && CollectionUtils.isEmpty(mixedFormatSplits)) { - LOG.info("There have no change snapshot, and no base splits in table: {}.", table); - return ContinuousEnumerationResult.EMPTY; - } - - return new ContinuousEnumerationResult( - mixedFormatSplits, null, MixedFormatEnumeratorOffset.of(changeStartSnapshotId, null)); - } - - @Override - public void close() throws IOException { - if (loader != null) { - loader.close(); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatEnumeratorOffset.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatEnumeratorOffset.java deleted file mode 100644 index 5c9beed199..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatEnumeratorOffset.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import org.apache.amoro.shade.guava32.com.google.common.base.MoreObjects; -import org.apache.amoro.shade.guava32.com.google.common.base.Objects; - -/** - * The enumerator offset indicate the snapshot id of the change table, or the timestamp of snapshot. - */ -public class MixedFormatEnumeratorOffset { - private static final MixedFormatEnumeratorOffset EMPTY = of(Long.MIN_VALUE, Long.MIN_VALUE); - - /** use Long.MIN_VALUE to indicate the earliest offset */ - public static final long EARLIEST_SNAPSHOT_ID = Long.MIN_VALUE; - - private Long changeSnapshotId; - private Long snapshotTimestampMs; - - private MixedFormatEnumeratorOffset(Long changeSnapshotId, Long snapshotTimestampMs) { - this.changeSnapshotId = changeSnapshotId; - this.snapshotTimestampMs = snapshotTimestampMs; - } - - public static MixedFormatEnumeratorOffset of(Long changeSnapshotId, Long snapshotTimestampMs) { - return new MixedFormatEnumeratorOffset(changeSnapshotId, snapshotTimestampMs); - } - - public static MixedFormatEnumeratorOffset empty() { - return EMPTY; - } - - public Long changeSnapshotId() { - return changeSnapshotId; - } - - public void changeSnapshotId(long changeSnapshotId) { - this.changeSnapshotId = changeSnapshotId; - } - - public Long snapshotTimestampMs() { - return snapshotTimestampMs; - } - - public void snapshotTimestampMs(Long snapshotTimestamp) { - this.snapshotTimestampMs = snapshotTimestamp; - } - - public boolean isEmpty() { - return (changeSnapshotId == null && snapshotTimestampMs == null) || equals(EMPTY); - } - - @Override - public int hashCode() { - return Objects.hashCode(changeSnapshotId, snapshotTimestampMs); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("changeSnapshotId", changeSnapshotId) - .add("snapshotTimestamp", snapshotTimestampMs) - .toString(); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o 
== null || getClass() != o.getClass()) { - return false; - } - MixedFormatEnumeratorOffset other = (MixedFormatEnumeratorOffset) o; - return Objects.equal(changeSnapshotId, other.changeSnapshotId()) - && Objects.equal(snapshotTimestampMs, other.snapshotTimestampMs()); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatEnumeratorOffsetSerializer.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatEnumeratorOffsetSerializer.java deleted file mode 100644 index 1692dbff4b..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatEnumeratorOffsetSerializer.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; - -import java.io.IOException; - -/** - * Serializer that serializes and deserializes mixed-format enumerator {@link - * MixedFormatEnumeratorOffset}. - */ -class MixedFormatEnumeratorOffsetSerializer - implements SimpleVersionedSerializer { - public static final MixedFormatEnumeratorOffsetSerializer INSTANCE = - new MixedFormatEnumeratorOffsetSerializer(); - - private static final int VERSION = 1; - - private static final ThreadLocal SERIALIZER_CACHE = - ThreadLocal.withInitial(() -> new DataOutputSerializer(128)); - - @Override - public int getVersion() { - return VERSION; - } - - @Override - public byte[] serialize(MixedFormatEnumeratorOffset position) throws IOException { - return serializeV1(position); - } - - @Override - public MixedFormatEnumeratorOffset deserialize(int version, byte[] serialized) - throws IOException { - switch (version) { - case 1: - return deserializeV1(serialized); - default: - throw new IOException("Unknown version: " + version); - } - } - - private byte[] serializeV1(MixedFormatEnumeratorOffset position) throws IOException { - DataOutputSerializer out = SERIALIZER_CACHE.get(); - out.writeBoolean(position.changeSnapshotId() != null); - if (position.changeSnapshotId() != null) { - out.writeLong(position.changeSnapshotId()); - } - out.writeBoolean(position.snapshotTimestampMs() != null); - if (position.snapshotTimestampMs() != null) { - out.writeLong(position.snapshotTimestampMs()); - } - byte[] result = out.getCopyOfBuffer(); - out.clear(); - return result; - } - - private MixedFormatEnumeratorOffset deserializeV1(byte[] serialized) throws IOException { - DataInputDeserializer in = new DataInputDeserializer(serialized); - Long snapshotId = null; - if (in.readBoolean()) { - snapshotId = 
in.readLong(); - } - - Long snapshotTimestampMs = null; - if (in.readBoolean()) { - snapshotTimestampMs = in.readLong(); - } - - return MixedFormatEnumeratorOffset.of(snapshotId, snapshotTimestampMs); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumState.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumState.java deleted file mode 100644 index 181c15dd9c..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumState.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; -import org.apache.amoro.flink.read.hybrid.split.TemporalJoinSplits; - -import javax.annotation.Nullable; - -import java.util.Collection; - -/** - * State that contains pending mixed-format splits and last enumerator offset in mixed-format source - * enumerator {@link MixedFormatSourceEnumerator}. - */ -public class MixedFormatSourceEnumState { - @Nullable private final MixedFormatEnumeratorOffset lastEnumeratedOffset; - private final Collection pendingSplits; - @Nullable private final long[] shuffleSplitRelation; - @Nullable private final TemporalJoinSplits temporalJoinSplits; - - public MixedFormatSourceEnumState( - Collection pendingSplits, - @Nullable MixedFormatEnumeratorOffset lastEnumeratedOffset, - @Nullable long[] shuffleSplitRelation, - @Nullable TemporalJoinSplits temporalJoinSplits) { - this.pendingSplits = pendingSplits; - this.lastEnumeratedOffset = lastEnumeratedOffset; - this.shuffleSplitRelation = shuffleSplitRelation; - this.temporalJoinSplits = temporalJoinSplits; - } - - @Nullable - public MixedFormatEnumeratorOffset lastEnumeratedOffset() { - return lastEnumeratedOffset; - } - - public Collection pendingSplits() { - return pendingSplits; - } - - @Nullable - public long[] shuffleSplitRelation() { - return shuffleSplitRelation; - } - - @Nullable - public TemporalJoinSplits temporalJoinSplits() { - return temporalJoinSplits; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumStateSerializer.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumStateSerializer.java deleted file mode 100644 index c28cc7de65..0000000000 --- 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumStateSerializer.java +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitSerializer; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; -import org.apache.amoro.flink.read.hybrid.split.TemporalJoinSplits; -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.flink.util.InstantiationUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Collection; -import java.util.Objects; - -/** - * Serializer that serializes and deserializes mixed-format enumerator {@link - * MixedFormatSourceEnumState}. 
- */ -public class MixedFormatSourceEnumStateSerializer - implements SimpleVersionedSerializer { - - private static final Logger LOGGER = - LoggerFactory.getLogger(MixedFormatSourceEnumStateSerializer.class); - private static final int VERSION = 1; - private final MixedFormatSplitSerializer splitSerializer = MixedFormatSplitSerializer.INSTANCE; - private final MixedFormatEnumeratorOffsetSerializer offsetSerializer = - MixedFormatEnumeratorOffsetSerializer.INSTANCE; - - private static final ThreadLocal SERIALIZER_CACHE = - ThreadLocal.withInitial(() -> new DataOutputSerializer(1024)); - - @Override - public int getVersion() { - return VERSION; - } - - @Override - public byte[] serialize(MixedFormatSourceEnumState mixedFormatSourceEnumState) - throws IOException { - return serializeV1(mixedFormatSourceEnumState); - } - - private byte[] serializeV1(MixedFormatSourceEnumState enumState) throws IOException { - DataOutputSerializer out = SERIALIZER_CACHE.get(); - - out.writeBoolean(enumState.lastEnumeratedOffset() != null); - if (enumState.lastEnumeratedOffset() != null) { - out.writeInt(offsetSerializer.getVersion()); - byte[] positionBytes = offsetSerializer.serialize(enumState.lastEnumeratedOffset()); - out.writeInt(positionBytes.length); - out.write(positionBytes); - } - - out.writeInt(splitSerializer.getVersion()); - out.writeInt(enumState.pendingSplits().size()); - for (MixedFormatSplitState splitState : enumState.pendingSplits()) { - byte[] splitBytes = splitSerializer.serialize(splitState.toSourceSplit()); - out.writeInt(splitBytes.length); - out.write(splitBytes); - } - - out.writeBoolean(enumState.shuffleSplitRelation() != null); - if (enumState.shuffleSplitRelation() != null) { - long[] shuffleSplitRelation = enumState.shuffleSplitRelation(); - out.writeInt(Objects.requireNonNull(shuffleSplitRelation).length); - for (long l : shuffleSplitRelation) { - out.writeLong(l); - } - } - - out.writeBoolean(enumState.temporalJoinSplits() != null); - if 
(enumState.temporalJoinSplits() != null) { - byte[] temporalJoinSplits = InstantiationUtil.serializeObject(enumState.temporalJoinSplits()); - out.writeInt(temporalJoinSplits.length); - out.write(temporalJoinSplits); - } - - byte[] result = out.getCopyOfBuffer(); - out.clear(); - return result; - } - - @Override - public MixedFormatSourceEnumState deserialize(int version, byte[] serialized) throws IOException { - switch (version) { - case 1: - return deserializeV1(serialized); - default: - throw new IOException("Unknown version: " + version); - } - } - - private MixedFormatSourceEnumState deserializeV1(byte[] serialized) throws IOException { - DataInputDeserializer in = new DataInputDeserializer(serialized); - - MixedFormatEnumeratorOffset enumeratorOffset = null; - if (in.readBoolean()) { - int version = in.readInt(); - byte[] positionBytes = new byte[in.readInt()]; - in.read(positionBytes); - enumeratorOffset = offsetSerializer.deserialize(version, positionBytes); - } - - int splitSerializerVersion = in.readInt(); - int splitCount = in.readInt(); - Collection pendingSplits = Lists.newArrayListWithCapacity(splitCount); - for (int i = 0; i < splitCount; ++i) { - byte[] splitBytes = new byte[in.readInt()]; - in.read(splitBytes); - MixedFormatSplit split = splitSerializer.deserialize(splitSerializerVersion, splitBytes); - pendingSplits.add(new MixedFormatSplitState(split)); - } - - long[] shuffleSplitRelation = null; - if (in.readBoolean()) { - int length = in.readInt(); - shuffleSplitRelation = new long[length]; - for (int i = 0; i < length; i++) { - shuffleSplitRelation[i] = in.readLong(); - } - } - - TemporalJoinSplits temporalJoinSplits = null; - if (in.readBoolean()) { - byte[] bytes = new byte[in.readInt()]; - in.read(bytes); - try { - temporalJoinSplits = - InstantiationUtil.deserializeObject(bytes, TemporalJoinSplits.class.getClassLoader()); - } catch (ClassNotFoundException e) { - throw new RuntimeException("deserialize FirstSplit error", e); - } - } - - 
return new MixedFormatSourceEnumState( - pendingSplits, enumeratorOffset, shuffleSplitRelation, temporalJoinSplits); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumerator.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumerator.java deleted file mode 100644 index 92929d2a94..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/MixedFormatSourceEnumerator.java +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import static org.apache.amoro.flink.read.hybrid.enumerator.MixedFormatEnumeratorOffset.EARLIEST_SNAPSHOT_ID; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE_LATEST; -import static org.apache.amoro.flink.util.MixedFormatUtils.loadMixedTable; - -import org.apache.amoro.flink.read.hybrid.assigner.ShuffleSplitAssigner; -import org.apache.amoro.flink.read.hybrid.assigner.SplitAssigner; -import org.apache.amoro.flink.read.hybrid.reader.HybridSplitReader; -import org.apache.amoro.flink.read.hybrid.reader.MixedFormatSourceReader; -import org.apache.amoro.flink.read.hybrid.reader.ReaderStartedEvent; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.hybrid.split.SplitRequestEvent; -import org.apache.amoro.flink.read.hybrid.split.TemporalJoinSplits; -import org.apache.amoro.flink.read.source.MixedFormatScanContext; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.table.KeyedTable; -import org.apache.flink.api.connector.source.SourceEvent; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.flink.util.FlinkRuntimeException; -import org.apache.iceberg.Snapshot; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; - -import java.io.IOException; -import java.util.Collection; -import java.util.Optional; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicReference; -import java.util.stream.IntStream; - -/** - * Enumerator for mixed-format source, assign {@link MixedFormatSplit} to mixed-format source reader - * {@link HybridSplitReader} - */ -public class MixedFormatSourceEnumerator extends AbstractMixedFormatEnumerator { - private static final Logger LOG = 
LoggerFactory.getLogger(MixedFormatSourceEnumerator.class); - private transient KeyedTable keyedTable; - /** - * To record the snapshotId at the first planSplits. - * - *

If its value is null, it means that we don't need to generate watermark. Won't check. - */ - private transient volatile TemporalJoinSplits temporalJoinSplits = null; - - private final MixedFormatTableLoader loader; - private final SplitEnumeratorContext context; - private final ContinuousSplitPlanner continuousSplitPlanner; - private final SplitAssigner splitAssigner; - private final MixedFormatScanContext scanContext; - private final long snapshotDiscoveryIntervalMs; - /** - * If true, using mixed-format table as build table. {@link MixedFormatSourceEnumerator} will - * notify {@link MixedFormatSourceReader} after MixedFormatReaders have finished reading all - * {@link TemporalJoinSplits}. Then {@link MixedFormatSourceReader} will emit a Watermark values - * Long.MAX_VALUE. Advancing TemporalJoinOperator's watermark can trigger the join operation and - * push the results to downstream. The watermark of Long.MAX_VALUE avoids affecting the watermark - * defined by user arbitrary probe side - */ - private final boolean dimTable; - - private volatile boolean sourceEventBeforeFirstPlan = false; - /** - * snapshotId for the last enumerated snapshot. next incremental enumeration should be based off - * this as the starting position. 
- */ - private final AtomicReference enumeratorPosition; - - private final AtomicBoolean lock = new AtomicBoolean(false); - - public MixedFormatSourceEnumerator( - SplitEnumeratorContext enumContext, - SplitAssigner splitAssigner, - MixedFormatTableLoader loader, - MixedFormatScanContext scanContext, - @Nullable MixedFormatSourceEnumState enumState, - boolean dimTable) { - super(enumContext, splitAssigner); - this.loader = loader; - this.context = enumContext; - this.splitAssigner = splitAssigner; - this.scanContext = scanContext; - this.continuousSplitPlanner = new ContinuousSplitPlannerImpl(loader); - this.snapshotDiscoveryIntervalMs = scanContext.monitorInterval().toMillis(); - this.enumeratorPosition = new AtomicReference<>(); - if (enumState != null) { - this.enumeratorPosition.set(enumState.lastEnumeratedOffset()); - this.temporalJoinSplits = enumState.temporalJoinSplits(); - } - this.dimTable = dimTable; - LOG.info("dimTable: {}", dimTable); - } - - @Override - public void start() { - if (keyedTable == null) { - keyedTable = loadMixedTable(loader).asKeyedTable(); - } - if (enumeratorPosition.get() == null - && SCAN_STARTUP_MODE_LATEST.equalsIgnoreCase(scanContext.scanStartupMode())) { - keyedTable.refresh(); - Snapshot snapshot = keyedTable.changeTable().currentSnapshot(); - long snapshotId = snapshot == null ? 
EARLIEST_SNAPSHOT_ID : snapshot.snapshotId(); - enumeratorPosition.set(MixedFormatEnumeratorOffset.of(snapshotId, null)); - LOG.info( - "{} is {}, the current snapshot id of the change table {} is {}.", - SCAN_STARTUP_MODE.key(), - SCAN_STARTUP_MODE_LATEST, - keyedTable.id(), - snapshotId); - } - if (snapshotDiscoveryIntervalMs > 0) { - LOG.info( - "Starting the MixedFormatSourceEnumerator with mixed-format table {} snapshot discovery interval of {} ms.", - keyedTable, - snapshotDiscoveryIntervalMs); - context.callAsync( - this::planSplits, this::handleResultOfSplits, 0, snapshotDiscoveryIntervalMs); - } - } - - private ContinuousEnumerationResult planSplits() { - ContinuousEnumerationResult result = doPlanSplits(); - if (dimTable && temporalJoinSplits == null) { - temporalJoinSplits = new TemporalJoinSplits(result.splits(), context.metricGroup()); - // the first SourceEvent may be faster than plan splits - if (result.isEmpty() && sourceEventBeforeFirstPlan) { - notifyReaders(); - } - } - return result; - } - - private ContinuousEnumerationResult doPlanSplits() { - if (lock.get()) { - LOG.info("prefix plan splits thread haven't finished."); - return ContinuousEnumerationResult.EMPTY; - } - lock.set(true); - LOG.info("begin to plan splits current offset {}.", enumeratorPosition.get()); - Optional.ofNullable(scanContext.filters()) - .ifPresent( - filters -> - filters.forEach( - expression -> - LOG.info( - "mixed-format source filter expression: {}.", expression.toString()))); - return continuousSplitPlanner.planSplits(enumeratorPosition.get(), scanContext.filters()); - } - - private void handleResultOfSplits(ContinuousEnumerationResult enumerationResult, Throwable t) { - if (t != null) { - lock.set(false); - throw new FlinkRuntimeException("Failed to scan mixed-format table due to ", t); - } - if (!enumerationResult.isEmpty()) { - splitAssigner.onDiscoveredSplits(enumerationResult.splits()); - } - if (!enumerationResult.toOffset().isEmpty()) { - 
enumeratorPosition.set(enumerationResult.toOffset()); - } - LOG.info( - "handled result of splits, discover splits size {}, latest offset {}.", - enumerationResult.splits().size(), - enumeratorPosition.get()); - lock.set(false); - } - - @Override - public void handleSourceEvent(int subtaskId, SourceEvent sourceEvent) { - super.handleSourceEvent(subtaskId, sourceEvent); - if (sourceEvent instanceof SplitRequestEvent) { - SplitRequestEvent splitRequestEvent = (SplitRequestEvent) sourceEvent; - Collection finishedSplitIds = splitRequestEvent.finishedSplitIds(); - if (dimTable) { - checkAndNotifyReader(finishedSplitIds); - } - } else if (sourceEvent instanceof ReaderStartedEvent) { - if (!dimTable || temporalJoinSplits == null || !temporalJoinSplits.hasNotifiedReader()) { - return; - } - // If tm failover, the reader may not be notified and watermark will not be retrieved in - // reader. - sourceEventBeforeFirstPlan = true; - LOG.info("send InitializationFinishedEvent to reader again."); - context.sendEventToSourceReader(subtaskId, InitializationFinishedEvent.INSTANCE); - } else { - throw new IllegalArgumentException( - String.format( - "Received unknown event from subtask %d: %s", - subtaskId, sourceEvent.getClass().getCanonicalName())); - } - } - - /** - * Check whether all first splits have been finished or not. After all finished, enumerator will - * send a {@link InitializationFinishedEvent} to notify all {@link MixedFormatSourceReader}. 
- * - * @param finishedSplitIds - */ - public void checkAndNotifyReader(Collection finishedSplitIds) { - if (temporalJoinSplits == null) { - sourceEventBeforeFirstPlan = true; - return; - } - - if (temporalJoinSplits.hasNotifiedReader() - || !temporalJoinSplits.removeAndReturnIfAllFinished(finishedSplitIds)) { - return; - } - notifyReaders(); - } - - private void notifyReaders() { - LOG.info("all splits finished, send events to readers"); - IntStream.range(0, context.currentParallelism()) - .forEach(i -> context.sendEventToSourceReader(i, InitializationFinishedEvent.INSTANCE)); - temporalJoinSplits.clear(); - temporalJoinSplits.notifyReader(); - } - - @Override - public MixedFormatSourceEnumState snapshotState(long checkpointId) throws Exception { - long[] shuffleSplitRelation = null; - if (splitAssigner instanceof ShuffleSplitAssigner) { - shuffleSplitRelation = ((ShuffleSplitAssigner) splitAssigner).serializePartitionIndex(); - } - return new MixedFormatSourceEnumState( - splitAssigner.state(), enumeratorPosition.get(), shuffleSplitRelation, temporalJoinSplits); - } - - @Override - public void close() throws IOException { - continuousSplitPlanner.close(); - splitAssigner.close(); - super.close(); - } - - @Override - protected boolean shouldWaitForMoreSplits() { - return true; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/StaticMixedFormatSourceEnumerator.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/StaticMixedFormatSourceEnumerator.java deleted file mode 100644 index 3a4a35039e..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/enumerator/StaticMixedFormatSourceEnumerator.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more 
contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import static org.apache.amoro.flink.util.MixedFormatUtils.loadMixedTable; - -import org.apache.amoro.flink.read.hybrid.assigner.SplitAssigner; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.source.MixedFormatScanContext; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.table.KeyedTable; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.iceberg.flink.source.ScanContext; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; - -import java.util.Collection; - -/** - * This is a static mixed-format source enumerator, used for bounded source scan. Working enabled - * only just {@link ScanContext#STREAMING} is equal to false. 
- */ -public class StaticMixedFormatSourceEnumerator extends AbstractMixedFormatEnumerator { - private static final Logger LOG = - LoggerFactory.getLogger(StaticMixedFormatSourceEnumerator.class); - private final SplitAssigner assigner; - private final MixedFormatTableLoader loader; - private transient KeyedTable keyedTable; - private final MixedFormatScanContext scanContext; - private final boolean shouldEnumerate; - private final ContinuousSplitPlanner splitPlanner; - - public StaticMixedFormatSourceEnumerator( - SplitEnumeratorContext enumeratorContext, - SplitAssigner assigner, - MixedFormatTableLoader loader, - MixedFormatScanContext scanContext, - @Nullable MixedFormatSourceEnumState enumState) { - super(enumeratorContext, assigner); - this.loader = loader; - this.assigner = assigner; - this.scanContext = scanContext; - // split enumeration is not needed during a restore scenario - this.shouldEnumerate = enumState == null; - this.splitPlanner = new MergeOnReadPlannerImpl(loader); - } - - @Override - public void start() { - super.start(); - if (keyedTable == null) { - keyedTable = loadMixedTable(loader).asKeyedTable(); - } - if (shouldEnumerate) { - keyedTable.baseTable().refresh(); - keyedTable.changeTable().refresh(); - Collection splits = - splitPlanner.planSplits(null, scanContext.filters()).splits(); - assigner.onDiscoveredSplits(splits); - LOG.info( - "Discovered {} splits from table {} during job initialization", - splits.size(), - keyedTable.name()); - } - } - - @Override - protected boolean shouldWaitForMoreSplits() { - return false; - } - - @Override - public MixedFormatSourceEnumState snapshotState(long checkpointId) throws Exception { - return new MixedFormatSourceEnumState(assigner.state(), null, null, null); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ArrayBatchRecords.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ArrayBatchRecords.java deleted file mode 100644 index ad51b2da93..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ArrayBatchRecords.java +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.reader; - -import org.apache.amoro.flink.read.source.DataIterator; -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.file.src.util.Pool; -import org.apache.flink.table.data.RowData; - -import javax.annotation.Nullable; - -import java.util.Collections; -import java.util.Set; - -/** - * {@link RecordsWithSplitIds} is used to pass a batch of records from fetcher to source reader. - * Batching is to improve the efficiency for records handover. - * - *

{@link RecordsWithSplitIds} interface can encapsulate batches from multiple splits. This is - * the case for Kafka source where fetchers can retrieve records from multiple Kafka partitions at - * the same time. - * - *

For file-based sources like Iceberg, readers always read one split/file at a time. Hence, we - * will only have a batch of records for one split here. - * - *

This class uses array to store a batch of records from the same file (with the same - * fileOffset). - */ -class ArrayBatchRecords implements RecordsWithSplitIds> { - @Nullable private String splitId; - @Nullable private final Pool.Recycler recycler; - @Nullable private final T[] records; - private final int numberOfRecords; - private final Set finishedSplits; - private final MixedFormatRecordWithOffset recordWithOffset; - - // point to current read position within the records array - private int position; - - private RecordPosition[] recordPositions; - - private ArrayBatchRecords( - @Nullable String splitId, - @Nullable Pool.Recycler recycler, - @Nullable T[] records, - int numberOfRecords, - int fileOffset, - long startingRecordOffset, - Set finishedSplits) { - Preconditions.checkArgument(numberOfRecords >= 0, "numberOfRecords can't be negative"); - Preconditions.checkArgument(fileOffset >= 0, "fileOffset can't be negative"); - Preconditions.checkArgument(startingRecordOffset >= 0, "numberOfRecords can't be negative"); - - this.splitId = splitId; - this.recycler = recycler; - this.records = records; - this.numberOfRecords = numberOfRecords; - this.finishedSplits = - Preconditions.checkNotNull(finishedSplits, "finishedSplits can be empty but not null"); - this.recordWithOffset = new MixedFormatRecordWithOffset<>(); - - this.position = 0; - } - - private ArrayBatchRecords( - @Nullable String splitId, - @Nullable Pool.Recycler recycler, - @Nullable T[] records, - int numberOfRecords, - RecordPosition[] positions, - Set finishedSplits) { - Preconditions.checkArgument(numberOfRecords >= 0, "numberOfRecords can't be negative"); - - this.splitId = splitId; - this.recycler = recycler; - this.records = records; - this.numberOfRecords = numberOfRecords; - this.recordPositions = Preconditions.checkNotNull(positions, "recordPositions can't be null"); - this.finishedSplits = - Preconditions.checkNotNull(finishedSplits, "finishedSplits can be empty but not null"); - 
this.recordWithOffset = new MixedFormatRecordWithOffset<>(); - - this.position = 0; - } - - @Nullable - @Override - public String nextSplit() { - String nextSplit = this.splitId; - // set the splitId to null to indicate no more splits - // this class only contains record for one split - this.splitId = null; - return nextSplit; - } - - @Nullable - @Override - public MixedFormatRecordWithOffset nextRecordFromSplit() { - if (position < numberOfRecords) { - setRecordWithOffset(); - position++; - return recordWithOffset; - } else { - return null; - } - } - - private void setRecordWithOffset() { - assert records != null; - assert recordPositions[position] != null; - RecordPosition offset = recordPositions[position]; - Preconditions.checkArgument( - offset.currentInsertFileOffset() >= 0 || offset.currentDeleteFileOffset() >= 0, - "fileOffset can't be negative"); - Preconditions.checkArgument( - offset.currentInsertRecordOffset() >= 0, "numberOfRecords can't be negative"); - Preconditions.checkArgument( - offset.currentDeleteRecordOffset() >= 0, "numberOfRecords can't be negative"); - recordWithOffset.set( - records[position], - offset.currentInsertFileOffset(), - offset.currentInsertRecordOffset(), - offset.currentDeleteFileOffset(), - offset.currentDeleteRecordOffset()); - } - - /** - * This method is called when all records from this batch has been emitted. If recycler is set, it - * should be called to return the records array back to pool. - */ - @Override - public void recycle() { - if (recycler != null) { - recycler.recycle(records); - } - } - - @Override - public Set finishedSplits() { - return finishedSplits; - } - - @VisibleForTesting - T[] records() { - return records; - } - - @VisibleForTesting - int numberOfRecords() { - return numberOfRecords; - } - - /** - * Create a ArrayBatchRecords backed up an array with records from the same file - * - * @param splitId Iceberg source only read from one split a time. 
We never have multiple records - * from multiple splits. - * @param recycler Because {@link DataIterator} with {@link RowData} returns an iterator of reused - * RowData object, we need to clone RowData eagerly when constructing a batch of records. We - * can use object pool to reuse the RowData array object which can be expensive to create. - * This recycler can be provided to recycle the array object back to pool after read is - * exhausted. If the {@link DataIterator} returns an iterator of non-reused objects, we don't - * need to clone objects. It is cheap to just create the batch array. Hence, we don't need - * object pool and recycler can be set to null. - * @param records an array (maybe reused) holding a batch of records - * @param numberOfRecords actual number of records in the array - * @param positions fileOffset and recordOffset for all records in this batch - * @param record type - */ - public static RecordsWithSplitIds> forRecords( - String splitId, - Pool.Recycler recycler, - T[] records, - int numberOfRecords, - RecordPosition[] positions) { - return new ArrayBatchRecords<>( - splitId, recycler, records, numberOfRecords, positions, Collections.emptySet()); - } - - /** - * Create ab ArrayBatchRecords with only finished split id - * - * @param splitId for the split that is just exhausted - */ - public static ArrayBatchRecords finishedSplit(String splitId) { - return new ArrayBatchRecords<>(null, null, null, 0, 0, 0, Collections.singleton(splitId)); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ArrayPoolDataIteratorBatcher.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ArrayPoolDataIteratorBatcher.java deleted file mode 100644 index 75bcb1abe7..0000000000 --- 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ArrayPoolDataIteratorBatcher.java +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.hybrid.reader; - -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SOURCE_READER_FETCH_BATCH_RECORD_COUNT; - -import org.apache.amoro.flink.read.source.DataIterator; -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.SourceReaderOptions; -import org.apache.flink.connector.file.src.util.Pool; -import org.apache.iceberg.io.CloseableIterator; - -import java.io.IOException; -import java.util.NoSuchElementException; - -/** This implementation stores record batch in array from recyclable pool */ -class ArrayPoolDataIteratorBatcher implements DataIteratorBatcher { - private final int batchSize; - private final int handoverQueueSize; - private final RecordFactory recordFactory; - - private transient Pool pool; - - ArrayPoolDataIteratorBatcher(ReadableConfig config, RecordFactory recordFactory) { - this.batchSize = config.get(SOURCE_READER_FETCH_BATCH_RECORD_COUNT); - this.handoverQueueSize = config.get(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY); - this.recordFactory = recordFactory; - } - - @Override - public CloseableIterator>> batch( - String splitId, DataIterator inputIterator) { - Preconditions.checkArgument(inputIterator != null, "Input data iterator can't be null"); - // lazily create pool as it is not serializable - if (pool == null) { - this.pool = createPoolOfBatches(handoverQueueSize); - } - return new ArrayPoolBatchIterator(splitId, inputIterator, pool); - } - - private Pool createPoolOfBatches(int numBatches) { - Pool poolOfBatches = new Pool<>(numBatches); - for (int batchId = 0; batchId < numBatches; batchId++) { - T[] batch = recordFactory.createBatch(batchSize); - poolOfBatches.add(batch); - } - - return poolOfBatches; - } - - private class ArrayPoolBatchIterator - implements 
CloseableIterator>> { - - private final String splitId; - private final DataIterator inputIterator; - private final Pool pool; - - ArrayPoolBatchIterator(String splitId, DataIterator inputIterator, Pool pool) { - this.splitId = splitId; - this.inputIterator = inputIterator; - this.pool = pool; - } - - @Override - public boolean hasNext() { - return inputIterator.hasNext(); - } - - @Override - public RecordsWithSplitIds> next() { - if (!inputIterator.hasNext()) { - throw new NoSuchElementException(); - } - - T[] batch = getCachedEntry(); - int recordCount = 0; - - RecordPosition[] positions = initPositionArray(); - while (inputIterator.hasNext() && recordCount < batchSize) { - // The record produced by inputIterator can be reused like for the - // MixedFormatRecordWithOffset - // case. - // inputIterator.next() can't be called again until the copy is made - // since the record is not consumed immediately. - T nextRecord = inputIterator.next(); - recordFactory.clone(nextRecord, batch, recordCount); - positions[recordCount].set(inputIterator); - recordCount++; - if (!inputIterator.currentFileHasNext()) { - // break early so that records in the ArrayResultIterator - // have the same fileOffset. 
- break; - } - } - return ArrayBatchRecords.forRecords(splitId, pool.recycler(), batch, recordCount, positions); - } - - @Override - public void close() throws IOException { - inputIterator.close(); - } - - private T[] getCachedEntry() { - try { - return pool.pollEntry(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new RuntimeException("Interrupted while waiting for array pool entry", e); - } - } - - private RecordPosition[] initPositionArray() { - RecordPosition[] positions = new RecordPosition[batchSize]; - for (int i = 0; i < batchSize; i++) { - positions[i] = new RecordPosition(); - } - return positions; - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/DataIteratorBatcher.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/DataIteratorBatcher.java deleted file mode 100644 index 1119f6a67d..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/DataIteratorBatcher.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.reader; - -import org.apache.amoro.flink.read.source.DataIterator; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; -import org.apache.iceberg.io.CloseableIterator; - -import java.io.Serializable; - -/** - * Batcher converts iterator of T into iterator of batched {@code - * RecordsWithSplitIds>}, as FLIP-27's {@link SplitReader#fetch()} returns - * batched records. - */ -@FunctionalInterface -public interface DataIteratorBatcher extends Serializable { - CloseableIterator>> batch( - String splitId, DataIterator inputIterator); -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/DataIteratorReaderFunction.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/DataIteratorReaderFunction.java deleted file mode 100644 index 273d2b539f..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/DataIteratorReaderFunction.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.reader; - -import org.apache.amoro.flink.read.hybrid.split.ChangelogSplit; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.source.ChangeLogDataIterator; -import org.apache.amoro.flink.read.source.DataIterator; -import org.apache.amoro.flink.read.source.MergeOnReadDataIterator; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.iceberg.io.CloseableIterator; - -/** A {@link ReaderFunction} implementation that uses {@link DataIterator}. 
*/ -public abstract class DataIteratorReaderFunction implements ReaderFunction { - private final DataIteratorBatcher batcher; - - public DataIteratorReaderFunction(DataIteratorBatcher batcher) { - this.batcher = batcher; - } - - public abstract DataIterator createDataIterator(MixedFormatSplit split); - - @Override - public CloseableIterator>> apply( - MixedFormatSplit split) { - DataIterator inputIterator = createDataIterator(split); - if (inputIterator instanceof MergeOnReadDataIterator) { - inputIterator.seek(0, split.asMergeOnReadSplit().recordOffset()); - } else if (inputIterator instanceof ChangeLogDataIterator) { - ChangeLogDataIterator changelogInputIterator = (ChangeLogDataIterator) inputIterator; - ChangelogSplit changelogSplit = split.asChangelogSplit(); - changelogInputIterator.seek( - changelogSplit.insertFileOffset(), - changelogSplit.deleteFileOffset(), - changelogSplit.insertRecordOffset(), - changelogSplit.deleteRecordOffset()); - } else { - inputIterator.seek( - split.asSnapshotSplit().insertFileOffset(), split.asSnapshotSplit().insertRecordOffset()); - } - return batcher.batch(split.splitId(), inputIterator); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/HybridSplitReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/HybridSplitReader.java deleted file mode 100644 index 1b0073e1a5..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/HybridSplitReader.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.reader; - -import org.apache.amoro.flink.read.hybrid.split.ChangelogSplit; -import org.apache.amoro.flink.read.hybrid.split.MergeOnReadSplit; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.hybrid.split.SnapshotSplit; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.connector.base.source.reader.RecordsBySplits; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; -import org.apache.flink.connector.base.source.reader.splitreader.SplitsAddition; -import org.apache.flink.connector.base.source.reader.splitreader.SplitsChange; -import org.apache.iceberg.io.CloseableIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.ArrayDeque; -import java.util.Collections; -import java.util.Queue; - -/** - * A hybrid source split reader that could read {@link SnapshotSplit} and {@link ChangelogSplit}. 
- */ -public class HybridSplitReader - implements SplitReader, MixedFormatSplit> { - private static final Logger LOG = LoggerFactory.getLogger(HybridSplitReader.class); - - private final ReaderFunction openSplitFunction; - private final int indexOfSubtask; - private final Queue splits; - - private CloseableIterator>> currentReader; - private String currentSplitId; - - public HybridSplitReader(ReaderFunction openSplitFunction, SourceReaderContext context) { - this.openSplitFunction = openSplitFunction; - this.indexOfSubtask = context.getIndexOfSubtask(); - this.splits = new ArrayDeque<>(); - } - - @Override - public RecordsWithSplitIds> fetch() throws IOException { - if (currentReader == null) { - if (splits.isEmpty()) { - return new RecordsBySplits<>(Collections.emptyMap(), Collections.emptySet()); - } - MixedFormatSplit mixedFormatSplit = splits.poll(); - currentReader = openSplitFunction.apply(mixedFormatSplit); - currentSplitId = mixedFormatSplit.splitId(); - } - if (currentReader.hasNext()) { - // Because Iterator#next() doesn't support checked exception, - // we need to wrap and unwrap the checked IOException with UncheckedIOException - try { - return currentReader.next(); - } catch (UncheckedIOException e) { - throw e.getCause(); - } - } else { - return finishSplit(); - } - } - - @Override - public void handleSplitsChanges(SplitsChange splitsChange) { - if (!(splitsChange instanceof SplitsAddition)) { - throw new UnsupportedOperationException( - String.format("The SplitChange type of %s is not supported.", splitsChange.getClass())); - } - LOG.info("Handling a split change {}.", splitsChange); - - splitsChange - .splits() - .forEach( - mixedFormatSplit -> { - if (mixedFormatSplit instanceof SnapshotSplit - || mixedFormatSplit instanceof ChangelogSplit - || mixedFormatSplit instanceof MergeOnReadSplit) { - splits.add(mixedFormatSplit); - } else { - throw new IllegalArgumentException( - String.format( - "As of now, The %s of SourceSplit type is unsupported, 
available source splits are %s, %s.", - mixedFormatSplit.getClass().getSimpleName(), - SnapshotSplit.class.getSimpleName(), - ChangelogSplit.class.getSimpleName())); - } - }); - } - - @Override - public void wakeUp() {} - - @Override - public void close() throws Exception { - currentSplitId = null; - if (currentReader != null) { - currentReader.close(); - } - } - - private RecordsWithSplitIds> finishSplit() throws IOException { - if (currentReader != null) { - currentReader.close(); - currentReader = null; - } - ArrayBatchRecords finishRecords = ArrayBatchRecords.finishedSplit(currentSplitId); - LOG.info("Split reader {} finished split: {}", indexOfSubtask, currentSplitId); - currentSplitId = null; - return finishRecords; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatRecordEmitter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatRecordEmitter.java deleted file mode 100644 index 1c29b51874..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatRecordEmitter.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.reader; - -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; -import org.apache.flink.api.connector.source.SourceOutput; -import org.apache.flink.connector.base.source.reader.RecordEmitter; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.data.utils.JoinedRowData; -import org.apache.flink.util.Preconditions; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Emitter that emit {@link T} to the next flink operator and update the record offset of {@link T} - * into split state. - */ -public class MixedFormatRecordEmitter - implements RecordEmitter, T, MixedFormatSplitState> { - - public static final Logger LOGGER = LoggerFactory.getLogger(MixedFormatRecordEmitter.class); - - /** It signifies whether the Long.MIN_VALUE need to be set into RowData. 
*/ - public boolean populateRowTime; - - public MixedFormatRecordEmitter(boolean populateRowTime) { - this.populateRowTime = populateRowTime; - } - - @Override - public void emitRecord( - MixedFormatRecordWithOffset element, - SourceOutput sourceOutput, - MixedFormatSplitState split) - throws Exception { - T record = element.record(); - if (!populateRowTime) { - sourceOutput.collect(record); - } else { - Preconditions.checkArgument( - record instanceof RowData, - "Custom watermark strategy doesn't support %s, except RowData for now.", - record.getClass()); - RowData rowData = - new JoinedRowData( - (RowData) record, GenericRowData.of(TimestampData.fromEpochMillis(Long.MIN_VALUE))); - rowData.setRowKind(((RowData) record).getRowKind()); - sourceOutput.collect((T) rowData); - } - split.updateOffset( - new Object[] { - element.insertFileOffset(), - element.insertRecordOffset(), - element.deleteFileOffset(), - element.deleteRecordOffset() - }); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatRecordWithOffset.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatRecordWithOffset.java deleted file mode 100644 index dff5dbe3e8..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatRecordWithOffset.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.reader; - -/** A record along with the reader position to be stored in the checkpoint. */ -public class MixedFormatRecordWithOffset { - private T record; - - private int insertFileOffset; - private long insertRecordOffset; - private int deleteFileOffset; - private long deleteRecordOffset; - - public T record() { - return record; - } - - public void record(T record) { - this.record = record; - } - - public int insertFileOffset() { - return insertFileOffset; - } - - public long insertRecordOffset() { - return insertRecordOffset; - } - - public int deleteFileOffset() { - return deleteFileOffset; - } - - public long deleteRecordOffset() { - return deleteRecordOffset; - } - - public void set( - T newRecord, - int insertFileOffset, - long insertRecordOffset, - int deleteFileOffset, - long deleteRecordOffset) { - this.record = newRecord; - this.insertFileOffset = insertFileOffset; - this.deleteFileOffset = deleteFileOffset; - this.insertRecordOffset = insertRecordOffset; - this.deleteRecordOffset = deleteRecordOffset; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatSourceReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatSourceReader.java deleted file mode 100644 index caa7b6f837..0000000000 --- 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/MixedFormatSourceReader.java +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.reader; - -import org.apache.amoro.flink.read.MixedFormatSource; -import org.apache.amoro.flink.read.hybrid.enumerator.InitializationFinishedEvent; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; -import org.apache.amoro.flink.read.hybrid.split.SplitRequestEvent; -import org.apache.amoro.flink.util.FlinkClassReflectionUtil; -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.flink.api.common.eventtime.Watermark; -import org.apache.flink.api.common.eventtime.WatermarkOutputMultiplexer; -import org.apache.flink.api.connector.source.ReaderOutput; -import org.apache.flink.api.connector.source.SourceEvent; -import org.apache.flink.api.connector.source.SourceOutput; -import org.apache.flink.api.connector.source.SourceReaderContext; -import 
org.apache.flink.configuration.Configuration; -import org.apache.flink.connector.base.source.reader.SingleThreadMultiplexSourceReaderBase; -import org.apache.flink.core.io.InputStatus; -import org.apache.flink.streaming.api.operators.source.ProgressiveTimestampsAndWatermarks; -import org.apache.flink.streaming.api.operators.source.SourceOutputWithWatermarks; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Collection; -import java.util.Collections; -import java.util.Map; - -/** - * Mixed-format source reader that is created by a {@link - * MixedFormatSource#createReader(SourceReaderContext)}. - */ -public class MixedFormatSourceReader - extends SingleThreadMultiplexSourceReaderBase< - MixedFormatRecordWithOffset, T, MixedFormatSplit, MixedFormatSplitState> { - - public static final Logger LOGGER = LoggerFactory.getLogger(MixedFormatSourceReader.class); - - public ReaderOutput output; - /** SourceEvents may be received before this#pollNext. */ - private volatile boolean maxWatermarkToBeEmitted = false; - - public MixedFormatSourceReader( - ReaderFunction readerFunction, - Configuration config, - SourceReaderContext context, - boolean populateRowTime) { - super( - () -> new HybridSplitReader<>(readerFunction, context), - new MixedFormatRecordEmitter(populateRowTime), - config, - context); - } - - @Override - public void start() { - // We request a split only if we did not get splits during the checkpoint restore. - // Otherwise, reader restarts will keep requesting more and more splits. 
- if (getNumberOfCurrentlyAssignedSplits() == 0) { - requestSplit(Collections.emptyList()); - } - context.sendSourceEventToCoordinator(ReaderStartedEvent.INSTANCE); - } - - @Override - protected void onSplitFinished(Map finishedSplitIds) { - requestSplit(Lists.newArrayList(finishedSplitIds.keySet())); - } - - @Override - protected MixedFormatSplitState initializedState(MixedFormatSplit split) { - return new MixedFormatSplitState(split); - } - - @Override - protected MixedFormatSplit toSplitType(String splitId, MixedFormatSplitState splitState) { - return splitState.toSourceSplit(); - } - - private void requestSplit(Collection finishedSplitIds) { - context.sendSourceEventToCoordinator(new SplitRequestEvent(finishedSplitIds)); - } - - @Override - public void handleSourceEvents(SourceEvent sourceEvent) { - if (!(sourceEvent instanceof InitializationFinishedEvent)) { - return; - } - LOGGER.info("receive InitializationFinishedEvent"); - maxWatermarkToBeEmitted = true; - emitWatermarkIfNeeded(); - } - - private void emitWatermarkIfNeeded() { - if (this.output == null || !maxWatermarkToBeEmitted) { - return; - } - LOGGER.info("emit watermark"); - output.emitWatermark(new Watermark(Long.MAX_VALUE)); - maxWatermarkToBeEmitted = false; - } - - @Override - public InputStatus pollNext(ReaderOutput output) throws Exception { - this.output = output; - emitWatermarkIfNeeded(); - return super.pollNext(wrapOutput(output)); - } - - public ReaderOutput wrapOutput(ReaderOutput output) { - if (!(output instanceof SourceOutputWithWatermarks)) { - return output; - } - return new MixedFormatReaderOutput<>(output); - } - - /** - * There is a case that the watermark in {@link WatermarkOutputMultiplexer.OutputState} has been - * updated, but watermark has not been emitted for that when {@link - * WatermarkOutputMultiplexer#onPeriodicEmit} called, the outputState has been removed by {@link - * WatermarkOutputMultiplexer#unregisterOutput(String)} after split finished. 
Wrap {@link - * ReaderOutput} to call {@link - * ProgressiveTimestampsAndWatermarks.SplitLocalOutputs#emitPeriodicWatermark()} when split - * finishes. - */ - static class MixedFormatReaderOutput implements ReaderOutput { - - private final ReaderOutput internal; - - public MixedFormatReaderOutput(ReaderOutput readerOutput) { - Preconditions.checkArgument( - readerOutput instanceof SourceOutputWithWatermarks, - "readerOutput should be SourceOutputWithWatermarks, but was %s", - readerOutput.getClass()); - this.internal = readerOutput; - } - - @Override - public void collect(T record) { - internal.collect(record); - } - - @Override - public void collect(T record, long timestamp) { - internal.collect(record, timestamp); - } - - @Override - public void emitWatermark(Watermark watermark) { - internal.emitWatermark(watermark); - } - - @Override - public void markIdle() { - internal.markIdle(); - } - - @Override - public void markActive() { - internal.markActive(); - } - - @Override - public SourceOutput createOutputForSplit(String splitId) { - return internal.createOutputForSplit(splitId); - } - - @Override - public void releaseOutputForSplit(String splitId) { - Object splitLocalOutput = FlinkClassReflectionUtil.getSplitLocalOutput(internal); - FlinkClassReflectionUtil.emitPeriodWatermark(splitLocalOutput); - internal.releaseOutputForSplit(splitId); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ReaderFunction.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ReaderFunction.java deleted file mode 100644 index 23266f9391..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ReaderFunction.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more 
contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.reader; - -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.iceberg.io.CloseableIterator; - -import java.io.Serializable; -import java.util.function.Function; - -/** - * This function that accepts one {@link MixedFormatSplit} and produces an iterator of {@link - * MixedFormatRecordWithOffset }. - */ -@FunctionalInterface -public interface ReaderFunction - extends Serializable, - Function< - MixedFormatSplit, - CloseableIterator>>> {} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ReaderStartedEvent.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ReaderStartedEvent.java deleted file mode 100644 index baa3e41527..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/ReaderStartedEvent.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.reader; - -import org.apache.flink.api.connector.source.SourceEvent; - -/** It denotes {@link MixedFormatSourceReader} is starting. */ -public class ReaderStartedEvent implements SourceEvent { - private static final long serialVersionUID = 1L; - - public static final ReaderStartedEvent INSTANCE = new ReaderStartedEvent(); -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RecordFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RecordFactory.java deleted file mode 100644 index 44b8c2c93a..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RecordFactory.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.reader; - -import java.io.Serializable; - -/** - * In FLIP-27 source, SplitReader#fetch() returns a batch of records. Since DataIterator for RowData - * returns an iterator of reused RowData objects, RecordFactory is needed to (1) create object array - * that is recyclable via pool. (2) clone RowData element from DataIterator to the batch array. - */ -interface RecordFactory extends Serializable { - /** Create a batch of records */ - T[] createBatch(int batchSize); - - /** Clone record into the specified position of the batch array */ - void clone(T from, T[] batch, int position); -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RecordPosition.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RecordPosition.java deleted file mode 100644 index ca8d8b4617..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RecordPosition.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.reader; - -import org.apache.amoro.flink.read.source.ChangeLogDataIterator; -import org.apache.amoro.flink.read.source.DataIterator; - -/** This class contains the file offset and record offset with actual record. */ -public class RecordPosition { - private int currentInsertFileOffset; - private int currentDeleteFileOffset; - private long currentInsertRecordOffset; - private long currentDeleteRecordOffset; - - public RecordPosition() {} - - void set(DataIterator dataIterator) { - if (dataIterator instanceof ChangeLogDataIterator) { - ChangeLogDataIterator changelog = (ChangeLogDataIterator) dataIterator; - currentInsertFileOffset = changelog.insertFileOffset(); - currentInsertRecordOffset = changelog.insertRecordOffset(); - currentDeleteFileOffset = changelog.deleteFileOffset(); - currentDeleteRecordOffset = changelog.deleteRecordOffset(); - } else { - currentInsertFileOffset = dataIterator.fileOffset(); - currentInsertRecordOffset = dataIterator.recordOffset(); - } - } - - public int currentInsertFileOffset() { - return currentInsertFileOffset; - } - - public int currentDeleteFileOffset() { - return currentDeleteFileOffset; - } - - public long currentInsertRecordOffset() { - return currentInsertRecordOffset; - } - - public long currentDeleteRecordOffset() { - return currentDeleteRecordOffset; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataReaderFunction.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataReaderFunction.java deleted file mode 100644 index ce49544e0d..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataReaderFunction.java +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.hybrid.reader; - -import static org.apache.amoro.flink.shuffle.RowKindUtil.convertToFlinkRowKind; -import static org.apache.amoro.utils.SchemaUtil.changeWriteSchema; -import static org.apache.amoro.utils.SchemaUtil.fillUpIdentifierFields; - -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.source.ChangeLogDataIterator; -import org.apache.amoro.flink.read.source.DataIterator; -import org.apache.amoro.flink.read.source.FileScanTaskReader; -import org.apache.amoro.flink.read.source.FlinkKeyedMORDataReader; -import org.apache.amoro.flink.read.source.FlinkUnkyedDataReader; -import org.apache.amoro.flink.read.source.MergeOnReadDataIterator; -import org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.io.AuthenticatedFileIO; -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.amoro.table.PrimaryKeySpec; -import org.apache.amoro.utils.NodeFilter; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.data.RowDataUtil; - -import java.util.Collections; - -/** - * This Function accept a {@link MixedFormatSplit} and produces an {@link DataIterator} of {@link - * RowData}. 
- */ -public class RowDataReaderFunction extends DataIteratorReaderFunction { - private static final long serialVersionUID = 1446614576495721883L; - private final Schema tableSchema; - private final Schema readSchema; - private final String nameMapping; - private final boolean caseSensitive; - private final AuthenticatedFileIO io; - private final PrimaryKeySpec primaryKeySpec; - /** The accurate selected columns size if the mixed-format source projected */ - private final int columnSize; - /** - * The index of the mixed-format file offset field in the read schema Refer to {@link - * this#wrapFileOffsetColumnMeta} - */ - private final int fileOffsetIndex; - - private final boolean reuse; - - public RowDataReaderFunction( - ReadableConfig config, - Schema tableSchema, - Schema projectedSchema, - PrimaryKeySpec primaryKeySpec, - String nameMapping, - boolean caseSensitive, - AuthenticatedFileIO io) { - this( - config, - tableSchema, - projectedSchema, - primaryKeySpec, - nameMapping, - caseSensitive, - io, - false); - } - - public RowDataReaderFunction( - ReadableConfig config, - Schema tableSchema, - Schema projectedSchema, - PrimaryKeySpec primaryKeySpec, - String nameMapping, - boolean caseSensitive, - AuthenticatedFileIO io, - boolean reuse) { - super( - new ArrayPoolDataIteratorBatcher<>( - config, - new RowDataRecordFactory( - FlinkSchemaUtil.convert(readSchema(tableSchema, projectedSchema))))); - this.tableSchema = tableSchema; - this.readSchema = fillUpReadSchema(tableSchema, projectedSchema, primaryKeySpec); - this.primaryKeySpec = primaryKeySpec; - this.nameMapping = nameMapping; - this.caseSensitive = caseSensitive; - this.io = io; - // Add file offset column after readSchema. - this.fileOffsetIndex = readSchema.columns().size(); - this.columnSize = - projectedSchema == null ? 
readSchema.columns().size() : projectedSchema.columns().size(); - this.reuse = reuse; - } - - @Override - public DataIterator createDataIterator(MixedFormatSplit split) { - if (split.isMergeOnReadSplit()) { - FlinkKeyedMORDataReader morDataReader = - new FlinkKeyedMORDataReader( - io, - tableSchema, - readSchema, - primaryKeySpec, - nameMapping, - caseSensitive, - RowDataUtil::convertConstant, - reuse); - return new MergeOnReadDataIterator( - morDataReader, split.asMergeOnReadSplit().keyedTableScanTask(), io); - } else if (split.isSnapshotSplit()) { - FileScanTaskReader rowDataReader = - new FlinkUnkyedDataReader( - io, - tableSchema, - readSchema, - primaryKeySpec, - nameMapping, - caseSensitive, - RowDataUtil::convertConstant, - Collections.singleton(split.dataTreeNode()), - reuse); - return new DataIterator<>( - rowDataReader, - split.asSnapshotSplit().insertTasks(), - rowData -> Long.MIN_VALUE, - this::removeMixedFormatMetaColumn); - } else if (split.isChangelogSplit()) { - FileScanTaskReader rowDataReader = - new FlinkUnkyedDataReader( - io, - wrapFileOffsetColumnMeta(tableSchema), - wrapFileOffsetColumnMeta(readSchema), - primaryKeySpec, - nameMapping, - caseSensitive, - RowDataUtil::convertConstant, - Collections.singleton(split.dataTreeNode()), - reuse); - return new ChangeLogDataIterator<>( - rowDataReader, - split.asChangelogSplit().insertTasks(), - split.asChangelogSplit().deleteTasks(), - this::mixedFormatFileOffset, - this::removeMixedFormatMetaColumn, - this::transformRowKind); - } else { - throw new IllegalArgumentException( - String.format( - "As of now this split %s is not supported.", split.getClass().getSimpleName())); - } - } - - private Schema wrapFileOffsetColumnMeta(Schema schema) { - return changeWriteSchema(schema); - } - - long mixedFormatFileOffset(RowData rowData) { - return rowData.getLong(fileOffsetIndex); - } - - /** - * @param rowData It may have more columns than readSchema. Refer to {@link - * FlinkUnkyedDataReader}'s annotation. 
- */ - RowData removeMixedFormatMetaColumn(RowData rowData) { - return MixedFormatUtils.removeMixedFormatMetaColumn(rowData, columnSize); - } - - RowData transformRowKind(ChangeLogDataIterator.ChangeActionTrans trans) { - RowData rowData = trans.row(); - rowData.setRowKind(convertToFlinkRowKind(trans.changeAction())); - return rowData; - } - - /** - * If the projected schema is not null, this method will check and fill up the identifierFields of - * the tableSchema and the projected schema. - * - *

projectedSchema may not include the primary keys, but the {@link NodeFilter} must filter the - * record with the value of the primary keys. So the mixed-format reader function schema must - * include the primary keys. - * - * @param tableSchema table schema - * @param projectedSchema projected schema - * @return a new Schema on which includes the identifier fields. - */ - private static Schema fillUpReadSchema( - Schema tableSchema, Schema projectedSchema, PrimaryKeySpec primaryKeySpec) { - Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); - return projectedSchema == null - ? tableSchema - : fillUpIdentifierFields(tableSchema, projectedSchema, primaryKeySpec); - } - - private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { - Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); - return projectedSchema == null ? tableSchema : projectedSchema; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataRecordFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataRecordFactory.java deleted file mode 100644 index 0e922e50dc..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataRecordFactory.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.reader; - -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.runtime.typeutils.InternalSerializers; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.flink.data.RowDataUtil; - -/** A factory create a batch of empty {@link RowData}s. */ -class RowDataRecordFactory implements RecordFactory { - private final RowType rowType; - private final TypeSerializer[] fieldSerializers; - private final RowData.FieldGetter[] fieldGetters; - - RowDataRecordFactory(RowType rowType) { - this.rowType = rowType; - this.fieldSerializers = createFieldSerializers(rowType); - this.fieldGetters = createFieldGetters(rowType); - } - - static TypeSerializer[] createFieldSerializers(RowType rowType) { - return rowType.getChildren().stream() - .map(InternalSerializers::create) - .toArray(TypeSerializer[]::new); - } - - static RowData.FieldGetter[] createFieldGetters(RowType rowType) { - RowData.FieldGetter[] getters = new RowData.FieldGetter[rowType.getFieldCount()]; - for (int i = 0; i < rowType.getFieldCount(); i++) { - getters[i] = RowData.createFieldGetter(rowType.getTypeAt(i), i); - } - return getters; - } - - @Override - public RowData[] createBatch(int batchSize) { - RowData[] arr = new RowData[batchSize]; - for (int i = 0; i < batchSize; ++i) { - arr[i] = new GenericRowData(rowType.getFieldCount()); - } - return arr; - } - - @Override - public void 
clone(RowData from, RowData[] batch, int position) { - // Set the return value from RowDataUtil.clone back to the array. - // Clone method returns same clone target object (reused) if it is a GenericRowData. - // Clone method will allocate a new GenericRowData object - // if the target object is NOT a GenericRowData. - // So we should always set the clone return value back to the array. - batch[position] = - RowDataUtil.clone(from, batch[position], rowType, fieldSerializers, fieldGetters); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/ChangelogSplit.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/ChangelogSplit.java deleted file mode 100644 index 2b3129ef8b..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/ChangelogSplit.java +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.hybrid.split; - -import org.apache.amoro.data.DataTreeNode; -import org.apache.amoro.data.PrimaryKeyedFile; -import org.apache.amoro.scan.MixedFileScanTask; -import org.apache.amoro.shade.guava32.com.google.common.base.MoreObjects; -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.amoro.utils.FileScanTaskUtil; - -import java.util.Collection; -import java.util.Optional; - -/** A changelog split generated during planning change table. */ -public class ChangelogSplit extends MixedFormatSplit { - private static final long serialVersionUID = 1L; - private final int taskIndex; - private final Collection insertScanTasks; - private final Collection deleteScanTasks; - private int insertFileOffset; - private long insertRecordOffset; - private int deleteFileOffset; - private long deleteRecordOffset; - private DataTreeNode dataTreeNode; - - public ChangelogSplit( - Collection insertScanTasks, - Collection deleteScanTasks, - int taskIndex) { - Preconditions.checkArgument(insertScanTasks.size() > 0 || deleteScanTasks.size() > 0); - this.taskIndex = taskIndex; - this.insertScanTasks = insertScanTasks; - this.deleteScanTasks = deleteScanTasks; - Optional task = insertScanTasks.stream().findFirst(); - PrimaryKeyedFile file = - task.isPresent() ? 
task.get().file() : deleteScanTasks.stream().findFirst().get().file(); - this.dataTreeNode = DataTreeNode.of(file.node().mask(), file.node().index()); - } - - @Override - public Integer taskIndex() { - return taskIndex; - } - - @Override - public DataTreeNode dataTreeNode() { - return dataTreeNode; - } - - @Override - public void modifyTreeNode(DataTreeNode expectedNode) { - Preconditions.checkNotNull(expectedNode); - this.dataTreeNode = expectedNode; - } - - @Override - public void updateOffset(Object[] offsets) { - Preconditions.checkArgument(offsets.length == 4); - insertFileOffset = (int) offsets[0]; - insertRecordOffset = (long) offsets[1]; - deleteFileOffset = (int) offsets[2]; - deleteRecordOffset = (long) offsets[3]; - } - - @Override - public MixedFormatSplit copy() { - return new ChangelogSplit(insertScanTasks, deleteScanTasks, taskIndex); - } - - @Override - public String splitId() { - return MoreObjects.toStringHelper(this) - .add("insertTasks", FileScanTaskUtil.toString(insertScanTasks)) - .add("mixedFormatEquityDeletes", FileScanTaskUtil.toString(deleteScanTasks)) - .toString(); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("insertTasks", FileScanTaskUtil.toString(insertScanTasks)) - .add("mixedFormatEquityDeletes", FileScanTaskUtil.toString(deleteScanTasks)) - .add("dataTreeNode", dataTreeNode.toString()) - .toString(); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof ChangelogSplit)) { - return false; - } - ChangelogSplit other = (ChangelogSplit) obj; - return splitId().equals(other.splitId()) - && insertFileOffset == other.insertFileOffset - && insertRecordOffset == other.insertRecordOffset - && deleteFileOffset == other.deleteFileOffset - && deleteRecordOffset == other.deleteRecordOffset - && taskIndex == other.taskIndex; - } - - public int insertFileOffset() { - return insertFileOffset; - } - - public long insertRecordOffset() { - return insertRecordOffset; - } - - 
public int deleteFileOffset() { - return deleteFileOffset; - } - - public long deleteRecordOffset() { - return deleteRecordOffset; - } - - public Collection insertTasks() { - return insertScanTasks; - } - - public Collection deleteTasks() { - return deleteScanTasks; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MergeOnReadSplit.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MergeOnReadSplit.java deleted file mode 100644 index ccb2bc8996..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MergeOnReadSplit.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.hybrid.split; - -import org.apache.amoro.scan.KeyedTableScanTask; -import org.apache.amoro.shade.guava32.com.google.common.base.MoreObjects; -import org.apache.amoro.utils.FileScanTaskUtil; -import org.apache.flink.util.Preconditions; - -public class MergeOnReadSplit extends MixedFormatSplit { - private static final long serialVersionUID = 1L; - private final int taskIndex; - private final KeyedTableScanTask keyedTableScanTask; - private long recordOffset; - - public MergeOnReadSplit(int taskIndex, KeyedTableScanTask keyedTableScanTask) { - this.taskIndex = taskIndex; - this.keyedTableScanTask = keyedTableScanTask; - } - - public KeyedTableScanTask keyedTableScanTask() { - return keyedTableScanTask; - } - - @Override - public Integer taskIndex() { - return taskIndex; - } - - @Override - public void updateOffset(Object[] offsets) { - Preconditions.checkArgument(offsets.length == 2); - // offsets[0] is file offset, but we don't need it - recordOffset = (long) offsets[1]; - } - - @Override - public MixedFormatSplit copy() { - return new MergeOnReadSplit(taskIndex, keyedTableScanTask); - } - - @Override - public String splitId() { - return MoreObjects.toStringHelper(this) - .add("insertTasks", FileScanTaskUtil.toString(keyedTableScanTask.insertTasks())) - .add("baseTasks", FileScanTaskUtil.toString(keyedTableScanTask.baseTasks())) - .add( - "mixedFormatEquityDeletes", - FileScanTaskUtil.toString(keyedTableScanTask.mixedEquityDeletes())) - .toString(); - } - - public long recordOffset() { - return recordOffset; - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof MergeOnReadSplit)) { - return false; - } - MergeOnReadSplit other = (MergeOnReadSplit) obj; - return splitId().equals(other.splitId()) - && recordOffset == other.recordOffset - && taskIndex == other.taskIndex; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("\ninsertTasks", 
FileScanTaskUtil.toString(keyedTableScanTask.insertTasks())) - .add("\nbaseTasks", FileScanTaskUtil.toString(keyedTableScanTask.baseTasks())) - .add( - "\nmixedFormatEquityDeletes", - FileScanTaskUtil.toString(keyedTableScanTask.mixedEquityDeletes())) - .add("\ncost", keyedTableScanTask.cost() / 1024 + " KB") - .add("\nrecordCount", keyedTableScanTask.recordCount()) - .toString(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplit.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplit.java deleted file mode 100644 index 79818f9952..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplit.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.split; - -import org.apache.amoro.data.DataTreeNode; -import org.apache.flink.api.connector.source.SourceSplit; - -import java.io.Serializable; - -/** An abstract mixed-format source split. 
*/ -public abstract class MixedFormatSplit - implements SourceSplit, Serializable, Comparable { - private static final long serialVersionUID = 1L; - - public abstract Integer taskIndex(); - - public DataTreeNode dataTreeNode() { - throw new UnsupportedOperationException("This operation is not supported right now."); - } - - public void modifyTreeNode(DataTreeNode expectedNode) { - throw new UnsupportedOperationException("This operation is not supported right now."); - } - - /** Checks whether this split is a snapshot split. */ - public final boolean isSnapshotSplit() { - return getClass() == SnapshotSplit.class; - } - - /** Checks whether this split is a changelog split. */ - public final boolean isChangelogSplit() { - return getClass() == ChangelogSplit.class; - } - - public final boolean isMergeOnReadSplit() { - return getClass() == MergeOnReadSplit.class; - } - - /** Casts this split into a {@link SnapshotSplit}. */ - public final SnapshotSplit asSnapshotSplit() { - return (SnapshotSplit) this; - } - - /** Casts this split into a {@link ChangelogSplit}. 
*/ - public final ChangelogSplit asChangelogSplit() { - return (ChangelogSplit) this; - } - - public final MergeOnReadSplit asMergeOnReadSplit() { - return (MergeOnReadSplit) this; - } - - /** - * update split current file offset and record offset if this split is {@link SnapshotSplit} - * recordOffsets means [insertFileOffset, insertRecordOffset] if this split is {@link - * ChangelogSplit} recordOffsets means [insertFileOffset, insertRecordOffset, deleteFileOffset, - * deleteRecordOffset, ] - * - * @param recordOffsets [insertFileOffset, insertRecordOffset] - */ - public abstract void updateOffset(Object[] recordOffsets); - - @Override - public int compareTo(MixedFormatSplit that) { - return this.taskIndex().compareTo(that.taskIndex()); - } - - public abstract MixedFormatSplit copy(); -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplitSerializer.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplitSerializer.java deleted file mode 100644 index 2b7aef4a9e..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplitSerializer.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.split; - -import org.apache.amoro.log.Bytes; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.util.FlinkRuntimeException; -import org.apache.flink.util.InstantiationUtil; - -import java.io.IOException; - -/** Serializer that serializes and deserializes {@link MixedFormatSplit}. */ -public class MixedFormatSplitSerializer implements SimpleVersionedSerializer { - public static final MixedFormatSplitSerializer INSTANCE = new MixedFormatSplitSerializer(); - private static final int VERSION = 1; - - private static final byte SNAPSHOT_SPLIT_FLAG = 1; - private static final byte CHANGELOG_SPLIT_FLAG = 2; - private static final byte MOR_SPLIT_FLAG = 3; - - @Override - public int getVersion() { - return VERSION; - } - - @Override - public byte[] serialize(MixedFormatSplit split) throws IOException { - if (split == null) { - return new byte[0]; - } - if (split.isMergeOnReadSplit()) { - MergeOnReadSplit mergeOnReadSplit = (MergeOnReadSplit) split; - byte[] content = InstantiationUtil.serializeObject(mergeOnReadSplit); - return Bytes.mergeByte(new byte[] {MOR_SPLIT_FLAG}, content); - } else if (split.isSnapshotSplit()) { - SnapshotSplit snapshotSplit = (SnapshotSplit) split; - byte[] content = InstantiationUtil.serializeObject(snapshotSplit); - return Bytes.mergeByte(new byte[] {SNAPSHOT_SPLIT_FLAG}, content); - } else if (split.isChangelogSplit()) { - ChangelogSplit changelogSplit = (ChangelogSplit) split; - byte[] content = InstantiationUtil.serializeObject(changelogSplit); - 
return Bytes.mergeByte(new byte[] {CHANGELOG_SPLIT_FLAG}, content); - } else { - throw new IllegalArgumentException( - String.format( - "This mixed-format split is not supported, class %s.", - split.getClass().getSimpleName())); - } - } - - @Override - public MixedFormatSplit deserialize(int version, byte[] serialized) throws IOException { - if (serialized.length == 0) { - return null; - } - try { - byte flag = serialized[0]; - if (version == VERSION) { - byte[] content = Bytes.subByte(serialized, 1, serialized.length - 1); - if (flag == MOR_SPLIT_FLAG) { - return InstantiationUtil.deserializeObject( - content, MergeOnReadSplit.class.getClassLoader()); - } else if (flag == SNAPSHOT_SPLIT_FLAG) { - return InstantiationUtil.deserializeObject( - content, SnapshotSplit.class.getClassLoader()); - } else if (flag == CHANGELOG_SPLIT_FLAG) { - return InstantiationUtil.deserializeObject( - content, ChangelogSplit.class.getClassLoader()); - } else { - throw new IllegalArgumentException( - String.format( - "this flag split %s is unsupported. 
available: %s, %s, and %s.", - flag, SNAPSHOT_SPLIT_FLAG, CHANGELOG_SPLIT_FLAG, MOR_SPLIT_FLAG)); - } - } - } catch (ClassNotFoundException e) { - throw new FlinkRuntimeException("deserialize split failed", e); - } - throw new FlinkRuntimeException( - String.format("this version %s is not supported during deserialize split.", version)); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplitState.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplitState.java deleted file mode 100644 index 5548d917c6..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/MixedFormatSplitState.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.split; - -import org.apache.flink.util.FlinkRuntimeException; - -/** This is the mutable state for per mixed-format source split. 
*/ -public class MixedFormatSplitState { - private final MixedFormatSplit mixedFormatSplit; - - private int currentInsertFileOffset; - private long currentInsertRecordOffset; - private int currentDeleteFileOffset; - private long currentDeleteRecordOffset; - - public MixedFormatSplitState(MixedFormatSplit mixedFormatSplit) { - this.mixedFormatSplit = mixedFormatSplit; - } - - public MixedFormatSplit toSourceSplit() { - if (mixedFormatSplit.isMergeOnReadSplit()) { - MergeOnReadSplit mergeOnReadSplit = (MergeOnReadSplit) mixedFormatSplit; - mergeOnReadSplit.updateOffset( - new Object[] {currentInsertFileOffset, currentInsertRecordOffset}); - return mergeOnReadSplit; - } else if (mixedFormatSplit.isSnapshotSplit()) { - SnapshotSplit snapshotSplit = (SnapshotSplit) mixedFormatSplit; - snapshotSplit.updateOffset(new Object[] {currentInsertFileOffset, currentInsertRecordOffset}); - return snapshotSplit; - } else if (mixedFormatSplit.isChangelogSplit()) { - ChangelogSplit changelogSplit = (ChangelogSplit) mixedFormatSplit; - changelogSplit.updateOffset( - new Object[] { - currentInsertFileOffset, - currentInsertRecordOffset, - currentDeleteFileOffset, - currentDeleteRecordOffset - }); - return changelogSplit; - } - - throw new FlinkRuntimeException( - String.format( - "As of now this source split is unsupported %s, available split are %s, %s, and %s", - mixedFormatSplit.getClass().getSimpleName(), - SnapshotSplit.class.getSimpleName(), - ChangelogSplit.class.getSimpleName(), - MergeOnReadSplit.class.getSimpleName())); - } - - public void updateOffset(Object[] offsets) { - currentInsertFileOffset = (int) offsets[0]; - currentInsertRecordOffset = (long) offsets[1]; - if (mixedFormatSplit.isChangelogSplit()) { - currentDeleteFileOffset = (int) offsets[2]; - currentDeleteRecordOffset = (long) offsets[3]; - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/SnapshotSplit.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/SnapshotSplit.java deleted file mode 100644 index fd1bcab730..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/SnapshotSplit.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.split; - -import org.apache.amoro.data.DataTreeNode; -import org.apache.amoro.data.PrimaryKeyedFile; -import org.apache.amoro.scan.MixedFileScanTask; -import org.apache.amoro.shade.guava32.com.google.common.base.MoreObjects; -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.amoro.utils.FileScanTaskUtil; - -import java.util.Collection; - -/** A snapshot split generated during planning base table. 
*/ -public class SnapshotSplit extends MixedFormatSplit { - private static final long serialVersionUID = 1L; - private final int taskIndex; - private final Collection insertScanTasks; - private int insertFileOffset; - private long insertRecordOffset; - private DataTreeNode dataTreeNode; - - public SnapshotSplit(Collection insertScanTasks, int taskIndex) { - Preconditions.checkArgument(insertScanTasks.size() > 0); - this.insertScanTasks = insertScanTasks; - this.taskIndex = taskIndex; - PrimaryKeyedFile file = insertScanTasks.stream().findFirst().get().file(); - this.dataTreeNode = DataTreeNode.of(file.node().mask(), file.node().index()); - } - - @Override - public String splitId() { - return MoreObjects.toStringHelper(this) - .add("insertTasks", FileScanTaskUtil.toString(insertScanTasks)) - .toString(); - } - - @Override - public Integer taskIndex() { - return taskIndex; - } - - @Override - public DataTreeNode dataTreeNode() { - return dataTreeNode; - } - - @Override - public void modifyTreeNode(DataTreeNode expectedNode) { - Preconditions.checkNotNull(expectedNode); - this.dataTreeNode = expectedNode; - } - - public Collection insertTasks() { - return insertScanTasks; - } - - @Override - public void updateOffset(Object[] offsets) { - Preconditions.checkArgument(offsets.length == 2); - insertFileOffset = (int) offsets[0]; - insertRecordOffset = (long) offsets[1]; - } - - @Override - public MixedFormatSplit copy() { - return new SnapshotSplit(insertScanTasks, taskIndex); - } - - public int insertFileOffset() { - return insertFileOffset; - } - - public long insertRecordOffset() { - return insertRecordOffset; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("insertTasks", FileScanTaskUtil.toString(insertScanTasks)) - .add("dataTreeNode", dataTreeNode.toString()) - .toString(); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof SnapshotSplit)) { - return false; - } - SnapshotSplit other = 
(SnapshotSplit) obj; - return splitId().equals(other.splitId()) - && insertFileOffset == other.insertFileOffset - && insertRecordOffset == other.insertRecordOffset - && taskIndex == other.taskIndex; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/SplitRequestEvent.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/SplitRequestEvent.java deleted file mode 100644 index b1eec7f40d..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/SplitRequestEvent.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.split; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.connector.source.SourceEvent; - -import java.util.Collection; -import java.util.Collections; - -/** We can remove this class once FLINK-21364 is resolved. 
*/ -@Internal -public class SplitRequestEvent implements SourceEvent { - private static final long serialVersionUID = 1L; - - private final Collection finishedSplitIds; - private final String requesterHostname; - - public SplitRequestEvent() { - this(Collections.emptyList()); - } - - public SplitRequestEvent(Collection finishedSplitIds) { - this(finishedSplitIds, null); - } - - public SplitRequestEvent(Collection finishedSplitIds, String requesterHostname) { - this.finishedSplitIds = finishedSplitIds; - this.requesterHostname = requesterHostname; - } - - public Collection finishedSplitIds() { - return finishedSplitIds; - } - - public String requesterHostname() { - return requesterHostname; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/TemporalJoinSplits.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/TemporalJoinSplits.java deleted file mode 100644 index 6d6d79f84c..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/hybrid/split/TemporalJoinSplits.java +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.split; - -import static org.apache.amoro.flink.metric.MetricConstant.TEMPORAL_TABLE_INITIALIZATION_END_TIMESTAMP; -import static org.apache.amoro.flink.metric.MetricConstant.TEMPORAL_TABLE_INITIALIZATION_START_TIMESTAMP; - -import org.apache.flink.api.connector.source.SourceSplit; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.util.CollectionUtil; -import org.apache.flink.util.Preconditions; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.Serializable; -import java.time.LocalDateTime; -import java.util.Collection; -import java.util.Map; -import java.util.Objects; -import java.util.stream.Collectors; - -/** - * If using mixed-format table as build-table, TemporalJoinSplits can record the first splits - * planned by Enumerator. - */ -public class TemporalJoinSplits implements Serializable { - - public static final long serialVersionUID = 1L; - public static final Logger LOGGER = LoggerFactory.getLogger(TemporalJoinSplits.class); - - private final transient MetricGroup metricGroup; - private final long startTimeMs = System.currentTimeMillis(); - private Map splits; - private long unfinishedCount; - /** transient because it is necessary to notify reader again after failover. 
*/ - private transient boolean hasNotifiedReader = false; - - public TemporalJoinSplits(Collection splits, MetricGroup metricGroup) { - Preconditions.checkNotNull(splits, "plan splits should not be null"); - this.splits = - splits.stream().map(SourceSplit::splitId).collect(Collectors.toMap((k) -> k, (i) -> false)); - - unfinishedCount = this.splits.size(); - LOGGER.info("init splits at {}, size:{}", LocalDateTime.now(), unfinishedCount); - this.metricGroup = metricGroup; - if (metricGroup != null) { - metricGroup.gauge(TEMPORAL_TABLE_INITIALIZATION_START_TIMESTAMP, () -> startTimeMs); - } - } - - public Map getSplits() { - return splits; - } - - public synchronized void addSplitsBack(Collection splits) { - if (this.splits == null || CollectionUtil.isNullOrEmpty(splits)) { - return; - } - splits.forEach( - (p) -> { - Boolean finished = this.splits.get(p.splitId()); - if (finished == null || !finished) { - return; - } - unfinishedCount++; - LOGGER.debug("add back split:{} at {}", p, LocalDateTime.now()); - this.splits.put(p.splitId(), false); - }); - } - - /** - * Remove finished splits. - * - * @return True if all splits are finished, otherwise false. 
- */ - public synchronized boolean removeAndReturnIfAllFinished(Collection finishedSplitIds) { - if (splits == null) { - return true; - } - if (CollectionUtil.isNullOrEmpty(finishedSplitIds)) { - return unfinishedCount == 0; - } - - finishedSplitIds.forEach( - (p) -> { - Boolean finished = this.splits.get(p); - if (finished == null || finished) { - return; - } - unfinishedCount--; - this.splits.put(p, true); - LOGGER.debug("finish split:{} at {}", p, LocalDateTime.now()); - }); - if (unfinishedCount == 0) { - LOGGER.info("finish all splits at {}", LocalDateTime.now()); - if (metricGroup != null) { - metricGroup.gauge(TEMPORAL_TABLE_INITIALIZATION_END_TIMESTAMP, System::currentTimeMillis); - } - return true; - } - return false; - } - - public synchronized void clear() { - if (unfinishedCount == 0) { - this.splits = null; - } - } - - public boolean hasNotifiedReader() { - return hasNotifiedReader; - } - - public void notifyReader() { - this.hasNotifiedReader = true; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - TemporalJoinSplits that = (TemporalJoinSplits) o; - return startTimeMs == that.startTimeMs - && unfinishedCount == that.unfinishedCount - && hasNotifiedReader == that.hasNotifiedReader - && Objects.equals(metricGroup, that.metricGroup) - && Objects.equals(splits, that.splits); - } - - @Override - public int hashCode() { - return Objects.hash(metricGroup, startTimeMs, splits, unfinishedCount, hasNotifiedReader); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaPartitionSplitReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaPartitionSplitReader.java deleted file mode 100644 index 0b104e5c9a..0000000000 --- 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaPartitionSplitReader.java +++ /dev/null @@ -1,499 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.internals; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; -import org.apache.flink.connector.base.source.reader.splitreader.SplitsAddition; -import org.apache.flink.connector.base.source.reader.splitreader.SplitsChange; -import org.apache.flink.connector.kafka.source.KafkaSourceOptions; -import org.apache.flink.connector.kafka.source.metrics.KafkaSourceReaderMetrics; -import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; -import org.apache.flink.util.FlinkRuntimeException; -import org.apache.flink.util.Preconditions; -import org.apache.kafka.clients.consumer.ConsumerConfig; -import org.apache.kafka.clients.consumer.ConsumerRecord; -import 
org.apache.kafka.clients.consumer.ConsumerRecords; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.clients.consumer.OffsetAndMetadata; -import org.apache.kafka.clients.consumer.OffsetCommitCallback; -import org.apache.kafka.common.TopicPartition; -import org.apache.kafka.common.errors.WakeupException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; - -import java.io.IOException; -import java.time.Duration; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.Set; -import java.util.StringJoiner; -import java.util.function.Supplier; -import java.util.stream.Collectors; - -/** A {@link SplitReader} implementation that reads records from Kafka partitions. */ -@Internal -public class KafkaPartitionSplitReader - implements SplitReader, KafkaPartitionSplit> { - private static final Logger LOG = LoggerFactory.getLogger(KafkaPartitionSplitReader.class); - protected static final long POLL_TIMEOUT = 10000L; - - protected final KafkaConsumer consumer; - private final Map stoppingOffsets; - private final String groupId; - private final int subtaskId; - - protected final KafkaSourceReaderMetrics kafkaSourceReaderMetrics; - - // Tracking empty splits that has not been added to finished splits in fetch() - private final Set emptySplits = new HashSet<>(); - - public KafkaPartitionSplitReader( - Properties props, - SourceReaderContext context, - KafkaSourceReaderMetrics kafkaSourceReaderMetrics) { - this.subtaskId = context.getIndexOfSubtask(); - this.kafkaSourceReaderMetrics = kafkaSourceReaderMetrics; - Properties consumerProps = new Properties(); - consumerProps.putAll(props); - consumerProps.setProperty(ConsumerConfig.CLIENT_ID_CONFIG, createConsumerClientId(props)); - this.consumer = new 
KafkaConsumer<>(consumerProps); - this.stoppingOffsets = new HashMap<>(); - this.groupId = consumerProps.getProperty(ConsumerConfig.GROUP_ID_CONFIG); - - // Metric registration - maybeRegisterKafkaConsumerMetrics(props, kafkaSourceReaderMetrics, consumer); - this.kafkaSourceReaderMetrics.registerNumBytesIn(consumer); - } - - @Override - public RecordsWithSplitIds> fetch() throws IOException { - ConsumerRecords consumerRecords; - try { - consumerRecords = consumer.poll(Duration.ofMillis(POLL_TIMEOUT)); - } catch (WakeupException | IllegalStateException e) { - // IllegalStateException will be thrown if the consumer is not assigned any partitions. - // This happens if all assigned partitions are invalid or empty (starting offset >= - // stopping offset). We just mark empty partitions as finished and return an empty - // record container, and this consumer will be closed by SplitFetcherManager. - KafkaPartitionSplitRecords recordsBySplits = - new KafkaPartitionSplitRecords(ConsumerRecords.empty(), kafkaSourceReaderMetrics); - markEmptySplitsAsFinished(recordsBySplits); - return recordsBySplits; - } - KafkaPartitionSplitRecords recordsBySplits = - new KafkaPartitionSplitRecords(consumerRecords, kafkaSourceReaderMetrics); - List finishedPartitions = new ArrayList<>(); - for (TopicPartition tp : consumerRecords.partitions()) { - long stoppingOffset = getStoppingOffset(tp); - final List> recordsFromPartition = consumerRecords.records(tp); - - if (recordsFromPartition.size() > 0) { - final ConsumerRecord lastRecord = - recordsFromPartition.get(recordsFromPartition.size() - 1); - - // After processing a record with offset of "stoppingOffset - 1", the split reader - // should not continue fetching because the record with stoppingOffset may not - // exist. Keep polling will just block forever. 
- if (lastRecord.offset() >= stoppingOffset - 1) { - recordsBySplits.setPartitionStoppingOffset(tp, stoppingOffset); - finishSplitAtRecord( - tp, stoppingOffset, lastRecord.offset(), finishedPartitions, recordsBySplits); - } - } - // Track this partition's record lag if it never appears before - kafkaSourceReaderMetrics.maybeAddRecordsLagMetric(consumer, tp); - } - - markEmptySplitsAsFinished(recordsBySplits); - - // Unassign the partitions that has finished. - if (!finishedPartitions.isEmpty()) { - finishedPartitions.forEach(kafkaSourceReaderMetrics::removeRecordsLagMetric); - unassignPartitions(finishedPartitions); - } - - // Update numBytesIn - kafkaSourceReaderMetrics.updateNumBytesInCounter(); - - return recordsBySplits; - } - - protected void markEmptySplitsAsFinished(KafkaPartitionSplitRecords recordsBySplits) { - // Some splits are discovered as empty when handling split additions. These splits should be - // added to finished splits to clean up states in split fetcher and source reader. - if (!emptySplits.isEmpty()) { - recordsBySplits.finishedSplits.addAll(emptySplits); - emptySplits.clear(); - } - } - - @Override - public void handleSplitsChanges(SplitsChange splitsChange) { - // Get all the partition assignments and stopping offsets. - if (!(splitsChange instanceof SplitsAddition)) { - throw new UnsupportedOperationException( - String.format("The SplitChange type of %s is not supported.", splitsChange.getClass())); - } - - // Assignment. - List newPartitionAssignments = new ArrayList<>(); - // Starting offsets. - Map partitionsStartingFromSpecifiedOffsets = new HashMap<>(); - List partitionsStartingFromEarliest = new ArrayList<>(); - List partitionsStartingFromLatest = new ArrayList<>(); - // Stopping offsets. - List partitionsStoppingAtLatest = new ArrayList<>(); - Set partitionsStoppingAtCommitted = new HashSet<>(); - - // Parse the starting and stopping offsets. 
- splitsChange - .splits() - .forEach( - s -> { - newPartitionAssignments.add(s.getTopicPartition()); - parseStartingOffsets( - s, - partitionsStartingFromEarliest, - partitionsStartingFromLatest, - partitionsStartingFromSpecifiedOffsets); - parseStoppingOffsets(s, partitionsStoppingAtLatest, partitionsStoppingAtCommitted); - // Track the new topic partition in metrics - kafkaSourceReaderMetrics.registerTopicPartition(s.getTopicPartition()); - }); - - // Assign new partitions. - newPartitionAssignments.addAll(consumer.assignment()); - consumer.assign(newPartitionAssignments); - - // Seek on the newly assigned partitions to their stating offsets. - seekToStartingOffsets( - partitionsStartingFromEarliest, - partitionsStartingFromLatest, - partitionsStartingFromSpecifiedOffsets); - // Setup the stopping offsets. - acquireAndSetStoppingOffsets(partitionsStoppingAtLatest, partitionsStoppingAtCommitted); - - // After acquiring the starting and stopping offsets, remove the empty splits if necessary. - removeEmptySplits(); - - maybeLogSplitChangesHandlingResult(splitsChange); - } - - @Override - public void wakeUp() { - consumer.wakeup(); - } - - @Override - public void close() throws Exception { - consumer.close(); - } - - // --------------- - - public void notifyCheckpointComplete( - Map offsetsToCommit, - OffsetCommitCallback offsetCommitCallback) { - consumer.commitAsync(offsetsToCommit, offsetCommitCallback); - } - - @VisibleForTesting - KafkaConsumer consumer() { - return consumer; - } - - // --------------- private helper method ---------------------- - - private void parseStartingOffsets( - KafkaPartitionSplit split, - List partitionsStartingFromEarliest, - List partitionsStartingFromLatest, - Map partitionsStartingFromSpecifiedOffsets) { - TopicPartition tp = split.getTopicPartition(); - // Parse starting offsets. 
- if (split.getStartingOffset() == KafkaPartitionSplit.EARLIEST_OFFSET) { - partitionsStartingFromEarliest.add(tp); - } else if (split.getStartingOffset() == KafkaPartitionSplit.LATEST_OFFSET) { - partitionsStartingFromLatest.add(tp); - } else if (split.getStartingOffset() == KafkaPartitionSplit.COMMITTED_OFFSET) { - // Do nothing here, the consumer will first try to get the committed offsets of - // these partitions by default. - } else { - partitionsStartingFromSpecifiedOffsets.put(tp, split.getStartingOffset()); - } - } - - private void parseStoppingOffsets( - KafkaPartitionSplit split, - List partitionsStoppingAtLatest, - Set partitionsStoppingAtCommitted) { - TopicPartition tp = split.getTopicPartition(); - split - .getStoppingOffset() - .ifPresent( - stoppingOffset -> { - if (stoppingOffset >= 0) { - stoppingOffsets.put(tp, stoppingOffset); - } else if (stoppingOffset == KafkaPartitionSplit.LATEST_OFFSET) { - partitionsStoppingAtLatest.add(tp); - } else if (stoppingOffset == KafkaPartitionSplit.COMMITTED_OFFSET) { - partitionsStoppingAtCommitted.add(tp); - } else { - // This should not happen. 
- throw new FlinkRuntimeException( - String.format( - "Invalid stopping offset %d for partition %s", stoppingOffset, tp)); - } - }); - } - - private void seekToStartingOffsets( - List partitionsStartingFromEarliest, - List partitionsStartingFromLatest, - Map partitionsStartingFromSpecifiedOffsets) { - - if (!partitionsStartingFromEarliest.isEmpty()) { - LOG.trace("Seeking starting offsets to beginning: {}", partitionsStartingFromEarliest); - consumer.seekToBeginning(partitionsStartingFromEarliest); - } - - if (!partitionsStartingFromLatest.isEmpty()) { - LOG.trace("Seeking starting offsets to end: {}", partitionsStartingFromLatest); - consumer.seekToEnd(partitionsStartingFromLatest); - } - - if (!partitionsStartingFromSpecifiedOffsets.isEmpty()) { - LOG.trace( - "Seeking starting offsets to specified offsets: {}", - partitionsStartingFromSpecifiedOffsets); - partitionsStartingFromSpecifiedOffsets.forEach(consumer::seek); - } - } - - private void acquireAndSetStoppingOffsets( - List partitionsStoppingAtLatest, - Set partitionsStoppingAtCommitted) { - Map endOffset = consumer.endOffsets(partitionsStoppingAtLatest); - stoppingOffsets.putAll(endOffset); - if (!partitionsStoppingAtCommitted.isEmpty()) { - retryOnWakeup( - () -> consumer.committed(partitionsStoppingAtCommitted), - "getting committed offset as stopping offsets") - .forEach( - (tp, offsetAndMetadata) -> { - Preconditions.checkNotNull( - offsetAndMetadata, - String.format( - "Partition %s should stop at committed offset. 
" - + "But there is no committed offset of this partition for group %s", - tp, groupId)); - stoppingOffsets.put(tp, offsetAndMetadata.offset()); - }); - } - } - - private void removeEmptySplits() { - List emptyPartitions = new ArrayList<>(); - // If none of the partitions have any records, - for (TopicPartition tp : consumer.assignment()) { - if (retryOnWakeup( - () -> consumer.position(tp), "getting starting offset to check if split is empty") - >= getStoppingOffset(tp)) { - emptyPartitions.add(tp); - } - } - if (!emptyPartitions.isEmpty()) { - LOG.debug( - "These assigning splits are empty and will be marked as finished in later fetch: {}", - emptyPartitions); - // Add empty partitions to empty split set for later cleanup in fetch() - emptySplits.addAll( - emptyPartitions.stream().map(KafkaPartitionSplit::toSplitId).collect(Collectors.toSet())); - // Un-assign partitions from Kafka consumer - unassignPartitions(emptyPartitions); - } - } - - private void maybeLogSplitChangesHandlingResult(SplitsChange splitsChange) { - if (LOG.isDebugEnabled()) { - StringJoiner splitsInfo = new StringJoiner(","); - for (KafkaPartitionSplit split : splitsChange.splits()) { - long startingOffset = - retryOnWakeup( - () -> consumer.position(split.getTopicPartition()), "logging starting position"); - long stoppingOffset = getStoppingOffset(split.getTopicPartition()); - splitsInfo.add( - String.format( - "[%s, start:%d, stop: %d]", - split.getTopicPartition(), startingOffset, stoppingOffset)); - } - LOG.debug("SplitsChange handling result: {}", splitsInfo); - } - } - - protected void unassignPartitions(Collection partitionsToUnassign) { - Collection newAssignment = new HashSet<>(consumer.assignment()); - newAssignment.removeAll(partitionsToUnassign); - consumer.assign(newAssignment); - } - - private String createConsumerClientId(Properties props) { - String prefix = props.getProperty(KafkaSourceOptions.CLIENT_ID_PREFIX.key()); - return prefix + "-" + subtaskId; - } - - protected void 
finishSplitAtRecord( - TopicPartition tp, - long stoppingOffset, - long currentOffset, - List finishedPartitions, - KafkaPartitionSplitRecords recordsBySplits) { - LOG.debug( - "{} has reached stopping offset {}, current offset is {}", - tp, - stoppingOffset, - currentOffset); - finishedPartitions.add(tp); - recordsBySplits.addFinishedSplit(KafkaPartitionSplit.toSplitId(tp)); - } - - protected long getStoppingOffset(TopicPartition tp) { - return stoppingOffsets.getOrDefault(tp, Long.MAX_VALUE); - } - - private void maybeRegisterKafkaConsumerMetrics( - Properties props, - KafkaSourceReaderMetrics kafkaSourceReaderMetrics, - KafkaConsumer consumer) { - final Boolean needToRegister = - KafkaSourceOptions.getOption( - props, KafkaSourceOptions.REGISTER_KAFKA_CONSUMER_METRICS, Boolean::parseBoolean); - if (needToRegister) { - kafkaSourceReaderMetrics.registerKafkaConsumerMetrics(consumer); - } - } - - /** - * Catch {@link WakeupException} in Kafka consumer call and retry the invocation on exception. - * - *

This helper function handles a race condition as below: - * - *

    - *
  1. Fetcher thread finishes a {@link KafkaConsumer#poll(Duration)} call - *
  2. Task thread assigns new splits so invokes {@link #wakeUp()}, then the wakeup is recorded - * and held by the consumer - *
  3. Later fetcher thread invokes {@link #handleSplitsChanges(SplitsChange)}, and interactions - * with consumer will throw {@link WakeupException} because of the previously held wakeup in - * the consumer - *
- * - *

Under this case we need to catch the {@link WakeupException} and retry the operation. - */ - private V retryOnWakeup(Supplier consumerCall, String description) { - try { - return consumerCall.get(); - } catch (WakeupException we) { - LOG.info( - "Caught WakeupException while executing Kafka consumer call for {}. Will retry the consumer call.", - description); - return consumerCall.get(); - } - } - - // ---------------- private helper class ------------------------ - - public static class KafkaPartitionSplitRecords - implements RecordsWithSplitIds> { - - private final Set finishedSplits = new HashSet<>(); - private final Map stoppingOffsets = new HashMap<>(); - private final ConsumerRecords consumerRecords; - private final KafkaSourceReaderMetrics metrics; - private final Iterator splitIterator; - private Iterator> recordIterator; - private TopicPartition currentTopicPartition; - private Long currentSplitStoppingOffset; - - public KafkaPartitionSplitRecords( - ConsumerRecords consumerRecords, KafkaSourceReaderMetrics metrics) { - this.consumerRecords = consumerRecords; - this.splitIterator = consumerRecords.partitions().iterator(); - this.metrics = metrics; - } - - public void setPartitionStoppingOffset(TopicPartition topicPartition, long stoppingOffset) { - stoppingOffsets.put(topicPartition, stoppingOffset); - } - - public void addFinishedSplit(String splitId) { - finishedSplits.add(splitId); - } - - @Nullable - @Override - public String nextSplit() { - if (splitIterator.hasNext()) { - currentTopicPartition = splitIterator.next(); - recordIterator = consumerRecords.records(currentTopicPartition).iterator(); - currentSplitStoppingOffset = - stoppingOffsets.getOrDefault(currentTopicPartition, Long.MAX_VALUE); - return currentTopicPartition.toString(); - } else { - currentTopicPartition = null; - recordIterator = null; - currentSplitStoppingOffset = null; - return null; - } - } - - @Nullable - @Override - public ConsumerRecord nextRecordFromSplit() { - 
Preconditions.checkNotNull( - currentTopicPartition, - "Make sure nextSplit() did not return null before iterate over the records split."); - if (recordIterator.hasNext()) { - final ConsumerRecord record = recordIterator.next(); - // Only emit records before stopping offset - if (record.offset() < currentSplitStoppingOffset) { - metrics.recordCurrentOffset(currentTopicPartition, record.offset()); - return record; - } - } - return null; - } - - @Override - public Set finishedSplits() { - return finishedSplits; - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSource.java deleted file mode 100644 index 56317638a7..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSource.java +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.internals; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.annotation.PublicEvolving; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.api.common.serialization.DeserializationSchema; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.connector.source.Boundedness; -import org.apache.flink.api.connector.source.Source; -import org.apache.flink.api.connector.source.SourceReader; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.api.connector.source.SplitEnumerator; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.flink.api.java.typeutils.ResultTypeQueryable; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.synchronization.FutureCompletingBlockingQueue; -import org.apache.flink.connector.kafka.source.KafkaSourceBuilder; -import org.apache.flink.connector.kafka.source.enumerator.KafkaSourceEnumState; -import org.apache.flink.connector.kafka.source.enumerator.KafkaSourceEnumStateSerializer; -import org.apache.flink.connector.kafka.source.enumerator.KafkaSourceEnumerator; -import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer; -import org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriber; -import org.apache.flink.connector.kafka.source.metrics.KafkaSourceReaderMetrics; -import org.apache.flink.connector.kafka.source.reader.KafkaPartitionSplitReader; -import org.apache.flink.connector.kafka.source.reader.KafkaRecordEmitter; -import org.apache.flink.connector.kafka.source.reader.KafkaSourceReader; -import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema; -import 
org.apache.flink.connector.kafka.source.reader.fetcher.KafkaSourceFetcherManager; -import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; -import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplitSerializer; -import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.util.UserCodeClassLoader; -import org.apache.kafka.clients.consumer.ConsumerRecord; - -import javax.annotation.Nullable; - -import java.io.IOException; -import java.util.Collection; -import java.util.Properties; -import java.util.function.Consumer; -import java.util.function.Supplier; - -/** - * The Source implementation of Kafka. Please use a {@link KafkaSourceBuilder} to construct a {@link - * KafkaSource}. The following example shows how to create a KafkaSource emitting records of - * String type. - * - *

{@code
- * KafkaSource source = KafkaSource
- *     .builder()
- *     .setBootstrapServers(KafkaSourceTestEnv.brokerConnectionStrings)
- *     .setGroupId("MyGroup")
- *     .setTopics(Arrays.asList(TOPIC1, TOPIC2))
- *     .setDeserializer(new TestingKafkaRecordDeserializationSchema())
- *     .setStartingOffsets(OffsetsInitializer.earliest())
- *     .build();
- * }
- * - * @param the output type of the source. - */ -@PublicEvolving -public class KafkaSource - implements Source, ResultTypeQueryable { - private static final long serialVersionUID = -8755372893283732098L; - // Users can choose only one of the following ways to specify the topics to consume from. - private final KafkaSubscriber subscriber; - // Users can specify the starting / stopping offset initializer. - private final OffsetsInitializer startingOffsetsInitializer; - private final OffsetsInitializer stoppingOffsetsInitializer; - // Boundedness - private final Boundedness boundedness; - private final KafkaRecordDeserializationSchema deserializationSchema; - // The configurations. - protected final Properties props; - - protected KafkaSource( - KafkaSubscriber subscriber, - OffsetsInitializer startingOffsetsInitializer, - @Nullable OffsetsInitializer stoppingOffsetsInitializer, - Boundedness boundedness, - KafkaRecordDeserializationSchema deserializationSchema, - Properties props) { - this.subscriber = subscriber; - this.startingOffsetsInitializer = startingOffsetsInitializer; - this.stoppingOffsetsInitializer = stoppingOffsetsInitializer; - this.boundedness = boundedness; - this.deserializationSchema = deserializationSchema; - this.props = props; - } - - @Override - public Boundedness getBoundedness() { - return this.boundedness; - } - - @Internal - @Override - public SourceReader createReader(SourceReaderContext readerContext) - throws Exception { - return createReader(readerContext, (ignore) -> {}); - } - - @VisibleForTesting - SourceReader createReader( - SourceReaderContext readerContext, Consumer> splitFinishedHook) - throws Exception { - FutureCompletingBlockingQueue>> - elementsQueue = new FutureCompletingBlockingQueue<>(); - deserializationSchema.open( - new DeserializationSchema.InitializationContext() { - @Override - public MetricGroup getMetricGroup() { - return readerContext.metricGroup().addGroup("deserializer"); - } - - @Override - public 
UserCodeClassLoader getUserCodeClassLoader() { - return readerContext.getUserCodeClassLoader(); - } - }); - final KafkaSourceReaderMetrics kafkaSourceReaderMetrics = - new KafkaSourceReaderMetrics(readerContext.metricGroup()); - - Supplier splitReaderSupplier = - () -> new KafkaPartitionSplitReader(props, readerContext, kafkaSourceReaderMetrics); - KafkaRecordEmitter recordEmitter = new KafkaRecordEmitter<>(deserializationSchema); - - return new KafkaSourceReader<>( - elementsQueue, - new KafkaSourceFetcherManager(elementsQueue, splitReaderSupplier::get, splitFinishedHook), - recordEmitter, - toConfiguration(props), - readerContext, - kafkaSourceReaderMetrics); - } - - @Internal - @Override - public SplitEnumerator createEnumerator( - SplitEnumeratorContext enumContext) { - return new KafkaSourceEnumerator( - subscriber, - startingOffsetsInitializer, - stoppingOffsetsInitializer, - props, - enumContext, - boundedness); - } - - @Internal - @Override - public SplitEnumerator restoreEnumerator( - SplitEnumeratorContext enumContext, KafkaSourceEnumState checkpoint) - throws IOException { - return new KafkaSourceEnumerator( - subscriber, - startingOffsetsInitializer, - stoppingOffsetsInitializer, - props, - enumContext, - boundedness, - checkpoint.assignedPartitions()); - } - - @Internal - @Override - public SimpleVersionedSerializer getSplitSerializer() { - return new KafkaPartitionSplitSerializer(); - } - - @Internal - @Override - public SimpleVersionedSerializer getEnumeratorCheckpointSerializer() { - return new KafkaSourceEnumStateSerializer(); - } - - @Override - public TypeInformation getProducedType() { - return deserializationSchema.getProducedType(); - } - - // ----------- private helper methods --------------- - - private Configuration toConfiguration(Properties props) { - Configuration config = new Configuration(); - props.stringPropertyNames().forEach(key -> config.setString(key, props.getProperty(key))); - return config; - } - - @VisibleForTesting - 
Configuration getConfiguration() { - return toConfiguration(props); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSourceFetcherManager.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSourceFetcherManager.java deleted file mode 100644 index 728f41ab07..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSourceFetcherManager.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.internals; - -import org.apache.flink.configuration.Configuration; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.SourceReaderBase; -import org.apache.flink.connector.base.source.reader.fetcher.SingleThreadFetcherManager; -import org.apache.flink.connector.base.source.reader.fetcher.SplitFetcher; -import org.apache.flink.connector.base.source.reader.fetcher.SplitFetcherTask; -import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; -import org.apache.flink.connector.base.source.reader.synchronization.FutureCompletingBlockingQueue; -import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.consumer.OffsetAndMetadata; -import org.apache.kafka.clients.consumer.OffsetCommitCallback; -import org.apache.kafka.common.TopicPartition; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Collection; -import java.util.Map; -import java.util.function.Consumer; -import java.util.function.Supplier; - -/** - * The SplitFetcherManager for Kafka source. This class is needed to help commit the offsets to - * Kafka using the KafkaConsumer inside the {@link KafkaPartitionSplitReader}. - */ -public class KafkaSourceFetcherManager - extends SingleThreadFetcherManager, KafkaPartitionSplit> { - private static final Logger LOG = LoggerFactory.getLogger(KafkaSourceFetcherManager.class); - - /** - * Creates a new SplitFetcherManager with a single I/O threads. - * - * @param elementsQueue The queue that is used to hand over data from the I/O thread (the - * fetchers) to the reader (which emits the records and book-keeps the state. This must be the - * same queue instance that is also passed to the {@link SourceReaderBase}. 
- * @param splitReaderSupplier The factory for the split reader that connects to the source system. - * @param splitFinishedHook Hook for handling finished splits in split fetchers. - */ - public KafkaSourceFetcherManager( - FutureCompletingBlockingQueue>> - elementsQueue, - Supplier, KafkaPartitionSplit>> - splitReaderSupplier, - Consumer> splitFinishedHook, - Configuration configuration) { - super(elementsQueue, splitReaderSupplier, configuration, splitFinishedHook); - } - - public void commitOffsets( - Map offsetsToCommit, OffsetCommitCallback callback) { - LOG.debug("Committing offsets {}", offsetsToCommit); - if (offsetsToCommit.isEmpty()) { - return; - } - SplitFetcher, KafkaPartitionSplit> splitFetcher = - fetchers.get(0); - if (splitFetcher != null) { - // The fetcher thread is still running. This should be the majority of the cases. - enqueueOffsetsCommitTask(splitFetcher, offsetsToCommit, callback); - } else { - splitFetcher = createSplitFetcher(); - enqueueOffsetsCommitTask(splitFetcher, offsetsToCommit, callback); - startFetcher(splitFetcher); - } - } - - private void enqueueOffsetsCommitTask( - SplitFetcher, KafkaPartitionSplit> splitFetcher, - Map offsetsToCommit, - OffsetCommitCallback callback) { - KafkaPartitionSplitReader kafkaReader = - (KafkaPartitionSplitReader) splitFetcher.getSplitReader(); - - splitFetcher.enqueueTask( - new SplitFetcherTask() { - @Override - public boolean run() throws IOException { - kafkaReader.notifyCheckpointComplete(offsetsToCommit, callback); - return true; - } - - @Override - public void wakeUp() {} - }); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSourceReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSourceReader.java deleted file mode 100644 index 706d163da9..0000000000 --- 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/KafkaSourceReader.java +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.internals; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.connector.base.source.reader.RecordEmitter; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.SingleThreadMultiplexSourceReaderBase; -import org.apache.flink.connector.base.source.reader.synchronization.FutureCompletingBlockingQueue; -import org.apache.flink.connector.kafka.source.KafkaSourceOptions; -import org.apache.flink.connector.kafka.source.metrics.KafkaSourceReaderMetrics; -import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; -import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplitState; -import org.apache.kafka.clients.consumer.ConsumerRecord; -import 
org.apache.kafka.clients.consumer.OffsetAndMetadata; -import org.apache.kafka.common.TopicPartition; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.SortedMap; -import java.util.TreeMap; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentMap; - -/** The source reader for Kafka partitions. */ -@Internal -public class KafkaSourceReader - extends SingleThreadMultiplexSourceReaderBase< - ConsumerRecord, T, KafkaPartitionSplit, KafkaPartitionSplitState> { - private static final Logger LOG = LoggerFactory.getLogger(KafkaSourceReader.class); - // These maps need to be concurrent because it will be accessed by both the main thread - // and the split fetcher thread in the callback. - private final SortedMap> offsetsToCommit; - private final ConcurrentMap offsetsOfFinishedSplits; - private final KafkaSourceReaderMetrics kafkaSourceReaderMetrics; - private final boolean commitOffsetsOnCheckpoint; - - public KafkaSourceReader( - FutureCompletingBlockingQueue>> - elementsQueue, - KafkaSourceFetcherManager kafkaSourceFetcherManager, - RecordEmitter, T, KafkaPartitionSplitState> recordEmitter, - Configuration config, - SourceReaderContext context, - KafkaSourceReaderMetrics kafkaSourceReaderMetrics) { - super(elementsQueue, kafkaSourceFetcherManager, recordEmitter, config, context); - this.offsetsToCommit = Collections.synchronizedSortedMap(new TreeMap<>()); - this.offsetsOfFinishedSplits = new ConcurrentHashMap<>(); - this.kafkaSourceReaderMetrics = kafkaSourceReaderMetrics; - this.commitOffsetsOnCheckpoint = config.get(KafkaSourceOptions.COMMIT_OFFSETS_ON_CHECKPOINT); - if (!commitOffsetsOnCheckpoint) { - LOG.warn( - "Offset commit on checkpoint is disabled. 
Consuming offset will not be reported back to Kafka cluster."); - } - } - - @Override - protected void onSplitFinished(Map finishedSplitIds) { - finishedSplitIds.forEach( - (ignored, splitState) -> { - if (splitState.getCurrentOffset() >= 0) { - offsetsOfFinishedSplits.put( - splitState.getTopicPartition(), - new OffsetAndMetadata(splitState.getCurrentOffset())); - } - }); - } - - @Override - public List snapshotState(long checkpointId) { - List splits = super.snapshotState(checkpointId); - if (!commitOffsetsOnCheckpoint) { - return splits; - } - - if (splits.isEmpty() && offsetsOfFinishedSplits.isEmpty()) { - offsetsToCommit.put(checkpointId, Collections.emptyMap()); - } else { - Map offsetsMap = - offsetsToCommit.computeIfAbsent(checkpointId, id -> new HashMap<>()); - // Put the offsets of the active splits. - for (KafkaPartitionSplit split : splits) { - // If the checkpoint is triggered before the partition starting offsets - // is retrieved, do not commit the offsets for those partitions. - if (split.getStartingOffset() >= 0) { - offsetsMap.put( - split.getTopicPartition(), new OffsetAndMetadata(split.getStartingOffset())); - } - } - // Put offsets of all the finished splits. - offsetsMap.putAll(offsetsOfFinishedSplits); - } - return splits; - } - - @Override - public void notifyCheckpointComplete(long checkpointId) throws Exception { - LOG.debug("Committing offsets for checkpoint {}", checkpointId); - if (!commitOffsetsOnCheckpoint) { - return; - } - - Map committedPartitions = offsetsToCommit.get(checkpointId); - if (committedPartitions == null) { - LOG.debug( - "Offsets for checkpoint {} either do not exist or have already been committed.", - checkpointId); - return; - } - - ((KafkaSourceFetcherManager) splitFetcherManager) - .commitOffsets( - committedPartitions, - (ignored, e) -> { - // The offset commit here is needed by the external monitoring. It won't - // break Flink job's correctness if we fail to commit the offset here. 
- if (e != null) { - kafkaSourceReaderMetrics.recordFailedCommit(); - LOG.warn("Failed to commit consumer offsets for checkpoint {}", checkpointId, e); - } else { - LOG.debug("Successfully committed offsets for checkpoint {}", checkpointId); - kafkaSourceReaderMetrics.recordSucceededCommit(); - // If the finished topic partition has been committed, we remove it - // from the offsets of the finished splits map. - committedPartitions.forEach( - (tp, offset) -> - kafkaSourceReaderMetrics.recordCommittedOffset(tp, offset.offset())); - offsetsOfFinishedSplits - .entrySet() - .removeIf(entry -> committedPartitions.containsKey(entry.getKey())); - while (!offsetsToCommit.isEmpty() && offsetsToCommit.firstKey() <= checkpointId) { - offsetsToCommit.remove(offsetsToCommit.firstKey()); - } - } - }); - } - - @Override - protected KafkaPartitionSplitState initializedState(KafkaPartitionSplit split) { - return new KafkaPartitionSplitState(split); - } - - @Override - protected KafkaPartitionSplit toSplitType(String splitId, KafkaPartitionSplitState splitState) { - return splitState.toKafkaPartitionSplit(); - } - - // ------------------------ - - public SortedMap> getOffsetsToCommit() { - return offsetsToCommit; - } - - @VisibleForTesting - int getNumAliveFetchers() { - return splitFetcherManager.getNumAliveFetchers(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/metrics/KafkaConsumerMetricConstants.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/metrics/KafkaConsumerMetricConstants.java deleted file mode 100644 index 103392af21..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/internals/metrics/KafkaConsumerMetricConstants.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more 
contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.internals.metrics; - -import org.apache.flink.annotation.Internal; - -/** - * A collection of Kafka consumer metrics related constant strings. - * - *

The names must not be changed, as that would break backward compatibility for the consumer's - * metrics. - */ -@Internal -public class KafkaConsumerMetricConstants { - - public static final String KAFKA_LATENCY_METRIC_NAME = "kafkaLatency"; -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/ChangeLogDataIterator.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/ChangeLogDataIterator.java deleted file mode 100644 index 05d2df58d2..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/ChangeLogDataIterator.java +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.source; - -import static org.apache.amoro.data.ChangeAction.DELETE; -import static org.apache.amoro.data.ChangeAction.INSERT; -import static org.apache.amoro.data.ChangeAction.UPDATE_AFTER; -import static org.apache.amoro.data.ChangeAction.UPDATE_BEFORE; - -import org.apache.amoro.data.ChangeAction; -import org.apache.amoro.scan.MixedFileScanTask; - -import java.io.IOException; -import java.util.Collection; -import java.util.Collections; -import java.util.function.Function; - -/** - * This is a change log data iterator that replays the change log data appended to mixed-format - * change table with ordered. - */ -public class ChangeLogDataIterator extends DataIterator { - private final DataIterator insertDataIterator; - private DataIterator deleteDataIterator = empty(); - - private final Function mixedFormatMetaColumnRemover; - private final Function, T> changeActionTransformer; - - private final QueueHolder insertHolder = new QueueHolder<>(); - private final QueueHolder deleteHolder = new QueueHolder<>(); - - public ChangeLogDataIterator( - FileScanTaskReader fileScanTaskReader, - Collection insertTasks, - Collection deleteTasks, - Function mixedFormatFileOffsetGetter, - Function mixedFormatMetaColumnRemover, - Function, T> changeActionTransformer) { - super( - fileScanTaskReader, - Collections.emptyList(), - mixedFormatFileOffsetGetter, - mixedFormatMetaColumnRemover); - this.insertDataIterator = - new DataIterator<>( - fileScanTaskReader, - insertTasks, - mixedFormatFileOffsetGetter, - mixedFormatMetaColumnRemover); - if (deleteTasks != null && !deleteTasks.isEmpty()) { - this.deleteDataIterator = - new DataIterator<>( - fileScanTaskReader, - deleteTasks, - mixedFormatFileOffsetGetter, - mixedFormatMetaColumnRemover); - } - this.mixedFormatMetaColumnRemover = mixedFormatMetaColumnRemover; - this.changeActionTransformer = changeActionTransformer; - } - - public void seek( - int startingInsertFileOffset, - int 
startingDeleteFileOffset, - long startingInsertRecordOffset, - long startingDeleteRecordOffset) { - insertDataIterator.seek(startingInsertFileOffset, startingInsertRecordOffset); - deleteDataIterator.seek(startingDeleteFileOffset, startingDeleteRecordOffset); - } - - @Override - public void seek(int startingFileOffset, long startingRecordOffset) { - throw new UnsupportedOperationException( - "This operation is not supported in change log data iterator."); - } - - private void loadQueueHolder(boolean insert) { - DataIterator dataIterator = insert ? insertDataIterator : deleteDataIterator; - QueueHolder holder = insert ? insertHolder : deleteHolder; - if (dataIterator.hasNext() && holder.isEmpty()) { - T next = dataIterator.next(); - long nextOffset = dataIterator.currentMixedFormatFileOffset(); - ChangeAction changeAction = insert ? INSERT : DELETE; - holder.put(next, changeAction, nextOffset); - } - } - - @Override - public boolean hasNext() { - loadQueueHolder(false); - loadQueueHolder(true); - - return deleteHolder.isNotEmpty() || insertHolder.isNotEmpty(); - } - - @Override - public boolean currentFileHasNext() { - return deleteDataIterator.currentFileHasNext() - || insertDataIterator.currentFileHasNext() - || deleteHolder.isNotEmpty() - || insertHolder.isNotEmpty(); - } - - @Override - public T next() { - T row; - if (deleteHolder.isEmpty() && insertHolder.isNotEmpty()) { - row = - changeActionTransformer.apply( - ChangeActionTrans.of(insertHolder.nextRow, insertHolder.changeAction)); - insertHolder.clean(); - } else if (deleteHolder.isNotEmpty() && insertHolder.isEmpty()) { - row = - changeActionTransformer.apply( - ChangeActionTrans.of(deleteHolder.nextRow, deleteHolder.changeAction)); - deleteHolder.clean(); - } else if (deleteHolder.equalTo(insertHolder)) { - row = - changeActionTransformer.apply(ChangeActionTrans.of(deleteHolder.nextRow, UPDATE_BEFORE)); - insertHolder.changeAction = UPDATE_AFTER; - deleteHolder.clean(); - } else if 
(deleteHolder.lesser(insertHolder)) { - row = - changeActionTransformer.apply( - ChangeActionTrans.of(deleteHolder.nextRow, deleteHolder.changeAction)); - deleteHolder.clean(); - } else { - row = - changeActionTransformer.apply( - ChangeActionTrans.of(insertHolder.nextRow, insertHolder.changeAction)); - insertHolder.clean(); - } - - return mixedFormatMetaColumnRemover.apply(row); - } - - @Override - public void close() throws IOException { - insertDataIterator.close(); - deleteDataIterator.close(); - } - - public int insertFileOffset() { - return insertDataIterator.fileOffset(); - } - - public long insertRecordOffset() { - return insertDataIterator.recordOffset(); - } - - public int deleteFileOffset() { - return deleteDataIterator.fileOffset(); - } - - public long deleteRecordOffset() { - return deleteDataIterator.recordOffset(); - } - - private static class QueueHolder { - T nextRow; - ChangeAction changeAction; - Long nextOffset; - - public QueueHolder() {} - - boolean isEmpty() { - return nextRow == null; - } - - boolean isNotEmpty() { - return nextRow != null; - } - - public void put(T nextRow, ChangeAction changeAction, Long nextOffset) { - this.nextRow = nextRow; - this.changeAction = changeAction; - this.nextOffset = nextOffset; - } - - public T get() { - return nextRow; - } - - boolean lesser(QueueHolder that) { - return this.nextOffset.compareTo(that.nextOffset) < 0; - } - - boolean equalTo(QueueHolder that) { - return this.nextOffset.compareTo(that.nextOffset) == 0; - } - - void clean() { - nextRow = null; - nextOffset = null; - } - } - - public static class ChangeActionTrans { - protected final T row; - protected final ChangeAction changeAction; - - private ChangeActionTrans(T row, ChangeAction changeAction) { - this.row = row; - this.changeAction = changeAction; - } - - public static ChangeActionTrans of(T row, ChangeAction changeAction) { - return new ChangeActionTrans<>(row, changeAction); - } - - public T row() { - return row; - } - - public 
ChangeAction changeAction() { - return changeAction; - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/DataIterator.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/DataIterator.java deleted file mode 100644 index 1dccaeb9c3..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/DataIterator.java +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.source; - -import org.apache.amoro.scan.MixedFileScanTask; -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.io.CloseableIterator; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.Collection; -import java.util.Collections; -import java.util.Iterator; -import java.util.NoSuchElementException; -import java.util.function.Function; - -/** - * Flink data iterator that reads {@link MixedFileScanTask} into a {@link CloseableIterator} - * - * @param T is the output data type returned by this iterator. - */ -@Internal -public class DataIterator implements CloseableIterator { - - private final FileScanTaskReader fileScanTaskReader; - private final int taskSize; - - private Iterator tasks; - private CloseableIterator currentIterator; - private int fileOffset; - private long recordOffset; - private long currentFileOffset; - private final Function fileOffsetGetter; - private final Function metaColumnRemover; - - public DataIterator() { - this(null, Collections.emptyList(), t -> Long.MIN_VALUE, t -> t); - } - - public DataIterator( - FileScanTaskReader fileScanTaskReader, - Collection tasks, - Function fileOffsetGetter, - Function metaColumnRemover) { - this.fileScanTaskReader = fileScanTaskReader; - this.tasks = tasks.iterator(); - this.taskSize = tasks.size(); - this.fileOffsetGetter = fileOffsetGetter; - this.metaColumnRemover = metaColumnRemover; - - this.currentIterator = CloseableIterator.empty(); - - // fileOffset starts at -1 because we started - // from an empty iterator that is not from the split files. - this.fileOffset = -1; - // record offset points to the record that next() should return when called - this.recordOffset = 0L; - // actual record offset in data file. - // it's incremental within inserting and deleting files in the same tree node group. 
- this.currentFileOffset = 0L; - } - - /** - * (startingFileOffset, startingRecordOffset) points to the next row that the reader should resume - * from. E.g., if the seek position is (file=0, record=1), seek moves the iterator position to the - * second row in file 0. When next() is called after seek; the second row from file 0 should be - * returned. - */ - public void seek(int startingFileOffset, long startingRecordOffset) { - // It means file is empty. - if (taskSize == 0) { - return; - } - Preconditions.checkState( - fileOffset == -1, "Seek should be called before any other iterator actions"); - // skip files - Preconditions.checkState( - startingFileOffset < taskSize, - "Invalid starting file offset %s for combined scan task with %s files.", - startingFileOffset, - taskSize); - for (long i = 0L; i < startingFileOffset; ++i) { - tasks.next(); - } - - updateCurrentIterator(); - // skip records within the file - for (long i = 0; i < startingRecordOffset; ++i) { - if (currentFileHasNext() && hasNext()) { - next(); - } else { - throw new IllegalStateException( - String.format( - "Invalid starting record offset %d for file %d from FileScanTask List.", - startingRecordOffset, startingFileOffset)); - } - } - - fileOffset = startingFileOffset; - recordOffset = startingRecordOffset; - } - - @Override - public boolean hasNext() { - updateCurrentIterator(); - return currentIterator.hasNext(); - } - - @Override - public T next() { - updateCurrentIterator(); - recordOffset += 1; - T row = currentIterator.next(); - currentFileOffset = fileOffsetGetter.apply(row); - return metaColumnRemover.apply(row); - } - - public boolean currentFileHasNext() { - return currentIterator.hasNext(); - } - - /** Updates the current iterator field to ensure that the current Iterator is not exhausted. 
*/ - private void updateCurrentIterator() { - try { - while (!currentIterator.hasNext() && tasks.hasNext()) { - currentIterator.close(); - currentIterator = openTaskIterator(tasks.next()); - fileOffset += 1; - recordOffset = 0L; - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - private CloseableIterator openTaskIterator(MixedFileScanTask scanTask) { - return fileScanTaskReader.open(scanTask); - } - - @Override - public void close() throws IOException { - // close the current iterator - currentIterator.close(); - tasks = null; - } - - public int fileOffset() { - return fileOffset; - } - - public long recordOffset() { - return recordOffset; - } - - public long currentMixedFormatFileOffset() { - return currentFileOffset; - } - - static DataIterator empty() { - return new EmptyIterator<>(); - } - - private static class EmptyIterator extends DataIterator { - - public EmptyIterator() { - super(null, Collections.emptyList(), t -> Long.MIN_VALUE, t -> t); - } - - @Override - public boolean hasNext() { - return false; - } - - @Override - public T next() { - throw new NoSuchElementException(); - } - - @Override - public void seek(int startingFileOffset, long startingRecordOffset) {} - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/FileScanTaskReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/FileScanTaskReader.java deleted file mode 100644 index 0eeacab445..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/FileScanTaskReader.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.source; - -import org.apache.flink.annotation.Internal; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.io.CloseableIterator; - -import java.io.Serializable; - -/** - * Read a {@link FileScanTask} into a {@link CloseableIterator} - * - * @param is the output data type returned by this iterator. - */ -@Internal -public interface FileScanTaskReader extends Serializable { - CloseableIterator open(FileScanTask fileScanTask); -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/FlinkKeyedMORDataReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/FlinkKeyedMORDataReader.java deleted file mode 100644 index 7ad5031877..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/FlinkKeyedMORDataReader.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.source; - -import org.apache.amoro.flink.read.AdaptHiveFlinkParquetReaders; -import org.apache.amoro.hive.io.reader.AbstractAdaptHiveKeyedDataReader; -import org.apache.amoro.io.AuthenticatedFileIO; -import org.apache.amoro.table.PrimaryKeySpec; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.data.FlinkOrcReader; -import org.apache.iceberg.orc.OrcRowReader; -import org.apache.iceberg.parquet.ParquetValueReader; -import org.apache.iceberg.types.Type; -import org.apache.orc.TypeDescription; -import org.apache.parquet.schema.MessageType; - -import java.util.Map; -import java.util.function.BiFunction; -import java.util.function.Function; - -public class FlinkKeyedMORDataReader extends AbstractAdaptHiveKeyedDataReader { - public FlinkKeyedMORDataReader( - AuthenticatedFileIO fileIO, - Schema tableSchema, - Schema projectedSchema, - PrimaryKeySpec primaryKeySpec, - String nameMapping, - boolean caseSensitive, - BiFunction convertConstant, - boolean reuseContainer) { - super( - fileIO, - tableSchema, - projectedSchema, - primaryKeySpec, - nameMapping, - caseSensitive, - convertConstant, - 
reuseContainer); - } - - @Override - protected Function> getParquetReaderFunction( - Schema projectSchema, Map idToConstant) { - return fileSchema -> - AdaptHiveFlinkParquetReaders.buildReader(projectSchema, fileSchema, idToConstant); - } - - @Override - protected Function> getOrcReaderFunction( - Schema projectSchema, Map idToConstant) { - return fileSchema -> new FlinkOrcReader(projectSchema, fileSchema, idToConstant); - } - - @Override - protected Function> toStructLikeFunction() { - return schema -> { - RowType requiredRowType = FlinkSchemaUtil.convert(schema); - RowDataWrapper asStructLike = new RowDataWrapper(requiredRowType, schema.asStruct()); - return asStructLike::wrap; - }; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/FlinkUnkyedDataReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/FlinkUnkyedDataReader.java deleted file mode 100644 index 04d17f7d31..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/FlinkUnkyedDataReader.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.source; - -import org.apache.amoro.data.DataTreeNode; -import org.apache.amoro.flink.read.AdaptHiveFlinkParquetReaders; -import org.apache.amoro.hive.io.reader.AbstractAdaptHiveUnkeyedDataReader; -import org.apache.amoro.io.AuthenticatedFileIO; -import org.apache.amoro.io.reader.DeleteFilter; -import org.apache.amoro.scan.MixedFileScanTask; -import org.apache.amoro.table.PrimaryKeySpec; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.data.FlinkOrcReader; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.orc.OrcRowReader; -import org.apache.iceberg.parquet.ParquetValueReader; -import org.apache.iceberg.types.Type; -import org.apache.orc.TypeDescription; -import org.apache.parquet.schema.MessageType; - -import java.util.Map; -import java.util.Set; -import java.util.function.BiFunction; -import java.util.function.Function; - -/** - * This is an mixed-format table reader accepts a {@link FileScanTask} and produces a {@link - * CloseableIterator}. The RowData read from this reader may have more columns than the - * original schema. The additional columns are added after the original columns, see {@link - * DeleteFilter}. It shall be projected before sent to downstream. 
This can be processed in {@link - * DataIterator#next()} - */ -public class FlinkUnkyedDataReader extends AbstractAdaptHiveUnkeyedDataReader - implements FileScanTaskReader { - private static final long serialVersionUID = -6773693031945244386L; - - public FlinkUnkyedDataReader( - AuthenticatedFileIO fileIO, - Schema tableSchema, - Schema projectedSchema, - String nameMapping, - boolean caseSensitive, - BiFunction convertConstant, - boolean reuseContainer) { - super( - fileIO, - tableSchema, - projectedSchema, - nameMapping, - caseSensitive, - convertConstant, - reuseContainer); - } - - public FlinkUnkyedDataReader( - AuthenticatedFileIO fileIO, - Schema tableSchema, - Schema projectedSchema, - PrimaryKeySpec primaryKeySpec, - String nameMapping, - boolean caseSensitive, - BiFunction convertConstant, - Set sourceNodes, - boolean reuseContainer) { - super( - fileIO, - tableSchema, - projectedSchema, - primaryKeySpec, - nameMapping, - caseSensitive, - convertConstant, - sourceNodes, - reuseContainer); - } - - @Override - protected Function> getParquetReaderFunction( - Schema projectedSchema, Map idToConstant) { - return fileSchema -> - AdaptHiveFlinkParquetReaders.buildReader(projectedSchema, fileSchema, idToConstant); - } - - @Override - protected Function> getOrcReaderFunction( - Schema projectSchema, Map idToConstant) { - return fileSchema -> new FlinkOrcReader(projectSchema, fileSchema, idToConstant); - } - - @Override - protected Function> toStructLikeFunction() { - return schema -> { - RowType requiredRowType = FlinkSchemaUtil.convert(schema); - RowDataWrapper asStructLike = new RowDataWrapper(requiredRowType, schema.asStruct()); - return asStructLike::wrap; - }; - } - - @Override - public CloseableIterator open(FileScanTask fileScanTask) { - MixedFileScanTask mixedFileScanTask = (MixedFileScanTask) fileScanTask; - CloseableIterable rowDataIterable = readData(mixedFileScanTask); - return fileIO.doAs(rowDataIterable::iterator); - } -} diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/MergeOnReadDataIterator.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/MergeOnReadDataIterator.java deleted file mode 100644 index 872cc9dade..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/MergeOnReadDataIterator.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.source; - -import org.apache.amoro.io.AuthenticatedFileIO; -import org.apache.amoro.scan.KeyedTableScanTask; -import org.apache.flink.table.data.RowData; -import org.apache.flink.util.Preconditions; -import org.apache.iceberg.io.CloseableIterator; - -import java.io.IOException; - -/** - * Iterator for reading data in a Merge on Read (MOR) way. This iterator handles reading data from - * an Amoro mix-format table while keeping track of file and record offsets for efficient data - * retrieval. 
- */ -public class MergeOnReadDataIterator extends DataIterator { - private int fileOffset; - private long recordOffset; - private final CloseableIterator iterator; - - public MergeOnReadDataIterator( - FlinkKeyedMORDataReader flinkKeyedMORDataReader, - KeyedTableScanTask keyedTableScanTask, - AuthenticatedFileIO io) { - super(); - this.iterator = - IteratorWithIO.of(io, io.doAs(() -> flinkKeyedMORDataReader.readData(keyedTableScanTask))); - } - - @Override - public void seek(int startingFileOffset, long startingRecordOffset) { - // startingFileOffset is not used, because we only have one file per task - Preconditions.checkNotNull(iterator, "iterator is null in the MergeOnReadDataIterator."); - // skip records within the file - for (long i = 0; i < startingRecordOffset; ++i) { - if (hasNext()) { - next(); - } else { - throw new IllegalStateException( - String.format( - "Invalid starting record offset %d for file %d from KeyedTableScanTask.", - startingRecordOffset, startingFileOffset)); - } - } - this.fileOffset = startingFileOffset; - this.recordOffset = startingRecordOffset; - } - - @Override - public boolean hasNext() { - return iterator.hasNext(); - } - - @Override - public RowData next() { - return iterator.next(); - } - - public boolean currentFileHasNext() { - return iterator.hasNext(); - } - - @Override - public int fileOffset() { - return fileOffset; - } - - @Override - public long recordOffset() { - return recordOffset; - } - - @Override - public void close() throws IOException { - // close the current iterator - if (iterator != null) { - iterator.close(); - } - } - - static class IteratorWithIO implements CloseableIterator { - private final AuthenticatedFileIO io; - private final CloseableIterator iterator; - - private IteratorWithIO(AuthenticatedFileIO io, CloseableIterator iterator) { - this.io = io; - this.iterator = iterator; - } - - static IteratorWithIO of(AuthenticatedFileIO io, CloseableIterator iterator) { - Preconditions.checkNotNull(io); - 
return new IteratorWithIO(io, iterator); - } - - @Override - public void close() throws IOException { - io.doAs( - () -> { - iterator.close(); - return null; - }); - } - - @Override - public boolean hasNext() { - return io.doAs(iterator::hasNext); - } - - @Override - public RowData next() { - return io.doAs(iterator::next); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/MixedFormatScanContext.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/MixedFormatScanContext.java deleted file mode 100644 index a00e75821c..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/MixedFormatScanContext.java +++ /dev/null @@ -1,378 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.source; - -import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; - -import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.util.Preconditions; -import org.apache.flink.util.TimeUtils; -import org.apache.iceberg.Schema; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkReadOptions; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.StreamingStartingStrategy; - -import java.io.Serializable; -import java.time.Duration; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.concurrent.TimeUnit; - -/** This is an mixed-format source scan context. */ -public class MixedFormatScanContext extends ScanContext implements Serializable { - - private static final long serialVersionUID = 1L; - - private final String scanStartupMode; - private final boolean batchMode; - - protected MixedFormatScanContext(Builder builder) { - super( - builder.caseSensitive, - builder.snapshotId, - builder.startingStrategy, - builder.startSnapshotTimestamp, - builder.startSnapshotId, - builder.endSnapshotId, - builder.asOfTimestamp, - builder.splitSize, - builder.splitLookback, - builder.splitOpenFileCost, - builder.isStreaming, - builder.monitorInterval, - builder.nameMapping, - builder.projectedSchema, - builder.filters, - builder.limit, - builder.includeColumnStats, - builder.includeStatsForColumns, - builder.exposeLocality, - builder.planParallelism, - builder.maxPlanningSnapshotCount, - builder.maxAllowedPlanningFailures, - builder.watermarkColumn, - builder.watermarkColumnTimeUnit, - builder.branch, - builder.tag, - builder.startTag, - builder.endTag); - this.scanStartupMode = builder.scanStartupMode; - this.batchMode = builder.batchMode; - } 
- - public boolean caseSensitive() { - return caseSensitive; - } - - public Long snapshotId() { - return snapshotId; - } - - public Long startSnapshotId() { - return startSnapshotId; - } - - public Long endSnapshotId() { - return endSnapshotId; - } - - public Long asOfTimestamp() { - return asOfTimestamp; - } - - public Long splitSize() { - return splitSize; - } - - public Integer splitLookback() { - return splitLookback; - } - - public Long splitOpenFileCost() { - return splitOpenFileCost; - } - - public boolean isStreaming() { - return isStreaming; - } - - public Duration monitorInterval() { - return monitorInterval; - } - - public String nameMapping() { - return nameMapping; - } - - public Schema project() { - return schema; - } - - /** Only working for base store right now. */ - public List filters() { - return filters; - } - - public long limit() { - return limit; - } - - public static Builder contextBuilder() { - return new Builder(); - } - - public String scanStartupMode() { - return scanStartupMode; - } - - public boolean isBatchMode() { - return batchMode; - } - - public static class Builder { - private boolean caseSensitive = FlinkReadOptions.CASE_SENSITIVE_OPTION.defaultValue(); - private Long snapshotId = FlinkReadOptions.SNAPSHOT_ID.defaultValue(); - private StreamingStartingStrategy startingStrategy = - FlinkReadOptions.STARTING_STRATEGY_OPTION.defaultValue(); - private Long startSnapshotTimestamp = FlinkReadOptions.START_SNAPSHOT_TIMESTAMP.defaultValue(); - private Long startSnapshotId = FlinkReadOptions.START_SNAPSHOT_ID.defaultValue(); - private Long endSnapshotId = FlinkReadOptions.END_SNAPSHOT_ID.defaultValue(); - private Long asOfTimestamp = FlinkReadOptions.AS_OF_TIMESTAMP.defaultValue(); - private Long splitSize = FlinkReadOptions.SPLIT_SIZE_OPTION.defaultValue(); - private Integer splitLookback = FlinkReadOptions.SPLIT_LOOKBACK_OPTION.defaultValue(); - private Long splitOpenFileCost = 
FlinkReadOptions.SPLIT_FILE_OPEN_COST_OPTION.defaultValue(); - private boolean isStreaming = FlinkReadOptions.STREAMING_OPTION.defaultValue(); - private Duration monitorInterval = - TimeUtils.parseDuration(FlinkReadOptions.MONITOR_INTERVAL_OPTION.defaultValue()); - private String nameMapping; - private Schema projectedSchema; - private List filters; - private long limit = FlinkReadOptions.LIMIT_OPTION.defaultValue(); - private boolean includeColumnStats = - FlinkReadOptions.INCLUDE_COLUMN_STATS_OPTION.defaultValue(); - private Collection includeStatsForColumns = null; - private boolean exposeLocality; - private Integer planParallelism = - FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue(); - private int maxPlanningSnapshotCount = MAX_PLANNING_SNAPSHOT_COUNT.defaultValue(); - - private int maxAllowedPlanningFailures = - FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION.defaultValue(); - private String watermarkColumn = FlinkReadOptions.WATERMARK_COLUMN_OPTION.defaultValue(); - private TimeUnit watermarkColumnTimeUnit = - FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT_OPTION.defaultValue(); - private String branch = FlinkReadOptions.BRANCH.defaultValue(); - - private String tag = FlinkReadOptions.TAG.defaultValue(); - - private String startTag = FlinkReadOptions.START_TAG.defaultValue(); - - private String endTag = FlinkReadOptions.END_TAG.defaultValue(); - private String scanStartupMode; - private boolean batchMode = false; - - private Builder() {} - - public Builder caseSensitive(boolean newCaseSensitive) { - this.caseSensitive = newCaseSensitive; - return this; - } - - public Builder useSnapshotId(Long newSnapshotId) { - this.snapshotId = newSnapshotId; - return this; - } - - public Builder useTag(String tag) { - this.tag = tag; - return this; - } - - public Builder useBranch(String branch) { - this.branch = branch; - return this; - } - - public Builder startingStrategy(StreamingStartingStrategy newStartingStrategy) { - this.startingStrategy = 
newStartingStrategy; - return this; - } - - public Builder startSnapshotTimestamp(Long newStartSnapshotTimestamp) { - this.startSnapshotTimestamp = newStartSnapshotTimestamp; - return this; - } - - public Builder startSnapshotId(Long newStartSnapshotId) { - this.startSnapshotId = newStartSnapshotId; - return this; - } - - public Builder endSnapshotId(Long newEndSnapshotId) { - this.endSnapshotId = newEndSnapshotId; - return this; - } - - public Builder startTag(String startTag) { - this.startTag = startTag; - return this; - } - - public Builder endTag(String endTag) { - this.endTag = endTag; - return this; - } - - public Builder asOfTimestamp(Long newAsOfTimestamp) { - this.asOfTimestamp = newAsOfTimestamp; - return this; - } - - public Builder splitSize(Long newSplitSize) { - this.splitSize = newSplitSize; - return this; - } - - public Builder splitLookback(Integer newSplitLookback) { - this.splitLookback = newSplitLookback; - return this; - } - - public Builder splitOpenFileCost(Long newSplitOpenFileCost) { - this.splitOpenFileCost = newSplitOpenFileCost; - return this; - } - - public Builder streaming(boolean streaming) { - this.isStreaming = streaming; - return this; - } - - public Builder monitorInterval(Duration newMonitorInterval) { - this.monitorInterval = newMonitorInterval; - return this; - } - - public Builder nameMapping(String newNameMapping) { - this.nameMapping = newNameMapping; - return this; - } - - public Builder project(Schema newProjectedSchema) { - this.projectedSchema = newProjectedSchema; - return this; - } - - public Builder filters(List newFilters) { - this.filters = newFilters; - return this; - } - - public Builder limit(long newLimit) { - this.limit = newLimit; - return this; - } - - public Builder exposeLocality(boolean newExposeLocality) { - this.exposeLocality = newExposeLocality; - return this; - } - - public Builder planParallelism(Integer parallelism) { - this.planParallelism = parallelism; - return this; - } - - public Builder 
maxPlanningSnapshotCount(int newMaxPlanningSnapshotCount) { - this.maxPlanningSnapshotCount = newMaxPlanningSnapshotCount; - return this; - } - - Builder maxAllowedPlanningFailures(int newMaxAllowedPlanningFailures) { - this.maxAllowedPlanningFailures = newMaxAllowedPlanningFailures; - return this; - } - - public Builder scanStartupMode(String scanStartupMode) { - this.scanStartupMode = scanStartupMode; - return this; - } - - public Builder includeColumnStats(boolean newIncludeColumnStats) { - this.includeColumnStats = newIncludeColumnStats; - return this; - } - - public Builder batchMode(boolean batchMode) { - this.batchMode = batchMode; - return this; - } - - public Builder fromProperties(Map properties) { - Configuration config = new Configuration(); - properties.forEach(config::setString); - - return this.useSnapshotId(config.get(SNAPSHOT_ID)) - .useTag(config.get(TAG)) - .useBranch(config.get(BRANCH)) - .startTag(config.get(START_TAG)) - .endTag(config.get(END_TAG)) - .caseSensitive(config.get(CASE_SENSITIVE)) - .asOfTimestamp(config.get(AS_OF_TIMESTAMP)) - .startingStrategy(config.get(STARTING_STRATEGY)) - .startSnapshotTimestamp(config.get(START_SNAPSHOT_TIMESTAMP)) - .startSnapshotId(config.get(START_SNAPSHOT_ID)) - .endSnapshotId(config.get(END_SNAPSHOT_ID)) - .splitSize(config.get(SPLIT_SIZE)) - .splitLookback(config.get(SPLIT_LOOKBACK)) - .splitOpenFileCost(config.get(SPLIT_FILE_OPEN_COST)) - .streaming(config.get(STREAMING)) - .monitorInterval(config.get(MONITOR_INTERVAL)) - .nameMapping(properties.get(DEFAULT_NAME_MAPPING)) - .scanStartupMode(properties.get(MixedFormatValidator.SCAN_STARTUP_MODE.key())) - .includeColumnStats(config.get(INCLUDE_COLUMN_STATS)) - .maxPlanningSnapshotCount(config.get(MAX_PLANNING_SNAPSHOT_COUNT)) - .maxAllowedPlanningFailures(maxAllowedPlanningFailures); - } - - public MixedFormatScanContext build() { - scanStartupMode = scanStartupMode == null ? 
null : scanStartupMode.toLowerCase(); - Preconditions.checkArgument( - Objects.isNull(scanStartupMode) - || Objects.equals(scanStartupMode, MixedFormatValidator.SCAN_STARTUP_MODE_EARLIEST) - || Objects.equals(scanStartupMode, MixedFormatValidator.SCAN_STARTUP_MODE_LATEST), - String.format( - "only support %s, %s when %s is %s", - MixedFormatValidator.SCAN_STARTUP_MODE_EARLIEST, - MixedFormatValidator.SCAN_STARTUP_MODE_LATEST, - MixedFormatValidator.MIXED_FORMAT_READ_MODE, - MixedFormatValidator.MIXED_FORMAT_READ_FILE)); - Preconditions.checkArgument( - !(isStreaming && batchMode), - String.format( - "only support %s = false when execution.runtime-mode is batch", STREAMING.key())); - return new MixedFormatScanContext(this); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/LogSourceHelper.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/LogSourceHelper.java deleted file mode 100644 index d908b92ef0..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/LogSourceHelper.java +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.source.log; - -import static org.apache.amoro.log.LogData.MAGIC_NUMBER; -import static org.apache.flink.util.Preconditions.checkArgument; -import static org.apache.flink.util.Preconditions.checkNotNull; - -import org.apache.amoro.flink.read.source.log.kafka.LogKafkaPartitionSplit; -import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.FlinkRuntimeException; -import org.apache.kafka.common.TopicPartition; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.Serializable; -import java.util.Collection; -import java.util.HashMap; -import java.util.Map; -import java.util.NavigableMap; -import java.util.Set; -import java.util.TreeMap; - -/** - * According to upstreamId and partition topic dealing with the flip message, when should begin to - * retract message and when to end it. - */ -public class LogSourceHelper implements Serializable { - private static final Logger LOG = LoggerFactory.getLogger(LogSourceHelper.class); - private static final long serialVersionUID = 1L; - - /** Record the topic partitions that are in retracting state. */ - private final Map retractingInfo; - /** - * Key: topic partition + "_" + upstream job id + "_" + epicNo, generated by {@link - * #combineTopicPartitionAndUpstreamIdAndEpicNo)} method. 
Value: epic start offset - */ - private final NavigableMap upstreamEpicStartOffsets; - - public LogSourceHelper() { - retractingInfo = new HashMap<>(); - upstreamEpicStartOffsets = new TreeMap<>(); - } - - public void initializedState(KafkaPartitionSplit s) { - if (!(s instanceof LogKafkaPartitionSplit)) { - return; - } - LogKafkaPartitionSplit split = (LogKafkaPartitionSplit) s; - if (split.isRetracting()) { - retractingInfo.put( - split.getTopicPartition(), - EpicRetractingInfo.of( - split.getRetractingEpicNo(), split.getRetractingUpstreamId(), - split.getRetractStopOffset(), split.getRevertStartOffset())); - } - Map upStreamEpicStartOffsets = split.getUpStreamEpicStartOffsets(); - - upStreamEpicStartOffsets.forEach( - (upstreamEpic, offset) -> { - String key = - combineTopicPartitionAndUpstreamIdAndEpicNo(split.getTopicPartition(), upstreamEpic); - upstreamEpicStartOffsets.putIfAbsent(key, offset); - }); - } - - /** - * Turn row kind of a row. - * - *

-   * +I -> -D
-   * -D -> +I
-   * -U -> +U
-   * +U -> -U
-   * 
- * - * @param rowData Before reset row - * @return After reset row kind. - */ - public RowData turnRowKind(RowData rowData) { - switch (rowData.getRowKind()) { - case INSERT: - rowData.setRowKind(RowKind.DELETE); - break; - case DELETE: - rowData.setRowKind(RowKind.INSERT); - break; - case UPDATE_AFTER: - rowData.setRowKind(RowKind.UPDATE_BEFORE); - break; - case UPDATE_BEFORE: - rowData.setRowKind(RowKind.UPDATE_AFTER); - break; - default: - throw new FlinkRuntimeException("unKnown ChangeAction=" + rowData.getRowKind()); - } - LOG.debug("after retract a row, ChangeAction={}", rowData.getRowKind()); - return rowData; - } - - public Set getRetractTopicPartitions() { - return retractingInfo.keySet(); - } - - public EpicRetractingInfo getRetractInfo(TopicPartition topicPartition) { - EpicRetractingInfo info = retractingInfo.get(topicPartition); - if (info == null) { - throw new IllegalStateException( - String.format( - "the topic partition: %s, %d is not in retracting state", - topicPartition.topic(), topicPartition.partition())); - } - return info; - } - - public void suspendRetracting(TopicPartition tp) { - EpicRetractingInfo info = retractingInfo.remove(tp); - clearEpicStartOffsetsBeforeOrEqual(tp, info.upstreamId, info.epicNo); - } - - public void suspendRetracting(Collection tps) { - tps.forEach(this::suspendRetracting); - } - - /** - * clear the epic start offsets before or equal the epicNo in the topicPartition. 
- * - * @param tp - * @param upstreamId - * @param epicNo - */ - public void clearEpicStartOffsetsBeforeOrEqual( - TopicPartition tp, String upstreamId, long epicNo) { - String key = combineTopicPartitionAndUpstreamIdAndEpicNo(tp, upstreamId, epicNo); - NavigableMap beforeOrEqual = upstreamEpicStartOffsets.headMap(key, true); - - String prefix = combineTopicPartitionAndUpstreamId(tp, upstreamId); - for (String s : beforeOrEqual.keySet()) { - if (!s.contains(prefix)) { - continue; - } - upstreamEpicStartOffsets.remove(s); - } - } - - /** - * @param revertStartingOffset the offset where job revert to normal read starts from. It should - * skip the flip which has been read. - */ - public void startRetracting( - TopicPartition tp, String upstreamId, long epicNo, long revertStartingOffset) { - String key = combineTopicPartitionAndUpstreamIdAndEpicNo(tp, upstreamId, epicNo); - if (!upstreamEpicStartOffsets.containsKey(key)) { - // data have not been read, so that it's unnecessary to retract - return; - } - long retractStoppingOffset = upstreamEpicStartOffsets.get(key); - - retractingInfo.put( - tp, - new EpicRetractingInfo(epicNo, upstreamId, retractStoppingOffset, revertStartingOffset)); - } - - public void initialEpicStartOffsetIfEmpty( - TopicPartition tp, String upstreamId, long epicNo, long startOffset) { - String key = combineTopicPartitionAndUpstreamIdAndEpicNo(tp, upstreamId, epicNo); - upstreamEpicStartOffsets.putIfAbsent(key, startOffset); - } - - private String combineTopicPartitionAndUpstreamIdAndEpicNo( - TopicPartition tp, String upstreamId, long epicNo) { - return combineTopicPartitionAndUpstreamId(tp, upstreamId) + "_" + epicNo; - } - - private String combineTopicPartitionAndUpstreamIdAndEpicNo( - TopicPartition tp, String upstreamIdAndEpicNo) { - return combineTopicPartition(tp) + "_" + upstreamIdAndEpicNo; - } - - private String combineTopicPartitionAndUpstreamId(TopicPartition tp, String upstreamId) { - return combineTopicPartition(tp) + "_" + 
upstreamId; - } - - private String combineTopicPartition(TopicPartition tp) { - return tp.topic() + "_" + tp.partition(); - } - - public static boolean checkMagicNum(byte[] value) { - checkNotNull(value); - checkArgument(value.length >= 3); - return value[0] == MAGIC_NUMBER[0] - && value[1] == MAGIC_NUMBER[1] - && value[2] == MAGIC_NUMBER[2]; - } - - public static class EpicRetractingInfo implements Serializable { - private static final long serialVersionUID = 1L; - private final long epicNo; - private final String upstreamId; - private final long retractStoppingOffset; - private final long revertStartingOffset; - - public EpicRetractingInfo( - long epicNo, String upstreamId, long retractStoppingOffset, long revertStartingOffset) { - this.epicNo = epicNo; - this.upstreamId = upstreamId; - this.retractStoppingOffset = retractStoppingOffset; - this.revertStartingOffset = revertStartingOffset; - } - - private static EpicRetractingInfo of( - long epicNo, String upstreamId, long retractStopOffset, long revertStartOffset) { - return new EpicRetractingInfo(epicNo, upstreamId, retractStopOffset, revertStartOffset); - } - - public long getEpicNo() { - return epicNo; - } - - public String getUpstreamId() { - return upstreamId; - } - - public long getRetractStoppingOffset() { - return retractStoppingOffset; - } - - public long getRevertStartingOffset() { - return revertStartingOffset; - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplit.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplit.java deleted file mode 100644 index 06a6ddf415..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplit.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the 
Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.source.log.kafka; - -import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; - -import java.util.NavigableMap; - -public class LogKafkaPartitionSplit extends KafkaPartitionSplit { - - /** - * Denote reader is in retracting read mode. In this mode, data would be read in reverse order and - * opposite RowKind. - */ - private final boolean retracting; - /** The offset where job retract stops, i.e. Read reversely ends. */ - private final Long retractStopOffset; - /** - * The offset where job revert to normal read starts from. It should skip the flip which has been - * read. - */ - private final Long revertStartOffset; - /** - * The epic No. which has finished checkpoint. The data whose epic No. larger than it should be - * retracted. - */ - private final Long retractingEpicNo; - /** The upstream JobId which should be retracted. 
*/ - private final String retractingUpstreamId; - /** Key: upstream job id + "_" + epicNo Value: epic start offset */ - private final NavigableMap upStreamEpicStartOffsets; - - public boolean isRetracting() { - return retracting; - } - - public Long getRetractStopOffset() { - return retractStopOffset; - } - - public Long getRevertStartOffset() { - return revertStartOffset; - } - - public NavigableMap getUpStreamEpicStartOffsets() { - return upStreamEpicStartOffsets; - } - - public Long getRetractingEpicNo() { - return retractingEpicNo; - } - - public String getRetractingUpstreamId() { - return retractingUpstreamId; - } - - public LogKafkaPartitionSplit(LogKafkaPartitionSplitState splitState) { - super( - splitState.getTopicPartition(), - splitState.getCurrentOffset(), - splitState.getStoppingOffset().orElse(NO_STOPPING_OFFSET)); - retracting = splitState.isRetracting(); - retractStopOffset = splitState.getRetractStopOffset(); - revertStartOffset = splitState.getRevertStartOffset(); - upStreamEpicStartOffsets = splitState.getUpstreamEpicStartOffsets(); - retractingEpicNo = splitState.getRetractingEpicNo(); - retractingUpstreamId = splitState.getRetractingUpstreamId(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplitReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplitReader.java deleted file mode 100644 index f8ba4af61d..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplitReader.java +++ /dev/null @@ -1,443 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.source.log.kafka; - -import static org.apache.amoro.flink.read.source.log.LogSourceHelper.checkMagicNum; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.LOG_CONSUMER_CHANGELOG_MODE_APPEND_ONLY; - -import org.apache.amoro.flink.read.internals.KafkaPartitionSplitReader; -import org.apache.amoro.flink.read.source.log.LogSourceHelper; -import org.apache.amoro.flink.shuffle.LogRecordV1; -import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.amoro.log.LogData; -import org.apache.amoro.log.LogDataJsonDeserialization; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.kafka.source.metrics.KafkaSourceReaderMetrics; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.Schema; -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.consumer.ConsumerRecords; -import org.apache.kafka.common.TopicPartition; -import org.apache.kafka.common.errors.WakeupException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import 
java.time.Duration; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Properties; -import java.util.Set; - -/** - * This reader supports read log data in log-store. If {@link - * MixedFormatValidator#MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE} values true, reader would - * read data consistently with file-store. Some data would be written into log-store repeatedly if - * upstream job failovers several times, so it's necessary to retract these data to guarantee the - * consistency with file-store. - * - *
- * The data in log-store with Flip like: 1 2 3 4 5   6 7 8 9  Flip  6 7 8 9 10 11 12   13 14
- *                                       ckp-1     |ckp-2   |     | ckp-2            | ckp-3
- * The data reads like: 1 2 3 4 5 6 7 8 9 -9 -8 -7 -6 6 7 8 9 10 11 12 13 14
- *
- * The implementation of reading consistently lists below:
- * 1. read data normally {@link #readNormal()}
- *    - convert data to {@link LogRecordWithRetractInfo} in {@link #convertToLogRecord(ConsumerRecords)}. If it comes to
- *    Flip, the data would be cut.
- *    - save retracting info {@link LogSourceHelper.EpicRetractingInfo} in
- *    {@link LogSourceHelper#startRetracting(TopicPartition, String, long, long)}.
- *    - record the epic start offsets
- *    {@link LogSourceHelper#initialEpicStartOffsetIfEmpty(TopicPartition, String, long, long)} in
- *    - handle normal data like {@link KafkaPartitionSplitReader}
- * 2. read data reversely {@link #readReversely} if some topic partitions come into Flip,
- *  i.e. {@link LogSourceHelper#getRetractTopicPartitions()}
- *    - record the offsets that consumer's current positions, stoppingOffsetsFromConsumer.
- *    - reset consumer to the offset: current position - batchSize
- *    - poll data until stoppingOffsetsFromConsumer {@link #pollToDesignatedPositions}
- *    - locate the stop offset in the batch data {@link #findIndexOfOffset(List, long)}, and start from it to read
- *    reversely, stop at {@link LogSourceHelper.EpicRetractingInfo#getRetractStoppingOffset()}
- *    - suspend retract {@link LogSourceHelper#suspendRetracting(TopicPartition)} when it comes to
- *    {@link LogSourceHelper.EpicRetractingInfo#getRetractStoppingOffset()}, else repeat {@link #readReversely} in next
- *    {@link #fetch()}
- * 3. write offset and retract info into splitState in
- * {@link LogKafkaPartitionSplitState#updateState(LogRecordWithRetractInfo)}
- * 4. initialize state from state {@link LogSourceHelper#initializedState}
- * 
- */ -public class LogKafkaPartitionSplitReader extends KafkaPartitionSplitReader { - - private static final Logger LOG = LoggerFactory.getLogger(LogKafkaPartitionSplitReader.class); - - private final LogDataJsonDeserialization logDataJsonDeserialization; - private final LogSourceHelper logReadHelper; - private final boolean logRetractionEnable; - private final boolean logConsumerAppendOnly; - - public LogKafkaPartitionSplitReader( - Properties props, - SourceReaderContext context, - KafkaSourceReaderMetrics kafkaSourceReaderMetrics, - Schema schema, - boolean logRetractionEnable, - LogSourceHelper logReadHelper, - String logConsumerChangelogMode) { - super(props, context, kafkaSourceReaderMetrics); - - this.logDataJsonDeserialization = - new LogDataJsonDeserialization<>( - schema, LogRecordV1.factory, LogRecordV1.arrayFactory, LogRecordV1.mapFactory); - this.logRetractionEnable = logRetractionEnable; - this.logReadHelper = logReadHelper; - this.logConsumerAppendOnly = - LOG_CONSUMER_CHANGELOG_MODE_APPEND_ONLY.equalsIgnoreCase(logConsumerChangelogMode); - } - - public static int RETRACT_SIZE = 500; - public static long RETRACT_FETCH_MAX_ROUND = 5; - - @Override - public RecordsWithSplitIds> fetch() throws IOException { - KafkaPartitionSplitRecords recordsBySplits; - Set retractTps; - if (logRetractionEnable - && !(retractTps = logReadHelper.getRetractTopicPartitions()).isEmpty()) { - recordsBySplits = readReversely(retractTps); - } else { - recordsBySplits = readNormal(); - } - - return recordsBySplits; - } - - private KafkaPartitionSplitRecords readNormal() throws IOException { - ConsumerRecords consumerRecords; - try { - consumerRecords = consumer.poll(Duration.ofMillis(POLL_TIMEOUT)); - } catch (WakeupException | IllegalStateException e) { - // IllegalStateException will be thrown if the consumer is not assigned any partitions. - // This happens if all assigned partitions are invalid or empty (starting offset >= - // stopping offset). 
We just mark empty partitions as finished and return an empty - // record container, and this consumer will be closed by SplitFetcherManager. - KafkaPartitionSplitRecords recordsBySplits = - new KafkaPartitionSplitRecords(ConsumerRecords.empty(), kafkaSourceReaderMetrics); - markEmptySplitsAsFinished(recordsBySplits); - return recordsBySplits; - } - - ConsumerRecords logRecords = convertToLogRecord(consumerRecords); - KafkaPartitionSplitRecords recordsBySplits = - new KafkaPartitionSplitRecords(logRecords, kafkaSourceReaderMetrics); - - List finishedPartitions = new ArrayList<>(); - for (TopicPartition tp : logRecords.partitions()) { - long stoppingOffset = getStoppingOffset(tp); - final List> recordsFromPartition = logRecords.records(tp); - - if (recordsFromPartition.size() > 0) { - final ConsumerRecord lastRecord = - recordsFromPartition.get(recordsFromPartition.size() - 1); - - // After processing a record with offset of "stoppingOffset - 1", the split reader - // should not continue fetching because the record with stoppingOffset may not - // exist. Keep polling will just block forever. - if (lastRecord.offset() >= stoppingOffset - 1) { - recordsBySplits.setPartitionStoppingOffset(tp, stoppingOffset); - finishSplitAtRecord( - tp, stoppingOffset, lastRecord.offset(), finishedPartitions, recordsBySplits); - } - } - // Track this partition's record lag if it never appears before - kafkaSourceReaderMetrics.maybeAddRecordsLagMetric(consumer, tp); - } - - markEmptySplitsAsFinished(recordsBySplits); - - // Unassign the partitions that has finished. 
- if (!finishedPartitions.isEmpty()) { - finishedPartitions.forEach(kafkaSourceReaderMetrics::removeRecordsLagMetric); - unassignPartitions(finishedPartitions); - } - - // Update numBytesIn - kafkaSourceReaderMetrics.updateNumBytesInCounter(); - - return recordsBySplits; - } - - private ConsumerRecords convertToLogRecord( - ConsumerRecords consumerRecords) throws IOException { - Map>> records = new HashMap<>(); - - for (TopicPartition tp : consumerRecords.partitions()) { - List> rs = consumerRecords.records(tp); - List> recordsForSplit = new ArrayList<>(rs.size()); - records.put(tp, recordsForSplit); - - for (ConsumerRecord consumerRecord : rs) { - byte[] value = consumerRecord.value(); - boolean magicFormat = checkMagicNum(value); - if (!magicFormat) { - throw new UnsupportedOperationException( - "Can't deserialize mixed-format log queue message due to it does not contain magic number."); - } - - LogData logData = logDataJsonDeserialization.deserialize(value); - if (!logData.getFlip() && filterByRowKind(logData.getActualValue())) { - LOG.info( - "filter the rowData, because of logConsumerAppendOnly is true, and rowData={}.", - logData.getActualValue()); - continue; - } - - final long currentOffset = consumerRecord.offset(); - - if (logData.getFlip()) { - if (logRetractionEnable) { - logReadHelper.startRetracting( - tp, logData.getUpstreamId(), logData.getEpicNo(), currentOffset + 1); - break; - } else { - continue; - } - } - - if (logRetractionEnable) { - logReadHelper.initialEpicStartOffsetIfEmpty( - tp, logData.getUpstreamId(), logData.getEpicNo(), currentOffset); - } - recordsForSplit.add(LogRecordWithRetractInfo.of(consumerRecord, logData)); - } - } - return new ConsumerRecords<>(records); - } - - /** read reversely in retracting mode */ - private KafkaPartitionSplitRecords readReversely(Set retractTps) - throws IOException { - Set origin = consumer.assignment(); - consumer.assign(retractTps); - - // stop in current offsets, the msg in the offset would be read 
- Map stoppingOffsetsFromConsumer = new HashMap<>(); - for (TopicPartition tp : retractTps) { - // the next poll offset - long offset = consumer.position(tp); - stoppingOffsetsFromConsumer.put(tp, Math.max(0, offset - 1)); - long startFrom = Math.max(0, offset - RETRACT_SIZE); - LOG.info("consumer reset offset to: {}", startFrom); - consumer.seek(tp, startFrom); - } - Map>> records = - pollToDesignatedPositions(stoppingOffsetsFromConsumer); - - Map>> logRecords = new HashMap<>(); - - Set finishRetract = new HashSet<>(); - for (Map.Entry>> entry : - records.entrySet()) { - TopicPartition tp = entry.getKey(); - List> consumerRecords = entry.getValue(); - - List> recordsForSplit = - new ArrayList<>(consumerRecords.size()); - logRecords.put(tp, recordsForSplit); - - long stoppingOffsetFromConsumer = stoppingOffsetsFromConsumer.get(tp); - LogSourceHelper.EpicRetractingInfo retractingInfo = logReadHelper.getRetractInfo(tp); - // stoppingOffsetFromConsumer is the offset queried from consumer, it may be larger than flip - // offset because - // kafka poll batch records every time. - // revertStartingOffset is the offset after flip, so it should minus 2 to get the offset - // before flip. 
- long stoppingOffset = - Math.min(stoppingOffsetFromConsumer, retractingInfo.getRevertStartingOffset() - 2); - int startIndex = findIndexOfOffset(consumerRecords, stoppingOffset); - - for (int i = startIndex; i >= 0; i--) { - ConsumerRecord r = consumerRecords.get(i); - - if (r.offset() < retractingInfo.getRetractStoppingOffset()) { - finishRetract.add(tp); - break; - } - LogData logData = logDataJsonDeserialization.deserialize(r.value()); - - if (!Objects.equals(logData.getUpstreamId(), retractingInfo.getUpstreamId()) - || logData.getEpicNo() <= retractingInfo.getEpicNo()) { - LOG.debug( - "won't retract other job or the success ckp epic data, upstreamId: {}, epicNo: {}", - logData.getUpstreamId(), - logData.getEpicNo()); - } else { - RowData actualValue = logReadHelper.turnRowKind(logData.getActualValue()); - recordsForSplit.add( - LogRecordWithRetractInfo.ofRetract( - r, - retractingInfo.getRetractStoppingOffset(), - retractingInfo.getRevertStartingOffset(), - retractingInfo.getEpicNo(), - logData, - actualValue)); - } - - if (r.offset() == retractingInfo.getRetractStoppingOffset()) { - finishRetract.add(tp); - break; - } - } - } - - suspendRetracting(finishRetract); - consumer.assign(origin); - - return new KafkaPartitionSplitRecords( - new ConsumerRecords<>(logRecords), kafkaSourceReaderMetrics); - } - - private void suspendRetracting(Set finishRetract) { - revertConsumer(finishRetract); - logReadHelper.suspendRetracting(finishRetract); - } - - /** revert consumer to original offset after flip */ - public void revertConsumer(Set finishRetract) { - for (TopicPartition tp : finishRetract) { - LogSourceHelper.EpicRetractingInfo retractingInfo = logReadHelper.getRetractInfo(tp); - long revert = retractingInfo.getRevertStartingOffset(); - consumer.seek(tp, revert); - } - } - - /** - * @param records should be in order of kafka. 
- * @param offset Kafka offset - * @return the index in records - */ - private int findIndexOfOffset(List> records, long offset) { - int last = records.size() - 1; - int idx = Math.min(RETRACT_SIZE, last); - - long diff = -1; - while (idx >= 0 && idx <= last && (diff = records.get(idx).offset() - offset) != 0) { - if (diff > 0) { - idx--; - } else { - idx++; - } - } - if (diff == 0) { - LOG.debug("start index is: {}", idx); - return idx; - } - LOG.info( - "topic: {}, partition: {}, records' offset range: [{}, {}], need to find: {}", - records.get(0).topic(), - records.get(0).partition(), - records.get(0).offset(), - records.get(last).offset(), - offset); - throw new IllegalStateException("can not find offset in records"); - } - - /** - * @param stoppingOffsets the stopping offset is the position which should be read. - * @return value in map may contain some useless records. It should be filtered. - */ - private Map>> pollToDesignatedPositions( - Map stoppingOffsets) { - ConsumerRecords consumerRecords; - try { - consumerRecords = consumer.poll(Duration.ofMillis(POLL_TIMEOUT)); - } catch (WakeupException we) { - LOG.error("consume reversely error"); - return Collections.EMPTY_MAP; - } - - Map>> recordsForTps = new HashMap<>(); - - int unfinished = stoppingOffsets.size(); - int round = 0; - - Set unfinishedTps = new HashSet<>(); - while (unfinished > 0 && round++ < RETRACT_FETCH_MAX_ROUND) { - unfinishedTps.clear(); - - for (TopicPartition tp : consumerRecords.partitions()) { - recordsForTps.putIfAbsent(tp, new ArrayList<>(RETRACT_SIZE)); - List> records = recordsForTps.get(tp); - - records.addAll(consumerRecords.records(tp)); - - long stoppingOffset = stoppingOffsets.get(tp); - if (records.get(records.size() - 1).offset() >= stoppingOffset) { - unfinished--; - LOG.info( - "reach the stopping offset. stopping offset: {}, tp: {}. 
data size:{}", - stoppingOffset, - tp, - records.size()); - } else { - unfinishedTps.add(tp); - } - } - if (unfinished == 0) { - break; - } - consumer.assign(unfinishedTps); - } - - if (unfinished > 0) { - LOG.error("can not poll msg to designated positions. unfinished: {}", unfinishedTps); - for (TopicPartition tp : unfinishedTps) { - List> records = recordsForTps.get(tp); - LOG.info( - "tp: {}, polled offset:{}, stopping offset: {}", - tp, - records.get(records.size() - 1).offset(), - stoppingOffsets.get(tp)); - } - throw new UnsupportedOperationException("poll msg reversely error"); - } - - return recordsForTps; - } - - /** - * filter the rowData only works during {@link - * MixedFormatValidator#MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE} is false and {@link - * MixedFormatValidator#MIXED_FORMAT_CONSUMER_CHANGELOG_MODE} is {@link - * MixedFormatValidator#LOG_CONSUMER_CHANGELOG_MODE_APPEND_ONLY} and rowData.rowKind != INSERT - * - * @param rowData the judged data - * @return true means should be filtered. - */ - boolean filterByRowKind(RowData rowData) { - return !logRetractionEnable - && logConsumerAppendOnly - && !rowData.getRowKind().equals(RowKind.INSERT); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplitState.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplitState.java deleted file mode 100644 index 2e499d05f9..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaPartitionSplitState.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.source.log.kafka; - -import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; -import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplitState; -import org.apache.flink.table.data.RowData; - -import javax.annotation.Nullable; - -import java.util.NavigableMap; -import java.util.TreeMap; - -public class LogKafkaPartitionSplitState extends KafkaPartitionSplitState { - - /** - * Denote reader is in retracting read mode. In this mode, data would be read in reverse order and - * opposite RowKind. 
- */ - private boolean retracting; - /** @see LogKafkaPartitionSplit#retractStopOffset */ - @Nullable private Long retractStopOffset; - /** @see LogKafkaPartitionSplit#revertStartOffset */ - @Nullable private Long revertStartOffset; - /** @see LogKafkaPartitionSplit#retractingEpicNo */ - @Nullable private Long retractingEpicNo; - /** @see LogKafkaPartitionSplit#retractingUpstreamId */ - @Nullable private String retractingUpstreamId; - /** Key: upstream job id + "_" + epicNo, Value: epic start offset */ - private final NavigableMap upstreamEpicStartOffsets; - - public LogKafkaPartitionSplitState(KafkaPartitionSplit s) { - super(s); - - if (!(s instanceof LogKafkaPartitionSplit)) { - retracting = false; - upstreamEpicStartOffsets = new TreeMap<>(); - return; - } - LogKafkaPartitionSplit partitionSplit = (LogKafkaPartitionSplit) s; - upstreamEpicStartOffsets = partitionSplit.getUpStreamEpicStartOffsets(); - retracting = partitionSplit.isRetracting(); - revertStartOffset = partitionSplit.getRevertStartOffset(); - retractStopOffset = partitionSplit.getRetractStopOffset(); - retractingEpicNo = partitionSplit.getRetractingEpicNo(); - retractingUpstreamId = partitionSplit.getRetractingUpstreamId(); - } - - public void initEpicStartOffsetIfEmpty(String upstreamId, long epicNo, long offset) { - String key = combineUpstreamIdAndEpicNo(upstreamId, epicNo); - upstreamEpicStartOffsets.putIfAbsent(key, offset); - } - - public void updateState(LogRecordWithRetractInfo record) { - if (record.isRetracting()) { - setCurrentOffset(record.offset() - 1); - revertStartOffset = record.getRevertStartingOffset(); - retractStopOffset = record.getRetractStoppingOffset(); - retractingEpicNo = record.getRetractingEpicNo(); - retractingUpstreamId = record.getLogData().getUpstreamId(); - } else { - setCurrentOffset(record.offset() + 1); - } - initEpicStartOffsetIfEmpty( - record.getLogData().getUpstreamId(), record.getLogData().getEpicNo(), record.offset()); - - // todo: clear useless epic start 
offset in state - retracting = record.isRetracting(); - } - - public boolean isRetracting() { - return retracting; - } - - public Long getRetractStopOffset() { - return retractStopOffset; - } - - public Long getRevertStartOffset() { - return revertStartOffset; - } - - public NavigableMap getUpstreamEpicStartOffsets() { - return upstreamEpicStartOffsets; - } - - public Long getRetractingEpicNo() { - return retractingEpicNo; - } - - public String getRetractingUpstreamId() { - return retractingUpstreamId; - } - - private String combineUpstreamIdAndEpicNo(String upstreamId, long epicNo) { - return upstreamId + "_" + epicNo; - } - - public LogKafkaPartitionSplit toLogKafkaPartitionSplit() { - return new LogKafkaPartitionSplit(this); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaRecordEmitter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaRecordEmitter.java deleted file mode 100644 index b4b2f9628b..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaRecordEmitter.java +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.source.log.kafka; - -import org.apache.flink.api.connector.source.SourceOutput; -import org.apache.flink.connector.kafka.source.reader.KafkaRecordEmitter; -import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema; -import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplitState; -import org.apache.flink.table.data.RowData; -import org.apache.kafka.clients.consumer.ConsumerRecord; - -public class LogKafkaRecordEmitter extends KafkaRecordEmitter { - - public LogKafkaRecordEmitter(KafkaRecordDeserializationSchema deserializationSchema) { - super(deserializationSchema); - } - - @Override - public void emitRecord( - ConsumerRecord consumerRecord, - SourceOutput output, - KafkaPartitionSplitState splitState) - throws Exception { - LogRecordWithRetractInfo element = (LogRecordWithRetractInfo) consumerRecord; - output.collect(element.getActualValue(), element.timestamp()); - ((LogKafkaPartitionSplitState) splitState).updateState(element); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSource.java deleted file mode 100644 index b3854f0743..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSource.java +++ 
/dev/null @@ -1,161 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.source.log.kafka; - -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_LOG_CONSUMER_CHANGELOG_MODE; - -import org.apache.amoro.flink.read.internals.KafkaSource; -import org.apache.amoro.flink.read.internals.KafkaSourceFetcherManager; -import org.apache.amoro.flink.read.source.log.LogSourceHelper; -import org.apache.amoro.flink.util.CompatibleFlinkPropertyUtil; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.connector.source.Boundedness; -import org.apache.flink.api.connector.source.SourceReader; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.synchronization.FutureCompletingBlockingQueue; -import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer; -import 
org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriber; -import org.apache.flink.connector.kafka.source.metrics.KafkaSourceReaderMetrics; -import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema; -import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.kafka.clients.consumer.ConsumerRecord; - -import javax.annotation.Nullable; - -import java.util.Map; -import java.util.Properties; -import java.util.function.Supplier; - -/** - * The Source implementation of LogKafka. - * - *
{@code
- * LogKafkaSource source = LogKafkaSource.builder(mixedFormatSchema, configuration)
- *    .setTopics(Arrays.asList(TOPIC1))
- *    .setStartingOffsets(OffsetsInitializer.earliest())
- *    .setProperties(properties)
- *    .build();
- * }
- * - *

See {@link LogKafkaSourceBuilder} for more details. - */ -public class LogKafkaSource extends KafkaSource { - private static final long serialVersionUID = 1L; - - /** read schema, only contains the selected fields */ - private final Schema schema; - - private final boolean logRetractionEnable; - private final String logConsumerChangelogMode; - - LogKafkaSource( - KafkaSubscriber subscriber, - OffsetsInitializer startingOffsetsInitializer, - @Nullable OffsetsInitializer stoppingOffsetsInitializer, - Boundedness boundedness, - KafkaRecordDeserializationSchema deserializationSchema, - Properties props, - Schema schema, - Map tableProperties) { - super( - subscriber, - startingOffsetsInitializer, - stoppingOffsetsInitializer, - boundedness, - deserializationSchema, - props); - this.schema = schema; - logRetractionEnable = - CompatibleFlinkPropertyUtil.propertyAsBoolean( - tableProperties, - MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.key(), - MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.defaultValue()); - logConsumerChangelogMode = - CompatibleFlinkPropertyUtil.propertyAsString( - tableProperties, - MIXED_FORMAT_LOG_CONSUMER_CHANGELOG_MODE.key(), - MIXED_FORMAT_LOG_CONSUMER_CHANGELOG_MODE.defaultValue()); - } - - /** - * Get a logKafkaSourceBuilder to build a {@link LogKafkaSource}. - * - * @return a Log Kafka source builder. - */ - public static LogKafkaSourceBuilder builder(Schema schema, Map tableProperties) { - return new LogKafkaSourceBuilder(schema, tableProperties); - } - - @Override - public SourceReader createReader( - SourceReaderContext readerContext) { - FutureCompletingBlockingQueue>> - elementsQueue = new FutureCompletingBlockingQueue<>(); - LogSourceHelper logReadHelper = logRetractionEnable ? 
new LogSourceHelper() : null; - - final KafkaSourceReaderMetrics kafkaSourceReaderMetrics = - new KafkaSourceReaderMetrics(readerContext.metricGroup()); - Supplier splitReaderSupplier = - () -> - new LogKafkaPartitionSplitReader( - props, - readerContext, - kafkaSourceReaderMetrics, - schema, - logRetractionEnable, - logReadHelper, - logConsumerChangelogMode); - LogKafkaRecordEmitter recordEmitter = new LogKafkaRecordEmitter(null); - - return new LogKafkaSourceReader<>( - elementsQueue, - new KafkaSourceFetcherManager( - elementsQueue, - splitReaderSupplier::get, - (ignore) -> {}, - readerContext.getConfiguration()), - recordEmitter, - toConfiguration(props), - readerContext, - kafkaSourceReaderMetrics, - logReadHelper); - } - - @Override - public TypeInformation getProducedType() { - RowType rowType = FlinkSchemaUtil.convert(schema); - return InternalTypeInfo.of(rowType); - } - - // ----------- private helper methods --------------- - - private Configuration toConfiguration(Properties props) { - Configuration config = new Configuration(); - props.stringPropertyNames().forEach(key -> config.setString(key, props.getProperty(key))); - return config; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSourceBuilder.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSourceBuilder.java deleted file mode 100644 index 2956965ea4..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSourceBuilder.java +++ /dev/null @@ -1,578 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.source.log.kafka; - -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE_EARLIEST; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE_GROUP_OFFSETS; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE_LATEST; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE_SPECIFIC_OFFSETS; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE_TIMESTAMP; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_SPECIFIC_OFFSETS; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_TIMESTAMP_MILLIS; -import static org.apache.amoro.flink.util.CompatibleFlinkPropertyUtil.fetchLogstorePrefixProperties; -import static org.apache.amoro.flink.util.CompatibleFlinkPropertyUtil.getLogTopic; -import static org.apache.amoro.table.TableProperties.LOG_STORE_ADDRESS; -import static org.apache.amoro.table.TableProperties.LOG_STORE_MESSAGE_TOPIC; -import static org.apache.flink.util.Preconditions.checkNotNull; -import static org.apache.kafka.clients.CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG; - -import 
org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.amoro.table.TableProperties; -import org.apache.amoro.utils.CompatiblePropertyUtil; -import org.apache.flink.api.connector.source.Boundedness; -import org.apache.flink.connector.kafka.source.KafkaSource; -import org.apache.flink.connector.kafka.source.KafkaSourceBuilder; -import org.apache.flink.connector.kafka.source.KafkaSourceOptions; -import org.apache.flink.connector.kafka.source.enumerator.initializer.NoStoppingOffsetsInitializer; -import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer; -import org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriber; -import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema; -import org.apache.flink.table.api.ValidationException; -import org.apache.flink.table.data.RowData; -import org.apache.flink.util.Preconditions; -import org.apache.iceberg.Schema; -import org.apache.kafka.clients.consumer.ConsumerConfig; -import org.apache.kafka.common.TopicPartition; -import org.apache.kafka.common.serialization.ByteArrayDeserializer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.Random; -import java.util.Set; -import java.util.regex.Pattern; - -/** - * The @builder class for {@link LogKafkaSource} to make it easier for the users to construct a - * {@link LogKafkaSource}. - * - *

{@code
- * LogKafkaSource source = LogKafkaSource.builder(mixedFormatSchema, configuration)
- *    .setTopics(Arrays.asList(TOPIC1))
- *    .setStartingOffsets(OffsetsInitializer.earliest())
- *    .setProperties(properties)
- *    .build();
- * }
- */ -public class LogKafkaSourceBuilder { - private static final Logger LOG = LoggerFactory.getLogger(KafkaSourceBuilder.class); - private static final String[] REQUIRED_CONFIGS = { - ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, ConsumerConfig.GROUP_ID_CONFIG - }; - private static final String PARTITION = "partition"; - private static final String OFFSET = "offset"; - // The subscriber specifies the partitions to subscribe to. - private KafkaSubscriber subscriber; - // Users can specify the starting / stopping offset initializer. - private OffsetsInitializer startingOffsetsInitializer; - private OffsetsInitializer stoppingOffsetsInitializer; - // Boundedness - private Boundedness boundedness; - private KafkaRecordDeserializationSchema deserializationSchema; - // The configurations. - protected Properties kafkaProperties; - - private final Schema schema; - private final Map tableProperties; - - /** - * @param schema read schema, only contains the selected fields - * @param tableProperties mixed-format table properties, maybe include Flink SQL hints. - */ - LogKafkaSourceBuilder(Schema schema, Map tableProperties) { - this.subscriber = null; - this.startingOffsetsInitializer = OffsetsInitializer.earliest(); - this.stoppingOffsetsInitializer = new NoStoppingOffsetsInitializer(); - this.boundedness = Boundedness.CONTINUOUS_UNBOUNDED; - this.deserializationSchema = null; - this.kafkaProperties = fetchLogstorePrefixProperties(tableProperties); - this.schema = schema; - this.tableProperties = tableProperties; - setupKafkaProperties(); - } - - /** - * Sets the bootstrap servers for the KafkaConsumer of the LogKafkaSource. - * - * @param bootstrapServers the bootstrap servers of the Kafka cluster. - * @return this LogKafkaSourceBuilder. - */ - public LogKafkaSourceBuilder setBootstrapServers(String bootstrapServers) { - return setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); - } - - /** - * Sets the consumer group id of the LogKafkaSource. 
- * - * @param groupId the group id of the LogKafkaSource. - * @return this LogKafkaSourceBuilder. - */ - public LogKafkaSourceBuilder setGroupId(String groupId) { - return setProperty(ConsumerConfig.GROUP_ID_CONFIG, groupId); - } - - /** - * Set a list of topics the LogKafkaSource should consume from. All the topics in the list should - * have existed in the Kafka cluster. Otherwise an exception will be thrown. To allow some of the - * topics to be created lazily, please use {@link #setTopicPattern(Pattern)} instead. - */ - public LogKafkaSourceBuilder setTopics(List topics) { - ensureSubscriberIsNull("topics"); - subscriber = KafkaSubscriber.getTopicListSubscriber(topics); - return this; - } - - /** - * Set a list of topics the LogKafkaSource should consume from. All the topics in the list should - * have existed in the Kafka cluster. Otherwise an exception will be thrown. To allow some of the - * topics to be created lazily, please use {@link #setTopicPattern(Pattern)} instead. - * - * @param topics the list of topics to consume from. - * @return this LogKafkaSourceBuilder. - */ - public LogKafkaSourceBuilder setTopics(String... topics) { - return setTopics(Arrays.asList(topics)); - } - - /** - * Set a topic pattern to consume from use the java {@link Pattern}. - * - * @param topicPattern the pattern of the topic name to consume from. - * @return this LogKafkaSourceBuilder. - */ - public LogKafkaSourceBuilder setTopicPattern(Pattern topicPattern) { - ensureSubscriberIsNull("topic pattern"); - subscriber = KafkaSubscriber.getTopicPatternSubscriber(topicPattern); - return this; - } - - /** - * Set a set of partitions to consume from. - * - * @param partitions the set of partitions to consume from. - * @return this LogKafkaSourceBuilder. 
- */ - public LogKafkaSourceBuilder setPartitions(Set partitions) { - ensureSubscriberIsNull("partitions"); - subscriber = KafkaSubscriber.getPartitionSetSubscriber(partitions); - return this; - } - - /** - * Specify from which offsets the LogKafkaSource should start consume from by providing an {@link - * OffsetsInitializer}. - * - *

The following {@link OffsetsInitializer}s are commonly used and provided out of the box. - * Users can also implement their own {@link OffsetsInitializer} for custom behaviors. - * - *

    - *
  • {@link OffsetsInitializer#earliest()} - starting from the earliest offsets. This is also - * the default {@link OffsetsInitializer} of the KafkaSource for starting offsets. - *
  • {@link OffsetsInitializer#latest()} - starting from the latest offsets. - *
  • {@link OffsetsInitializer#committedOffsets()} - starting from the committed offsets of - * the consumer group. - *
  • {@link - * OffsetsInitializer#committedOffsets(org.apache.kafka.clients.consumer.OffsetResetStrategy)} - * - starting from the committed offsets of the consumer group. If there is no committed - * offsets, starting from the offsets specified by the {@link - * org.apache.kafka.clients.consumer.OffsetResetStrategy OffsetResetStrategy}. - *
  • {@link OffsetsInitializer#offsets(Map)} - starting from the specified offsets for each - * partition. - *
  • {@link OffsetsInitializer#timestamp(long)} - starting from the specified timestamp for - * each partition. Note that the guarantee here is that all the records in Kafka whose - * {@link org.apache.kafka.clients.consumer.ConsumerRecord#timestamp()} is greater than the - * given starting timestamp will be consumed. However, it is possible that some consumer - * records whose timestamp is smaller than the given starting timestamp are also consumed. - *
- * - * @param startingOffsetsInitializer the {@link OffsetsInitializer} setting the starting offsets - * for the Source. - * @return this LogKafkaSourceBuilder. - */ - public LogKafkaSourceBuilder setStartingOffsets(OffsetsInitializer startingOffsetsInitializer) { - this.startingOffsetsInitializer = startingOffsetsInitializer; - LOG.info("Setting LogKafkaSource starting offset: {}", startingOffsetsInitializer); - return this; - } - - /** - * By default the LogKafkaSource is set to run in {@link Boundedness#CONTINUOUS_UNBOUNDED} manner - * and thus never stops until the Flink job fails or is canceled. To let the KafkaSource run as a - * streaming source but still stops at some point, one can set an {@link OffsetsInitializer} to - * specify the stopping offsets for each partition. When all the partitions have reached their - * stopping offsets, the KafkaSource will then exit. - * - *

This method is different from {@link #setBounded(OffsetsInitializer)} that after setting the - * stopping offsets with this method, {@link KafkaSource#getBoundedness()} will still return - * {@link Boundedness#CONTINUOUS_UNBOUNDED} even though it will stop at the stopping offsets - * specified by the stopping offsets {@link OffsetsInitializer}. - * - *

The following {@link OffsetsInitializer} are commonly used and provided out of the box. - * Users can also implement their own {@link OffsetsInitializer} for custom behaviors. - * - *

    - *
  • {@link OffsetsInitializer#latest()} - stop at the latest offsets of the partitions when - * the KafkaSource starts to run. - *
  • {@link OffsetsInitializer#committedOffsets()} - stops at the committed offsets of the - * consumer group. - *
  • {@link OffsetsInitializer#offsets(Map)} - stops at the specified offsets for each - * partition. - *
  • {@link OffsetsInitializer#timestamp(long)} - stops at the specified timestamp for each - * partition. The guarantee of setting the stopping timestamp is that no Kafka records whose - * {@link org.apache.kafka.clients.consumer.ConsumerRecord#timestamp()} is greater than the - * given stopping timestamp will be consumed. However, it is possible that some records - * whose timestamp is smaller than the specified stopping timestamp are not consumed. - *
- * - * @param stoppingOffsetsInitializer The {@link OffsetsInitializer} to specify the stopping - * offset. - * @return this LogKafkaSourceBuilder. - * @see #setBounded(OffsetsInitializer) - */ - public LogKafkaSourceBuilder setUnbounded(OffsetsInitializer stoppingOffsetsInitializer) { - this.boundedness = Boundedness.CONTINUOUS_UNBOUNDED; - this.stoppingOffsetsInitializer = stoppingOffsetsInitializer; - return this; - } - - /** - * By default the LogKafkaSource is set to run in {@link Boundedness#CONTINUOUS_UNBOUNDED} manner - * and thus never stops until the Flink job fails or is canceled. To let the KafkaSource run in - * {@link Boundedness#BOUNDED} manner and stops at some point, one can set an {@link - * OffsetsInitializer} to specify the stopping offsets for each partition. When all the partitions - * have reached their stopping offsets, the KafkaSource will then exit. - * - *

This method is different from {@link #setUnbounded(OffsetsInitializer)} that after setting - * the stopping offsets with this method, {@link KafkaSource#getBoundedness()} will return {@link - * Boundedness#BOUNDED} instead of {@link Boundedness#CONTINUOUS_UNBOUNDED}. - * - *

The following {@link OffsetsInitializer} are commonly used and provided out of the box. - * Users can also implement their own {@link OffsetsInitializer} for custom behaviors. - * - *

    - *
  • {@link OffsetsInitializer#latest()} - stop at the latest offsets of the partitions when - * the KafkaSource starts to run. - *
  • {@link OffsetsInitializer#committedOffsets()} - stops at the committed offsets of the - * consumer group. - *
  • {@link OffsetsInitializer#offsets(Map)} - stops at the specified offsets for each - * partition. - *
  • {@link OffsetsInitializer#timestamp(long)} - stops at the specified timestamp for each - * partition. The guarantee of setting the stopping timestamp is that no Kafka records whose - * {@link org.apache.kafka.clients.consumer.ConsumerRecord#timestamp()} is greater than the - * given stopping timestamp will be consumed. However, it is possible that some records - * whose timestamp is smaller than the specified stopping timestamp are not consumed. - *
- * - * @param stoppingOffsetsInitializer the {@link OffsetsInitializer} to specify the stopping - * offsets. - * @return this LogKafkaSourceBuilder. - * @see #setUnbounded(OffsetsInitializer) - */ - public LogKafkaSourceBuilder setBounded(OffsetsInitializer stoppingOffsetsInitializer) { - this.boundedness = Boundedness.BOUNDED; - this.stoppingOffsetsInitializer = stoppingOffsetsInitializer; - return this; - } - - /** - * Sets the {@link KafkaRecordDeserializationSchema deserializer} of the {@link - * org.apache.kafka.clients.consumer.ConsumerRecord ConsumerRecord} for LogKafkaSource. - * - * @param recordDeserializer the deserializer for Kafka {@link - * org.apache.kafka.clients.consumer.ConsumerRecord ConsumerRecord}. - * @return this LogKafkaSourceBuilder. - */ - public LogKafkaSourceBuilder setDeserializer( - KafkaRecordDeserializationSchema recordDeserializer) { - this.deserializationSchema = recordDeserializer; - return this; - } - - /** - * Sets the client id prefix of this LogKafkaSource. - * - * @param prefix the client id prefix to use for this LogKafkaSource. - * @return this LogKafkaSourceBuilder. - */ - public LogKafkaSourceBuilder setClientIdPrefix(String prefix) { - return setProperty(KafkaSourceOptions.CLIENT_ID_PREFIX.key(), prefix); - } - - /** - * Set an arbitrary property for the LogKafkaSource and LogKafkaConsumer. The valid keys can be - * found in {@link ConsumerConfig} and {@link KafkaSourceOptions}. - * - *

Note that the following keys will be overridden by the builder when the KafkaSource is - * created. - * - *

    - *
  • key.deserializer is always set to {@link ByteArrayDeserializer}. - *
  • value.deserializer is always set to {@link ByteArrayDeserializer}. - *
  • auto.offset.reset.strategy is overridden by {@link - * OffsetsInitializer#getAutoOffsetResetStrategy()} for the starting offsets, which is by - * default {@link OffsetsInitializer#earliest()}. - *
  • partition.discovery.interval.ms is overridden to -1 when {@link - * #setBounded(OffsetsInitializer)} has been invoked. - *
- * - * @param key the key of the property. - * @param value the value of the property. - * @return this LogKafkaSourceBuilder. - */ - public LogKafkaSourceBuilder setProperty(String key, String value) { - kafkaProperties.setProperty(key, value); - return this; - } - - /** - * Set arbitrary properties for the LogKafkaSource and LogKafkaConsumer. The valid keys can be - * found in {@link ConsumerConfig} and {@link KafkaSourceOptions}. - * - *

Note that the following keys will be overridden by the builder when the KafkaSource is - * created. - * - *

    - *
  • key.deserializer is always set to {@link ByteArrayDeserializer}. - *
  • value.deserializer is always set to {@link ByteArrayDeserializer}. - *
  • auto.offset.reset.strategy is overridden by {@link - * OffsetsInitializer#getAutoOffsetResetStrategy()} for the starting offsets, which is by - * default {@link OffsetsInitializer#earliest()}. - *
  • partition.discovery.interval.ms is overridden to -1 when {@link - * #setBounded(OffsetsInitializer)} has been invoked. - *
  • client.id is overridden to the "client.id.prefix-RANDOM_LONG", or - * "group.id-RANDOM_LONG" if the client id prefix is not set. - *
- * - * @param props the properties to set for the LogKafkaSource. - * @return this LogKafkaSourceBuilder. - */ - public LogKafkaSourceBuilder setProperties(Properties props) { - this.kafkaProperties.putAll(props); - return this; - } - - /** - * Build the {@link LogKafkaSource}. - * - * @return a LogKafkaSource with the settings made for this builder. - */ - public LogKafkaSource build() { - sanityCheck(); - parseAndSetRequiredProperties(); - return new LogKafkaSource( - subscriber, - startingOffsetsInitializer, - stoppingOffsetsInitializer, - boundedness, - deserializationSchema, - kafkaProperties, - schema, - tableProperties); - } - - private void setupKafkaProperties() { - if (tableProperties.containsKey(TableProperties.LOG_STORE_ADDRESS)) { - kafkaProperties.put( - BOOTSTRAP_SERVERS_CONFIG, tableProperties.get(TableProperties.LOG_STORE_ADDRESS)); - } - if (tableProperties.containsKey(TableProperties.LOG_STORE_MESSAGE_TOPIC)) { - setTopics(getLogTopic(tableProperties)); - } - - kafkaProperties.putIfAbsent( - "properties.key.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer"); - kafkaProperties.putIfAbsent( - "properties.value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer"); - kafkaProperties.putIfAbsent( - "properties.key.deserializer", - "org.apache.kafka.common.serialization.ByteArrayDeserializer"); - kafkaProperties.putIfAbsent( - "properties.value.deserializer", - "org.apache.kafka.common.serialization.ByteArrayDeserializer"); - - setupStartupMode(); - } - - private void setupStartupMode() { - String startupMode = - CompatiblePropertyUtil.propertyAsString( - tableProperties, SCAN_STARTUP_MODE.key(), SCAN_STARTUP_MODE.defaultValue()) - .toLowerCase(); - - switch (startupMode) { - case SCAN_STARTUP_MODE_EARLIEST: - setStartingOffsets(OffsetsInitializer.earliest()); - break; - case SCAN_STARTUP_MODE_LATEST: - setStartingOffsets(OffsetsInitializer.latest()); - break; - case SCAN_STARTUP_MODE_TIMESTAMP: - long 
startupTimestampMillis = - Long.parseLong( - Preconditions.checkNotNull( - tableProperties.get(SCAN_STARTUP_TIMESTAMP_MILLIS.key()), - String.format( - "'%s' should be set in '%s' mode", - SCAN_STARTUP_TIMESTAMP_MILLIS.key(), SCAN_STARTUP_MODE_TIMESTAMP))); - setStartingOffsets(OffsetsInitializer.timestamp(startupTimestampMillis)); - break; - case SCAN_STARTUP_MODE_GROUP_OFFSETS: - setStartingOffsets(OffsetsInitializer.committedOffsets()); - break; - case SCAN_STARTUP_MODE_SPECIFIC_OFFSETS: - Map specificOffsets = new HashMap<>(); - String specificOffsetsStrOpt = - Preconditions.checkNotNull( - tableProperties.get(SCAN_STARTUP_SPECIFIC_OFFSETS.key()), - String.format( - "'%s' should be set in '%s' mode", - SCAN_STARTUP_SPECIFIC_OFFSETS.key(), SCAN_STARTUP_MODE_SPECIFIC_OFFSETS)); - final Map offsetMap = - parseSpecificOffsets(specificOffsetsStrOpt, SCAN_STARTUP_SPECIFIC_OFFSETS.key()); - offsetMap.forEach( - (partition, offset) -> { - final TopicPartition topicPartition = - new TopicPartition(getLogTopic(tableProperties).get(0), partition); - specificOffsets.put(topicPartition, offset); - }); - setStartingOffsets(OffsetsInitializer.offsets(specificOffsets)); - break; - default: - throw new ValidationException( - String.format( - "%s only support '%s', '%s', '%s', '%s', '%s'. 
But input is '%s'", - MixedFormatValidator.SCAN_STARTUP_MODE, - SCAN_STARTUP_MODE_LATEST, - SCAN_STARTUP_MODE_EARLIEST, - SCAN_STARTUP_MODE_TIMESTAMP, - SCAN_STARTUP_MODE_GROUP_OFFSETS, - SCAN_STARTUP_MODE_SPECIFIC_OFFSETS, - startupMode)); - } - } - - // ------------- private helpers -------------- - - private void ensureSubscriberIsNull(String attemptingSubscribeMode) { - if (subscriber != null) { - throw new IllegalStateException( - String.format( - "Cannot use %s for consumption because a %s is already set for consumption.", - attemptingSubscribeMode, subscriber.getClass().getSimpleName())); - } - } - - private void parseAndSetRequiredProperties() { - maybeOverride( - ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName(), true); - maybeOverride( - ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, - ByteArrayDeserializer.class.getName(), - true); - maybeOverride(ConsumerConfig.GROUP_ID_CONFIG, "KafkaSource-" + new Random().nextLong(), false); - maybeOverride(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false", false); - maybeOverride( - ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, - startingOffsetsInitializer.getAutoOffsetResetStrategy().name().toLowerCase(), - true); - - // If the source is bounded, do not run periodic partition discovery. - maybeOverride( - KafkaSourceOptions.PARTITION_DISCOVERY_INTERVAL_MS.key(), - "-1", - boundedness == Boundedness.BOUNDED); - - // If the client id prefix is not set, reuse the consumer group id as the client id prefix. 
- maybeOverride( - KafkaSourceOptions.CLIENT_ID_PREFIX.key(), - kafkaProperties.getProperty(ConsumerConfig.GROUP_ID_CONFIG), - false); - } - - private boolean maybeOverride(String key, String value, boolean override) { - boolean overridden = false; - String userValue = kafkaProperties.getProperty(key); - if (userValue != null) { - if (override) { - LOG.warn( - String.format( - "Property %s is provided but will be overridden from %s to %s", - key, userValue, value)); - kafkaProperties.setProperty(key, value); - overridden = true; - } - } else { - kafkaProperties.setProperty(key, value); - } - return overridden; - } - - private void sanityCheck() { - // Check required configs. - checkNotNull( - kafkaProperties.getProperty(BOOTSTRAP_SERVERS_CONFIG), - String.format("Property %s is required but not provided", LOG_STORE_ADDRESS)); - // Check required settings. - checkNotNull( - subscriber, - String.format("No topic is specified, '%s' should be set.", LOG_STORE_MESSAGE_TOPIC)); - } - - public static Map parseSpecificOffsets( - String specificOffsetsStr, String optionKey) { - final Map offsetMap = new HashMap<>(); - final String[] pairs = specificOffsetsStr.split(";"); - final String validationExceptionMessage = - String.format( - "Invalid properties '%s' should follow the format " - + "'partition:0,offset:42;partition:1,offset:300', but is '%s'.", - optionKey, specificOffsetsStr); - - if (pairs.length == 0) { - throw new ValidationException(validationExceptionMessage); - } - - for (String pair : pairs) { - if (null == pair || pair.length() == 0 || !pair.contains(",")) { - throw new ValidationException(validationExceptionMessage); - } - - final String[] kv = pair.split(","); - if (kv.length != 2 || !kv[0].startsWith(PARTITION + ':') || !kv[1].startsWith(OFFSET + ':')) { - throw new ValidationException(validationExceptionMessage); - } - - String partitionValue = kv[0].substring(kv[0].indexOf(":") + 1); - String offsetValue = kv[1].substring(kv[1].indexOf(":") + 1); - try { 
- final Integer partition = Integer.valueOf(partitionValue); - final Long offset = Long.valueOf(offsetValue); - offsetMap.put(partition, offset); - } catch (NumberFormatException e) { - throw new ValidationException(validationExceptionMessage, e); - } - } - return offsetMap; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSourceReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSourceReader.java deleted file mode 100644 index f49250b8ff..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogKafkaSourceReader.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.source.log.kafka; - -import org.apache.amoro.flink.read.internals.KafkaSourceFetcherManager; -import org.apache.amoro.flink.read.internals.KafkaSourceReader; -import org.apache.amoro.flink.read.source.log.LogSourceHelper; -import org.apache.flink.api.connector.source.SourceReaderContext; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.connector.base.source.reader.RecordEmitter; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.base.source.reader.synchronization.FutureCompletingBlockingQueue; -import org.apache.flink.connector.kafka.source.metrics.KafkaSourceReaderMetrics; -import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; -import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplitState; -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; - -/** The source reader for Kafka partitions. 
*/ -public class LogKafkaSourceReader extends KafkaSourceReader { - - private static final Logger LOG = LoggerFactory.getLogger(LogKafkaSourceReader.class); - - @Nullable private final LogSourceHelper logReadHelper; - - public LogKafkaSourceReader( - FutureCompletingBlockingQueue>> - elementsQueue, - KafkaSourceFetcherManager kafkaSourceFetcherManager, - RecordEmitter, T, KafkaPartitionSplitState> recordEmitter, - Configuration config, - SourceReaderContext context, - KafkaSourceReaderMetrics kafkaSourceReaderMetrics, - @Nullable LogSourceHelper logReadHelper) { - super( - elementsQueue, - kafkaSourceFetcherManager, - recordEmitter, - config, - context, - kafkaSourceReaderMetrics); - - this.logReadHelper = logReadHelper; - } - - @Override - protected KafkaPartitionSplitState initializedState(KafkaPartitionSplit split) { - if (logReadHelper != null) { - logReadHelper.initializedState(split); - } - return new LogKafkaPartitionSplitState(split); - } - - @Override - protected KafkaPartitionSplit toSplitType(String splitId, KafkaPartitionSplitState splitState) { - return ((LogKafkaPartitionSplitState) splitState).toLogKafkaPartitionSplit(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogRecordWithRetractInfo.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogRecordWithRetractInfo.java deleted file mode 100644 index 1f79bb9991..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/read/source/log/kafka/LogRecordWithRetractInfo.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.source.log.kafka; - -import org.apache.amoro.log.LogData; -import org.apache.kafka.clients.consumer.ConsumerRecord; - -public class LogRecordWithRetractInfo extends ConsumerRecord { - - /** - * Denote reader is in retracting read mode. In this mode, data would be read in reverse order and - * opposite RowKind. - */ - private final boolean retracting; - /** @see LogKafkaPartitionSplit#retractStopOffset */ - private final Long retractStoppingOffset; - /** @see LogKafkaPartitionSplit#revertStartOffset */ - private final Long revertStartingOffset; - /** @see LogKafkaPartitionSplit#retractingEpicNo */ - private final Long retractingEpicNo; - - private final LogData logData; - private final T actualValue; - - public LogRecordWithRetractInfo( - ConsumerRecord consumerRecord, - boolean retracting, - Long retractStoppingOffset, - Long revertStartingOffset, - Long retractingEpicNo, - LogData logData, - T actualValue) { - super( - consumerRecord.topic(), - consumerRecord.partition(), - consumerRecord.offset(), - consumerRecord.timestamp(), - consumerRecord.timestampType(), - consumerRecord.serializedKeySize(), - consumerRecord.serializedValueSize(), - consumerRecord.key(), - consumerRecord.value(), - consumerRecord.headers(), - consumerRecord.leaderEpoch()); - this.retracting = retracting; - this.retractStoppingOffset = retractStoppingOffset; - 
this.revertStartingOffset = revertStartingOffset; - this.retractingEpicNo = retractingEpicNo; - this.logData = logData; - this.actualValue = actualValue; - } - - public static LogRecordWithRetractInfo ofRetract( - ConsumerRecord consumerRecord, - Long retractStoppingOffset, - Long revertStartingOffset, - Long retractingEpicNo, - LogData logData, - T actualValue) { - return new LogRecordWithRetractInfo<>( - consumerRecord, - true, - retractStoppingOffset, - revertStartingOffset, - retractingEpicNo, - logData, - actualValue); - } - - public static LogRecordWithRetractInfo of( - ConsumerRecord consumerRecord, LogData logData) { - return new LogRecordWithRetractInfo<>( - consumerRecord, false, null, null, null, logData, logData.getActualValue()); - } - - public boolean isRetracting() { - return retracting; - } - - public Long getRetractStoppingOffset() { - return retractStoppingOffset; - } - - public Long getRevertStartingOffset() { - return revertStartingOffset; - } - - public LogData getLogData() { - return logData; - } - - public Long getRetractingEpicNo() { - return retractingEpicNo; - } - - public T getActualValue() { - return actualValue; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ReadShuffleRulePolicy.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ReadShuffleRulePolicy.java deleted file mode 100644 index b0893e7df4..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ReadShuffleRulePolicy.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.shuffle; - -import static org.apache.flink.util.Preconditions.checkNotNull; - -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.amoro.table.DistributionHashMode; -import org.apache.flink.api.common.functions.Partitioner; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.table.data.RowData; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Objects; -import java.util.Random; - -/** Shuffle RowData with same key to same subtask, to make sure cdc data with same key in order. 
*/ -public class ReadShuffleRulePolicy implements ShuffleRulePolicy { - private static final Logger LOG = LoggerFactory.getLogger(ReadShuffleRulePolicy.class); - - private final ShuffleHelper helper; - - private final DistributionHashMode distributionHashMode; - - public ReadShuffleRulePolicy(ShuffleHelper helper) { - this( - helper, - DistributionHashMode.autoSelect(helper.isPrimaryKeyExist(), helper.isPartitionKeyExist())); - } - - public ReadShuffleRulePolicy(ShuffleHelper helper, DistributionHashMode distributionHashMode) { - this.helper = helper; - this.distributionHashMode = distributionHashMode; - Preconditions.checkArgument(distributionHashMode != DistributionHashMode.AUTO); - } - - @Override - public KeySelector generateKeySelector() { - return new PrimaryKeySelector(); - } - - @Override - public Partitioner generatePartitioner() { - return new RoundRobinPartitioner(distributionHashMode, helper); - } - - @Override - public DistributionHashMode getPolicyType() { - return distributionHashMode; - } - - /** return ShuffleKey */ - static class PrimaryKeySelector implements KeySelector { - @Override - public ShuffleKey getKey(RowData value) throws Exception { - return new ShuffleKey(value); - } - } - - /** Circular polling feed a streamRecord into a special factor node */ - static class RoundRobinPartitioner implements Partitioner { - private final ShuffleHelper helper; - private final DistributionHashMode distributionHashMode; - private Random random = null; - - RoundRobinPartitioner(DistributionHashMode distributionHashMode, ShuffleHelper helper) { - this.distributionHashMode = distributionHashMode; - this.helper = helper; - if (!distributionHashMode.isSupportPartition() - && !distributionHashMode.isSupportPrimaryKey()) { - random = new Random(); - } - } - - @Override - public int partition(ShuffleKey key, int numPartitions) { - if (helper != null) { - helper.open(); - } - checkNotNull(key); - RowData row = checkNotNull(key.getRow()); - - Integer pkHashCode = 
null; - if (distributionHashMode.isSupportPrimaryKey()) { - pkHashCode = helper.hashKeyValue(row); - } - // shuffle by mixed-format partition for partitioned table - Integer partitionHashCode = null; - if (distributionHashMode.isSupportPartition()) { - partitionHashCode = helper.hashPartitionValue(row); - } - if (pkHashCode != null && partitionHashCode != null) { - return Math.abs(Objects.hash(pkHashCode, partitionHashCode)) % numPartitions; - } else if (pkHashCode != null) { - return pkHashCode % numPartitions; - } else if (partitionHashCode != null) { - return partitionHashCode % numPartitions; - } else { - return random.nextInt(numPartitions); - } - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/RoundRobinShuffleRulePolicy.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/RoundRobinShuffleRulePolicy.java deleted file mode 100644 index 46a5ca26ef..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/RoundRobinShuffleRulePolicy.java +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.shuffle; - -import static org.apache.flink.util.Preconditions.checkArgument; -import static org.apache.flink.util.Preconditions.checkNotNull; - -import org.apache.amoro.data.DataTreeNode; -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.amoro.shade.guava32.com.google.common.collect.Sets; -import org.apache.amoro.table.DistributionHashMode; -import org.apache.flink.api.common.functions.Partitioner; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.table.data.RowData; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Objects; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -/** - * After the primary key value hash is modulated based on concurrency, the row is routed to - * different subtask write - * - *

- */ -public class RoundRobinShuffleRulePolicy implements ShuffleRulePolicy { - private static final Logger LOG = LoggerFactory.getLogger(RoundRobinShuffleRulePolicy.class); - - private final ShuffleHelper helper; - - private final int downStreamOperatorParallelism; - - private final int fileSplit; - - private int factor = -1; - - private Map> subtaskTreeNodes; - - private final DistributionHashMode distributionHashMode; - - public RoundRobinShuffleRulePolicy(int downStreamOperatorParallelism, int fileSplit) { - this(null, downStreamOperatorParallelism, fileSplit); - } - - public RoundRobinShuffleRulePolicy( - ShuffleHelper helper, int downStreamOperatorParallelism, int fileSplit) { - this( - helper, - downStreamOperatorParallelism, - fileSplit, - DistributionHashMode.autoSelect(helper.isPrimaryKeyExist(), helper.isPartitionKeyExist())); - } - - public RoundRobinShuffleRulePolicy( - ShuffleHelper helper, - int downStreamOperatorParallelism, - int fileSplit, - DistributionHashMode distributionHashMode) { - this.helper = helper; - this.downStreamOperatorParallelism = downStreamOperatorParallelism; - this.fileSplit = fileSplit; - this.distributionHashMode = distributionHashMode; - Preconditions.checkArgument(distributionHashMode != DistributionHashMode.NONE); - Preconditions.checkArgument(distributionHashMode != DistributionHashMode.AUTO); - } - - @Override - public KeySelector generateKeySelector() { - return new PrimaryKeySelector(); - } - - @Override - public Partitioner generatePartitioner() { - getSubtaskTreeNodes(); - return new RoundRobinPartitioner( - downStreamOperatorParallelism, factor, distributionHashMode, helper); - } - - @Override - public DistributionHashMode getPolicyType() { - return distributionHashMode; - } - - @Override - public Map> getSubtaskTreeNodes() { - if (this.subtaskTreeNodes == null) { - this.subtaskTreeNodes = initSubtaskFactorMap(this.downStreamOperatorParallelism); - return this.subtaskTreeNodes; - } - return this.subtaskTreeNodes; - 
} - - /** - * get factor sequence and writer subtask id mapping relationship Key:subtask id Value:treeNodes - * - * @return - */ - private Map> initSubtaskFactorMap(final int writerParallelism) { - Map> subtaskTreeNodes = new HashMap<>(writerParallelism); - if (distributionHashMode.isSupportPrimaryKey()) { - factor = fileSplit; - // every writer may accept all node data for partitioned table - if (distributionHashMode.isSupportPartition()) { - IntStream.range(0, writerParallelism) - .forEach( - subtaskId -> { - subtaskTreeNodes.put( - subtaskId, - IntStream.range(0, factor) - .mapToObj(index -> DataTreeNode.of(factor - 1, index)) - .collect(Collectors.toSet())); - }); - } else { - if (factor < writerParallelism) { - int actualDepth = getActualDepth(writerParallelism); - factor = (int) Math.pow(2, actualDepth - 1); - } - final int finalMask = factor - 1; - - IntStream.range(0, factor) - .forEach( - sequence -> { - int subtaskId = getSubtaskId(sequence, writerParallelism); - if (!subtaskTreeNodes.containsKey(subtaskId)) { - Set treeNodes = new HashSet<>(); - treeNodes.add(DataTreeNode.of(finalMask, sequence)); - subtaskTreeNodes.put(subtaskId, treeNodes); - } else { - subtaskTreeNodes.get(subtaskId).add(DataTreeNode.of(finalMask, sequence)); - } - }); - } - } else { - IntStream.range(0, writerParallelism) - .forEach( - subtaskId -> { - subtaskTreeNodes.put(subtaskId, Sets.newHashSet(DataTreeNode.of(0, 0))); - }); - } - subtaskTreeNodes.forEach( - (subtaskId, treeNodes) -> LOG.info("subtaskId={}, treeNodes={}.", subtaskId, treeNodes)); - return subtaskTreeNodes; - } - - private static int getActualDepth(int numPartitions) { - return (int) Math.ceil(Math.log(numPartitions) / Math.log(2)) + 1; - } - - private static int getSubtaskId(int sequence, int parallelism) { - return sequence % parallelism; - } - - /** return ShuffleKey */ - static class PrimaryKeySelector implements KeySelector { - @Override - public ShuffleKey getKey(RowData value) throws Exception { - return 
new ShuffleKey(value); - } - } - - /** Circular polling feed a streamRecord into a special factor node */ - static class RoundRobinPartitioner implements Partitioner { - private final int downStreamOperatorParallelism; - private final int factor; - private final ShuffleHelper helper; - private final DistributionHashMode distributionHashMode; - - RoundRobinPartitioner( - int downStreamOperatorParallelism, - int factor, - DistributionHashMode distributionHashMode, - ShuffleHelper helper) { - this.downStreamOperatorParallelism = downStreamOperatorParallelism; - this.factor = factor; - this.distributionHashMode = distributionHashMode; - this.helper = helper; - } - - @Override - public int partition(ShuffleKey key, int numPartitions) { - if (helper != null) { - helper.open(); - } - checkNotNull(key); - RowData row = checkNotNull(key.getRow()); - - checkArgument( - numPartitions == this.downStreamOperatorParallelism, - String.format( - "shuffle mixed-format record numPartition:%s is diff with writer parallelism:%s.", - numPartitions, this.downStreamOperatorParallelism)); - Integer factorIndex = null; - if (distributionHashMode.isSupportPrimaryKey()) { - long pkHashCode = helper.hashKeyValue(row); - factorIndex = (int) (pkHashCode % this.factor); - } - // shuffle by mixed-format tree node and partition for partitioned table - Integer partitionHashCode = null; - if (distributionHashMode.isSupportPartition()) { - partitionHashCode = helper.hashPartitionValue(row); - } - if (factorIndex != null && partitionHashCode != null) { - return Math.abs(Objects.hash(factorIndex, partitionHashCode)) % numPartitions; - } else if (factorIndex != null) { - return factorIndex % numPartitions; - } else if (partitionHashCode != null) { - return partitionHashCode % numPartitions; - } else { - return 0; - } - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleHelper.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleHelper.java deleted file mode 100644 index 5edcabc131..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleHelper.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.shuffle; - -import static org.apache.iceberg.IcebergSchemaUtil.projectPartition; - -import org.apache.amoro.data.PrimaryKeyData; -import org.apache.amoro.table.KeyedTable; -import org.apache.amoro.table.MixedTable; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.util.CollectionUtil; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.types.Types; - -import java.io.Serializable; - -/** This helper operates to one mixed-format table and the data of the table. 
*/ -public class ShuffleHelper implements Serializable { - private static final long serialVersionUID = 1L; - - private boolean primaryKeyExist = false; - private PrimaryKeyData primaryKeyData; - private PartitionKey partitionKey; - private RowType rowType; - private Types.StructType struct; - private transient RowDataWrapper rowDataWrapper; - - public static ShuffleHelper EMPTY = new ShuffleHelper(); - - public static ShuffleHelper build(MixedTable table, Schema schema, RowType rowType) { - PartitionKey partitionKey = null; - - if (table.spec() != null && !CollectionUtil.isNullOrEmpty(table.spec().fields())) { - partitionKey = new PartitionKey(projectPartition(table.spec(), schema), schema); - } - schema = addFieldsNotInMixedFormat(schema, rowType); - - if (table.isUnkeyedTable()) { - return new ShuffleHelper(rowType, schema.asStruct(), partitionKey); - } - - KeyedTable keyedTable = table.asKeyedTable(); - PrimaryKeyData primaryKeyData = new PrimaryKeyData(keyedTable.primaryKeySpec(), schema); - return new ShuffleHelper( - keyedTable.primaryKeySpec().primaryKeyExisted(), - primaryKeyData, - partitionKey, - rowType, - schema.asStruct()); - } - - /** - * If using mixed-format table as build table, there will be an additional implicit field, valuing - * process time. - * - * @param schema The physical schema in mixed-format table. - * @param rowType Flink RowData type. - * @return the mixed-format Schema with additional implicit field. 
- */ - public static Schema addFieldsNotInMixedFormat(Schema schema, RowType rowType) { - Types.NestedField[] nestedFields = new Types.NestedField[rowType.getFieldCount()]; - - for (int i = 0; i < nestedFields.length; i++) { - RowType.RowField field = rowType.getFields().get(i); - Types.NestedField nestedField; - if ((nestedField = schema.findField(field.getName())) != null) { - nestedFields[i] = nestedField; - } else { - // for now, there is only one case that virtual watermark exist in RowData, but not in - // mixed-format table schema. - nestedFields[i] = - Types.NestedField.optional(-1, field.getName(), Types.TimestampType.withoutZone()); - } - } - return new Schema(nestedFields); - } - - /** Should open firstly to initial RowDataWrapper, because it cannot be serialized. */ - public void open() { - if (rowDataWrapper != null) { - return; - } - if (rowType != null && struct != null) { - rowDataWrapper = new RowDataWrapper(rowType, struct); - } - } - - public ShuffleHelper() {} - - public ShuffleHelper(RowType rowType, Types.StructType structType, PartitionKey partitionKey) { - this(false, null, partitionKey, rowType, structType); - } - - public ShuffleHelper( - boolean primaryKeyExist, - PrimaryKeyData primaryKeyData, - PartitionKey partitionKey, - RowType rowType, - Types.StructType structType) { - this(primaryKeyExist, primaryKeyData, null, partitionKey, rowType, structType); - } - - public ShuffleHelper( - boolean primaryKeyExist, - PrimaryKeyData primaryKeyData, - RowDataWrapper rowDataWrapper, - PartitionKey partitionKey, - RowType rowType, - Types.StructType structType) { - this.primaryKeyExist = primaryKeyExist; - this.primaryKeyData = primaryKeyData; - this.rowDataWrapper = rowDataWrapper; - this.partitionKey = partitionKey; - this.rowType = rowType; - this.struct = structType; - } - - public boolean isPrimaryKeyExist() { - return primaryKeyExist; - } - - public boolean isPartitionKeyExist() { - return partitionKey != null && partitionKey.size() > 0; - } 
- - public int hashPartitionValue(RowData rowData) { - partitionKey.partition(rowDataWrapper.wrap(rowData)); - int hashcode = Math.abs(partitionKey.hashCode()); - return hashcode == Integer.MIN_VALUE ? Integer.MAX_VALUE : hashcode; - } - - public int hashKeyValue(RowData rowData) { - primaryKeyData.primaryKey(rowDataWrapper.wrap(rowData)); - return primaryKeyData.hashCode(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleKey.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleKey.java deleted file mode 100644 index dd34b40615..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleKey.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.shuffle; - -import org.apache.flink.table.data.RowData; - -public class ShuffleKey { - private final RowData row; - - public ShuffleKey(RowData row) { - this.row = row; - } - - public RowData getRow() { - return row; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleRulePolicy.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleRulePolicy.java deleted file mode 100644 index 4264778ebe..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/shuffle/ShuffleRulePolicy.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.shuffle; - -import org.apache.amoro.data.DataTreeNode; -import org.apache.amoro.table.DistributionHashMode; -import org.apache.flink.api.common.functions.Partitioner; -import org.apache.flink.api.java.functions.KeySelector; - -import java.io.Serializable; -import java.util.Map; -import java.util.Set; - -/** Policy for shuffle a streamRecord by primary keys */ -public interface ShuffleRulePolicy extends Serializable { - - /** - * Generate KeySelector - * - * @return - */ - KeySelector generateKeySelector(); - - /** - * Generate partitioner - * - * @return - */ - Partitioner generatePartitioner(); - - /** - * Get shuffle type. - * - * @return ShufflePolicyType - */ - DistributionHashMode getPolicyType(); - - /** - * Get factor sequence and writer subtask id mapping relationship Key:subtask id Value:treeNodes - * - * @return - */ - default Map> getSubtaskTreeNodes() { - return null; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/FlinkSource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/FlinkSource.java deleted file mode 100644 index 3e4080e8a8..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/FlinkSource.java +++ /dev/null @@ -1,316 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.table; - -import org.apache.amoro.flink.interceptor.ProxyFactory; -import org.apache.amoro.flink.read.MixedFormatSource; -import org.apache.amoro.flink.read.hybrid.reader.RowDataReaderFunction; -import org.apache.amoro.flink.read.source.MixedFormatScanContext; -import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.amoro.flink.util.CompatibleFlinkPropertyUtil; -import org.apache.amoro.flink.util.IcebergClassUtil; -import org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.flink.util.ProxyUtil; -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.amoro.table.MixedTable; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.io.InputFormat; -import org.apache.flink.api.dag.Transformation; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamSource; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.functions.source.InputFormatSourceFunction; -import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.StreamSource; 
-import org.apache.flink.streaming.api.transformations.LegacySourceTransformation; -import org.apache.flink.streaming.api.transformations.OneInputTransformation; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.connector.ProviderContext; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.source.FlinkInputFormat; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Optional; - -/** An util class create mixed-format source data stream. */ -public class FlinkSource { - private FlinkSource() {} - - public static Builder forRowData() { - return new Builder(); - } - - public static final class Builder { - - private static final Logger LOG = LoggerFactory.getLogger(Builder.class); - private static final String MIXED_FORMAT_FILE_TRANSFORMATION = "mixed-format-file"; - private ProviderContext context; - private StreamExecutionEnvironment env; - private MixedTable mixedTable; - private MixedFormatTableLoader tableLoader; - private TableSchema projectedSchema; - private List filters; - private ReadableConfig flinkConf = new Configuration(); - private final Map properties = new HashMap<>(); - private long limit = -1L; - private WatermarkStrategy watermarkStrategy = WatermarkStrategy.noWatermarks(); - private final MixedFormatScanContext.Builder contextBuilder = - MixedFormatScanContext.contextBuilder(); - private boolean batchMode = false; - - private Builder() {} - - public Builder context(ProviderContext context) { - this.context = context; - return this; - } - - public Builder env(StreamExecutionEnvironment env) { - this.env = env; - 
return this; - } - - public Builder mixedFormatTable(MixedTable mixedTable) { - this.mixedTable = mixedTable; - properties.putAll(mixedTable.properties()); - return this; - } - - public Builder tableLoader(MixedFormatTableLoader tableLoader) { - this.tableLoader = tableLoader; - return this; - } - - public Builder project(TableSchema tableSchema) { - this.projectedSchema = tableSchema; - return this; - } - - public Builder limit(long limit) { - this.limit = limit; - contextBuilder.limit(limit); - return this; - } - - public Builder filters(List filters) { - this.filters = filters; - contextBuilder.filters(filters); - return this; - } - - public Builder flinkConf(ReadableConfig flinkConf) { - this.flinkConf = flinkConf; - return this; - } - - public Builder properties(Map properties) { - this.properties.putAll(properties); - return this; - } - - public Builder watermarkStrategy(WatermarkStrategy watermarkStrategy) { - if (watermarkStrategy != null) { - this.watermarkStrategy = watermarkStrategy; - } - return this; - } - - public Builder batchMode(boolean batchMode) { - this.batchMode = batchMode; - return this; - } - - public DataStream build() { - Preconditions.checkNotNull(env, "StreamExecutionEnvironment should not be null"); - loadTableIfNeeded(); - - if (mixedTable.isUnkeyedTable()) { - String scanStartupMode = properties.get(MixedFormatValidator.SCAN_STARTUP_MODE.key()); - return buildUnkeyedTableSource(scanStartupMode); - } - - boolean dimTable = - CompatibleFlinkPropertyUtil.propertyAsBoolean( - properties, - MixedFormatValidator.DIM_TABLE_ENABLE.key(), - MixedFormatValidator.DIM_TABLE_ENABLE.defaultValue()); - RowType rowType; - - if (projectedSchema == null) { - contextBuilder.project(mixedTable.schema()); - rowType = FlinkSchemaUtil.convert(mixedTable.schema()); - } else { - contextBuilder.project( - FlinkSchemaUtil.convert( - mixedTable.schema(), - org.apache.amoro.flink.FlinkSchemaUtil.filterWatermark(projectedSchema))); - // If dim table is enabled, we 
reserve a RowTime field in Emitter. - if (dimTable) { - rowType = org.apache.amoro.flink.FlinkSchemaUtil.toRowType(projectedSchema); - } else { - rowType = - org.apache.amoro.flink.FlinkSchemaUtil.toRowType( - org.apache.amoro.flink.FlinkSchemaUtil.filterWatermark(projectedSchema)); - } - } - MixedFormatScanContext scanContext = - contextBuilder.fromProperties(properties).batchMode(batchMode).build(); - - RowDataReaderFunction rowDataReaderFunction = - new RowDataReaderFunction( - flinkConf, - mixedTable.schema(), - scanContext.project(), - mixedTable.asKeyedTable().primaryKeySpec(), - scanContext.nameMapping(), - scanContext.caseSensitive(), - mixedTable.io()); - - int scanParallelism = - flinkConf.getOptional(MixedFormatValidator.SCAN_PARALLELISM).orElse(env.getParallelism()); - DataStreamSource sourceStream = - env.fromSource( - new MixedFormatSource<>( - tableLoader, - scanContext, - rowDataReaderFunction, - InternalTypeInfo.of(rowType), - mixedTable.name(), - dimTable), - watermarkStrategy, - MixedFormatSource.class.getName()) - .setParallelism(scanParallelism); - context.generateUid(MIXED_FORMAT_FILE_TRANSFORMATION).ifPresent(sourceStream::uid); - return sourceStream; - } - - private void loadTableIfNeeded() { - if (tableLoader == null || mixedTable != null) { - return; - } - mixedTable = MixedFormatUtils.loadMixedTable(tableLoader); - properties.putAll(mixedTable.properties()); - } - - public DataStream buildUnkeyedTableSource(String scanStartupMode) { - scanStartupMode = scanStartupMode == null ? 
null : scanStartupMode.toLowerCase(); - Preconditions.checkArgument( - Objects.isNull(scanStartupMode) - || Objects.equals(scanStartupMode, MixedFormatValidator.SCAN_STARTUP_MODE_EARLIEST) - || Objects.equals(scanStartupMode, MixedFormatValidator.SCAN_STARTUP_MODE_LATEST), - String.format( - "only support %s, %s when %s is %s", - MixedFormatValidator.SCAN_STARTUP_MODE_EARLIEST, - MixedFormatValidator.SCAN_STARTUP_MODE_LATEST, - MixedFormatValidator.MIXED_FORMAT_READ_MODE, - MixedFormatValidator.MIXED_FORMAT_READ_FILE)); - org.apache.iceberg.flink.source.FlinkSource.Builder builder = - org.apache.iceberg.flink.source.FlinkSource.forRowData() - .env(env) - .project(org.apache.amoro.flink.FlinkSchemaUtil.filterWatermark(projectedSchema)) - .tableLoader(tableLoader) - .filters(filters) - .properties(properties) - .flinkConf(flinkConf) - .limit(limit); - if (MixedFormatValidator.SCAN_STARTUP_MODE_LATEST.equalsIgnoreCase(scanStartupMode)) { - Optional startSnapshotOptional = - Optional.ofNullable(tableLoader.loadTable().currentSnapshot()); - if (startSnapshotOptional.isPresent()) { - Snapshot snapshot = startSnapshotOptional.get(); - LOG.info( - "Get starting snapshot id {} based on scan startup mode {}", - snapshot.snapshotId(), - scanStartupMode); - builder.startSnapshotId(snapshot.snapshotId()); - } - } - DataStream origin = builder.build(); - return wrapKrb(origin).assignTimestampsAndWatermarks(watermarkStrategy); - } - - /** extract op from dataStream, and wrap krb support */ - private DataStream wrapKrb(DataStream ds) { - IcebergClassUtil.clean(env); - Transformation origin = ds.getTransformation(); - int scanParallelism = - flinkConf - .getOptional(MixedFormatValidator.SCAN_PARALLELISM) - .orElse(origin.getParallelism()); - - if (origin instanceof OneInputTransformation) { - OneInputTransformation tf = - (OneInputTransformation) ds.getTransformation(); - OneInputStreamOperatorFactory op = (OneInputStreamOperatorFactory) tf.getOperatorFactory(); - ProxyFactory 
inputFormatProxyFactory = - IcebergClassUtil.getInputFormatProxyFactory(op, mixedTable.io(), mixedTable.schema()); - - if (tf.getInputs().isEmpty()) { - return env.addSource( - new UnkeyedInputFormatSourceFunction(inputFormatProxyFactory, tf.getOutputType())) - .setParallelism(scanParallelism); - } - - LegacySourceTransformation tfSource = (LegacySourceTransformation) tf.getInputs().get(0); - StreamSource source = tfSource.getOperator(); - SourceFunction function = IcebergClassUtil.getSourceFunction(source); - - SourceFunction functionProxy = - (SourceFunction) ProxyUtil.getProxy(function, mixedTable.io()); - DataStreamSource sourceStream = - env.addSource(functionProxy, tfSource.getName(), tfSource.getOutputType()); - context.generateUid(MIXED_FORMAT_FILE_TRANSFORMATION).ifPresent(sourceStream::uid); - if (sourceStream instanceof ParallelSourceFunction) { - sourceStream.setParallelism(scanParallelism); - } - return sourceStream.transform( - tf.getName(), - tf.getOutputType(), - new UnkeyedInputFormatOperatorFactory(inputFormatProxyFactory)); - } - - LegacySourceTransformation tfSource = (LegacySourceTransformation) origin; - StreamSource source = tfSource.getOperator(); - InputFormatSourceFunction function = - (InputFormatSourceFunction) IcebergClassUtil.getSourceFunction(source); - - InputFormat inputFormatProxy = - (InputFormat) ProxyUtil.getProxy(function.getFormat(), mixedTable.io()); - DataStreamSource sourceStream = - env.createInput(inputFormatProxy, tfSource.getOutputType()) - .setParallelism(scanParallelism); - context.generateUid(MIXED_FORMAT_FILE_TRANSFORMATION).ifPresent(sourceStream::uid); - return sourceStream; - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/LogDynamicSource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/LogDynamicSource.java deleted file mode 100644 index 08316ef1c5..0000000000 
--- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/LogDynamicSource.java +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.table; - -import static org.apache.flink.table.connector.ChangelogMode.insertOnly; - -import org.apache.amoro.flink.read.source.log.kafka.LogKafkaSource; -import org.apache.amoro.flink.read.source.log.kafka.LogKafkaSourceBuilder; -import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.amoro.flink.util.CompatibleFlinkPropertyUtil; -import org.apache.amoro.table.MixedTable; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.connector.source.Boundedness; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.connector.ChangelogMode; -import org.apache.flink.table.connector.source.DataStreamScanProvider; -import org.apache.flink.table.connector.source.DynamicTableSource; -import 
org.apache.flink.table.connector.source.ScanTableSource; -import org.apache.flink.table.connector.source.abilities.SupportsProjectionPushDown; -import org.apache.flink.table.connector.source.abilities.SupportsWatermarkPushDown; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.Preconditions; -import org.apache.iceberg.Schema; -import org.apache.iceberg.types.Types.NestedField; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; - -import java.util.Arrays; -import java.util.List; -import java.util.Optional; -import java.util.Properties; -import java.util.stream.Collectors; - -/** This is a log source table api, create log queue consumer e.g. {@link LogKafkaSource} */ -public class LogDynamicSource - implements ScanTableSource, SupportsWatermarkPushDown, SupportsProjectionPushDown { - - private static final Logger LOG = LoggerFactory.getLogger(LogDynamicSource.class); - - private final MixedTable mixedTable; - private final Schema schema; - private final ReadableConfig tableOptions; - private final Optional consumerChangelogMode; - private final boolean logRetractionEnable; - - /** Watermark strategy that is used to generate per-partition watermark. */ - protected @Nullable WatermarkStrategy watermarkStrategy; - - /** Data type to configure the formats. */ - - /** Indices that determine the value fields and the target position in the produced row. */ - protected int[] projectedFields; - - /** Properties for the logStore consumer. 
*/ - protected final Properties properties; - - private static final ChangelogMode ALL_KINDS = - ChangelogMode.newBuilder() - .addContainedKind(RowKind.INSERT) - .addContainedKind(RowKind.UPDATE_BEFORE) - .addContainedKind(RowKind.UPDATE_AFTER) - .addContainedKind(RowKind.DELETE) - .build(); - - public LogDynamicSource( - Properties properties, Schema schema, ReadableConfig tableOptions, MixedTable mixedTable) { - this.schema = schema; - this.tableOptions = tableOptions; - this.consumerChangelogMode = - tableOptions.getOptional(MixedFormatValidator.MIXED_FORMAT_LOG_CONSUMER_CHANGELOG_MODE); - this.logRetractionEnable = - CompatibleFlinkPropertyUtil.propertyAsBoolean( - mixedTable.properties(), - MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.key(), - MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.defaultValue()); - this.mixedTable = mixedTable; - this.properties = properties; - } - - public LogDynamicSource( - Properties properties, - Schema schema, - ReadableConfig tableOptions, - MixedTable mixedTable, - boolean logRetractionEnable, - Optional consumerChangelogMode) { - this.schema = schema; - this.tableOptions = tableOptions; - this.consumerChangelogMode = consumerChangelogMode; - this.logRetractionEnable = logRetractionEnable; - this.mixedTable = mixedTable; - this.properties = properties; - } - - protected LogKafkaSource createKafkaSource() { - Schema projectedSchema = getProjectSchema(schema); - LOG.info("Schema used for create KafkaSource is: {}", projectedSchema); - - LogKafkaSourceBuilder kafkaSourceBuilder = - LogKafkaSource.builder(projectedSchema, mixedTable.properties()); - kafkaSourceBuilder.setProperties(properties); - - LOG.info("build log kafka source"); - return kafkaSourceBuilder.build(); - } - - @Override - public ChangelogMode getChangelogMode() { - String changeLogMode = - consumerChangelogMode.orElse( - mixedTable.isKeyedTable() - ? 
MixedFormatValidator.LOG_CONSUMER_CHANGELOG_MODE_ALL_KINDS - : MixedFormatValidator.LOG_CONSUMER_CHANGELOG_MODE_APPEND_ONLY); - switch (changeLogMode) { - case MixedFormatValidator.LOG_CONSUMER_CHANGELOG_MODE_APPEND_ONLY: - if (logRetractionEnable) { - throw new IllegalArgumentException( - String.format( - "Only %s is false when %s is %s", - MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.key(), - MixedFormatValidator.MIXED_FORMAT_LOG_CONSUMER_CHANGELOG_MODE.key(), - MixedFormatValidator.LOG_CONSUMER_CHANGELOG_MODE_APPEND_ONLY)); - } - return insertOnly(); - case MixedFormatValidator.LOG_CONSUMER_CHANGELOG_MODE_ALL_KINDS: - return ALL_KINDS; - default: - throw new UnsupportedOperationException( - String.format( - "As of now, %s can't support this option %s.", - MixedFormatValidator.MIXED_FORMAT_LOG_CONSUMER_CHANGELOG_MODE.key(), - consumerChangelogMode)); - } - } - - @Override - public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) { - final LogKafkaSource kafkaSource = createKafkaSource(); - - return new DataStreamScanProvider() { - @Override - public DataStream produceDataStream(StreamExecutionEnvironment execEnv) { - if (watermarkStrategy == null) { - watermarkStrategy = WatermarkStrategy.noWatermarks(); - } - int scanParallelism = - tableOptions - .getOptional(MixedFormatValidator.SCAN_PARALLELISM) - .orElse(execEnv.getParallelism()); - return execEnv - .fromSource(kafkaSource, watermarkStrategy, "LogStoreSource-" + mixedTable.name()) - .setParallelism(scanParallelism); - } - - @Override - public boolean isBounded() { - return kafkaSource.getBoundedness() == Boundedness.BOUNDED; - } - }; - } - - @Override - public DynamicTableSource copy() { - return new LogDynamicSource( - this.properties, - this.schema, - this.tableOptions, - this.mixedTable, - this.logRetractionEnable, - this.consumerChangelogMode); - } - - @Override - public String asSummaryString() { - return "Mixed-format Log: " + mixedTable.name(); - } - - 
@Override - public void applyWatermark(WatermarkStrategy watermarkStrategy) { - this.watermarkStrategy = watermarkStrategy; - } - - @Override - public boolean supportsNestedProjection() { - return false; - } - - @Override - public void applyProjection(int[][] projectFields) { - this.projectedFields = new int[projectFields.length]; - for (int i = 0; i < projectFields.length; i++) { - Preconditions.checkArgument( - projectFields[i].length == 1, "Don't support nested projection now."); - this.projectedFields[i] = projectFields[i][0]; - } - } - - private Schema getProjectSchema(Schema projectedSchema) { - if (projectedFields != null) { - List projectedSchemaColumns = projectedSchema.columns(); - projectedSchema = - new Schema( - Arrays.stream(projectedFields) - .mapToObj(projectedSchemaColumns::get) - .collect(Collectors.toList())); - } - return projectedSchema; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedDynamicTableFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedDynamicTableFactory.java deleted file mode 100644 index d9ca8ea5bd..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedDynamicTableFactory.java +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.table; - -import static org.apache.amoro.table.TableProperties.ENABLE_LOG_STORE; -import static org.apache.amoro.table.TableProperties.ENABLE_LOG_STORE_DEFAULT; -import static org.apache.flink.api.common.RuntimeExecutionMode.BATCH; -import static org.apache.flink.configuration.ExecutionOptions.RUNTIME_MODE; -import static org.apache.flink.streaming.connectors.kafka.table.KafkaConnectorOptions.PROPS_BOOTSTRAP_SERVERS; -import static org.apache.flink.streaming.connectors.kafka.table.KafkaConnectorOptions.PROPS_GROUP_ID; -import static org.apache.flink.streaming.connectors.kafka.table.KafkaConnectorOptions.SCAN_STARTUP_MODE; -import static org.apache.flink.streaming.connectors.kafka.table.KafkaConnectorOptions.SCAN_STARTUP_TIMESTAMP_MILLIS; -import static org.apache.flink.streaming.connectors.kafka.table.KafkaConnectorOptions.SCAN_TOPIC_PARTITION_DISCOVERY; -import static org.apache.flink.streaming.connectors.kafka.table.KafkaConnectorOptions.SINK_PARTITIONER; -import static org.apache.flink.streaming.connectors.kafka.table.KafkaConnectorOptions.TOPIC; - -import org.apache.amoro.flink.InternalCatalogBuilder; -import org.apache.amoro.flink.catalog.MixedCatalog; -import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions; -import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.amoro.flink.util.CompatibleFlinkPropertyUtil; -import org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.TableIdentifier; -import 
org.apache.amoro.utils.CompatiblePropertyUtil; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.connector.kafka.source.KafkaSourceOptions; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.catalog.CatalogTable; -import org.apache.flink.table.catalog.ObjectIdentifier; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.connector.source.DynamicTableSource; -import org.apache.flink.table.connector.source.ScanTableSource; -import org.apache.flink.table.factories.DynamicTableSinkFactory; -import org.apache.flink.table.factories.DynamicTableSourceFactory; -import org.apache.flink.table.factories.FactoryUtil; -import org.apache.flink.table.utils.TableSchemaUtils; -import org.apache.flink.util.Preconditions; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.util.PropertyUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.time.Duration; -import java.util.Collections; -import java.util.HashSet; -import java.util.Map; -import java.util.Optional; -import java.util.Properties; -import java.util.Set; - -/** A factory generates {@link MixedFormatDynamicSource} and {@link MixedFormatDynamicSink} */ -public class MixedDynamicTableFactory - implements DynamicTableSourceFactory, DynamicTableSinkFactory { - private static final Logger LOG = LoggerFactory.getLogger(MixedDynamicTableFactory.class); - public static final String IDENTIFIER = "mixed-format"; - private InternalCatalogBuilder internalCatalogBuilder; - private String internalCatalogName; - - public MixedDynamicTableFactory(MixedCatalog mixedCatalog) { - this.internalCatalogBuilder = mixedCatalog.catalogBuilder(); - this.internalCatalogName = mixedCatalog.amsCatalogName(); - } - - public MixedDynamicTableFactory() {} - - @Override - public 
DynamicTableSource createDynamicTableSource(Context context) { - CatalogTable catalogTable = context.getCatalogTable(); - ObjectIdentifier identifier = context.getObjectIdentifier(); - ObjectPath objectPath; - - FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context); - Configuration options = (Configuration) helper.getOptions(); - - InternalCatalogBuilder actualBuilder = internalCatalogBuilder; - String actualCatalogName = internalCatalogName; - - // It denotes create table by ddl 'connector' option, not through catalog.db.tableName - if (actualBuilder == null || actualCatalogName == null) { - actualCatalogName = options.get(MixedFormatValidator.MIXED_FORMAT_CATALOG); - Preconditions.checkNotNull( - actualCatalogName, - String.format("%s should be set", MixedFormatValidator.MIXED_FORMAT_CATALOG.key())); - String amsUri = options.get(CatalogFactoryOptions.AMS_URI); - actualBuilder = - InternalCatalogBuilder.builder() - .amsUri(amsUri) - .catalogName(actualCatalogName) - .properties(options.toMap()); - } - - if (options.containsKey(MixedFormatValidator.MIXED_FORMAT_DATABASE.key()) - && options.containsKey(MixedFormatValidator.MIXED_FORMAT_TABLE.key())) { - objectPath = - new ObjectPath( - options.get(MixedFormatValidator.MIXED_FORMAT_DATABASE), - options.get(MixedFormatValidator.MIXED_FORMAT_TABLE)); - } else { - objectPath = new ObjectPath(identifier.getDatabaseName(), identifier.getObjectName()); - } - MixedFormatTableLoader tableLoader = - createTableLoader(objectPath, actualCatalogName, actualBuilder, options.toMap()); - MixedTable mixedTable = MixedFormatUtils.loadMixedTable(tableLoader); - - Configuration confWithAll = Configuration.fromMap(mixedTable.properties()); - - ScanTableSource mixedFormatDynamicSource; - - String readMode = - PropertyUtil.propertyAsString( - mixedTable.properties(), - MixedFormatValidator.MIXED_FORMAT_READ_MODE, - MixedFormatValidator.MIXED_READ_MODE_DEFAULT); - - boolean dimTable = - 
CompatibleFlinkPropertyUtil.propertyAsBoolean( - mixedTable.properties(), - MixedFormatValidator.DIM_TABLE_ENABLE.key(), - MixedFormatValidator.DIM_TABLE_ENABLE.defaultValue()); - - TableSchema tableSchema; - if (!dimTable) { - tableSchema = - org.apache.amoro.flink.FlinkSchemaUtil.getPhysicalSchema(catalogTable.getSchema()); - } else { - tableSchema = - org.apache.amoro.flink.FlinkSchemaUtil.getPhysicalSchemaForDimTable( - catalogTable.getSchema()); - } - - switch (readMode) { - case MixedFormatValidator.MIXED_FORMAT_READ_FILE: - boolean batchMode = context.getConfiguration().get(RUNTIME_MODE).equals(BATCH); - LOG.info("Building a file reader in {} runtime mode", batchMode ? "batch" : "streaming"); - mixedFormatDynamicSource = - new MixedFormatFileSource(tableLoader, tableSchema, mixedTable, confWithAll, batchMode); - break; - case MixedFormatValidator.MIXED_FORMAT_READ_LOG: - default: - Preconditions.checkArgument( - CompatiblePropertyUtil.propertyAsBoolean( - mixedTable.properties(), ENABLE_LOG_STORE, ENABLE_LOG_STORE_DEFAULT), - String.format("Read log should enable %s at first", ENABLE_LOG_STORE)); - mixedFormatDynamicSource = createLogSource(mixedTable, context, confWithAll); - } - - return generateDynamicTableSource( - identifier.getObjectName(), mixedFormatDynamicSource, mixedTable, tableLoader); - } - - protected DynamicTableSource generateDynamicTableSource( - String tableName, - ScanTableSource mixedFormatDynamicSource, - MixedTable mixedTable, - MixedFormatTableLoader tableLoader) { - return new MixedFormatDynamicSource( - tableName, mixedFormatDynamicSource, mixedTable, mixedTable.properties(), tableLoader); - } - - @Override - public MixedFormatDynamicSink createDynamicTableSink(Context context) { - CatalogTable catalogTable = context.getCatalogTable(); - - ObjectIdentifier identifier = context.getObjectIdentifier(); - Map options = catalogTable.getOptions(); - - MixedFormatTableLoader tableLoader = - createTableLoader( - new 
ObjectPath(identifier.getDatabaseName(), identifier.getObjectName()), - internalCatalogName, - internalCatalogBuilder, - options); - - MixedTable table = MixedFormatUtils.loadMixedTable(tableLoader); - return new MixedFormatDynamicSink(catalogTable, tableLoader, table.isKeyedTable()); - } - - private static MixedFormatTableLoader createTableLoader( - ObjectPath tablePath, - String internalCatalogName, - InternalCatalogBuilder catalogBuilder, - Map flinkTableProperties) { - TableIdentifier identifier = - TableIdentifier.of( - internalCatalogName, tablePath.getDatabaseName(), tablePath.getObjectName()); - - return MixedFormatTableLoader.of(identifier, catalogBuilder, flinkTableProperties); - } - - @Override - public String factoryIdentifier() { - return IDENTIFIER; - } - - @Override - public Set> requiredOptions() { - return Collections.emptySet(); - } - - @Override - public Set> optionalOptions() { - final Set> options = new HashSet<>(); - options.add(TOPIC); - options.add(PROPS_BOOTSTRAP_SERVERS); - options.add(PROPS_GROUP_ID); - options.add(SCAN_STARTUP_MODE); - options.add(SCAN_STARTUP_TIMESTAMP_MILLIS); - options.add(SINK_PARTITIONER); - options.add(MixedFormatValidator.MIXED_FORMAT_CATALOG); - options.add(MixedFormatValidator.MIXED_FORMAT_TABLE); - options.add(MixedFormatValidator.MIXED_FORMAT_DATABASE); - options.add(MixedFormatValidator.DIM_TABLE_ENABLE); - options.add(CatalogFactoryOptions.AMS_URI); - - // lookup - options.add(MixedFormatValidator.LOOKUP_CACHE_MAX_ROWS); - options.add(MixedFormatValidator.LOOKUP_RELOADING_INTERVAL); - options.add(MixedFormatValidator.LOOKUP_CACHE_TTL_AFTER_WRITE); - - options.add(MixedFormatValidator.ROCKSDB_AUTO_COMPACTIONS); - options.add(MixedFormatValidator.ROCKSDB_WRITING_THREADS); - options.add(MixedFormatValidator.ROCKSDB_BLOCK_CACHE_CAPACITY); - options.add(MixedFormatValidator.ROCKSDB_BLOCK_CACHE_NUM_SHARD_BITS); - return options; - } - - private ScanTableSource createLogSource( - MixedTable mixedTable, Context 
context, ReadableConfig tableOptions) { - CatalogTable catalogTable = context.getCatalogTable(); - TableSchema physicalSchema = TableSchemaUtils.getPhysicalSchema(catalogTable.getSchema()); - Schema schema = FlinkSchemaUtil.convert(physicalSchema); - - final Properties properties = OptionsUtil.getKafkaProperties(mixedTable.properties()); - - // add topic-partition discovery - final Optional partitionDiscoveryInterval = - tableOptions.getOptional(SCAN_TOPIC_PARTITION_DISCOVERY).map(Duration::toMillis); - properties.setProperty( - KafkaSourceOptions.PARTITION_DISCOVERY_INTERVAL_MS.key(), - partitionDiscoveryInterval.orElse(-1L).toString()); - - LOG.info("build log source"); - return new LogDynamicSource(properties, schema, tableOptions, mixedTable); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatDynamicSink.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatDynamicSink.java deleted file mode 100644 index 5de853ec21..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatDynamicSink.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.table; - -import org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.flink.write.FlinkSink; -import org.apache.amoro.table.MixedTable; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamSink; -import org.apache.flink.table.catalog.CatalogTable; -import org.apache.flink.table.connector.ChangelogMode; -import org.apache.flink.table.connector.ProviderContext; -import org.apache.flink.table.connector.sink.DataStreamSinkProvider; -import org.apache.flink.table.connector.sink.DynamicTableSink; -import org.apache.flink.table.connector.sink.abilities.SupportsOverwrite; -import org.apache.flink.table.connector.sink.abilities.SupportsPartitioning; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.RowKind; -import org.apache.hadoop.security.UserGroupInformation; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Map; - -/** Flink table api that generates sink operators. 
*/ -public class MixedFormatDynamicSink - implements DynamicTableSink, SupportsPartitioning, SupportsOverwrite { - - public static final Logger LOG = LoggerFactory.getLogger(MixedFormatDynamicSink.class); - - private final MixedFormatTableLoader tableLoader; - private final CatalogTable flinkTable; - private final boolean primaryKeyExisted; - private boolean overwrite = false; - - MixedFormatDynamicSink( - CatalogTable flinkTable, MixedFormatTableLoader tableLoader, boolean primaryKeyExisted) { - this.tableLoader = tableLoader; - this.flinkTable = flinkTable; - this.primaryKeyExisted = primaryKeyExisted; - } - - @Override - public ChangelogMode getChangelogMode(ChangelogMode changelogMode) { - ChangelogMode.Builder builder = ChangelogMode.newBuilder().addContainedKind(RowKind.INSERT); - if (primaryKeyExisted) { - builder - .addContainedKind(RowKind.UPDATE_BEFORE) - .addContainedKind(RowKind.UPDATE_AFTER) - .addContainedKind(RowKind.DELETE); - } - return builder.build(); - } - - @Override - public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { - MixedTable table = MixedFormatUtils.loadMixedTable(tableLoader); - - return new DataStreamSinkProvider() { - @Override - public DataStreamSink consumeDataStream( - ProviderContext providerContext, DataStream dataStream) { - DataStreamSink ds = - FlinkSink.forRowData(dataStream) - .context(providerContext) - .table(table) - .flinkSchema(flinkTable.getSchema()) - .tableLoader(tableLoader) - .overwrite(overwrite) - .build(); - UserGroupInformation.reset(); - LOG.info("ugi reset"); - return ds; - } - }; - } - - @Override - public DynamicTableSink copy() { - return this; - } - - @Override - public String asSummaryString() { - return "mixed-format"; - } - - @Override - public void applyStaticPartition(Map map) { - // ignore - } - - @Override - public void applyOverwrite(boolean newOverwrite) { - this.overwrite = newOverwrite; - } -} diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatDynamicSource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatDynamicSource.java deleted file mode 100644 index 2588a8b789..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatDynamicSource.java +++ /dev/null @@ -1,384 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.table; - -import org.apache.amoro.flink.lookup.KVTableFactory; -import org.apache.amoro.flink.lookup.MixedFormatRowDataLookupFunction; -import org.apache.amoro.flink.lookup.filter.RowDataPredicate; -import org.apache.amoro.flink.lookup.filter.RowDataPredicateExpressionVisitor; -import org.apache.amoro.flink.read.hybrid.reader.DataIteratorReaderFunction; -import org.apache.amoro.flink.read.hybrid.reader.RowDataReaderFunction; -import org.apache.amoro.flink.read.source.FlinkKeyedMORDataReader; -import org.apache.amoro.flink.util.FilterUtil; -import org.apache.amoro.flink.util.IcebergAndFlinkFilters; -import org.apache.amoro.hive.io.reader.AbstractAdaptHiveKeyedDataReader; -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.utils.SchemaUtil; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.connector.ChangelogMode; -import org.apache.flink.table.connector.source.DataStreamScanProvider; -import org.apache.flink.table.connector.source.DynamicTableSource; -import org.apache.flink.table.connector.source.LookupTableSource; -import org.apache.flink.table.connector.source.ScanTableSource; -import org.apache.flink.table.connector.source.abilities.SupportsFilterPushDown; -import org.apache.flink.table.connector.source.abilities.SupportsLimitPushDown; -import org.apache.flink.table.connector.source.abilities.SupportsProjectionPushDown; -import org.apache.flink.table.connector.source.abilities.SupportsWatermarkPushDown; -import org.apache.flink.table.connector.source.lookup.LookupFunctionProvider; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.expressions.CallExpression; -import org.apache.flink.table.expressions.ResolvedExpression; -import 
org.apache.flink.table.functions.BuiltInFunctionDefinitions; -import org.apache.flink.table.functions.FunctionIdentifier; -import org.apache.flink.table.functions.LookupFunction; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.iceberg.Schema; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.data.RowDataUtil; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; - -import java.io.Serializable; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.function.BiFunction; -import java.util.stream.Collectors; - -/** Flink table api that generates source operators. */ -public class MixedFormatDynamicSource - implements ScanTableSource, - SupportsFilterPushDown, - SupportsProjectionPushDown, - SupportsLimitPushDown, - SupportsWatermarkPushDown, - LookupTableSource { - - private static final Logger LOG = LoggerFactory.getLogger(MixedFormatDynamicSource.class); - - protected final String tableName; - - protected final ScanTableSource mixedFormatDynamicSource; - protected final MixedTable mixedTable; - protected final Map properties; - - protected int[] projectFields; - protected List filters; - protected ResolvedExpression flinkExpression; - protected final MixedFormatTableLoader tableLoader; - - @Nullable protected WatermarkStrategy watermarkStrategy; - - /** - * @param tableName tableName - * @param mixedFormatDynamicSource underlying source - * @param mixedTable mixedTable - * @param properties With all mixed-format table properties and sql options - * @param tableLoader - */ - public MixedFormatDynamicSource( - String tableName, - ScanTableSource mixedFormatDynamicSource, 
- MixedTable mixedTable, - Map properties, - MixedFormatTableLoader tableLoader) { - this.tableName = tableName; - this.mixedFormatDynamicSource = mixedFormatDynamicSource; - this.mixedTable = mixedTable; - this.properties = properties; - this.tableLoader = tableLoader; - } - - public MixedFormatDynamicSource( - String tableName, - ScanTableSource mixedFormatDynamicSource, - MixedTable mixedTable, - Map properties, - MixedFormatTableLoader tableLoader, - int[] projectFields, - List filters, - ResolvedExpression flinkExpression) { - this.tableName = tableName; - this.mixedFormatDynamicSource = mixedFormatDynamicSource; - this.mixedTable = mixedTable; - this.properties = properties; - this.tableLoader = tableLoader; - this.projectFields = projectFields; - this.filters = filters; - this.flinkExpression = flinkExpression; - } - - @Override - public ChangelogMode getChangelogMode() { - return mixedFormatDynamicSource.getChangelogMode(); - } - - @Override - public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) { - ScanRuntimeProvider origin = mixedFormatDynamicSource.getScanRuntimeProvider(scanContext); - Preconditions.checkArgument( - origin instanceof DataStreamScanProvider, - "file or log ScanRuntimeProvider should be DataStreamScanProvider, but provided is " - + origin.getClass()); - return origin; - } - - @Override - public DynamicTableSource copy() { - return new MixedFormatDynamicSource( - tableName, - mixedFormatDynamicSource, - mixedTable, - properties, - tableLoader, - projectFields, - filters, - flinkExpression); - } - - @Override - public String asSummaryString() { - return "Mixed-format Dynamic Source"; - } - - @Override - public Result applyFilters(List filters) { - IcebergAndFlinkFilters icebergAndFlinkFilters = - FilterUtil.convertFlinkExpressToIceberg(filters); - this.filters = icebergAndFlinkFilters.expressions(); - - if (filters.size() == 1) { - flinkExpression = filters.get(0); - } else if (filters.size() >= 2) { - flinkExpression 
= and(filters.get(0), filters.get(1)); - for (int i = 2; i < filters.size(); i++) { - flinkExpression = and(flinkExpression, filters.subList(i, i + 1).get(0)); - } - } - - if (mixedFormatDynamicSource instanceof SupportsFilterPushDown) { - return ((SupportsFilterPushDown) mixedFormatDynamicSource).applyFilters(filters); - } else { - return Result.of(Collections.emptyList(), filters); - } - } - - @Override - public boolean supportsNestedProjection() { - if (mixedFormatDynamicSource instanceof SupportsProjectionPushDown) { - return ((SupportsProjectionPushDown) mixedFormatDynamicSource).supportsNestedProjection(); - } else { - return false; - } - } - - protected CallExpression and(ResolvedExpression left, ResolvedExpression right) { - return CallExpression.permanent( - FunctionIdentifier.of(BuiltInFunctionDefinitions.AND.getName()), - BuiltInFunctionDefinitions.AND, - Arrays.asList(left, right), - DataTypes.BOOLEAN()); - } - - @Override - public void applyProjection(int[][] projectedFields, DataType producedDataType) { - projectFields = new int[projectedFields.length]; - for (int i = 0; i < projectedFields.length; i++) { - Preconditions.checkArgument( - projectedFields[i].length == 1, "Don't support nested projection now."); - projectFields[i] = projectedFields[i][0]; - } - - if (mixedFormatDynamicSource instanceof SupportsProjectionPushDown) { - ((SupportsProjectionPushDown) mixedFormatDynamicSource) - .applyProjection(projectedFields, producedDataType); - } - } - - @Override - public void applyLimit(long newLimit) { - if (mixedFormatDynamicSource instanceof SupportsLimitPushDown) { - ((SupportsLimitPushDown) mixedFormatDynamicSource).applyLimit(newLimit); - } - } - - @Override - public void applyWatermark(WatermarkStrategy watermarkStrategy) { - if (mixedFormatDynamicSource instanceof SupportsWatermarkPushDown) { - ((SupportsWatermarkPushDown) mixedFormatDynamicSource).applyWatermark(watermarkStrategy); - } - } - - @Override - public LookupRuntimeProvider 
getLookupRuntimeProvider(LookupContext context) { - int[] joinKeys = new int[context.getKeys().length]; - for (int i = 0; i < context.getKeys().length; i++) { - Preconditions.checkArgument( - context.getKeys()[i].length == 1, - "Mixed-format lookup join doesn't support the row field as a joining key."); - joinKeys[i] = context.getKeys()[i][0]; - } - - return LookupFunctionProvider.of(getLookupFunction(joinKeys)); - } - - protected LookupFunction getLookupFunction(int[] joinKeys) { - Schema projectedSchema = getProjectedSchema(); - - List joinKeyNames = getJoinKeyNames(joinKeys, projectedSchema); - - Configuration config = new Configuration(); - properties.forEach(config::setString); - - Optional rowDataPredicate = - generatePredicate(projectedSchema, flinkExpression); - - AbstractAdaptHiveKeyedDataReader flinkMORDataReader = - generateMORReader(mixedTable, projectedSchema); - DataIteratorReaderFunction readerFunction = - generateReaderFunction(mixedTable, projectedSchema); - - return new MixedFormatRowDataLookupFunction( - KVTableFactory.INSTANCE, - mixedTable, - joinKeyNames, - projectedSchema, - filters, - tableLoader, - config, - rowDataPredicate.orElse(null), - flinkMORDataReader, - readerFunction); - } - - protected DataIteratorReaderFunction generateReaderFunction( - MixedTable mixedTable, Schema projectedSchema) { - return new RowDataReaderFunction( - new Configuration(), - mixedTable.schema(), - projectedSchema, - mixedTable.asKeyedTable().primaryKeySpec(), - null, - true, - mixedTable.io(), - true); - } - - protected AbstractAdaptHiveKeyedDataReader generateMORReader( - MixedTable mixedTable, Schema projectedSchema) { - BiFunction convertConstant = new ConvertTask(); - - return new FlinkKeyedMORDataReader( - mixedTable.io(), - mixedTable.schema(), - projectedSchema, - mixedTable.asKeyedTable().primaryKeySpec(), - null, - true, - convertConstant, - true); - } - - static class ConvertTask implements BiFunction, Serializable { - private static final long 
serialVersionUID = 4607513893568225789L; - - @Override - public Object apply(Type t, Object u) { - return RowDataUtil.convertConstant(t, u); - } - } - - protected List getJoinKeyNames(int[] joinKeys, Schema projectedSchema) { - return Arrays.stream(joinKeys) - .mapToObj(index -> projectedSchema.columns().get(index).name()) - .collect(Collectors.toList()); - } - - protected Schema getProjectedSchema() { - Schema mixedFormatTableSchema = mixedTable.schema(); - Schema projectedSchema; - if (projectFields == null) { - LOG.info("The projected fields is null."); - projectedSchema = mixedTable.schema(); - } else { - if (mixedTable.isUnkeyedTable()) { - throw new UnsupportedOperationException("Unkeyed table doesn't support lookup join."); - } - List primaryKeys = mixedTable.asKeyedTable().primaryKeySpec().fieldNames(); - List projectFieldList = - Arrays.stream(projectFields).boxed().collect(Collectors.toList()); - List columns = mixedFormatTableSchema.columns(); - for (int i = 0; i < mixedFormatTableSchema.columns().size(); i++) { - if (primaryKeys.contains(columns.get(i).name()) && !projectFieldList.contains(i)) { - projectFieldList.add(i); - LOG.info( - "Add identifier field {} to projected schema, due to this field is mismatched.", - columns.get(i).name()); - } - } - - List projectedFieldNames = - projectFieldList.stream() - .map(index -> columns.get(index).name()) - .collect(Collectors.toList()); - projectedSchema = SchemaUtil.selectInOrder(mixedFormatTableSchema, projectedFieldNames); - LOG.info("The projected schema {}.\n table schema {}.", projectedSchema, mixedTable.schema()); - } - return projectedSchema; - } - - protected Optional generatePredicate( - final Schema projectedSchema, final ResolvedExpression flinkExpression) { - if (flinkExpression == null) { - return Optional.empty(); - } - - final Map fieldIndexMap = new HashMap<>(); - final Map fieldDataTypeMap = new HashMap<>(); - List fields = projectedSchema.asStruct().fields(); - for (int i = 0; i < 
fields.size(); i++) { - Types.NestedField field = fields.get(i); - fieldIndexMap.put(field.name(), i); - fieldDataTypeMap.put( - field.name(), - TypeConversions.fromLogicalToDataType(FlinkSchemaUtil.convert(field.type()))); - } - - RowDataPredicateExpressionVisitor visitor = - generateExpressionVisitor(fieldIndexMap, fieldDataTypeMap); - return flinkExpression.accept(visitor); - } - - protected RowDataPredicateExpressionVisitor generateExpressionVisitor( - Map fieldIndexMap, Map fieldDataTypeMap) { - return new RowDataPredicateExpressionVisitor(fieldIndexMap, fieldDataTypeMap); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatFileSource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatFileSource.java deleted file mode 100644 index a408e387bd..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatFileSource.java +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.table; - -import static org.apache.flink.api.common.RuntimeExecutionMode.BATCH; -import static org.apache.flink.configuration.ExecutionOptions.RUNTIME_MODE; - -import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.amoro.flink.util.CompatibleFlinkPropertyUtil; -import org.apache.amoro.flink.util.FilterUtil; -import org.apache.amoro.flink.util.IcebergAndFlinkFilters; -import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableList; -import org.apache.amoro.table.MixedTable; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.connector.ChangelogMode; -import org.apache.flink.table.connector.ProviderContext; -import org.apache.flink.table.connector.source.DataStreamScanProvider; -import org.apache.flink.table.connector.source.DynamicTableSource; -import org.apache.flink.table.connector.source.ScanTableSource; -import org.apache.flink.table.connector.source.abilities.SupportsFilterPushDown; -import org.apache.flink.table.connector.source.abilities.SupportsLimitPushDown; -import org.apache.flink.table.connector.source.abilities.SupportsProjectionPushDown; -import org.apache.flink.table.connector.source.abilities.SupportsWatermarkPushDown; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.flink.table.types.DataType; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.Preconditions; -import org.apache.iceberg.expressions.Expression; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; - -import java.util.Arrays; 
-import java.util.List; - -/** Flink table api that generates mixed-format base/change file source operators. */ -public class MixedFormatFileSource - implements ScanTableSource, - SupportsFilterPushDown, - SupportsProjectionPushDown, - SupportsLimitPushDown, - SupportsWatermarkPushDown { - - private static final Logger LOG = LoggerFactory.getLogger(MixedFormatFileSource.class); - - private int[] projectedFields; - private long limit; - private List filters; - private final MixedTable table; - @Nullable protected WatermarkStrategy watermarkStrategy; - - private final MixedFormatTableLoader loader; - private final TableSchema tableSchema; - private final ReadableConfig readableConfig; - private final boolean batchMode; - - private MixedFormatFileSource(MixedFormatFileSource toCopy) { - this.loader = toCopy.loader; - this.tableSchema = toCopy.tableSchema; - this.projectedFields = toCopy.projectedFields; - this.limit = toCopy.limit; - this.filters = toCopy.filters; - this.readableConfig = toCopy.readableConfig; - this.table = toCopy.table; - this.watermarkStrategy = toCopy.watermarkStrategy; - this.batchMode = toCopy.batchMode; - } - - public MixedFormatFileSource( - MixedFormatTableLoader loader, - TableSchema tableSchema, - int[] projectedFields, - MixedTable table, - long limit, - List filters, - ReadableConfig readableConfig, - boolean batchMode) { - this.loader = loader; - this.tableSchema = tableSchema; - this.projectedFields = projectedFields; - this.limit = limit; - this.table = table; - this.filters = filters; - this.readableConfig = readableConfig; - this.batchMode = batchMode; - } - - public MixedFormatFileSource( - MixedFormatTableLoader loader, - TableSchema tableSchema, - MixedTable table, - ReadableConfig readableConfig, - boolean batchMode) { - this(loader, tableSchema, null, table, -1, ImmutableList.of(), readableConfig, batchMode); - } - - @Override - public void applyProjection(int[][] projectFields) { - this.projectedFields = new 
int[projectFields.length]; - for (int i = 0; i < projectFields.length; i++) { - Preconditions.checkArgument( - projectFields[i].length == 1, "Don't support nested projection now."); - this.projectedFields[i] = projectFields[i][0]; - } - } - - private DataStream createDataStream( - ProviderContext providerContext, StreamExecutionEnvironment execEnv) { - return FlinkSource.forRowData() - .context(providerContext) - .env(execEnv) - .tableLoader(loader) - .mixedFormatTable(table) - .project(getProjectedSchema()) - .limit(limit) - .filters(filters) - .flinkConf(readableConfig) - .batchMode(execEnv.getConfiguration().get(RUNTIME_MODE).equals(BATCH)) - .watermarkStrategy(watermarkStrategy) - .build(); - } - - private TableSchema getProjectedSchema() { - if (projectedFields == null) { - return tableSchema; - } else { - String[] fullNames = tableSchema.getFieldNames(); - DataType[] fullTypes = tableSchema.getFieldDataTypes(); - - String[] projectedColumns = - Arrays.stream(projectedFields).mapToObj(i -> fullNames[i]).toArray(String[]::new); - TableSchema.Builder builder = - TableSchema.builder() - .fields( - projectedColumns, - Arrays.stream(projectedFields) - .mapToObj(i -> fullTypes[i]) - .toArray(DataType[]::new)); - boolean dimTable = - CompatibleFlinkPropertyUtil.propertyAsBoolean( - table.properties(), - MixedFormatValidator.DIM_TABLE_ENABLE.key(), - MixedFormatValidator.DIM_TABLE_ENABLE.defaultValue()); - if (dimTable) { - builder.watermark(tableSchema.getWatermarkSpecs().get(0)); - } - - TableSchema ts = builder.build(); - LOG.info("TableSchema after projection:{}", ts); - return ts; - } - } - - @Override - public void applyLimit(long newLimit) { - this.limit = newLimit; - } - - @Override - public Result applyFilters(List flinkFilters) { - IcebergAndFlinkFilters icebergAndFlinkFilters = - FilterUtil.convertFlinkExpressToIceberg(flinkFilters); - this.filters = icebergAndFlinkFilters.expressions(); - return Result.of(icebergAndFlinkFilters.acceptedFilters(), 
flinkFilters); - } - - @Override - public boolean supportsNestedProjection() { - // TODO: support nested projection - return false; - } - - @Override - public ChangelogMode getChangelogMode() { - if (table.isUnkeyedTable() || batchMode) { - return ChangelogMode.insertOnly(); - } - return ChangelogMode.newBuilder() - .addContainedKind(RowKind.DELETE) - .addContainedKind(RowKind.INSERT) - .addContainedKind(RowKind.UPDATE_AFTER) - .addContainedKind(RowKind.UPDATE_BEFORE) - .build(); - } - - @Override - public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderContext) { - return new DataStreamScanProvider() { - @Override - public DataStream produceDataStream( - ProviderContext providerContext, StreamExecutionEnvironment execEnv) { - return createDataStream(providerContext, execEnv); - } - - @Override - public boolean isBounded() { - return org.apache.iceberg.flink.source.FlinkSource.isBounded(table.properties()); - } - }; - } - - @Override - public DynamicTableSource copy() { - return new MixedFormatFileSource(this); - } - - @Override - public String asSummaryString() { - return "Mixed-Format File Source"; - } - - @Override - public void applyWatermark(WatermarkStrategy watermarkStrategy) { - Configuration conf = Configuration.fromMap(table.properties()); - boolean dimTable = - CompatibleFlinkPropertyUtil.propertyAsBoolean(conf, MixedFormatValidator.DIM_TABLE_ENABLE); - if (!dimTable) { - this.watermarkStrategy = watermarkStrategy; - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatTableLoader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatTableLoader.java deleted file mode 100644 index d7282739fb..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/MixedFormatTableLoader.java +++ /dev/null @@ -1,152 
+0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.table; - -import org.apache.amoro.flink.InternalCatalogBuilder; -import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions; -import org.apache.amoro.flink.interceptor.FlinkTablePropertiesInvocationHandler; -import org.apache.amoro.mixed.MixedFormatCatalog; -import org.apache.amoro.shade.guava32.com.google.common.base.MoreObjects; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.TableIdentifier; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.TableLoader; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -/** load a proxy table contains both mixed-format table properties and flink table properties */ -public class MixedFormatTableLoader implements TableLoader { - - private static final long serialVersionUID = 1L; - - protected final InternalCatalogBuilder catalogBuilder; - protected final TableIdentifier tableIdentifier; - protected final Map flinkTableProperties; - /** - * The mark of loading internal table, base or change table. For compatible with iceberg - * committer. 
- */ - protected boolean loadBaseForKeyedTable; - - protected transient MixedFormatCatalog mixedFormatCatalog; - - public static MixedFormatTableLoader of( - TableIdentifier tableIdentifier, InternalCatalogBuilder catalogBuilder) { - return of(tableIdentifier, catalogBuilder, new HashMap<>()); - } - - public static MixedFormatTableLoader of( - TableIdentifier tableIdentifier, - InternalCatalogBuilder catalogBuilder, - Map flinkTableProperties) { - return new MixedFormatTableLoader(tableIdentifier, catalogBuilder, flinkTableProperties); - } - - public static MixedFormatTableLoader of( - TableIdentifier tableIdentifier, Map flinkTableProperties) { - String metastoreUri = flinkTableProperties.get(CatalogFactoryOptions.AMS_URI.key()); - return new MixedFormatTableLoader( - tableIdentifier, - InternalCatalogBuilder.builder().amsUri(metastoreUri), - flinkTableProperties); - } - - public static MixedFormatTableLoader of( - TableIdentifier tableIdentifier, - String metastoreUri, - Map flinkTableProperties) { - return new MixedFormatTableLoader( - tableIdentifier, - InternalCatalogBuilder.builder().amsUri(metastoreUri), - flinkTableProperties); - } - - protected MixedFormatTableLoader( - TableIdentifier tableIdentifier, - InternalCatalogBuilder catalogBuilder, - Map flinkTableProperties) { - this(tableIdentifier, catalogBuilder, flinkTableProperties, null); - } - - protected MixedFormatTableLoader( - TableIdentifier tableIdentifier, - InternalCatalogBuilder catalogBuilder, - Map flinkTableProperties, - Boolean loadBaseForKeyedTable) { - this.catalogBuilder = catalogBuilder; - this.tableIdentifier = tableIdentifier; - this.flinkTableProperties = flinkTableProperties; - this.loadBaseForKeyedTable = loadBaseForKeyedTable == null || loadBaseForKeyedTable; - } - - @Override - public void open() { - mixedFormatCatalog = catalogBuilder.build(); - } - - @Override - public boolean isOpen() { - return mixedFormatCatalog != null; - } - - public MixedTable loadMixedFormatTable() { - 
return ((MixedTable) - new FlinkTablePropertiesInvocationHandler( - flinkTableProperties, mixedFormatCatalog.loadTable(tableIdentifier)) - .getProxy()); - } - - public void switchLoadInternalTableForKeyedTable(boolean loadBaseForKeyedTable) { - this.loadBaseForKeyedTable = loadBaseForKeyedTable; - } - - @Override - public Table loadTable() { - MixedTable table = loadMixedFormatTable(); - - if (table.isKeyedTable()) { - if (loadBaseForKeyedTable) { - return table.asKeyedTable().baseTable(); - } else { - return table.asKeyedTable().changeTable(); - } - } - if (!(table instanceof Table)) { - throw new UnsupportedOperationException( - String.format("table type mismatched. It's %s", table.getClass())); - } - return (Table) table; - } - - @Override - public TableLoader clone() { - return new MixedFormatTableLoader( - tableIdentifier, catalogBuilder, flinkTableProperties, loadBaseForKeyedTable); - } - - @Override - public void close() throws IOException {} - - @Override - public String toString() { - return MoreObjects.toStringHelper(this).add("tableIdentifier", tableIdentifier).toString(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/OptionsUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/OptionsUtil.java deleted file mode 100644 index 38ef34666f..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/OptionsUtil.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.table; - -import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; - -import java.util.Map; -import java.util.Properties; - -public class OptionsUtil { - - // Prefix for specific properties. - public static final String PROPERTIES_PREFIX = "properties."; - - public static Properties getKafkaProperties(Map tableOptions) { - final Properties kafkaProperties = new Properties(); - - if (hasProperties(tableOptions)) { - tableOptions.keySet().stream() - .filter(key -> key.startsWith(PROPERTIES_PREFIX)) - .forEach( - key -> { - final String value = tableOptions.get(key); - final String subKey = key.substring((PROPERTIES_PREFIX).length()); - kafkaProperties.put(subKey, value); - }); - } - return kafkaProperties; - } - - public static Map getCatalogProperties(Map options) { - Map catalogProperties = Maps.newHashMap(); - options.forEach( - (key, value) -> { - if (key.startsWith(PROPERTIES_PREFIX)) { - catalogProperties.put(key.substring((PROPERTIES_PREFIX).length()), value); - } else { - catalogProperties.put(key, value); - } - }); - return catalogProperties; - } - - /** Decides if the table options contains table properties that start with prefix 'properties'. 
*/ - private static boolean hasProperties(Map tableOptions) { - return tableOptions.keySet().stream().anyMatch(k -> k.startsWith(PROPERTIES_PREFIX)); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnifiedDynamicTableFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnifiedDynamicTableFactory.java deleted file mode 100644 index 71f30995f9..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnifiedDynamicTableFactory.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.table; - -import org.apache.amoro.TableFormat; -import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions; -import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.amoro.shade.guava32.com.google.common.collect.Sets; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.catalog.AbstractCatalog; -import org.apache.flink.table.catalog.ObjectIdentifier; -import org.apache.flink.table.connector.sink.DynamicTableSink; -import org.apache.flink.table.connector.source.DynamicTableSource; -import org.apache.flink.table.factories.DynamicTableSinkFactory; -import org.apache.flink.table.factories.DynamicTableSourceFactory; -import org.apache.flink.table.factories.Factory; -import org.apache.flink.table.factories.FactoryUtil; -import org.apache.flink.util.Preconditions; - -import java.util.Map; -import java.util.Optional; -import java.util.Set; - -/** - * UnifiedDynamicTableFactory is a factory for creating dynamic table sources and sinks. It - * implements both DynamicTableSourceFactory and DynamicTableSinkFactory interfaces. 
- */ -public class UnifiedDynamicTableFactory - implements DynamicTableSourceFactory, DynamicTableSinkFactory { - - private final Map availableCatalogs; - - public UnifiedDynamicTableFactory(Map availableCatalogs) { - this.availableCatalogs = - Preconditions.checkNotNull(availableCatalogs, "availableCatalogs cannot be null"); - } - - @Override - public DynamicTableSink createDynamicTableSink(Context context) { - ObjectIdentifier identifier = context.getObjectIdentifier(); - FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context); - Configuration options = (Configuration) helper.getOptions(); - TableFormat tableFormat = TableFormat.valueOf(options.get(MixedFormatValidator.TABLE_FORMAT)); - - return getOriginalCatalog(tableFormat) - .flatMap(AbstractCatalog::getFactory) - .filter(factory -> factory instanceof DynamicTableSinkFactory) - .map(factory -> ((DynamicTableSinkFactory) factory).createDynamicTableSink(context)) - .orElseThrow( - () -> - new UnsupportedOperationException( - String.format( - "Invalid catalog or factory for table format: %s, table: %s.", - tableFormat, identifier))); - } - - @Override - public DynamicTableSource createDynamicTableSource(Context context) { - ObjectIdentifier identifier = context.getObjectIdentifier(); - FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context); - Configuration options = (Configuration) helper.getOptions(); - TableFormat tableFormat = TableFormat.valueOf(options.get(MixedFormatValidator.TABLE_FORMAT)); - - return getOriginalCatalog(tableFormat) - .flatMap(AbstractCatalog::getFactory) - .filter(factory -> factory instanceof DynamicTableSourceFactory) - .map(factory -> ((DynamicTableSourceFactory) factory).createDynamicTableSource(context)) - .orElseThrow( - () -> - new UnsupportedOperationException( - String.format( - "Invalid catalog or factory for table format: %s, table: %s.", - tableFormat, identifier))); - } - - private Optional 
getOriginalCatalog(TableFormat format) { - return Optional.of(availableCatalogs.get(format)); - } - - @Override - public String factoryIdentifier() { - return CatalogFactoryOptions.UNIFIED_IDENTIFIER; - } - - @Override - public Set> requiredOptions() { - Set> requiredOptions = Sets.newHashSet(); - availableCatalogs.forEach( - (format, catalog) -> { - Optional factory = catalog.getFactory(); - factory.ifPresent(value -> requiredOptions.addAll(value.requiredOptions())); - }); - requiredOptions.add(MixedFormatValidator.TABLE_FORMAT); - return requiredOptions; - } - - @Override - public Set> optionalOptions() { - Set> optionalOptions = Sets.newHashSet(); - availableCatalogs.forEach( - (format, catalog) -> { - Optional factory = catalog.getFactory(); - factory.ifPresent(value -> optionalOptions.addAll(value.optionalOptions())); - }); - return optionalOptions; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnkeyedInputFormatOperatorFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnkeyedInputFormatOperatorFactory.java deleted file mode 100644 index 644ef1f807..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnkeyedInputFormatOperatorFactory.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.table; - -import org.apache.amoro.flink.interceptor.ProxyFactory; -import org.apache.amoro.flink.util.IcebergClassUtil; -import org.apache.flink.api.common.operators.MailboxExecutor; -import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.StreamOperator; -import org.apache.flink.streaming.api.operators.StreamOperatorParameters; -import org.apache.flink.streaming.api.operators.YieldingOperatorFactory; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.flink.source.FlinkInputFormat; -import org.apache.iceberg.flink.source.FlinkInputSplit; -import org.apache.iceberg.flink.source.StreamingReaderOperator; - -public class UnkeyedInputFormatOperatorFactory extends AbstractStreamOperatorFactory - implements YieldingOperatorFactory, - OneInputStreamOperatorFactory { - - private final ProxyFactory factory; - - private transient MailboxExecutor mailboxExecutor; - - public UnkeyedInputFormatOperatorFactory(ProxyFactory factory) { - this.factory = factory; - } - - @Override - public void setMailboxExecutor(MailboxExecutor mailboxExecutor) { - this.mailboxExecutor = mailboxExecutor; - } - - @SuppressWarnings("unchecked") - @Override - public > O createStreamOperator( - StreamOperatorParameters parameters) { - StreamingReaderOperator operator = - IcebergClassUtil.newStreamingReaderOperator( - factory.getInstance(), processingTimeService, mailboxExecutor); - 
operator.setup( - parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); - return (O) operator; - } - - @Override - public Class getStreamOperatorClass(ClassLoader classLoader) { - return StreamingReaderOperator.class; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnkeyedInputFormatSourceFunction.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnkeyedInputFormatSourceFunction.java deleted file mode 100644 index 9a85051e49..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/UnkeyedInputFormatSourceFunction.java +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.table; - -import org.apache.amoro.flink.interceptor.ProxyFactory; -import org.apache.flink.api.common.io.RichInputFormat; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.common.typeutils.TypeSerializer; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.core.io.InputSplit; -import org.apache.flink.metrics.Counter; -import org.apache.flink.runtime.jobgraph.tasks.InputSplitProvider; -import org.apache.flink.runtime.jobgraph.tasks.InputSplitProviderException; -import org.apache.flink.streaming.api.functions.source.InputFormatSourceFunction; -import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction; -import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.flink.source.FlinkInputFormat; -import org.apache.iceberg.flink.source.FlinkInputSplit; - -import java.util.Iterator; -import java.util.NoSuchElementException; - -/** Copy from {@link InputFormatSourceFunction} */ -public class UnkeyedInputFormatSourceFunction extends RichParallelSourceFunction { - private static final long serialVersionUID = 1L; - - private final TypeInformation typeInfo; - private transient TypeSerializer serializer; - - private FlinkInputFormat format; - private final ProxyFactory formatFactory; - - private transient InputSplitProvider provider; - private transient Iterator splitIterator; - - private volatile boolean isRunning = true; - - @SuppressWarnings("unchecked") - public UnkeyedInputFormatSourceFunction( - ProxyFactory formatFactory, TypeInformation typeInfo) { - this.formatFactory = formatFactory; - this.typeInfo = typeInfo; - } - - @Override - @SuppressWarnings("unchecked") - public void open(Configuration parameters) throws Exception { - StreamingRuntimeContext context = (StreamingRuntimeContext) getRuntimeContext(); - - format = formatFactory.getInstance(); - if 
(format instanceof RichInputFormat) { - format.setRuntimeContext(context); - } - format.configure(parameters); - - provider = context.getInputSplitProvider(); - serializer = typeInfo.createSerializer(getRuntimeContext().getExecutionConfig()); - splitIterator = getInputSplits(); - isRunning = splitIterator.hasNext(); - } - - @Override - public void run(SourceContext ctx) throws Exception { - try { - Counter completedSplitsCounter = - getRuntimeContext().getMetricGroup().counter("numSplitsProcessed"); - if (isRunning && format instanceof RichInputFormat) { - format.openInputFormat(); - } - - RowData nextElement = serializer.createInstance(); - while (isRunning) { - format.open((FlinkInputSplit) splitIterator.next()); - - // for each element we also check if cancel - // was called by checking the isRunning flag - - while (isRunning && !format.reachedEnd()) { - nextElement = format.nextRecord(nextElement); - if (nextElement != null) { - ctx.collect(nextElement); - } else { - break; - } - } - format.close(); - completedSplitsCounter.inc(); - - if (isRunning) { - isRunning = splitIterator.hasNext(); - } - } - } finally { - format.close(); - if (format instanceof RichInputFormat) { - format.closeInputFormat(); - } - isRunning = false; - } - } - - @Override - public void cancel() { - isRunning = false; - } - - @Override - public void close() throws Exception { - format.close(); - if (format instanceof RichInputFormat) { - format.closeInputFormat(); - } - } - - /** - * Returns the {@code InputFormat}. This is only needed because we need to set the input split - * assigner on the {@code StreamGraph}. 
- */ - public FlinkInputFormat getFormat() { - return format; - } - - private Iterator getInputSplits() { - - return new Iterator() { - - private InputSplit nextSplit; - - private boolean exhausted; - - @Override - public boolean hasNext() { - if (exhausted) { - return false; - } - - if (nextSplit != null) { - return true; - } - - final InputSplit split; - try { - split = provider.getNextInputSplit(getRuntimeContext().getUserCodeClassLoader()); - } catch (InputSplitProviderException e) { - throw new RuntimeException("Could not retrieve next input split.", e); - } - - if (split != null) { - this.nextSplit = split; - return true; - } else { - exhausted = true; - return false; - } - } - - @Override - public InputSplit next() { - if (this.nextSplit == null && !hasNext()) { - throw new NoSuchElementException(); - } - - final InputSplit tmp = this.nextSplit; - this.nextSplit = null; - return tmp; - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - }; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/descriptors/MixedFormatValidator.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/descriptors/MixedFormatValidator.java deleted file mode 100644 index 662da35be3..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/table/descriptors/MixedFormatValidator.java +++ /dev/null @@ -1,349 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.table.descriptors; - -import static org.apache.flink.configuration.description.TextElement.text; - -import org.apache.amoro.TableFormat; -import org.apache.commons.lang.StringUtils; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.description.Description; -import org.apache.flink.table.api.ValidationException; -import org.apache.flink.table.catalog.CatalogBaseTable; -import org.apache.flink.table.descriptors.ConnectorDescriptorValidator; -import org.apache.flink.table.descriptors.DescriptorProperties; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.util.Preconditions; - -import java.time.Duration; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -/** Validate mixed-format table properties. 
*/ -public class MixedFormatValidator extends ConnectorDescriptorValidator { - - public static final String MIXED_FORMAT_EMIT_LOG = "log"; - public static final String MIXED_FORMAT_EMIT_FILE = "file"; - - public static final String MIXED_FORMAT_EMIT_AUTO = "auto"; - - public static final String MIXED_FORMAT_READ_FILE = "file"; - public static final String MIXED_FORMAT_READ_LOG = "log"; - - public static final String MIXED_FORMAT_READ_MODE = "mixed-format.read.mode"; - public static final String MIXED_READ_MODE_DEFAULT = MIXED_FORMAT_READ_FILE; - - public static final String MIXED_FORMAT_LATENCY_METRIC_ENABLE = "metrics.event-latency.enabled"; - public static final boolean MIXED_FORMAT_LATENCY_METRIC_ENABLE_DEFAULT = false; - - @Deprecated - public static final String MIXED_FORMAT_LATENCY_METRIC_ENABLE_LEGACY = - "metrics.event-latency.enable"; - - public static final String MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE = "metrics.enabled"; - public static final boolean MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE_DEFAULT = false; - - @Deprecated - public static final String MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE_LEGACY = "metrics.enable"; - - public static final String BASE_WRITE_LOCATION = "base.write.location"; - public static final String BASE_WRITE_LOCATION_SUFFIX = "/init"; - - public static final String MIXED_FORMAT_WRITE_MAX_OPEN_FILE_SIZE = "write.open-files.size.max"; - public static final long MIXED_FORMAT_WRITE_MAX_OPEN_FILE_SIZE_DEFAULT = - 671088640L; // 640M = 5 * 128M - - // log.consumer.changelog.mode - public static final String LOG_CONSUMER_CHANGELOG_MODE_APPEND_ONLY = "append-only"; - public static final String LOG_CONSUMER_CHANGELOG_MODE_ALL_KINDS = "all-kinds"; - - // file scan startup mode - public static final String SCAN_STARTUP_MODE_EARLIEST = "earliest"; - public static final String SCAN_STARTUP_MODE_LATEST = "latest"; - public static final String SCAN_STARTUP_MODE_TIMESTAMP = "timestamp"; - public static final String SCAN_STARTUP_MODE_GROUP_OFFSETS = 
"group-offsets"; - public static final String SCAN_STARTUP_MODE_SPECIFIC_OFFSETS = "specific-offsets"; - - public static final ConfigOption MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE = - ConfigOptions.key("log-store.consistency-guarantee.enabled") - .booleanType() - .defaultValue(false) - .withDescription("Flag hidden kafka read retraction enable or not."); - - @Deprecated - public static final ConfigOption MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE_LEGACY = - ConfigOptions.key("log-store.consistency-guarantee.enable") - .booleanType() - .defaultValue(false) - .withDescription("Flag hidden kafka read retraction enable or not."); - - public static final ConfigOption MIXED_FORMAT_LOG_CONSUMER_CHANGELOG_MODE = - ConfigOptions.key("log.consumer.changelog.modes") - .stringType() - .defaultValue("all-kinds") - .withDescription( - Description.builder() - .text("Describe what changelog modes does the log consumer support ") - .list( - text("'all-kinds' (log consumer support +I/-D/-U/+U)"), - text("'append-only' (log consumer only support +I)")) - .build()) - .withDescription("Describe what changelog modes does the log consumer support."); - - public static final ConfigOption SOURCE_READER_FETCH_BATCH_RECORD_COUNT = - ConfigOptions.key("table.exec.iceberg.fetch-batch-record-count") - .intType() - .defaultValue(2048) - .withDescription("The target number of records for Iceberg reader fetch batch."); - - public static final ConfigOption SCAN_STARTUP_MODE = - ConfigOptions.key("scan.startup.mode") - .stringType() - .defaultValue(SCAN_STARTUP_MODE_LATEST) - .withDescription( - String.format( - "Optional startup mode for mixed-format source, valid values are " - + "\"earliest\" or \"latest\", \"timestamp\". 
If %s values %s, \"earliest\":" - + " read earliest table data including base and change files from" - + " the current snapshot, \"latest\": read all incremental data in the change table starting from the" - + " current snapshot (the current snapshot will be excluded), \"timestamp\" has not supported yet." - + " If %s values %s, \"earliest\": start from the earliest offset possible." - + " \"latest\": start from the latest offset," - + " \"timestamp\": start from user-supplied timestamp for each partition.", - MIXED_FORMAT_READ_MODE, - MIXED_FORMAT_READ_FILE, - MIXED_FORMAT_READ_MODE, - MIXED_FORMAT_READ_LOG)); - - public static final ConfigOption SCAN_STARTUP_TIMESTAMP_MILLIS = - ConfigOptions.key("scan.startup.timestamp-millis") - .longType() - .noDefaultValue() - .withDescription("Optional timestamp used in case of \"timestamp\" startup mode"); - - public static final ConfigOption SCAN_STARTUP_SPECIFIC_OFFSETS = - ConfigOptions.key("scan.startup.specific-offsets") - .stringType() - .noDefaultValue() - .withDescription("Optional timestamp used in case of \"timestamp\" startup mode"); - - public static final ConfigOption SUBMIT_EMPTY_SNAPSHOTS = - ConfigOptions.key("submit.empty.snapshots") - .booleanType() - .defaultValue(false) - .withDescription( - "Optional submit empty snapshots to the mixed-format table, false means that writers will not emit" - + " empty WriteResults to the committer operator, and reduce the number of snapshots in File Cache; true" - + " means this job will submit empty snapshots to the table, it is suitable with some valid reasons, e.g." 
- + " advance watermark metadata stored in the table(https://github.com/apache/iceberg/pull/5561)."); - - public static final ConfigOption MIXED_FORMAT_CATALOG = - ConfigOptions.key("mixed-format.catalog") - .stringType() - .noDefaultValue() - .withDescription("underlying mixed-format catalog name."); - - public static final ConfigOption MIXED_FORMAT_DATABASE = - ConfigOptions.key("mixed-format.database") - .stringType() - .noDefaultValue() - .withDescription("underlying mixed-format database name."); - - public static final ConfigOption MIXED_FORMAT_TABLE = - ConfigOptions.key("mixed-format.table") - .stringType() - .noDefaultValue() - .withDescription("underlying mixed-format table name."); - - public static final ConfigOption DIM_TABLE_ENABLE = - ConfigOptions.key("dim-table.enabled") - .booleanType() - .defaultValue(false) - .withDescription( - "If it is true, mixed-format source will generate watermark after stock data being read"); - - @Deprecated - public static final ConfigOption DIM_TABLE_ENABLE_LEGACY = - ConfigOptions.key("dim-table.enable") - .booleanType() - .defaultValue(false) - .withDescription( - "If it is true, mixed-format source will generate watermark after stock data being read"); - - public static final ConfigOption MIXED_FORMAT_EMIT_MODE = - ConfigOptions.key("mixed-format.emit.mode") - .stringType() - .defaultValue(MIXED_FORMAT_EMIT_AUTO) - .withDescription( - "file, log, auto. 
e.g.\n" - + "'file' means only writing data into filestore.\n" - + "'log' means only writing data into logstore.\n" - + "'file,log' means writing data into both filestore and logstore.\n" - + "'auto' means writing data into filestore if the logstore of the mixed-format table is disabled;" - + " Also means writing data into both filestore and logstore if the logstore of the mixed-format table" - + " is enabled.\n" - + "'auto' is recommended."); - - public static final ConfigOption AUTO_EMIT_LOGSTORE_WATERMARK_GAP = - ConfigOptions.key("mixed-format.emit.auto-write-to-logstore.watermark-gap") - .durationType() - .noDefaultValue() - .withDescription( - "Only enabled when 'mixed-format.emit.mode'='auto', if the watermark of the mixed-format writers" - + " is greater than the current system timestamp subtracts the specific value, writers will also write" - + " data into the logstore.\n" - + "This value must be greater than 0."); - - public static final ConfigOption LOG_STORE_CATCH_UP = - ConfigOptions.key("log-store.catch-up") - .booleanType() - .defaultValue(false) - .withDescription( - "If it is true, mixed-format source will emit data to filestore and logstore. If it is false," - + " mixed-format source will only emit data to filestore."); - - public static final ConfigOption LOG_STORE_CATCH_UP_TIMESTAMP = - ConfigOptions.key("log-store.catch-up-timestamp") - .longType() - .defaultValue(0L) - .withDescription( - "Mark the time to start double writing (the logstore of mixed-format table catches up with the" - + " historical data)."); - - public static final ConfigOption LOOKUP_CACHE_MAX_ROWS = - ConfigOptions.key("lookup.cache.max-rows") - .longType() - .defaultValue(10000L) - .withDescription( - "The maximum number of rows in the lookup cache, beyond which the oldest row will expire." 
- + " By default, lookup cache is 10000."); - - public static final ConfigOption LOOKUP_CACHE_TTL_AFTER_WRITE = - ConfigOptions.key("lookup.cache.ttl-after-write") - .durationType() - .defaultValue(Duration.ZERO) - .withDescription("The TTL after which the row will expire in the lookup cache."); - - public static final ConfigOption LOOKUP_RELOADING_INTERVAL = - ConfigOptions.key("lookup.reloading.interval") - .durationType() - .defaultValue(Duration.ofSeconds(10)) - .withDescription( - "Configuration option for specifying the interval in seconds to reload lookup data in RocksDB." - + "\nThe default value is 10 seconds."); - - public static final ConfigOption ROCKSDB_AUTO_COMPACTIONS = - ConfigOptions.key("rocksdb.auto-compactions") - .booleanType() - .defaultValue(false) - .withDescription( - "Enable automatic compactions during the initialization process." - + "\nAfter the initialization completed, will enable the auto_compaction."); - - public static final ConfigOption ROCKSDB_WRITING_THREADS = - ConfigOptions.key("rocksdb.writing-threads") - .intType() - .defaultValue(5) - .withDescription("Writing data into rocksDB thread number."); - - public static final ConfigOption ROCKSDB_BLOCK_CACHE_CAPACITY = - ConfigOptions.key("rocksdb.block-cache.capacity") - .longType() - .defaultValue(32 * 1024 * 1024L) - .withDescription( - "Use the LRUCache strategy for blocks, the size of the BlockCache can be configured based on " - + "your memory requirements and available system resources. Default is 32MB."); - - public static final ConfigOption ROCKSDB_BLOCK_CACHE_NUM_SHARD_BITS = - ConfigOptions.key("rocksdb.block-cache.numShardBits") - .intType() - .defaultValue(-1) - .withDescription( - "Use the LRUCache strategy for blocks. The cache is sharded to 2^numShardBits shards, by hash " - + " of the key. 
Default is -1, means it is automatically determined: every shard will be at least 512KB and" - + " number of shard bits will not exceed 6."); - - public static final ConfigOption TABLE_FORMAT = - ConfigOptions.key("table.format") - .stringType() - .defaultValue(TableFormat.MIXED_ICEBERG.name()) - .withDescription( - String.format( - "The format of the table, valid values are %s, %s, %s or %s, and Flink choose '%s' as default format.", - TableFormat.ICEBERG.name(), - TableFormat.MIXED_ICEBERG.name(), - TableFormat.MIXED_HIVE.name(), - TableFormat.PAIMON.name(), - TableFormat.MIXED_ICEBERG.name())); - - public static final ConfigOption SCAN_PARALLELISM = - ConfigOptions.key("source.parallelism") - .intType() - .noDefaultValue() - .withDescription( - "Defines a custom parallelism for the source. " - + "By default, if this option is not defined, the planner will derive the parallelism " - + "for each statement individually by also considering the global configuration."); - - @Override - public void validate(DescriptorProperties properties) { - String emitMode = properties.getString(MIXED_FORMAT_EMIT_MODE.key()); - if (StringUtils.isBlank(emitMode)) { - throw new ValidationException("None value for property '" + MIXED_FORMAT_EMIT_MODE.key()); - } - - String[] actualEmitModes = emitMode.split(","); - List modeList = - Arrays.asList(MIXED_FORMAT_EMIT_FILE, MIXED_FORMAT_EMIT_LOG, MIXED_FORMAT_EMIT_AUTO); - for (String mode : actualEmitModes) { - if (!modeList.contains(mode)) { - throw new ValidationException( - "Unknown value for property '" - + MIXED_FORMAT_EMIT_MODE.key() - + "'.\n" - + "Supported values are " - + modeList.stream() - .collect(Collectors.toMap(v -> v, v -> DescriptorProperties.noValidation())) - .keySet() - + " but was: " - + mode); - } - - Preconditions.checkArgument( - !MIXED_FORMAT_EMIT_AUTO.equals(mode) || actualEmitModes.length == 1, - "The value of property '" - + MIXED_FORMAT_EMIT_MODE.key() - + "' must be only 'auto' when it is included."); - } - 
} - - public static Configuration asConfiguration(Map options) { - final Configuration configuration = new Configuration(); - options.forEach(configuration::setString); - return configuration; - } - - private static RowType getRowType(CatalogBaseTable flinkTable) { - return (RowType) flinkTable.getSchema().toRowDataType().getLogicalType(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/CompatibleFlinkPropertyUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/CompatibleFlinkPropertyUtil.java deleted file mode 100644 index fbe22e6c80..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/CompatibleFlinkPropertyUtil.java +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.util; - -import static org.apache.flink.streaming.connectors.kafka.table.KafkaConnectorOptions.TOPIC; - -import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.amoro.table.TableProperties; -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.iceberg.util.PropertyUtil; - -import java.util.List; -import java.util.Map; -import java.util.Properties; - -/** PropertyUtil compatible with legacy flink properties */ -public class CompatibleFlinkPropertyUtil { - - private CompatibleFlinkPropertyUtil() {} - - public static boolean propertyAsBoolean( - Map properties, String property, boolean defaultValue) { - return PropertyUtil.propertyAsBoolean( - properties, getCompatibleProperty(properties, property), defaultValue); - } - - public static boolean propertyAsBoolean( - ReadableConfig config, ConfigOption configOption) { - ConfigOption legacyProperty = getLegacyProperty(configOption); - if (legacyProperty != null - && config.getOptional(legacyProperty).isPresent() - && !config.getOptional(configOption).isPresent()) { - return config.get(legacyProperty); - } else { - return config.get(configOption); - } - } - - public static double propertyAsDouble( - Map properties, String property, double defaultValue) { - return PropertyUtil.propertyAsDouble( - properties, getCompatibleProperty(properties, property), defaultValue); - } - - public static int propertyAsInt( - Map properties, String property, int defaultValue) { - return PropertyUtil.propertyAsInt( - properties, getCompatibleProperty(properties, property), defaultValue); - } - - public static long propertyAsLong( - Map properties, String property, long defaultValue) { - return PropertyUtil.propertyAsLong( - properties, getCompatibleProperty(properties, property), defaultValue); - } - - public static String propertyAsString( - Map 
properties, String property, String defaultValue) { - return PropertyUtil.propertyAsString( - properties, getCompatibleProperty(properties, property), defaultValue); - } - - private static String getCompatibleProperty(Map properties, String property) { - String legacyProperty = getLegacyProperty(property); - if (legacyProperty != null - && properties.containsKey(legacyProperty) - && !properties.containsKey(property)) { - return legacyProperty; - } else { - return property; - } - } - - private static String getLegacyProperty(String property) { - if (property == null) { - return null; - } - if (MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.key().equals(property)) { - return MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE_LEGACY.key(); - } else if (MixedFormatValidator.DIM_TABLE_ENABLE.key().equals(property)) { - return MixedFormatValidator.DIM_TABLE_ENABLE_LEGACY.key(); - } - switch (property) { - case MixedFormatValidator.MIXED_FORMAT_LATENCY_METRIC_ENABLE: - return MixedFormatValidator.MIXED_FORMAT_LATENCY_METRIC_ENABLE_LEGACY; - case MixedFormatValidator.MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE: - return MixedFormatValidator.MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE_LEGACY; - default: - return null; - } - } - - private static ConfigOption getLegacyProperty(ConfigOption configOption) { - if (configOption == null) { - return null; - } - if (MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE - .key() - .equals(configOption.key())) { - return MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE_LEGACY; - } else if (MixedFormatValidator.DIM_TABLE_ENABLE.key().equals(configOption.key())) { - return MixedFormatValidator.DIM_TABLE_ENABLE_LEGACY; - } - return null; - } - - /** - * Get log-store properties from table properties and flink options, whose prefix is {@link - * TableProperties#LOG_STORE_PROPERTIES_PREFIX}. 
- * - * @param tableOptions including table properties and flink options - * @return Properties. The keys in it have no {@link TableProperties#LOG_STORE_PROPERTIES_PREFIX}. - */ - public static Properties fetchLogstorePrefixProperties(Map tableOptions) { - final Properties properties = new Properties(); - - if (hasPrefix(tableOptions, TableProperties.LOG_STORE_PROPERTIES_PREFIX)) { - tableOptions.keySet().stream() - .filter(key -> key.startsWith(TableProperties.LOG_STORE_PROPERTIES_PREFIX)) - .forEach( - key -> { - final String value = tableOptions.get(key); - final String subKey = - key.substring((TableProperties.LOG_STORE_PROPERTIES_PREFIX).length()); - properties.put(subKey, value); - }); - } - return properties; - } - - public static boolean hasPrefix(Map tableOptions, String prefix) { - return tableOptions.keySet().stream().anyMatch(k -> k.startsWith(prefix)); - } - - public static List getLogTopic(Map tableProperties) { - Configuration conf = new Configuration(); - conf.setString(TOPIC.key(), tableProperties.get(TableProperties.LOG_STORE_MESSAGE_TOPIC)); - return conf.get(TOPIC); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/DateTimeUtils.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/DateTimeUtils.java deleted file mode 100644 index 77e9f83a47..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/DateTimeUtils.java +++ /dev/null @@ -1,1797 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.util; - -import static java.time.temporal.ChronoField.DAY_OF_MONTH; -import static java.time.temporal.ChronoField.HOUR_OF_DAY; -import static java.time.temporal.ChronoField.MINUTE_OF_HOUR; -import static java.time.temporal.ChronoField.MONTH_OF_YEAR; -import static java.time.temporal.ChronoField.NANO_OF_SECOND; -import static java.time.temporal.ChronoField.SECOND_OF_MINUTE; -import static java.time.temporal.ChronoField.YEAR; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.table.api.TableException; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.TimestampType; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.math.BigDecimal; -import java.math.RoundingMode; -import java.sql.Timestamp; -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.time.DateTimeException; -import java.time.Instant; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.time.ZoneId; -import java.time.ZonedDateTime; -import java.time.format.DateTimeFormatter; -import java.time.format.DateTimeFormatterBuilder; -import java.time.format.DateTimeParseException; -import java.time.temporal.TemporalAccessor; -import java.util.Collections; -import java.util.Date; -import java.util.HashMap; -import java.util.Map; -import java.util.Objects; -import java.util.TimeZone; - -/** - * Utility functions for datetime types: date, time, 
timestamp. - * - *

These utils include: - * - *

    - *
  • {@code parse[type]}: methods for parsing strings to date/time/timestamp - *
  • {@code format[type]}: methods for formatting date/time/timestamp - *
  • {@code to[externalTypeName]} and {@code toInternal}: methods for converting values from - * internal date/time/timestamp types from/to java.sql or java.time types - *
  • Various operations on timestamp, including floor, ceil and extract - *
  • {@link TimeUnit} and {@link TimeUnitRange} enums - *
- * - *

Currently, this class is a bit messy because it includes a mix of functionalities both from - * common and planner. We should strive to reduce the number of functionalities here, eventually - * moving some methods closer to where they're needed. Connectors and formats should not use this - * class, but rather if a functionality is necessary, it should be part of the public APIs of our - * type system (e.g a new method in {@link TimestampData} or in {@link TimestampType}). Methods used - * only by the planner should live inside the planner whenever is possible. - * - *

Copied from flink-1.18 - */ -@Internal -public class DateTimeUtils { - - private static final Logger LOG = LoggerFactory.getLogger(DateTimeUtils.class); - - /** The julian date of the epoch, 1970-01-01. */ - public static final int EPOCH_JULIAN = 2440588; - - /** The number of milliseconds in a second. */ - private static final long MILLIS_PER_SECOND = 1000L; - - /** The number of milliseconds in a minute. */ - private static final long MILLIS_PER_MINUTE = 60000L; - - /** The number of milliseconds in an hour. */ - private static final long MILLIS_PER_HOUR = 3600000L; // = 60 * 60 * 1000 - - /** - * The number of milliseconds in a day. - * - *

This is the modulo 'mask' used when converting TIMESTAMP values to DATE and TIME values. - */ - public static final long MILLIS_PER_DAY = 86400000L; // = 24 * 60 * 60 * 1000 - - /** The SimpleDateFormat string for ISO dates, "yyyy-MM-dd". */ - private static final String DATE_FORMAT_STRING = "yyyy-MM-dd"; - - /** The SimpleDateFormat string for ISO times, "HH:mm:ss". */ - private static final String TIME_FORMAT_STRING = "HH:mm:ss"; - - /** The SimpleDateFormat string for ISO timestamps, "yyyy-MM-dd HH:mm:ss". */ - private static final String TIMESTAMP_FORMAT_STRING = - DATE_FORMAT_STRING + " " + TIME_FORMAT_STRING; - - /** The UTC time zone. */ - public static final TimeZone UTC_ZONE = TimeZone.getTimeZone("UTC"); - - /** The local time zone. */ - public static final TimeZone LOCAL_TZ = TimeZone.getDefault(); - - /** The valid minimum epoch milliseconds ('0000-01-01 00:00:00.000 UTC+0'). */ - private static final long MIN_EPOCH_MILLS = -62167219200000L; - - /** The valid minimum epoch seconds ('0000-01-01 00:00:00 UTC+0'). */ - private static final long MIN_EPOCH_SECONDS = -62167219200L; - - /** The valid maximum epoch milliseconds ('9999-12-31 23:59:59.999 UTC+0'). */ - private static final long MAX_EPOCH_MILLS = 253402300799999L; - - /** The valid maximum epoch seconds ('9999-12-31 23:59:59 UTC+0'). */ - private static final long MAX_EPOCH_SECONDS = 253402300799L; - - private static final DateTimeFormatter DEFAULT_TIMESTAMP_FORMATTER = - new DateTimeFormatterBuilder() - .appendPattern("yyyy-[MM][M]-[dd][d]") - .optionalStart() - .appendPattern(" [HH][H]:[mm][m]:[ss][s]") - .appendFraction(NANO_OF_SECOND, 0, 9, true) - .optionalEnd() - .toFormatter(); - - /** - * A ThreadLocal cache map for SimpleDateFormat, because SimpleDateFormat is not thread-safe. - * (string_format) => formatter - */ - private static final ThreadLocalCache FORMATTER_CACHE = - ThreadLocalCache.of(SimpleDateFormat::new); - - /** A ThreadLocal cache map for DateTimeFormatter. 
(string_format) => formatter */ - private static final ThreadLocalCache DATETIME_FORMATTER_CACHE = - ThreadLocalCache.of(DateTimeFormatter::ofPattern); - - /** A ThreadLocal cache map for TimeZone. (string_zone_id) => TimeZone */ - private static final ThreadLocalCache TIMEZONE_CACHE = - ThreadLocalCache.of(TimeZone::getTimeZone); - - // -------------------------------------------------------------------------------------------- - // java.sql Date/Time/Timestamp --> internal data types - // -------------------------------------------------------------------------------------------- - - /** - * Converts the internal representation of a SQL DATE (int) to the Java type used for UDF - * parameters ({@link java.sql.Date}). - */ - public static java.sql.Date toSQLDate(int v) { - // note that, in this case, can't handle Daylight Saving Time - final long t = v * MILLIS_PER_DAY; - return new java.sql.Date(t - LOCAL_TZ.getOffset(t)); - } - - /** - * Converts the internal representation of a SQL TIME (int) to the Java type used for UDF - * parameters ({@link java.sql.Time}). - */ - public static java.sql.Time toSQLTime(int v) { - // note that, in this case, can't handle Daylight Saving Time - return new java.sql.Time(v - LOCAL_TZ.getOffset(v)); - } - - /** - * Converts the internal representation of a SQL TIMESTAMP (long) to the Java type used for UDF - * parameters ({@link Timestamp}). - */ - public static Timestamp toSQLTimestamp(long v) { - return new Timestamp(v - LOCAL_TZ.getOffset(v)); - } - - /** - * Converts the Java type used for UDF parameters of SQL DATE type ({@link java.sql.Date}) to - * internal representation (int). - * - *

Converse of {@link #toSQLDate(int)}. - */ - public static int toInternal(java.sql.Date date) { - long ts = date.getTime() + LOCAL_TZ.getOffset(date.getTime()); - return (int) (ts / MILLIS_PER_DAY); - } - - /** - * Converts the Java type used for UDF parameters of SQL TIME type ({@link java.sql.Time}) to - * internal representation (int). - * - *

Converse of {@link #toSQLTime(int)}. - */ - public static int toInternal(java.sql.Time time) { - long ts = time.getTime() + LOCAL_TZ.getOffset(time.getTime()); - return (int) (ts % MILLIS_PER_DAY); - } - - /** - * Converts the Java type used for UDF parameters of SQL TIMESTAMP type ({@link Timestamp}) to - * internal representation (long). - * - *

Converse of {@link #toSQLTimestamp(long)}. - */ - public static long toInternal(Timestamp ts) { - long time = ts.getTime(); - return time + LOCAL_TZ.getOffset(time); - } - - public static int toInternal(LocalDate date) { - return ymdToUnixDate(date.getYear(), date.getMonthValue(), date.getDayOfMonth()); - } - - public static int toInternal(LocalTime time) { - return time.getHour() * (int) MILLIS_PER_HOUR - + time.getMinute() * (int) MILLIS_PER_MINUTE - + time.getSecond() * (int) MILLIS_PER_SECOND - + time.getNano() / 1000_000; - } - - // -------------------------------------------------------------------------------------------- - // Java 8 time conversion - // -------------------------------------------------------------------------------------------- - - public static LocalDate toLocalDate(int date) { - return julianToLocalDate(date + EPOCH_JULIAN); - } - - private static LocalDate julianToLocalDate(int julian) { - // this shifts the epoch back to astronomical year -4800 instead of the - // start of the Christian era in year AD 1 of the proleptic Gregorian - // calendar. 
- int j = julian + 32044; - int g = j / 146097; - int dg = j % 146097; - int c = (dg / 36524 + 1) * 3 / 4; - int dc = dg - c * 36524; - int b = dc / 1461; - int db = dc % 1461; - int a = (db / 365 + 1) * 3 / 4; - int da = db - a * 365; - - // integer number of full years elapsed since March 1, 4801 BC - int y = g * 400 + c * 100 + b * 4 + a; - // integer number of full months elapsed since the last March 1 - int m = (da * 5 + 308) / 153 - 2; - // number of days elapsed since day 1 of the month - int d = da - (m + 4) * 153 / 5 + 122; - int year = y - 4800 + (m + 2) / 12; - int month = (m + 2) % 12 + 1; - int day = d + 1; - return LocalDate.of(year, month, day); - } - - private static int ymdToUnixDate(int year, int month, int day) { - final int julian = ymdToJulian(year, month, day); - return julian - EPOCH_JULIAN; - } - - private static int ymdToJulian(int year, int month, int day) { - int a = (14 - month) / 12; - int y = year + 4800 - a; - int m = month + 12 * a - 3; - return day + (153 * m + 2) / 5 + 365 * y + y / 4 - y / 100 + y / 400 - 32045; - } - - public static LocalTime toLocalTime(int time) { - int h = time / 3600000; - int time2 = time % 3600000; - int m = time2 / 60000; - int time3 = time2 % 60000; - int s = time3 / 1000; - int ms = time3 % 1000; - return LocalTime.of(h, m, s, ms * 1000_000); - } - - public static LocalDateTime toLocalDateTime(long timestamp) { - int date = (int) (timestamp / MILLIS_PER_DAY); - int time = (int) (timestamp % MILLIS_PER_DAY); - if (time < 0) { - --date; - time += MILLIS_PER_DAY; - } - LocalDate localDate = toLocalDate(date); - LocalTime localTime = toLocalTime(time); - return LocalDateTime.of(localDate, localTime); - } - - // -------------------------------------------------------------------------------------------- - // Numeric -> Timestamp conversion - // -------------------------------------------------------------------------------------------- - - public static TimestampData toTimestampData(long v, int precision) { - 
switch (precision) { - case 0: - if (MIN_EPOCH_SECONDS <= v && v <= MAX_EPOCH_SECONDS) { - return timestampDataFromEpochMills(v * MILLIS_PER_SECOND); - } else { - return null; - } - case 3: - return timestampDataFromEpochMills(v); - default: - throw new TableException( - "The precision value '" - + precision - + "' for function " - + "TO_TIMESTAMP_LTZ(numeric, precision) is unsupported," - + " the supported value is '0' for second or '3' for millisecond."); - } - } - - public static TimestampData toTimestampData(double v, int precision) { - switch (precision) { - case 0: - if (MIN_EPOCH_SECONDS <= v && v <= MAX_EPOCH_SECONDS) { - return timestampDataFromEpochMills((long) (v * MILLIS_PER_SECOND)); - } else { - return null; - } - case 3: - return timestampDataFromEpochMills((long) v); - default: - throw new TableException( - "The precision value '" - + precision - + "' for function " - + "TO_TIMESTAMP_LTZ(numeric, precision) is unsupported," - + " the supported value is '0' for second or '3' for millisecond."); - } - } - - public static TimestampData toTimestampData(DecimalData v, int precision) { - long epochMills; - switch (precision) { - case 0: - epochMills = - v.toBigDecimal().setScale(0, RoundingMode.DOWN).longValue() * MILLIS_PER_SECOND; - return timestampDataFromEpochMills(epochMills); - case 3: - epochMills = toMillis(v); - return timestampDataFromEpochMills(epochMills); - default: - throw new TableException( - "The precision value '" - + precision - + "' for function " - + "TO_TIMESTAMP_LTZ(numeric, precision) is unsupported," - + " the supported value is '0' for second or '3' for millisecond."); - } - } - - private static TimestampData timestampDataFromEpochMills(long epochMills) { - if (MIN_EPOCH_MILLS <= epochMills && epochMills <= MAX_EPOCH_MILLS) { - return TimestampData.fromEpochMillis(epochMills); - } - return null; - } - - private static long toMillis(DecimalData v) { - return v.toBigDecimal().setScale(0, RoundingMode.DOWN).longValue(); - } - - // 
-------------------------------------------------------------------------------------------- - // Parsing functions - // -------------------------------------------------------------------------------------------- - - public static TimestampData parseTimestampData(String dateStr) throws DateTimeException { - // Precision is hardcoded to match signature of TO_TIMESTAMP - // https://issues.apache.org/jira/browse/FLINK-14925 - return parseTimestampData(dateStr, 3); - } - - public static TimestampData parseTimestampData(String dateStr, int precision) - throws DateTimeException { - return TimestampData.fromLocalDateTime( - fromTemporalAccessor(DEFAULT_TIMESTAMP_FORMATTER.parse(dateStr), precision)); - } - - public static TimestampData parseTimestampData(String dateStr, int precision, TimeZone timeZone) - throws DateTimeException { - return TimestampData.fromInstant( - fromTemporalAccessor(DEFAULT_TIMESTAMP_FORMATTER.parse(dateStr), precision) - .atZone(timeZone.toZoneId()) - .toInstant()); - } - - public static TimestampData parseTimestampData(String dateStr, String format) { - DateTimeFormatter formatter = DATETIME_FORMATTER_CACHE.get(format); - - try { - TemporalAccessor accessor = formatter.parse(dateStr); - // Precision is hardcoded to match signature of TO_TIMESTAMP - // https://issues.apache.org/jira/browse/FLINK-14925 - LocalDateTime ldt = fromTemporalAccessor(accessor, 3); - return TimestampData.fromLocalDateTime(ldt); - } catch (DateTimeParseException e) { - // fall back to support cases like '1999-9-10 05:20:10' or '1999-9-10' - try { - dateStr = dateStr.trim(); - int space = dateStr.indexOf(' '); - if (space >= 0) { - Timestamp ts = Timestamp.valueOf(dateStr); - return TimestampData.fromTimestamp(ts); - } else { - java.sql.Date dt = java.sql.Date.valueOf(dateStr); - return TimestampData.fromLocalDateTime( - LocalDateTime.of(dt.toLocalDate(), LocalTime.MIDNIGHT)); - } - } catch (IllegalArgumentException ie) { - return null; - } - } - } - - /** - * This is 
similar to {@link LocalDateTime#from(TemporalAccessor)}, but it's less strict and - * introduces default values. - */ - private static LocalDateTime fromTemporalAccessor(TemporalAccessor accessor, int precision) { - // complement year with 1970 - int year = accessor.isSupported(YEAR) ? accessor.get(YEAR) : 1970; - // complement month with 1 - int month = accessor.isSupported(MONTH_OF_YEAR) ? accessor.get(MONTH_OF_YEAR) : 1; - // complement day with 1 - int day = accessor.isSupported(DAY_OF_MONTH) ? accessor.get(DAY_OF_MONTH) : 1; - // complement hour with 0 - int hour = accessor.isSupported(HOUR_OF_DAY) ? accessor.get(HOUR_OF_DAY) : 0; - // complement minute with 0 - int minute = accessor.isSupported(MINUTE_OF_HOUR) ? accessor.get(MINUTE_OF_HOUR) : 0; - // complement second with 0 - int second = accessor.isSupported(SECOND_OF_MINUTE) ? accessor.get(SECOND_OF_MINUTE) : 0; - // complement nano_of_second with 0 - int nanoOfSecond = accessor.isSupported(NANO_OF_SECOND) ? accessor.get(NANO_OF_SECOND) : 0; - - if (precision == 0) { - nanoOfSecond = 0; - } else if (precision != 9) { - nanoOfSecond = (int) floor(nanoOfSecond, powerX(10, 9 - precision)); - } - - return LocalDateTime.of(year, month, day, hour, minute, second, nanoOfSecond); - } - - /** - * Parse date time string to timestamp based on the given time zone and format. Returns null if - * parsing failed. - * - * @param dateStr the date time string - * @param format date time string format - * @param tz the time zone - */ - private static long parseTimestampMillis(String dateStr, String format, TimeZone tz) - throws ParseException { - SimpleDateFormat formatter = FORMATTER_CACHE.get(format); - formatter.setTimeZone(tz); - return formatter.parse(dateStr).getTime(); - } - - /** - * Parse date time string to timestamp based on the given time zone string and format. Returns - * null if parsing failed. 
- * - * @param dateStr the date time string - * @param tzStr the time zone id string - */ - private static long parseTimestampTz(String dateStr, String tzStr) throws ParseException { - TimeZone tz = TIMEZONE_CACHE.get(tzStr); - return parseTimestampMillis(dateStr, DateTimeUtils.TIMESTAMP_FORMAT_STRING, tz); - } - - /** Returns the epoch days since 1970-01-01. */ - public static int parseDate(String dateStr, String fromFormat) { - // It is OK to use UTC, we just want get the epoch days - // TODO use offset, better performance - long ts = internalParseTimestampMillis(dateStr, fromFormat, TimeZone.getTimeZone("UTC")); - ZoneId zoneId = ZoneId.of("UTC"); - Instant instant = Instant.ofEpochMilli(ts); - ZonedDateTime zdt = ZonedDateTime.ofInstant(instant, zoneId); - return ymdToUnixDate(zdt.getYear(), zdt.getMonthValue(), zdt.getDayOfMonth()); - } - - public static Integer parseDate(String s) { - // allow timestamp str to date, e.g. 2017-12-12 09:30:00.0 - int ws1 = s.indexOf(" "); - if (ws1 > 0) { - s = s.substring(0, ws1); - } - int hyphen1 = s.indexOf('-'); - int y; - int m; - int d; - if (hyphen1 < 0) { - if (!isInteger(s.trim())) { - return null; - } - y = Integer.parseInt(s.trim()); - m = 1; - d = 1; - } else { - if (!isInteger(s.substring(0, hyphen1).trim())) { - return null; - } - y = Integer.parseInt(s.substring(0, hyphen1).trim()); - final int hyphen2 = s.indexOf('-', hyphen1 + 1); - if (hyphen2 < 0) { - if (!isInteger(s.substring(hyphen1 + 1).trim())) { - return null; - } - m = Integer.parseInt(s.substring(hyphen1 + 1).trim()); - d = 1; - } else { - if (!isInteger(s.substring(hyphen1 + 1, hyphen2).trim())) { - return null; - } - m = Integer.parseInt(s.substring(hyphen1 + 1, hyphen2).trim()); - if (!isInteger(s.substring(hyphen2 + 1).trim())) { - return null; - } - d = Integer.parseInt(s.substring(hyphen2 + 1).trim()); - } - } - if (!isIllegalDate(y, m, d)) { - return null; - } - return ymdToUnixDate(y, m, d); - } - - public static Integer parseTime(String v) { 
- final int start = 0; - final int colon1 = v.indexOf(':', start); - // timezone hh:mm:ss[.ssssss][[+|-]hh:mm:ss] - // refer https://www.w3.org/TR/NOTE-datetime - int timezoneHour; - int timezoneMinute; - int hour; - int minute; - int second; - int milli; - int operator = -1; - int end = v.length(); - int timezone = v.indexOf('-', start); - if (timezone < 0) { - timezone = v.indexOf('+', start); - operator = 1; - } - if (timezone < 0) { - timezoneHour = 0; - timezoneMinute = 0; - } else { - end = timezone; - final int colon3 = v.indexOf(':', timezone); - if (colon3 < 0) { - if (!isInteger(v.substring(timezone + 1).trim())) { - return null; - } - timezoneHour = Integer.parseInt(v.substring(timezone + 1).trim()); - timezoneMinute = 0; - } else { - if (!isInteger(v.substring(timezone + 1, colon3).trim())) { - return null; - } - timezoneHour = Integer.parseInt(v.substring(timezone + 1, colon3).trim()); - if (!isInteger(v.substring(colon3 + 1).trim())) { - return null; - } - timezoneMinute = Integer.parseInt(v.substring(colon3 + 1).trim()); - } - } - if (colon1 < 0) { - if (!isInteger(v.substring(start, end).trim())) { - return null; - } - hour = Integer.parseInt(v.substring(start, end).trim()); - minute = 0; - second = 0; - milli = 0; - } else { - if (!isInteger(v.substring(start, colon1).trim())) { - return null; - } - hour = Integer.parseInt(v.substring(start, colon1).trim()); - final int colon2 = v.indexOf(':', colon1 + 1); - if (colon2 < 0) { - if (!isInteger(v.substring(colon1 + 1, end).trim())) { - return null; - } - minute = Integer.parseInt(v.substring(colon1 + 1, end).trim()); - second = 0; - milli = 0; - } else { - if (!isInteger(v.substring(colon1 + 1, colon2).trim())) { - return null; - } - minute = Integer.parseInt(v.substring(colon1 + 1, colon2).trim()); - int dot = v.indexOf('.', colon2); - if (dot < 0) { - if (!isInteger(v.substring(colon2 + 1, end).trim())) { - return null; - } - second = Integer.parseInt(v.substring(colon2 + 1, end).trim()); - milli = 
0; - } else { - if (!isInteger(v.substring(colon2 + 1, dot).trim())) { - return null; - } - second = Integer.parseInt(v.substring(colon2 + 1, dot).trim()); - milli = parseFraction(v.substring(dot + 1, end).trim()); - } - } - } - hour += operator * timezoneHour; - minute += operator * timezoneMinute; - return hour * (int) MILLIS_PER_HOUR - + minute * (int) MILLIS_PER_MINUTE - + second * (int) MILLIS_PER_SECOND - + milli; - } - - /** - * Parses a fraction, multiplying the first character by {@code multiplier}, the second character - * by {@code multiplier / 10}, the third character by {@code multiplier / 100}, and so forth. - * - *

For example, {@code parseFraction("1234", 100)} yields {@code 123}. - */ - private static int parseFraction(String v) { - int multiplier = 100; - int r = 0; - for (int i = 0; i < v.length(); i++) { - char c = v.charAt(i); - int x = c < '0' || c > '9' ? 0 : (c - '0'); - r += multiplier * x; - if (multiplier < 10) { - // We're at the last digit. Check for rounding. - if (i + 1 < v.length() && v.charAt(i + 1) >= '5') { - ++r; - } - break; - } - multiplier /= 10; - } - return r; - } - - // -------------------------------------------------------------------------------------------- - // Format - // -------------------------------------------------------------------------------------------- - - public static String formatTimestamp(TimestampData ts, String format) { - return formatTimestamp(ts, format, ZoneId.of("UTC")); - } - - public static String formatTimestamp(TimestampData ts, String format, TimeZone zone) { - return formatTimestamp(ts, format, zone.toZoneId()); - } - - private static String formatTimestamp(TimestampData ts, int precision) { - LocalDateTime ldt = ts.toLocalDateTime(); - - String fraction = pad(9, ldt.getNano()); - while (fraction.length() > precision && fraction.endsWith("0")) { - fraction = fraction.substring(0, fraction.length() - 1); - } - - StringBuilder ymdhms = - ymdhms( - new StringBuilder(), - ldt.getYear(), - ldt.getMonthValue(), - ldt.getDayOfMonth(), - ldt.getHour(), - ldt.getMinute(), - ldt.getSecond()); - - if (fraction.length() > 0) { - ymdhms.append(".").append(fraction); - } - - return ymdhms.toString(); - } - - public static String formatTimestamp(TimestampData ts, TimeZone tz, int precision) { - return formatTimestamp(timestampWithLocalZoneToTimestamp(ts, tz), precision); - } - - private static String formatTimestamp(TimestampData ts, String format, ZoneId zoneId) { - DateTimeFormatter formatter = DATETIME_FORMATTER_CACHE.get(format); - Instant instant = ts.toInstant(); - return LocalDateTime.ofInstant(instant, 
zoneId).format(formatter); - } - - public static String formatTimestampString( - String dateStr, String fromFormat, String toFormat, TimeZone tz) { - SimpleDateFormat fromFormatter = FORMATTER_CACHE.get(fromFormat); - fromFormatter.setTimeZone(tz); - SimpleDateFormat toFormatter = FORMATTER_CACHE.get(toFormat); - toFormatter.setTimeZone(tz); - try { - return toFormatter.format(fromFormatter.parse(dateStr)); - } catch (ParseException e) { - LOG.error( - "Exception when formatting: '" - + dateStr - + "' from: '" - + fromFormat - + "' to: '" - + toFormat - + "'", - e); - return null; - } - } - - public static String formatTimestampString(String dateStr, String toFormat, TimeZone tz) { - // use yyyy-MM-dd HH:mm:ss as default - return formatTimestampString(dateStr, TIMESTAMP_FORMAT_STRING, toFormat, tz); - } - - public static String formatTimestampString(String dateStr, String toFormat) { - return formatTimestampString(dateStr, toFormat, UTC_ZONE); - } - - public static String formatTimestampMillis(long ts, String format, TimeZone tz) { - SimpleDateFormat formatter = FORMATTER_CACHE.get(format); - formatter.setTimeZone(tz); - Date dateTime = new Date(ts); - return formatter.format(dateTime); - } - - public static String formatTimestampMillis(int time, int precision) { - final StringBuilder buf = new StringBuilder(8 + (precision > 0 ? 
precision + 1 : 0)); - formatTimestampMillis(buf, time, precision); - return buf.toString(); - } - - private static void formatTimestampMillis(StringBuilder buf, int time, int precision) { - // we copy this method from Calcite DateTimeUtils but add the following changes - // time may be negative which means time milli seconds before 00:00:00 - // this maybe a bug in calcite avatica - while (time < 0) { - time += MILLIS_PER_DAY; - } - int h = time / 3600000; - int time2 = time % 3600000; - int m = time2 / 60000; - int time3 = time2 % 60000; - int s = time3 / 1000; - int ms = time3 % 1000; - int2(buf, h); - buf.append(':'); - int2(buf, m); - buf.append(':'); - int2(buf, s); - if (precision > 0) { - buf.append('.'); - while (precision > 0) { - buf.append((char) ('0' + (ms / 100))); - ms = ms % 100; - ms = ms * 10; - - // keep consistent with Timestamp.toString() - if (ms == 0) { - break; - } - - --precision; - } - } - } - - private static void int2(StringBuilder buf, int i) { - buf.append((char) ('0' + (i / 10) % 10)); - buf.append((char) ('0' + i % 10)); - } - - /** Helper for CAST({date} AS VARCHAR(n)). */ - public static String formatDate(int date) { - final StringBuilder buf = new StringBuilder(10); - formatDate(buf, date); - return buf.toString(); - } - - private static void formatDate(StringBuilder buf, int date) { - julianToString(buf, date + EPOCH_JULIAN); - } - - private static void julianToString(StringBuilder buf, int julian) { - // this shifts the epoch back to astronomical year -4800 instead of the - // start of the Christian era in year AD 1 of the proleptic Gregorian - // calendar. 
- int j = julian + 32044; - int g = j / 146097; - int dg = j % 146097; - int c = (dg / 36524 + 1) * 3 / 4; - int dc = dg - c * 36524; - int b = dc / 1461; - int db = dc % 1461; - int a = (db / 365 + 1) * 3 / 4; - int da = db - a * 365; - - // integer number of full years elapsed since March 1, 4801 BC - int y = g * 400 + c * 100 + b * 4 + a; - // integer number of full months elapsed since the last March 1 - int m = (da * 5 + 308) / 153 - 2; - // number of days elapsed since day 1 of the month - int d = da - (m + 4) * 153 / 5 + 122; - int year = y - 4800 + (m + 2) / 12; - int month = (m + 2) % 12 + 1; - int day = d + 1; - int4(buf, year); - buf.append('-'); - int2(buf, month); - buf.append('-'); - int2(buf, day); - } - - public static String formatIntervalYearMonth(int v) { - final StringBuilder buf = new StringBuilder(); - if (v >= 0) { - buf.append('+'); - } else { - buf.append('-'); - v = -v; - } - final int y = v / 12; - final int m = v % 12; - buf.append(y); - buf.append('-'); - number(buf, m, 2); - return buf.toString(); - } - - public static StringBuilder number(StringBuilder buf, int v, int n) { - for (int k = digitCount(v); k < n; k++) { - buf.append('0'); - } - return buf.append(v); - } - - private static int digitCount(int v) { - for (int n = 1; ; n++) { - v /= 10; - if (v == 0) { - return n; - } - } - } - - private static long roundUp(long dividend, long divisor) { - long remainder = dividend % divisor; - dividend -= remainder; - if (remainder * 2 > divisor) { - dividend += divisor; - } - return dividend; - } - - private static void fraction(StringBuilder buf, int scale, long ms) { - if (scale > 0) { - buf.append('.'); - long v1 = scale == 3 ? ms : scale == 2 ? ms / 10 : scale == 1 ? 
ms / 100 : 0; - number(buf, (int) v1, scale); - } - } - - private static long powerX(long a, long b) { - long x = 1; - while (b > 0) { - x *= a; - --b; - } - return x; - } - - public static String formatIntervalDayTime(long v) { - final int scale = 3; - final StringBuilder buf = new StringBuilder(); - if (v >= 0) { - buf.append('+'); - } else { - buf.append('-'); - v = -v; - } - final long ms; - final long s; - final long m; - final long h; - final long d; - v = roundUp(v, powerX(10, 3 - scale)); - ms = v % 1000; - v /= 1000; - s = v % 60; - v /= 60; - m = v % 60; - v /= 60; - h = v % 24; - v /= 24; - d = v; - buf.append((int) d); - buf.append(' '); - number(buf, (int) h, 2); - buf.append(':'); - number(buf, (int) m, 2); - buf.append(':'); - number(buf, (int) s, 2); - fraction(buf, scale, ms); - return buf.toString(); - } - - private static long internalParseTimestampMillis(String dateStr, String format, TimeZone tz) { - SimpleDateFormat formatter = FORMATTER_CACHE.get(format); - formatter.setTimeZone(tz); - try { - Date date = formatter.parse(dateStr); - return date.getTime(); - } catch (ParseException e) { - LOG.error( - String.format( - "Exception when parsing datetime string '%s' in format '%s'", dateStr, format), - e); - return Long.MIN_VALUE; - } - } - - // -------------------------------------------------------------------------------------------- - // EXTRACT - // -------------------------------------------------------------------------------------------- - - private static final TimestampType REUSE_TIMESTAMP_TYPE = new TimestampType(9); - - public static long extractFromDate(TimeUnitRange range, long date) { - return extractFromDate(range, (int) date); - } - - public static long extractFromDate(TimeUnitRange range, int date) { - switch (range) { - case EPOCH: - return date * 86400L; - default: - return julianExtract(range, date + 2440588); - } - } - - private static int julianExtract(TimeUnitRange range, int julian) { - int j = julian + 32044; - int g = j 
/ 146097; - int dg = j % 146097; - int c = (dg / 36524 + 1) * 3 / 4; - int dc = dg - c * 36524; - int b = dc / 1461; - int db = dc % 1461; - int a = (db / 365 + 1) * 3 / 4; - int da = db - a * 365; - int y = g * 400 + c * 100 + b * 4 + a; - int m = (da * 5 + 308) / 153 - 2; - int d = da - (m + 4) * 153 / 5 + 122; - int year = y - 4800 + (m + 2) / 12; - int month = (m + 2) % 12 + 1; - int day = d + 1; - switch (range) { - case YEAR: - return year; - case YEAR_TO_MONTH: - case DAY_TO_SECOND: - case DAY_TO_MINUTE: - case DAY_TO_HOUR: - case HOUR: - case HOUR_TO_MINUTE: - case HOUR_TO_SECOND: - case MINUTE_TO_SECOND: - case MINUTE: - case SECOND: - case EPOCH: - default: - throw new AssertionError(range); - case MONTH: - return month; - case DAY: - return day; - case ISOYEAR: - int weekNumber = getIso8601WeekNumber(julian, year, month, day); - if (weekNumber == 1 && month == 12) { - return year + 1; - } else { - if (month == 1 && weekNumber > 50) { - return year - 1; - } - - return year; - } - case QUARTER: - return (month + 2) / 3; - case DOW: - return (int) floorMod(julian + 1, 7L) + 1; - case ISODOW: - return (int) floorMod(julian, 7L) + 1; - case WEEK: - return getIso8601WeekNumber(julian, year, month, day); - case DOY: - long janFirst = ymdToJulian(year, 1, 1); - return (int) ((long) julian - janFirst) + 1; - case DECADE: - return year / 10; - case CENTURY: - return year > 0 ? (year + 99) / 100 : (year - 99) / 100; - case MILLENNIUM: - return year > 0 ? 
(year + 999) / 1000 : (year - 999) / 1000; - } - } - - private static long firstMondayOfFirstWeek(int year) { - long janFirst = ymdToJulian(year, 1, 1); - long janFirstDow = floorMod(janFirst + 1L, 7L); - return janFirst + (11L - janFirstDow) % 7L - 3L; - } - - private static int getIso8601WeekNumber(int julian, int year, int month, int day) { - long fmofw = firstMondayOfFirstWeek(year); - if (month == 12 && day > 28) { - return 31 - day + 4 > 7 - ((int) floorMod(julian, 7L) + 1) - && 31 - day + (int) (floorMod(julian, 7L) + 1L) >= 4 - ? (int) ((long) julian - fmofw) / 7 + 1 - : 1; - } else if (month == 1 && day < 5) { - return 4 - day <= 7 - ((int) floorMod(julian, 7L) + 1) - && day - (int) (floorMod(julian, 7L) + 1L) >= -3 - ? 1 - : (int) ((long) julian - firstMondayOfFirstWeek(year - 1)) / 7 + 1; - } else { - return (int) ((long) julian - fmofw) / 7 + 1; - } - } - - private static long floorDiv(long x, long y) { - long r = x / y; - if ((x ^ y) < 0L && r * y != x) { - --r; - } - - return r; - } - - private static long floorMod(long x, long y) { - return x - floorDiv(x, y) * y; - } - - private static long divide(long res, BigDecimal value) { - if (value.equals(BigDecimal.ONE)) { - return res; - } else if (value.compareTo(BigDecimal.ONE) < 0 && value.signum() == 1) { - BigDecimal reciprocal = BigDecimal.ONE.divide(value, RoundingMode.UNNECESSARY); - return reciprocal.multiply(BigDecimal.valueOf(res)).longValue(); - } else { - return res / value.longValue(); - } - } - - private static long mod(long res, BigDecimal value) { - if (value.equals(BigDecimal.ONE)) { - return res; - } else { - return res % value.longValue(); - } - } - - private static BigDecimal getFactor(TimeUnit unit) { - switch (unit) { - case DAY: - return BigDecimal.ONE; - case HOUR: - return TimeUnit.DAY.multiplier; - case MINUTE: - return TimeUnit.HOUR.multiplier; - case SECOND: - return TimeUnit.MINUTE.multiplier; - case MILLISECOND: - case MICROSECOND: - case NANOSECOND: - return 
TimeUnit.SECOND.multiplier; - case YEAR: - return BigDecimal.ONE; - case MONTH: - return TimeUnit.YEAR.multiplier; - case QUARTER: - return TimeUnit.YEAR.multiplier; - case DECADE: - case CENTURY: - case MILLENNIUM: - return BigDecimal.ONE; - default: - throw new IllegalArgumentException("Invalid start unit."); - } - } - - // -------------------------------------------------------------------------------------------- - // Floor/Ceil/Convert tz - // -------------------------------------------------------------------------------------------- - - public static long timestampFloor(TimeUnitRange range, long ts, TimeZone tz) { - // assume that we are at UTC timezone, just for algorithm performance - long offset = tz.getOffset(ts); - long utcTs = ts + offset; - - switch (range) { - case HOUR: - return floor(utcTs, MILLIS_PER_HOUR) - offset; - case DAY: - return floor(utcTs, MILLIS_PER_DAY) - offset; - case MILLENNIUM: - case CENTURY: - case DECADE: - case MONTH: - case YEAR: - case QUARTER: - case WEEK: - int days = (int) (utcTs / MILLIS_PER_DAY + EPOCH_JULIAN); - return julianDateFloor(range, days, true) * MILLIS_PER_DAY - offset; - default: - // for MINUTE and SECONDS etc..., - // it is more effective to use arithmetic Method - throw new AssertionError(range); - } - } - - /** - * Keep the algorithm consistent with Calcite DateTimeUtils.julianDateFloor, but here we take time - * zone into account. 
- */ - public static long timestampCeil(TimeUnitRange range, long ts, TimeZone tz) { - // assume that we are at UTC timezone, just for algorithm performance - long offset = tz.getOffset(ts); - long utcTs = ts + offset; - - switch (range) { - case HOUR: - return ceil(utcTs, MILLIS_PER_HOUR) - offset; - case DAY: - return ceil(utcTs, MILLIS_PER_DAY) - offset; - case MILLENNIUM: - case CENTURY: - case DECADE: - case MONTH: - case YEAR: - case QUARTER: - case WEEK: - int days = (int) (utcTs / MILLIS_PER_DAY + EPOCH_JULIAN); - return julianDateFloor(range, days, false) * MILLIS_PER_DAY - offset; - default: - // for MINUTE and SECONDS etc..., - // it is more effective to use arithmetic Method - throw new AssertionError(range); - } - } - - private static long floor(long a, long b) { - long r = a % b; - if (r < 0) { - return a - r - b; - } else { - return a - r; - } - } - - private static long ceil(long a, long b) { - long r = a % b; - if (r > 0) { - return a - r + b; - } else { - return a - r; - } - } - - private static long julianDateFloor(TimeUnitRange range, int julian, boolean floor) { - // Algorithm the book "Astronomical Algorithms" by Jean Meeus, 1998 - int b = 0; - int c = 0; - if (julian > 2299160) { - int a = julian + 32044; - b = (4 * a + 3) / 146097; - c = a - b * 146097 / 4; - } else { - b = 0; - c = julian + 32082; - } - int d = (4 * c + 3) / 1461; - int e = c - (1461 * d) / 4; - int m = (5 * e + 2) / 153; - int day = e - (153 * m + 2) / 5 + 1; - int month = m + 3 - 12 * (m / 10); - int quarter = (month + 2) / 3; - int year = b * 100 + d - 4800 + (m / 10); - switch (range) { - case MILLENNIUM: - return floor - ? ymdToUnixDate(1000 * ((year + 999) / 1000) - 999, 1, 1) - : ymdToUnixDate(1000 * ((year + 999) / 1000) + 1, 1, 1); - case CENTURY: - return floor - ? ymdToUnixDate(100 * ((year + 99) / 100) - 99, 1, 1) - : ymdToUnixDate(100 * ((year + 99) / 100) + 1, 1, 1); - case DECADE: - return floor - ? 
ymdToUnixDate(10 * (year / 10), 1, 1) - : ymdToUnixDate(10 * (1 + year / 10), 1, 1); - case YEAR: - if (!floor && (month > 1 || day > 1)) { - year += 1; - } - return ymdToUnixDate(year, 1, 1); - case MONTH: - if (!floor && day > 1) { - month += 1; - } - return ymdToUnixDate(year, month, 1); - case QUARTER: - if (!floor && (month > 1 || day > 1)) { - quarter += 1; - } - return ymdToUnixDate(year, quarter * 3 - 2, 1); - case WEEK: - int dow = (int) floorMod(julian + 1, 7); // sun=0, sat=6 - int offset = dow; - if (!floor && offset > 0) { - offset -= 7; - } - return ymdToUnixDate(year, month, day) - offset; - case DAY: - int res = ymdToUnixDate(year, month, day); - return floor ? res : res + 1; - default: - throw new AssertionError(range); - } - } - - /** - * Convert datetime string from a time zone to another time zone. - * - * @param dateStr the date time string - * @param tzFrom the original time zone - * @param tzTo the target time zone - */ - public static String convertTz(String dateStr, String tzFrom, String tzTo) { - try { - return formatTimestampTz(parseTimestampTz(dateStr, tzFrom), tzTo); - } catch (ParseException e) { - return null; - } - } - - private static String formatTimestampTz(long ts, String tzStr) { - TimeZone tz = TIMEZONE_CACHE.get(tzStr); - return formatTimestampMillis(ts, DateTimeUtils.TIMESTAMP_FORMAT_STRING, tz); - } - - // -------------------------------------------------------------------------------------------- - // TIMESTAMP to DATE/TIME utils - // -------------------------------------------------------------------------------------------- - - /** - * Get date from a timestamp. - * - * @param ts the timestamp in milliseconds. - * @return the date in days. - */ - public static int timestampMillisToDate(long ts) { - int days = (int) (ts / MILLIS_PER_DAY); - if (days < 0) { - days = days - 1; - } - return days; - } - - /** - * Get time from a timestamp. - * - * @param ts the timestamp in milliseconds. - * @return the time in milliseconds. 
- */ - public static int timestampMillisToTime(long ts) { - return (int) (ts % MILLIS_PER_DAY); - } - - // -------------------------------------------------------------------------------------------- - // UNIX TIME - // -------------------------------------------------------------------------------------------- - - public static long fromTimestamp(long ts) { - return ts; - } - - /** - * Convert unix timestamp (seconds since '1970-01-01 00:00:00' UTC) to datetime string in the - * "yyyy-MM-dd HH:mm:ss" format. - */ - public static String formatUnixTimestamp(long unixtime, TimeZone tz) { - return formatUnixTimestamp(unixtime, TIMESTAMP_FORMAT_STRING, tz); - } - - /** - * Convert unix timestamp (seconds since '1970-01-01 00:00:00' UTC) to datetime string in the - * given format. - */ - public static String formatUnixTimestamp(long unixtime, String format, TimeZone tz) { - SimpleDateFormat formatter = FORMATTER_CACHE.get(format); - formatter.setTimeZone(tz); - Date date = new Date(unixtime * 1000); - try { - return formatter.format(date); - } catch (Exception e) { - LOG.error("Exception when formatting.", e); - return null; - } - } - - public static long toTimestampMillis(LocalDateTime dateTime) { - return unixTimestamp( - dateTime.getYear(), - dateTime.getMonthValue(), - dateTime.getDayOfMonth(), - dateTime.getHour(), - dateTime.getMinute(), - dateTime.getSecond(), - dateTime.getNano() / 1000_000); - } - - private static long unixTimestamp( - int year, int month, int day, int hour, int minute, int second, int mills) { - final int date = ymdToUnixDate(year, month, day); - return (long) date * MILLIS_PER_DAY - + (long) hour * MILLIS_PER_HOUR - + (long) minute * MILLIS_PER_MINUTE - + (long) second * MILLIS_PER_SECOND - + mills; - } - - /** Returns a Unix timestamp in seconds since '1970-01-01 00:00:00' UTC as an unsigned integer. 
*/ - public static long unixTimestamp() { - return System.currentTimeMillis() / 1000; - } - - /** Returns the value of the timestamp to seconds since '1970-01-01 00:00:00' UTC. */ - public static long unixTimestamp(long ts) { - return ts / 1000; - } - - /** - * Returns the value of the argument as an unsigned integer in seconds since '1970-01-01 00:00:00' - * UTC. - */ - public static long unixTimestamp(String dateStr, TimeZone tz) { - return unixTimestamp(dateStr, TIMESTAMP_FORMAT_STRING, tz); - } - - /** - * Returns the value of the argument as an unsigned integer in seconds since '1970-01-01 00:00:00' - * UTC. - */ - public static long unixTimestamp(String dateStr, String format, TimeZone tz) { - long ts = internalParseTimestampMillis(dateStr, format, tz); - if (ts == Long.MIN_VALUE) { - return Long.MIN_VALUE; - } else { - // return the seconds - return ts / 1000; - } - } - - // -------------------------------------------------------------------------------------------- - // TIMESTAMP to TIMESTAMP_LTZ conversions - // -------------------------------------------------------------------------------------------- - - public static TimestampData timestampToTimestampWithLocalZone(TimestampData ts, TimeZone tz) { - return TimestampData.fromInstant(ts.toLocalDateTime().atZone(tz.toZoneId()).toInstant()); - } - - public static TimestampData timestampWithLocalZoneToTimestamp(TimestampData ts, TimeZone tz) { - return TimestampData.fromLocalDateTime(LocalDateTime.ofInstant(ts.toInstant(), tz.toZoneId())); - } - - public static int timestampWithLocalZoneToDate(TimestampData ts, TimeZone tz) { - return toInternal( - LocalDateTime.ofInstant(Instant.ofEpochMilli(ts.getMillisecond()), tz.toZoneId()) - .toLocalDate()); - } - - public static int timestampWithLocalZoneToTime(TimestampData ts, TimeZone tz) { - return toInternal( - LocalDateTime.ofInstant(Instant.ofEpochMilli(ts.getMillisecond()), tz.toZoneId()) - .toLocalTime()); - } - - public static TimestampData 
dateToTimestampWithLocalZone(int date, TimeZone tz) { - return TimestampData.fromInstant( - LocalDateTime.of(toLocalDate(date), LocalTime.MIDNIGHT).atZone(tz.toZoneId()).toInstant()); - } - - public static TimestampData timeToTimestampWithLocalZone(int time, TimeZone tz) { - return TimestampData.fromInstant(toLocalDateTime(time).atZone(tz.toZoneId()).toInstant()); - } - - private static boolean isInteger(String s) { - boolean isInt = s.length() > 0; - for (int i = 0; i < s.length(); i++) { - if (s.charAt(i) < '0' || s.charAt(i) > '9') { - isInt = false; - break; - } - } - return isInt; - } - - private static boolean isLeapYear(int s) { - return s % 400 == 0 || (s % 4 == 0 && s % 100 != 0); - } - - private static boolean isIllegalDate(int y, int m, int d) { - int[] monthOf31Days = new int[] {1, 3, 5, 7, 8, 10, 12}; - if (y < 0 || y > 9999 || m < 1 || m > 12 || d < 1 || d > 31) { - return false; - } - if (m == 2 && d > 28) { - if (!(isLeapYear(y) && d == 29)) { - return false; - } - } - if (d == 31) { - for (int i : monthOf31Days) { - if (i == m) { - return true; - } - } - return false; - } - return true; - } - - private static String pad(int length, long v) { - StringBuilder s = new StringBuilder(Long.toString(v)); - while (s.length() < length) { - s.insert(0, "0"); - } - return s.toString(); - } - - /** Appends hour:minute:second to a buffer; assumes they are valid. */ - private static StringBuilder hms(StringBuilder b, int h, int m, int s) { - int2(b, h); - b.append(':'); - int2(b, m); - b.append(':'); - int2(b, s); - return b; - } - - /** Appends year-month-day and hour:minute:second to a buffer; assumes they are valid. */ - private static StringBuilder ymdhms( - StringBuilder b, int year, int month, int day, int h, int m, int s) { - ymd(b, year, month, day); - b.append(' '); - hms(b, h, m, s); - return b; - } - - /** Appends year-month-day to a buffer; assumes they are valid. 
*/ - private static StringBuilder ymd(StringBuilder b, int year, int month, int day) { - int4(b, year); - b.append('-'); - int2(b, month); - b.append('-'); - int2(b, day); - return b; - } - - private static void int4(StringBuilder buf, int i) { - buf.append((char) ('0' + (i / 1000) % 10)); - buf.append((char) ('0' + (i / 100) % 10)); - buf.append((char) ('0' + (i / 10) % 10)); - buf.append((char) ('0' + i % 10)); - } - - public static TimestampData truncate(TimestampData ts, int precision) { - String fraction = Integer.toString(ts.toLocalDateTime().getNano()); - if (fraction.length() <= precision) { - return ts; - } else { - // need to truncate - if (precision <= 3) { - return TimestampData.fromEpochMillis(zeroLastDigits(ts.getMillisecond(), 3 - precision)); - } else { - return TimestampData.fromEpochMillis( - ts.getMillisecond(), (int) zeroLastDigits(ts.getNanoOfMillisecond(), 9 - precision)); - } - } - } - - private static long zeroLastDigits(long l, int n) { - long tenToTheN = (long) Math.pow(10, n); - return (l / tenToTheN) * tenToTheN; - } - - public static long unixDateCeil(TimeUnitRange range, long date) { - return julianDateFloor(range, (int) date + 2440588, false); - } - - public static long unixDateFloor(TimeUnitRange range, long date) { - return julianDateFloor(range, (int) date + EPOCH_JULIAN, true); - } - - public static long unixTimestampFloor(TimeUnitRange range, long timestamp) { - int date = (int) (timestamp / MILLIS_PER_DAY); - final long f = julianDateFloor(range, date + EPOCH_JULIAN, true); - return f * MILLIS_PER_DAY; - } - - public static long unixTimestampCeil(TimeUnitRange range, long timestamp) { - int date = (int) (timestamp / MILLIS_PER_DAY); - final long f = julianDateFloor(range, date + EPOCH_JULIAN, false); - return f * MILLIS_PER_DAY; - } - - // -------------------------------------------------------------------------------------------- - // ADD/REMOVE months - // 
-------------------------------------------------------------------------------------------- - - /** - * Adds a given number of months to a timestamp, represented as the number of milliseconds since - * the epoch. - */ - public static long addMonths(long timestamp, int m) { - final long millis = DateTimeUtils.floorMod(timestamp, DateTimeUtils.MILLIS_PER_DAY); - timestamp -= millis; - final long x = addMonths((int) (timestamp / DateTimeUtils.MILLIS_PER_DAY), m); - return x * DateTimeUtils.MILLIS_PER_DAY + millis; - } - - /** Adds a given number of months to a date, represented as the number of days since the epoch. */ - public static int addMonths(int date, int m) { - int y0 = (int) extractFromDate(TimeUnitRange.YEAR, date); - int m0 = (int) extractFromDate(TimeUnitRange.MONTH, date); - int d0 = (int) extractFromDate(TimeUnitRange.DAY, date); - m0 += m; - int deltaYear = (int) DateTimeUtils.floorDiv(m0, 12); - y0 += deltaYear; - m0 = (int) DateTimeUtils.floorMod(m0, 12); - if (m0 == 0) { - y0 -= 1; - m0 += 12; - } - - int last = lastDay(y0, m0); - if (d0 > last) { - d0 = last; - } - return ymdToUnixDate(y0, m0, d0); - } - - private static int lastDay(int y, int m) { - switch (m) { - case 2: - return y % 4 == 0 && (y % 100 != 0 || y % 400 == 0) ? 29 : 28; - case 4: - case 6: - case 9: - case 11: - return 30; - default: - return 31; - } - } - - /** - * Finds the number of months between two dates, each represented as the number of days since the - * epoch. - */ - public static int subtractMonths(int date0, int date1) { - if (date0 < date1) { - return -subtractMonths(date1, date0); - } - // Start with an estimate. - // Since no month has more than 31 days, the estimate is <= the true value. 
- int m = (date0 - date1) / 31; - while (true) { - int date2 = addMonths(date1, m); - if (date2 >= date0) { - return m; - } - int date3 = addMonths(date1, m + 1); - if (date3 > date0) { - return m; - } - ++m; - } - } - - public static int subtractMonths(long t0, long t1) { - final long millis0 = DateTimeUtils.floorMod(t0, DateTimeUtils.MILLIS_PER_DAY); - final int d0 = (int) DateTimeUtils.floorDiv(t0 - millis0, DateTimeUtils.MILLIS_PER_DAY); - final long millis1 = DateTimeUtils.floorMod(t1, DateTimeUtils.MILLIS_PER_DAY); - final int d1 = (int) DateTimeUtils.floorDiv(t1 - millis1, DateTimeUtils.MILLIS_PER_DAY); - int x = subtractMonths(d0, d1); - final long d2 = addMonths(d1, x); - if (d2 == d0 && millis0 < millis1) { - --x; - } - return x; - } - - // -------------------------------------------------------------------------------------------- - // TimeUnit and TimeUnitRange enums - // -------------------------------------------------------------------------------------------- - - /** - * Enumeration of time units used to construct an interval. - * - *

Only {@link #YEAR}, {@link #MONTH}, {@link #DAY}, {@link #HOUR}, {@link #MINUTE}, {@link - * #SECOND} can be the unit of a SQL interval. - * - *

The others ({@link #QUARTER}, {@link #WEEK}, {@link #MILLISECOND}, {@link #DOW}, {@link - * #DOY}, {@link #EPOCH}, {@link #DECADE}, {@link #CENTURY}, {@link #MILLENNIUM}, {@link - * #MICROSECOND}, {@link #NANOSECOND}, {@link #ISODOW} and {@link #ISOYEAR}) are convenient to use - * internally, when converting to and from UNIX timestamps. And also may be arguments to the - * {@code EXTRACT}, {@code TIMESTAMPADD} and {@code TIMESTAMPDIFF} functions. - */ - public enum TimeUnit { - YEAR(true, ' ', BigDecimal.valueOf(12) /* months */, null), - MONTH(true, '-', BigDecimal.ONE /* months */, BigDecimal.valueOf(12)), - DAY(false, '-', BigDecimal.valueOf(MILLIS_PER_DAY), null), - HOUR(false, ' ', BigDecimal.valueOf(MILLIS_PER_HOUR), BigDecimal.valueOf(24)), - MINUTE(false, ':', BigDecimal.valueOf(MILLIS_PER_MINUTE), BigDecimal.valueOf(60)), - SECOND(false, ':', BigDecimal.valueOf(MILLIS_PER_SECOND), BigDecimal.valueOf(60)), - - QUARTER(true, '*', BigDecimal.valueOf(3) /* months */, BigDecimal.valueOf(4)), - ISOYEAR(true, ' ', BigDecimal.valueOf(12) /* months */, null), - WEEK(false, '*', BigDecimal.valueOf(MILLIS_PER_DAY * 7), BigDecimal.valueOf(53)), - MILLISECOND(false, '.', BigDecimal.ONE, BigDecimal.valueOf(1000)), - MICROSECOND(false, '.', BigDecimal.ONE.scaleByPowerOfTen(-3), BigDecimal.valueOf(1000_000)), - NANOSECOND(false, '.', BigDecimal.ONE.scaleByPowerOfTen(-6), BigDecimal.valueOf(1000_000_000)), - DOW(false, '-', null, null), - ISODOW(false, '-', null, null), - DOY(false, '-', null, null), - EPOCH(false, '*', null, null), - DECADE(true, '*', BigDecimal.valueOf(120) /* months */, null), - CENTURY(true, '*', BigDecimal.valueOf(1200) /* months */, null), - MILLENNIUM(true, '*', BigDecimal.valueOf(12000) /* months */, null); - - public final boolean yearMonth; - public final char separator; - public final BigDecimal multiplier; - private final BigDecimal limit; - - private static final TimeUnit[] CACHED_VALUES = values(); - - TimeUnit(boolean yearMonth, char 
separator, BigDecimal multiplier, BigDecimal limit) { - this.yearMonth = yearMonth; - this.separator = separator; - this.multiplier = multiplier; - this.limit = limit; - } - - /** - * Returns the TimeUnit associated with an ordinal. The value returned is null if the ordinal is - * not a member of the TimeUnit enumeration. - */ - public static TimeUnit getValue(int ordinal) { - return ordinal < 0 || ordinal >= CACHED_VALUES.length ? null : CACHED_VALUES[ordinal]; - } - - /** - * Returns whether a given value is valid for a field of this time unit. - * - * @param field Field value - * @return Whether value - */ - public boolean isValidValue(BigDecimal field) { - return field.compareTo(BigDecimal.ZERO) >= 0 && (limit == null || field.compareTo(limit) < 0); - } - } - - /** - * A range of time units. The first is more significant than the other (e.g. year-to-day) or the - * same as the other (e.g. month). - */ - public enum TimeUnitRange { - YEAR(TimeUnit.YEAR, null), - YEAR_TO_MONTH(TimeUnit.YEAR, TimeUnit.MONTH), - MONTH(TimeUnit.MONTH, null), - DAY(TimeUnit.DAY, null), - DAY_TO_HOUR(TimeUnit.DAY, TimeUnit.HOUR), - DAY_TO_MINUTE(TimeUnit.DAY, TimeUnit.MINUTE), - DAY_TO_SECOND(TimeUnit.DAY, TimeUnit.SECOND), - HOUR(TimeUnit.HOUR, null), - HOUR_TO_MINUTE(TimeUnit.HOUR, TimeUnit.MINUTE), - HOUR_TO_SECOND(TimeUnit.HOUR, TimeUnit.SECOND), - MINUTE(TimeUnit.MINUTE, null), - MINUTE_TO_SECOND(TimeUnit.MINUTE, TimeUnit.SECOND), - SECOND(TimeUnit.SECOND, null), - - // non-standard time units cannot participate in ranges - ISOYEAR(TimeUnit.ISOYEAR, null), - QUARTER(TimeUnit.QUARTER, null), - WEEK(TimeUnit.WEEK, null), - MILLISECOND(TimeUnit.MILLISECOND, null), - MICROSECOND(TimeUnit.MICROSECOND, null), - NANOSECOND(TimeUnit.NANOSECOND, null), - DOW(TimeUnit.DOW, null), - ISODOW(TimeUnit.ISODOW, null), - DOY(TimeUnit.DOY, null), - EPOCH(TimeUnit.EPOCH, null), - DECADE(TimeUnit.DECADE, null), - CENTURY(TimeUnit.CENTURY, null), - MILLENNIUM(TimeUnit.MILLENNIUM, null); - - public 
final TimeUnit startUnit; - public final TimeUnit endUnit; - - private static final Map, TimeUnitRange> MAP = createMap(); - - /** - * Creates a TimeUnitRange. - * - * @param startUnit Start time unit - * @param endUnit End time unit - */ - TimeUnitRange(TimeUnit startUnit, TimeUnit endUnit) { - assert startUnit != null; - this.startUnit = startUnit; - this.endUnit = endUnit; - } - - /** - * Returns a {@code TimeUnitRange} with a given start and end unit. - * - * @param startUnit Start unit - * @param endUnit End unit - * @return Time unit range, or null if not valid - */ - public static TimeUnitRange of(TimeUnit startUnit, TimeUnit endUnit) { - return MAP.get(new Pair<>(startUnit, endUnit)); - } - - private static Map, TimeUnitRange> createMap() { - Map, TimeUnitRange> map = new HashMap<>(); - for (TimeUnitRange value : values()) { - map.put(new Pair<>(value.startUnit, value.endUnit), value); - } - return Collections.unmodifiableMap(map); - } - - /** Whether this is in the YEAR-TO-MONTH family of intervals. */ - public boolean monthly() { - return ordinal() <= MONTH.ordinal(); - } - - /** - * Immutable pair of values of the same type. - * - * @param the element type - */ - private static class Pair { - final E left; - final E right; - - private Pair(E left, E right) { - this.left = left; - this.right = right; - } - - @Override - public int hashCode() { - int k = (left == null) ? 0 : left.hashCode(); - int k1 = (right == null) ? 
0 : right.hashCode(); - return ((k << 4) | k) ^ k1; - } - - @Override - public boolean equals(Object obj) { - return obj == this - || obj instanceof Pair - && Objects.equals(left, ((Pair) obj).left) - && Objects.equals(right, ((Pair) obj).right); - } - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/FilterUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/FilterUtil.java deleted file mode 100644 index 9fc777d51d..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/FilterUtil.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.util; - -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkFilters; - -import java.util.List; -import java.util.Optional; - -public class FilterUtil { - - public static IcebergAndFlinkFilters convertFlinkExpressToIceberg( - List flinkFilters) { - List acceptedFilters = Lists.newArrayList(); - List expressions = Lists.newArrayList(); - - for (ResolvedExpression resolvedExpression : flinkFilters) { - Optional icebergExpression = FlinkFilters.convert(resolvedExpression); - if (icebergExpression.isPresent()) { - expressions.add(icebergExpression.get()); - acceptedFilters.add(resolvedExpression); - } - } - return IcebergAndFlinkFilters.of(expressions, acceptedFilters); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/FlinkClassReflectionUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/FlinkClassReflectionUtil.java deleted file mode 100644 index 6b181b510d..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/FlinkClassReflectionUtil.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.util; - -import org.apache.flink.api.connector.source.ReaderOutput; -import org.apache.flink.streaming.api.operators.source.ProgressiveTimestampsAndWatermarks; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; - -import java.lang.reflect.InvocationTargetException; -import java.lang.reflect.Method; - -/** A util class to handle the reflection operation of Flink class. */ -public class FlinkClassReflectionUtil { - - public static final Logger LOG = LoggerFactory.getLogger(FlinkClassReflectionUtil.class); - - public static Object getSplitLocalOutput(ReaderOutput readerOutput) { - if (readerOutput == null) { - return null; - } - try { - return ReflectionUtil.getField( - (Class) ProgressiveTimestampsAndWatermarks.class.getDeclaredClasses()[2], - readerOutput, - "splitLocalOutputs"); - } catch (Exception e) { - LOG.warn("extract internal watermark error", e); - } - return null; - } - - public static void emitPeriodWatermark(@Nullable Object splitLocalOutput) { - if (splitLocalOutput == null) { - return; - } - try { - Method method = - ProgressiveTimestampsAndWatermarks.class.getDeclaredClasses()[1].getDeclaredMethod( - "emitPeriodicWatermark"); - method.setAccessible(true); - method.invoke(splitLocalOutput); - } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) { - LOG.warn("no method found", e); - } - } -} diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/IcebergAndFlinkFilters.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/IcebergAndFlinkFilters.java deleted file mode 100644 index bfba02d779..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/IcebergAndFlinkFilters.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.util; - -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.iceberg.expressions.Expression; - -import java.util.List; - -public class IcebergAndFlinkFilters { - - List expressions; - List acceptedFilters; - - private IcebergAndFlinkFilters( - List expressions, List acceptedFilters) { - this.expressions = expressions; - this.acceptedFilters = acceptedFilters; - } - - public static IcebergAndFlinkFilters of( - List expressions, List acceptedFilters) { - return new IcebergAndFlinkFilters(expressions, acceptedFilters); - } - - public List expressions() { - return expressions; - } - - public List acceptedFilters() { - return acceptedFilters; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/IcebergClassUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/IcebergClassUtil.java deleted file mode 100644 index 8dcc3eb1bc..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/IcebergClassUtil.java +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.util; - -import org.apache.amoro.flink.interceptor.ProxyFactory; -import org.apache.amoro.io.AuthenticatedFileIO; -import org.apache.flink.api.common.operators.MailboxExecutor; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.AbstractUdfStreamOperator; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; -import org.apache.flink.streaming.runtime.tasks.ProcessingTimeService; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.sink.TaskWriterFactory; -import org.apache.iceberg.flink.source.FlinkInputFormat; -import org.apache.iceberg.flink.source.ScanContext; -import org.apache.iceberg.flink.source.StreamingReaderOperator; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.util.ThreadPools; - -import java.lang.reflect.Constructor; -import java.lang.reflect.Field; -import java.lang.reflect.InvocationTargetException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** An util generates Apache Iceberg writer and committer operator w */ -public class IcebergClassUtil { - private static final String ICEBERG_SCAN_CONTEXT_CLASS = - "org.apache.iceberg.flink.source.ScanContext"; - private static final String 
ICEBERG_PARTITION_SELECTOR_CLASS = - "org.apache.iceberg.flink.sink.PartitionKeySelector"; - private static final String ICEBERG_FILE_COMMITTER_CLASS = - "org.apache.iceberg.flink.sink.IcebergFilesCommitter"; - private static final String ICEBERG_FILE_WRITER_CLASS = - "org.apache.iceberg.flink.sink.IcebergStreamWriter"; - - public static KeySelector newPartitionKeySelector( - PartitionSpec spec, Schema schema, RowType flinkSchema) { - try { - Class clazz = forName(ICEBERG_PARTITION_SELECTOR_CLASS); - Constructor c = clazz.getConstructor(PartitionSpec.class, Schema.class, RowType.class); - c.setAccessible(true); - return (KeySelector) c.newInstance(spec, schema, flinkSchema); - } catch (NoSuchMethodException - | IllegalAccessException - | InvocationTargetException - | InstantiationException e) { - throw new RuntimeException(e); - } - } - - public static OneInputStreamOperator newIcebergFilesCommitter( - TableLoader tableLoader, boolean replacePartitions, String branch, PartitionSpec spec) { - try { - Class clazz = forName(ICEBERG_FILE_COMMITTER_CLASS); - Constructor c = - clazz.getDeclaredConstructor( - TableLoader.class, - boolean.class, - Map.class, - Integer.class, - String.class, - PartitionSpec.class); - c.setAccessible(true); - return (OneInputStreamOperator) - c.newInstance( - tableLoader, - replacePartitions, - new HashMap<>(), - ThreadPools.WORKER_THREAD_POOL_SIZE, - branch, - spec); - } catch (NoSuchMethodException - | IllegalAccessException - | InvocationTargetException - | InstantiationException e) { - throw new RuntimeException(e); - } - } - - public static OneInputStreamOperator newIcebergFilesCommitter( - TableLoader tableLoader, - boolean replacePartitions, - String branch, - PartitionSpec spec, - AuthenticatedFileIO authenticatedFileIO) { - OneInputStreamOperator obj = - newIcebergFilesCommitter(tableLoader, replacePartitions, branch, spec); - return (OneInputStreamOperator) ProxyUtil.getProxy(obj, authenticatedFileIO); - } - - public static 
ProxyFactory getIcebergStreamWriterProxyFactory( - String fullTableName, - TaskWriterFactory taskWriterFactory, - AuthenticatedFileIO authenticatedFileIO) { - Class clazz = forName(ICEBERG_FILE_WRITER_CLASS); - return (ProxyFactory) - ProxyUtil.getProxyFactory( - clazz, - authenticatedFileIO, - new Class[] {String.class, TaskWriterFactory.class}, - new Object[] {fullTableName, taskWriterFactory}); - } - - public static StreamingReaderOperator newStreamingReaderOperator( - FlinkInputFormat format, ProcessingTimeService timeService, MailboxExecutor mailboxExecutor) { - try { - Constructor c = - StreamingReaderOperator.class.getDeclaredConstructor( - FlinkInputFormat.class, ProcessingTimeService.class, MailboxExecutor.class); - c.setAccessible(true); - return c.newInstance(format, timeService, mailboxExecutor); - } catch (IllegalAccessException - | NoSuchMethodException - | InvocationTargetException - | InstantiationException e) { - throw new RuntimeException(e); - } - } - - public static FlinkInputFormat getInputFormat(OneInputStreamOperatorFactory operatorFactory) { - try { - Class[] classes = StreamingReaderOperator.class.getDeclaredClasses(); - Class clazz = null; - for (Class c : classes) { - if ("OperatorFactory".equals(c.getSimpleName())) { - clazz = c; - break; - } - } - Field field = clazz.getDeclaredField("format"); - field.setAccessible(true); - return (FlinkInputFormat) (field.get(operatorFactory)); - } catch (IllegalAccessException | NoSuchFieldException e) { - throw new RuntimeException(e); - } - } - - public static ProxyFactory getInputFormatProxyFactory( - OneInputStreamOperatorFactory operatorFactory, - AuthenticatedFileIO authenticatedFileIO, - Schema tableSchema) { - FlinkInputFormat inputFormat = getInputFormat(operatorFactory); - TableLoader tableLoader = - ReflectionUtil.getField(FlinkInputFormat.class, inputFormat, "tableLoader"); - FileIO io = ReflectionUtil.getField(FlinkInputFormat.class, inputFormat, "io"); - EncryptionManager encryption = - 
ReflectionUtil.getField(FlinkInputFormat.class, inputFormat, "encryption"); - Object context = ReflectionUtil.getField(FlinkInputFormat.class, inputFormat, "context"); - - return ProxyUtil.getProxyFactory( - FlinkInputFormat.class, - authenticatedFileIO, - new Class[] { - TableLoader.class, Schema.class, FileIO.class, EncryptionManager.class, ScanContext.class - }, - new Object[] {tableLoader, tableSchema, io, encryption, context}); - } - - private static Class forName(String className) { - try { - return Class.forName(className); - } catch (ClassNotFoundException e) { - throw new RuntimeException(e); - } - } - - public static SourceFunction getSourceFunction(AbstractUdfStreamOperator source) { - try { - Field field = AbstractUdfStreamOperator.class.getDeclaredField("userFunction"); - field.setAccessible(true); - return (SourceFunction) (field.get(source)); - } catch (IllegalAccessException | NoSuchFieldException e) { - throw new RuntimeException(e); - } - } - - public static void clean(StreamExecutionEnvironment env) { - try { - Field field = StreamExecutionEnvironment.class.getDeclaredField("transformations"); - field.setAccessible(true); - ((List) (field.get(env))).clear(); - } catch (IllegalAccessException | NoSuchFieldException e) { - throw new RuntimeException(e); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/LookupUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/LookupUtil.java deleted file mode 100644 index 677bfc3a8f..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/LookupUtil.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.util; - -import org.apache.amoro.flink.lookup.LookupOptions; -import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.flink.configuration.Configuration; - -public class LookupUtil { - - public static LookupOptions convertLookupOptions(Configuration config) { - return new LookupOptions.Builder() - .lruMaximumSize(config.get(MixedFormatValidator.LOOKUP_CACHE_MAX_ROWS)) - .writeRecordThreadNum(config.get(MixedFormatValidator.ROCKSDB_WRITING_THREADS)) - .ttlAfterWrite(config.get(MixedFormatValidator.LOOKUP_CACHE_TTL_AFTER_WRITE)) - .blockCacheCapacity(config.get(MixedFormatValidator.ROCKSDB_BLOCK_CACHE_CAPACITY)) - .blockCacheNumShardBits(config.get(MixedFormatValidator.ROCKSDB_BLOCK_CACHE_NUM_SHARD_BITS)) - .build(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/MixedFormatUtils.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/MixedFormatUtils.java deleted file mode 100644 index cbd94f2413..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/MixedFormatUtils.java +++ /dev/null @@ -1,276 +0,0 
@@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.util; - -import static org.apache.amoro.table.TableProperties.ENABLE_LOG_STORE; -import static org.apache.amoro.table.TableProperties.LOG_STORE_ADDRESS; -import static org.apache.amoro.table.TableProperties.LOG_STORE_DATA_VERSION; -import static org.apache.amoro.table.TableProperties.LOG_STORE_DATA_VERSION_DEFAULT; -import static org.apache.amoro.table.TableProperties.LOG_STORE_MESSAGE_TOPIC; -import static org.apache.amoro.table.TableProperties.LOG_STORE_STORAGE_TYPE_DEFAULT; -import static org.apache.amoro.table.TableProperties.LOG_STORE_STORAGE_TYPE_KAFKA; -import static org.apache.amoro.table.TableProperties.LOG_STORE_TYPE; -import static org.apache.kafka.clients.CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG; - -import org.apache.amoro.flink.metric.MetricsGenerator; -import org.apache.amoro.flink.shuffle.LogRecordV1; -import org.apache.amoro.flink.shuffle.ShuffleHelper; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.amoro.flink.write.AutomaticLogWriter; -import org.apache.amoro.flink.write.MixedFormatLogWriter; -import 
org.apache.amoro.flink.write.hidden.HiddenLogWriter; -import org.apache.amoro.flink.write.hidden.kafka.HiddenKafkaFactory; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.PrimaryKeySpec; -import org.apache.amoro.table.TableProperties; -import org.apache.amoro.utils.CompatiblePropertyUtil; -import org.apache.amoro.utils.IdGenerator; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.api.ValidationException; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.util.Preconditions; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.time.Duration; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.stream.Collectors; - -/** An util that loads mixed-format table, build mixed-format log writer and so on. 
*/ -public class MixedFormatUtils { - - public static final Logger LOG = LoggerFactory.getLogger(MixedFormatUtils.class); - - public static MixedTable loadMixedTable(MixedFormatTableLoader tableLoader) { - tableLoader.open(); - MixedTable table = tableLoader.loadMixedFormatTable(); - try { - tableLoader.close(); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - return table; - } - - public static List getPrimaryKeys(MixedTable table) { - if (table.isUnkeyedTable()) { - return Collections.emptyList(); - } - return table.asKeyedTable().primaryKeySpec().fields().stream() - .map(PrimaryKeySpec.PrimaryKeyField::fieldName) - .collect(Collectors.toList()); - } - - public static MetricsGenerator getMetricsGenerator( - boolean metricsEventLatency, - boolean metricsEnable, - MixedTable mixedTable, - RowType flinkSchemaRowType, - Schema writeSchema) { - MetricsGenerator metricsGenerator; - if (metricsEventLatency) { - String modifyTimeColumn = mixedTable.properties().get(TableProperties.TABLE_EVENT_TIME_FIELD); - metricsGenerator = - MetricsGenerator.newGenerator( - mixedTable.schema(), flinkSchemaRowType, modifyTimeColumn, metricsEnable); - } else { - metricsGenerator = MetricsGenerator.empty(metricsEnable); - } - return metricsGenerator; - } - - public static boolean mixedFormatWALWriterEnable( - Map properties, String emitMode) { - boolean streamEnable = - CompatiblePropertyUtil.propertyAsBoolean( - properties, ENABLE_LOG_STORE, TableProperties.ENABLE_LOG_STORE_DEFAULT); - - if (emitMode.contains(MixedFormatValidator.MIXED_FORMAT_EMIT_LOG)) { - if (!streamEnable) { - throw new ValidationException( - "emit to kafka was set, but no kafka config be found, please set kafka config first"); - } - return true; - } else if (emitMode.equals(MixedFormatValidator.MIXED_FORMAT_EMIT_AUTO)) { - LOG.info( - "mixed-format emit mode is auto, and the mixed-format table {} is {}", - ENABLE_LOG_STORE, - streamEnable); - return streamEnable; - } - - return false; - } - - 
/** - * only when {@link MixedFormatValidator#MIXED_FORMAT_EMIT_MODE} contains {@link - * MixedFormatValidator#MIXED_FORMAT_EMIT_FILE} and enable {@link - * TableProperties#ENABLE_LOG_STORE} create logWriter according to {@link - * TableProperties#LOG_STORE_DATA_VERSION} - * - * @param properties mixed-format table properties - * @param producerConfig - * @param topic - * @param tableSchema - * @param tableLoader mixed-format table loader - * @param watermarkWriteGap watermark gap that triggers automatic writing to log storage - * @return mixed-formatLogWriter - */ - public static MixedFormatLogWriter buildLogWriter( - Map properties, - @Nullable Properties producerConfig, - @Nullable String topic, - TableSchema tableSchema, - String emitMode, - ShuffleHelper helper, - MixedFormatTableLoader tableLoader, - Duration watermarkWriteGap) { - if (!mixedFormatWALWriterEnable(properties, emitMode)) { - return null; - } - - if (topic == null) { - topic = - CompatibleFlinkPropertyUtil.propertyAsString(properties, LOG_STORE_MESSAGE_TOPIC, null); - } - Preconditions.checkNotNull( - topic, - String.format("Topic should be specified. 
It can be set by '%s'", LOG_STORE_MESSAGE_TOPIC)); - - producerConfig = combineTableAndUnderlyingLogstoreProperties(properties, producerConfig); - - String version = - properties.getOrDefault(LOG_STORE_DATA_VERSION, LOG_STORE_DATA_VERSION_DEFAULT); - if (LOG_STORE_DATA_VERSION_DEFAULT.equals(version)) { - if (emitMode.equals(MixedFormatValidator.MIXED_FORMAT_EMIT_AUTO)) { - LOG.info( - "mixed-format emit mode is auto, and we will build automatic log writer: AutomaticLogWriter(v1)"); - return new AutomaticLogWriter( - FlinkSchemaUtil.convert(tableSchema), - producerConfig, - topic, - new HiddenKafkaFactory<>(), - LogRecordV1.FIELD_GETTER_FACTORY, - IdGenerator.generateUpstreamId(), - helper, - tableLoader, - watermarkWriteGap); - } - - LOG.info("build log writer: HiddenLogWriter(v1)"); - return new HiddenLogWriter( - FlinkSchemaUtil.convert(tableSchema), - producerConfig, - topic, - new HiddenKafkaFactory<>(), - LogRecordV1.FIELD_GETTER_FACTORY, - IdGenerator.generateUpstreamId(), - helper); - } - throw new UnsupportedOperationException( - "don't support log version '" + version + "'. only support 'v1' or empty"); - } - - /** - * Extract and combine the properties for underlying log store queue. - * - * @param tableProperties mixed-format table properties - * @param producerConfig can be set by java API - * @return properties with tableProperties and producerConfig which has higher priority. 
- */ - private static Properties combineTableAndUnderlyingLogstoreProperties( - Map tableProperties, Properties producerConfig) { - Properties finalProp; - Properties underlyingLogStoreProps = - CompatibleFlinkPropertyUtil.fetchLogstorePrefixProperties(tableProperties); - if (producerConfig == null) { - finalProp = underlyingLogStoreProps; - } else { - underlyingLogStoreProps - .stringPropertyNames() - .forEach(k -> producerConfig.putIfAbsent(k, underlyingLogStoreProps.get(k))); - finalProp = producerConfig; - } - - String logStoreAddress = - CompatibleFlinkPropertyUtil.propertyAsString(tableProperties, LOG_STORE_ADDRESS, null); - - String logType = - CompatibleFlinkPropertyUtil.propertyAsString( - tableProperties, LOG_STORE_TYPE, LOG_STORE_STORAGE_TYPE_DEFAULT); - if (logType.equals(LOG_STORE_STORAGE_TYPE_KAFKA)) { - finalProp.putIfAbsent( - "key.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer"); - finalProp.putIfAbsent( - "value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer"); - finalProp.putIfAbsent( - "key.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer"); - finalProp.putIfAbsent( - "value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer"); - - if (logStoreAddress != null) { - finalProp.putIfAbsent(BOOTSTRAP_SERVERS_CONFIG, logStoreAddress); - } - - Preconditions.checkArgument( - finalProp.containsKey(BOOTSTRAP_SERVERS_CONFIG), - String.format("%s should be set", LOG_STORE_ADDRESS)); - } - - return finalProp; - } - - public static boolean fileWriterEnable(String emitMode) { - return emitMode.contains(MixedFormatValidator.MIXED_FORMAT_EMIT_FILE) - || emitMode.equals(MixedFormatValidator.MIXED_FORMAT_EMIT_AUTO); - } - - public static boolean isToBase(boolean overwrite) { - boolean toBase = overwrite; - LOG.info("is write to base:{}", toBase); - return toBase; - } - - public static RowData removeMixedFormatMetaColumn(RowData rowData, int columnSize) { - 
GenericRowData newRowData = new GenericRowData(rowData.getRowKind(), columnSize); - if (rowData instanceof GenericRowData) { - GenericRowData before = (GenericRowData) rowData; - for (int i = 0; i < newRowData.getArity(); i++) { - newRowData.setField(i, before.getField(i)); - } - return newRowData; - } - throw new UnsupportedOperationException( - String.format( - "Can't remove mixed-format meta column from this RowData %s", - rowData.getClass().getSimpleName())); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/Projection.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/Projection.java deleted file mode 100644 index 9ceffe2c32..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/Projection.java +++ /dev/null @@ -1,430 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.util; - -import static org.apache.flink.table.types.logical.LogicalTypeRoot.ROW; - -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.FieldsDataType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.flink.util.Preconditions; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; -import java.util.ListIterator; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -/** - * {@link Projection} represents a list of (possibly nested) indexes that can be used to project - * data types. A row projection includes both reducing the accessible fields and reordering them. - * - *

NOTE: Copied from Flink. - */ -public abstract class Projection { - - // sealed class - private Projection() {} - - /** - * Projects a (possibly nested) row data type by returning a new data type that only includes - * fields of the given index paths. - * - *

When extracting nested fields, the name of the resulting fields is the full path of the - * field separated by {@code _}. For example, the field {@code b} inside the row field {@code a} - * of the root {@link DataType} is named {@code a_b} in the result {@link DataType}. In case of - * naming conflicts the postfix notation '_$%d' is used, where {@code %d} is an arbitrary number, - * in order to generate a unique field name. For example if the root {@link DataType} includes - * both a field {@code a_b} and a nested row {@code a} with field {@code b}, the result {@link - * DataType} will contain one field named {@code a_b} and the other named {@code a_b_1}. - */ - public abstract DataType project(DataType dataType); - - /** Same as {@link #project(DataType)}, but accepting and returning {@link LogicalType}. */ - public LogicalType project(LogicalType logicalType) { - return this.project(TypeConversions.fromLogicalToDataType(logicalType)).getLogicalType(); - } - - /** @return {@code true} whether this projection is nested or not. */ - public abstract boolean isNested(); - - /** - * Perform a difference of this {@link Projection} with another {@link Projection}. The result of - * this operation is a new {@link Projection} retaining the same ordering of this instance but - * with the indexes from {@code other} removed. For example: - * - *

-   * 
-   * [4, 1, 0, 3, 2] - [4, 2] = [1, 0, 2]
-   * 
-   * 
- * - *

Note how the index {@code 3} in the minuend becomes {@code 2} because it's rescaled to - * project correctly a {@link RowData} or arity 3. - * - * @param other the subtrahend - * @throws IllegalArgumentException when {@code other} is nested. - */ - public abstract Projection difference(Projection other); - - /** - * Complement this projection. The returned projection is an ordered projection of fields from 0 - * to {@code fieldsNumber} except the indexes in this {@link Projection}. For example: - * - *

-   * 
-   * [4, 2].complement(5) = [0, 1, 3]
-   * 
-   * 
- * - * @param fieldsNumber the size of the universe - * @throws IllegalStateException if this projection is nested. - */ - public abstract Projection complement(int fieldsNumber); - - /** Like {@link #complement(int)}, using the {@code dataType} fields count. */ - public Projection complement(DataType dataType) { - return complement(dataType.getLogicalType().getChildren().size()); - } - - /** - * Convert this instance to a projection of top level indexes. The array represents the mapping of - * the fields of the original {@link DataType}. For example, {@code [0, 2, 1]} specifies to - * include in the following order the 1st field, the 3rd field and the 2nd field of the row. - * - * @throws IllegalStateException if this projection is nested. - */ - public abstract int[] toTopLevelIndexes(); - - /** - * Convert this instance to a nested projection index paths. The array represents the mapping of - * the fields of the original {@link DataType}, including nested rows. For example, {@code [[0, 2, - * 1], ...]} specifies to include the 2nd field of the 3rd field of the 1st field in the top-level - * row. - */ - public abstract int[][] toNestedIndexes(); - - /** - * Create an empty {@link Projection}, that is a projection that projects no fields, returning an - * empty {@link DataType}. - */ - public static Projection empty() { - return EmptyProjection.INSTANCE; - } - - /** - * Create a {@link Projection} of the provided {@code indexes}. - * - * @see #toTopLevelIndexes() - */ - public static Projection of(int[] indexes) { - if (indexes.length == 0) { - return empty(); - } - return new TopLevelProjection(indexes); - } - - /** - * Create a {@link Projection} of the provided {@code indexes}. - * - * @see #toNestedIndexes() - */ - public static Projection of(int[][] indexes) { - if (indexes.length == 0) { - return empty(); - } - return new NestedProjection(indexes); - } - - /** Create a {@link Projection} of a field range. 
*/ - public static Projection range(int startInclusive, int endExclusive) { - return new TopLevelProjection(IntStream.range(startInclusive, endExclusive).toArray()); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (!(o instanceof Projection)) { - return false; - } - Projection other = (Projection) o; - if (!this.isNested() && !other.isNested()) { - return Arrays.equals(this.toTopLevelIndexes(), other.toTopLevelIndexes()); - } - return Arrays.deepEquals(this.toNestedIndexes(), other.toNestedIndexes()); - } - - @Override - public int hashCode() { - if (isNested()) { - return Arrays.deepHashCode(toNestedIndexes()); - } - return Arrays.hashCode(toTopLevelIndexes()); - } - - @Override - public String toString() { - if (isNested()) { - return "Nested projection = " + Arrays.deepToString(toNestedIndexes()); - } - return "Top level projection = " + Arrays.toString(toTopLevelIndexes()); - } - - private static class EmptyProjection extends Projection { - - static final EmptyProjection INSTANCE = new EmptyProjection(); - - private EmptyProjection() {} - - @Override - public DataType project(DataType dataType) { - return new NestedProjection(toNestedIndexes()).project(dataType); - } - - @Override - public boolean isNested() { - return false; - } - - @Override - public Projection difference(Projection projection) { - return this; - } - - @Override - public Projection complement(int fieldsNumber) { - return new TopLevelProjection(IntStream.range(0, fieldsNumber).toArray()); - } - - @Override - public int[] toTopLevelIndexes() { - return new int[0]; - } - - @Override - public int[][] toNestedIndexes() { - return new int[0][]; - } - } - - private static class NestedProjection extends Projection { - - final int[][] projection; - final boolean nested; - - NestedProjection(int[][] projection) { - this.projection = projection; - this.nested = Arrays.stream(projection).anyMatch(arr -> arr.length > 1); - } - - @Override - public DataType 
project(DataType dataType) { - final List updatedFields = new ArrayList<>(); - final List updatedChildren = new ArrayList<>(); - Set nameDomain = new HashSet<>(); - int duplicateCount = 0; - for (int[] indexPath : this.projection) { - DataType fieldType = dataType.getChildren().get(indexPath[0]); - LogicalType fieldLogicalType = fieldType.getLogicalType(); - StringBuilder builder = - new StringBuilder( - ((RowType) dataType.getLogicalType()).getFieldNames().get(indexPath[0])); - for (int index = 1; index < indexPath.length; index++) { - Preconditions.checkArgument( - fieldLogicalType.getTypeRoot() == ROW, "Row data type expected."); - RowType rowtype = ((RowType) fieldLogicalType); - builder.append("_").append(rowtype.getFieldNames().get(indexPath[index])); - fieldLogicalType = rowtype.getFields().get(indexPath[index]).getType(); - fieldType = fieldType.getChildren().get(indexPath[index]); - } - String path = builder.toString(); - while (nameDomain.contains(path)) { - path = builder.append("_$").append(duplicateCount++).toString(); - } - updatedFields.add(new RowType.RowField(path, fieldLogicalType)); - updatedChildren.add(fieldType); - nameDomain.add(path); - } - return new FieldsDataType( - new RowType(dataType.getLogicalType().isNullable(), updatedFields), - dataType.getConversionClass(), - updatedChildren); - } - - @Override - public boolean isNested() { - return nested; - } - - @Override - public Projection difference(Projection other) { - if (other.isNested()) { - throw new IllegalArgumentException( - "Cannot perform difference between nested projection and nested projection"); - } - if (other instanceof EmptyProjection) { - return this; - } - if (!this.isNested()) { - return new TopLevelProjection(toTopLevelIndexes()).difference(other); - } - - // Extract the indexes to exclude and sort them - int[] indexesToExclude = other.toTopLevelIndexes(); - indexesToExclude = Arrays.copyOf(indexesToExclude, indexesToExclude.length); - Arrays.sort(indexesToExclude); - - 
List resultProjection = - Arrays.stream(projection).collect(Collectors.toCollection(ArrayList::new)); - - ListIterator resultProjectionIterator = resultProjection.listIterator(); - while (resultProjectionIterator.hasNext()) { - int[] indexArr = resultProjectionIterator.next(); - - // Let's check if the index is inside the indexesToExclude array - int searchResult = Arrays.binarySearch(indexesToExclude, indexArr[0]); - if (searchResult >= 0) { - // Found, we need to remove it - resultProjectionIterator.remove(); - } else { - // Not found, let's compute the offset. - // Offset is the index where the projection index should be inserted in the - // indexesToExclude array - int offset = (-(searchResult) - 1); - if (offset != 0) { - indexArr[0] = indexArr[0] - offset; - } - } - } - - return new NestedProjection(resultProjection.toArray(new int[0][])); - } - - @Override - public Projection complement(int fieldsNumber) { - if (isNested()) { - throw new IllegalStateException("Cannot perform complement of a nested projection"); - } - return new TopLevelProjection(toTopLevelIndexes()).complement(fieldsNumber); - } - - @Override - public int[] toTopLevelIndexes() { - if (isNested()) { - throw new IllegalStateException( - "Cannot convert a nested projection to a top level projection"); - } - return Arrays.stream(projection).mapToInt(arr -> arr[0]).toArray(); - } - - @Override - public int[][] toNestedIndexes() { - return projection; - } - } - - private static class TopLevelProjection extends Projection { - - final int[] projection; - - TopLevelProjection(int[] projection) { - this.projection = projection; - } - - @Override - public DataType project(DataType dataType) { - return new NestedProjection(toNestedIndexes()).project(dataType); - } - - @Override - public boolean isNested() { - return false; - } - - @Override - public Projection difference(Projection other) { - if (other.isNested()) { - throw new IllegalArgumentException( - "Cannot perform difference between top level 
projection and nested projection"); - } - if (other instanceof EmptyProjection) { - return this; - } - - // Extract the indexes to exclude and sort them - int[] indexesToExclude = other.toTopLevelIndexes(); - indexesToExclude = Arrays.copyOf(indexesToExclude, indexesToExclude.length); - Arrays.sort(indexesToExclude); - - List resultProjection = - Arrays.stream(projection).boxed().collect(Collectors.toCollection(ArrayList::new)); - - ListIterator resultProjectionIterator = resultProjection.listIterator(); - while (resultProjectionIterator.hasNext()) { - int index = resultProjectionIterator.next(); - - // Let's check if the index is inside the indexesToExclude array - int searchResult = Arrays.binarySearch(indexesToExclude, index); - if (searchResult >= 0) { - // Found, we need to remove it - resultProjectionIterator.remove(); - } else { - // Not found, let's compute the offset. - // Offset is the index where the projection index should be inserted in the - // indexesToExclude array - int offset = (-(searchResult) - 1); - if (offset != 0) { - resultProjectionIterator.set(index - offset); - } - } - } - - return new TopLevelProjection(resultProjection.stream().mapToInt(i -> i).toArray()); - } - - @Override - public Projection complement(int fieldsNumber) { - int[] indexesToExclude = Arrays.copyOf(projection, projection.length); - Arrays.sort(indexesToExclude); - - return new TopLevelProjection( - IntStream.range(0, fieldsNumber) - .filter(i -> Arrays.binarySearch(indexesToExclude, i) < 0) - .toArray()); - } - - @Override - public int[] toTopLevelIndexes() { - return projection; - } - - @Override - public int[][] toNestedIndexes() { - return Arrays.stream(projection).mapToObj(i -> new int[] {i}).toArray(int[][]::new); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ProxyUtil.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ProxyUtil.java deleted file mode 100644 index a08798e0e7..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ProxyUtil.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.util; - -import net.sf.cglib.proxy.Enhancer; -import net.sf.cglib.proxy.MethodInterceptor; -import org.apache.amoro.flink.interceptor.KerberosInterceptor; -import org.apache.amoro.flink.interceptor.KerberosInvocationHandler; -import org.apache.amoro.flink.interceptor.ProxyFactory; -import org.apache.amoro.io.AuthenticatedFileIO; - -/** - * A proxy util wraps an object with the kerberos authenticate ability by {@link - * KerberosInvocationHandler}. 
- */ -public class ProxyUtil { - - public static Object getProxy(T obj, KerberosInvocationHandler handler) { - return handler.getProxy(obj); - } - - public static Object getProxy(T obj, AuthenticatedFileIO authenticatedFileIO) { - KerberosInvocationHandler handler = new KerberosInvocationHandler<>(authenticatedFileIO); - return getProxy(obj, handler); - } - - public static T getProxy( - Class clazz, MethodInterceptor interceptor, Class[] argumentTypes, Object[] arguments) { - Enhancer enhancer = new Enhancer(); - enhancer.setSuperclass(clazz); - enhancer.setCallback(interceptor); - return (T) enhancer.create(argumentTypes, arguments); - } - - public static T getProxy( - Class clazz, - AuthenticatedFileIO authenticatedFileIO, - Class[] argumentTypes, - Object[] arguments) { - return getProxy(clazz, new KerberosInterceptor(authenticatedFileIO), argumentTypes, arguments); - } - - public static ProxyFactory getProxyFactory( - Class clazz, - AuthenticatedFileIO authenticatedFileIO, - Class[] argumentTypes, - Object[] arguments) { - return new ProxyFactory( - clazz, new KerberosInterceptor(authenticatedFileIO), argumentTypes, arguments); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ReflectionUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ReflectionUtil.java deleted file mode 100644 index 010a17c74a..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ReflectionUtil.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.util; - -import java.lang.reflect.Field; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Set; - -/** An util for reflection. */ -public class ReflectionUtil { - - /** get interfaces of class and its parent */ - public static Class[] getAllInterface(Class clazz) { - if (clazz.equals(Object.class)) { - return new Class[] {}; - } - Class[] current = clazz.getInterfaces(); - Class superClass = clazz.getSuperclass(); - Class[] superInterfaces = getAllInterface(superClass); - - Set> all = new HashSet<>(); - all.addAll(Arrays.asList(current)); - all.addAll(Arrays.asList(superInterfaces)); - - Class[] deduplicated = new Class[all.size()]; - return all.toArray(deduplicated); - } - - public static V getField(Class clazz, O obj, String fieldName) { - try { - Field field = clazz.getDeclaredField(fieldName); - field.setAccessible(true); - Object v = field.get(obj); - return v == null ? 
null : (V) v; - } catch (NoSuchFieldException | IllegalAccessException e) { - throw new RuntimeException(e); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ThreadLocalCache.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ThreadLocalCache.java deleted file mode 100644 index cf44a0b18e..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/util/ThreadLocalCache.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.util; - -import org.apache.flink.annotation.Internal; - -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.function.Function; - -/** - * Provides a thread local cache with a maximum cache size per thread. - * - *

Note: Values must not be null. - * - *

Copied from flink-1.18 - */ -@Internal -public abstract class ThreadLocalCache { - - private static final int DEFAULT_CACHE_SIZE = 64; - - private final ThreadLocal> cache = new ThreadLocal<>(); - private final int maxSizePerThread; - - protected ThreadLocalCache() { - this(DEFAULT_CACHE_SIZE); - } - - protected ThreadLocalCache(int maxSizePerThread) { - this.maxSizePerThread = maxSizePerThread; - } - - public V get(K key) { - BoundedMap map = cache.get(); - if (map == null) { - map = new BoundedMap<>(maxSizePerThread); - cache.set(map); - } - V value = map.get(key); - if (value == null) { - value = getNewInstance(key); - map.put(key, value); - } - return value; - } - - public abstract V getNewInstance(K key); - - private static class BoundedMap extends LinkedHashMap { - - private static final long serialVersionUID = -211630219014422361L; - - private final int maxSize; - - private BoundedMap(int maxSize) { - this.maxSize = maxSize; - } - - @Override - protected boolean removeEldestEntry(Map.Entry eldest) { - return this.size() > maxSize; - } - } - - public static ThreadLocalCache of(Function creator) { - return new ThreadLocalCache() { - @Override - public V getNewInstance(K key) { - return creator.apply(key); - } - }; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AdaptHiveFlinkAppenderFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AdaptHiveFlinkAppenderFactory.java deleted file mode 100644 index b801a5815b..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AdaptHiveFlinkAppenderFactory.java +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write; - -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.MetricsConfig; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.deletes.EqualityDeleteWriter; -import org.apache.iceberg.deletes.PositionDeleteWriter; -import org.apache.iceberg.encryption.EncryptedOutputFile; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.data.AdaptHiveFlinkParquetWriters; -import org.apache.iceberg.flink.data.FlinkAvroWriter; -import org.apache.iceberg.flink.data.FlinkOrcWriter; -import org.apache.iceberg.io.DataWriter; -import org.apache.iceberg.io.DeleteSchemaUtil; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.AdaptHiveParquet; - -import java.io.IOException; -import java.io.Serializable; -import 
java.io.UncheckedIOException; -import java.util.Map; - -public class AdaptHiveFlinkAppenderFactory implements FileAppenderFactory, Serializable { - private final Schema schema; - private final RowType flinkSchema; - private final Map props; - private final PartitionSpec spec; - private final int[] equalityFieldIds; - private final Schema eqDeleteRowSchema; - private final Schema posDeleteRowSchema; - - private RowType eqDeleteFlinkSchema = null; - private RowType posDeleteFlinkSchema = null; - - public AdaptHiveFlinkAppenderFactory( - Schema schema, RowType flinkSchema, Map props, PartitionSpec spec) { - this(schema, flinkSchema, props, spec, null, null, null); - } - - public AdaptHiveFlinkAppenderFactory( - Schema schema, - RowType flinkSchema, - Map props, - PartitionSpec spec, - int[] equalityFieldIds, - Schema eqDeleteRowSchema, - Schema posDeleteRowSchema) { - this.schema = schema; - this.flinkSchema = flinkSchema; - this.props = props; - this.spec = spec; - this.equalityFieldIds = equalityFieldIds; - this.eqDeleteRowSchema = eqDeleteRowSchema; - this.posDeleteRowSchema = posDeleteRowSchema; - } - - private RowType lazyEqDeleteFlinkSchema() { - if (eqDeleteFlinkSchema == null) { - Preconditions.checkNotNull(eqDeleteRowSchema, "Equality delete row schema shouldn't be null"); - this.eqDeleteFlinkSchema = FlinkSchemaUtil.convert(eqDeleteRowSchema); - } - return eqDeleteFlinkSchema; - } - - private RowType lazyPosDeleteFlinkSchema() { - if (posDeleteFlinkSchema == null) { - Preconditions.checkNotNull(posDeleteRowSchema, "Pos-delete row schema shouldn't be null"); - this.posDeleteFlinkSchema = FlinkSchemaUtil.convert(posDeleteRowSchema); - } - return this.posDeleteFlinkSchema; - } - - @Override - public FileAppender newAppender(OutputFile outputFile, FileFormat format) { - MetricsConfig metricsConfig = MetricsConfig.fromProperties(props); - try { - switch (format) { - case AVRO: - return Avro.write(outputFile) - .createWriterFunc(ignore -> new 
FlinkAvroWriter(flinkSchema)) - .setAll(props) - .schema(schema) - .metricsConfig(metricsConfig) - .overwrite() - .build(); - - case ORC: - return ORC.write(outputFile) - .createWriterFunc( - (schema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, schema)) - .setAll(props) - .metricsConfig(metricsConfig) - .schema(schema) - .overwrite() - .build(); - - case PARQUET: - return AdaptHiveParquet.write(outputFile) - .createWriterFunc( - msgType -> AdaptHiveFlinkParquetWriters.buildWriter(flinkSchema, msgType)) - .setAll(props) - .metricsConfig(metricsConfig) - .schema(schema) - .overwrite() - .build(); - - default: - throw new UnsupportedOperationException("Cannot write unknown file format: " + format); - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - @Override - public DataWriter newDataWriter( - EncryptedOutputFile file, FileFormat format, StructLike partition) { - return new DataWriter<>( - newAppender(file.encryptingOutputFile(), format), - format, - file.encryptingOutputFile().location(), - spec, - partition, - file.keyMetadata()); - } - - @Override - public EqualityDeleteWriter newEqDeleteWriter( - EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { - Preconditions.checkState( - equalityFieldIds != null && equalityFieldIds.length > 0, - "Equality field ids shouldn't be null or empty when creating equality-delete writer"); - Preconditions.checkNotNull( - eqDeleteRowSchema, - "Equality delete row schema shouldn't be null when creating equality-delete writer"); - - MetricsConfig metricsConfig = MetricsConfig.fromProperties(props); - try { - switch (format) { - case AVRO: - return Avro.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc(ignore -> new FlinkAvroWriter(lazyEqDeleteFlinkSchema())) - .withPartition(partition) - .overwrite() - .setAll(props) - .rowSchema(eqDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .equalityFieldIds(equalityFieldIds) - 
.buildEqualityWriter(); - - case PARQUET: - return AdaptHiveParquet.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc( - msgType -> - AdaptHiveFlinkParquetWriters.buildWriter(lazyEqDeleteFlinkSchema(), msgType)) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(eqDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .equalityFieldIds(equalityFieldIds) - .buildEqualityWriter(); - - case ORC: - return ORC.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc( - (schema, typDesc) -> - FlinkOrcWriter.buildWriter(lazyEqDeleteFlinkSchema(), schema)) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(eqDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .equalityFieldIds(equalityFieldIds) - .buildEqualityWriter(); - - default: - throw new UnsupportedOperationException( - "Cannot write equality-deletes for unsupported file format: " + format); - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - @Override - public PositionDeleteWriter newPosDeleteWriter( - EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { - MetricsConfig metricsConfig = MetricsConfig.fromProperties(props); - try { - switch (format) { - case AVRO: - return Avro.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc(ignore -> new FlinkAvroWriter(lazyPosDeleteFlinkSchema())) - .withPartition(partition) - .overwrite() - .setAll(props) - .rowSchema(posDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .buildPositionWriter(); - - case PARQUET: - RowType flinkPosDeleteSchema = - FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); - return AdaptHiveParquet.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc( - msgType -> - 
AdaptHiveFlinkParquetWriters.buildWriter(flinkPosDeleteSchema, msgType)) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(posDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .transformPaths(path -> StringData.fromString(path.toString())) - .buildPositionWriter(); - - case ORC: - RowType orcPosDeleteSchema = - FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); - return ORC.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc( - (schema, typDesc) -> FlinkOrcWriter.buildWriter(orcPosDeleteSchema, schema)) - .withPartition(partition) - .overwrite() - .setAll(props) - .metricsConfig(metricsConfig) - .rowSchema(posDeleteRowSchema) - .withSpec(spec) - .withKeyMetadata(outputFile.keyMetadata()) - .buildPositionWriter(); - - default: - throw new UnsupportedOperationException( - "Cannot write pos-deletes for unsupported file format: " + format); - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticDoubleWriteStatus.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticDoubleWriteStatus.java deleted file mode 100644 index 7193dd646b..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticDoubleWriteStatus.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write; - -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.table.MixedTable; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.iceberg.UpdateProperties; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.Serializable; -import java.time.Duration; -import java.util.Map; - -/** This is an automatic logstore writer util class. */ -public class AutomaticDoubleWriteStatus implements Serializable { - private static final Logger LOG = LoggerFactory.getLogger(AutomaticDoubleWriteStatus.class); - - private static final long serialVersionUID = 1L; - private final MixedFormatTableLoader tableLoader; - private final AutomaticWriteSpecification specification; - private MixedTable table; - private transient boolean shouldDoubleWrite = false; - private int subtaskId; - - public AutomaticDoubleWriteStatus( - MixedFormatTableLoader tableLoader, Duration writeLogstoreWatermarkGap) { - this.tableLoader = tableLoader; - this.specification = new AutomaticWriteSpecification(writeLogstoreWatermarkGap); - } - - public void setup(int indexOfThisSubtask) { - this.subtaskId = indexOfThisSubtask; - } - - public void open() { - table = MixedFormatUtils.loadMixedTable(tableLoader); - sync(); - } - - public boolean isDoubleWrite() { - return shouldDoubleWrite; - } - - public void processWatermark(Watermark mark) { - if 
(isDoubleWrite()) { - return; - } - if (specification.shouldDoubleWrite(mark.getTimestamp())) { - shouldDoubleWrite = true; - LOG.info( - "processWatermark {}, subTaskId is {}, should double write is true.", mark, subtaskId); - LOG.info( - "begin update mixed-format table, set {} to true", - MixedFormatValidator.LOG_STORE_CATCH_UP.key()); - UpdateProperties updateProperties = table.updateProperties(); - updateProperties.set(MixedFormatValidator.LOG_STORE_CATCH_UP.key(), String.valueOf(true)); - updateProperties.set( - MixedFormatValidator.LOG_STORE_CATCH_UP_TIMESTAMP.key(), - String.valueOf(System.currentTimeMillis())); - updateProperties.remove(MixedFormatValidator.AUTO_EMIT_LOGSTORE_WATERMARK_GAP.key()); - updateProperties.commit(); - LOG.info("end update mixed-format table."); - } - } - - public void sync() { - table.refresh(); - Map properties = table.properties(); - shouldDoubleWrite = - !properties.containsKey(MixedFormatValidator.AUTO_EMIT_LOGSTORE_WATERMARK_GAP.key()); - LOG.info( - "AutomaticDoubleWriteStatus sync, subTaskId: {}, should double write: {}", - subtaskId, - shouldDoubleWrite); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticLogWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticLogWriter.java deleted file mode 100644 index 03fc90e63a..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticLogWriter.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write; - -import org.apache.amoro.flink.shuffle.ShuffleHelper; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.amoro.flink.write.hidden.HiddenLogWriter; -import org.apache.amoro.flink.write.hidden.LogMsgFactory; -import org.apache.amoro.log.LogData; -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.runtime.state.StateSnapshotContext; -import org.apache.flink.streaming.api.graph.StreamConfig; -import org.apache.flink.streaming.api.operators.Output; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.streaming.runtime.tasks.StreamTask; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; - -import java.time.Duration; -import java.util.Properties; - -/** - * This is an automatic logstore writer util class. It will write logstore when the system current - * timestamp is greater than the watermark of all subtasks plus the {@link - * MixedFormatValidator#AUTO_EMIT_LOGSTORE_WATERMARK_GAP} value. 
- */ -public class AutomaticLogWriter extends MixedFormatLogWriter { - private final AutomaticDoubleWriteStatus status; - private final MixedFormatLogWriter mixedFormatLogWriter; - - public AutomaticLogWriter( - Schema schema, - Properties producerConfig, - String topic, - LogMsgFactory factory, - LogData.FieldGetterFactory fieldGetterFactory, - byte[] jobId, - ShuffleHelper helper, - MixedFormatTableLoader tableLoader, - Duration writeLogstoreWatermarkGap) { - this.mixedFormatLogWriter = - new HiddenLogWriter( - schema, producerConfig, topic, factory, fieldGetterFactory, jobId, helper); - this.status = new AutomaticDoubleWriteStatus(tableLoader, writeLogstoreWatermarkGap); - } - - @Override - public void setup( - StreamTask containingTask, StreamConfig config, Output> output) { - super.setup(containingTask, config, output); - mixedFormatLogWriter.setup(containingTask, config, output); - status.setup(getRuntimeContext().getIndexOfThisSubtask()); - } - - @Override - public void initializeState(StateInitializationContext context) throws Exception { - super.initializeState(context); - mixedFormatLogWriter.initializeState(context); - } - - @Override - public void open() throws Exception { - super.open(); - mixedFormatLogWriter.open(); - status.open(); - } - - @Override - public void processElement(StreamRecord element) throws Exception { - if (status.isDoubleWrite()) { - mixedFormatLogWriter.processElement(element); - } - } - - @Override - public void processWatermark(Watermark mark) throws Exception { - status.processWatermark(mark); - super.processWatermark(mark); - } - - @Override - public void prepareSnapshotPreBarrier(long checkpointId) throws Exception { - if (status.isDoubleWrite()) { - mixedFormatLogWriter.prepareSnapshotPreBarrier(checkpointId); - } else { - status.sync(); - } - } - - @Override - public void snapshotState(StateSnapshotContext context) throws Exception { - if (status.isDoubleWrite()) { - mixedFormatLogWriter.snapshotState(context); - } - } - - 
@Override - public void notifyCheckpointComplete(long checkpointId) throws Exception { - if (status.isDoubleWrite()) { - mixedFormatLogWriter.notifyCheckpointComplete(checkpointId); - } - } - - @Override - public void notifyCheckpointAborted(long checkpointId) throws Exception { - if (status.isDoubleWrite()) { - mixedFormatLogWriter.notifyCheckpointAborted(checkpointId); - } - } - - @Override - public void close() throws Exception { - if (status.isDoubleWrite()) { - mixedFormatLogWriter.close(); - } - } - - @Override - public void endInput() throws Exception { - if (status.isDoubleWrite()) { - mixedFormatLogWriter.endInput(); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticWriteSpecification.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticWriteSpecification.java deleted file mode 100644 index 6ad19f8868..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/AutomaticWriteSpecification.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.write; - -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.AUTO_EMIT_LOGSTORE_WATERMARK_GAP; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; - -import java.io.Serializable; -import java.time.Duration; - -/** Automatic write specification. */ -public class AutomaticWriteSpecification implements Serializable { - private static final Logger LOG = LoggerFactory.getLogger(AutomaticWriteSpecification.class); - - private static final long serialVersionUID = 1L; - public final Duration writeLogstoreWatermarkGap; - - public AutomaticWriteSpecification(@Nullable Duration writeLogstoreWatermarkGap) { - this.writeLogstoreWatermarkGap = writeLogstoreWatermarkGap; - } - - /** - * Returns whether the automatic writing is enabled. - * - * @param watermark the watermark of the operator - * @return true: double write, false: single write. - */ - public boolean shouldDoubleWrite(long watermark) { - // The writeLogstoreWatermarkGap is null, which means that the logstore writer is enabled - // immediately once the job - // is launched. 
- if (writeLogstoreWatermarkGap == null) { - LOG.info( - "The logstore writer is enabled and the {} is null," - + " so double write immediately once the job is launched.", - AUTO_EMIT_LOGSTORE_WATERMARK_GAP.key()); - return true; - } - long now = System.currentTimeMillis(); - boolean result = watermark >= now - writeLogstoreWatermarkGap.toMillis(); - if (result) { - LOG.info( - "The logstore writer is enabled and the {} is {}, the watermark has caught up, {}.", - AUTO_EMIT_LOGSTORE_WATERMARK_GAP.key(), - writeLogstoreWatermarkGap, - watermark); - } else { - LOG.debug( - "The logstore writer is enabled and the {} is {}, the watermark has not caught up, {}.", - AUTO_EMIT_LOGSTORE_WATERMARK_GAP.key(), - writeLogstoreWatermarkGap, - watermark); - } - return result; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkBaseTaskWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkBaseTaskWriter.java deleted file mode 100644 index c63cdd5555..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkBaseTaskWriter.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write; - -import org.apache.amoro.io.AuthenticatedFileIO; -import org.apache.amoro.io.writer.BaseTaskWriter; -import org.apache.amoro.io.writer.OutputFileFactory; -import org.apache.amoro.table.KeyedTable; -import org.apache.amoro.table.PrimaryKeySpec; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.io.FileAppenderFactory; - -/** - * task writer for {@link KeyedTable#baseTable()}. Dev should make sure outputFileFactory write to - * base table's location - */ -public class FlinkBaseTaskWriter extends BaseTaskWriter { - - private final RowDataWrapper wrapper; - - public FlinkBaseTaskWriter( - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory outputFileFactory, - AuthenticatedFileIO io, - long targetFileSize, - long mask, - Schema schema, - RowType flinkSchema, - PartitionSpec spec, - PrimaryKeySpec primaryKeySpec) { - super( - format, - appenderFactory, - outputFileFactory, - io, - targetFileSize, - mask, - schema, - spec, - primaryKeySpec, - false); - this.wrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - } - - @Override - protected StructLike asStructLike(RowData data) { - return wrapper.wrap(data); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkChangeTaskWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkChangeTaskWriter.java deleted file mode 100644 index 50aaacd086..0000000000 --- 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkChangeTaskWriter.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write; - -import org.apache.amoro.data.ChangeAction; -import org.apache.amoro.data.PrimaryKeyData; -import org.apache.amoro.io.AuthenticatedFileIO; -import org.apache.amoro.io.writer.ChangeTaskWriter; -import org.apache.amoro.io.writer.OutputFileFactory; -import org.apache.amoro.table.KeyedTable; -import org.apache.amoro.table.PrimaryKeySpec; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.utils.JoinedRowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.io.FileAppenderFactory; - -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - -/** - * task writer for {@link 
KeyedTable#changeTable()} ()}. Dev should make sure outputFileFactory - * write to change table's location - */ -public class FlinkChangeTaskWriter extends ChangeTaskWriter { - - private final RowDataWrapper wrapper; - private final boolean upsert; - private final Set hasUpdateBeforeKeys = new HashSet<>(); - - public FlinkChangeTaskWriter( - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory outputFileFactory, - AuthenticatedFileIO io, - long targetFileSize, - long mask, - Schema schema, - RowType flinkSchema, - PartitionSpec spec, - PrimaryKeySpec primaryKeySpec, - boolean upsert) { - super( - format, - appenderFactory, - outputFileFactory, - io, - targetFileSize, - mask, - schema, - spec, - primaryKeySpec, - false); - this.wrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - this.upsert = upsert; - } - - @Override - protected StructLike asStructLike(RowData data) { - return wrapper.wrap(data); - } - - @Override - protected RowData appendMetaColumns(RowData data, Long fileOffset) { - return new JoinedRowData(data, GenericRowData.of(fileOffset)); - } - - @Override - public void write(RowData row) throws IOException { - processMultiUpdateAfter(row); - if (upsert && RowKind.INSERT.equals(row.getRowKind())) { - row.setRowKind(RowKind.DELETE); - super.write(row); - row.setRowKind(RowKind.INSERT); - } - super.write(row); - } - - @Override - protected ChangeAction action(RowData data) { - switch (data.getRowKind()) { - case DELETE: - return ChangeAction.DELETE; - case INSERT: - return ChangeAction.INSERT; - case UPDATE_BEFORE: - return ChangeAction.UPDATE_BEFORE; - case UPDATE_AFTER: - return ChangeAction.UPDATE_AFTER; - } - return ChangeAction.INSERT; - } - - /** Turn update_after to insert if there isn't update_after followed by update_before. 
*/ - private void processMultiUpdateAfter(RowData row) { - RowKind rowKind = row.getRowKind(); - if (RowKind.UPDATE_BEFORE.equals(rowKind) || RowKind.UPDATE_AFTER.equals(rowKind)) { - PrimaryKeyData primaryKey = getPrimaryKey(); - primaryKey.primaryKey(asStructLike(row)); - - if (RowKind.UPDATE_AFTER.equals(rowKind)) { - if (!hasUpdateBeforeKeys.contains(primaryKey)) { - row.setRowKind(RowKind.INSERT); - } else { - hasUpdateBeforeKeys.remove(primaryKey); - } - } else { - PrimaryKeyData copyKey = primaryKey.copy(); - hasUpdateBeforeKeys.add(copyKey); - } - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkSink.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkSink.java deleted file mode 100644 index 38f0ea532e..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkSink.java +++ /dev/null @@ -1,444 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.write; - -import static org.apache.amoro.flink.FlinkSchemaUtil.getPhysicalSchema; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.AUTO_EMIT_LOGSTORE_WATERMARK_GAP; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_EMIT_FILE; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_EMIT_MODE; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE_DEFAULT; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_WRITE_MAX_OPEN_FILE_SIZE; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_WRITE_MAX_OPEN_FILE_SIZE_DEFAULT; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SUBMIT_EMPTY_SNAPSHOTS; -import static org.apache.amoro.table.TableProperties.WRITE_DISTRIBUTION_HASH_MODE; -import static org.apache.amoro.table.TableProperties.WRITE_DISTRIBUTION_HASH_MODE_DEFAULT; -import static org.apache.amoro.table.TableProperties.WRITE_DISTRIBUTION_MODE; -import static org.apache.amoro.table.TableProperties.WRITE_DISTRIBUTION_MODE_DEFAULT; -import static org.apache.flink.table.factories.FactoryUtil.SINK_PARALLELISM; - -import org.apache.amoro.flink.metric.MetricsGenerator; -import org.apache.amoro.flink.shuffle.RoundRobinShuffleRulePolicy; -import org.apache.amoro.flink.shuffle.ShuffleHelper; -import org.apache.amoro.flink.shuffle.ShuffleKey; -import org.apache.amoro.flink.shuffle.ShuffleRulePolicy; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.amoro.flink.util.CompatibleFlinkPropertyUtil; -import org.apache.amoro.flink.util.IcebergClassUtil; -import 
org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.flink.util.ProxyUtil; -import org.apache.amoro.table.DistributionHashMode; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.TableProperties; -import org.apache.flink.api.common.typeinfo.Types; -import org.apache.flink.api.java.typeutils.TypeExtractor; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamSink; -import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; -import org.apache.flink.streaming.api.functions.sink.DiscardingSink; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.connector.ProviderContext; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.util.Preconditions; -import org.apache.iceberg.DistributionMode; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.sink.TaskWriterFactory; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.PropertyUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; - -import java.time.Duration; -import java.util.Properties; - -/** - * An util generates mixed-format sink operator including log writer, file writer and file committer - * operators. 
- */ -public class FlinkSink { - private static final Logger LOG = LoggerFactory.getLogger(FlinkSink.class); - - public static final String FILES_COMMITTER_NAME = "FilesCommitter"; - - public static Builder forRowData(DataStream input) { - return new Builder().forRowData(input); - } - - public static class Builder { - private DataStream rowDataInput = null; - private ProviderContext context; - private MixedTable table; - private MixedFormatTableLoader tableLoader; - private TableSchema flinkSchema; - private Properties producerConfig; - private String topic; - private boolean overwrite = false; - private final String branch = SnapshotRef.MAIN_BRANCH; - private DistributionHashMode distributionMode = null; - - private Builder() {} - - private Builder forRowData(DataStream newRowDataInput) { - this.rowDataInput = newRowDataInput; - return this; - } - - public Builder context(ProviderContext context) { - this.context = context; - return this; - } - - public Builder table(MixedTable table) { - this.table = table; - return this; - } - - public Builder flinkSchema(TableSchema flinkSchema) { - this.flinkSchema = flinkSchema; - return this; - } - - public Builder producerConfig(Properties producerConfig) { - this.producerConfig = producerConfig; - return this; - } - - public Builder topic(String topic) { - this.topic = topic; - return this; - } - - public Builder tableLoader(MixedFormatTableLoader tableLoader) { - this.tableLoader = tableLoader; - return this; - } - - public Builder overwrite(boolean overwrite) { - this.overwrite = overwrite; - return this; - } - - public Builder distribute(DistributionHashMode distributionMode) { - this.distributionMode = distributionMode; - return this; - } - - DataStreamSink withEmit( - DataStream input, - MixedFormatLogWriter logWriter, - MixedFormatFileWriter fileWriter, - OneInputStreamOperator committer, - int writeOperatorParallelism, - MetricsGenerator metricsGenerator, - String emitMode) { - SingleOutputStreamOperator 
writerStream = - input - .transform( - MixedFormatWriter.class.getName(), - TypeExtractor.createTypeInfo(WriteResult.class), - new MixedFormatWriter<>(logWriter, fileWriter, metricsGenerator)) - .name(String.format("MixedFormatWriter %s(%s)", table.name(), emitMode)) - .setParallelism(writeOperatorParallelism); - - if (committer != null) { - writerStream = - writerStream - .transform(FILES_COMMITTER_NAME, Types.VOID, committer) - .setParallelism(1) - .setMaxParallelism(1); - } - - return writerStream - .addSink(new DiscardingSink<>()) - .name(String.format("MixedFormatSink %s", table.name())) - .setParallelism(1); - } - - public DataStreamSink build() { - Preconditions.checkNotNull(tableLoader, "table loader can not be null"); - initTableIfNeeded(); - - Configuration config = new Configuration(); - table.properties().forEach(config::setString); - - RowType flinkSchemaRowType = - (RowType) getPhysicalSchema(flinkSchema).toRowDataType().getLogicalType(); - Schema writeSchema = - TypeUtil.reassignIds( - FlinkSchemaUtil.convert(getPhysicalSchema(flinkSchema)), table.schema()); - - int writeOperatorParallelism = - PropertyUtil.propertyAsInt( - table.properties(), - SINK_PARALLELISM.key(), - rowDataInput.getExecutionEnvironment().getParallelism()); - - DistributionHashMode distributionMode = getDistributionHashMode(); - LOG.info("take effect distribute mode: {}", distributionMode); - ShuffleHelper helper = ShuffleHelper.build(table, writeSchema, flinkSchemaRowType); - - ShuffleRulePolicy shufflePolicy = - buildShuffleRulePolicy( - helper, writeOperatorParallelism, distributionMode, overwrite, table); - LOG.info( - "shuffle policy config={}, actual={}", - distributionMode, - shufflePolicy == null ? 
DistributionMode.NONE : distributionMode.getDesc()); - - String emitMode = - table - .properties() - .getOrDefault(MIXED_FORMAT_EMIT_MODE.key(), MIXED_FORMAT_EMIT_MODE.defaultValue()); - final boolean metricsEventLatency = - CompatibleFlinkPropertyUtil.propertyAsBoolean( - table.properties(), - MixedFormatValidator.MIXED_FORMAT_LATENCY_METRIC_ENABLE, - MixedFormatValidator.MIXED_FORMAT_LATENCY_METRIC_ENABLE_DEFAULT); - - final boolean metricsEnable = - CompatibleFlinkPropertyUtil.propertyAsBoolean( - table.properties(), - MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE, - MIXED_FORMAT_THROUGHPUT_METRIC_ENABLE_DEFAULT); - - final Duration watermarkWriteGap = config.get(AUTO_EMIT_LOGSTORE_WATERMARK_GAP); - - MixedFormatFileWriter fileWriter = - createFileWriter( - table, shufflePolicy, overwrite, flinkSchemaRowType, emitMode, tableLoader); - - MixedFormatLogWriter logWriter = - MixedFormatUtils.buildLogWriter( - table.properties(), - producerConfig, - topic, - flinkSchema, - emitMode, - helper, - tableLoader, - watermarkWriteGap); - - MetricsGenerator metricsGenerator = - MixedFormatUtils.getMetricsGenerator( - metricsEventLatency, metricsEnable, table, flinkSchemaRowType, writeSchema); - - if (shufflePolicy != null) { - rowDataInput = - rowDataInput.partitionCustom( - shufflePolicy.generatePartitioner(), shufflePolicy.generateKeySelector()); - } - - return withEmit( - rowDataInput, - logWriter, - fileWriter, - createFileCommitter(table, tableLoader, overwrite, branch, table.spec(), emitMode), - writeOperatorParallelism, - metricsGenerator, - emitMode); - } - - private void initTableIfNeeded() { - if (table == null) { - table = MixedFormatUtils.loadMixedTable(tableLoader); - } - } - - /** - * Transform {@link org.apache.iceberg.TableProperties#WRITE_DISTRIBUTION_MODE} to - * ShufflePolicyType - */ - private DistributionHashMode getDistributionHashMode() { - if (distributionMode != null) { - return distributionMode; - } - - String modeName = - PropertyUtil.propertyAsString( - 
table.properties(), WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_DEFAULT); - - DistributionMode mode = DistributionMode.fromName(modeName); - switch (mode) { - case NONE: - return DistributionHashMode.NONE; - case HASH: - String hashMode = - PropertyUtil.propertyAsString( - table.properties(), - WRITE_DISTRIBUTION_HASH_MODE, - WRITE_DISTRIBUTION_HASH_MODE_DEFAULT); - return DistributionHashMode.valueOfDesc(hashMode); - case RANGE: - LOG.warn( - "Fallback to use 'none' distribution mode, because {}={} is not supported in flink now", - WRITE_DISTRIBUTION_MODE, - DistributionMode.RANGE.modeName()); - return DistributionHashMode.NONE; - default: - return DistributionHashMode.AUTO; - } - } - - @Nullable - public static ShuffleRulePolicy buildShuffleRulePolicy( - ShuffleHelper helper, - int writeOperatorParallelism, - DistributionHashMode distributionHashMode, - boolean overwrite, - MixedTable table) { - if (distributionHashMode == DistributionHashMode.AUTO) { - distributionHashMode = - DistributionHashMode.autoSelect( - helper.isPrimaryKeyExist(), helper.isPartitionKeyExist()); - } - if (distributionHashMode == DistributionHashMode.NONE) { - return null; - } else { - if (distributionHashMode.mustByPrimaryKey() && !helper.isPrimaryKeyExist()) { - throw new IllegalArgumentException( - "illegal shuffle policy " - + distributionHashMode.getDesc() - + " for table without primary key"); - } - if (distributionHashMode.mustByPartition() && !helper.isPartitionKeyExist()) { - throw new IllegalArgumentException( - "illegal shuffle policy " - + distributionHashMode.getDesc() - + " for table without partition"); - } - int writeFileSplit; - if (MixedFormatUtils.isToBase(overwrite)) { - writeFileSplit = - PropertyUtil.propertyAsInt( - table.properties(), - TableProperties.BASE_FILE_INDEX_HASH_BUCKET, - TableProperties.BASE_FILE_INDEX_HASH_BUCKET_DEFAULT); - } else { - writeFileSplit = - PropertyUtil.propertyAsInt( - table.properties(), - 
TableProperties.CHANGE_FILE_INDEX_HASH_BUCKET, - TableProperties.CHANGE_FILE_INDEX_HASH_BUCKET_DEFAULT); - } - - return new RoundRobinShuffleRulePolicy( - helper, writeOperatorParallelism, writeFileSplit, distributionHashMode); - } - } - } - - public static MixedFormatFileWriter createFileWriter( - MixedTable mixedTable, - ShuffleRulePolicy shufflePolicy, - boolean overwrite, - RowType flinkSchema, - MixedFormatTableLoader tableLoader) { - return createFileWriter( - mixedTable, shufflePolicy, overwrite, flinkSchema, MIXED_FORMAT_EMIT_FILE, tableLoader); - } - - public static MixedFormatFileWriter createFileWriter( - MixedTable mixedTable, - ShuffleRulePolicy shufflePolicy, - boolean overwrite, - RowType flinkSchema, - String emitMode, - MixedFormatTableLoader tableLoader) { - if (!MixedFormatUtils.fileWriterEnable(emitMode)) { - return null; - } - long maxOpenFilesSizeBytes = - PropertyUtil.propertyAsLong( - mixedTable.properties(), - MIXED_FORMAT_WRITE_MAX_OPEN_FILE_SIZE, - MIXED_FORMAT_WRITE_MAX_OPEN_FILE_SIZE_DEFAULT); - LOG.info( - "with maxOpenFilesSizeBytes = {}MB, close biggest/earliest file to avoid OOM", - maxOpenFilesSizeBytes >> 20); - - int minFileSplitCount = - PropertyUtil.propertyAsInt( - mixedTable.properties(), - TableProperties.CHANGE_FILE_INDEX_HASH_BUCKET, - TableProperties.CHANGE_FILE_INDEX_HASH_BUCKET_DEFAULT); - - boolean upsert = - mixedTable.isKeyedTable() - && PropertyUtil.propertyAsBoolean( - mixedTable.properties(), - TableProperties.UPSERT_ENABLED, - TableProperties.UPSERT_ENABLED_DEFAULT); - boolean submitEmptySnapshot = - PropertyUtil.propertyAsBoolean( - mixedTable.properties(), - SUBMIT_EMPTY_SNAPSHOTS.key(), - SUBMIT_EMPTY_SNAPSHOTS.defaultValue()); - - return new MixedFormatFileWriter( - shufflePolicy, - createTaskWriterFactory(mixedTable, overwrite, flinkSchema), - minFileSplitCount, - tableLoader, - upsert, - submitEmptySnapshot); - } - - private static TaskWriterFactory createTaskWriterFactory( - MixedTable mixedTable, boolean 
overwrite, RowType flinkSchema) { - return new MixedFormatRowDataTaskWriterFactory(mixedTable, flinkSchema, overwrite); - } - - public static OneInputStreamOperator createFileCommitter( - MixedTable mixedTable, - MixedFormatTableLoader tableLoader, - boolean overwrite, - String branch, - PartitionSpec spec) { - return createFileCommitter( - mixedTable, tableLoader, overwrite, branch, spec, MIXED_FORMAT_EMIT_FILE); - } - - public static OneInputStreamOperator createFileCommitter( - MixedTable mixedTable, - MixedFormatTableLoader tableLoader, - boolean overwrite, - String branch, - PartitionSpec spec, - String emitMode) { - if (!MixedFormatUtils.fileWriterEnable(emitMode)) { - return null; - } - tableLoader.switchLoadInternalTableForKeyedTable(MixedFormatUtils.isToBase(overwrite)); - return (OneInputStreamOperator) - ProxyUtil.getProxy( - IcebergClassUtil.newIcebergFilesCommitter( - tableLoader, overwrite, branch, spec, mixedTable.io()), - mixedTable.io()); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkTaskWriterBuilder.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkTaskWriterBuilder.java deleted file mode 100644 index 1fec38f803..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/FlinkTaskWriterBuilder.java +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write; - -import org.apache.amoro.hive.io.writer.AdaptHiveOperateToTableRelation; -import org.apache.amoro.hive.io.writer.AdaptHiveOutputFileFactory; -import org.apache.amoro.hive.table.HiveLocationKind; -import org.apache.amoro.hive.table.SupportHive; -import org.apache.amoro.hive.utils.TableTypeUtil; -import org.apache.amoro.io.writer.CommonOutputFileFactory; -import org.apache.amoro.io.writer.OutputFileFactory; -import org.apache.amoro.io.writer.SortedPosDeleteWriter; -import org.apache.amoro.io.writer.TaskWriterBuilder; -import org.apache.amoro.properties.HiveTableProperties; -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.amoro.table.BaseLocationKind; -import org.apache.amoro.table.ChangeLocationKind; -import org.apache.amoro.table.KeyedTable; -import org.apache.amoro.table.LocationKind; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.PrimaryKeySpec; -import org.apache.amoro.table.TableProperties; -import org.apache.amoro.table.UnkeyedTable; -import org.apache.amoro.table.WriteOperationKind; -import org.apache.amoro.utils.SchemaUtil; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.Table; -import org.apache.iceberg.encryption.EncryptionManager; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import 
org.apache.iceberg.flink.sink.FlinkAppenderFactory; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.PropertyUtil; - -import java.util.Locale; - -public class FlinkTaskWriterBuilder implements TaskWriterBuilder { - - private final MixedTable table; - private Long transactionId; - private int partitionId = 0; - private long taskId = 0; - private RowType flinkSchema; - private long mask; - - private FlinkTaskWriterBuilder(MixedTable table) { - this.table = table; - } - - public FlinkTaskWriterBuilder withTransactionId(Long transactionId) { - this.transactionId = transactionId; - return this; - } - - public FlinkTaskWriterBuilder withPartitionId(int partitionId) { - this.partitionId = partitionId; - return this; - } - - public FlinkTaskWriterBuilder withTaskId(long taskId) { - this.taskId = taskId; - return this; - } - - public FlinkTaskWriterBuilder withFlinkSchema(RowType flinkSchema) { - this.flinkSchema = flinkSchema; - return this; - } - - public FlinkTaskWriterBuilder withMask(long mask) { - this.mask = mask; - return this; - } - - @Override - public TaskWriter buildWriter(WriteOperationKind writeOperationKind) { - LocationKind locationKind = - AdaptHiveOperateToTableRelation.INSTANT.getLocationKindsFromOperateKind( - table, writeOperationKind); - return buildWriter(locationKind); - } - - @Override - public TaskWriter buildWriter(LocationKind locationKind) { - if (locationKind == ChangeLocationKind.INSTANT) { - return buildChangeWriter(); - } else if (locationKind == BaseLocationKind.INSTANT - || locationKind == HiveLocationKind.INSTANT) { - return buildBaseWriter(locationKind); - } else { - throw new IllegalArgumentException("Not support Location Kind:" + locationKind); - } - } - - private FlinkBaseTaskWriter buildBaseWriter(LocationKind locationKind) { - Preconditions.checkArgument(transactionId == null); - FileFormat fileFormat = - 
FileFormat.valueOf( - (table - .properties() - .getOrDefault( - TableProperties.BASE_FILE_FORMAT, TableProperties.BASE_FILE_FORMAT_DEFAULT) - .toUpperCase(Locale.ENGLISH))); - long fileSizeBytes = - PropertyUtil.propertyAsLong( - table.properties(), - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); - - String baseLocation; - EncryptionManager encryptionManager; - Schema schema; - Table icebergTable; - PrimaryKeySpec primaryKeySpec = null; - if (table.isKeyedTable()) { - KeyedTable keyedTable = table.asKeyedTable(); - baseLocation = keyedTable.baseLocation(); - encryptionManager = keyedTable.baseTable().encryption(); - schema = keyedTable.baseTable().schema(); - primaryKeySpec = keyedTable.primaryKeySpec(); - icebergTable = keyedTable.baseTable(); - } else { - UnkeyedTable table = this.table.asUnkeyedTable(); - baseLocation = table.location(); - encryptionManager = table.encryption(); - schema = table.schema(); - icebergTable = table; - } - - Schema selectSchema = - TypeUtil.reassignIds( - FlinkSchemaUtil.convert(FlinkSchemaUtil.toSchema(flinkSchema)), schema); - boolean hiveConsistentWriteEnabled = - PropertyUtil.propertyAsBoolean( - table.properties(), - HiveTableProperties.HIVE_CONSISTENT_WRITE_ENABLED, - HiveTableProperties.HIVE_CONSISTENT_WRITE_ENABLED_DEFAULT); - - OutputFileFactory outputFileFactory = - locationKind == HiveLocationKind.INSTANT - ? new AdaptHiveOutputFileFactory( - ((SupportHive) table).hiveLocation(), - table.spec(), - fileFormat, - table.io(), - encryptionManager, - partitionId, - taskId, - transactionId, - hiveConsistentWriteEnabled) - : new CommonOutputFileFactory( - baseLocation, - table.spec(), - fileFormat, - table.io(), - encryptionManager, - partitionId, - taskId, - transactionId); - FileAppenderFactory appenderFactory = - TableTypeUtil.isHive(table) - ? 
new AdaptHiveFlinkAppenderFactory( - schema, flinkSchema, table.properties(), table.spec()) - : new FlinkAppenderFactory( - icebergTable, - schema, - flinkSchema, - table.properties(), - table.spec(), - null, - null, - null); - return new FlinkBaseTaskWriter( - fileFormat, - appenderFactory, - outputFileFactory, - table.io(), - fileSizeBytes, - mask, - selectSchema, - flinkSchema, - table.spec(), - primaryKeySpec); - } - - private TaskWriter buildChangeWriter() { - if (table.isUnkeyedTable()) { - throw new IllegalArgumentException("UnKeyed table UnSupport change writer"); - } - Preconditions.checkArgument(transactionId == null); - - FileFormat fileFormat = - FileFormat.valueOf( - (table - .properties() - .getOrDefault( - TableProperties.BASE_FILE_FORMAT, TableProperties.BASE_FILE_FORMAT_DEFAULT) - .toUpperCase(Locale.ENGLISH))); - long fileSizeBytes = - PropertyUtil.propertyAsLong( - table.properties(), - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); - - KeyedTable keyedTable = table.asKeyedTable(); - Schema selectSchema = - TypeUtil.reassignIds( - FlinkSchemaUtil.convert(FlinkSchemaUtil.toSchema(flinkSchema)), - keyedTable.baseTable().schema()); - Schema changeSchemaWithMeta = SchemaUtil.changeWriteSchema(keyedTable.baseTable().schema()); - RowType flinkSchemaWithMeta = FlinkSchemaUtil.convert(changeSchemaWithMeta); - - OutputFileFactory outputFileFactory = - new CommonOutputFileFactory( - keyedTable.changeLocation(), - keyedTable.spec(), - fileFormat, - keyedTable.io(), - keyedTable.baseTable().encryption(), - partitionId, - taskId, - transactionId); - FileAppenderFactory appenderFactory = - TableTypeUtil.isHive(table) - ? 
new AdaptHiveFlinkAppenderFactory( - changeSchemaWithMeta, - flinkSchemaWithMeta, - keyedTable.properties(), - keyedTable.spec()) - : new FlinkAppenderFactory( - keyedTable.changeTable(), - changeSchemaWithMeta, - flinkSchemaWithMeta, - keyedTable.properties(), - keyedTable.spec(), - null, - null, - null); - boolean upsert = - table.isKeyedTable() - && PropertyUtil.propertyAsBoolean( - table.properties(), - TableProperties.UPSERT_ENABLED, - TableProperties.UPSERT_ENABLED_DEFAULT); - return new FlinkChangeTaskWriter( - fileFormat, - appenderFactory, - outputFileFactory, - keyedTable.io(), - fileSizeBytes, - mask, - selectSchema, - flinkSchema, - keyedTable.spec(), - keyedTable.primaryKeySpec(), - upsert); - } - - @Override - public SortedPosDeleteWriter buildBasePosDeleteWriter( - long mask, long index, StructLike partitionKey) { - throw new UnsupportedOperationException("flink not support position delete"); - } - - public static FlinkTaskWriterBuilder buildFor(MixedTable table) { - return new FlinkTaskWriterBuilder(table); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatFileWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatFileWriter.java deleted file mode 100644 index da7bf7285d..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatFileWriter.java +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write; - -import org.apache.amoro.data.DataTreeNode; -import org.apache.amoro.flink.shuffle.ShuffleKey; -import org.apache.amoro.flink.shuffle.ShuffleRulePolicy; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.shade.guava32.com.google.common.collect.Sets; -import org.apache.amoro.table.MixedTable; -import org.apache.commons.lang.ArrayUtils; -import org.apache.flink.annotation.VisibleForTesting; -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.runtime.state.StateSnapshotContext; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.BoundedOneInput; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.flink.sink.TaskWriterFactory; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -/** This is mixed-format table includes writing file data to un keyed table and keyed table. 
*/ -public class MixedFormatFileWriter extends AbstractStreamOperator - implements OneInputStreamOperator, BoundedOneInput { - - private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(MixedFormatFileWriter.class); - - private final ShuffleRulePolicy shuffleRule; - - private final TaskWriterFactory taskWriterFactory; - private final int minFileSplitCount; - private final MixedFormatTableLoader tableLoader; - private final boolean submitEmptySnapshot; - - private transient TaskWriter writer; - private transient int subTaskId; - private transient int attemptId; - /** - * Load table in runtime, because that table's refresh method will be invoked in serialization. - * And it will set {@link org.apache.hadoop.security.UserGroupInformation#authenticationMethod} to - * KERBEROS if mixed-format's table is KERBEROS enabled. It will cause ugi relevant exception when - * deploy to yarn cluster. - */ - private transient MixedTable table; - - public MixedFormatFileWriter( - ShuffleRulePolicy shuffleRule, - TaskWriterFactory taskWriterFactory, - int minFileSplitCount, - MixedFormatTableLoader tableLoader, - boolean upsert, - boolean submitEmptySnapshot) { - this.shuffleRule = shuffleRule; - this.taskWriterFactory = taskWriterFactory; - this.minFileSplitCount = minFileSplitCount; - this.tableLoader = tableLoader; - this.submitEmptySnapshot = submitEmptySnapshot; - LOG.info( - "MixedFormatFileWriter is created with minFileSplitCount: {}, upsert: {}, submitEmptySnapshot: {}", - minFileSplitCount, - upsert, - submitEmptySnapshot); - } - - @Override - public void open() { - this.attemptId = getRuntimeContext().getAttemptNumber(); - table = MixedFormatUtils.loadMixedTable(tableLoader); - - long mask = getMask(subTaskId); - initTaskWriterFactory(mask); - - this.writer = table.io().doAs(taskWriterFactory::create); - } - - @Override - public void initializeState(StateInitializationContext context) throws Exception { - 
super.initializeState(context); - - this.subTaskId = getRuntimeContext().getIndexOfThisSubtask(); - } - - @Override - public void snapshotState(StateSnapshotContext context) throws Exception { - super.snapshotState(context); - } - - private void initTaskWriterFactory(long mask) { - if (taskWriterFactory instanceof MixedFormatRowDataTaskWriterFactory) { - ((MixedFormatRowDataTaskWriterFactory) taskWriterFactory).setMask(mask); - } - taskWriterFactory.initialize(subTaskId, attemptId); - } - - private long getMask(int subTaskId) { - Set initRootNodes; - if (shuffleRule != null) { - initRootNodes = shuffleRule.getSubtaskTreeNodes().get(subTaskId); - } else { - if (table.isKeyedTable()) { - initRootNodes = - IntStream.range(0, minFileSplitCount) - .mapToObj(index -> DataTreeNode.of(minFileSplitCount - 1, index)) - .collect(Collectors.toSet()); - } else { - initRootNodes = Sets.newHashSet(); - initRootNodes.add(DataTreeNode.of(0, 0)); - } - } - - return initRootNodes.iterator().next().mask(); - } - - @Override - public void prepareSnapshotPreBarrier(long checkpointId) throws Exception { - table - .io() - .doAs( - () -> { - completeAndEmitFiles(); - - this.writer = null; - return null; - }); - } - - @Override - public void endInput() throws Exception { - table - .io() - .doAs( - () -> { - completeAndEmitFiles(); - return null; - }); - } - - private void completeAndEmitFiles() throws IOException { - // For bounded stream, it may don't enable the checkpoint mechanism so we'd better to emit the - // remaining - // completed files to downstream before closing the writer so that we won't miss any of them. 
- if (writer != null) { - emit(writer.complete()); - } - } - - @Override - public void processElement(StreamRecord element) throws Exception { - RowData row = element.getValue(); - table - .io() - .doAs( - () -> { - if (writer == null) { - this.writer = taskWriterFactory.create(); - } - writer.write(row); - return null; - }); - } - - @Override - public void close() throws Exception { - super.close(); - if (writer != null) { - table - .io() - .doAs( - () -> { - writer.close(); - return null; - }); - writer = null; - } - } - - private void emit(WriteResult writeResult) { - if (shouldEmit(writeResult)) { - // Only emit a non-empty WriteResult to committer operator, thus avoiding submitting too much - // empty snapshots. - output.collect(new StreamRecord<>(writeResult)); - } - } - - /** - * Whether to emit the WriteResult. - * - * @param writeResult the WriteResult to emit - * @return true if the WriteResult should be emitted, or the WriteResult isn't empty, false only - * if the WriteResult is empty and the submitEmptySnapshot is false. 
- */ - private boolean shouldEmit(WriteResult writeResult) { - return submitEmptySnapshot - || (writeResult != null - && (!ArrayUtils.isEmpty(writeResult.dataFiles()) - || !ArrayUtils.isEmpty(writeResult.deleteFiles()) - || !ArrayUtils.isEmpty(writeResult.referencedDataFiles()))); - } - - @VisibleForTesting - public TaskWriter getWriter() { - return writer; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatLogWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatLogWriter.java deleted file mode 100644 index ed4dd10e1f..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatLogWriter.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.write; - -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.BoundedOneInput; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.table.data.RowData; - -/** This is a common abstract mixed-format log writer. */ -public abstract class MixedFormatLogWriter extends AbstractStreamOperator - implements OneInputStreamOperator, BoundedOneInput {} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatRowDataTaskWriterFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatRowDataTaskWriterFactory.java deleted file mode 100644 index d6f128a1b8..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatRowDataTaskWriterFactory.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.write; - -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.WriteOperationKind; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.flink.sink.TaskWriterFactory; -import org.apache.iceberg.io.TaskWriter; - -/** This is an mixed-format table writer factory. */ -public class MixedFormatRowDataTaskWriterFactory implements TaskWriterFactory { - - private final MixedTable table; - private final RowType flinkSchema; - - private final boolean overwrite; - - private transient Long mask = null; - private final transient Long transactionId = null; - private transient Integer taskId = null; - private transient Integer attemptId = null; - - public MixedFormatRowDataTaskWriterFactory( - MixedTable table, RowType flinkSchema, boolean overwrite) { - this.table = table; - this.flinkSchema = flinkSchema; - this.overwrite = overwrite; - } - - public void setMask(long mask) { - this.mask = mask; - } - - @Override - public void initialize(int taskId, int attemptId) { - this.taskId = taskId; - this.attemptId = attemptId; - } - - @Override - public TaskWriter create() { - Preconditions.checkNotNull( - mask, "Mask should be set first. 
Invoke setMask() before this method"); - - FlinkTaskWriterBuilder builder = - FlinkTaskWriterBuilder.buildFor(table) - .withTaskId(taskId) - .withMask(mask) - .withTransactionId(transactionId) - .withFlinkSchema(flinkSchema) - .withPartitionId(attemptId); - if (overwrite) { - return builder.buildWriter(WriteOperationKind.OVERWRITE); - } else { - return builder.buildWriter(WriteOperationKind.APPEND); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatWriter.java deleted file mode 100644 index e5dfb7a4a7..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/MixedFormatWriter.java +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.write; - -import org.apache.amoro.flink.metric.MetricsGenerator; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.metrics.Meter; -import org.apache.flink.metrics.MeterView; -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.runtime.state.StateSnapshotContext; -import org.apache.flink.streaming.api.graph.StreamConfig; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.BoundedOneInput; -import org.apache.flink.streaming.api.operators.Input; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.api.operators.Output; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.runtime.streamrecord.LatencyMarker; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.streaming.runtime.tasks.StreamTask; -import org.apache.flink.streaming.runtime.watermarkstatus.WatermarkStatus; -import org.apache.flink.table.data.RowData; -import org.apache.flink.util.OutputTag; - -import java.util.Objects; - -/** - * This is the general entry of an mixed-format writer that wraps different operators insides. 
- * - * @param - */ -public class MixedFormatWriter extends AbstractStreamOperator - implements OneInputStreamOperator, BoundedOneInput { - - private transient Meter meterFlowRate; - - private transient Meter meterSpeed; - - private final AbstractStreamOperator fileWriter; - private final MixedFormatLogWriter logWriter; - private final MetricsGenerator metricsGenerator; - - private static final String INFLUXDB_TAG_NAME = "mixed_format_task_id"; - - public MixedFormatWriter( - MixedFormatLogWriter logWriter, - AbstractStreamOperator fileWriter, - MetricsGenerator metricsGenerator) { - this.logWriter = logWriter; - this.fileWriter = fileWriter; - this.metricsGenerator = metricsGenerator; - } - - @Override - public void setup( - StreamTask containingTask, StreamConfig config, Output> output) { - super.setup(containingTask, config, output); - if (logWriter != null) { - logWriter.setup(containingTask, config, EMPTY_OUTPUT); - } - if (fileWriter != null) { - fileWriter.setup(containingTask, config, output); - } - } - - @Override - public void open() throws Exception { - ExecutionConfig.GlobalJobParameters globalJobParameters = - getRuntimeContext().getExecutionConfig().getGlobalJobParameters(); - String taskId = - Objects.nonNull(globalJobParameters.toMap().get(INFLUXDB_TAG_NAME)) - ? 
globalJobParameters.toMap().get(INFLUXDB_TAG_NAME) - : "null"; - // latency - if (metricsGenerator.enable()) { - getRuntimeContext() - .getMetricGroup() - .addGroup(INFLUXDB_TAG_NAME, taskId) - .gauge("record-latency", metricsGenerator::getCurrentLatency); - LOG.info("add metrics record-latency"); - } - if (metricsGenerator.isMetricEnable()) { - // speed - meterFlowRate = - getRuntimeContext() - .getMetricGroup() - .addGroup(INFLUXDB_TAG_NAME, taskId) - .meter("record-meter", new MeterView(60)); - LOG.info("add metrics record-meter"); - // rate of flow - meterSpeed = - getRuntimeContext() - .getMetricGroup() - .addGroup(INFLUXDB_TAG_NAME, taskId) - .meter("record-count", new MeterView(60)); - LOG.info("add metrics record-count"); - } - if (logWriter != null) { - logWriter.open(); - } - if (fileWriter != null) { - fileWriter.open(); - } - } - - @Override - public void initializeState(StateInitializationContext context) throws Exception { - if (logWriter != null) { - logWriter.initializeState(context); - } - if (fileWriter != null) { - fileWriter.initializeState(context); - } - } - - @Override - public void prepareSnapshotPreBarrier(long checkpointId) throws Exception { - if (logWriter != null) { - logWriter.prepareSnapshotPreBarrier(checkpointId); - } - if (fileWriter != null) { - fileWriter.prepareSnapshotPreBarrier(checkpointId); - } - } - - @Override - public void snapshotState(StateSnapshotContext context) throws Exception { - if (logWriter != null) { - logWriter.snapshotState(context); - } - if (fileWriter != null) { - fileWriter.snapshotState(context); - } - } - - @Override - public void endInput() throws Exception { - if (logWriter != null) { - logWriter.endInput(); - } - if (fileWriter instanceof BoundedOneInput) { - ((BoundedOneInput) fileWriter).endInput(); - } - } - - @Override - public void processElement(StreamRecord element) throws Exception { - if (metricsGenerator.isMetricEnable()) { - meterSpeed.markEvent(); - } - if (logWriter != null) { - 
logWriter.processElement(element); - } - if (fileWriter instanceof Input) { - ((Input) fileWriter).processElement(element); - } - metricsGenerator.recordLatency(element); - } - - @Override - public void processWatermark(Watermark mark) throws Exception { - if (logWriter != null) { - logWriter.processWatermark(mark); - } - if (fileWriter instanceof Input) { - ((Input) fileWriter).processWatermark(mark); - } - super.processWatermark(mark); - } - - @Override - public void close() throws Exception { - super.close(); - if (logWriter != null) { - logWriter.close(); - } - if (fileWriter != null) { - fileWriter.close(); - } - } - - private static final Output> EMPTY_OUTPUT = - new Output>() { - @Override - public void emitWatermark(Watermark watermark) {} - - @Override - public void emitWatermarkStatus(WatermarkStatus watermarkStatus) {} - - @Override - public void collect(OutputTag outputTag, StreamRecord streamRecord) {} - - @Override - public void collect(StreamRecord rowDataStreamRecord) {} - - @Override - public void emitLatencyMarker(LatencyMarker latencyMarker) {} - - @Override - public void close() {} - }; -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/AbstractHiddenLogWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/AbstractHiddenLogWriter.java deleted file mode 100644 index dfd04264b0..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/AbstractHiddenLogWriter.java +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write.hidden; - -import static org.apache.amoro.shade.guava32.com.google.common.base.Preconditions.checkNotNull; - -import org.apache.amoro.data.ChangeAction; -import org.apache.amoro.flink.shuffle.LogRecordV1; -import org.apache.amoro.flink.shuffle.ShuffleHelper; -import org.apache.amoro.flink.write.MixedFormatLogWriter; -import org.apache.amoro.log.FormatVersion; -import org.apache.amoro.log.LogData; -import org.apache.amoro.log.LogDataJsonSerialization; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.ListStateDescriptor; -import org.apache.flink.api.common.typeutils.base.IntSerializer; -import org.apache.flink.api.common.typeutils.base.StringSerializer; -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.runtime.state.StateSnapshotContext; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.Properties; - -/** - * This is an abstract log queue writer. 
Sending flip message to the kafka topic when the operator - * occurs restoring, through the {@link GlobalFlipCommitter} commit {@link - * GlobalFlipCommitter.CommitRequest} to the jobMaster. {@link this#processElement(StreamRecord)} - * will process records after all operators has sent flip message to the jobMaster and the jobMaster - * has finished handling these requests. - */ -public abstract class AbstractHiddenLogWriter extends MixedFormatLogWriter { - public static final Logger LOG = LoggerFactory.getLogger(AbstractHiddenLogWriter.class); - - private static final long serialVersionUID = 1L; - private int subtaskId; - private transient ListState hiddenLogJobIdentifyState; - private transient ListState parallelismState; - private transient Long ckpComplete; - private final Schema schema; - private final Properties producerConfig; - private final String topic; - private final ShuffleHelper helper; - protected final LogMsgFactory factory; - protected LogMsgFactory.Producer producer; - - private transient boolean shouldCheckFlipSent = false; - private transient boolean flipSentSucceed = false; - - private GlobalFlipCommitter flipCommitter; - private final LogData.FieldGetterFactory fieldGetterFactory; - protected transient LogDataJsonSerialization logDataJsonSerialization; - - protected FormatVersion logVersion = FormatVersion.FORMAT_VERSION_V1; - protected byte[] jobIdentify; - // start from 1L, epicNo is similar to checkpoint id. 
- protected long epicNo = 1L; - - protected transient LogData logFlip; - - public AbstractHiddenLogWriter( - Schema schema, - Properties producerConfig, - String topic, - LogMsgFactory factory, - LogData.FieldGetterFactory fieldGetterFactory, - byte[] jobId, - ShuffleHelper helper) { - this.schema = schema; - this.producerConfig = checkNotNull(producerConfig); - this.topic = checkNotNull(topic); - this.factory = factory; - this.fieldGetterFactory = fieldGetterFactory; - this.jobIdentify = jobId; - this.helper = helper; - } - - @Override - public void initializeState(StateInitializationContext context) throws Exception { - super.initializeState(context); - subtaskId = getRuntimeContext().getIndexOfThisSubtask(); - - hiddenLogJobIdentifyState = - context - .getOperatorStateStore() - .getListState( - new ListStateDescriptor<>( - "hidden-wal-writer-job-identify", StringSerializer.INSTANCE)); - - parallelismState = - context - .getOperatorStateStore() - .getListState( - new ListStateDescriptor<>( - "job-" + Arrays.toString(jobIdentify) + "-parallelism", - IntSerializer.INSTANCE)); - // init flip committer function - flipCommitter = - new GlobalFlipCommitter( - getRuntimeContext().getGlobalAggregateManager(), - new GlobalFlipCommitter.FlipCommitFunction( - getRuntimeContext().getNumberOfParallelSubtasks(), - schema, - fieldGetterFactory, - factory, - producerConfig, - topic, - helper)); - int parallelism = getRuntimeContext().getNumberOfParallelSubtasks(); - - if (context.isRestored() && parallelismSame(parallelism)) { - ckpComplete = context.getRestoredCheckpointId().getAsLong(); - - jobIdentify = - hiddenLogJobIdentifyState.get().iterator().next().getBytes(StandardCharsets.UTF_8); - - epicNo = ckpComplete; - - logFlip = - new LogRecordV1( - logVersion, jobIdentify, epicNo, true, ChangeAction.INSERT, new GenericRowData(0)); - // signal flip topic - shouldCheckFlipSent = true; - flipSentSucceed = flipCommitter.commit(subtaskId, logFlip); - // after send flip, epicNo + 1 
The epicNo of the data sent by the subsequent processElement() - // method will be 1 larger than the flip.epicNo. - epicNo++; - } else { - hiddenLogJobIdentifyState.clear(); - hiddenLogJobIdentifyState.add(new String(jobIdentify, StandardCharsets.UTF_8)); - } - - logDataJsonSerialization = - new LogDataJsonSerialization<>(checkNotNull(schema), checkNotNull(fieldGetterFactory)); - - producer = factory.createProducer(producerConfig, topic, logDataJsonSerialization, helper); - - parallelismState.clear(); - parallelismState.add(parallelism); - - LOG.info( - "initializeState subtaskId={}, restore={}, lastCkpComplete={}.", - subtaskId, - context.isRestored(), - ckpComplete); - } - - private boolean parallelismSame(int parallelism) throws Exception { - if (parallelismState == null - || parallelismState.get() == null - || !parallelismState.get().iterator().hasNext()) { - LOG.info("Can't find out parallelism state, ignore sending flips."); - return false; - } - int beforeParallelism = parallelismState.get().iterator().next(); - if (beforeParallelism != parallelism) { - LOG.warn( - "This job restored from state, but has changed parallelism, before:{}, now:{}," - + " So ignore sending flips now.", - beforeParallelism, - parallelism); - return false; - } - return true; - } - - @Override - public void open() throws Exception { - producer.open(); - } - - public void processElement(StreamRecord element) throws Exception { - int waitCount = 0; - // this is a sync step that will check sending flip succeed or not - while (shouldCheckFlip() && !alreadySentFlip()) { - Thread.sleep(100); - if (waitCount++ % 100 == 0) { - LOG.info( - "Still waiting for sending flip," - + " while the other subtasks have committed to Global State. 
this subtask is {}.", - subtaskId); - } - } - } - - private boolean alreadySentFlip() throws IOException { - if (!flipSentSucceed) { - flipSentSucceed = flipCommitter.hasCommittedFlip(logFlip); - } - return flipSentSucceed; - } - - private boolean shouldCheckFlip() { - return shouldCheckFlipSent; - } - - @Override - public void prepareSnapshotPreBarrier(long checkpointId) throws Exception { - super.prepareSnapshotPreBarrier(checkpointId); - LOG.info("prepareSnapshotPreBarrier subtaskId={}, checkpointId={}.", subtaskId, checkpointId); - } - - @Override - public void snapshotState(StateSnapshotContext context) throws Exception { - super.snapshotState(context); - producer.flush(); - LOG.info("snapshotState subtaskId={}, checkpointId={}.", subtaskId, context.getCheckpointId()); - epicNo++; - } - - @Override - public void close() throws Exception { - if (producer != null) { - producer.close(); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/GlobalFlipCommitter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/GlobalFlipCommitter.java deleted file mode 100644 index 5f64438709..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/GlobalFlipCommitter.java +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write.hidden; - -import static org.apache.amoro.shade.guava32.com.google.common.base.Preconditions.checkNotNull; - -import org.apache.amoro.flink.shuffle.ShuffleHelper; -import org.apache.amoro.log.LogData; -import org.apache.amoro.log.LogDataJsonSerialization; -import org.apache.flink.api.common.functions.AggregateFunction; -import org.apache.flink.runtime.taskexecutor.GlobalAggregateManager; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.io.Serializable; -import java.util.Arrays; -import java.util.Map; -import java.util.NavigableMap; -import java.util.Optional; -import java.util.Properties; -import java.util.Set; -import java.util.concurrent.ConcurrentSkipListMap; -import java.util.concurrent.CopyOnWriteArraySet; - -/** This is a global flip committer used by every log writer operator. 
*/ -public class GlobalFlipCommitter { - private static final Logger LOG = LoggerFactory.getLogger(GlobalFlipCommitter.class); - - private static final String AGGREGATE_NAME = "flip-committer"; - private final GlobalAggregateManager aggregateManager; - private final FlipCommitFunction flipCommitFunction; - - public GlobalFlipCommitter( - GlobalAggregateManager aggregateManager, FlipCommitFunction flipCommitFunction) { - this.aggregateManager = aggregateManager; - this.flipCommitFunction = flipCommitFunction; - } - - public boolean commit(int subtaskId, LogData logData) throws IOException { - Long committedEpicNo = - aggregateManager.updateGlobalAggregate( - AGGREGATE_NAME, new CommitRequest(subtaskId, logData), flipCommitFunction); - return committedEpicNo != null && committedEpicNo == logData.getEpicNo(); - } - - public boolean hasCommittedFlip(LogData logData) throws IOException { - Long committedEpicNo = - aggregateManager.updateGlobalAggregate( - AGGREGATE_NAME, new CommitRequest(null, logData, true), flipCommitFunction); - return committedEpicNo != null && committedEpicNo == logData.getEpicNo(); - } - - static class FlipCommitFunction - implements AggregateFunction { - private static final long serialVersionUID = 6399278898504357412L; - private final int numberOfTasks; - private final LogDataJsonSerialization logDataJsonSerialization; - private final LogMsgFactory factory; - private final Properties producerConfig; - private final String topic; - private final ShuffleHelper helper; - private transient LogMsgFactory.Producer producer; - - public FlipCommitFunction( - int numberOfTasks, - Schema schema, - LogData.FieldGetterFactory fieldGetterFactory, - LogMsgFactory factory, - Properties producerConfig, - String topic, - ShuffleHelper helper) { - this.numberOfTasks = numberOfTasks; - this.factory = checkNotNull(factory); - this.logDataJsonSerialization = - new LogDataJsonSerialization<>(checkNotNull(schema), checkNotNull(fieldGetterFactory)); - 
this.producerConfig = producerConfig; - this.topic = topic; - this.helper = helper; - } - - @Override - public LogGlobalState createAccumulator() { - return new LogGlobalState(); - } - - @Override - public LogGlobalState add(CommitRequest value, LogGlobalState globalState) { - if (value.checkCommitted) { - return globalState; - } - LOG.info("receive CommitRequest={}.", value); - NavigableMap accumulator = globalState.accumulators; - Long epicNo = value.logRecord.getEpicNo(); - accumulator.compute( - epicNo, - (cpId, subAccumulator) -> { - subAccumulator = subAccumulator == null ? new SubAccumulator() : subAccumulator; - if (!subAccumulator.hasCommittedFlip) { - subAccumulator.add(value.subtaskId, value); - } - return subAccumulator; - }); - - SubAccumulator subAccumulator = globalState.accumulators.get(epicNo); - if (subAccumulator.taskIds.size() == numberOfTasks) { - // this sync step, wait for sent records to topic. - try { - LOG.info( - "already receive {} commit requests. The last subtask received is {}.", - numberOfTasks, - value.subtaskId); - sendFlip(subAccumulator, value); - LOG.info("sent flip messages success, cost {}ms.", subAccumulator.cost.time()); - } catch (Exception e) { - LOG.error("sending flip messages to topic failed, subAccumulator:{}.", subAccumulator, e); - throw new RuntimeException(e); - } - } else { - LOG.info( - "As of now, global state has received a total of {} commit requests which are {}.", - subAccumulator.taskIds.size(), - Arrays.toString(subAccumulator.taskIds.toArray(new Integer[0]))); - } - return globalState; - } - - private void sendFlip(SubAccumulator subAccumulator, CommitRequest value) throws Exception { - if (null == producer) { - producer = factory.createProducer(producerConfig, topic, logDataJsonSerialization, helper); - producer.open(); - } - - producer.sendToAllPartitions(value.logRecord); - subAccumulator.committed(); - } - - @Override - public Long getResult(LogGlobalState globalState) { - // find the maximum epic 
number and has already committed flip message to log queue. - Optional result = - globalState.accumulators.descendingMap().entrySet().stream() - .filter(entry -> entry.getValue().hasCommittedFlip) - .findFirst() - .map(Map.Entry::getKey); - return result.orElse(null); - } - - @Override - public LogGlobalState merge(LogGlobalState a, LogGlobalState b) { - b.accumulators.forEach( - (cpId, acc) -> - a.accumulators.compute( - cpId, - (key, subAccumulator) -> { - subAccumulator = subAccumulator == null ? new SubAccumulator() : subAccumulator; - if (!subAccumulator.hasCommittedFlip) { - subAccumulator.merge(acc); - } - return subAccumulator; - })); - return a; - } - } - - static class CommitRequest implements Serializable { - private static final long serialVersionUID = 5469815741394678192L; - private final Integer subtaskId; - private final LogData logRecord; - // TURE means check committerFunction has sent flip to topic whether. - private final boolean checkCommitted; - - private CommitRequest(Integer subtaskId, LogData logRecord) { - this.subtaskId = subtaskId; - this.logRecord = logRecord; - this.checkCommitted = false; - } - - private CommitRequest(Integer subtaskId, LogData logRecord, Boolean checkCommitted) { - this.subtaskId = subtaskId; - this.logRecord = logRecord; - this.checkCommitted = checkCommitted; - } - - @Override - public String toString() { - return "CommitRequest{subtaskId=" - + subtaskId - + ", flip message=" - + logRecord.toString() - + "}"; - } - } - - static class LogGlobalState implements Serializable { - private static final long serialVersionUID = 9132207718335661833L; - // this map keys mean epicNo, which is not exactly equal to checkpoint id - private final NavigableMap accumulators; - - public LogGlobalState() { - accumulators = new ConcurrentSkipListMap<>(); - } - } - - private static class SubAccumulator implements Serializable { - private static final long serialVersionUID = 1252547231163598559L; - private final Set taskIds = new 
CopyOnWriteArraySet<>(); - private CommitRequest commitRequest = null; - // TRUE means has already sent flip msg to topic successfully. - private volatile boolean hasCommittedFlip = false; - // Mark how long it took to collect all commit requests. - private final Cost cost = new Cost(); - - void add(int taskId, CommitRequest commitRequest) { - this.taskIds.add(taskId); - if (null == this.commitRequest && null != commitRequest) { - this.commitRequest = commitRequest; - } - cost.markStart(); - } - - void committed() { - this.hasCommittedFlip = true; - cost.markEnd(); - } - - void merge(SubAccumulator subAccumulator) { - this.taskIds.addAll(subAccumulator.taskIds); - this.commitRequest = subAccumulator.commitRequest; - } - - static class Cost implements Serializable { - private static final long serialVersionUID = 1L; - Long start; - Long end; - - long time() { - return end - start; - } - - void markStart() { - if (start == null) { - start = System.currentTimeMillis(); - } - } - - void markEnd() { - if (end == null) { - end = System.currentTimeMillis(); - } - } - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/HiddenLogWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/HiddenLogWriter.java deleted file mode 100644 index cfcb5d6029..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/HiddenLogWriter.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write.hidden; - -import static org.apache.amoro.flink.shuffle.RowKindUtil.transformFromFlinkRowKind; - -import org.apache.amoro.flink.shuffle.LogRecordV1; -import org.apache.amoro.flink.shuffle.ShuffleHelper; -import org.apache.amoro.log.LogData; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.Schema; - -import java.util.Properties; - -/** This is a hidden log writer. */ -public class HiddenLogWriter extends AbstractHiddenLogWriter { - private static final long serialVersionUID = 1L; - - public HiddenLogWriter( - Schema schema, - Properties producerConfig, - String topic, - LogMsgFactory factory, - LogData.FieldGetterFactory fieldGetterFactory, - byte[] jobId, - ShuffleHelper helper) { - super(schema, producerConfig, topic, factory, fieldGetterFactory, jobId, helper); - } - - @Override - public void endInput() throws Exception { - producer.flush(); - } - - @Override - public void processElement(StreamRecord element) throws Exception { - // check send flip successfully or not - super.processElement(element); - - // continue process element - RowData rowData = element.getValue(); - LogData logData = - new LogRecordV1( - logVersion, - jobIdentify, - epicNo, - false, - transformFromFlinkRowKind(rowData.getRowKind()), - rowData); - producer.send(logData); - output.collect(new StreamRecord<>(rowData)); - } -} diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/LogMsgFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/LogMsgFactory.java deleted file mode 100644 index 91eec0dbb6..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/LogMsgFactory.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write.hidden; - -import org.apache.amoro.flink.shuffle.ShuffleHelper; -import org.apache.amoro.log.LogData; -import org.apache.amoro.log.LogDataJsonSerialization; -import org.apache.flink.configuration.Configuration; - -import java.io.Serializable; -import java.util.Properties; - -/** - * A factory creates log queue producers or consumers, e.g. kafka or pulsar distributed event - * streaming platform. 
- */ -public interface LogMsgFactory extends Serializable { - - Producer createProducer( - Properties producerConfig, - String topic, - LogDataJsonSerialization logDataJsonSerialization, - ShuffleHelper helper); - - Consumer createConsumer(); - - interface Producer { - void open() throws Exception; - - void send(LogData logData) throws Exception; - - void sendToAllPartitions(LogData logData) throws Exception; - - void flush(); - - void close() throws Exception; - } - - interface Consumer { - - default void open(Configuration parameters) throws Exception {} - - default void close() throws Exception {} - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/MixedFormatLogPartitioner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/MixedFormatLogPartitioner.java deleted file mode 100644 index 962e39e917..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/MixedFormatLogPartitioner.java +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.write.hidden; - -import static org.apache.flink.util.Preconditions.checkArgument; -import static org.apache.flink.util.Preconditions.checkNotNull; - -import org.apache.amoro.flink.shuffle.ShuffleHelper; -import org.apache.amoro.log.LogData; -import org.apache.commons.lang.ArrayUtils; -import org.apache.flink.table.data.RowData; -import org.apache.kafka.common.utils.Utils; - -import java.io.Serializable; -import java.util.concurrent.atomic.AtomicInteger; - -/** This is a log message partitioner that makes sure the record is without out-of-order. */ -public class MixedFormatLogPartitioner implements Serializable { - private static final long serialVersionUID = 9184708069203854226L; - private final AtomicInteger counter = new AtomicInteger(0); - private final ShuffleHelper helper; - - public MixedFormatLogPartitioner(ShuffleHelper shuffleHelper) { - this.helper = shuffleHelper; - } - - public int partition(LogData logData, int[] partitions) { - checkNotNull(logData, "record is null"); - checkArgument(ArrayUtils.isNotEmpty(partitions), "Partitions of the target topic is empty."); - - int partition; - if (helper == null || !helper.isPrimaryKeyExist()) { - int nextValue = nextValue(); - int part = Utils.toPositive(nextValue) % partitions.length; - partition = partitions[part]; - } else { - helper.open(); - long hash = helper.hashKeyValue((RowData) logData.getActualValue()); - partition = partitions[(int) (hash % partitions.length)]; - } - return partition; - } - - private int nextValue() { - return counter.getAndIncrement(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/kafka/HiddenKafkaFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/kafka/HiddenKafkaFactory.java deleted file mode 100644 index cd2ccffcdc..0000000000 --- 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/kafka/HiddenKafkaFactory.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write.hidden.kafka; - -import static org.apache.amoro.shade.guava32.com.google.common.base.Preconditions.checkNotNull; - -import org.apache.amoro.flink.shuffle.ShuffleHelper; -import org.apache.amoro.flink.write.hidden.LogMsgFactory; -import org.apache.amoro.flink.write.hidden.MixedFormatLogPartitioner; -import org.apache.amoro.log.LogDataJsonSerialization; - -import java.util.Properties; - -/** A factory creates kafka log queue producers or consumers. 
*/ -public class HiddenKafkaFactory implements LogMsgFactory { - private static final long serialVersionUID = -1L; - - @Override - public Producer createProducer( - Properties producerConfig, - String topic, - LogDataJsonSerialization logDataJsonSerialization, - ShuffleHelper helper) { - checkNotNull(topic); - return new HiddenKafkaProducer<>( - producerConfig, topic, logDataJsonSerialization, new MixedFormatLogPartitioner<>(helper)); - } - - @Override - public Consumer createConsumer() { - throw new UnsupportedOperationException("not supported right now"); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/kafka/HiddenKafkaProducer.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/kafka/HiddenKafkaProducer.java deleted file mode 100644 index 2479c80f2a..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/main/java/org/apache/amoro/flink/write/hidden/kafka/HiddenKafkaProducer.java +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.write.hidden.kafka; - -import static org.apache.kafka.clients.producer.ProducerConfig.TRANSACTIONAL_ID_CONFIG; - -import org.apache.amoro.flink.write.hidden.LogMsgFactory; -import org.apache.amoro.flink.write.hidden.MixedFormatLogPartitioner; -import org.apache.amoro.log.LogData; -import org.apache.amoro.log.LogDataJsonSerialization; -import org.apache.flink.streaming.connectors.kafka.FlinkKafkaErrorCode; -import org.apache.flink.streaming.connectors.kafka.FlinkKafkaException; -import org.apache.flink.streaming.connectors.kafka.internals.FlinkKafkaInternalProducer; -import org.apache.flink.util.ExceptionUtils; -import org.apache.flink.util.FlinkRuntimeException; -import org.apache.kafka.clients.producer.Callback; -import org.apache.kafka.clients.producer.ProducerRecord; -import org.apache.kafka.common.PartitionInfo; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; - -import java.time.Duration; -import java.util.ArrayList; -import java.util.Comparator; -import java.util.List; -import java.util.Properties; -import java.util.UUID; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -/** - * This is hidden log queue kafka producer that serializes {@link LogData} and emits to the kafka - * topic. - */ -public class HiddenKafkaProducer implements LogMsgFactory.Producer { - private static final Logger LOG = LoggerFactory.getLogger(HiddenKafkaProducer.class); - /** User defined properties for the Kafka Producer. */ - protected final Properties producerConfig; - - private final String topic; - - private final LogDataJsonSerialization logDataJsonSerialization; - - /** The callback than handles error propagation or logging callbacks. */ - @Nullable protected transient Callback callback; - /** Errors encountered in the async producer are stored here. 
*/ - @Nullable protected transient volatile Exception asyncException; - - private transient FlinkKafkaInternalProducer producer; - private transient FlinkKafkaInternalProducer transactionalProducer; - - private final MixedFormatLogPartitioner mixedFormatLogPartitioner; - private int[] partitions; - - public HiddenKafkaProducer( - Properties producerConfig, - String topic, - LogDataJsonSerialization logDataJsonSerialization, - MixedFormatLogPartitioner mixedFormatLogPartitioner) { - this.producerConfig = producerConfig; - this.topic = topic; - this.logDataJsonSerialization = logDataJsonSerialization; - this.mixedFormatLogPartitioner = mixedFormatLogPartitioner; - } - - @Override - public void open() throws Exception { - callback = - (metadata, exception) -> { - if (exception != null && asyncException == null) { - asyncException = exception; - } - acknowledgeMessage(); - }; - producer = createProducer(); - transactionalProducer = createTransactionalProducer(); - transactionalProducer.initTransactions(); - partitions = getPartitionsByTopic(topic, producer); - LOG.info("HiddenKafkaPartition topic:{}, partitions:{}.", topic, partitions); - } - - @Override - public void send(LogData logData) throws Exception { - checkErroneous(); - byte[] message = logDataJsonSerialization.serialize(logData); - int partition = mixedFormatLogPartitioner.partition(logData, partitions); - ProducerRecord producerRecord = - new ProducerRecord<>(topic, partition, null, null, message); - producer.send(producerRecord, callback); - } - - @Override - public void sendToAllPartitions(LogData logData) throws Exception { - checkErroneous(); - byte[] message = logDataJsonSerialization.serialize(logData); - List> recordList = - IntStream.of(partitions) - .mapToObj(i -> new ProducerRecord(topic, i, null, null, message)) - .collect(Collectors.toList()); - LOG.info("sending {} partitions with flip message={}.", recordList.size(), logData); - long start = System.currentTimeMillis(); - try { - 
transactionalProducer.beginTransaction(); - for (ProducerRecord producerRecord : recordList) { - checkErroneous(); - transactionalProducer.send(producerRecord, callback); - } - transactionalProducer.commitTransaction(); - LOG.info("finished flips sending, cost {}ms.", System.currentTimeMillis() - start); - } catch (Throwable e) { - LOG.error("", e); - transactionalProducer.abortTransaction(); - throw new FlinkRuntimeException(e); - } - } - - @Override - public void flush() { - producer.flush(); - } - - @Override - public void close() throws Exception { - try { - if (producer != null) { - producer.close(Duration.ofSeconds(0)); - } - transactionalProducer.close(Duration.ofSeconds(0)); - } catch (Exception e) { - asyncException = ExceptionUtils.firstOrSuppressed(e, asyncException); - } finally { - checkErroneous(); - } - } - - protected FlinkKafkaInternalProducer createTransactionalProducer() { - Properties transactionalProperties = new Properties(); - transactionalProperties.putAll(producerConfig); - transactionalProperties.computeIfAbsent( - TRANSACTIONAL_ID_CONFIG, o -> UUID.randomUUID().toString()); - return new FlinkKafkaInternalProducer<>(transactionalProperties); - } - - protected FlinkKafkaInternalProducer createProducer() { - return new FlinkKafkaInternalProducer<>(producerConfig); - } - - public static int[] getPartitionsByTopic( - String topic, org.apache.kafka.clients.producer.Producer producer) { - // the fetched list is immutable, so we're creating a mutable copy in order to sort it - List partitionsList = new ArrayList<>(producer.partitionsFor(topic)); - - // sort the partitions by partition id to make sure the fetched partition list is the same - // across subtasks - partitionsList.sort(Comparator.comparingInt(PartitionInfo::partition)); - - return partitionsList.stream().mapToInt(PartitionInfo::partition).toArray(); - } - - protected void checkErroneous() throws FlinkKafkaException { - Exception e = asyncException; - if (e != null) { - // prevent 
double throwing - asyncException = null; - throw new FlinkKafkaException( - FlinkKafkaErrorCode.EXTERNAL_ERROR, "Failed to send data to Kafka: " + e.getMessage(), e); - } - } - - /** - * ATTENTION to subclass implementors: When overriding this method, please always call - * {@code super.acknowledgeMessage()} to keep the invariants of the internal bookkeeping of the - * producer. If not, be sure to know what you are doing. - */ - protected void acknowledgeMessage() {} -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/DynamicTableSourceTestBase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/DynamicTableSourceTestBase.java deleted file mode 100644 index 4d0ec20839..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/DynamicTableSourceTestBase.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink; - -import org.apache.flink.api.common.eventtime.Watermark; -import org.apache.flink.api.common.eventtime.WatermarkGenerator; -import org.apache.flink.api.common.eventtime.WatermarkOutput; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.flink.table.connector.ChangelogMode; -import org.apache.flink.table.connector.source.SourceFunctionProvider; -import org.apache.flink.table.connector.source.abilities.SupportsWatermarkPushDown; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.planner.factories.TableFactoryHarness; - -import java.io.Serializable; - -public abstract class DynamicTableSourceTestBase extends TableFactoryHarness.ScanSourceBase - implements SupportsWatermarkPushDown, Serializable { - - public static final long serialVersionUID = 1L; - private WatermarkStrategy watermarkStrategy; - - @Override - public ChangelogMode getChangelogMode() { - return ChangelogMode.all(); - } - - @Override - public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderContext) { - init(); - return SourceFunctionProvider.of( - new SourceFunction() { - @Override - public void run(SourceContext ctx) { - WatermarkGenerator generator = - watermarkStrategy.createWatermarkGenerator(() -> null); - WatermarkOutput output = new TestWatermarkOutput(ctx); - doRun(generator, output, ctx); - } - - @Override - public void cancel() {} - }, - false); - } - - public void init() {} - - public abstract void doRun( - WatermarkGenerator generator, - WatermarkOutput output, - SourceFunction.SourceContext ctx); - - @Override - public void applyWatermark(WatermarkStrategy watermarkStrategy) { - this.watermarkStrategy = watermarkStrategy; - } - - public class TestWatermarkOutput implements WatermarkOutput, Serializable { - public static final long serialVersionUID = 1L; - public SourceFunction.SourceContext ctx; - - 
public TestWatermarkOutput(SourceFunction.SourceContext ctx) { - this.ctx = ctx; - } - - @Override - public void emitWatermark(Watermark watermark) { - ctx.emitWatermark( - new org.apache.flink.streaming.api.watermark.Watermark(watermark.getTimestamp())); - } - - @Override - public void markIdle() {} - - @Override - public void markActive() {} - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/FlinkTableTestBase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/FlinkTableTestBase.java deleted file mode 100644 index de8d1be1dd..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/FlinkTableTestBase.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink; - -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.write.MixedFormatRowDataTaskWriterFactory; -import org.apache.amoro.table.KeyedTable; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.TableIdentifier; -import org.apache.amoro.table.UnkeyedTable; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; - -import java.util.Arrays; - -/** - * This class contains flink table rowType schema and others, and will replace {@link FlinkTestBase} - * base class in the future. - */ -public interface FlinkTableTestBase { - default TaskWriter createTaskWriter(MixedTable mixedTable, RowType rowType) { - return mixedTable.isKeyedTable() - ? createKeyedTaskWriter((KeyedTable) mixedTable, rowType) - : createUnkeyedTaskWriter((UnkeyedTable) mixedTable, rowType); - } - - default TaskWriter createBaseTaskWriter(MixedTable mixedTable, RowType rowType) { - return mixedTable.isKeyedTable() - ? 
createKeyedTaskWriter((KeyedTable) mixedTable, rowType, true, 3) - : createUnkeyedTaskWriter((UnkeyedTable) mixedTable, rowType); - } - - default TaskWriter createKeyedTaskWriter(KeyedTable keyedTable, RowType rowType) { - return createKeyedTaskWriter(keyedTable, rowType, false, 3); - } - - default TaskWriter createKeyedTaskWriter( - KeyedTable keyedTable, RowType rowType, boolean overwrite, long mask) { - return createTaskWriter(keyedTable, rowType, overwrite, mask); - } - - default TaskWriter createUnkeyedTaskWriter(UnkeyedTable unkeyedTable, RowType rowType) { - return createTaskWriter(unkeyedTable, rowType, false, 3); - } - - default TaskWriter createTaskWriter( - MixedTable mixedTable, RowType rowType, boolean overwrite, long mask) { - MixedFormatRowDataTaskWriterFactory taskWriterFactory = - new MixedFormatRowDataTaskWriterFactory(mixedTable, rowType, overwrite); - taskWriterFactory.setMask(mask); - taskWriterFactory.initialize(0, 0); - return taskWriterFactory.create(); - } - - default void commit(MixedTable mixedTable, WriteResult result, boolean base) { - if (mixedTable.isKeyedTable()) { - KeyedTable keyedTable = mixedTable.asKeyedTable(); - if (base) { - AppendFiles baseAppend = keyedTable.baseTable().newAppend(); - Arrays.stream(result.dataFiles()).forEach(baseAppend::appendFile); - baseAppend.commit(); - } else { - AppendFiles changeAppend = keyedTable.changeTable().newAppend(); - Arrays.stream(result.dataFiles()).forEach(changeAppend::appendFile); - changeAppend.commit(); - } - } else { - if (!base) { - throw new IllegalArgumentException( - String.format( - "mixed-format table %s is a unkeyed table, can't commit to change table", - mixedTable.name())); - } - UnkeyedTable unkeyedTable = mixedTable.asUnkeyedTable(); - AppendFiles baseAppend = unkeyedTable.newAppend(); - Arrays.stream(result.dataFiles()).forEach(baseAppend::appendFile); - baseAppend.commit(); - } - } - - default MixedFormatTableLoader getTableLoader( - String catalogName, String amsUri, 
MixedTable mixedTable) { - TableIdentifier identifier = - TableIdentifier.of( - catalogName, mixedTable.id().getDatabase(), mixedTable.id().getTableName()); - InternalCatalogBuilder internalCatalogBuilder = InternalCatalogBuilder.builder().amsUri(amsUri); - return MixedFormatTableLoader.of(identifier, internalCatalogBuilder, mixedTable.properties()); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/FlinkTestBase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/FlinkTestBase.java deleted file mode 100644 index cd3501a079..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/FlinkTestBase.java +++ /dev/null @@ -1,324 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink; - -import static org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions.MIXED_ICEBERG_IDENTIFIER; -import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.KAFKA_CONTAINER; -import static org.apache.flink.table.api.config.TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.CatalogTestHelper; -import org.apache.amoro.catalog.TableTestBase; -import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions; -import org.apache.amoro.flink.write.MixedFormatRowDataTaskWriterFactory; -import org.apache.amoro.io.reader.GenericKeyedDataReader; -import org.apache.amoro.scan.CombinedScanTask; -import org.apache.amoro.scan.KeyedTableScanTask; -import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableList; -import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableMap; -import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableSet; -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; -import org.apache.amoro.table.KeyedTable; -import org.apache.flink.api.common.restartstrategy.RestartStrategies; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.runtime.state.StateBackend; -import org.apache.flink.runtime.state.filesystem.FsStateBackend; -import org.apache.flink.streaming.api.CheckpointingMode; -import org.apache.flink.streaming.api.environment.CheckpointConfig; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.table.api.TableSchema; -import 
org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.test.util.MiniClusterWithClientResource; -import org.apache.flink.types.Row; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.IdentityPartitionConverters; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.MiniClusterResource; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.junit.Before; -import org.junit.ClassRule; -import org.junit.Rule; -import org.junit.rules.TestName; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.time.LocalDateTime; -import java.time.ZoneOffset; -import java.util.Arrays; -import java.util.Collection; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.ExecutionException; - -public class FlinkTestBase extends TableTestBase { - private static final Logger LOG = LoggerFactory.getLogger(FlinkTestBase.class); - - @ClassRule - public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = - MiniClusterResource.createWithClassloaderCheckDisabled(); - - @Rule public TestName name = new TestName(); - - public static String metastoreUri; - - protected static final int KAFKA_PARTITION_NUMS = 1; - - private volatile StreamTableEnvironment tEnv = null; - protected Map props; - private volatile StreamExecutionEnvironment env = null; - public static final Schema TABLE_SCHEMA = 
BasicTableTestHelper.TABLE_SCHEMA; - public static final TableSchema FLINK_SCHEMA = - TableSchema.builder() - .field("id", DataTypes.INT()) - .field("name", DataTypes.STRING()) - .field("ts", DataTypes.BIGINT()) - .field("op_time", DataTypes.TIMESTAMP()) - .build(); - public static final RowType FLINK_ROW_TYPE = - (RowType) FLINK_SCHEMA.toRowDataType().getLogicalType(); - - public static InternalCatalogBuilder catalogBuilder; - - public FlinkTestBase(CatalogTestHelper catalogTestHelper, TableTestHelper tableTestHelper) { - super(catalogTestHelper, tableTestHelper); - } - - @Before - public void before() throws Exception { - metastoreUri = getCatalogUri(); - catalogBuilder = InternalCatalogBuilder.builder().amsUri(metastoreUri); - } - - public void config() { - props = Maps.newHashMap(); - props.put("type", MIXED_ICEBERG_IDENTIFIER); - props.put(CatalogFactoryOptions.AMS_URI.key(), metastoreUri); - } - - public static void prepare() throws Exception { - KAFKA_CONTAINER.start(); - } - - public static void shutdown() throws Exception { - KAFKA_CONTAINER.close(); - } - - protected StreamTableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - this.tEnv = - StreamTableEnvironment.create( - getEnv(), EnvironmentSettings.newInstance().inStreamingMode().build()); - Configuration configuration = tEnv.getConfig().getConfiguration(); - // set low-level key-value options - configuration.setString(TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED.key(), "true"); - } - } - } - return tEnv; - } - - protected StreamExecutionEnvironment getEnv() { - if (env == null) { - synchronized (this) { - if (env == null) { - StateBackend backend = - new FsStateBackend( - "file:///" + System.getProperty("java.io.tmpdir") + "/flink/backend"); - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.setParallelism(1); - 
env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE); - env.getCheckpointConfig().setCheckpointInterval(300); - env.getCheckpointConfig() - .enableExternalizedCheckpoints( - CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); - env.setStateBackend(backend); - env.setRestartStrategy(RestartStrategies.noRestart()); - } - } - } - return env; - } - - protected List sql(String query, Object... args) { - TableResult tableResult = getTableEnv().executeSql(String.format(query, args)); - tableResult - .getJobClient() - .ifPresent( - c -> { - try { - c.getJobExecutionResult().get(); - } catch (InterruptedException | ExecutionException e) { - throw new RuntimeException(e); - } - }); - try (CloseableIterator iter = tableResult.collect()) { - List results = Lists.newArrayList(iter); - return results; - } catch (Exception e) { - LOG.warn("Failed to collect table result", e); - return null; - } - } - - protected TableResult exec(String query, Object... args) { - return exec(getTableEnv(), query, args); - } - - protected static TableResult exec(TableEnvironment env, String query, Object... args) { - return env.executeSql(String.format(query, args)); - } - - protected Set sqlSet(String query, Object... 
args) { - return new HashSet<>(sql(query, args)); - } - - public static List read(KeyedTable table) { - CloseableIterable combinedScanTasks = table.newScan().planTasks(); - Schema schema = table.schema(); - GenericKeyedDataReader genericKeyedDataReader = - new GenericKeyedDataReader( - table.io(), - schema, - schema, - table.primaryKeySpec(), - null, - true, - IdentityPartitionConverters::convertConstant); - ImmutableList.Builder builder = ImmutableList.builder(); - for (CombinedScanTask combinedScanTask : combinedScanTasks) { - for (KeyedTableScanTask keyedTableScanTask : combinedScanTask.tasks()) { - builder.addAll(genericKeyedDataReader.readData(keyedTableScanTask)); - } - } - return builder.build(); - } - - public static Set toRecords(Collection rows) { - GenericRecord record = GenericRecord.create(TABLE_SCHEMA); - ImmutableSet.Builder b = ImmutableSet.builder(); - rows.forEach( - r -> - b.add( - record.copy( - ImmutableMap.of( - "id", - r.getField(0), - "name", - r.getField(1), - "ts", - r.getField(2), - "op_time", - r.getField(3))))); - return b.build(); - } - - public static String toWithClause(Map props) { - StringBuilder builder = new StringBuilder(); - builder.append("("); - int propCount = 0; - for (Map.Entry entry : props.entrySet()) { - if (propCount > 0) { - builder.append(","); - } - builder - .append("'") - .append(entry.getKey()) - .append("'") - .append("=") - .append("'") - .append(entry.getValue()) - .append("'"); - propCount++; - } - builder.append(")"); - return builder.toString(); - } - - protected static RowData createRowData( - Integer id, String name, String dateTime, RowKind rowKind) { - return GenericRowData.ofKind( - rowKind, - id, - StringData.fromString(name), - LocalDateTime.parse(dateTime).toInstant(ZoneOffset.UTC).toEpochMilli(), - TimestampData.fromLocalDateTime(LocalDateTime.parse(dateTime))); - } - - protected static RowData createRowData(RowKind rowKind, Object... 
objects) { - return GenericRowData.ofKind( - rowKind, - objects[0], - StringData.fromString((String) objects[1]), - objects[2], - TimestampData.fromLocalDateTime((LocalDateTime) objects[3])); - } - - protected static RowData createRowData(Integer id, String name, String dateTime) { - return createRowData(id, name, dateTime, RowKind.INSERT); - } - - protected static void commit(KeyedTable keyedTable, WriteResult result, boolean base) { - if (base) { - AppendFiles baseAppend = keyedTable.baseTable().newAppend(); - Arrays.stream(result.dataFiles()).forEach(baseAppend::appendFile); - baseAppend.commit(); - } else { - AppendFiles changeAppend = keyedTable.changeTable().newAppend(); - Arrays.stream(result.dataFiles()).forEach(changeAppend::appendFile); - changeAppend.commit(); - } - } - - protected static TaskWriter createKeyedTaskWriter( - KeyedTable keyedTable, RowType rowType, boolean base) { - return createKeyedTaskWriter(keyedTable, rowType, base, 3); - } - - protected static TaskWriter createKeyedTaskWriter( - KeyedTable keyedTable, RowType rowType, boolean base, long mask) { - MixedFormatRowDataTaskWriterFactory taskWriterFactory = - new MixedFormatRowDataTaskWriterFactory(keyedTable, rowType, base); - taskWriterFactory.setMask(mask); - taskWriterFactory.initialize(0, 0); - return taskWriterFactory.create(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/TestFlinkSchemaUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/TestFlinkSchemaUtil.java deleted file mode 100644 index 0eeee1bc9d..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/TestFlinkSchemaUtil.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink; - -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.iceberg.Schema; -import org.junit.Assert; -import org.junit.Test; - -import java.util.ArrayList; -import java.util.Map; - -public class TestFlinkSchemaUtil { - @Test - public void testFlinkSchemaToIcebergSchema() { - // flinkSchema with physical column,compute column, watermark - TableSchema flinkSchema = - TableSchema.builder() - .field("id", DataTypes.INT().notNull()) - .field("name", DataTypes.STRING()) - .field("ts", DataTypes.TIMESTAMP(6)) - .field("compute_id", DataTypes.INT(), "`id` + 5") - .field("proc", DataTypes.TIMESTAMP_LTZ(), "PROCTIME()") - // org.apache.iceberg.flink.TypeToFlinkType will convert Timestamp to Timestamp(6), so - // we cast datatype manually - .field("ts3", DataTypes.TIMESTAMP(3), "cast(`ts` as timestamp(3))") - .watermark("ts3", "`ts3` - INTERVAL '5' SECOND", DataTypes.TIMESTAMP(3)) - .build(); - - // get physicalSchema from tableSchema and convert into iceberg Schema - Schema icebergSchema = - org.apache.iceberg.flink.FlinkSchemaUtil.convert( - FlinkSchemaUtil.getPhysicalSchema(flinkSchema)); - - Map extraOptions = FlinkSchemaUtil.generateExtraOptionsFrom(flinkSchema); - - // Convert iceberg 
Schema with extraOptions into flink TableSchema - TableSchema fromIcebergSchema = - FlinkSchemaUtil.toSchema(icebergSchema, new ArrayList<>(), extraOptions); - - Assert.assertEquals(flinkSchema, fromIcebergSchema); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkAmoroCatalogITCase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkAmoroCatalogITCase.java deleted file mode 100644 index 92b2a286a1..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkAmoroCatalogITCase.java +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.catalog; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import org.apache.amoro.flink.table.AmoroCatalogITCaseBase; -import org.apache.amoro.formats.AmoroCatalogTestHelper; -import org.apache.amoro.formats.paimon.PaimonHadoopCatalogTestHelper; -import org.apache.amoro.formats.paimon.PaimonHiveCatalogTestHelper; -import org.apache.amoro.formats.paimon.PaimonTable; -import org.apache.amoro.hive.TestHMS; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.table.catalog.AbstractCatalog; -import org.apache.flink.table.catalog.Catalog; -import org.apache.flink.table.catalog.CatalogBaseTable; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.types.Row; -import org.apache.paimon.table.FileStoreTable; -import org.junit.After; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -import java.util.Optional; -import java.util.concurrent.TimeUnit; - -/** ITCase for Flink UnifiedCatalog based on AmoroCatalogTestBase */ -@RunWith(value = Parameterized.class) -public class FlinkAmoroCatalogITCase extends AmoroCatalogITCaseBase { - static final TestHMS TEST_HMS = new TestHMS(); - AbstractCatalog flinkCatalog; - - public FlinkAmoroCatalogITCase(AmoroCatalogTestHelper catalogTestHelper) { - super(catalogTestHelper); - } - - @Parameterized.Parameters(name = "{0}") - public static Object[] parameters() { - return new Object[] { - PaimonHiveCatalogTestHelper.defaultHelper(), PaimonHadoopCatalogTestHelper.defaultHelper() - }; - } - - @BeforeClass - public static void beforeAll() throws Exception { - TEST_HMS.before(); - } - - @Before - public void setup() throws Exception { - createDatabase(); - createTable(); - String catalog = "unified_catalog"; - exec( - 
"CREATE CATALOG %s WITH ('type'='unified', 'metastore.url'='%s')", - catalog, getCatalogUrl()); - exec("USE CATALOG %s", catalog); - exec("USE %s", TEST_DB_NAME); - Optional catalogOptional = getTableEnv().getCatalog(catalog); - assertTrue(catalogOptional.isPresent()); - flinkCatalog = (AbstractCatalog) catalogOptional.get(); - assertEquals(catalog, flinkCatalog.getName()); - } - - @After - public void teardown() { - TEST_HMS.after(); - if (flinkCatalog != null) { - flinkCatalog.close(); - } - } - - public void createDatabase() { - try { - catalogTestHelper.createDatabase(TEST_DB_NAME); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - public void createTable() { - try { - catalogTestHelper.createTable(TEST_DB_NAME, TEST_TABLE_NAME); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - @Test - public void testTableExists() throws Exception { - CatalogBaseTable catalogBaseTable = - flinkCatalog.getTable(new ObjectPath(TEST_DB_NAME, TEST_TABLE_NAME)); - assertNotNull(catalogBaseTable); - PaimonTable paimonTable = - (PaimonTable) catalogTestHelper.amoroCatalog().loadTable(TEST_DB_NAME, TEST_TABLE_NAME); - FileStoreTable originalPaimonTable = (FileStoreTable) paimonTable.originalTable(); - assertEquals( - originalPaimonTable.schema().fields().size(), - catalogBaseTable.getUnresolvedSchema().getColumns().size()); - } - - @Test - public void testInsertAndQuery() throws Exception { - exec("INSERT INTO %s SELECT 1, 'Lily', 1234567890", TEST_TABLE_NAME); - TableResult tableResult = - exec("select * from %s /*+OPTIONS('monitor-interval'='1s')*/ ", TEST_TABLE_NAME); - - tableResult.await(30, TimeUnit.SECONDS); - - Row actualRow = tableResult.collect().next(); - assertEquals(Row.of(1, "Lily", 1234567890).toString(), actualRow.toString()); - } - - @Test - public void testSwitchCurrentCatalog() { - String memCatalog = "mem_catalog"; - exec("create catalog %s with('type'='generic_in_memory')", memCatalog); - exec( - "create table 
%s.`default`.datagen_table(\n" - + " a int,\n" - + " b varchar" - + ") with(\n" - + " 'connector'='datagen',\n" - + " 'number-of-rows'='1'\n" - + ")", - memCatalog); - TableResult tableResult = exec("select * from mem_catalog.`default`.datagen_table"); - assertNotNull(tableResult.collect().next()); - exec("use catalog %s", memCatalog); - tableResult = exec("select * from datagen_table"); - assertNotNull(tableResult.collect().next()); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkCatalogContext.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkCatalogContext.java deleted file mode 100644 index 86d134322d..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkCatalogContext.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.catalog; - -import static org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions.AMS_URI; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.TABLE_FORMAT; - -import org.apache.amoro.TableFormat; -import org.apache.amoro.TestAms; -import org.apache.amoro.api.CatalogMeta; -import org.apache.amoro.flink.catalog.factories.FlinkUnifiedCatalogFactory; -import org.apache.amoro.hive.TestHMS; -import org.apache.amoro.hive.catalog.HiveCatalogTestHelper; -import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.Schema; -import org.apache.flink.table.catalog.CatalogTable; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.ResolvedCatalogTable; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.factories.FactoryUtil; -import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; -import org.junit.jupiter.params.provider.Arguments; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; -import java.util.stream.Stream; - -public class FlinkCatalogContext { - - static final TestHMS TEST_HMS = new TestHMS(); - static final TestAms TEST_AMS = new TestAms(); - static final FlinkUnifiedCatalogFactory FLINK_UNIFIED_CATALOG_FACTORY = - new FlinkUnifiedCatalogFactory(); - - static ResolvedSchema resolvedSchema = - ResolvedSchema.of( - Column.physical("name", DataTypes.STRING()), Column.physical("age", DataTypes.INT())); - static Schema schema = Schema.newBuilder().fromResolvedSchema(resolvedSchema).build(); - - ObjectPath objectPath = new ObjectPath("default", "test_hive_from_flink"); - - static Stream getFlinkCatalogAndTable() { - return Stream.of( - Arguments.of( - initFlinkCatalog(TableFormat.MIXED_HIVE), - 
generateFlinkTable(TableFormat.MIXED_HIVE.toString()), - TableFormat.MIXED_HIVE), - Arguments.of( - initFlinkCatalog(TableFormat.MIXED_ICEBERG), - generateFlinkTable(TableFormat.MIXED_ICEBERG.toString()), - TableFormat.MIXED_ICEBERG), - Arguments.of( - initFlinkCatalog(TableFormat.ICEBERG), - generateFlinkTable(TableFormat.ICEBERG.toString()), - TableFormat.ICEBERG), - Arguments.of( - initFlinkCatalog(TableFormat.PAIMON), - generateFlinkTable(TableFormat.PAIMON.toString()), - TableFormat.PAIMON)); - } - - static ResolvedCatalogTable generateFlinkTable(String tableFormat) { - return new ResolvedCatalogTable( - CatalogTable.of( - schema, - "Flink managed table", - new ArrayList<>(), - new HashMap() { - { - put(TABLE_FORMAT.key(), tableFormat); - } - }), - resolvedSchema); - } - - void initial() throws Exception { - TEST_HMS.before(); - TEST_AMS.before(); - } - - void close() { - TEST_AMS.after(); - TEST_HMS.after(); - } - - static FlinkUnifiedCatalog initFlinkCatalog(TableFormat tableFormat) { - FlinkUnifiedCatalog flinkUnifiedCatalog; - Map factoryOptions = Maps.newHashMap(); - CatalogMeta meta = - HiveCatalogTestHelper.build(TEST_HMS.getHiveConf(), tableFormat) - .buildCatalogMeta(TEST_HMS.getWareHouseLocation()); - meta.setCatalogName(tableFormat.name().toLowerCase()); - - TEST_AMS.getAmsHandler().dropCatalog(meta.getCatalogName()); - TEST_AMS.getAmsHandler().createCatalog(meta); - - factoryOptions.put(AMS_URI.key(), TEST_AMS.getServerUrl() + "/" + meta.getCatalogName()); - final FactoryUtil.DefaultCatalogContext context = - new FactoryUtil.DefaultCatalogContext( - "FLINK_" + tableFormat, - factoryOptions, - new Configuration(), - FlinkCatalogContext.class.getClassLoader()); - flinkUnifiedCatalog = - (FlinkUnifiedCatalog) FLINK_UNIFIED_CATALOG_FACTORY.createCatalog(context); - flinkUnifiedCatalog.open(); - return flinkUnifiedCatalog; - } - - HiveMetaStoreClient getHMSClient() { - return TEST_HMS.getHiveClient(); - } -} diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkUnifiedCatalogITCase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkUnifiedCatalogITCase.java deleted file mode 100644 index 6e9a654bf5..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/FlinkUnifiedCatalogITCase.java +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.catalog; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.catalog.CatalogTestHelper; -import org.apache.amoro.flink.table.CatalogITCaseBase; -import org.apache.amoro.hive.TestHMS; -import org.apache.amoro.hive.catalog.HiveCatalogTestHelper; -import org.apache.amoro.table.TableIdentifier; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.table.catalog.AbstractCatalog; -import org.apache.flink.table.catalog.Catalog; -import org.apache.flink.table.catalog.CatalogBaseTable; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.exceptions.TableNotExistException; -import org.apache.flink.types.Row; -import org.junit.After; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -import java.util.Optional; -import java.util.concurrent.TimeUnit; - -@RunWith(value = Parameterized.class) -public class FlinkUnifiedCatalogITCase extends CatalogITCaseBase { - static final TestHMS TEST_HMS = new TestHMS(); - AbstractCatalog flinkCatalog; - TableIdentifier identifier; - - public FlinkUnifiedCatalogITCase(CatalogTestHelper catalogTestHelper) { - super(catalogTestHelper, new BasicTableTestHelper(true, false)); - } - - @Parameterized.Parameters(name = "catalogTestHelper = {0}") - public static Object[][] parameters() { - return new Object[][] { - {new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf())}, - {new HiveCatalogTestHelper(TableFormat.MIXED_ICEBERG, TEST_HMS.getHiveConf())}, - {new HiveCatalogTestHelper(TableFormat.ICEBERG, TEST_HMS.getHiveConf())} - }; - } - - @BeforeClass - public static void beforeAll() throws 
Exception { - TEST_HMS.before(); - } - - @Before - public void setup() throws Exception { - String catalog = "unified_catalog"; - exec("CREATE CATALOG %s WITH ('type'='unified', 'ams.uri'='%s')", catalog, getCatalogUri()); - exec("USE CATALOG %s", catalog); - exec("USE %s", tableTestHelper().id().getDatabase()); - Optional catalogOptional = getTableEnv().getCatalog(catalog); - assertTrue(catalogOptional.isPresent()); - flinkCatalog = (AbstractCatalog) catalogOptional.get(); - assertEquals(catalog, flinkCatalog.getName()); - identifier = tableTestHelper().id(); - } - - @After - public void teardown() { - TEST_HMS.after(); - if (flinkCatalog != null) { - flinkCatalog.close(); - } - } - - @Test - public void testTableExists() throws TableNotExistException { - CatalogBaseTable catalogBaseTable = - flinkCatalog.getTable(new ObjectPath(identifier.getDatabase(), identifier.getTableName())); - assertNotNull(catalogBaseTable); - assertEquals( - tableTestHelper().tableSchema().columns().size(), - catalogBaseTable.getUnresolvedSchema().getColumns().size()); - } - - @Test - public void testInsertAndQuery() throws Exception { - exec( - "INSERT INTO %s SELECT 1, 'Lily', 1234567890, TO_TIMESTAMP('2020-01-01 01:02:03')", - identifier.getTableName()); - TableResult tableResult = - exec("select * from %s /*+OPTIONS('monitor-interval'='1s')*/ ", identifier.getTableName()); - - tableResult.await(30, TimeUnit.SECONDS); - - Row actualRow = tableResult.collect().next(); - assertEquals( - Row.of(1, "Lily", 1234567890L, "2020-01-01T01:02:03").toString(), actualRow.toString()); - } - - @Test - public void testSwitchCurrentCatalog() { - String memCatalog = "mem_catalog"; - exec("create catalog %s with('type'='generic_in_memory')", memCatalog); - exec( - "create table %s.`default`.datagen_table(\n" - + " a int,\n" - + " b varchar" - + ") with(\n" - + " 'connector'='datagen',\n" - + " 'number-of-rows'='1'\n" - + ")", - memCatalog); - TableResult tableResult = exec("select * from 
mem_catalog.`default`.datagen_table"); - assertNotNull(tableResult.collect().next()); - exec("use catalog %s", memCatalog); - tableResult = exec("select * from datagen_table"); - assertNotNull(tableResult.collect().next()); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestFlinkUnifiedCatalogs.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestFlinkUnifiedCatalogs.java deleted file mode 100644 index 4536db0c86..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestFlinkUnifiedCatalogs.java +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.catalog; - -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.TABLE_FORMAT; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import org.apache.amoro.TableFormat; -import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.Schema; -import org.apache.flink.table.catalog.CatalogBaseTable; -import org.apache.flink.table.catalog.CatalogDatabaseImpl; -import org.apache.flink.table.catalog.CatalogTable; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.ResolvedCatalogTable; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; -import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; -import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; -import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; -import org.apache.flink.table.catalog.exceptions.TableNotExistException; -import org.apache.thrift.TException; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.MethodSource; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; - -class TestFlinkUnifiedCatalogs { - static FlinkCatalogContext flinkCatalogContext = new FlinkCatalogContext(); - - @BeforeAll - public static void setupCatalogMeta() throws Exception { - flinkCatalogContext.initial(); - } - - @AfterAll - public static void tearDown() { - flinkCatalogContext.close(); - } - - @ParameterizedTest - 
@MethodSource("org.apache.amoro.flink.catalog.FlinkCatalogContext#getFlinkCatalogAndTable") - void testListDatabases(FlinkUnifiedCatalog flinkUnifiedCatalog) throws TException { - List expects = flinkCatalogContext.getHMSClient().getAllDatabases(); - assertEquals(expects, flinkUnifiedCatalog.listDatabases()); - } - - @ParameterizedTest - @MethodSource("org.apache.amoro.flink.catalog.FlinkCatalogContext#getFlinkCatalogAndTable") - void testDatabaseExists(FlinkUnifiedCatalog flinkUnifiedCatalog) { - assertTrue(flinkUnifiedCatalog.databaseExists("default")); - assertFalse(flinkUnifiedCatalog.databaseExists("not_exists_db")); - } - - @ParameterizedTest - @MethodSource("org.apache.amoro.flink.catalog.FlinkCatalogContext#getFlinkCatalogAndTable") - void testCreateAndDropDatabase(FlinkUnifiedCatalog flinkUnifiedCatalog) - throws DatabaseAlreadyExistException, DatabaseNotEmptyException, DatabaseNotExistException { - flinkUnifiedCatalog.createDatabase( - "test", new CatalogDatabaseImpl(Collections.emptyMap(), "test"), false); - assertTrue(flinkUnifiedCatalog.databaseExists("test")); - - flinkUnifiedCatalog.dropDatabase("test", false); - assertFalse(flinkUnifiedCatalog.databaseExists("test")); - } - - @ParameterizedTest - @MethodSource("org.apache.amoro.flink.catalog.FlinkCatalogContext#getFlinkCatalogAndTable") - void testAlterDatabase( - FlinkUnifiedCatalog flinkUnifiedCatalog, CatalogTable table, TableFormat tableFormat) - throws DatabaseNotExistException { - try { - flinkUnifiedCatalog.alterDatabase( - "default", new CatalogDatabaseImpl(Collections.emptyMap(), "default"), false); - } catch (UnsupportedOperationException e) { - // Mixed-format,Iceberg and paimon catalog does not support altering database. 
- if (!tableFormat.in( - TableFormat.MIXED_HIVE, - TableFormat.MIXED_ICEBERG, - TableFormat.ICEBERG, - TableFormat.PAIMON)) { - throw e; - } - } - } - - @ParameterizedTest - @MethodSource("org.apache.amoro.flink.catalog.FlinkCatalogContext#getFlinkCatalogAndTable") - void testCreateGetAndDropTable( - FlinkUnifiedCatalog flinkUnifiedCatalog, CatalogTable table, TableFormat tableFormat) - throws TableAlreadyExistException, DatabaseNotExistException, TableNotExistException { - ObjectPath objectPath = flinkCatalogContext.objectPath; - - flinkUnifiedCatalog.createTable(flinkCatalogContext.objectPath, table, false); - assertTrue(flinkUnifiedCatalog.tableExists(objectPath)); - - CatalogBaseTable actualTable = flinkUnifiedCatalog.getTable(objectPath); - assertEquals(table.getUnresolvedSchema(), actualTable.getUnresolvedSchema()); - assertEquals(tableFormat.toString(), actualTable.getOptions().get(TABLE_FORMAT.key())); - - flinkUnifiedCatalog.dropTable(objectPath, false); - assertFalse(flinkUnifiedCatalog.tableExists(objectPath)); - } - - @ParameterizedTest - @MethodSource("org.apache.amoro.flink.catalog.FlinkCatalogContext#getFlinkCatalogAndTable") - void testAlterTable( - FlinkUnifiedCatalog flinkUnifiedCatalog, CatalogTable table, TableFormat tableFormat) - throws TableNotExistException, TableAlreadyExistException, DatabaseNotExistException { - try { - flinkUnifiedCatalog.createTable(flinkCatalogContext.objectPath, table, true); - - ResolvedSchema newResolvedSchema = - ResolvedSchema.of( - Column.physical("name", DataTypes.STRING()), - Column.physical("age", DataTypes.INT()), - Column.physical("address", DataTypes.STRING())); - String comment = "Flink new Table"; - Map newProperties = Maps.newHashMap(); - newProperties.put("new_key", "new_value"); - - CatalogBaseTable newTable = - new ResolvedCatalogTable( - CatalogTable.of( - Schema.newBuilder().fromResolvedSchema(newResolvedSchema).build(), - comment, - new ArrayList<>(), - newProperties), - newResolvedSchema); - try { 
- flinkUnifiedCatalog.alterTable(flinkCatalogContext.objectPath, newTable, false); - } catch (UnsupportedOperationException e) { - // https://github.com/apache/amoro/issues/2 altering Mixed format table is not supported. - // Altering Iceberg schema is also not supported yet. - if (!tableFormat.in( - TableFormat.MIXED_ICEBERG, TableFormat.MIXED_HIVE, TableFormat.ICEBERG)) { - throw e; - } - } - } finally { - flinkUnifiedCatalog.dropTable(flinkCatalogContext.objectPath, true); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestMixedCatalog.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestMixedCatalog.java deleted file mode 100644 index 5e4bb582ba..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestMixedCatalog.java +++ /dev/null @@ -1,589 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.catalog; - -import static org.apache.amoro.flink.FlinkSchemaUtil.COMPUTED_COLUMNS; -import static org.apache.amoro.flink.FlinkSchemaUtil.FLINK_PREFIX; -import static org.apache.amoro.flink.FlinkSchemaUtil.WATERMARK; -import static org.apache.flink.table.api.config.TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED; -import static org.apache.flink.table.descriptors.DescriptorProperties.DATA_TYPE; -import static org.apache.flink.table.descriptors.DescriptorProperties.EXPR; -import static org.apache.flink.table.descriptors.DescriptorProperties.NAME; -import static org.apache.flink.table.descriptors.DescriptorProperties.WATERMARK_ROWTIME; -import static org.apache.flink.table.descriptors.DescriptorProperties.WATERMARK_STRATEGY_DATA_TYPE; -import static org.apache.flink.table.descriptors.DescriptorProperties.WATERMARK_STRATEGY_EXPR; - -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.catalog.CatalogTestBase; -import org.apache.amoro.flink.catalog.factories.CatalogFactoryOptions; -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.TableIdentifier; -import org.apache.flink.api.common.restartstrategy.RestartStrategies; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.runtime.state.StateBackend; -import org.apache.flink.runtime.state.filesystem.FsStateBackend; -import org.apache.flink.streaming.api.CheckpointingMode; -import org.apache.flink.streaming.api.environment.CheckpointConfig; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableResult; -import 
org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.flink.util.CollectionUtil; -import org.apache.iceberg.flink.MiniClusterResource; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ExecutionException; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -/** - * Test cases for mixed catalog factories, including: - * CatalogFactoryOptions.MIXED_ICEBERG_IDENTIFIER, CatalogFactoryOptions.MIXED_HIVE_IDENTIFIER, - * CatalogFactoryOptions.LEGACY_MIXED_IDENTIFIER - */ -@RunWith(value = Parameterized.class) -public class TestMixedCatalog extends CatalogTestBase { - private String catalogName; - private String catalogFactoryType; - private static final Logger LOG = LoggerFactory.getLogger(TestMixedCatalog.class); - - public TestMixedCatalog(String catalogFactoryType) { - super(new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG)); - this.catalogFactoryType = catalogFactoryType; - this.catalogName = catalogFactoryType + "_catalog"; - } - - @Parameterized.Parameters(name = "catalogFactoryType = {0}") - public static Object[] parameters() { - return new Object[] { - CatalogFactoryOptions.MIXED_ICEBERG_IDENTIFIER, CatalogFactoryOptions.MIXED_HIVE_IDENTIFIER - }; - } - - @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); - protected Map props; - - private static final String DB = TableTestHelper.TEST_DB_NAME; - private static final String TABLE = TableTestHelper.TEST_TABLE_NAME; - private volatile StreamExecutionEnvironment env = null; - private volatile 
StreamTableEnvironment tEnv = null; - - @Before - public void before() throws Exception { - props = Maps.newHashMap(); - props.put("type", catalogFactoryType); - props.put(CatalogFactoryOptions.AMS_URI.key(), getCatalogUri()); - sql("CREATE CATALOG " + catalogName + " WITH %s", toWithClause(props)); - sql("USE CATALOG " + catalogName); - sql("CREATE DATABASE " + catalogName + "." + DB); - } - - @After - public void after() { - sql("DROP TABLE IF EXISTS " + catalogName + "." + DB + "." + TABLE); - sql("DROP DATABASE IF EXISTS " + catalogName + "." + DB); - Assert.assertTrue(CollectionUtil.isNullOrEmpty(getMixedFormatCatalog().listDatabases())); - sql("USE CATALOG default_catalog"); - sql("DROP CATALOG " + catalogName); - } - - @Test - public void testMixedCatalog() { - String[] catalogs = getTableEnv().listCatalogs(); - Assert.assertArrayEquals( - Arrays.stream(catalogs).sorted().toArray(), - Stream.of("default_catalog", catalogName).sorted().toArray()); - } - - @Test - public void testDDL() { - sql( - "CREATE TABLE " - + catalogName - + "." - + DB - + "." - + TABLE - + " (" - + " id INT," - + " name STRING," - + " t TIMESTAMP," - + " PRIMARY KEY (id) NOT ENFORCED " - + ") PARTITIONED BY(t) "); - sql("USE " + catalogName + "." + DB); - sql("SHOW tables"); - - Assert.assertTrue( - getMixedFormatCatalog() - .loadTable(TableIdentifier.of(catalogName, DB, TABLE)) - .isKeyedTable()); - } - - @Test - public void testComputeIndex() { - // if compute column before any physical column, will throw exception. - Assert.assertThrows( - org.apache.flink.table.api.TableException.class, - () -> - sql( - "CREATE TABLE " - + catalogName - + "." - + DB - + "." - + TABLE - + " (" - + " id INT," - + " compute_id as id+5 ," - + " proc as PROCTIME() ," - + " name STRING" - + ") ")); - - // compute column must come after all the physical columns - sql( - "CREATE TABLE " - + catalogName - + "." - + DB - + "." 
- + TABLE - + " (" - + " id INT," - + " proc as PROCTIME() " - + ") "); - } - - @Test - public void testDDLWithVirtualColumn() throws IOException { - // create mixed-format table with compute columns and watermark under mixed-format catalog - // org.apache.iceberg.flink.TypeToFlinkType will convert Timestamp to Timestamp(6), so we cast - // datatype manually - sql( - "CREATE TABLE " - + catalogName - + "." - + DB - + "." - + TABLE - + " (" - + " id INT," - + " name STRING," - + " t TIMESTAMP," - + " t3 as cast(t as TIMESTAMP(3))," - + " compute_id as id+5 ," - + " proc as PROCTIME() ," - + " watermark FOR t3 AS t3 - INTERVAL '5' SECOND, " - + " PRIMARY KEY (id) NOT ENFORCED " - + ") PARTITIONED BY(t) "); - - Map properties = - getMixedFormatCatalog().loadTable(TableIdentifier.of(catalogName, DB, TABLE)).properties(); - - // index for compute columns - int[] computedIndex = {1, 2, 3}; - Arrays.stream(computedIndex) - .forEach( - x -> { - Assert.assertTrue( - properties.containsKey(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, x, NAME))); - Assert.assertTrue( - properties.containsKey(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, x, EXPR))); - Assert.assertTrue( - properties.containsKey( - compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, x, DATA_TYPE))); - }); - - Assert.assertTrue( - properties.containsKey(compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_ROWTIME))); - Assert.assertTrue( - properties.containsKey(compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_STRATEGY_EXPR))); - Assert.assertTrue( - properties.containsKey(compoundKey(FLINK_PREFIX, WATERMARK, WATERMARK_STRATEGY_DATA_TYPE))); - - List result = sql("DESC " + catalogName + "." + DB + "." + TABLE + ""); - Assert.assertEquals(6, result.size()); - } - - @Test - public void testDMLWithVirtualColumn() throws IOException { - // create mixed-format table with compute columns under mixed-format catalog - sql( - "CREATE TABLE " - + catalogName - + "." - + DB - + "." 
- + TABLE - + " (" - + " id INT," - + " t TIMESTAMP(6)," - + " compute_id as id+5 ," - + " proc as PROCTIME(), " - + " PRIMARY KEY (id) NOT ENFORCED " - + ") PARTITIONED BY(t) "); - - // insert values into mixed-format table - insertValue(); - - // select from mixed-format table with compute columns under mixed-format catalog - List rows = - sql( - "SELECT * FROM " - + catalogName - + "." - + DB - + "." - + TABLE - + " /*+ OPTIONS(" - + "'streaming'='false'" - + ") */"); - checkRows(rows); - } - - @Test - public void testReadNotMatchColumn() throws IOException { - // create mixed-format table with compute columns under mixed-format catalog - sql( - "CREATE TABLE " - + catalogName - + "." - + DB - + "." - + TABLE - + " (" - + " id INT," - + " t TIMESTAMP(6)," - + " proc as PROCTIME(), " - + " compute_id as id+5 ," - + " PRIMARY KEY (id) NOT ENFORCED " - + ") PARTITIONED BY(t) "); - - MixedTable amoroTable = - getMixedFormatCatalog().loadTable(TableIdentifier.of(catalogName, DB, TABLE)); - String beforeExpr = - amoroTable.properties().get(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, 2, EXPR)); - // change property "flink.computed-column.2.expr" from "`id` +5" to "`newId` +5" - String afterExpr = "`newId` +5"; - amoroTable - .updateProperties() - .set(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, 2, EXPR), afterExpr) - .commit(); - - Assert.assertNotEquals( - beforeExpr, - amoroTable.properties().get(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, 2, EXPR))); - - // property for expr do not match any columns in amoro, will throw exception. - Assert.assertThrows( - IllegalStateException.class, - () -> sql("DESC " + catalogName + "." + DB + "." + TABLE + "")); - amoroTable - .updateProperties() - .set(compoundKey(FLINK_PREFIX, COMPUTED_COLUMNS, 2, EXPR), beforeExpr) - .commit(); - - // can get table normally - sql("DESC " + catalogName + "." + DB + "." 
+ TABLE + ""); - } - - @Test - public void testDML() throws IOException { - sql( - "CREATE TABLE default_catalog.default_database." - + TABLE - + " (" - + " id INT," - + " name STRING," - + " t TIMESTAMP," - + " PRIMARY KEY (id) NOT ENFORCED " - + ") PARTITIONED BY(t) " - + " WITH (" - + " 'connector' = 'datagen'," - + " 'fields.id.kind'='sequence'," - + " 'fields.id.start'='1'," - + " 'fields.id.end'='1'" - + ")"); - sql( - "CREATE TABLE " - + catalogName - + "." - + DB - + "." - + TABLE - + " (" - + " id INT," - + " name STRING," - + " t TIMESTAMP," - + " PRIMARY KEY (id) NOT ENFORCED " - + ") PARTITIONED BY(t) "); - - sql( - "INSERT INTO " - + catalogName - + "." - + DB - + "." - + TABLE - + " SELECT * FROM default_catalog.default_database." - + TABLE); - List rows = - sql( - "SELECT * FROM " - + catalogName - + "." - + DB - + "." - + TABLE - + " /*+ OPTIONS(" - + "'streaming'='false'" - + ") */"); - Assert.assertEquals(1, rows.size()); - - sql("DROP TABLE default_catalog.default_database." + TABLE); - } - - private void checkRows(List rows) { - Assert.assertEquals(1, rows.size()); - int id = (int) rows.get(0).getField("id"); - int computeId = (int) rows.get(0).getField("compute_id"); - Assert.assertEquals(1, id); - // computeId should be id+5 - Assert.assertEquals(id + 5, computeId); - Assert.assertEquals(4, rows.get(0).getFieldNames(true).size()); - } - - protected List sql(String query, Object... 
args) { - TableResult tableResult = getTableEnv().executeSql(String.format(query, args)); - tableResult - .getJobClient() - .ifPresent( - c -> { - try { - c.getJobExecutionResult().get(); - } catch (InterruptedException | ExecutionException e) { - throw new RuntimeException(e); - } - }); - try (CloseableIterator iter = tableResult.collect()) { - List results = Lists.newArrayList(iter); - return results; - } catch (Exception e) { - LOG.warn("Failed to collect table result", e); - return null; - } - } - - protected StreamTableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - this.tEnv = - StreamTableEnvironment.create( - getEnv(), EnvironmentSettings.newInstance().inStreamingMode().build()); - Configuration configuration = tEnv.getConfig().getConfiguration(); - // set low-level key-value options - configuration.setString(TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED.key(), "true"); - } - } - } - return tEnv; - } - - protected StreamExecutionEnvironment getEnv() { - if (env == null) { - synchronized (this) { - if (env == null) { - StateBackend backend = - new FsStateBackend( - "file:///" + System.getProperty("java.io.tmpdir") + "/flink/backend"); - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.setParallelism(1); - env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE); - env.getCheckpointConfig().setCheckpointInterval(300); - env.getCheckpointConfig() - .enableExternalizedCheckpoints( - CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); - env.setStateBackend(backend); - env.setRestartStrategy(RestartStrategies.noRestart()); - } - } - } - return env; - } - - public static String toWithClause(Map props) { - StringBuilder builder = new StringBuilder(); - builder.append("("); - int propCount = 0; - for (Map.Entry entry : props.entrySet()) { - if (propCount > 0) { - builder.append(","); - } - builder - .append("'") 
- .append(entry.getKey()) - .append("'") - .append("=") - .append("'") - .append(entry.getValue()) - .append("'"); - propCount++; - } - builder.append(")"); - return builder.toString(); - } - - private String compoundKey(Object... components) { - return Stream.of(components).map(Object::toString).collect(Collectors.joining(".")); - } - - private void insertValue() { - sql( - "CREATE TABLE default_catalog.default_database." - + TABLE - + " (" - + " id INT," - + " t TIMESTAMP," - + " PRIMARY KEY (id) NOT ENFORCED " - + ") PARTITIONED BY(t) " - + " WITH (" - + " 'connector' = 'datagen'," - + " 'fields.id.kind'='sequence'," - + " 'fields.id.start'='1'," - + " 'fields.id.end'='1'" - + ")"); - - sql( - "INSERT INTO " - + catalogName - + "." - + DB - + "." - + TABLE - + " SELECT * FROM default_catalog.default_database." - + TABLE); - - sql("DROP TABLE default_catalog.default_database." + TABLE); - } - - @Test - public void testAlterUnKeyTable() throws Exception { - sql( - "CREATE TABLE " - + catalogName - + "." - + DB - + "." - + TABLE - + " (" - + " id INT," - + " name STRING," - + " t TIMESTAMP" - + ") PARTITIONED BY(t) " - + " WITH (" - + " 'self-optimizing.enabled' = 'false'" - + ")"); - - sql( - "ALTER TABLE " - + catalogName - + "." - + DB - + "." - + TABLE - + " " - + "SET ( 'write.metadata.delete-after-commit.enabled' = 'false')"); - Map unKeyTableProperties = - getMixedFormatCatalog().loadTable(TableIdentifier.of(catalogName, DB, TABLE)).properties(); - Assert.assertEquals( - unKeyTableProperties.get("write.metadata.delete-after-commit.enabled"), "false"); - } - - @Test - public void testAlterKeyTable() throws Exception { - sql( - "CREATE TABLE " - + catalogName - + "." - + DB - + "." - + TABLE - + " (" - + " id INT," - + " name STRING," - + " t TIMESTAMP," - + " PRIMARY KEY (id) NOT ENFORCED " - + ") PARTITIONED BY(t) "); - sql( - "ALTER TABLE " - + catalogName - + "." - + DB - + "." 
- + TABLE - + " " - + "SET ( 'self-optimizing.group' = 'flink')"); - sql( - "ALTER TABLE " - + catalogName - + "." - + DB - + "." - + TABLE - + " " - + "SET ( 'self-optimizing.enabled' = 'true')"); - - sql( - "ALTER TABLE " - + catalogName - + "." - + DB - + "." - + TABLE - + " " - + "SET ( 'write.upsert.enabled' = 'true')"); - - Map keyTableProperties = - getMixedFormatCatalog().loadTable(TableIdentifier.of(catalogName, DB, TABLE)).properties(); - Assert.assertEquals(keyTableProperties.get("self-optimizing.enabled"), "true"); - Assert.assertEquals(keyTableProperties.get("self-optimizing.group"), "flink"); - Assert.assertEquals(keyTableProperties.get("write.upsert.enabled"), "true"); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestMixedCatalogTablePartitions.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestMixedCatalogTablePartitions.java deleted file mode 100644 index e951acce03..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/catalog/TestMixedCatalogTablePartitions.java +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.catalog; - -import static java.util.Collections.singletonList; -import static org.apache.flink.table.api.Expressions.$; -import static org.apache.flink.table.expressions.ApiExpressionUtils.valueLiteral; -import static org.apache.flink.table.functions.BuiltInFunctionDefinitions.EQUALS; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.flink.util.DataUtil; -import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableMap; -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.flink.streaming.api.datastream.DataStreamSource; -import org.apache.flink.table.api.ApiExpression; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.Table; -import org.apache.flink.table.catalog.CatalogPartitionSpec; -import org.apache.flink.table.catalog.ObjectPath; -import org.apache.flink.table.catalog.exceptions.PartitionSpecInvalidException; -import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.expressions.CallExpression; -import org.apache.flink.table.expressions.FieldReferenceExpression; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; -import org.apache.flink.types.RowKind; -import org.junit.Assert; -import org.junit.Test; - -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; - -public class TestMixedCatalogTablePartitions extends FlinkTestBase { - private final String tableName = "test_partition_table"; - private final String db = "test_partition_db"; - - public TestMixedCatalogTablePartitions() { 
- super( - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, true)); - } - - public void before() throws Exception { - super.before(); - super.config(); - } - - @Test - public void testListPartitionsUnKeyedTable() throws TableNotPartitionedException { - List data = new LinkedList<>(); - data.add(new Object[] {1, "mark", "2023-10-01"}); - data.add(new Object[] {2, "Gerry", "2023-10-02"}); - - List rows = DataUtil.toRows(data); - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING()), - DataTypes.FIELD("dt", DataTypes.STRING())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixedCatalog WITH %s", toWithClause(props)); - - sql( - "CREATE TABLE IF NOT EXISTS mixedCatalog." - + db - + "." - + tableName - + "(" - + " id INT, name STRING, dt STRING) PARTITIONED BY (dt)"); - - sql("INSERT INTO %s select * from input", "mixedCatalog." + db + "." 
+ tableName); - ObjectPath objectPath = new ObjectPath(db, tableName); - MixedCatalog mixedCatalog = (MixedCatalog) getTableEnv().getCatalog("mixedCatalog").get(); - List list = mixedCatalog.listPartitions(objectPath); - - List expected = Lists.newArrayList(); - CatalogPartitionSpec partitionSpec1 = - new CatalogPartitionSpec(ImmutableMap.of("dt", "2023-10-01")); - CatalogPartitionSpec partitionSpec2 = - new CatalogPartitionSpec(ImmutableMap.of("dt", "2023-10-02")); - expected.add(partitionSpec1); - expected.add(partitionSpec2); - Assert.assertEquals("Should produce the expected catalog partition specs.", list, expected); - } - - @Test - public void testListPartitionsKeyedTable() throws TableNotPartitionedException { - List data = new LinkedList<>(); - data.add(new Object[] {1, "mark", "2023-10-01"}); - data.add(new Object[] {2, "Gerry", "2023-10-02"}); - data.add(new Object[] {RowKind.DELETE, 2, "Gerry", "2023-10-02"}); - - DataStreamSource rowData = - getEnv() - .fromCollection( - DataUtil.toRowData(data), - InternalTypeInfo.ofFields( - DataTypes.INT().getLogicalType(), - DataTypes.VARCHAR(100).getLogicalType(), - DataTypes.VARCHAR(100).getLogicalType())); - Table input = getTableEnv().fromDataStream(rowData, $("id"), $("name"), $("dt")); - - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixedCatalog WITH %s", toWithClause(props)); - - sql( - "CREATE TABLE IF NOT EXISTS mixedCatalog." - + db - + "." - + tableName - + "(" - + " id INT, name STRING, dt STRING, PRIMARY KEY (id) NOT ENFORCED) PARTITIONED BY (dt)"); - - sql("INSERT INTO %s select * from input", "mixedCatalog." + db + "." 
+ tableName); - ObjectPath objectPath = new ObjectPath(db, tableName); - MixedCatalog mixedCatalog = (MixedCatalog) getTableEnv().getCatalog("mixedCatalog").get(); - List partitionList = mixedCatalog.listPartitions(objectPath); - - List expected = Lists.newArrayList(); - CatalogPartitionSpec partitionSpec1 = - new CatalogPartitionSpec(ImmutableMap.of("dt", "2023-10-01")); - CatalogPartitionSpec partitionSpec2 = - new CatalogPartitionSpec(ImmutableMap.of("dt", "2023-10-02")); - expected.add(partitionSpec1); - expected.add(partitionSpec2); - Assert.assertEquals( - "Should produce the expected catalog partition specs.", partitionList, expected); - } - - @Test - public void testListPartitionsByFilter() - throws TableNotPartitionedException, PartitionSpecInvalidException { - List data = new LinkedList<>(); - data.add(new Object[] {1, "mark", "2023-10-01"}); - data.add(new Object[] {2, "Gerry", "2023-10-02"}); - data.add(new Object[] {2, "mark", "2023-10-02"}); - data.add(new Object[] {2, "Gerry", "2023-10-01"}); - data.add(new Object[] {RowKind.DELETE, 2, "Gerry", "2023-10-02"}); - - DataStreamSource rowData = - getEnv() - .fromCollection( - DataUtil.toRowData(data), - InternalTypeInfo.ofFields( - DataTypes.INT().getLogicalType(), - DataTypes.VARCHAR(100).getLogicalType(), - DataTypes.VARCHAR(100).getLogicalType())); - Table input = getTableEnv().fromDataStream(rowData, $("id"), $("name"), $("dt")); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixedCatalog WITH %s", toWithClause(props)); - sql( - "CREATE TABLE IF NOT EXISTS mixedCatalog." - + db - + "." - + tableName - + "(" - + " id INT, name STRING, dt STRING) PARTITIONED BY (dt,name)"); - sql("INSERT INTO %s select * from input", "mixedCatalog." + db + "." 
+ tableName); - - ResolvedExpression dtRef = new FieldReferenceExpression("dt", DataTypes.STRING(), 0, 3); - CallExpression callExpression = - CallExpression.permanent( - EQUALS, - Arrays.asList(dtRef, valueLiteral("2023-10-01", DataTypes.STRING().notNull())), - DataTypes.BOOLEAN()); - - ObjectPath objectPath = new ObjectPath(db, tableName); - MixedCatalog mixedCatalog = (MixedCatalog) getTableEnv().getCatalog("mixedCatalog").get(); - List list = - mixedCatalog.listPartitionsByFilter(objectPath, singletonList(callExpression)); - - List expected = Lists.newArrayList(); - CatalogPartitionSpec partitionSpec1 = - new CatalogPartitionSpec(ImmutableMap.of("dt", "2023-10-01", "name", "Gerry")); - CatalogPartitionSpec partitionSpec2 = - new CatalogPartitionSpec(ImmutableMap.of("dt", "2023-10-01", "name", "mark")); - expected.add(partitionSpec1); - expected.add(partitionSpec2); - Assert.assertEquals("Should produce the expected catalog partition specs.", list, expected); - - List listCatalogPartitionSpec = - mixedCatalog.listPartitions( - objectPath, - new CatalogPartitionSpec(ImmutableMap.of("dt", "2023-10-01", "name", "Gerry"))); - Assert.assertEquals( - "Should produce the expected catalog partition specs.", listCatalogPartitionSpec.size(), 1); - - try { - mixedCatalog.listPartitions( - objectPath, - new CatalogPartitionSpec(ImmutableMap.of("dt", "2023-10-01", "name1", "Gerry"))); - } catch (Exception e) { - Assert.assertTrue(e instanceof PartitionSpecInvalidException); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaConfigGenerate.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaConfigGenerate.java deleted file mode 100644 index 9b24642735..0000000000 --- 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaConfigGenerate.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.kafka.testutils; - -import static org.apache.kafka.clients.CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG; - -import org.apache.kafka.clients.consumer.ConsumerConfig; -import org.apache.kafka.clients.producer.ProducerConfig; -import org.apache.kafka.common.serialization.ByteArrayDeserializer; -import org.apache.kafka.common.serialization.ByteArraySerializer; -import org.apache.kafka.common.serialization.StringDeserializer; -import org.apache.kafka.common.serialization.StringSerializer; - -import java.util.Properties; - -public interface KafkaConfigGenerate { - - static Properties getProperties() { - Properties properties = new Properties(); - properties.put( - BOOTSTRAP_SERVERS_CONFIG, KafkaContainerTest.KAFKA_CONTAINER.getBootstrapServers()); - properties.put(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, "true"); - properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class); - properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, 
StringSerializer.class); - properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); - properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); - return properties; - } - - static Properties getProperties(Properties properties) { - properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class); - properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class); - properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); - properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); - return properties; - } - - static Properties getPropertiesWithByteArray() { - Properties properties = new Properties(); - properties.put( - BOOTSTRAP_SERVERS_CONFIG, KafkaContainerTest.KAFKA_CONTAINER.getBootstrapServers()); - properties.put(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, "true"); - properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class); - properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class); - properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class); - properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class); - return properties; - } - - static Properties getPropertiesWithByteArray(Properties properties) { - properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class); - properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class); - properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class); - properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class); - return properties; - } - - static Properties getStandardProperties(Properties properties) { - properties.put(ConsumerConfig.GROUP_ID_CONFIG, "mixed-format-tests"); - 
properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); - properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); // read from the beginning. - properties.put("max.partition.fetch.bytes", "256"); - return properties; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaContainerTest.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaContainerTest.java deleted file mode 100644 index 609158255a..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaContainerTest.java +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.kafka.testutils; - -import static org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate.getProperties; -import static org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate.getPropertiesWithByteArray; -import static org.apache.amoro.table.TableProperties.LOG_STORE_MESSAGE_TOPIC; -import static org.apache.kafka.clients.CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG; - -import org.apache.kafka.clients.admin.AdminClient; -import org.apache.kafka.clients.admin.NewTopic; -import org.apache.kafka.clients.consumer.ConsumerConfig; -import org.apache.kafka.clients.consumer.ConsumerRecords; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.clients.producer.KafkaProducer; -import org.apache.kafka.clients.producer.ProducerConfig; -import org.apache.kafka.common.TopicPartition; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.testcontainers.containers.KafkaContainer; -import org.testcontainers.containers.Network; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; - -import java.time.Duration; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.stream.Collectors; - -@Testcontainers -public class KafkaContainerTest { - private static final Logger LOG = LoggerFactory.getLogger(KafkaContainerTest.class); - public static String INTER_CONTAINER_KAFKA_ALIAS = "kafka"; - public static Network NETWORK = Network.newNetwork(); - public static String KAFKA = "confluentinc/cp-kafka:7.2.6"; - - @Container - public static KafkaContainer KAFKA_CONTAINER = - KafkaUtil.createKafkaContainer(KAFKA, LOG) - .withStartupTimeout(Duration.ofSeconds(120L)) - .withSharedMemorySize(134217728L) - .withEmbeddedZookeeper() - .withNetwork(NETWORK) - .withNetworkAliases(INTER_CONTAINER_KAFKA_ALIAS); - - public static ConsumerRecords readRecords(String topic) { 
- Properties properties = getProperties(); - properties.put(ConsumerConfig.ISOLATION_LEVEL_CONFIG, "read_committed"); - KafkaConsumer consumer = new KafkaConsumer<>(properties); - consumer.assign( - consumer.partitionsFor(topic).stream() - .map(partitionInfo -> new TopicPartition(topic, partitionInfo.partition())) - .collect(Collectors.toSet())); - consumer.seekToBeginning(consumer.assignment()); - return consumer.poll(Duration.ofMillis(1000)); - } - - public static ConsumerRecords readRecordsBytes(String topic) { - return (ConsumerRecords) readRecords(topic, getPropertiesWithByteArray()); - } - - public static ConsumerRecords readRecords(String topic, Properties properties) { - properties.put(ConsumerConfig.ISOLATION_LEVEL_CONFIG, "read_committed"); - KafkaConsumer consumer = new KafkaConsumer<>(properties); - consumer.assign( - consumer.partitionsFor(topic).stream() - .map(partitionInfo -> new TopicPartition(topic, partitionInfo.partition())) - .collect(Collectors.toSet())); - consumer.seekToBeginning(consumer.assignment()); - return consumer.poll(Duration.ofMillis(1000)); - } - - public static Integer countAllRecords(String topic, Properties properties) { - properties.put(ConsumerConfig.ISOLATION_LEVEL_CONFIG, "read_committed"); - return KafkaUtil.drainAllRecordsFromTopic(topic, properties).size(); - } - - public static void createTopics(int numPartitions, int replicationFactor, String... topics) { - List newTopics = - Arrays.stream(topics) - .map(topic -> new NewTopic(topic, numPartitions, (short) replicationFactor)) - .collect(Collectors.toList()); - Map params = new HashMap<>(); - params.put(BOOTSTRAP_SERVERS_CONFIG, KAFKA_CONTAINER.getBootstrapServers()); - try (AdminClient admin = AdminClient.create(params)) { - admin.createTopics(newTopics); - } - } - - public static void deleteTopics(String... 
topics) { - Map params = new HashMap<>(); - params.put(BOOTSTRAP_SERVERS_CONFIG, KAFKA_CONTAINER.getBootstrapServers()); - try (AdminClient admin = AdminClient.create(params)) { - admin.deleteTopics(Arrays.asList(topics)); - } - } - - public static Properties getPropertiesByTopic(String topic) { - Properties properties = getPropertiesWithByteArray(getProperties()); - properties.put(LOG_STORE_MESSAGE_TOPIC, topic); - properties.put(ProducerConfig.ACKS_CONFIG, "all"); - return properties; - } - - public static List getPartitionsForTopic(String topic) { - Properties properties = getProperties(); - KafkaConsumer consumer = new KafkaConsumer<>(properties); - return consumer.partitionsFor(topic).stream() - .map(pi -> new TopicPartition(pi.topic(), pi.partition())) - .collect(Collectors.toList()); - } - - public static KafkaProducer getProducer() { - Properties properties = getPropertiesWithByteArray(); - KafkaProducer producer = new KafkaProducer<>(properties); - return producer; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaUtil.java deleted file mode 100644 index a97f7d0835..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/KafkaUtil.java +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.kafka.testutils; - -import org.apache.flink.util.StringUtils; -import org.apache.kafka.clients.consumer.ConsumerConfig; -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.consumer.ConsumerRecords; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.common.KafkaException; -import org.apache.kafka.common.TopicPartition; -import org.apache.kafka.common.serialization.ByteArrayDeserializer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.testcontainers.containers.KafkaContainer; -import org.testcontainers.containers.output.Slf4jLogConsumer; -import org.testcontainers.utility.DockerImageName; - -import java.time.Duration; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.Set; -import java.util.stream.Collectors; - -/** Collection of methods to interact with a Kafka cluster. */ -public class KafkaUtil { - - private static final Logger LOG = LoggerFactory.getLogger(KafkaUtil.class); - private static final Duration CONSUMER_POLL_DURATION = Duration.ofSeconds(1); - - private KafkaUtil() {} - - /** - * This method helps to set commonly used Kafka configurations and aligns the internal Kafka log - * levels with the ones used by the capturing logger. 
- * - * @param dockerImageVersion describing the Kafka image - * @param logger to derive the log level from - * @return configured Kafka container - */ - public static KafkaContainer createKafkaContainer(String dockerImageVersion, Logger logger) { - return createKafkaContainer(dockerImageVersion, logger, null); - } - - /** - * This method helps to set commonly used Kafka configurations and aligns the internal Kafka log - * levels with the ones used by the capturing logger, and set the prefix of logger. - */ - public static KafkaContainer createKafkaContainer( - String dockerImageVersion, Logger logger, String loggerPrefix) { - String logLevel; - if (logger.isTraceEnabled()) { - logLevel = "TRACE"; - } else if (logger.isDebugEnabled()) { - logLevel = "DEBUG"; - } else if (logger.isInfoEnabled()) { - logLevel = "INFO"; - } else if (logger.isWarnEnabled()) { - logLevel = "WARN"; - } else if (logger.isErrorEnabled()) { - logLevel = "ERROR"; - } else { - logLevel = "OFF"; - } - - Slf4jLogConsumer logConsumer = new Slf4jLogConsumer(logger); - if (!StringUtils.isNullOrWhitespaceOnly(loggerPrefix)) { - logConsumer.withPrefix(loggerPrefix); - } - return new KafkaContainer(DockerImageName.parse(dockerImageVersion)) - .withEnv("KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR", "1") - .withEnv("KAFKA_TRANSACTION_STATE_LOG_MIN_ISR", "1") - .withEnv("KAFKA_CONFLUENT_SUPPORT_METRICS_ENABLE", "false") - .withEnv("KAFKA_LOG4J_ROOT_LOGLEVEL", logLevel) - .withEnv("KAFKA_LOG4J_LOGGERS", "state.change.logger=" + logLevel) - .withEnv("KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR", "1") - .withEnv("KAFKA_TRANSACTION_STATE_LOG_MIN_ISR", "1") - .withEnv("KAFKA_CONFLUENT_SUPPORT_METRICS_ENABLE", "false") - .withEnv("KAFKA_TRANSACTION_MAX_TIMEOUT_MS", String.valueOf(Duration.ofHours(2).toMillis())) - .withEnv("KAFKA_LOG4J_TOOLS_ROOT_LOGLEVEL", logLevel) - .withLogConsumer(logConsumer); - } - - /** - * Drain all records available from the given topic from the beginning until the current 
highest - * offset. - * - *

This method will fetch the latest offsets for the partitions once and only return records - * until that point. - * - * @param topic to fetch from - * @param properties used to configure the created {@link KafkaConsumer} - * @param committed determines the mode {@link ConsumerConfig#ISOLATION_LEVEL_CONFIG} with which - * the consumer reads the records. - * @return all {@link ConsumerRecord} in the topic - * @throws KafkaException - */ - public static List> drainAllRecordsFromTopic( - String topic, Properties properties, boolean committed) throws KafkaException { - final Properties consumerConfig = new Properties(); - consumerConfig.putAll(properties); - consumerConfig.put( - ConsumerConfig.ISOLATION_LEVEL_CONFIG, committed ? "read_committed" : "read_uncommitted"); - return drainAllRecordsFromTopic(topic, consumerConfig); - } - - /** - * Drain all records available from the given topic from the beginning until the current highest - * offset. - * - *

This method will fetch the latest offsets for the partitions once and only return records - * until that point. - * - * @param topic to fetch from - * @param properties used to configure the created {@link KafkaConsumer} - * @return all {@link ConsumerRecord} in the topic - * @throws KafkaException - */ - public static List> drainAllRecordsFromTopic( - String topic, Properties properties) throws KafkaException { - final Properties consumerConfig = new Properties(); - consumerConfig.putAll(properties); - consumerConfig.put("key.deserializer", ByteArrayDeserializer.class.getName()); - consumerConfig.put("value.deserializer", ByteArrayDeserializer.class.getName()); - try (KafkaConsumer consumer = new KafkaConsumer<>(consumerConfig)) { - Set topicPartitions = getAllPartitions(consumer, topic); - Map endOffsets = consumer.endOffsets(topicPartitions); - consumer.assign(topicPartitions); - consumer.seekToBeginning(topicPartitions); - - final List> consumerRecords = new ArrayList<>(); - while (!topicPartitions.isEmpty()) { - ConsumerRecords records = consumer.poll(CONSUMER_POLL_DURATION); - LOG.debug("Fetched {} records from topic {}.", records.count(), topic); - - // Remove partitions from polling which have reached its end. 
- final List finishedPartitions = new ArrayList<>(); - for (final TopicPartition topicPartition : topicPartitions) { - final long position = consumer.position(topicPartition); - final long endOffset = endOffsets.get(topicPartition); - LOG.debug( - "Endoffset {} and current position {} for partition {}", - endOffset, - position, - topicPartition.partition()); - if (endOffset - position > 0) { - continue; - } - finishedPartitions.add(topicPartition); - } - if (topicPartitions.removeAll(finishedPartitions)) { - consumer.assign(topicPartitions); - } - for (ConsumerRecord r : records) { - consumerRecords.add(r); - } - } - return consumerRecords; - } - } - - private static Set getAllPartitions( - KafkaConsumer consumer, String topic) { - return consumer.partitionsFor(topic).stream() - .map(info -> new TopicPartition(info.topic(), info.partition())) - .collect(Collectors.toSet()); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/SuccessException.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/SuccessException.java deleted file mode 100644 index b114e6b8c8..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/kafka/testutils/SuccessException.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.kafka.testutils; - -/** Exception that is thrown to terminate a program and indicate success. */ -public class SuccessException extends RuntimeException { - private static final long serialVersionUID = -7011865671593955887L; -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/ByteArraySetSerializerTest.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/ByteArraySetSerializerTest.java deleted file mode 100644 index 8f7aae0606..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/ByteArraySetSerializerTest.java +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup; - -import org.junit.Assert; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.HashSet; -import java.util.Set; - -public class ByteArraySetSerializerTest { - private static final Logger LOG = LoggerFactory.getLogger(ByteArraySetSerializerTest.class); - - @Test - public void testByteArraySetSerializer() { - Set byteSet = new HashSet<>(); - byte[] data = "Hello".getBytes(); - byteSet.add(new ByteArrayWrapper(data, data.length)); - data = "World".getBytes(); - byteSet.add(new ByteArrayWrapper(data, data.length)); - byte[] serialized = ByteArraySetSerializer.serialize(byteSet); - Set actualSet = ByteArraySetSerializer.deserialize(serialized); - Assert.assertEquals(byteSet.size(), actualSet.size()); - Assert.assertEquals(byteSet, actualSet); - } - - @Test - public void testPerformance() { - Set byteArraySet = new HashSet<>(); - StringBuilder sb = new StringBuilder(); - int num = 10000; - long start = System.currentTimeMillis(); - int totalSize = 4; - for (int i = 0; i < num; i++) { - sb.append(i); - byte[] tmp = sb.toString().getBytes(); - byteArraySet.add(new ByteArrayWrapper(tmp, tmp.length)); - totalSize += 4 + tmp.length; - } - LOG.info("added {} items process time: {}", num, System.currentTimeMillis() - start); - Assert.assertEquals(num, byteArraySet.size()); - - start = System.currentTimeMillis(); - byte[] serialized = ByteArraySetSerializer.serialize(byteArraySet); - long cost = System.currentTimeMillis() - start; - assert serialized != null; - Assert.assertEquals(totalSize, serialized.length); - LOG.info( - "serialized cost: {}, num= {}, result byte array size={}.", cost, num, serialized.length); - - start = System.currentTimeMillis(); - Set actualSet = ByteArraySetSerializer.deserialize(serialized); - cost = System.currentTimeMillis() - start; - 
LOG.info("deserialized cost: {}, num= {}, set size={}.", cost, num, actualSet.size()); - Assert.assertEquals(byteArraySet, actualSet); - - // exists - sb = new StringBuilder(); - start = System.currentTimeMillis(); - for (int i = 0; i < num; i++) { - sb.append(i); - Assert.assertTrue( - actualSet.contains( - new ByteArrayWrapper(sb.toString().getBytes(), sb.toString().getBytes().length))); - } - long end = System.currentTimeMillis(); - LOG.info("contains process time:{}", end - start); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/TestKVTable.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/TestKVTable.java deleted file mode 100644 index 1f830cb6d9..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/TestKVTable.java +++ /dev/null @@ -1,584 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.lookup; - -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.LOOKUP_CACHE_TTL_AFTER_WRITE; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.ROCKSDB_WRITING_THREADS; -import static org.junit.Assert.assertEquals; - -import org.apache.amoro.flink.lookup.filter.RowDataPredicate; -import org.apache.amoro.flink.lookup.filter.RowDataPredicateExpressionVisitor; -import org.apache.amoro.flink.lookup.filter.TestRowDataPredicateBase; -import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.core.memory.DataInputDeserializer; -import org.apache.flink.core.memory.DataOutputSerializer; -import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; -import org.apache.flink.shaded.guava30.com.google.common.cache.Cache; -import org.apache.flink.shaded.guava30.com.google.common.cache.CacheBuilder; -import org.apache.flink.shaded.guava30.com.google.common.collect.Lists; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.catalog.UniqueConstraint; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.binary.BinaryRowData; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.flink.table.runtime.typeutils.BinaryRowDataSerializer; -import org.apache.flink.table.runtime.typeutils.RowDataSerializer; -import org.apache.flink.table.types.DataType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.utils.TypeConversions; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.types.Types; -import org.junit.Assert; -import org.junit.Before; 
-import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.rules.TestName; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.time.Duration; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.stream.Collectors; - -@SuppressWarnings("OptionalUsedAsFieldOrParameterType") -@RunWith(value = Parameterized.class) -public class TestKVTable extends TestRowDataPredicateBase { - private static final Logger LOG = LoggerFactory.getLogger(TestKVTable.class); - @Rule public TemporaryFolder temp = new TemporaryFolder(); - @Rule public TestName name = new TestName(); - private final Configuration config = new Configuration(); - private final List primaryKeys = Lists.newArrayList("id", "grade"); - private final List primaryKeysDisorder = Lists.newArrayList("grade", "num", "id"); - - private final boolean guavaCacheEnabled; - - private final Schema mixedTableSchema = - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "grade", Types.StringType.get()), - Types.NestedField.required(3, "num", Types.IntegerType.get())); - - private String dbPath; - - @Parameterized.Parameters(name = "guavaCacheEnabled = {0}") - public static Object[][] parameters() { - return new Object[][] {{true}, {false}}; - } - - public TestKVTable(boolean guavaCacheEnabled) { - this.guavaCacheEnabled = guavaCacheEnabled; - } - - @Before - public void before() throws IOException { - dbPath = temp.newFolder().getPath(); - if (!guavaCacheEnabled) { - config.set(MixedFormatValidator.LOOKUP_CACHE_MAX_ROWS, 0L); - } - } - - @Test - public void testRowDataSerializer() throws 
IOException { - BinaryRowDataSerializer binaryRowDataSerializer = new BinaryRowDataSerializer(3); - - GenericRowData genericRowData = (GenericRowData) row(1, "2", 3); - RowType rowType = FlinkSchemaUtil.convert(mixedTableSchema); - RowDataSerializer rowDataSerializer = new RowDataSerializer(rowType); - BinaryRowData record = rowDataSerializer.toBinaryRow(genericRowData); - - DataOutputSerializer view = new DataOutputSerializer(32); - binaryRowDataSerializer.serialize(record, view); - System.out.println(Arrays.toString(view.getCopyOfBuffer())); - - BinaryRowData desRowData = - binaryRowDataSerializer.deserialize(new DataInputDeserializer(view.getCopyOfBuffer())); - Assert.assertNotNull(desRowData); - Assert.assertEquals(record.getInt(0), desRowData.getInt(0)); - Assert.assertEquals(record.getInt(1), desRowData.getInt(1)); - Assert.assertEquals(record.getInt(2), desRowData.getInt(2)); - - // test join key rowData - binaryRowDataSerializer = new BinaryRowDataSerializer(2); - List keys = Lists.newArrayList("id", "grade"); - Schema keySchema = mixedTableSchema.select(keys); - rowType = FlinkSchemaUtil.convert(keySchema); - rowDataSerializer = new RowDataSerializer(rowType); - KeyRowData keyRowData = new KeyRowData(new int[] {0, 1}, row(2, "3", 4)); - KeyRowData keyRowData1 = new KeyRowData(new int[] {0, 1}, row(2, "3", 4)); - - BinaryRowData binaryRowData = rowDataSerializer.toBinaryRow(keyRowData); - view.clear(); - binaryRowDataSerializer.serialize(binaryRowData, view); - byte[] rowBytes = view.getCopyOfBuffer(); - - BinaryRowData binaryRowData1 = rowDataSerializer.toBinaryRow(keyRowData1); - view.clear(); - binaryRowDataSerializer.serialize(binaryRowData1, view); - byte[] rowBytes1 = view.getCopyOfBuffer(); - Assert.assertArrayEquals(rowBytes1, rowBytes); - } - - @Test - public void testInitialUniqueKeyTable() throws IOException { - config.setInteger(ROCKSDB_WRITING_THREADS, 5); - List joinKeys = Lists.newArrayList("id", "grade"); - try (UniqueIndexTable 
uniqueIndexTable = (UniqueIndexTable) createTable(joinKeys)) { - uniqueIndexTable.open(); - - // During the initialization phase, the Merge-on-Read approach is used to retrieve data, - // which will only return INSERT data. - // When there are multiple entries with the same primary key, only one entry will be returned. - initTable( - uniqueIndexTable, - upsertStream( - row(RowKind.INSERT, 1, "1", 1), - row(RowKind.INSERT, 2, "2", 2), - row(RowKind.INSERT, 2, "3", 3), - row(RowKind.INSERT, 2, "4", 4), - row(RowKind.INSERT, 2, "5", 5))); - - if (!uniqueIndexTable.initialized()) { - uniqueIndexTable.waitInitializationCompleted(); - } - - assertTable( - uniqueIndexTable, - row(1, "1"), - row(1, "1", 1), - row(2, "2"), - row(2, "2", 2), - row(2, "3"), - row(2, "3", 3), - row(2, "4"), - row(2, "4", 4), - row(2, "5"), - row(2, "5", 5)); - - // upsert table - upsertTable( - uniqueIndexTable, - upsertStream( - row(RowKind.DELETE, 1, "1", 1), - row(RowKind.INSERT, 2, "2", 2), - row(RowKind.DELETE, 2, "2", 2), - row(RowKind.UPDATE_BEFORE, 3, "3", 4), - row(RowKind.UPDATE_AFTER, 3, "3", 5), - row(RowKind.INSERT, 4, "4", 4))); - - assertTable( - uniqueIndexTable, - row(1, "1"), - null, - row(2, "2"), - null, - row(3, "3"), - row(3, "3", 5), - row(4, "4"), - row(4, "4", 4)); - } - } - - @Test - public void testSecondaryKeysMapping() throws IOException { - // primary keys are id and grade. 
- List joinKeys = Lists.newArrayList("grade", "id"); - try (SecondaryIndexTable secondaryIndexTable = - (SecondaryIndexTable) createTableWithDisorderPK(joinKeys)) { - secondaryIndexTable.open(); - - initTable( - secondaryIndexTable, - upsertStream( - row(RowKind.INSERT, 1, "1", 1), - row(RowKind.INSERT, 2, "2", 2), - row(RowKind.INSERT, 2, "3", 3), - row(RowKind.INSERT, 2, "3", 4), - row(RowKind.INSERT, 2, "5", 5))); - - if (!secondaryIndexTable.initialized()) { - secondaryIndexTable.waitInitializationCompleted(); - } - - assertTableSet(secondaryIndexTable, row("1", 1), row(1, "1", 1)); - assertTableSet(secondaryIndexTable, row("2", 2), row(2, "2", 2)); - - upsertTable( - secondaryIndexTable, - upsertStream( - row(RowKind.DELETE, 1, "1", 1), - row(RowKind.INSERT, 2, "2", 2), - row(RowKind.DELETE, 2, "2", 2), - row(RowKind.UPDATE_BEFORE, 3, "3", 4), - row(RowKind.UPDATE_AFTER, 3, "3", 5), - row(RowKind.INSERT, 3, "4", 4))); - - assertTableSet(secondaryIndexTable, row("1", 1), null); - assertTableSet(secondaryIndexTable, row("3", 2), row(2, "3", 3), row(2, "3", 4)); - assertTableSet(secondaryIndexTable, row("4", 3), row(3, "4", 4)); - } - } - - @Test - public void testInitialSecondaryKeyTable() throws IOException { - config.setInteger(ROCKSDB_WRITING_THREADS, 10); - config.set(LOOKUP_CACHE_TTL_AFTER_WRITE, Duration.ofMinutes(1000)); - // primary keys are id and grade. 
- List joinKeys = Lists.newArrayList("id"); - try (SecondaryIndexTable secondaryIndexTable = (SecondaryIndexTable) createTable(joinKeys)) { - writeAndAssert(secondaryIndexTable); - } - } - - private void writeAndAssert(SecondaryIndexTable secondaryIndexTable) throws IOException { - secondaryIndexTable.open(); - - initTable( - secondaryIndexTable, - upsertStream( - row(RowKind.INSERT, 1, "1", 1), - row(RowKind.INSERT, 2, "2", 2), - row(RowKind.INSERT, 2, "3", 3), - row(RowKind.INSERT, 2, "4", 4), - row(RowKind.INSERT, 2, "5", 5))); - - if (!secondaryIndexTable.initialized()) { - secondaryIndexTable.waitInitializationCompleted(); - } - - assertTableSet(secondaryIndexTable, row(1), row(1, "1", 1)); - assertTableSet( - secondaryIndexTable, - row(2), - row(2, "2", 2), - row(2, "3", 3), - row(2, "4", 4), - row(2, "5", 5)); - - upsertTable( - secondaryIndexTable, - upsertStream( - row(RowKind.DELETE, 1, "1", 1), - row(RowKind.INSERT, 2, "2", 2), - row(RowKind.DELETE, 2, "2", 2), - row(RowKind.UPDATE_BEFORE, 3, "3", 4), - row(RowKind.UPDATE_AFTER, 3, "3", 5), - row(RowKind.INSERT, 3, "4", 4))); - - assertTableSet(secondaryIndexTable, row(1), null); - assertTableSet(secondaryIndexTable, row(2), row(2, "3", 3), row(2, "4", 4), row(2, "5", 5)); - assertTableSet(secondaryIndexTable, row(3), row(3, "3", 5), row(3, "4", 4)); - } - - @Test - public void testCacheExpired() throws InterruptedException { - Cache cache = - CacheBuilder.newBuilder().expireAfterWrite(Duration.ofSeconds(1)).build(); - cache.put(1, 1); - cache - .asMap() - .compute( - 2, - (k, v) -> { - if (v == null) { - return k; - } - return v; - }); - Assert.assertEquals(Integer.valueOf(1), cache.getIfPresent(1)); - Assert.assertEquals(Integer.valueOf(2), cache.getIfPresent(2)); - Thread.sleep(1001); - Assert.assertEquals(2, cache.size()); - Assert.assertNull(cache.getIfPresent(1)); - Assert.assertNull(cache.getIfPresent(2)); - cache.cleanUp(); - cache.put(3, 3); - Assert.assertEquals(1, cache.size()); - 
Assert.assertNull(cache.getIfPresent(1)); - Assert.assertEquals(Integer.valueOf(3), cache.getIfPresent(3)); - } - - @Test - public void testPredicate() throws IOException { - String filter = "id >= 2 and num < 5 and num > 2"; - Optional rowDataPredicate = generatePredicate(filter); - - KVTable uniqueIndexTable = - createTable(Lists.newArrayList("id", "grade"), rowDataPredicate); - uniqueIndexTable.open(); - initTable( - uniqueIndexTable, - upsertStream( - row(RowKind.INSERT, 1, "1", 1), - row(RowKind.INSERT, 2, "2", 2), - row(RowKind.INSERT, 2, "3", 3), - row(RowKind.INSERT, 3, "4", 4), - row(RowKind.INSERT, 3, "5", 5))); - - if (!uniqueIndexTable.initialized()) { - uniqueIndexTable.waitInitializationCompleted(); - } - - assertTable( - uniqueIndexTable, - row(1, "1"), - null, - row(2, "2"), - null, - row(2, "3"), - row(2, "3", 3), - row(3, "4"), - row(3, "4", 4), - row(3, "5"), - null); - - // upsert table - upsertTable( - uniqueIndexTable, - upsertStream( - row(RowKind.DELETE, 1, "1", 1), - row(RowKind.INSERT, 2, "2", 2), - row(RowKind.DELETE, 2, "2", 2), - row(RowKind.UPDATE_BEFORE, 2, "3", 3), - row(RowKind.UPDATE_AFTER, 2, "3", 5), - row(RowKind.INSERT, 4, "4", 4))); - - assertTable( - uniqueIndexTable, - row(1, "1"), - null, - row(2, "2"), - null, - row(2, "3"), - null, - row(4, "4"), - row(4, "4", 4)); - } - - @Test - public void testSecondaryIndexPredicate() throws IOException { - String filter = "id >= 2 and num < 5 and num > 2"; - Optional rowDataPredicate = generatePredicate(filter); - - // primary keys are id and grade. 
- List joinKeys = Lists.newArrayList("id"); - try (SecondaryIndexTable secondaryIndexTable = - (SecondaryIndexTable) createTable(joinKeys, rowDataPredicate)) { - secondaryIndexTable.open(); - - initTable( - secondaryIndexTable, - upsertStream( - row(RowKind.INSERT, 1, "1", 1), - row(RowKind.INSERT, 2, "2", 2), - row(RowKind.INSERT, 2, "3", 3), - row(RowKind.INSERT, 2, "4", 4), - row(RowKind.INSERT, 2, "5", 5))); - - if (!secondaryIndexTable.initialized()) { - secondaryIndexTable.waitInitializationCompleted(); - } - - assertTableSet(secondaryIndexTable, row(1), null); - assertTableSet(secondaryIndexTable, row(2), row(2, "3", 3), row(2, "4", 4)); - - upsertTable( - secondaryIndexTable, - upsertStream( - row(RowKind.DELETE, 1, "1", 1), - row(RowKind.INSERT, 2, "2", 2), - row(RowKind.DELETE, 2, "2", 2), - row(RowKind.UPDATE_BEFORE, 3, "3", 4), - row(RowKind.UPDATE_AFTER, 3, "3", 5), - row(RowKind.INSERT, 3, "4", 4))); - - assertTableSet(secondaryIndexTable, row(1), null); - assertTableSet(secondaryIndexTable, row(2), row(2, "3", 3), row(2, "4", 4)); - assertTableSet(secondaryIndexTable, row(3), row(3, "4", 4)); - } - } - - private Optional generatePredicate(String filterSql) { - Map fieldIndexMap = new HashMap<>(); - Map fieldTypeMap = new HashMap<>(); - List fields = mixedTableSchema.asStruct().fields(); - List columns = new ArrayList<>(fields.size()); - for (int i = 0; i < fields.size(); i++) { - String name = fields.get(i).name(); - DataType dataType = - TypeConversions.fromLogicalToDataType(FlinkSchemaUtil.convert(fields.get(i).type())); - fieldIndexMap.put(name, i); - fieldTypeMap.put(name, dataType); - columns.add(i, Column.physical(name, dataType)); - } - ResolvedSchema schema = - new ResolvedSchema( - columns, Collections.emptyList(), UniqueConstraint.primaryKey("", primaryKeys)); - - RowDataPredicateExpressionVisitor rowDataPredicateExpressionVisitor = - new RowDataPredicateExpressionVisitor(fieldIndexMap, fieldTypeMap); - - List expressions = 
resolveSQLFilterToExpression(filterSql, schema); - assertEquals(1, expressions.size()); - return expressions.get(0).accept(rowDataPredicateExpressionVisitor); - } - - private KVTable createTableWithDisorderPK(List joinKeys) { - return createTable(joinKeys, Optional.empty(), true); - } - - private KVTable createTable( - List joinKeys, Optional rowDataPredicate) { - return createTable(joinKeys, rowDataPredicate, false); - } - - private KVTable createTable( - List joinKeys, Optional rowDataPredicate, boolean isDisorderPK) { - return KVTableFactory.INSTANCE.create( - new RowDataStateFactory(dbPath, new UnregisteredMetricsGroup()), - isDisorderPK ? primaryKeysDisorder : primaryKeys, - joinKeys, - mixedTableSchema, - config, - rowDataPredicate.orElse(null)); - } - - private KVTable createTable(List joinKeys) { - return createTable(joinKeys, Optional.empty()); - } - - private void initTable(KVTable table, Iterator initStream) throws IOException { - if (initStream != null) { - table.initialize(initStream); - } - } - - private void upsertTable(KVTable table, Iterator upsertStream, RowData... rows) - throws IOException { - if (upsertStream != null) { - table.upsert(upsertStream); - } - } - - private void assertTable(KVTable table, RowData... rows) throws IOException { - // Loop through the rows array in steps of 2 - for (int i = 0; i < rows.length; i = i + 2) { - // Get the key and expected value at the current index and the next index - RowData key = rows[i], expected = rows[i + 1]; - - List values = table.get(key); - Assert.assertNotNull(values); - if (expected == null) { - Assert.assertEquals(0, values.size()); - continue; - } - Assert.assertEquals(expected.toString(), 1, values.size()); - RowData actual = values.get(0); - assertRecord(expected, actual); - } - } - - private void assertTableSet(KVTable table, RowData key, RowData... 
expects) - throws IOException { - List values = table.get(key); - if (expects == null) { - Assert.assertEquals(0, values.size()); - return; - } - Assert.assertEquals(expects.length, values.size()); - values = values.stream().sorted(compare()).collect(Collectors.toList()); - List expectsAfterSort = - Arrays.stream(expects).sorted(compare()).collect(Collectors.toList()); - for (int i = 0; i < expects.length; i = i + 1) { - // Get the key and expected value at the current index and the next index - RowData expected = expectsAfterSort.get(i); - - RowData actual = values.get(i); - assertRecord(expected, actual); - } - } - - private Comparator compare() { - return Comparator.comparingInt((RowData o) -> o.getInt(0)) - .thenComparing(o -> o.getString(1)) - .thenComparingInt(o -> o.getInt(2)); - } - - private void assertRecord(RowData expected, RowData actual) { - if (!(actual instanceof BinaryRowData)) { - throw new IllegalArgumentException("Only support BinaryRowData"); - } - BinaryRowData binaryRowData = (BinaryRowData) actual; - for (int j = 0; j < binaryRowData.getArity(); j++) { - switch (j) { - case 0: - case 2: - Assert.assertEquals( - String.format("expected:%s, actual:%s.", expected.toString(), actual), - expected.getInt(j), - binaryRowData.getInt(j)); - break; - case 1: - Assert.assertEquals( - String.format("expected:%s, actual:%s.", expected, actual), - expected.getString(j), - binaryRowData.getString(j)); - break; - } - } - } - - RowData row(RowKind rowKind, Object... objects) { - return GenericRowData.ofKind(rowKind, wrapStringData(objects)); - } - - RowData row(Object... objects) { - return GenericRowData.of(wrapStringData(objects)); - } - - Object[] wrapStringData(Object... objects) { - for (int i = 0; i < objects.length; i++) { - if (objects[i] instanceof String) { - objects[i] = StringData.fromString(objects[i].toString()); - } - } - return objects; - } - - Iterator upsertStream(RowData... 
rows) { - return Lists.newArrayList(rows).iterator(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateAllFieldTypes.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateAllFieldTypes.java deleted file mode 100644 index b48aa9bfeb..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateAllFieldTypes.java +++ /dev/null @@ -1,258 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.lookup.filter; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -import org.apache.amoro.flink.util.DateTimeUtils; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.flink.table.types.DataType; -import org.junit.Before; -import org.junit.Test; - -import java.math.BigDecimal; -import java.sql.Timestamp; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.TimeZone; - -/** Test for {@link RowDataPredicate}. 
*/ -public class TestRowDataPredicateAllFieldTypes extends TestRowDataPredicateBase { - protected RowDataPredicateExpressionVisitor visitor; - protected final Map fieldIndexMap = new HashMap<>(); - protected final Map fieldDataTypeMap = new HashMap<>(); - List columns = new ArrayList<>(); - protected ResolvedSchema schema; - - @Before - public void setUp() { - columns.add(0, Column.physical("f0", DataTypes.INT())); - columns.add(1, Column.physical("f1", DataTypes.STRING())); - columns.add(2, Column.physical("f2", DataTypes.CHAR(1))); - columns.add(3, Column.physical("f3", DataTypes.BOOLEAN())); - columns.add(4, Column.physical("f4", DataTypes.BINARY(1))); - columns.add(5, Column.physical("f5", DataTypes.VARBINARY(10))); - columns.add(6, Column.physical("f6", DataTypes.DECIMAL(38, 10))); - columns.add(7, Column.physical("f7", DataTypes.TINYINT())); - columns.add(8, Column.physical("f8", DataTypes.SMALLINT())); - columns.add(9, Column.physical("f9", DataTypes.BIGINT())); - columns.add(10, Column.physical("f10", DataTypes.FLOAT())); - columns.add(11, Column.physical("f11", DataTypes.DOUBLE())); - columns.add(12, Column.physical("f12", DataTypes.DATE())); - columns.add(13, Column.physical("f13", DataTypes.TIME())); - columns.add(14, Column.physical("f14", DataTypes.TIMESTAMP(3))); - columns.add(15, Column.physical("f15", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3))); - schema = new ResolvedSchema(columns, Collections.emptyList(), null); - for (int i = 0; i < columns.size(); i++) { - Column column = columns.get(i); - fieldDataTypeMap.put(column.getName(), column.getDataType()); - fieldIndexMap.put(column.getName(), i); - } - visitor = new RowDataPredicateExpressionVisitor(fieldIndexMap, fieldDataTypeMap); - } - - @Test - public void testInt() { - String equalExpr = "f0 = 2"; - List resolved = resolveSQLFilterToExpression(equalExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); - 
assertTrue(predicate.test(generateRowData("f0", 2))); - assertFalse(predicate.test(generateRowData("f0", 1))); - } - - @Test - public void testString() { - String equalExpr = "f1 = 'a'"; - List resolved = resolveSQLFilterToExpression(equalExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); - assertTrue(predicate.test(generateRowData("f1", StringData.fromString("a")))); - assertFalse(predicate.test(GenericRowData.of("f1", StringData.fromString("b")))); - } - - @Test - public void testChar() { - String equalExpr = "f2 = 'a'"; - List resolved = resolveSQLFilterToExpression(equalExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); - assertTrue(predicate.test(generateRowData("f2", StringData.fromString("a")))); - assertFalse(predicate.test(generateRowData("f2", StringData.fromString("b")))); - } - - @Test - public void testBoolean() { - String equalExpr = "f3 = true"; - List resolved = resolveSQLFilterToExpression(equalExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); - assertTrue(predicate.test(generateRowData("f3", Boolean.TRUE))); - assertFalse(predicate.test(generateRowData("f3", Boolean.FALSE))); - } - - // @Test - public void testBinary() { - String equalExpr = "f4 = '1'"; // byte[] - List resolved = resolveSQLFilterToExpression(equalExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); - assertTrue(predicate.test(generateRowData("f4", (byte) 1))); - assertFalse(predicate.test(generateRowData("f4", (byte) 2))); - } - - @Test - public void testDecimal() { - String equalExpr = "f6 = 1.1"; - List resolved = resolveSQLFilterToExpression(equalExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); - assertTrue( - predicate.test( - 
generateRowData("f6", DecimalData.fromBigDecimal(BigDecimal.valueOf(1.1d), 38, 1)))); - assertFalse( - predicate.test( - generateRowData("f6", DecimalData.fromBigDecimal(BigDecimal.valueOf(1.2d), 38, 1)))); - } - - // @Test - public void testTinyint() { - String equalExpr = "f7 = cast('1' as tinyint)"; // byte - List resolved = resolveSQLFilterToExpression(equalExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); - assertTrue(predicate.test(generateRowData("f7", 1))); - assertFalse(predicate.test(generateRowData("f7", 0))); - } - - // @Test - public void testSmallint() { - String equalExpr = "f8 = 1"; // short - List resolved = resolveSQLFilterToExpression(equalExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); - assertTrue(predicate.test(generateRowData("f8", (short) 1))); - assertFalse(predicate.test(generateRowData("f8", (short) 0))); - } - - @Test - public void testBigint() { - String equalExpr = "f9 = 1"; // long - List resolved = resolveSQLFilterToExpression(equalExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); - assertTrue(predicate.test(generateRowData("f9", 1L))); - assertFalse(predicate.test(generateRowData("f9", 0L))); - } - - // @Test - public void testFloat() { - String equalExpr = "f10 = 1.1"; - List resolved = resolveSQLFilterToExpression(equalExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); - assertTrue(predicate.test(generateRowData("f10", 1.1f))); - assertFalse(predicate.test(generateRowData("f10", 1.2f))); - } - - @Test - public void testDouble() { - String equalExpr = "f11 = 1.1"; - List resolved = resolveSQLFilterToExpression(equalExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); - 
assertTrue(predicate.test(generateRowData("f11", 1.1d))); - assertFalse(predicate.test(generateRowData("f11", 1.2d))); - } - - // @Test - public void testTimestamp() { - String equalExpr = "f14 = TO_TIMESTAMP('2020-01-01 00:00:00', 'yyyy-MM-dd HH:mm:ss')"; - List resolved = resolveSQLFilterToExpression(equalExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); - assertTrue( - predicate.test( - generateRowData( - "f14", TimestampData.fromTimestamp(Timestamp.valueOf("2020-01-01 00:00:00"))))); - assertFalse( - predicate.test( - generateRowData( - "f14", TimestampData.fromTimestamp(Timestamp.valueOf("2020-01-01 00:00:01"))))); - } - - // @Test - public void testUnixTimestamp() { - String equalExpr = "f1 = cast(from_unixtime(unix_timestamp(),'yyyy-MM-dd') as String)"; - List resolved = resolveSQLFilterToExpression(equalExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); - String format = "yyyy-MM-dd"; - String current = - DateTimeUtils.formatUnixTimestamp( - System.currentTimeMillis() / 1000, format, TimeZone.getDefault()); - assertTrue(predicate.test(generateRowData("f1", StringData.fromString(current)))); - assertFalse(predicate.test(generateRowData("f1", StringData.fromString("2020-01-01-01")))); - } - - // @Test - public void testFromUnixTimestampMinus() { - String equalExpr = "f1 = from_unixtime(unix_timestamp()- 3 * 3600,'yyyy-MM-dd')"; - List resolved = resolveSQLFilterToExpression(equalExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); - String format = "yyyy-MM-dd"; - String current = - DateTimeUtils.formatUnixTimestamp( - System.currentTimeMillis() / 1000 - 3 * 3600, format, TimeZone.getDefault()); - assertTrue(predicate.test(generateRowData("f1", StringData.fromString(current)))); - assertFalse(predicate.test(generateRowData("f1", 
StringData.fromString("2020-01-01-01")))); - } - - @Test - public void testArithmetic() { - // bigint type - String arithmeticExpr = "f9 = (1514356320000 + 1) * 10 / 2"; - List resolved = resolveSQLFilterToExpression(arithmeticExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate predicate = resolved.get(0).accept(visitor).get(); - assertTrue(predicate.test(generateRowData("f9", 7571781600005L))); - assertFalse(predicate.test(generateRowData("f9", 7571781600004L))); - } - - protected RowData generateRowData(String fieldName, Object val) { - int index = Integer.parseInt(fieldName.substring(1)); - Object[] objects = new Object[columns.size()]; - objects[index] = val; - return GenericRowData.of(objects); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateBase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateBase.java deleted file mode 100644 index c50d2d38a1..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateBase.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.lookup.filter; - -import org.apache.amoro.flink.planner.calcite.FlinkTypeSystem; -import org.apache.calcite.rex.RexBuilder; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.TableException; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.table.api.bridge.java.internal.StreamTableEnvironmentImpl; -import org.apache.flink.table.catalog.CatalogManager; -import org.apache.flink.table.catalog.FunctionCatalog; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.flink.table.expressions.resolver.ExpressionResolver; -import org.apache.flink.table.planner.calcite.FlinkContext; -import org.apache.flink.table.planner.calcite.FlinkTypeFactory; -import org.apache.flink.table.planner.delegation.PlannerBase; -import org.apache.flink.table.planner.expressions.RexNodeExpression; -import org.apache.flink.table.planner.plan.utils.RexNodeToExpressionConverter; -import org.apache.flink.table.types.logical.RowType; -import org.junit.Before; - -import java.util.Collections; -import java.util.List; -import java.util.Optional; -import java.util.TimeZone; - -public abstract class TestRowDataPredicateBase { - public static StreamExecutionEnvironment env; - public static TableEnvironment tEnv; - - @Before - public void init() { - env = StreamExecutionEnvironment.getExecutionEnvironment(); - tEnv = StreamTableEnvironment.create(env); - } - - /** - * This method takes in an SQL filter expression and a ResolvedSchema object, and returns a List - * of ResolvedExpression objects. 
- */ - protected List resolveSQLFilterToExpression( - String sqlExp, ResolvedSchema schema) { - StreamTableEnvironmentImpl tbImpl = (StreamTableEnvironmentImpl) tEnv; - - FlinkContext ctx = ((PlannerBase) tbImpl.getPlanner()).getFlinkContext(); - CatalogManager catMan = tbImpl.getCatalogManager(); - FunctionCatalog funCat = ctx.getFunctionCatalog(); - RowType sourceType = (RowType) schema.toSourceRowDataType().getLogicalType(); - ClassLoader classLoader = tEnv.getClass().getClassLoader(); - FlinkTypeFactory typeFactory = new FlinkTypeFactory(classLoader, FlinkTypeSystem.INSTANCE); - RexNodeToExpressionConverter converter = - new RexNodeToExpressionConverter( - new RexBuilder(typeFactory), - sourceType.getFieldNames().toArray(new String[0]), - funCat, - catMan, - TimeZone.getTimeZone(tEnv.getConfig().getLocalTimeZone())); - - RexNodeExpression rexExp = - (RexNodeExpression) tbImpl.getParser().parseSqlExpression(sqlExp, sourceType, null); - ResolvedExpression resolvedExp = - rexExp - .getRexNode() - .accept(converter) - .getOrElse( - () -> { - throw new IllegalArgumentException( - "Cannot convert " - + rexExp.getRexNode() - + " to Expression, this likely " - + "means you used some function(s) not " - + "supported with this setup."); - }); - ExpressionResolver resolver = - ExpressionResolver.resolverFor( - tEnv.getConfig(), - classLoader, - name -> Optional.empty(), - funCat.asLookup( - str -> { - throw new TableException( - "We should not need to lookup any expressions at this point"); - }), - catMan.getDataTypeFactory(), - (sqlExpression, inputRowType, outputType) -> { - throw new TableException( - "SQL expression parsing is not supported at this location."); - }) - .build(); - return resolver.resolve(Collections.singletonList(resolvedExp)); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateExpressionVisitor.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateExpressionVisitor.java deleted file mode 100644 index e69689dbbd..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/lookup/filter/TestRowDataPredicateExpressionVisitor.java +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.lookup.filter; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.catalog.Column; -import org.apache.flink.table.catalog.ResolvedSchema; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.expressions.ResolvedExpression; -import org.apache.flink.table.types.DataType; -import org.junit.Before; -import org.junit.Test; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** This class contains unit tests for the {@link RowDataPredicateExpressionVisitor} class. */ -public class TestRowDataPredicateExpressionVisitor extends TestRowDataPredicateBase { - - RowDataPredicateExpressionVisitor visitor; - final Map fieldIndexMap = new HashMap<>(); - final Map fieldDataTypeMap = new HashMap<>(); - List columns = new ArrayList<>(); - ResolvedSchema schema; - - @Before - public void setUp() { - columns.add(0, Column.physical("id", DataTypes.INT())); - columns.add(1, Column.physical("name", DataTypes.STRING())); - columns.add(2, Column.physical("age", DataTypes.INT())); - schema = new ResolvedSchema(columns, Collections.emptyList(), null); - for (int i = 0; i < columns.size(); i++) { - Column column = columns.get(i); - fieldDataTypeMap.put(column.getName(), column.getDataType()); - fieldIndexMap.put(column.getName(), i); - } - - visitor = new RowDataPredicateExpressionVisitor(fieldIndexMap, fieldDataTypeMap); - } - - @Test - public void testVisitCallExpressionEquals() { - String equalExpr = "id = NULL"; - List resolved = resolveSQLFilterToExpression(equalExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); - 
assertTrue(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 6))); - } - - @Test - public void testVisitCallExpressionNotEquals() { - String notEqualExpr = "id <> 1"; - List resolved = resolveSQLFilterToExpression(notEqualExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); - assertTrue(rowDataPredicate.test(GenericRowData.of(2, StringData.fromString("2"), 6))); - assertFalse(rowDataPredicate.test(GenericRowData.of(1, StringData.fromString("2"), 6))); - } - - @Test - public void testVisitCallExpressionGreaterThanOrEqual() { - String greaterThanOrEqualExpr = "age >= 5"; - List resolved = - resolveSQLFilterToExpression(greaterThanOrEqualExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); - assertTrue(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 5))); - assertFalse(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 4))); - } - - @Test - public void testVisitCallExpressionGreaterThan() { - String greaterThanExpr = "age > 5"; - List resolved = resolveSQLFilterToExpression(greaterThanExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); - assertFalse(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 5))); - assertTrue(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 6))); - } - - @Test - public void testVisitCallExpressionLessThanOrEqual() { - String lessThanOrEqualExpr = "age <= 5"; - List resolved = resolveSQLFilterToExpression(lessThanOrEqualExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); - assertTrue(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 5))); - assertFalse(rowDataPredicate.test(GenericRowData.of(null, 
StringData.fromString("1"), 6))); - } - - @Test - public void testVisitCallExpressionLessThan() { - String lessThanExpr = "age < 5"; - List resolved = resolveSQLFilterToExpression(lessThanExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); - assertFalse(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 5))); - assertTrue(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 4))); - } - - @Test - public void testVisitCallExpressionIsNotNull() { - String isNotNullExpr = "id is not NULL"; - List resolved = resolveSQLFilterToExpression(isNotNullExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); - assertTrue(rowDataPredicate.test(GenericRowData.of(1, StringData.fromString("1"), 6))); - assertFalse(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("2"), 6))); - } - - @Test - public void testVisitCallExpressionIsNull() { - String isNullExpr = "id is NULL"; - List resolved = resolveSQLFilterToExpression(isNullExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); - assertTrue(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 6))); - } - - @Test - public void testVisitCallExpressionEqualsAndGreaterThan() { - String andExpr = "id = NULL AND age > 5"; - List resolved = resolveSQLFilterToExpression(andExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate rowDataPredicate = resolved.get(0).accept(visitor).get(); - assertTrue(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 6))); - } - - @Test - public void testVisitCallExpressionEqualsOrLessThan() { - String orExpr = "id = NULL OR age < 5"; - List resolved = resolveSQLFilterToExpression(orExpr, schema); - assertEquals(1, resolved.size()); - RowDataPredicate rowDataPredicate = 
resolved.get(0).accept(visitor).get(); - assertTrue(rowDataPredicate.test(GenericRowData.of(null, StringData.fromString("1"), 6))); - assertFalse(rowDataPredicate.test(GenericRowData.of(1, StringData.fromString("2"), 5))); - assertTrue(rowDataPredicate.test(GenericRowData.of(1, StringData.fromString("2"), 4))); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestFlinkSource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestFlinkSource.java deleted file mode 100644 index ef5d08a68b..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestFlinkSource.java +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read; - -import static org.apache.amoro.flink.write.TestMixedFormatFileWriter.TARGET_FILE_SIZE; -import static org.apache.amoro.flink.write.TestMixedFormatFileWriter.createUnkeyedTaskWriter; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.flink.table.FlinkSource; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.util.DataUtil; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.streaming.api.CheckpointingMode; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamUtils; -import org.apache.flink.streaming.api.environment.CheckpointConfig; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.operators.collect.ClientAndIterator; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.junit.Assert; -import org.junit.Test; - -import java.io.IOException; -import java.time.LocalDateTime; -import java.time.ZoneOffset; -import java.util.Arrays; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Locale; -import java.util.Optional; -import java.util.Set; - -public class TestFlinkSource extends 
FlinkTestBase { - - protected static final FileFormat FILE_FORMAT = - FileFormat.valueOf("parquet".toUpperCase(Locale.ENGLISH)); - - public TestFlinkSource() { - super( - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(false, true)); - } - - protected static void commit(WriteResult result, Table table) { - AppendFiles append = table.newAppend(); - Arrays.stream(result.dataFiles()).forEach(append::appendFile); - append.commit(); - } - - protected static void write(Collection data, Table table, RowType rowType) - throws IOException { - try (TaskWriter taskWriter = - createUnkeyedTaskWriter(table, TARGET_FILE_SIZE, FILE_FORMAT, rowType)) { - data.forEach( - d -> { - try { - taskWriter.write(DataUtil.toRowData(d)); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - taskWriter.close(); - - commit(taskWriter.complete(), table); - } - } - - @Test - public void testUnkeyedTableDataStream() throws Exception { - Configuration conf = new Configuration(); - final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(conf); - - env.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE); - env.setParallelism(1); - env.getCheckpointConfig() - .enableExternalizedCheckpoints( - CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); - - List data = new LinkedList<>(); - LocalDateTime localDateTime = LocalDateTime.parse("2022-06-18T10:10:11.0"); - long timestamp = localDateTime.toInstant(ZoneOffset.UTC).toEpochMilli(); - data.add(new Object[] {1000004, "a", timestamp, localDateTime}); - data.add(new Object[] {1000015, "b", timestamp, localDateTime}); - data.add(new Object[] {1000011, "c", timestamp, localDateTime}); - data.add(new Object[] {1000014, "d", timestamp, localDateTime}); - data.add(new Object[] {1000021, "d", timestamp, localDateTime}); - data.add(new Object[] {1000015, "e", timestamp, localDateTime}); - - Collection expectedRecords = DataUtil.toRowData(data); - 
write(data, getMixedTable().asUnkeyedTable(), FLINK_ROW_TYPE); - - final CloseableIterator resultIterator = - FlinkSource.forRowData() - .env(env) - .context(Optional::of) - .project(FLINK_SCHEMA) - .tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) - .flinkConf(conf) - .properties( - new HashMap() { - { - put("streaming", "false"); - } - }) - .build() - .executeAndCollect(); - - Set rowData = new HashSet<>(); - resultIterator.forEachRemaining( - o -> - rowData.add( - GenericRowData.of( - o.getInt(0), o.getString(1), o.getLong(2), o.getTimestamp(3, 6)))); - - Assert.assertEquals(new HashSet<>(expectedRecords), rowData); - } - - @Test - public void testUnkeyedStreamingRead() throws Exception { - Configuration conf = new Configuration(); - final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(conf); - - env.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE); - env.setParallelism(1); - env.getCheckpointConfig() - .enableExternalizedCheckpoints( - CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); - - List data = new LinkedList<>(); - LocalDateTime localDateTime = LocalDateTime.parse("2022-06-18T10:10:11.0"); - long timestamp = localDateTime.toInstant(ZoneOffset.UTC).toEpochMilli(); - data.add(new Object[] {1000004, "a", timestamp, localDateTime}); - data.add(new Object[] {1000015, "b", timestamp, localDateTime}); - data.add(new Object[] {1000011, "c", timestamp, localDateTime}); - data.add(new Object[] {1000014, "d", timestamp, localDateTime}); - data.add(new Object[] {1000021, "d", timestamp, localDateTime}); - data.add(new Object[] {1000015, "e", timestamp, localDateTime}); - - Collection expectedRecords = DataUtil.toRowData(data); - write(data, getMixedTable().asUnkeyedTable(), FLINK_ROW_TYPE); - - DataStream ds = - FlinkSource.forRowData() - .env(env) - .context(Optional::of) - .project(FLINK_SCHEMA) - 
.tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) - .flinkConf(conf) - .build(); - - ClientAndIterator clientAndIterator = - DataStreamUtils.collectWithClient(ds, this.getClass().getName()); - - JobClient jobClient = clientAndIterator.client; - CloseableIterator iterator = clientAndIterator.iterator; - - Set rowData = new HashSet<>(); - while (iterator.hasNext()) { - RowData o = iterator.next(); - rowData.add( - GenericRowData.of(o.getInt(0), o.getString(1), o.getLong(2), o.getTimestamp(3, 6))); - if (rowData.size() == expectedRecords.size()) { - break; - } - } - jobClient.cancel(); - - Assert.assertEquals(new HashSet<>(expectedRecords), rowData); - } - - @Test - public void testUnkeyedSnapshotRead() throws Exception { - Configuration conf = new Configuration(); - final Table testTable = getMixedTable().asUnkeyedTable(); - final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(conf); - - env.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE); - env.setParallelism(1); - env.getCheckpointConfig() - .enableExternalizedCheckpoints( - CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); - - List s1 = new LinkedList<>(); - LocalDateTime localDateTime1 = LocalDateTime.parse("2022-06-18T10:10:11.0"); - long timestamp1 = localDateTime1.toInstant(ZoneOffset.UTC).toEpochMilli(); - s1.add(new Object[] {1000004, "a", timestamp1, localDateTime1}); - s1.add(new Object[] {1000015, "b", timestamp1, localDateTime1}); - s1.add(new Object[] {1000011, "c", timestamp1, localDateTime1}); - s1.add(new Object[] {1000014, "d", timestamp1, localDateTime1}); - s1.add(new Object[] {1000021, "d", timestamp1, localDateTime1}); - s1.add(new Object[] {1000015, "e", timestamp1, localDateTime1}); - - write(s1, testTable, FLINK_ROW_TYPE); - - List s2 = new LinkedList<>(); - LocalDateTime localDateTime2 = LocalDateTime.parse("2022-06-19T10:10:11.0"); - long timestamp2 = 
localDateTime2.toInstant(ZoneOffset.UTC).toEpochMilli(); - s2.add(new Object[] {12, "ac", timestamp2, localDateTime2}); - s2.add(new Object[] {52, "ad", timestamp2, localDateTime2}); - s2.add(new Object[] {15, "ad", timestamp2, localDateTime2}); - s2.add(new Object[] {26, "ae", timestamp2, localDateTime2}); - - Collection expectedRecords = DataUtil.toRowData(s2); - write(s2, testTable, FLINK_ROW_TYPE); - - testTable.refresh(); - Snapshot s = testTable.snapshots().iterator().next(); - - DataStream ds = - FlinkSource.forRowData() - .env(env) - .context(Optional::of) - .project(FLINK_SCHEMA) - .tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) - .flinkConf(conf) - .properties( - new HashMap() { - { - put("streaming", "true"); - put("start-snapshot-id", String.valueOf(s.snapshotId())); - } - }) - .build(); - - ClientAndIterator clientAndIterator = - DataStreamUtils.collectWithClient(ds, this.getClass().getName()); - - JobClient jobClient = clientAndIterator.client; - CloseableIterator iterator = clientAndIterator.iterator; - - Set rowData = new HashSet<>(); - while (iterator.hasNext()) { - RowData o = iterator.next(); - rowData.add( - GenericRowData.of(o.getInt(0), o.getString(1), o.getLong(2), o.getTimestamp(3, 6))); - if (rowData.size() == expectedRecords.size()) { - break; - } - } - jobClient.cancel(); - - Assert.assertEquals(new HashSet<>(expectedRecords), rowData); - - CloseableIterator resultIterator = - FlinkSource.forRowData() - .env(env) - .context(Optional::of) - .project(FLINK_SCHEMA) - .tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) - .flinkConf(conf) - .properties( - new HashMap() { - { - put("streaming", "false"); - put("snapshot-id", String.valueOf(s.snapshotId())); - } - }) - .build() - .executeAndCollect(); - - rowData.clear(); - resultIterator.forEachRemaining( - o -> - rowData.add( - GenericRowData.of( - o.getInt(0), o.getString(1), o.getLong(2), o.getTimestamp(3, 6)))); - - 
expectedRecords = DataUtil.toRowData(s1); - Assert.assertEquals(new HashSet<>(expectedRecords), rowData); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestFlinkSplitPlanner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestFlinkSplitPlanner.java deleted file mode 100644 index 859776b94c..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestFlinkSplitPlanner.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read; - -import org.apache.amoro.flink.read.hybrid.reader.TestRowDataReaderFunction; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.scan.ChangeTableIncrementalScan; -import org.apache.iceberg.Snapshot; -import org.junit.Assert; -import org.junit.Test; - -import java.io.IOException; -import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; - -public class TestFlinkSplitPlanner extends TestRowDataReaderFunction { - - @Test - public void testPlanSplitFromKeyedTable() { - testKeyedTable.baseTable().refresh(); - testKeyedTable.changeTable().refresh(); - List splitList = - FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger()); - Assert.assertEquals(7, splitList.size()); - } - - @Test - public void testIncrementalChangelog() throws IOException { - testKeyedTable.baseTable().refresh(); - testKeyedTable.changeTable().refresh(); - List splitList = - FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger()); - - Assert.assertEquals(7, splitList.size()); - - long startSnapshotId = testKeyedTable.changeTable().currentSnapshot().snapshotId(); - writeUpdate(); - testKeyedTable.changeTable().refresh(); - - Snapshot snapshot = testKeyedTable.changeTable().snapshot(startSnapshotId); - long fromSequence = snapshot.sequenceNumber(); - - long nowSnapshotId = testKeyedTable.changeTable().currentSnapshot().snapshotId(); - ChangeTableIncrementalScan changeTableScan = - testKeyedTable - .changeTable() - .newScan() - .useSnapshot(nowSnapshotId) - .fromSequence(fromSequence); - - List changeSplits = - FlinkSplitPlanner.planChangeTable(changeTableScan, new AtomicInteger()); - - Assert.assertEquals(1, changeSplits.size()); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestMixedFormatSource.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestMixedFormatSource.java deleted file mode 100644 index ecdf4c47e9..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/TestMixedFormatSource.java +++ /dev/null @@ -1,1128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read; - -import static org.apache.amoro.MockAmoroManagementServer.TEST_CATALOG_NAME; -import static org.apache.amoro.MockAmoroManagementServer.TEST_DB_NAME; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE_EARLIEST; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SCAN_STARTUP_MODE_LATEST; -import static org.apache.flink.util.Preconditions.checkArgument; -import static org.apache.flink.util.Preconditions.checkNotNull; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.flink.read.hybrid.reader.ReaderFunction; -import org.apache.amoro.flink.read.hybrid.reader.RowDataReaderFunction; -import org.apache.amoro.flink.read.hybrid.reader.TestRowDataReaderFunction; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.source.DataIterator; -import org.apache.amoro.flink.read.source.MixedFormatScanContext; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.flink.write.FlinkSink; -import org.apache.amoro.mixed.MixedFormatCatalog; -import org.apache.amoro.table.KeyedTable; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.TableIdentifier; -import org.apache.amoro.table.UnkeyedTable; -import org.apache.amoro.utils.TableFileUtil; -import org.apache.flink.api.common.JobID; -import org.apache.flink.api.common.JobStatus; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.restartstrategy.RestartStrategies; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.core.execution.JobClient; -import org.apache.flink.runtime.highavailability.nonha.embedded.HaLeadershipControl; -import 
org.apache.flink.runtime.minicluster.MiniCluster; -import org.apache.flink.runtime.minicluster.RpcServiceSharing; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamSource; -import org.apache.flink.streaming.api.datastream.DataStreamUtils; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.ChainingStrategy; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.api.operators.collect.ClientAndIterator; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; -import org.apache.flink.test.util.MiniClusterWithClientResource; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.CloseableIterator; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFiles; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.io.Serializable; -import java.time.Duration; -import 
java.time.ZoneOffset; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Optional; -import java.util.Set; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.atomic.AtomicInteger; - -public class TestMixedFormatSource extends TestRowDataReaderFunction implements Serializable { - private static final Logger LOG = LoggerFactory.getLogger(TestMixedFormatSource.class); - private static final long serialVersionUID = 7418812854449034756L; - private static final int PARALLELISM = 1; - - @Rule - public final MiniClusterWithClientResource miniClusterResource = - new MiniClusterWithClientResource( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(1) - .setNumberSlotsPerTaskManager(PARALLELISM) - .setRpcServiceSharing(RpcServiceSharing.DEDICATED) - .withHaLeadershipControl() - .build()); - - protected KeyedTable testFailoverTable; - protected static final String SINK_TABLE_NAME = "test_sink_exactly_once"; - protected static final TableIdentifier FAIL_TABLE_ID = - TableIdentifier.of( - TableTestHelper.TEST_CATALOG_NAME, TableTestHelper.TEST_DB_NAME, SINK_TABLE_NAME); - - @Before - public void testSetup() throws IOException { - MixedFormatCatalog testCatalog = getMixedFormatCatalog(); - - String db = FAIL_TABLE_ID.getDatabase(); - if (!testCatalog.listDatabases().contains(db)) { - testCatalog.createDatabase(db); - } - - if (!testCatalog.tableExists(FAIL_TABLE_ID)) { - testFailoverTable = - testCatalog - .newTableBuilder(FAIL_TABLE_ID, TABLE_SCHEMA) - .withPartitionSpec(BasicTableTestHelper.SPEC) - .withPrimaryKeySpec(BasicTableTestHelper.PRIMARY_KEY_SPEC) - .create() - .asKeyedTable(); - } - } - - @After - public void dropTable() { - miniClusterResource.cancelAllJobs(); - getMixedFormatCatalog().dropTable(FAIL_TABLE_ID, true); - 
getMixedFormatCatalog().dropTable(TableTestHelper.TEST_TABLE_ID, true); - getMixedFormatCatalog().dropDatabase(TableTestHelper.TEST_DB_NAME); - } - - @Test - public void testMixedFormatSourceStatic() throws Exception { - MixedFormatSource mixedFormatSource = initMixedFormatSource(false); - - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - // enable checkpoint - env.enableCheckpointing(3000); - // set the source parallelism to 4 - final CloseableIterator resultIterator = - env.fromSource( - mixedFormatSource, WatermarkStrategy.noWatermarks(), "MixedFormatParallelSource") - .setParallelism(PARALLELISM) - .executeAndCollect(); - - List actualResult = new ArrayList<>(); - - resultIterator.forEachRemaining( - row -> { - GenericRowData rowData = convert(row); - actualResult.add(rowData); - }); - RowData[] expected = expectedAfterMOR(); - assertArrayEquals(expected, actualResult); - } - - @Test - public void testMixedFormatSourceStaticJobManagerFailover() throws Exception { - testMixedFormatSource(FailoverType.JM); - } - - @Test - public void testMixedFormatSourceStaticTaskManagerFailover() throws Exception { - testMixedFormatSource(FailoverType.TM); - } - - public void testMixedFormatSource(FailoverType failoverType) throws Exception { - List expected = new ArrayList<>(expectedCollection()); - List updated = updateRecords(); - writeUpdate(updated); - List records = generateRecords(2, 1); - writeUpdate(records); - expected.addAll(updated); - expected.addAll(records); - - MixedFormatSource mixedFormatSource = initMixedFormatSource(false); - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - // enable checkpoint - env.enableCheckpointing(1000); - env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0)); - - DataStream input = - env.fromSource( - mixedFormatSource, WatermarkStrategy.noWatermarks(), "MixedFormatParallelSource") - .setParallelism(PARALLELISM); - - List expectedAfterMoR = 
new ArrayList<>(mor(expected)); - DataStream streamFailingInTheMiddleOfReading = - RecordCounterToFail.wrapWithFailureAfter(input, expectedAfterMoR.size() / 2); - - FlinkSink.forRowData(streamFailingInTheMiddleOfReading) - .context(Optional::of) - .table(testFailoverTable) - .tableLoader(MixedFormatTableLoader.of(FAIL_TABLE_ID, catalogBuilder)) - .flinkSchema(FLINK_SCHEMA) - .build(); - - JobClient jobClient = env.executeAsync("Bounded Mixed-Format Source Failover Test"); - JobID jobId = jobClient.getJobID(); - - RecordCounterToFail.waitToFail(); - triggerFailover( - failoverType, - jobId, - RecordCounterToFail::continueProcessing, - miniClusterResource.getMiniCluster()); - - assertRecords(testFailoverTable, expectedAfterMoR, Duration.ofMillis(10), 12000); - } - - @Test - public void testDimTaskManagerFailover() throws Exception { - List updated = updateRecords(); - writeUpdate(updated); - List records = generateRecords(2, 1); - writeUpdate(records); - - MixedFormatSource mixedFormatSource = initMixedFormatDimSource(true); - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - // enable checkpoint - env.enableCheckpointing(1000); - env.setRestartStrategy(RestartStrategies.fixedDelayRestart(10, 0)); - - DataStream input = - env.fromSource( - mixedFormatSource, WatermarkStrategy.noWatermarks(), "MixedFormatParallelSource") - .setParallelism(PARALLELISM); - - WatermarkAwareFailWrapper.wrapWithFailureAfter(input); - - JobClient jobClient = env.executeAsync("Dim Mixed-Format Source Failover Test"); - JobID jobId = jobClient.getJobID(); - - WatermarkAwareFailWrapper.waitToFail(); - triggerFailover( - FailoverType.TM, - jobId, - WatermarkAwareFailWrapper::continueProcessing, - miniClusterResource.getMiniCluster()); - - while (WatermarkAwareFailWrapper.watermarkCounter.get() != PARALLELISM) { - Thread.sleep(1000); - LOG.info("wait for watermark after failover"); - } - Assert.assertEquals(Long.MAX_VALUE, 
WatermarkAwareFailWrapper.getWatermarkAfterFailover()); - } - - @Test - public void testMixedFormatContinuousSource() throws Exception { - MixedFormatSource mixedFormatSource = initMixedFormatSource(true); - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - // enable checkpoint - env.enableCheckpointing(1000); - ClientAndIterator clientAndIterator = - executeAndCollectWithClient(env, mixedFormatSource); - - JobClient jobClient = clientAndIterator.client; - - List actualResult = - collectRecordsFromUnboundedStream(clientAndIterator, excepts().length); - - assertArrayEquals(excepts(), actualResult); - - LOG.info( - "begin write update_before update_after data and commit new snapshot to change table."); - writeUpdate(); - - actualResult = collectRecordsFromUnboundedStream(clientAndIterator, excepts2().length); - - assertArrayEquals(excepts2(), actualResult); - jobClient.cancel(); - } - - @Test - public void testMixedFormatContinuousSourceWithEmptyChangeInInit() throws Exception { - TableIdentifier tableId = - TableIdentifier.of(TEST_CATALOG_NAME, TEST_DB_NAME, "test_empty_change"); - KeyedTable table = - getMixedFormatCatalog() - .newTableBuilder(tableId, TABLE_SCHEMA) - .withPartitionSpec(BasicTableTestHelper.SPEC) - .withPrimaryKeySpec(BasicTableTestHelper.PRIMARY_KEY_SPEC) - .create() - .asKeyedTable(); - - TaskWriter taskWriter = createTaskWriter(true); - List baseData = - new ArrayList() { - { - add( - GenericRowData.ofKind( - RowKind.INSERT, - 1, - StringData.fromString("john"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - add( - GenericRowData.ofKind( - RowKind.INSERT, - 2, - StringData.fromString("lily"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - add( - GenericRowData.ofKind( - RowKind.INSERT, - 3, - StringData.fromString("jake"), - LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT.plusDays(1)))); - add( - 
GenericRowData.ofKind( - RowKind.INSERT, - 4, - StringData.fromString("sam"), - LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT.plusDays(1)))); - } - }; - for (RowData record : baseData) { - taskWriter.write(record); - } - commit(table, taskWriter.complete(), true); - - MixedFormatSource mixedFormatSource = - initMixedFormatSource(true, SCAN_STARTUP_MODE_EARLIEST, tableId); - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - // enable checkpoint - env.enableCheckpointing(1000); - ClientAndIterator clientAndIterator = - executeAndCollectWithClient(env, mixedFormatSource); - - JobClient jobClient = clientAndIterator.client; - - List actualResult = - collectRecordsFromUnboundedStream(clientAndIterator, baseData.size()); - - Assert.assertEquals(new HashSet<>(baseData), new HashSet<>(actualResult)); - - LOG.info( - "begin write update_before update_after data and commit new snapshot to change table."); - writeUpdate(updateRecords(), table); - writeUpdate(updateRecords(), table); - - actualResult = collectRecordsFromUnboundedStream(clientAndIterator, excepts2().length * 2); - jobClient.cancel(); - - Assert.assertEquals(new HashSet<>(updateRecords()), new HashSet<>(actualResult)); - getMixedFormatCatalog().dropTable(tableId, true); - } - - @Test - public void testMixedFormatSourceEnumeratorWithChangeExpired() throws Exception { - final String maxContinuousEmptyCommits = "flink.max-continuous-empty-commits"; - TableIdentifier tableId = TableIdentifier.of(TEST_CATALOG_NAME, TEST_DB_NAME, "test_keyed_tb"); - KeyedTable table = - getMixedFormatCatalog() - .newTableBuilder(tableId, TABLE_SCHEMA) - .withProperty(maxContinuousEmptyCommits, "1") - .withPrimaryKeySpec(BasicTableTestHelper.PRIMARY_KEY_SPEC) - .create() - .asKeyedTable(); - - TaskWriter taskWriter = createTaskWriter(table, false); - List changeData = - new ArrayList() { - { - add( - GenericRowData.ofKind( - RowKind.INSERT, - 1, - 
StringData.fromString("john"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - add( - GenericRowData.ofKind( - RowKind.INSERT, - 2, - StringData.fromString("lily"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - add( - GenericRowData.ofKind( - RowKind.INSERT, - 3, - StringData.fromString("jake"), - LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT.plusDays(1)))); - add( - GenericRowData.ofKind( - RowKind.INSERT, - 4, - StringData.fromString("sam"), - LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT.plusDays(1)))); - } - }; - for (RowData record : changeData) { - taskWriter.write(record); - } - - List changeDataFiles = new ArrayList<>(); - WriteResult result = taskWriter.complete(); - changeDataFiles.addAll(Arrays.asList(result.dataFiles())); - commit(table, result, false); - - for (DataFile dataFile : changeDataFiles) { - Assert.assertTrue(table.io().exists(dataFile.path().toString())); - } - - final Duration monitorInterval = Duration.ofSeconds(1); - MixedFormatSource mixedFormatSource = - initMixedFormatSourceWithMonitorInterval( - true, SCAN_STARTUP_MODE_EARLIEST, tableId, monitorInterval); - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - // enable checkpoint - env.enableCheckpointing(1000); - ClientAndIterator clientAndIterator = - executeAndCollectWithClient(env, mixedFormatSource); - - JobClient jobClient = clientAndIterator.client; - - List actualResult = - collectRecordsFromUnboundedStream(clientAndIterator, changeData.size()); - Assert.assertEquals(new HashSet<>(changeData), new HashSet<>(actualResult)); - - // expire changeTable snapshots - DeleteFiles deleteFiles = table.changeTable().newDelete(); - for (DataFile dataFile : changeDataFiles) { - Assert.assertTrue(table.io().exists(dataFile.path().toString())); - deleteFiles.deleteFile(dataFile); - } - deleteFiles.commit(); - - 
LOG.info("commit empty snapshot"); - AppendFiles changeAppend = table.changeTable().newAppend(); - changeAppend.commit(); - - final long timeWait = (monitorInterval.toMillis() * 2); - LOG.info("try sleep {}, wait snapshot expired and scan the empty snapshot.", timeWait); - Thread.sleep(timeWait); - - expireSnapshots(table.changeTable(), System.currentTimeMillis(), new HashSet<>()); - - writeUpdate(updateRecords(), table); - writeUpdate(updateRecords(), table); - - actualResult = collectRecordsFromUnboundedStream(clientAndIterator, excepts2().length * 2); - jobClient.cancel(); - - Assert.assertEquals(new HashSet<>(updateRecords()), new HashSet<>(actualResult)); - getMixedFormatCatalog().dropTable(tableId, true); - } - - @Test - public void testMixedFormatSourceEnumeratorWithBaseExpired() throws Exception { - final String maxContinuousEmptyCommits = "flink.max-continuous-empty-commits"; - TableIdentifier tableId = TableIdentifier.of(TEST_CATALOG_NAME, TEST_DB_NAME, "test_keyed_tb"); - KeyedTable table = - getMixedFormatCatalog() - .newTableBuilder(tableId, TABLE_SCHEMA) - .withProperty(maxContinuousEmptyCommits, "1") - .withPrimaryKeySpec(BasicTableTestHelper.PRIMARY_KEY_SPEC) - .create() - .asKeyedTable(); - - TaskWriter taskWriter = createTaskWriter(table, true); - List baseData = - new ArrayList() { - { - add( - GenericRowData.ofKind( - RowKind.INSERT, - 1, - StringData.fromString("john"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - add( - GenericRowData.ofKind( - RowKind.INSERT, - 2, - StringData.fromString("lily"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - add( - GenericRowData.ofKind( - RowKind.INSERT, - 3, - StringData.fromString("jake"), - LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT.plusDays(1)))); - add( - GenericRowData.ofKind( - RowKind.INSERT, - 4, - StringData.fromString("sam"), - LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), - 
TimestampData.fromLocalDateTime(LDT.plusDays(1)))); - } - }; - for (RowData record : baseData) { - taskWriter.write(record); - } - - List baseDataFiles = new ArrayList<>(); - WriteResult result = taskWriter.complete(); - baseDataFiles.addAll(Arrays.asList(result.dataFiles())); - commit(table, result, true); - - for (DataFile dataFile : baseDataFiles) { - Assert.assertTrue(table.io().exists(dataFile.path().toString())); - } - - final Duration monitorInterval = Duration.ofSeconds(1); - MixedFormatSource mixedFormatSource = - initMixedFormatSourceWithMonitorInterval( - true, SCAN_STARTUP_MODE_EARLIEST, tableId, monitorInterval); - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - // enable checkpoint - env.enableCheckpointing(1000); - ClientAndIterator clientAndIterator = - executeAndCollectWithClient(env, mixedFormatSource); - - JobClient jobClient = clientAndIterator.client; - - List actualResult = - collectRecordsFromUnboundedStream(clientAndIterator, baseData.size()); - Assert.assertEquals(new HashSet<>(baseData), new HashSet<>(actualResult)); - - // expire baseTable snapshots - DeleteFiles deleteFiles = table.baseTable().newDelete(); - for (DataFile dataFile : baseDataFiles) { - Assert.assertTrue(table.io().exists(dataFile.path().toString())); - deleteFiles.deleteFile(dataFile); - } - deleteFiles.commit(); - - LOG.info("commit empty snapshot"); - AppendFiles changeAppend = table.changeTable().newAppend(); - changeAppend.commit(); - - final long timeWait = (monitorInterval.toMillis() * 2); - LOG.info("try sleep {}, wait snapshot expired and scan the empty snapshot.", timeWait); - Thread.sleep(timeWait); - - expireSnapshots(table.baseTable(), System.currentTimeMillis(), new HashSet<>()); - - writeUpdate(updateRecords(), table); - writeUpdate(updateRecords(), table); - - actualResult = collectRecordsFromUnboundedStream(clientAndIterator, excepts2().length * 2); - jobClient.cancel(); - - Assert.assertEquals(new 
HashSet<>(updateRecords()), new HashSet<>(actualResult)); - getMixedFormatCatalog().dropTable(tableId, true); - } - - @Test - public void testLatestStartupMode() throws Exception { - MixedFormatSource mixedFormatSource = initMixedFormatSourceWithLatest(); - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - // enable checkpoint - env.enableCheckpointing(1000); - - ClientAndIterator clientAndIterator = - executeAndCollectWithClient(env, mixedFormatSource); - - JobClient jobClient = clientAndIterator.client; - - while (true) { - if (JobStatus.RUNNING == jobClient.getJobStatus().get()) { - Thread.sleep(500); - LOG.info( - "begin write update_before update_after data and commit new snapshot to change table."); - writeUpdate(); - break; - } - Thread.sleep(100); - } - - List actualResult = - collectRecordsFromUnboundedStream(clientAndIterator, excepts2().length); - - assertArrayEquals(excepts2(), actualResult); - jobClient.cancel(); - } - - @Test - public void testMixedFormatContinuousSourceJobManagerFailover() throws Exception { - testMixedFormatContinuousSource(FailoverType.JM); - } - - @Test - public void testMixedFormatContinuousSourceTaskManagerFailover() throws Exception { - testMixedFormatContinuousSource(FailoverType.TM); - } - - public void testMixedFormatContinuousSource(final FailoverType failoverType) throws Exception { - List expected = new ArrayList<>(Arrays.asList(excepts())); - writeUpdate(); - expected.addAll(Arrays.asList(excepts2())); - - MixedFormatSource mixedFormatSource = initMixedFormatSource(true); - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - // enable checkpoint - env.enableCheckpointing(1000); - // env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0)); - - DataStream input = - env.fromSource( - mixedFormatSource, WatermarkStrategy.noWatermarks(), "MixedFormatParallelSource") - .setParallelism(PARALLELISM); - - FlinkSink.forRowData(input) - 
.context(Optional::of) - .table(testFailoverTable) - .tableLoader(MixedFormatTableLoader.of(FAIL_TABLE_ID, catalogBuilder)) - .flinkSchema(FLINK_SCHEMA) - .build(); - - JobClient jobClient = env.executeAsync("Unbounded Mixed-Format Source Failover Test"); - JobID jobId = jobClient.getJobID(); - - for (int i = 1; i < 5; i++) { - Thread.sleep(10); - List records = generateRecords(2, i); - expected.addAll(records); - writeUpdate(records); - if (i == 2) { - triggerFailover(failoverType, jobId, () -> {}, miniClusterResource.getMiniCluster()); - } - } - - // wait longer for continuous source to reduce flakiness - // because CI servers tend to be overloaded. - assertRecords(testFailoverTable, expected, Duration.ofMillis(10), 12000); - jobClient.cancel(); - } - - private void assertRecords( - KeyedTable testFailoverTable, - List expected, - Duration checkInterval, - int maxCheckCount) - throws InterruptedException { - for (int i = 0; i < maxCheckCount; ++i) { - if (equalsRecords(expected, tableRecords(testFailoverTable), testFailoverTable.schema())) { - break; - } else { - Thread.sleep(checkInterval.toMillis()); - } - } - // success or failure, assert on the latest table state - equalsRecords(expected, tableRecords(testFailoverTable), testFailoverTable.schema()); - } - - private boolean equalsRecords(List expected, List tableRecords, Schema schema) { - try { - RowData[] expectedArray = sortRowDataCollection(expected); - RowData[] actualArray = sortRowDataCollection(tableRecords); - Assert.assertArrayEquals(expectedArray, actualArray); - return true; - } catch (Throwable e) { - return false; - } - } - - public static List tableRecords(final KeyedTable keyedTable) { - keyedTable.refresh(); - List mixedFormatSplits = - FlinkSplitPlanner.planFullTable(keyedTable, new AtomicInteger(0)); - - RowDataReaderFunction rowDataReaderFunction = - new RowDataReaderFunction( - new Configuration(), - keyedTable.schema(), - keyedTable.schema(), - keyedTable.primaryKeySpec(), - null, - true, 
- keyedTable.io()); - - List actual = new ArrayList<>(); - mixedFormatSplits.forEach( - split -> { - LOG.info("Mixed format split: {}.", split); - DataIterator dataIterator = rowDataReaderFunction.createDataIterator(split); - while (dataIterator.hasNext()) { - RowData rowData = dataIterator.next(); - LOG.info("{}", rowData); - actual.add(rowData); - } - }); - return actual; - } - - private List generateRecords(int numRecords, int index) { - int pk = 100; - List records = new ArrayList<>(numRecords); - for (int i = index; i < numRecords + index; i++) { - records.add( - GenericRowData.ofKind( - RowKind.INSERT, - pk + index, - StringData.fromString("jo" + index + i), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - records.add( - GenericRowData.ofKind( - RowKind.DELETE, - pk + index, - StringData.fromString("jo" + index + i), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - } - return records; - } - - // ------------------------------------------------------------------------ - // test utilities - // ------------------------------------------------------------------------ - - private enum FailoverType { - NONE, - TM, - JM - } - - private static void triggerFailover( - FailoverType type, JobID jobId, Runnable afterFailAction, MiniCluster miniCluster) - throws Exception { - switch (type) { - case NONE: - afterFailAction.run(); - break; - case TM: - restartTaskManager(afterFailAction, miniCluster); - break; - case JM: - triggerJobManagerFailover(jobId, afterFailAction, miniCluster); - break; - } - } - - private static void triggerJobManagerFailover( - JobID jobId, Runnable afterFailAction, MiniCluster miniCluster) throws Exception { - final HaLeadershipControl haLeadershipControl = miniCluster.getHaLeadershipControl().get(); - haLeadershipControl.revokeJobMasterLeadership(jobId).get(); - afterFailAction.run(); - haLeadershipControl.grantJobMasterLeadership(jobId).get(); - } - - private static void 
restartTaskManager(Runnable afterFailAction, MiniCluster miniCluster) - throws Exception { - miniCluster.terminateTaskManager(0).get(); - afterFailAction.run(); - miniCluster.startTaskManager(); - } - - private List collectRecordsFromUnboundedStream( - final ClientAndIterator client, final int numElements) { - - checkNotNull(client, "client"); - checkArgument(numElements > 0, "numElement must be > 0"); - - final ArrayList result = new ArrayList<>(numElements); - final Iterator iterator = client.iterator; - - CollectTask collectTask = new CollectTask(result, iterator, numElements); - new Thread(collectTask).start(); - - long start = System.currentTimeMillis(); - final long timeout = 60 * 1000; - long intervalOneSecond = 1; - while (collectTask.running) { - // TODO a more proper timeout strategy? - long timeFlies = System.currentTimeMillis() - start; - if (timeFlies / 1000 >= intervalOneSecond) { - LOG.info("Time flies: {} ms.", timeFlies); - intervalOneSecond++; - } - if (System.currentTimeMillis() - start > timeout) { - LOG.error( - "This task [{}] try to collect records from unbounded stream but timeout {}. As of now, collect result:{}.", - client.client.getJobID().toString(), - timeout, - result.toArray()); - break; - } - } - - Assert.assertEquals( - String.format( - "The stream ended before reaching the requested %d records. 
Only %d records were received, received list:%s.", - numElements, result.size(), Arrays.toString(result.toArray())), - numElements, - result.size()); - - return result; - } - - private static class CollectTask implements Runnable { - final ArrayList result; - final Iterator iterator; - final int limit; - - boolean running = true; - - public CollectTask(ArrayList result, Iterator iterator, int limit) { - this.result = result; - this.iterator = iterator; - this.limit = limit; - } - - @Override - public void run() { - while (iterator.hasNext()) { - result.add(convert(iterator.next())); - if (result.size() == limit) { - running = false; - return; - } - } - } - } - - private ClientAndIterator executeAndCollectWithClient( - StreamExecutionEnvironment env, MixedFormatSource mixedFormatSource) - throws Exception { - final DataStreamSource source = - env.fromSource( - mixedFormatSource, WatermarkStrategy.noWatermarks(), "MixedFormatParallelSource") - .setParallelism(PARALLELISM); - return DataStreamUtils.collectWithClient(source, "job_" + name.getMethodName()); - } - - private static GenericRowData convert(RowData row) { - GenericRowData rowData = new GenericRowData(row.getRowKind(), row.getArity()); - rowData.setField(0, row.getInt(0)); - rowData.setField(1, row.getString(1)); - rowData.setField(2, row.getLong(2)); - rowData.setField(3, row.getTimestamp(3, 6)); - return rowData; - } - - private static void expireSnapshots( - UnkeyedTable tableStore, long olderThan, Set exclude) { - LOG.debug("start expire snapshots, the exclude is {}", exclude); - final AtomicInteger toDeleteFiles = new AtomicInteger(0); - final AtomicInteger deleteFiles = new AtomicInteger(0); - Set parentDirectory = new HashSet<>(); - tableStore - .expireSnapshots() - .retainLast(1) - .expireOlderThan(olderThan) - .deleteWith( - file -> { - try { - if (!exclude.contains(file) - && !exclude.contains(new Path(file).getParent().toString())) { - tableStore.io().deleteFile(file); - } - parentDirectory.add(new 
Path(file).getParent().toString()); - deleteFiles.incrementAndGet(); - } catch (Throwable t) { - LOG.warn("failed to delete file {}", file, t); - } finally { - toDeleteFiles.incrementAndGet(); - } - }) - .cleanExpiredFiles(true) - .commit(); - parentDirectory.forEach( - parent -> TableFileUtil.deleteEmptyDirectory(tableStore.io(), parent, exclude)); - LOG.info("to delete {} files, success delete {} files", toDeleteFiles.get(), deleteFiles.get()); - } - - private MixedFormatSource initMixedFormatSource(boolean isStreaming) { - return initMixedFormatSource(isStreaming, SCAN_STARTUP_MODE_EARLIEST); - } - - private MixedFormatSource initMixedFormatSourceWithLatest() { - return initMixedFormatSource(true, SCAN_STARTUP_MODE_LATEST); - } - - private MixedFormatSource initMixedFormatSource( - boolean isStreaming, String scanStartupMode) { - MixedFormatTableLoader tableLoader = initLoader(); - MixedFormatScanContext mixedFormatScanContext = - initMixedFormatScanContext(isStreaming, scanStartupMode); - ReaderFunction rowDataReaderFunction = initRowDataReadFunction(); - TypeInformation typeInformation = - InternalTypeInfo.of(FlinkSchemaUtil.convert(testKeyedTable.schema())); - - return new MixedFormatSource<>( - tableLoader, - mixedFormatScanContext, - rowDataReaderFunction, - typeInformation, - testKeyedTable.name(), - false); - } - - private MixedFormatSource initMixedFormatSourceWithMonitorInterval( - boolean isStreaming, - String scanStartupMode, - TableIdentifier tableIdentifier, - Duration monitorInterval) { - MixedFormatTableLoader tableLoader = MixedFormatTableLoader.of(tableIdentifier, catalogBuilder); - MixedFormatScanContext mixedFormatScanContext = - initMixedFormatScanContext(isStreaming, scanStartupMode, monitorInterval); - MixedTable table = MixedFormatUtils.loadMixedTable(tableLoader); - ReaderFunction rowDataReaderFunction = initRowDataReadFunction(table.asKeyedTable()); - TypeInformation typeInformation = - 
InternalTypeInfo.of(FlinkSchemaUtil.convert(table.schema())); - - return new MixedFormatSource<>( - tableLoader, - mixedFormatScanContext, - rowDataReaderFunction, - typeInformation, - table.name(), - false); - } - - private MixedFormatSource initMixedFormatSource( - boolean isStreaming, String scanStartupMode, TableIdentifier tableIdentifier) { - return initMixedFormatSourceWithMonitorInterval( - isStreaming, scanStartupMode, tableIdentifier, Duration.ofMillis(500)); - } - - private MixedFormatSource initMixedFormatDimSource(boolean isStreaming) { - MixedFormatTableLoader tableLoader = initLoader(); - MixedFormatScanContext mixedFormatScanContext = - initMixedFormatScanContext(isStreaming, SCAN_STARTUP_MODE_EARLIEST); - ReaderFunction rowDataReaderFunction = initRowDataReadFunction(); - Schema schema = testKeyedTable.schema(); - Schema schemaWithWm = - TypeUtil.join( - schema, - new Schema(Types.NestedField.of(-1, true, "opt", Types.TimestampType.withoutZone()))); - TypeInformation typeInformation = - InternalTypeInfo.of(FlinkSchemaUtil.convert(schemaWithWm)); - - return new MixedFormatSource<>( - tableLoader, - mixedFormatScanContext, - rowDataReaderFunction, - typeInformation, - testKeyedTable.name(), - true); - } - - private RowDataReaderFunction initRowDataReadFunction() { - return initRowDataReadFunction(testKeyedTable); - } - - private RowDataReaderFunction initRowDataReadFunction(KeyedTable keyedTable) { - return new RowDataReaderFunction( - new Configuration(), - keyedTable.schema(), - keyedTable.schema(), - keyedTable.primaryKeySpec(), - null, - true, - keyedTable.io()); - } - - private MixedFormatScanContext initMixedFormatScanContext( - boolean isStreaming, String scanStartupMode, Duration monitorInterval) { - return MixedFormatScanContext.contextBuilder() - .streaming(isStreaming) - .scanStartupMode(scanStartupMode) - .monitorInterval(monitorInterval) - .build(); - } - - private MixedFormatScanContext initMixedFormatScanContext( - boolean isStreaming, 
String scanStartupMode) { - return MixedFormatScanContext.contextBuilder() - .streaming(isStreaming) - .scanStartupMode(scanStartupMode) - .monitorInterval(Duration.ofMillis(500)) - .build(); - } - - private MixedFormatTableLoader initLoader() { - return MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); - } - - // ------------------------------------------------------------------------ - // mini cluster failover utilities - // ------------------------------------------------------------------------ - - private static class RecordCounterToFail { - - private static AtomicInteger records; - private static CompletableFuture fail; - private static CompletableFuture continueProcessing; - - private static DataStream wrapWithFailureAfter(DataStream stream, int failAfter) { - - records = new AtomicInteger(); - fail = new CompletableFuture<>(); - continueProcessing = new CompletableFuture<>(); - return stream.map( - record -> { - final boolean halfOfInputIsRead = records.incrementAndGet() > failAfter; - final boolean notFailedYet = !fail.isDone(); - if (notFailedYet && halfOfInputIsRead) { - fail.complete(null); - continueProcessing.get(); - } - return record; - }); - } - - private static void waitToFail() throws ExecutionException, InterruptedException { - fail.get(); - } - - private static void continueProcessing() { - continueProcessing.complete(null); - } - } - - private static class WatermarkAwareFailWrapper { - - private static WatermarkFailoverTestOperator op; - private static long watermarkAfterFailover = -1; - private static final AtomicInteger watermarkCounter = new AtomicInteger(0); - - public static long getWatermarkAfterFailover() { - return watermarkAfterFailover; - } - - private static DataStream wrapWithFailureAfter(DataStream stream) { - op = new WatermarkFailoverTestOperator(); - return stream.transform("watermark failover", TypeInformation.of(RowData.class), op); - } - - private static void waitToFail() throws InterruptedException { 
- op.waitToFail(); - } - - private static void continueProcessing() { - op.continueProcessing(); - } - - static class WatermarkFailoverTestOperator extends AbstractStreamOperator - implements OneInputStreamOperator { - - private static final long serialVersionUID = 1L; - private static boolean fail = false; - private static boolean failoverHappened = false; - - public WatermarkFailoverTestOperator() { - super(); - chainingStrategy = ChainingStrategy.ALWAYS; - } - - private void waitToFail() throws InterruptedException { - while (!fail) { - LOG.info("Waiting to fail"); - Thread.sleep(1000); - } - } - - private void continueProcessing() { - failoverHappened = true; - LOG.info("failover happened"); - } - - @Override - public void open() throws Exception { - super.open(); - } - - @Override - public void processElement(StreamRecord element) throws Exception { - output.collect(element); - } - - @Override - public void processWatermark(Watermark mark) throws Exception { - LOG.info("processWatermark: {}", mark); - if (!failoverHappened && mark.getTimestamp() > 0) { - fail = true; - } - if (failoverHappened) { - LOG.info("failover happened, watermark: {}", mark); - Assert.assertEquals(Long.MAX_VALUE, mark.getTimestamp()); - if (watermarkAfterFailover == -1) { - watermarkAfterFailover = mark.getTimestamp(); - } else { - watermarkAfterFailover = Math.min(watermarkAfterFailover, mark.getTimestamp()); - } - watermarkCounter.incrementAndGet(); - } - super.processWatermark(mark); - } - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestKafkaConsumer.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestKafkaConsumer.java deleted file mode 100644 index 029ebf3e75..0000000000 --- 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestKafkaConsumer.java +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hidden.kafka; - -import static org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate.getProperties; -import static org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate.getPropertiesWithByteArray; -import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.KAFKA_CONTAINER; -import static org.apache.kafka.clients.CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG; -import static org.apache.kafka.clients.producer.ProducerConfig.TRANSACTIONAL_ID_CONFIG; -import static org.assertj.core.api.Assertions.assertThat; - -import org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate; -import org.apache.amoro.flink.kafka.testutils.KafkaContainerTest; -import org.apache.amoro.flink.write.hidden.kafka.TestBaseLog; -import org.apache.amoro.flink.write.hidden.kafka.TestHiddenLogOperators; -import org.apache.flink.streaming.connectors.kafka.internals.FlinkKafkaInternalProducer; -import org.apache.kafka.clients.consumer.ConsumerConfig; -import 
org.apache.kafka.clients.consumer.ConsumerRecords; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.clients.producer.ProducerRecord; -import org.apache.kafka.common.TopicPartition; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.Test; -import org.junit.jupiter.api.Assertions; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.time.Duration; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Properties; -import java.util.Set; -import java.util.UUID; -import java.util.stream.Collectors; - -public class TestKafkaConsumer extends TestBaseLog { - private static final Logger LOG = LoggerFactory.getLogger(TestKafkaConsumer.class); - - @BeforeClass - public static void prepare() throws Exception { - KAFKA_CONTAINER.start(); - } - - @AfterClass - public static void shutdown() throws Exception { - KAFKA_CONTAINER.close(); - } - - @Test - public void testTransactionalConsume() { - final String topic = "test-offset-flip"; - FlinkKafkaInternalProducer reuse = null; - final String transactionalIdPrefix = UUID.randomUUID().toString(); - try { - int numCount = 20; - Properties properties = new Properties(); - properties.put(BOOTSTRAP_SERVERS_CONFIG, KAFKA_CONTAINER.getBootstrapServers()); - properties = getProperties(KafkaConfigGenerate.getStandardProperties(properties)); - properties.put(TRANSACTIONAL_ID_CONFIG, transactionalIdPrefix + "flip"); - reuse = new FlinkKafkaInternalProducer<>(properties); - reuse.initTransactions(); - reuse.beginTransaction(); - for (int i = 1; i <= numCount; i++) { - reuse.send(new ProducerRecord<>(topic, "test-value-" + i)); - } - reuse.commitTransaction(); - int count = KafkaContainerTest.countAllRecords(topic, properties); - LOG.info("consumption = {}", count); - assertThat(count).isEqualTo(numCount); - } catch (Throwable e) { - LOG.error("error:", e); - if (reuse != null) { - reuse.abortTransaction(); - } - } finally 
{ - assert reuse != null; - reuse.close(Duration.ofMillis(1000)); - } - } - - @Test - public void testResetOffset() { - final int countNum = 20; - String topicIntern = TestHiddenLogOperators.TOPIC; - Properties properties = new Properties(); - properties.put(BOOTSTRAP_SERVERS_CONFIG, KAFKA_CONTAINER.getBootstrapServers()); - properties = getPropertiesWithByteArray(KafkaConfigGenerate.getStandardProperties(properties)); - // send - properties.put(TRANSACTIONAL_ID_CONFIG, "transactionalId1"); - FlinkKafkaInternalProducer reuse = new FlinkKafkaInternalProducer<>(properties); - reuse.initTransactions(); - reuse.beginTransaction(); - String[] expects = new String[countNum]; - for (int i = 0; i < countNum; i++) { - expects[i] = "test-value-" + i; - reuse.send(new ProducerRecord<>(TestHiddenLogOperators.TOPIC, expects[i].getBytes())); - } - reuse.commitTransaction(); - reuse.close(Duration.ofMillis(1000)); - - // read all - properties.put(ConsumerConfig.ISOLATION_LEVEL_CONFIG, "read_committed"); - KafkaConsumer consumer = new KafkaConsumer<>(properties); - Set topicPartitionList = - consumer.partitionsFor(topicIntern).stream() - .map(partitionInfo -> new TopicPartition(topicIntern, partitionInfo.partition())) - .collect(Collectors.toSet()); - TopicPartition partition0 = topicPartitionList.stream().iterator().next(); - consumer.assign(topicPartitionList); - consumer.seekToBeginning(consumer.assignment()); - ConsumerRecords consumerRecords = consumer.poll(Duration.ofMillis(1000)); - - int count = consumerRecords.count(); - assertThat(count).isEqualTo(countNum); - List actual = new ArrayList<>(); - consumerRecords.forEach(consumerRecord -> actual.add(new String(consumerRecord.value()))); - Assertions.assertArrayEquals(expects, actual.toArray(new String[0])); - - // seek - long seekOffset = 1L; - consumer.seek(partition0, seekOffset); - - consumerRecords = consumer.poll(Duration.ofMillis(1000)); - - count = consumerRecords.count(); - assertThat(count).isEqualTo(countNum - 
seekOffset); - List actualSeek = new ArrayList<>(); - consumerRecords.forEach(consumerRecord -> actualSeek.add(new String(consumerRecord.value()))); - String[] expect = Arrays.copyOfRange(expects, (int) seekOffset, countNum); - Assertions.assertArrayEquals(expect, actualSeek.toArray(new String[0])); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestKafkaSourceReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestKafkaSourceReader.java deleted file mode 100644 index 2747d9ec59..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestKafkaSourceReader.java +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.hidden.kafka; - -import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.KAFKA_CONTAINER; -import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.getPropertiesByTopic; -import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.readRecordsBytes; -import static org.apache.amoro.flink.shuffle.RowKindUtil.transformFromFlinkRowKind; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE; -import static org.junit.Assert.assertEquals; - -import org.apache.amoro.flink.kafka.testutils.KafkaContainerTest; -import org.apache.amoro.flink.read.source.log.kafka.LogKafkaPartitionSplit; -import org.apache.amoro.flink.read.source.log.kafka.LogKafkaPartitionSplitState; -import org.apache.amoro.flink.read.source.log.kafka.LogKafkaSource; -import org.apache.amoro.flink.read.source.log.kafka.LogKafkaSourceReader; -import org.apache.amoro.flink.shuffle.LogRecordV1; -import org.apache.amoro.flink.util.TestUtil; -import org.apache.amoro.flink.write.hidden.kafka.TestBaseLog; -import org.apache.amoro.flink.write.hidden.kafka.TestHiddenLogOperators; -import org.apache.amoro.log.FormatVersion; -import org.apache.amoro.log.LogData; -import org.apache.amoro.log.LogDataJsonDeserialization; -import org.apache.amoro.log.LogDataJsonSerialization; -import org.apache.amoro.utils.IdGenerator; -import org.apache.flink.api.common.eventtime.Watermark; -import org.apache.flink.api.connector.source.ReaderOutput; -import org.apache.flink.api.connector.source.SourceOutput; -import org.apache.flink.api.connector.source.SourceReader; -import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer; -import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; -import org.apache.flink.connector.testutils.source.reader.TestingReaderContext; -import org.apache.flink.table.data.RowData; -import 
org.apache.kafka.clients.consumer.ConsumerRecords; -import org.apache.kafka.clients.producer.KafkaProducer; -import org.apache.kafka.clients.producer.ProducerRecord; -import org.apache.kafka.common.TopicPartition; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TestName; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.Set; - -public class TestKafkaSourceReader { - private static final Logger LOG = LoggerFactory.getLogger(TestKafkaSourceReader.class); - private static String topic; - private static final int KAFKA_PARTITION_NUMS = 1; - private static final int NUM_SPLITS = 1; - private static final int NUM_RECORDS_PER_SPLIT = 10; - private static final int TOTAL_NUM_RECORDS = NUM_RECORDS_PER_SPLIT * NUM_SPLITS; - - @Rule public TestName testName = new TestName(); - - private static final byte[] JOB_ID = IdGenerator.generateUpstreamId(); - - @BeforeClass - public static void prepare() throws Exception { - KAFKA_CONTAINER.start(); - } - - @AfterClass - public static void shutdown() throws Exception { - KAFKA_CONTAINER.close(); - } - - @Before - public void initData() throws Exception { - topic = TestUtil.getUtMethodName(testName); - KafkaContainerTest.createTopics(KAFKA_PARTITION_NUMS, 1, topic); - write(topic, TOTAL_NUM_RECORDS); - } - - @Test - public void testSourceReaderFailover() throws Exception { - final String groupId = "testSourceReaderFailover"; - LogKafkaSourceReader reader = (LogKafkaSourceReader) createReader(groupId); - reader.addSplits(getSplits(NUM_SPLITS)); - ValidatingSourceOutput output = new ValidatingSourceOutput(); - List splitList; - long checkpointId = 0; - do { - checkpointId++; - reader.pollNext(output); - // Create a 
checkpoint for each message consumption, but not complete them. - splitList = reader.snapshotState(checkpointId); - } while (output.count() < TOTAL_NUM_RECORDS); - - // The completion of the last checkpoint should subsume all the previous checkpoints. - assertEquals(checkpointId, reader.getOffsetsToCommit().size()); - reader.notifyCheckpointComplete(checkpointId); - - // re-create and restore - reader = (LogKafkaSourceReader) createReader(groupId); - reader.addSplits(splitList); - List currentSplitList = reader.snapshotState(checkpointId); - currentSplitList.forEach(s -> assertEquals(TOTAL_NUM_RECORDS, s.getStartingOffset())); - } - - private ProducerRecord createLogData( - String topic, - int i, - int epicNo, - boolean flip, - LogDataJsonSerialization serialization) { - RowData rowData = TestHiddenLogOperators.createRowData(i); - LogData logData = - new LogRecordV1( - FormatVersion.FORMAT_VERSION_V1, - JOB_ID, - epicNo, - flip, - transformFromFlinkRowKind(rowData.getRowKind()), - rowData); - byte[] message = serialization.serialize(logData); - int partition = 0; - ProducerRecord producerRecord = - new ProducerRecord<>(topic, partition, null, null, message); - return producerRecord; - } - - private void write(String topic, int numRecords) throws Exception { - KafkaProducer producer = KafkaContainerTest.getProducer(); - LogDataJsonSerialization serialization = - new LogDataJsonSerialization<>(TestBaseLog.USER_SCHEMA, LogRecordV1.FIELD_GETTER_FACTORY); - for (int i = 0; i < numRecords; i++) { - producer.send(createLogData(topic, 0, 1, false, serialization)); - } - printDataInTopic(topic); - } - - public static void printDataInTopic(String topic) { - ConsumerRecords consumerRecords = readRecordsBytes(topic); - LogDataJsonDeserialization deserialization = - TestBaseLog.createLogDataDeserialization(); - consumerRecords.forEach( - consumerRecord -> { - try { - LOG.info("data in kafka: {}", deserialization.deserialize(consumerRecord.value())); - } catch (IOException e) { 
- e.printStackTrace(); - } - }); - } - - private SourceReader createReader(String groupId) { - List topics = new ArrayList<>(); - topics.add(topic); - LogKafkaSource kafkaSource = createKafkaSource(groupId, false, topics); - return kafkaSource.createReader(new TestingReaderContext()); - } - - private LogKafkaSource createKafkaSource(String groupId, boolean retract, List topics) { - Properties properties = getPropertiesByTopic(topic); - properties.put("group.id", groupId); - properties.put("auto.offset.reset", "earliest"); - - Map configuration = new HashMap<>(); - configuration.put(MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.key(), String.valueOf(retract)); - - return LogKafkaSource.builder(TestBaseLog.USER_SCHEMA, configuration) - .setTopics(topics) - .setStartingOffsets(OffsetsInitializer.earliest()) - .setProperties(properties) - .build(); - } - - protected List getSplits(int numRecordsPerSplit) { - List splits = new ArrayList<>(); - for (int i = 0; i < numRecordsPerSplit; i++) { - splits.add(getSplit(i, numRecordsPerSplit)); - } - return splits; - } - - protected LogKafkaPartitionSplit getSplit(int splitId, int numRecords) { - long stoppingOffset = KafkaPartitionSplit.NO_STOPPING_OFFSET; - KafkaPartitionSplit kafkaPartitionSplit = - new KafkaPartitionSplit(new TopicPartition(topic, splitId), 0L, stoppingOffset); - return new LogKafkaPartitionSplit(new LogKafkaPartitionSplitState(kafkaPartitionSplit)); - } - - // ---------------- helper classes ----------------- - - /** A source output that validates the output. 
*/ - public static class ValidatingSourceOutput implements ReaderOutput { - private final Set consumedValues = new HashSet<>(); - private final int max = Integer.MIN_VALUE; - private final int min = Integer.MAX_VALUE; - - private int count = 0; - - @Override - public void collect(RowData rowData) { - count++; - consumedValues.add(rowData); - } - - @Override - public void collect(RowData rowData, long timestamp) { - collect(rowData); - } - - @Override - public void emitWatermark(Watermark watermark) {} - - public void validate() { - assertEquals( - String.format("Should be %d distinct elements in total", TOTAL_NUM_RECORDS), - TOTAL_NUM_RECORDS, - consumedValues.size()); - assertEquals( - String.format("Should be %d elements in total", TOTAL_NUM_RECORDS), - TOTAL_NUM_RECORDS, - count); - assertEquals("The min value should be 0", 0, min); - assertEquals( - "The max value should be " + (TOTAL_NUM_RECORDS - 1), TOTAL_NUM_RECORDS - 1, max); - } - - public int count() { - return count; - } - - @Override - public void markIdle() {} - - @Override - public void markActive() {} - - @Override - public SourceOutput createOutputForSplit(String splitId) { - return this; - } - - @Override - public void releaseOutputForSplit(String splitId) {} - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestLogKafkaPartitionSplitReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestLogKafkaPartitionSplitReader.java deleted file mode 100644 index 1c45d4abb9..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hidden/kafka/TestLogKafkaPartitionSplitReader.java +++ /dev/null @@ -1,306 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hidden.kafka; - -import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.KAFKA_CONTAINER; -import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.readRecordsBytes; -import static org.apache.amoro.flink.shuffle.RowKindUtil.transformFromFlinkRowKind; -import static org.junit.Assert.assertEquals; - -import org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate; -import org.apache.amoro.flink.kafka.testutils.KafkaContainerTest; -import org.apache.amoro.flink.read.source.log.LogSourceHelper; -import org.apache.amoro.flink.read.source.log.kafka.LogKafkaPartitionSplitReader; -import org.apache.amoro.flink.read.source.log.kafka.LogRecordWithRetractInfo; -import org.apache.amoro.flink.shuffle.LogRecordV1; -import org.apache.amoro.flink.write.hidden.kafka.TestBaseLog; -import org.apache.amoro.flink.write.hidden.kafka.TestHiddenLogOperators; -import org.apache.amoro.log.FormatVersion; -import org.apache.amoro.log.LogData; -import org.apache.amoro.log.LogDataJsonDeserialization; -import org.apache.amoro.log.LogDataJsonSerialization; -import org.apache.amoro.utils.IdGenerator; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import 
org.apache.flink.connector.base.source.reader.splitreader.SplitsAddition; -import org.apache.flink.connector.base.source.reader.splitreader.SplitsChange; -import org.apache.flink.connector.kafka.source.metrics.KafkaSourceReaderMetrics; -import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplit; -import org.apache.flink.connector.testutils.source.reader.TestingReaderContext; -import org.apache.flink.metrics.groups.SourceReaderMetricGroup; -import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; -import org.apache.flink.table.data.RowData; -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.consumer.ConsumerRecords; -import org.apache.kafka.clients.producer.KafkaProducer; -import org.apache.kafka.clients.producer.ProducerRecord; -import org.apache.kafka.common.TopicPartition; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.Set; - -public class TestLogKafkaPartitionSplitReader { - - private static final Logger LOG = LoggerFactory.getLogger(TestLogKafkaPartitionSplitReader.class); - - public static final int TOPIC1_STOP_OFFSET = 16; - public static final int TOPIC2_STOP_OFFSET = 21; - public static final String TOPIC1 = "topic1"; - public static final String TOPIC2 = "topic2"; - private static Map> splitsByOwners; - private static final byte[] JOB_ID = IdGenerator.generateUpstreamId(); - - @BeforeClass - public static void prepare() throws Exception { - KAFKA_CONTAINER.start(); - - Map earliestOffsets = new HashMap<>(); - earliestOffsets.put(new TopicPartition(TOPIC1, 0), 0L); - earliestOffsets.put(new TopicPartition(TOPIC2, 0), 5L); - splitsByOwners = 
getSplitsByOwners(earliestOffsets); - } - - @AfterClass - public static void shutdown() throws Exception { - KAFKA_CONTAINER.close(); - } - - @Before - public void initData() throws Exception { - // |0 1 2 3 4 5 6 7 8 9 Flip 10 11 12 13 14| 15 16 17 18 19 - write(TOPIC1, 0); - // 0 0 0 0 0 |5 6 7 8 9 10 11 12 13 14 Flip 15 16 17 18 19| 20 21 22 23 24 - write(TOPIC2, 5); - } - - @Test - public void testHandleSplitChangesAndFetch() throws IOException { - LogKafkaPartitionSplitReader reader = createReader(new Properties()); - assignSplitsAndFetchUntilFinish(reader, 0, 20); - assignSplitsAndFetchUntilFinish(reader, 1, 20); - } - - private ProducerRecord createLogData( - String topic, - int i, - int epicNo, - boolean flip, - LogDataJsonSerialization serialization) { - RowData rowData = TestHiddenLogOperators.createRowData(i); - LogData logData = - new LogRecordV1( - FormatVersion.FORMAT_VERSION_V1, - JOB_ID, - epicNo, - flip, - transformFromFlinkRowKind(rowData.getRowKind()), - rowData); - byte[] message = serialization.serialize(logData); - int partition = 0; - ProducerRecord producerRecord = - new ProducerRecord<>(topic, partition, null, null, message); - return producerRecord; - } - - private void write(String topic, int offset) throws Exception { - KafkaProducer producer = KafkaContainerTest.getProducer(); - LogDataJsonSerialization serialization = - new LogDataJsonSerialization<>(TestBaseLog.USER_SCHEMA, LogRecordV1.FIELD_GETTER_FACTORY); - for (int j = 0; j < offset; j++) { - producer.send(createLogData(topic, 0, 1, false, serialization)); - } - - int i = offset; - // 0-4 + offset success - for (; i < offset + 5; i++) { - producer.send(createLogData(topic, i, 1, false, serialization)); - } - - // 5-9 + offset fail - for (; i < offset + 10; i++) { - producer.send(createLogData(topic, i, 2, false, serialization)); - } - - producer.send(createLogData(topic, i, 1, true, serialization)); - - // 10-14 + offset success - for (; i < offset + 15; i++) { - 
producer.send(createLogData(topic, i, 2, false, serialization)); - } - - for (; i < offset + 20; i++) { - producer.send(createLogData(topic, i, 3, false, serialization)); - } - printDataInTopic(topic); - } - - public static void printDataInTopic(String topic) { - ConsumerRecords consumerRecords = readRecordsBytes(topic); - LogDataJsonDeserialization deserialization = - TestBaseLog.createLogDataDeserialization(); - consumerRecords.forEach( - consumerRecord -> { - try { - LOG.info("data in kafka: {}", deserialization.deserialize(consumerRecord.value())); - } catch (IOException e) { - e.printStackTrace(); - } - }); - } - - private void assignSplitsAndFetchUntilFinish( - LogKafkaPartitionSplitReader reader, int readerId, int expectedRecordCount) - throws IOException { - Map splits = assignSplits(reader, splitsByOwners.get(readerId)); - - Map numConsumedRecords = new HashMap<>(); - Set finishedSplits = new HashSet<>(); - int flipCount = 0; - while (finishedSplits.size() < splits.size()) { - RecordsWithSplitIds> recordsBySplitIds = reader.fetch(); - String splitId = recordsBySplitIds.nextSplit(); - while (splitId != null) { - // Collect the records in this split. - List> splitFetch = new ArrayList<>(); - ConsumerRecord record; - boolean hasFlip = false; - while ((record = recordsBySplitIds.nextRecordFromSplit()) != null) { - LOG.info( - "read: {}, offset: {}", - ((LogRecordWithRetractInfo) record).getLogData().getActualValue(), - record.offset()); - if (((LogRecordWithRetractInfo) record).isRetracting()) { - hasFlip = true; - } - splitFetch.add((LogRecordWithRetractInfo) record); - } - if (hasFlip) { - flipCount++; - } - // verify the consumed records. - if (verifyConsumed(splits.get(splitId), splitFetch, flipCount)) { - finishedSplits.add(splitId); - } - numConsumedRecords.compute( - splitId, - (ignored, recordCount) -> - recordCount == null ? 
splitFetch.size() : recordCount + splitFetch.size()); - splitId = recordsBySplitIds.nextSplit(); - } - } - - // Verify the number of records consumed from each split. - numConsumedRecords.forEach( - (splitId, recordCount) -> { - assertEquals( - String.format("%s should have %d records.", splits.get(splitId), expectedRecordCount), - expectedRecordCount, - (long) recordCount); - }); - } - - public static Map> getSplitsByOwners( - Map earliestOffsets) { - final Map> splitsByOwners = new HashMap<>(); - splitsByOwners.put( - 0, - new HashMap() { - { - TopicPartition tp = new TopicPartition(TOPIC1, 0); - put( - KafkaPartitionSplit.toSplitId(tp), - new KafkaPartitionSplit(tp, earliestOffsets.get(tp), TOPIC1_STOP_OFFSET)); - } - }); - splitsByOwners.put( - 1, - new HashMap() { - { - TopicPartition tp = new TopicPartition(TOPIC2, 0); - put( - KafkaPartitionSplit.toSplitId(tp), - new KafkaPartitionSplit(tp, earliestOffsets.get(tp), TOPIC2_STOP_OFFSET)); - } - }); - return splitsByOwners; - } - - private Map assignSplits( - LogKafkaPartitionSplitReader reader, Map splits) { - SplitsChange splitsChange = - new SplitsAddition<>(new ArrayList<>(splits.values())); - reader.handleSplitsChanges(splitsChange); - return splits; - } - - private LogKafkaPartitionSplitReader createReader(Properties additionalProperties) { - Properties props = KafkaConfigGenerate.getPropertiesWithByteArray(); - props.put("group.id", "test"); - props.put("auto.offset.reset", "earliest"); - if (!additionalProperties.isEmpty()) { - props.putAll(additionalProperties); - } - SourceReaderMetricGroup sourceReaderMetricGroup = - UnregisteredMetricsGroup.createSourceReaderMetricGroup(); - return new LogKafkaPartitionSplitReader( - props, - new TestingReaderContext(new Configuration(), sourceReaderMetricGroup), - new KafkaSourceReaderMetrics(sourceReaderMetricGroup), - TestBaseLog.USER_SCHEMA, - true, - new LogSourceHelper(), - "all-kinds"); - } - - private boolean verifyConsumed( - final KafkaPartitionSplit 
split, - final Collection> consumed, - final int valueOffsetDiffInOrderedRead) { - long currentOffset = -1; - - for (LogRecordWithRetractInfo record : consumed) { - if (record.isRetracting()) { - assertEquals(record.offset(), record.getActualValue().getInt(1)); - } else { - assertEquals( - record.offset(), record.getActualValue().getInt(1) + valueOffsetDiffInOrderedRead); - } - - currentOffset = Math.max(currentOffset, record.offset()); - } - if (split.getStoppingOffset().isPresent()) { - return currentOffset == split.getStoppingOffset().get() - 1; - } else { - return false; - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestShuffleSplitAssigner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestShuffleSplitAssigner.java deleted file mode 100644 index eef62b8dbe..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestShuffleSplitAssigner.java +++ /dev/null @@ -1,257 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.hybrid.assigner; - -import org.apache.amoro.data.DataTreeNode; -import org.apache.amoro.flink.read.FlinkSplitPlanner; -import org.apache.amoro.flink.read.hybrid.reader.RowDataReaderFunction; -import org.apache.amoro.flink.read.hybrid.reader.TestRowDataReaderFunction; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.source.DataIterator; -import org.apache.flink.api.connector.source.ReaderInfo; -import org.apache.flink.api.connector.source.SourceEvent; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.flink.api.connector.source.SplitsAssignment; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.metrics.groups.SplitEnumeratorMetricGroup; -import org.apache.flink.table.data.RowData; -import org.junit.Assert; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.concurrent.Callable; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.BiConsumer; -import java.util.stream.Collectors; - -public class TestShuffleSplitAssigner extends TestRowDataReaderFunction { - private static final Logger LOG = LoggerFactory.getLogger(TestShuffleSplitAssigner.class); - - @Test - public void testSingleParallelism() { - ShuffleSplitAssigner shuffleSplitAssigner = instanceSplitAssigner(1); - - List splitList = - FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger()); - shuffleSplitAssigner.onDiscoveredSplits(splitList); - List actual = new ArrayList<>(); - - while (true) { - Split splitOpt = shuffleSplitAssigner.getNext(0); - if (splitOpt.isAvailable()) { - actual.add(splitOpt.split()); - } else { - break; - } - } - - Assert.assertEquals(splitList.size(), actual.size()); - } - - @Test - public void 
testMultiParallelism() { - ShuffleSplitAssigner shuffleSplitAssigner = instanceSplitAssigner(3); - - List splitList = - FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger()); - shuffleSplitAssigner.onDiscoveredSplits(splitList); - List actual = new ArrayList<>(); - - int subtaskId = 2; - while (subtaskId >= 0) { - Split splitOpt = shuffleSplitAssigner.getNext(subtaskId); - if (splitOpt.isAvailable()) { - actual.add(splitOpt.split()); - } else { - LOG.info("Subtask id {}, splits {}.\n {}", subtaskId, actual.size(), actual); - --subtaskId; - } - } - - Assert.assertEquals(splitList.size(), actual.size()); - } - - @Test - public void testTreeNodeMaskUpdate() { - ShuffleSplitAssigner shuffleSplitAssigner = instanceSplitAssigner(3); - long[][] treeNodes = - new long[][] { - {3, 0}, {3, 1}, {3, 2}, {3, 3}, {7, 0}, {7, 1}, {7, 2}, {7, 3}, {7, 4}, {1, 0}, {1, 1}, - {0, 0}, {7, 7}, {15, 15} - }; - long[][] expectNodes = - new long[][] { - {3, 0}, {3, 1}, {3, 2}, {3, 3}, {3, 0}, {3, 1}, {3, 2}, {3, 3}, {3, 0}, {3, 0}, {3, 2}, - {3, 1}, {3, 3}, {3, 0}, {3, 2}, {3, 1}, {3, 3}, {3, 3}, {3, 3} - }; - - List actualNodes = new ArrayList<>(); - - for (long[] node : treeNodes) { - MixedFormatSplit mixedFormatSplit = - new MixedFormatSplit() { - DataTreeNode dataTreeNode = DataTreeNode.of(node[0], node[1]); - - @Override - public Integer taskIndex() { - return null; - } - - @Override - public void updateOffset(Object[] recordOffsets) {} - - @Override - public MixedFormatSplit copy() { - return null; - } - - @Override - public DataTreeNode dataTreeNode() { - return this.dataTreeNode; - } - - @Override - public void modifyTreeNode(DataTreeNode expected) { - this.dataTreeNode = expected; - } - - @Override - public String splitId() { - return null; - } - - @Override - public String toString() { - return dataTreeNode.toString(); - } - }; - List exactTreeNodes = - shuffleSplitAssigner.getExactlyTreeNodes(mixedFormatSplit); - actualNodes.addAll(exactTreeNodes); - } - long[][] 
result = - actualNodes.stream() - .map(treeNode -> new long[] {treeNode.mask(), treeNode.index()}) - .toArray(value -> new long[actualNodes.size()][]); - - Assert.assertArrayEquals(expectNodes, result); - } - - @Test - public void testNodeUpMoved() throws IOException { - writeUpdateWithSpecifiedMaskOne(); - List mixedFormatSplits = - FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger(0)); - int totalParallelism = 3; - ShuffleSplitAssigner assigner = instanceSplitAssigner(totalParallelism); - assigner.onDiscoveredSplits(mixedFormatSplits); - RowDataReaderFunction rowDataReaderFunction = - new RowDataReaderFunction( - new Configuration(), - testKeyedTable.schema(), - testKeyedTable.schema(), - testKeyedTable.primaryKeySpec(), - null, - true, - testKeyedTable.io()); - int subtaskId = 0; - Split split; - List actual = new ArrayList<>(); - LOG.info("subtaskId={}...", subtaskId); - do { - split = assigner.getNext(subtaskId); - if (split.isAvailable()) { - DataIterator dataIterator = - rowDataReaderFunction.createDataIterator(split.split()); - while (dataIterator.hasNext()) { - RowData rowData = dataIterator.next(); - LOG.info("{}", rowData); - actual.add(rowData); - } - } else { - subtaskId = subtaskId + 1; - LOG.info("subtaskId={}...", subtaskId); - } - } while (subtaskId < totalParallelism); - - List excepts = expectedCollection(); - excepts.addAll(generateRecords()); - RowData[] array = - excepts.stream() - .sorted(Comparator.comparing(RowData::toString)) - .collect(Collectors.toList()) - .toArray(new RowData[excepts.size()]); - assertArrayEquals(array, actual); - } - - protected ShuffleSplitAssigner instanceSplitAssigner(int parallelism) { - SplitEnumeratorContext splitEnumeratorContext = - new InternalSplitEnumeratorContext(parallelism); - return new ShuffleSplitAssigner(splitEnumeratorContext); - } - - protected static class InternalSplitEnumeratorContext - implements SplitEnumeratorContext { - private final int parallelism; - - public 
InternalSplitEnumeratorContext(int parallelism) { - this.parallelism = parallelism; - } - - @Override - public SplitEnumeratorMetricGroup metricGroup() { - return null; - } - - @Override - public void sendEventToSourceReader(int subtaskId, SourceEvent event) {} - - @Override - public int currentParallelism() { - return parallelism; - } - - @Override - public Map registeredReaders() { - return null; - } - - @Override - public void assignSplits(SplitsAssignment newSplitAssignments) {} - - @Override - public void signalNoMoreSplits(int subtask) {} - - @Override - public void callAsync(Callable callable, BiConsumer handler) {} - - @Override - public void callAsync( - Callable callable, BiConsumer handler, long initialDelay, long period) {} - - @Override - public void runInCoordinatorThread(Runnable runnable) {} - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestSplitAssignerAwaiting.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestSplitAssignerAwaiting.java deleted file mode 100644 index 71c85609ae..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestSplitAssignerAwaiting.java +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.assigner; - -import org.apache.amoro.flink.read.FlinkSplitPlanner; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; -import org.junit.Assert; -import org.junit.Test; - -import java.util.Collection; -import java.util.List; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; - -public class TestSplitAssignerAwaiting extends TestShuffleSplitAssigner { - - @Test - public void testEmpty() { - ShuffleSplitAssigner splitAssigner = instanceSplitAssigner(1); - Split split = splitAssigner.getNext(0); - Assert.assertNotNull(split); - Assert.assertEquals(Split.Status.UNAVAILABLE, split.status()); - } - - @Test - public void testStaticAssign() { - ShuffleSplitAssigner splitAssigner = instanceSplitAssigner(1); - List splitList = - FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger()); - - splitAssigner.onDiscoveredSplits(splitList); - assertSnapshot(splitAssigner, 7); - assertAllSplits(splitAssigner, 7); - - splitAssigner.onUnassignedSplits(splitList.subList(0, 6)); - assertSnapshot(splitAssigner, 6); - assertAllSplits(splitAssigner, 6); - } - - @Test - public void testContinueAssign() { - ShuffleSplitAssigner assigner = instanceSplitAssigner(1); - assertGetNext(assigner, Split.Status.UNAVAILABLE); - - List splitList = - FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger()); - List splits1 = 
splitList.subList(0, 1); - assertAvailableFuture(assigner, () -> assigner.onDiscoveredSplits(splits1)); - List splits2 = splitList.subList(1, 2); - assertAvailableFuture(assigner, () -> assigner.onUnassignedSplits(splits2)); - - assigner.onDiscoveredSplits(splitList.subList(2, 4)); - assertSnapshot(assigner, 2); - assertAllSplits(assigner, 2); - assertSnapshot(assigner, 0); - } - - private void assertAllSplits(ShuffleSplitAssigner splitAssigner, int splitCount) { - for (int i = 0; i < splitCount + 2; i++) { - if (i < splitCount) { - assertGetNext(splitAssigner, Split.Status.AVAILABLE); - } else { - assertGetNext(splitAssigner, Split.Status.UNAVAILABLE); - } - } - } - - private void assertAvailableFuture(ShuffleSplitAssigner assigner, Runnable addSplitsRunnable) { - // register callback - AtomicBoolean futureCompleted = new AtomicBoolean(); - CompletableFuture future = assigner.isAvailable(); - future.thenAccept(ignored -> futureCompleted.set(true)); - // calling isAvailable again should return the same object reference - // note that thenAccept will return a new future. 
- // we want to assert the same instance on the assigner returned future - Assert.assertSame(future, assigner.isAvailable()); - - // now add some splits - addSplitsRunnable.run(); - Assert.assertTrue(futureCompleted.get()); - - for (int i = 0; i < 1; ++i) { - assertGetNext(assigner, Split.Status.AVAILABLE); - } - assertGetNext(assigner, Split.Status.UNAVAILABLE); - assertSnapshot(assigner, 0); - } - - private void assertGetNext(ShuffleSplitAssigner assigner, Split.Status expectedStatus) { - Split result = assigner.getNext(0); - Assert.assertEquals(expectedStatus, result.status()); - switch (expectedStatus) { - case AVAILABLE: - Assert.assertNotNull(result.split()); - break; - case UNAVAILABLE: - Assert.assertNull(result.split()); - break; - default: - Assert.fail("Unknown status: " + expectedStatus); - } - } - - private void assertSnapshot(ShuffleSplitAssigner assigner, int splitCount) { - Collection stateBeforeGet = assigner.state(); - Assert.assertEquals(splitCount, stateBeforeGet.size()); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestStaticSplitAssigner.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestStaticSplitAssigner.java deleted file mode 100644 index 3c614e921d..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/assigner/TestStaticSplitAssigner.java +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.assigner; - -import org.apache.amoro.flink.read.FlinkSplitPlanner; -import org.apache.amoro.flink.read.hybrid.reader.TestRowDataReaderFunction; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.junit.Assert; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; - -public class TestStaticSplitAssigner extends TestRowDataReaderFunction { - private static final Logger LOG = LoggerFactory.getLogger(TestStaticSplitAssigner.class); - - @Test - public void testSingleParallelism() throws IOException { - try (StaticSplitAssigner staticSplitAssigner = instanceStaticSplitAssigner()) { - List splitList = - FlinkSplitPlanner.mergeOnReadPlan( - testKeyedTable, Collections.emptyList(), new AtomicInteger()); - staticSplitAssigner.onDiscoveredSplits(splitList); - List actual = new ArrayList<>(); - - while (true) { - Split splitOpt = staticSplitAssigner.getNext(0); - if (splitOpt.isAvailable()) { - actual.add(splitOpt.split()); - } else { - break; - } - } - - Assert.assertEquals(splitList.size(), actual.size()); - } - } - - @Test - public void testMultiParallelism() throws IOException { - try (StaticSplitAssigner staticSplitAssigner = instanceStaticSplitAssigner()) { - List splitList = - FlinkSplitPlanner.mergeOnReadPlan( - testKeyedTable, Collections.emptyList(), new AtomicInteger()); - 
staticSplitAssigner.onDiscoveredSplits(splitList); - List actual = new ArrayList<>(); - - int subtaskId = 2; - while (subtaskId >= 0) { - Split splitOpt = staticSplitAssigner.getNext(subtaskId); - if (splitOpt.isAvailable()) { - actual.add(splitOpt.split()); - } else { - LOG.info("Subtask id {}, splits {}.\n {}", subtaskId, actual.size(), actual); - --subtaskId; - } - } - - Assert.assertEquals(splitList.size(), actual.size()); - } - } - - protected StaticSplitAssigner instanceStaticSplitAssigner() { - return new StaticSplitAssigner(null); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestContinuousSplitPlannerImpl.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestContinuousSplitPlannerImpl.java deleted file mode 100644 index ff41fb5319..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestContinuousSplitPlannerImpl.java +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.catalog.CatalogTestHelper; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.table.KeyedTable; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.io.TaskWriter; -import org.junit.Before; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.time.ZoneOffset; -import java.util.ArrayList; -import java.util.List; - -public class TestContinuousSplitPlannerImpl extends FlinkTestBase { - private static final Logger LOG = LoggerFactory.getLogger(TestContinuousSplitPlannerImpl.class); - protected static final RowType ROW_TYPE = FlinkSchemaUtil.convert(TABLE_SCHEMA); - protected KeyedTable testKeyedTable; - - protected static final LocalDateTime LDT = - LocalDateTime.of(LocalDate.of(2022, 1, 1), LocalTime.of(0, 0, 0, 0)); - - public TestContinuousSplitPlannerImpl( - CatalogTestHelper catalogTestHelper, TableTestHelper tableTestHelper) { - super( - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, true)); - } - - @Before - public void init() throws IOException { - testKeyedTable = getMixedTable().asKeyedTable(); - // write base - { - TaskWriter taskWriter = createTaskWriter(true); - List baseData = - new ArrayList() { - { - add( - GenericRowData.ofKind( - RowKind.INSERT, - 1, - 
StringData.fromString("john"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - add( - GenericRowData.ofKind( - RowKind.INSERT, - 2, - StringData.fromString("lily"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - add( - GenericRowData.ofKind( - RowKind.INSERT, - 3, - StringData.fromString("jake"), - LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT.plusDays(1)))); - add( - GenericRowData.ofKind( - RowKind.INSERT, - 4, - StringData.fromString("sam"), - LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT.plusDays(1)))); - } - }; - for (RowData record : baseData) { - taskWriter.write(record); - } - commit(testKeyedTable, taskWriter.complete(), true); - } - - // write change insert - { - TaskWriter taskWriter = createTaskWriter(false); - List insert = - new ArrayList() { - { - add( - GenericRowData.ofKind( - RowKind.INSERT, - 5, - StringData.fromString("mary"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - add( - GenericRowData.ofKind( - RowKind.INSERT, - 6, - StringData.fromString("mack"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - } - }; - for (RowData record : insert) { - taskWriter.write(record); - } - commit(testKeyedTable, taskWriter.complete(), true); - } - - // write change delete - { - TaskWriter taskWriter = createTaskWriter(false); - List update = - new ArrayList() { - { - add( - GenericRowData.ofKind( - RowKind.DELETE, - 5, - StringData.fromString("mary"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - add( - GenericRowData.ofKind( - RowKind.INSERT, - 5, - StringData.fromString("lind"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - } - }; - - for (RowData record : update) { - taskWriter.write(record); - } - commit(testKeyedTable, taskWriter.complete(), false); - } - } - - protected 
TaskWriter createTaskWriter(boolean base) { - return createKeyedTaskWriter(testKeyedTable, ROW_TYPE, base); - } - - protected TaskWriter createTaskWriter(KeyedTable keyedTable, boolean base) { - return createKeyedTaskWriter(keyedTable, ROW_TYPE, base); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestMixedFormatSourceEnumStateSerializer.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestMixedFormatSourceEnumStateSerializer.java deleted file mode 100644 index 1c1f52616f..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestMixedFormatSourceEnumStateSerializer.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import org.apache.amoro.flink.read.FlinkSplitPlanner; -import org.apache.amoro.flink.read.hybrid.assigner.ShuffleSplitAssigner; -import org.apache.amoro.flink.read.hybrid.assigner.Split; -import org.apache.amoro.flink.read.hybrid.assigner.TestShuffleSplitAssigner; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.hybrid.split.TemporalJoinSplits; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.junit.Assert; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Objects; -import java.util.concurrent.atomic.AtomicInteger; - -public class TestMixedFormatSourceEnumStateSerializer extends TestShuffleSplitAssigner { - private static final Logger LOG = - LoggerFactory.getLogger(TestMixedFormatSourceEnumStateSerializer.class); - - @Test - public void testMixedFormatEnumState() throws IOException { - ShuffleSplitAssigner shuffleSplitAssigner = instanceSplitAssigner(3); - - List splitList = - FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger()); - shuffleSplitAssigner.onDiscoveredSplits(splitList); - TemporalJoinSplits splits = new TemporalJoinSplits(splitList, null); - - MixedFormatSourceEnumState expect = - new MixedFormatSourceEnumState( - shuffleSplitAssigner.state(), - null, - shuffleSplitAssigner.serializePartitionIndex(), - splits); - - MixedFormatSourceEnumStateSerializer mixedFormatSourceEnumStateSerializer = - new MixedFormatSourceEnumStateSerializer(); - byte[] ser = mixedFormatSourceEnumStateSerializer.serialize(expect); - - Assert.assertNotNull(ser); - - MixedFormatSourceEnumState actual = mixedFormatSourceEnumStateSerializer.deserialize(1, ser); - - Assert.assertEquals(expect.pendingSplits().size(), actual.pendingSplits().size()); - Assert.assertEquals( - 
Objects.requireNonNull(expect.shuffleSplitRelation()).length, - Objects.requireNonNull(actual.shuffleSplitRelation()).length); - - SplitEnumeratorContext splitEnumeratorContext = - new InternalSplitEnumeratorContext(3); - try (ShuffleSplitAssigner actualAssigner = - new ShuffleSplitAssigner(splitEnumeratorContext, getMixedTable().name(), actual)) { - List actualSplits = new ArrayList<>(); - - int subtaskId = 2; - while (subtaskId >= 0) { - Split splitOpt = actualAssigner.getNext(subtaskId); - if (splitOpt.isAvailable()) { - actualSplits.add(splitOpt.split()); - } else { - LOG.info("subtask id {}, splits {}.\n {}", subtaskId, actualSplits.size(), actualSplits); - --subtaskId; - } - } - - Assert.assertEquals(splitList.size(), actualSplits.size()); - - TemporalJoinSplits temporalJoinSplits = actual.temporalJoinSplits(); - Assert.assertEquals(expect.temporalJoinSplits(), temporalJoinSplits); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestMixedFormatSourceEnumerator.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestMixedFormatSourceEnumerator.java deleted file mode 100644 index 2849bf3ca3..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestMixedFormatSourceEnumerator.java +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import static org.apache.flink.util.Preconditions.checkState; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.flink.read.FlinkSplitPlanner; -import org.apache.amoro.flink.read.hybrid.assigner.ShuffleSplitAssigner; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplitState; -import org.apache.amoro.flink.read.hybrid.split.SplitRequestEvent; -import org.apache.amoro.flink.read.source.MixedFormatScanContext; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.table.KeyedTable; -import org.apache.flink.api.connector.source.ReaderInfo; -import org.apache.flink.api.connector.source.SourceEvent; -import org.apache.flink.api.connector.source.SplitEnumeratorContext; -import org.apache.flink.api.connector.source.SplitsAssignment; -import org.apache.flink.metrics.groups.SplitEnumeratorMetricGroup; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.io.TaskWriter; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.io.IOException; -import 
java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.time.ZoneOffset; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.concurrent.Callable; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.BiConsumer; - -public class TestMixedFormatSourceEnumerator extends FlinkTestBase { - - public TestMixedFormatSourceEnumerator() { - super( - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, true)); - } - - private final int splitCount = 4; - private final int parallelism = 5; - private KeyedTable testKeyedTable; - - public static final String SCAN_STARTUP_MODE_EARLIEST = "earliest"; - - protected static final LocalDateTime LDT = - LocalDateTime.of(LocalDate.of(2022, 1, 1), LocalTime.of(0, 0, 0, 0)); - - @Before - public void init() throws IOException { - testKeyedTable = getMixedTable().asKeyedTable(); - // write change insert - { - TaskWriter taskWriter = createKeyedTaskWriter(testKeyedTable, FLINK_ROW_TYPE, false); - List insert = - new ArrayList() { - { - add( - GenericRowData.ofKind( - RowKind.INSERT, - 1, - StringData.fromString("john"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - add( - GenericRowData.ofKind( - RowKind.INSERT, - 2, - StringData.fromString("lily"), - LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT.plusDays(1)))); - add( - GenericRowData.ofKind( - RowKind.INSERT, - 3, - StringData.fromString("jake"), - LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT.plusDays(1)))); - add( - GenericRowData.ofKind( - RowKind.INSERT, - 4, - StringData.fromString("sam"), - LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT.plusDays(1)))); - } - }; - for (RowData record : insert) { - taskWriter.write(record); - } - 
commit(testKeyedTable, taskWriter.complete(), false); - } - } - - @Test - public void testReadersNumGreaterThanSplits() throws Exception { - TestingSplitEnumeratorContext splitEnumeratorContext = - instanceSplitEnumeratorContext(parallelism); - ShuffleSplitAssigner shuffleSplitAssigner = instanceSplitAssigner(splitEnumeratorContext); - MixedFormatScanContext scanContext = - MixedFormatScanContext.contextBuilder() - .streaming(true) - .scanStartupMode(SCAN_STARTUP_MODE_EARLIEST) - .build(); - - List splitList = - FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger()); - shuffleSplitAssigner.onDiscoveredSplits(splitList); - assertSnapshot(shuffleSplitAssigner, splitCount); - - MixedFormatSourceEnumerator enumerator = - new MixedFormatSourceEnumerator( - splitEnumeratorContext, - shuffleSplitAssigner, - MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder), - scanContext, - null, - false); - - Collection pendingSplitsEmpty = - enumerator.snapshotState(1).pendingSplits(); - Assert.assertEquals(splitCount, pendingSplitsEmpty.size()); - - // register readers, and let them request a split - // 4 split, 5 subtask, one or more subtask will fetch empty split - // subtask 0 - splitEnumeratorContext.registerReader(0, "host0"); - enumerator.addReader(0); - enumerator.handleSourceEvent(0, new SplitRequestEvent()); - // subtask 1 - splitEnumeratorContext.registerReader(1, "host1"); - enumerator.addReader(1); - enumerator.handleSourceEvent(1, new SplitRequestEvent()); - // subtask 2 - splitEnumeratorContext.registerReader(2, "host2"); - enumerator.addReader(2); - enumerator.handleSourceEvent(2, new SplitRequestEvent()); - // subtask 3 - splitEnumeratorContext.registerReader(3, "host3"); - enumerator.addReader(3); - enumerator.handleSourceEvent(3, new SplitRequestEvent()); - // subtask 4 - splitEnumeratorContext.registerReader(4, "host4"); - enumerator.addReader(4); - enumerator.handleSourceEvent(4, new SplitRequestEvent()); - - 
Assert.assertEquals(parallelism - splitCount, enumerator.getReadersAwaitingSplit().size()); - Assert.assertTrue(enumerator.snapshotState(2).pendingSplits().isEmpty()); - } - - private void assertSnapshot(ShuffleSplitAssigner assigner, int splitCount) { - Collection stateBeforeGet = assigner.state(); - Assert.assertEquals(splitCount, stateBeforeGet.size()); - } - - private ShuffleSplitAssigner instanceSplitAssigner( - TestingSplitEnumeratorContext splitEnumeratorContext) { - return new ShuffleSplitAssigner(splitEnumeratorContext); - } - - private TestingSplitEnumeratorContext instanceSplitEnumeratorContext(int parallelism) { - return new TestingSplitEnumeratorContext(parallelism); - } - - protected static class TestingSplitEnumeratorContext - implements SplitEnumeratorContext { - private final int parallelism; - - private final HashMap> splitAssignments = - new HashMap<>(); - - private final HashMap> events = new HashMap<>(); - - private final HashMap registeredReaders = new HashMap<>(); - - public Map> getSplitAssignments() { - return splitAssignments; - } - - public Map> getSentEvents() { - return events; - } - - public void registerReader(int subtask, String hostname) { - checkState(!registeredReaders.containsKey(subtask), "Reader already registered"); - registeredReaders.put(subtask, new ReaderInfo(subtask, hostname)); - } - - public TestingSplitEnumeratorContext(int parallelism) { - this.parallelism = parallelism; - } - - @Override - public SplitEnumeratorMetricGroup metricGroup() { - return null; - } - - @Override - public void sendEventToSourceReader(int subtaskId, SourceEvent event) { - final List eventsForSubTask = - events.computeIfAbsent(subtaskId, (key) -> new ArrayList<>()); - eventsForSubTask.add(event); - } - - @Override - public int currentParallelism() { - return parallelism; - } - - @Override - public Map registeredReaders() { - return registeredReaders; - } - - @Override - public void assignSplits(SplitsAssignment newSplitAssignments) { - for 
(final Map.Entry> entry : - newSplitAssignments.assignment().entrySet()) { - final SplitAssignmentState assignment = - splitAssignments.computeIfAbsent(entry.getKey(), (key) -> new SplitAssignmentState<>()); - - assignment.getAssignedSplits().addAll(entry.getValue()); - } - } - - @Override - public void assignSplit(MixedFormatSplit split, int subtask) { - SplitEnumeratorContext.super.assignSplit(split, subtask); - } - - @Override - public void signalNoMoreSplits(int subtask) { - final SplitAssignmentState assignment = - splitAssignments.computeIfAbsent(subtask, (key) -> new SplitAssignmentState<>()); - assignment.noMoreSplits = true; - } - - @Override - public void callAsync( - Callable callable, BiConsumer handler, long initialDelay, long period) {} - - @Override - public void callAsync(Callable callable, BiConsumer handler) {} - - @Override - public void runInCoordinatorThread(Runnable runnable) {} - } - - public static final class SplitAssignmentState { - - final List splits = new ArrayList<>(); - boolean noMoreSplits; - - public List getAssignedSplits() { - return splits; - } - - public boolean hasReceivedNoMoreSplitsSignal() { - return noMoreSplits; - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestTemporalJoinSplitsThreadSafe.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestTemporalJoinSplitsThreadSafe.java deleted file mode 100644 index ba378491d1..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/enumerator/TestTemporalJoinSplitsThreadSafe.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.enumerator; - -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.hybrid.split.TemporalJoinSplits; -import org.junit.Assert; -import org.junit.Test; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.LinkedList; -import java.util.List; -import java.util.UUID; -import java.util.concurrent.CompletableFuture; -import java.util.stream.Collectors; - -public class TestTemporalJoinSplitsThreadSafe { - - @Test - public void testTemporalJoinSplits() { - List allSplit = new LinkedList<>(); - for (int i = 0; i < 100; i++) { - allSplit.add(UUID.randomUUID().toString()); - } - - Collection mixedFormatSplits = - allSplit.stream().map(TestMixedFormatSplit::of).collect(Collectors.toList()); - - for (int i = 0; i < 2; i++) { - round(allSplit, mixedFormatSplits); - } - } - - public void round(List allSplit, Collection mixedFormatSplits) { - TemporalJoinSplits temporalJoinSplits = new TemporalJoinSplits(mixedFormatSplits, null); - int n = allSplit.size(); - - List s1 = new ArrayList<>(allSplit.subList(0, (int) (2.0 / 3 * n))), - s2 = new ArrayList<>(allSplit.subList((int) (1.0 / 3 * n), n)); - Collections.shuffle(s1); - 
Collections.shuffle(s2); - - List as = new ArrayList<>(mixedFormatSplits); - Collections.shuffle(as); - int an = as.size(); - List as1 = new ArrayList<>(as.subList(0, (int) (2.0 / 3 * an))); - List as2 = new ArrayList<>(as.subList((int) (1.0 / 3 * an), an)); - CompletableFuture f1 = - CompletableFuture.runAsync(() -> temporalJoinSplits.removeAndReturnIfAllFinished(s1)); - CompletableFuture f2 = - CompletableFuture.runAsync(() -> temporalJoinSplits.addSplitsBack(as1)); - CompletableFuture f3 = - CompletableFuture.runAsync(() -> temporalJoinSplits.removeAndReturnIfAllFinished(s2)); - CompletableFuture f4 = - CompletableFuture.runAsync(() -> temporalJoinSplits.addSplitsBack(as2)); - CompletableFuture.allOf(f1, f2, f3, f4).join(); - Assert.assertTrue(temporalJoinSplits.removeAndReturnIfAllFinished(allSplit)); - } - - static class TestMixedFormatSplit extends MixedFormatSplit { - private final String splitId; - - public TestMixedFormatSplit(String splitId) { - this.splitId = splitId; - } - - public static TestMixedFormatSplit of(String splitId) { - return new TestMixedFormatSplit(splitId); - } - - @Override - public Integer taskIndex() { - return null; - } - - @Override - public void updateOffset(Object[] recordOffsets) {} - - @Override - public MixedFormatSplit copy() { - return new TestMixedFormatSplit(splitId); - } - - @Override - public String splitId() { - return splitId; - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/reader/MixedIncrementalLoaderTest.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/reader/MixedIncrementalLoaderTest.java deleted file mode 100644 index 61da6d3e71..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/reader/MixedIncrementalLoaderTest.java +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Licensed to 
the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.reader; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.catalog.TableTestBase; -import org.apache.amoro.flink.read.MixedIncrementalLoader; -import org.apache.amoro.flink.read.hybrid.enumerator.ContinuousSplitPlanner; -import org.apache.amoro.flink.read.hybrid.enumerator.MergeOnReadIncrementalPlanner; -import org.apache.amoro.flink.read.source.FlinkKeyedMORDataReader; -import org.apache.amoro.flink.util.DataUtil; -import org.apache.amoro.flink.write.FlinkTaskWriterBaseTest; -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.amoro.table.KeyedTable; -import org.apache.amoro.table.MixedTable; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import 
org.apache.iceberg.flink.data.RowDataUtil; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.io.TaskWriter; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -import java.io.IOException; -import java.time.LocalDateTime; -import java.util.ArrayList; -import java.util.List; - -@RunWith(value = Parameterized.class) -public class MixedIncrementalLoaderTest extends TableTestBase implements FlinkTaskWriterBaseTest { - - public MixedIncrementalLoaderTest(boolean partitionedTable) { - super( - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, partitionedTable)); - } - - @Parameterized.Parameters(name = "partitionedTable = {0}") - public static Object[][] parameters() { - // todo mix hive test - return new Object[][] {{true}, {false}}; - } - - @Before - public void before() throws IOException { - MixedTable mixedTable = getMixedTable(); - TableSchema flinkPartialSchema = - TableSchema.builder() - .field("id", DataTypes.INT()) - .field("name", DataTypes.STRING()) - .field("ts", DataTypes.BIGINT()) - .field("op_time", DataTypes.TIMESTAMP()) - .build(); - RowType rowType = (RowType) flinkPartialSchema.toRowDataType().getLogicalType(); - - List expected = - Lists.newArrayList( - DataUtil.toRowData(1000011, "a", 1010L, LocalDateTime.parse("2022-06-18T10:10:11.0")), - DataUtil.toRowData(1000012, "b", 1011L, LocalDateTime.parse("2022-06-18T10:10:11.0")), - DataUtil.toRowData(1000013, "c", 1012L, LocalDateTime.parse("2022-06-18T10:10:11.0")), - DataUtil.toRowData(1000014, "d", 1013L, LocalDateTime.parse("2022-06-21T10:10:11.0")), - DataUtil.toRowData(1000015, "e", 1014L, LocalDateTime.parse("2022-06-21T10:10:11.0"))); - for (RowData rowData : expected) { - try (TaskWriter taskWriter = createBaseTaskWriter(mixedTable, rowType)) { - writeAndCommit(rowData, taskWriter, mixedTable); - } - } - - expected = - 
Lists.newArrayList( - DataUtil.toRowDataWithKind( - RowKind.DELETE, 1000015, "e", 1014L, LocalDateTime.parse("2022-06-21T10:10:11.0")), - DataUtil.toRowData(1000021, "a", 1020L, LocalDateTime.parse("2022-06-28T10:10:11.0")), - DataUtil.toRowData(1000022, "b", 1021L, LocalDateTime.parse("2022-06-28T10:10:11.0")), - DataUtil.toRowData(1000023, "c", 1022L, LocalDateTime.parse("2022-06-28T10:10:11.0")), - DataUtil.toRowData(1000024, "d", 1023L, LocalDateTime.parse("2022-06-28T10:10:11.0")), - DataUtil.toRowData(1000025, "e", 1024L, LocalDateTime.parse("2022-06-28T10:10:11.0"))); - for (RowData rowData : expected) { - try (TaskWriter taskWriter = createTaskWriter(mixedTable, rowType)) { - writeAndCommit(rowData, taskWriter, mixedTable); - } - } - } - - @Test - public void testMOR() { - KeyedTable keyedTable = getMixedTable().asKeyedTable(); - List expressions = - Lists.newArrayList(Expressions.greaterThan("op_time", "2022-06-20T10:10:11.0")); - ContinuousSplitPlanner morPlanner = - new MergeOnReadIncrementalPlanner( - getTableLoader(getCatalogName(), getMetastoreUri(), keyedTable)); - - FlinkKeyedMORDataReader flinkKeyedMORDataReader = - new FlinkKeyedMORDataReader( - keyedTable.io(), - keyedTable.schema(), - keyedTable.schema(), - keyedTable.primaryKeySpec(), - null, - true, - RowDataUtil::convertConstant, - true); - - MixedIncrementalLoader incrementalLoader = - new MixedIncrementalLoader<>( - morPlanner, - flinkKeyedMORDataReader, - new RowDataReaderFunction( - new Configuration(), - keyedTable.schema(), - keyedTable.schema(), - keyedTable.asKeyedTable().primaryKeySpec(), - null, - true, - keyedTable.io(), - true), - expressions); - - List actuals = new ArrayList<>(); - while (incrementalLoader.hasNext()) { - CloseableIterator iterator = incrementalLoader.next(); - while (iterator.hasNext()) { - RowData rowData = iterator.next(); - System.out.println(rowData); - actuals.add(rowData); - } - } - if (isPartitionedTable()) { - Assert.assertEquals(6, actuals.size()); - } 
else { - Assert.assertEquals(9, actuals.size()); - } - } - - @Override - public String getMetastoreUri() { - return getCatalogUri(); - } - - @Override - public String getCatalogName() { - return getMixedFormatCatalog().name(); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/reader/TestRowDataReaderFunction.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/reader/TestRowDataReaderFunction.java deleted file mode 100644 index 4bad7e47fe..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/reader/TestRowDataReaderFunction.java +++ /dev/null @@ -1,391 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.read.hybrid.reader; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.data.DataFileType; -import org.apache.amoro.flink.read.FlinkSplitPlanner; -import org.apache.amoro.flink.read.hybrid.enumerator.TestContinuousSplitPlannerImpl; -import org.apache.amoro.flink.read.hybrid.split.ChangelogSplit; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.source.DataIterator; -import org.apache.amoro.scan.ChangeTableIncrementalScan; -import org.apache.amoro.scan.MixedFileScanTask; -import org.apache.amoro.table.KeyedTable; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.shaded.guava30.com.google.common.collect.Maps; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.TaskWriter; -import org.junit.Assert; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.time.ZoneOffset; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Comparator; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.Collectors; - -public class TestRowDataReaderFunction extends TestContinuousSplitPlannerImpl { - private static final Logger LOG = LoggerFactory.getLogger(TestRowDataReaderFunction.class); - private static final AtomicInteger splitCount = new AtomicInteger(); - - public TestRowDataReaderFunction() { - super( 
- new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, true)); - } - - @Test - public void testReadChangelog() throws IOException { - - List mixedFormatSplits = - FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger(0)); - - RowDataReaderFunction rowDataReaderFunction = - new RowDataReaderFunction( - new Configuration(), - testKeyedTable.schema(), - testKeyedTable.schema(), - testKeyedTable.primaryKeySpec(), - null, - true, - testKeyedTable.io()); - - List actual = new ArrayList<>(); - mixedFormatSplits.forEach( - split -> { - LOG.info("Mixed format split: {}.", split); - DataIterator dataIterator = rowDataReaderFunction.createDataIterator(split); - while (dataIterator.hasNext()) { - RowData rowData = dataIterator.next(); - LOG.info("{}", rowData); - actual.add(rowData); - } - }); - - assertArrayEquals(excepts(), actual); - - long snapshotId = testKeyedTable.changeTable().currentSnapshot().snapshotId(); - writeUpdate(); - - testKeyedTable.changeTable().refresh(); - long nowSnapshotId = testKeyedTable.changeTable().currentSnapshot().snapshotId(); - ChangeTableIncrementalScan changeTableScan = - testKeyedTable.changeTable().newScan().useSnapshot(nowSnapshotId); - - Snapshot snapshot = testKeyedTable.changeTable().snapshot(snapshotId); - long fromSequence = snapshot.sequenceNumber(); - - Set appendLogTasks = new HashSet<>(); - Set deleteLogTasks = new HashSet<>(); - try (CloseableIterable tasks = changeTableScan.planFiles()) { - for (FileScanTask fileScanTask : tasks) { - if (fileScanTask.file().dataSequenceNumber() <= fromSequence) { - continue; - } - MixedFileScanTask mixedFileScanTask = (MixedFileScanTask) fileScanTask; - if (mixedFileScanTask.fileType().equals(DataFileType.INSERT_FILE)) { - appendLogTasks.add(mixedFileScanTask); - } else if (mixedFileScanTask.fileType().equals(DataFileType.EQ_DELETE_FILE)) { - deleteLogTasks.add(mixedFileScanTask); - } else { - throw new IllegalArgumentException( - String.format( - 
"DataFileType %s is not supported during change log reading period.", - mixedFileScanTask.fileType())); - } - } - } - ChangelogSplit changelogSplit = - new ChangelogSplit(appendLogTasks, deleteLogTasks, splitCount.incrementAndGet()); - actual.clear(); - DataIterator dataIterator = rowDataReaderFunction.createDataIterator(changelogSplit); - while (dataIterator.hasNext()) { - RowData rowData = dataIterator.next(); - actual.add(rowData); - } - assertArrayEquals(excepts2(), actual); - } - - @Test - public void testReadNodesUpMoved() throws IOException { - writeUpdateWithSpecifiedMaskOne(); - List mixedFormatSplits = - FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger(0)); - - RowDataReaderFunction rowDataReaderFunction = - new RowDataReaderFunction( - new Configuration(), - testKeyedTable.schema(), - testKeyedTable.schema(), - testKeyedTable.primaryKeySpec(), - null, - true, - testKeyedTable.io()); - - List actual = new ArrayList<>(); - mixedFormatSplits.forEach( - split -> { - LOG.info("Mixed format split: {}.", split); - DataIterator dataIterator = rowDataReaderFunction.createDataIterator(split); - while (dataIterator.hasNext()) { - RowData rowData = dataIterator.next(); - LOG.info("{}", rowData); - actual.add(rowData); - } - }); - - List excepts = expectedCollection(); - excepts.addAll(generateRecords()); - RowData[] array = - excepts.stream() - .sorted(Comparator.comparing(RowData::toString)) - .collect(Collectors.toList()) - .toArray(new RowData[excepts.size()]); - assertArrayEquals(array, actual); - } - - protected void assertArrayEquals(RowData[] excepts, List actual) { - Assert.assertArrayEquals(excepts, sortRowDataCollection(actual)); - } - - protected RowData[] sortRowDataCollection(Collection records) { - return records.stream() - .sorted(Comparator.comparing(RowData::toString)) - .collect(Collectors.toList()) - .toArray(new RowData[records.size()]); - } - - protected void writeUpdate() throws IOException { - // write change update - 
writeUpdate(updateRecords()); - } - - protected void writeUpdate(List input) throws IOException { - writeUpdate(input, testKeyedTable); - } - - protected void writeUpdateWithSpecifiedMaskOne() throws IOException { - List excepts = generateRecords(); - - writeUpdateWithSpecifiedMask(excepts, testKeyedTable, 1); - } - - protected void writeUpdateWithSpecifiedMask(List input, KeyedTable table, long mask) - throws IOException { - // write change update - TaskWriter taskWriter = createKeyedTaskWriter(table, ROW_TYPE, false, mask); - - for (RowData record : input) { - taskWriter.write(record); - } - commit(table, taskWriter.complete(), false); - } - - protected void writeUpdate(List input, KeyedTable table) throws IOException { - // write change update - TaskWriter taskWriter = createKeyedTaskWriter(table, ROW_TYPE, false); - - for (RowData record : input) { - taskWriter.write(record); - } - commit(table, taskWriter.complete(), false); - } - - protected List generateRecords() { - List excepts = new ArrayList<>(); - excepts.add( - GenericRowData.ofKind( - RowKind.INSERT, - 7, - StringData.fromString("syan"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - excepts.add( - GenericRowData.ofKind( - RowKind.UPDATE_BEFORE, - 2, - StringData.fromString("lily"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - excepts.add( - GenericRowData.ofKind( - RowKind.UPDATE_AFTER, - 2, - StringData.fromString("daniel"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - excepts.add( - GenericRowData.ofKind( - RowKind.UPDATE_BEFORE, - 7, - StringData.fromString("syan"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - excepts.add( - GenericRowData.ofKind( - RowKind.UPDATE_AFTER, - 7, - StringData.fromString("syan2"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - return excepts; - } - - protected List updateRecords() { - List 
excepts = new ArrayList<>(); - excepts.add( - GenericRowData.ofKind( - RowKind.UPDATE_BEFORE, - 5, - StringData.fromString("lind"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - excepts.add( - GenericRowData.ofKind( - RowKind.UPDATE_AFTER, - 5, - StringData.fromString("lina"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - return excepts; - } - - protected RowData[] excepts2() { - List excepts = updateRecords(); - - return updateRecords().stream() - .sorted(Comparator.comparing(RowData::toString)) - .collect(Collectors.toList()) - .toArray(new RowData[excepts.size()]); - } - - protected RowData[] excepts() { - List excepts = expectedCollection(); - - return excepts.stream() - .sorted(Comparator.comparing(RowData::toString)) - .collect(Collectors.toList()) - .toArray(new RowData[excepts.size()]); - } - - protected RowData[] expectedAfterMOR() { - List expected = expectedCollection(); - return mor(expected).stream() - .sorted(Comparator.comparing(RowData::toString)) - .toArray(RowData[]::new); - } - - protected Collection mor(final Collection changelog) { - Map map = Maps.newHashMap(); - - changelog.forEach( - rowData -> { - int key = rowData.getInt(0); - RowKind kind = rowData.getRowKind(); - - if ((kind == RowKind.INSERT || kind == RowKind.UPDATE_AFTER) && !map.containsKey(key)) { - rowData.setRowKind(RowKind.INSERT); - map.put(key, rowData); - } else if ((kind == RowKind.DELETE || kind == RowKind.UPDATE_BEFORE)) { - map.remove(key); - } - }); - - return map.values(); - } - - protected List expectedCollection() { - List excepts = new ArrayList<>(); - excepts.add( - GenericRowData.ofKind( - RowKind.INSERT, - 1, - StringData.fromString("john"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - excepts.add( - GenericRowData.ofKind( - RowKind.INSERT, - 2, - StringData.fromString("lily"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - 
excepts.add( - GenericRowData.ofKind( - RowKind.INSERT, - 3, - StringData.fromString("jake"), - LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT.plusDays(1)))); - excepts.add( - GenericRowData.ofKind( - RowKind.INSERT, - 4, - StringData.fromString("sam"), - LDT.plusDays(1).toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT.plusDays(1)))); - excepts.add( - GenericRowData.ofKind( - RowKind.INSERT, - 5, - StringData.fromString("mary"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - excepts.add( - GenericRowData.ofKind( - RowKind.INSERT, - 6, - StringData.fromString("mack"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - excepts.add( - GenericRowData.ofKind( - RowKind.DELETE, - 5, - StringData.fromString("mary"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - excepts.add( - GenericRowData.ofKind( - RowKind.INSERT, - 5, - StringData.fromString("lind"), - LDT.toEpochSecond(ZoneOffset.UTC), - TimestampData.fromLocalDateTime(LDT))); - return excepts; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/split/TestMixedFormatSplitSerializer.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/split/TestMixedFormatSplitSerializer.java deleted file mode 100644 index f1a6d115c3..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/read/hybrid/split/TestMixedFormatSplitSerializer.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.read.hybrid.split; - -import org.apache.amoro.flink.read.FlinkSplitPlanner; -import org.apache.amoro.flink.read.hybrid.reader.TestRowDataReaderFunction; -import org.apache.flink.util.FlinkRuntimeException; -import org.junit.Assert; -import org.junit.Test; - -import java.io.IOException; -import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.Collectors; - -public class TestMixedFormatSplitSerializer extends TestRowDataReaderFunction { - - @Test - public void testSerAndDes() { - List mixedFormatSplits = - FlinkSplitPlanner.planFullTable(testKeyedTable, new AtomicInteger(0)); - assertSerializedSplitEquals(mixedFormatSplits); - } - - @Test - public void testSerAndDesMoRSplit() { - List mixedFormatSplits = - FlinkSplitPlanner.mergeOnReadPlan(testKeyedTable, null, new AtomicInteger(0)); - assertSerializedSplitEquals(mixedFormatSplits); - } - - private void assertSerializedSplitEquals(List expected) { - MixedFormatSplitSerializer serializer = new MixedFormatSplitSerializer(); - List contents = - expected.stream() - .map( - split -> { - try { - return serializer.serialize(split); - } catch (IOException e) { - e.printStackTrace(); - return new byte[0]; - } - }) - .collect(Collectors.toList()); - - Assert.assertArrayEquals( - expected.toArray(new MixedFormatSplit[0]), - contents.stream() - .map( - data -> { - if 
(data.length == 0) { - throw new FlinkRuntimeException("failed cause data length is 0."); - } - try { - return serializer.deserialize(1, data); - } catch (IOException e) { - throw new FlinkRuntimeException(e); - } - }) - .toArray(MixedFormatSplit[]::new)); - } - - @Test - public void testNullableSplit() throws IOException { - MixedFormatSplitSerializer serializer = new MixedFormatSplitSerializer(); - byte[] ser = serializer.serialize(null); - - MixedFormatSplit actual = serializer.deserialize(1, ser); - - Assert.assertNull(actual); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/shuffle/TestLogRecordV1.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/shuffle/TestLogRecordV1.java deleted file mode 100644 index bafb9c28cc..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/shuffle/TestLogRecordV1.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.shuffle; - -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; - -import org.apache.amoro.data.ChangeAction; -import org.apache.amoro.log.Bytes; -import org.apache.amoro.log.FormatTestBase; -import org.apache.amoro.log.FormatVersion; -import org.apache.amoro.log.LogData; -import org.apache.amoro.log.LogDataJsonDeserialization; -import org.apache.amoro.log.LogDataJsonSerialization; -import org.apache.amoro.utils.IdGenerator; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.types.Types; -import org.junit.Assert; -import org.junit.Test; - -import java.io.IOException; -import java.util.ArrayList; - -/** This is a {@link LogRecordV1} log data test, include all data types. */ -public class TestLogRecordV1 extends FormatTestBase { - - public final Schema userSchema = - new Schema( - new ArrayList() { - { - add(Types.NestedField.optional(0, "f_boolean", Types.BooleanType.get())); - add(Types.NestedField.optional(1, "f_int", Types.IntegerType.get())); - add(Types.NestedField.optional(2, "f_long", Types.LongType.get())); - add( - Types.NestedField.optional( - 3, "f_list_string", Types.ListType.ofOptional(4, Types.StringType.get()))); - } - }); - - @Test - public void testLogDataSerialize() throws IOException { - - LogDataJsonSerialization logDataJsonSerialization = - new LogDataJsonSerialization<>(userSchema, LogRecordV1.FIELD_GETTER_FACTORY); - GenericRowData rowData = new GenericRowData(4); - rowData.setField(0, true); - rowData.setField(1, 1); - rowData.setField(2, 123456789L); - rowData.setField( - 3, - new GenericArrayData( - new StringData[] { - null, StringData.fromString("b"), null, StringData.fromString("c"), null - })); - LogData logData = - new LogRecordV1( - 
FormatVersion.FORMAT_VERSION_V1, - IdGenerator.generateUpstreamId(), - 123455L, - false, - ChangeAction.INSERT, - rowData); - - byte[] bytes = logDataJsonSerialization.serialize(logData); - - Assert.assertNotNull(bytes); - String actualJson = new String(Bytes.subByte(bytes, 18, bytes.length - 18)); - String expected = - "{\"f_boolean\":true,\"f_int\":1,\"f_long\":123456789,\"f_list_string\":[null,\"b\",null,\"c\",null]}"; - assertEquals(expected, actualJson); - - LogDataJsonDeserialization logDataJsonDeserialization = - new LogDataJsonDeserialization<>( - userSchema, LogRecordV1.factory, LogRecordV1.arrayFactory, LogRecordV1.mapFactory); - LogData result = logDataJsonDeserialization.deserialize(bytes); - Assert.assertNotNull(result); - check(logData, result); - } - - @Test - public void testLogDataSerializeNullList() throws IOException { - - LogDataJsonSerialization logDataJsonSerialization = - new LogDataJsonSerialization<>(userSchema, LogRecordV1.FIELD_GETTER_FACTORY); - GenericRowData rowData = new GenericRowData(4); - rowData.setField(0, true); - rowData.setField(1, 1); - rowData.setField(2, 123456789L); - rowData.setField(3, new GenericArrayData(new StringData[] {null, null, null})); - LogData logData = - new LogRecordV1( - FormatVersion.FORMAT_VERSION_V1, - IdGenerator.generateUpstreamId(), - 123455L, - false, - ChangeAction.INSERT, - rowData); - - byte[] bytes = logDataJsonSerialization.serialize(logData); - - Assert.assertNotNull(bytes); - String actualJson = new String(Bytes.subByte(bytes, 18, bytes.length - 18)); - String expected = - "{\"f_boolean\":true,\"f_int\":1,\"f_long\":123456789,\"f_list_string\":[null,null,null]}"; - assertEquals(expected, actualJson); - - LogDataJsonDeserialization logDataJsonDeserialization = - new LogDataJsonDeserialization<>( - userSchema, LogRecordV1.factory, LogRecordV1.arrayFactory, LogRecordV1.mapFactory); - LogData result = logDataJsonDeserialization.deserialize(bytes); - Assert.assertNotNull(result); - check(logData, 
result); - } - - private void check(LogData expected, LogData actual) { - assertArrayEquals(expected.getVersionBytes(), actual.getVersionBytes()); - assertArrayEquals(expected.getUpstreamIdBytes(), actual.getUpstreamIdBytes()); - assertEquals(expected.getEpicNo(), actual.getEpicNo()); - assertEquals(expected.getFlip(), actual.getFlip()); - assertEquals(expected.getChangeActionByte(), actual.getChangeActionByte()); - assertEquals(expected.getActualValue().toString(), actual.getActualValue().toString()); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/shuffle/TestRoundRobinShuffleRulePolicy.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/shuffle/TestRoundRobinShuffleRulePolicy.java deleted file mode 100644 index c870eb2287..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/shuffle/TestRoundRobinShuffleRulePolicy.java +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.shuffle; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.data.DataTreeNode; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.shade.guava32.com.google.common.collect.Sets; -import org.apache.flink.api.common.functions.Partitioner; -import org.apache.flink.api.java.functions.KeySelector; -import org.apache.flink.table.data.RowData; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -import java.util.Map; -import java.util.Set; - -@RunWith(Parameterized.class) -public class TestRoundRobinShuffleRulePolicy extends FlinkTestBase { - - public TestRoundRobinShuffleRulePolicy(boolean keyedTable, boolean partitionedTable) { - super( - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(keyedTable, partitionedTable)); - } - - @Parameterized.Parameters(name = "keyedTable = {0}, partitionedTable = {1}") - public static Object[][] parameters() { - return new Object[][] { - {true, true}, - {true, false}, - {false, true}, - {false, false} - }; - } - - @Test - public void testPrimaryKeyPartitionedTable() throws Exception { - Assume.assumeTrue(isKeyedTable()); - Assume.assumeTrue(isPartitionedTable()); - ShuffleHelper helper = - ShuffleHelper.build(getMixedTable(), getMixedTable().schema(), FLINK_ROW_TYPE); - RoundRobinShuffleRulePolicy policy = new RoundRobinShuffleRulePolicy(helper, 5, 2); - Map> subTaskTreeNodes = policy.getSubtaskTreeNodes(); - Assert.assertEquals(subTaskTreeNodes.size(), 5); - subTaskTreeNodes - .values() - .forEach( - nodes -> { - Assert.assertEquals(nodes.size(), 2); - Assert.assertTrue(nodes.contains(DataTreeNode.of(1, 0))); - Assert.assertTrue(nodes.contains(DataTreeNode.of(1, 1))); - }); - - KeySelector keySelector = policy.generateKeySelector(); 
- Partitioner partitioner = policy.generatePartitioner(); - Assert.assertEquals( - partitioner.partition( - keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), - partitioner.partition( - keySelector.getKey(createRowData(1, "hello2", "2022-10-11T10:10:11.0")), 5)); - - Assert.assertNotEquals( - partitioner.partition( - keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), - partitioner.partition( - keySelector.getKey(createRowData(1, "hello2", "2022-10-12T10:10:11.0")), 5)); - - Assert.assertNotEquals( - partitioner.partition( - keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), - partitioner.partition( - keySelector.getKey(createRowData(2, "hello2", "2022-10-11T10:10:11.0")), 5)); - } - - @Test - public void testPrimaryKeyTableWithoutPartition() throws Exception { - Assume.assumeTrue(isKeyedTable()); - Assume.assumeFalse(isPartitionedTable()); - ShuffleHelper helper = - ShuffleHelper.build(getMixedTable(), getMixedTable().schema(), FLINK_ROW_TYPE); - RoundRobinShuffleRulePolicy policy = new RoundRobinShuffleRulePolicy(helper, 5, 2); - Map> subTaskTreeNodes = policy.getSubtaskTreeNodes(); - Assert.assertEquals(subTaskTreeNodes.size(), 5); - Assert.assertEquals( - subTaskTreeNodes.get(0), Sets.newHashSet(DataTreeNode.of(7, 0), DataTreeNode.of(7, 5))); - Assert.assertEquals( - subTaskTreeNodes.get(1), Sets.newHashSet(DataTreeNode.of(7, 1), DataTreeNode.of(7, 6))); - Assert.assertEquals( - subTaskTreeNodes.get(2), Sets.newHashSet(DataTreeNode.of(7, 2), DataTreeNode.of(7, 7))); - Assert.assertEquals(subTaskTreeNodes.get(3), Sets.newHashSet(DataTreeNode.of(7, 3))); - Assert.assertEquals(subTaskTreeNodes.get(4), Sets.newHashSet(DataTreeNode.of(7, 4))); - - KeySelector keySelector = policy.generateKeySelector(); - Partitioner partitioner = policy.generatePartitioner(); - Assert.assertEquals( - partitioner.partition( - keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), - 
partitioner.partition( - keySelector.getKey(createRowData(1, "hello2", "2022-10-11T10:10:11.0")), 5)); - - Assert.assertEquals( - partitioner.partition( - keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), - partitioner.partition( - keySelector.getKey(createRowData(1, "hello2", "2022-10-12T10:10:11.0")), 5)); - - Assert.assertNotEquals( - partitioner.partition( - keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), - partitioner.partition( - keySelector.getKey(createRowData(2, "hello2", "2022-10-11T10:10:11.0")), 5)); - } - - @Test - public void testPartitionedTableWithoutPrimaryKey() throws Exception { - Assume.assumeFalse(isKeyedTable()); - Assume.assumeTrue(isPartitionedTable()); - ShuffleHelper helper = - ShuffleHelper.build(getMixedTable(), getMixedTable().schema(), FLINK_ROW_TYPE); - RoundRobinShuffleRulePolicy policy = new RoundRobinShuffleRulePolicy(helper, 5, 2); - Map> subTaskTreeNodes = policy.getSubtaskTreeNodes(); - Assert.assertEquals(subTaskTreeNodes.size(), 5); - subTaskTreeNodes - .values() - .forEach( - nodes -> { - Assert.assertEquals(nodes.size(), 1); - Assert.assertTrue(nodes.contains(DataTreeNode.of(0, 0))); - }); - - KeySelector keySelector = policy.generateKeySelector(); - Partitioner partitioner = policy.generatePartitioner(); - Assert.assertEquals( - partitioner.partition( - keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), - partitioner.partition( - keySelector.getKey(createRowData(1, "hello2", "2022-10-11T10:10:11.0")), 5)); - - Assert.assertEquals( - partitioner.partition( - keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), - partitioner.partition( - keySelector.getKey(createRowData(2, "hello2", "2022-10-11T10:10:11.0")), 5)); - - Assert.assertNotEquals( - partitioner.partition( - keySelector.getKey(createRowData(1, "hello", "2022-10-11T10:10:11.0")), 5), - partitioner.partition( - keySelector.getKey(createRowData(1, "hello2", 
"2022-10-12T10:10:11.0")), 5)); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/AmoroCatalogITCaseBase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/AmoroCatalogITCaseBase.java deleted file mode 100644 index 7b5d723636..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/AmoroCatalogITCaseBase.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.table; - -import static org.apache.flink.table.api.config.TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED; - -import org.apache.amoro.TestAms; -import org.apache.amoro.formats.AmoroCatalogTestBase; -import org.apache.amoro.formats.AmoroCatalogTestHelper; -import org.apache.amoro.hive.TestHMS; -import org.apache.flink.api.common.restartstrategy.RestartStrategies; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.runtime.state.StateBackend; -import org.apache.flink.runtime.state.filesystem.FsStateBackend; -import org.apache.flink.streaming.api.CheckpointingMode; -import org.apache.flink.streaming.api.environment.CheckpointConfig; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.test.util.MiniClusterWithClientResource; -import org.apache.iceberg.flink.MiniClusterResource; -import org.junit.ClassRule; - -import java.io.IOException; - -public class AmoroCatalogITCaseBase extends AmoroCatalogTestBase { - static final TestHMS TEST_HMS = new TestHMS(); - public static final String TEST_DB_NAME = "test_db"; - public static final String TEST_TABLE_NAME = "test_table"; - - @ClassRule - public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = - MiniClusterResource.createWithClassloaderCheckDisabled(); - - @ClassRule public static TestAms TEST_AMS = new TestAms(); - - private volatile StreamTableEnvironment tEnv = null; - private volatile StreamExecutionEnvironment env = null; - - public AmoroCatalogITCaseBase(AmoroCatalogTestHelper catalogTestHelper) { - super(catalogTestHelper); - } - - @Override - public void setupCatalog() throws IOException { - super.setupCatalog(); - 
catalogTestHelper.initHiveConf(TEST_HMS.getHiveConf()); - TEST_AMS.getAmsHandler().createCatalog(catalogTestHelper.getCatalogMeta()); - } - - protected String getCatalogUrl() { - return TEST_AMS.getServerUrl() + "/" + catalogTestHelper.getCatalogMeta().getCatalogName(); - } - - protected TableResult exec(String query, Object... args) { - return exec(getTableEnv(), query, args); - } - - protected static TableResult exec(TableEnvironment env, String query, Object... args) { - return env.executeSql(String.format(query, args)); - } - - protected StreamTableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - this.tEnv = - StreamTableEnvironment.create( - getEnv(), EnvironmentSettings.newInstance().inStreamingMode().build()); - Configuration configuration = tEnv.getConfig().getConfiguration(); - // set low-level key-value options - configuration.setString(TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED.key(), "true"); - } - } - } - return tEnv; - } - - protected StreamExecutionEnvironment getEnv() { - if (env == null) { - synchronized (this) { - if (env == null) { - StateBackend backend = - new FsStateBackend( - "file:///" + System.getProperty("java.io.tmpdir") + "/flink/backend"); - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.setParallelism(defaultParallelism()); - env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE); - env.getCheckpointConfig().setCheckpointInterval(300); - env.getCheckpointConfig() - .enableExternalizedCheckpoints( - CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); - env.setStateBackend(backend); - env.setRestartStrategy(RestartStrategies.noRestart()); - } - } - } - return env; - } - - protected int defaultParallelism() { - return 1; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/CatalogITCaseBase.java 
b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/CatalogITCaseBase.java deleted file mode 100644 index 2e10a280cb..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/CatalogITCaseBase.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.table; - -import static org.apache.flink.table.api.config.TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED; - -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.CatalogTestHelper; -import org.apache.amoro.catalog.TableTestBase; -import org.apache.flink.api.common.restartstrategy.RestartStrategies; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.runtime.state.StateBackend; -import org.apache.flink.runtime.state.filesystem.FsStateBackend; -import org.apache.flink.streaming.api.CheckpointingMode; -import org.apache.flink.streaming.api.environment.CheckpointConfig; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.api.EnvironmentSettings; -import org.apache.flink.table.api.TableEnvironment; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; -import org.apache.flink.test.util.MiniClusterWithClientResource; -import org.apache.iceberg.flink.MiniClusterResource; -import org.junit.ClassRule; - -public abstract class CatalogITCaseBase extends TableTestBase { - - @ClassRule - public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = - MiniClusterResource.createWithClassloaderCheckDisabled(); - - private volatile StreamTableEnvironment tEnv = null; - private volatile StreamExecutionEnvironment env = null; - - public CatalogITCaseBase(CatalogTestHelper catalogTestHelper, TableTestHelper tableTestHelper) { - super(catalogTestHelper, tableTestHelper); - } - - protected TableResult exec(String query, Object... args) { - return exec(getTableEnv(), query, args); - } - - protected static TableResult exec(TableEnvironment env, String query, Object... 
args) { - return env.executeSql(String.format(query, args)); - } - - protected StreamTableEnvironment getTableEnv() { - if (tEnv == null) { - synchronized (this) { - if (tEnv == null) { - this.tEnv = - StreamTableEnvironment.create( - getEnv(), EnvironmentSettings.newInstance().inStreamingMode().build()); - Configuration configuration = tEnv.getConfig().getConfiguration(); - // set low-level key-value options - configuration.setString(TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED.key(), "true"); - } - } - } - return tEnv; - } - - protected StreamExecutionEnvironment getEnv() { - if (env == null) { - synchronized (this) { - if (env == null) { - StateBackend backend = - new FsStateBackend( - "file:///" + System.getProperty("java.io.tmpdir") + "/flink/backend"); - env = - StreamExecutionEnvironment.getExecutionEnvironment( - MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); - env.setParallelism(defaultParallelism()); - env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE); - env.getCheckpointConfig().setCheckpointInterval(300); - env.getCheckpointConfig() - .enableExternalizedCheckpoints( - CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); - env.setStateBackend(backend); - env.setRestartStrategy(RestartStrategies.noRestart()); - } - } - } - return env; - } - - protected int defaultParallelism() { - return 1; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/LookupITCase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/LookupITCase.java deleted file mode 100644 index 9a1a21e09d..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/LookupITCase.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.table; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.flink.util.DataUtil; -import org.apache.amoro.flink.write.FlinkTaskWriterBaseTest; -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.TableIdentifier; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.io.TaskWriter; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.io.IOException; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Set; -import java.util.concurrent.TimeUnit; - -public class LookupITCase extends CatalogITCaseBase implements FlinkTaskWriterBaseTest { - private String db; - - public LookupITCase() { - super( - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, 
false)); - } - - @Before - public void setup() throws IOException { - List dbs = getMixedFormatCatalog().listDatabases(); - if (dbs.isEmpty()) { - db = "test_db"; - getMixedFormatCatalog().createDatabase(db); - } else { - db = dbs.get(0); - } - exec( - "create catalog mixed_catalog with ('type'='arctic', 'metastore.url'='%s')", - getCatalogUri()); - exec( - "create table mixed_catalog.%s.L (id int) " - + "with ('scan.startup.mode'='earliest', 'monitor-interval'='1 s','streaming'='true')", - db); - exec( - "create table mixed_catalog.%s.DIM (id int, name string, primary key(id) not enforced) " - + "with ('write.upsert.enabled'='true', 'lookup.reloading.interval'='1 s')", - db); - exec("create view vi as select *, PROCTIME() as proc from mixed_catalog.%s.L", db); - - writeAndCommit( - TableIdentifier.of(getCatalogName(), db, "DIM"), - Lists.newArrayList(DataUtil.toRowData(1, "a"), DataUtil.toRowData(2, "b"))); - writeAndCommit( - TableIdentifier.of(getCatalogName(), db, "L"), Lists.newArrayList(DataUtil.toRowData(1))); - } - - @After - public void drop() { - exec("drop table mixed_catalog.%s.L", db); - exec("drop table mixed_catalog.%s.DIM", db); - } - - @Test() - public void testLookup() throws Exception { - TableResult tableResult = - exec( - "select L.id, D.name from vi L LEFT JOIN mixed_catalog.%s.DIM " - + "for system_time as of L.proc AS D ON L.id = D.id", - db); - - tableResult.await(1, TimeUnit.MINUTES); // wait for the first row. 
- - writeToChangeAndCommit( - TableIdentifier.of(getCatalogName(), db, "DIM"), - Lists.newArrayList( - DataUtil.toRowData(2, "c"), - DataUtil.toRowData(3, "d"), - DataUtil.toRowData(4, "e"), - DataUtil.toRowData(5, "f")), - true); - Thread.sleep(2000); // wait dim table commit and reload - - writeToChangeAndCommit( - TableIdentifier.of(getCatalogName(), db, "L"), - Lists.newArrayList( - DataUtil.toRowData(2), - DataUtil.toRowData(3), - DataUtil.toRowData(4), - DataUtil.toRowData(5), - DataUtil.toRowData(6)), - false); - - int expected = 6, count = 0; - Set actual = new HashSet<>(); - try (CloseableIterator rows = tableResult.collect()) { - while (count < expected && rows.hasNext()) { - Row row = rows.next(); - actual.add(row); - count++; - } - } - - Assert.assertEquals(expected, actual.size()); - List expects = new LinkedList<>(); - expects.add(new Object[] {1, "a"}); - expects.add(new Object[] {2, "c"}); - expects.add(new Object[] {3, "d"}); - expects.add(new Object[] {4, "e"}); - expects.add(new Object[] {5, "f"}); - expects.add(new Object[] {6, null}); - Assert.assertEquals(DataUtil.toRowSet(expects), actual); - } - - @Override - public String getMetastoreUri() { - return getCatalogUri(); - } - - @Override - public String getCatalogName() { - return getMixedFormatCatalog().name(); - } - - @Override - public boolean upsertEnabled() { - return true; - } - - private void writeAndCommit(TableIdentifier table, List expected) throws IOException { - writeAndCommit(table, expected, true, false); - } - - private void writeToChangeAndCommit( - TableIdentifier table, List expected, boolean upsertEnabled) throws IOException { - writeAndCommit(table, expected, false, upsertEnabled); - } - - private void writeAndCommit( - TableIdentifier table, - List expected, - boolean writeToBaseStore, - boolean upsertEnabled) - throws IOException { - MixedTable mixedTable = getMixedFormatCatalog().loadTable(table); - Assert.assertNotNull(mixedTable); - RowType rowType = 
FlinkSchemaUtil.convert(mixedTable.schema()); - for (RowData rowData : expected) { - try (TaskWriter taskWriter = - writeToBaseStore - ? createBaseTaskWriter(mixedTable, rowType) - : createTaskWriter(mixedTable, rowType)) { - if (writeToBaseStore) { - writeAndCommit(rowData, taskWriter, mixedTable); - } else { - writeAndCommit(rowData, taskWriter, mixedTable, upsertEnabled); - } - } - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestJoin.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestJoin.java deleted file mode 100644 index da5347a5e5..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestJoin.java +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.table; - -import static org.apache.flink.table.planner.factories.TestValuesTableFactory.registerData; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.flink.util.DataUtil; -import org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.flink.util.TestUtil; -import org.apache.amoro.table.KeyedTable; -import org.apache.amoro.table.TableIdentifier; -import org.apache.flink.api.common.JobStatus; -import org.apache.flink.runtime.testutils.CommonTestUtils; -import org.apache.flink.shaded.guava30.com.google.common.collect.Lists; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.planner.factories.TestValuesTableFactory; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.Row; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.io.TaskWriter; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.time.LocalDateTime; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; - -public class TestJoin extends FlinkTestBase { - - public static final Logger LOG = LoggerFactory.getLogger(TestJoin.class); - - @Rule public 
TemporaryFolder tempFolder = new TemporaryFolder(); - - private static final String DB = TableTestHelper.TEST_DB_NAME; - private static final String TABLE = "test_keyed"; - private static final TableIdentifier TABLE_ID = - TableIdentifier.of(TableTestHelper.TEST_CATALOG_NAME, TableTestHelper.TEST_DB_NAME, TABLE); - - public TestJoin() { - super( - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(false, false)); - } - - @Before - public void before() throws Exception { - super.before(); - super.config(); - } - - @After - public void after() { - getMixedFormatCatalog().dropTable(TABLE_ID, true); - } - - @Test - public void testRightEmptyLookupJoin() throws Exception { - getEnv().getCheckpointConfig().disableCheckpointing(); - List data = new LinkedList<>(); - data.add(new Object[] {RowKind.INSERT, 1000004L, "a", LocalDateTime.now()}); - data.add(new Object[] {RowKind.INSERT, 1000015L, "b", LocalDateTime.now()}); - data.add(new Object[] {RowKind.INSERT, 1000011L, "c", LocalDateTime.now()}); - data.add(new Object[] {RowKind.INSERT, 1000022L, "d", LocalDateTime.now()}); - data.add(new Object[] {RowKind.INSERT, 1000021L, "e", LocalDateTime.now()}); - data.add(new Object[] {RowKind.INSERT, 1000016L, "e", LocalDateTime.now()}); - String id = TestValuesTableFactory.registerData(DataUtil.toRowList(data)); - sql( - "CREATE TABLE `user` (id bigint, name string, op_time timestamp(3), watermark for op_time as op_time) " - + "with (" - + " 'connector' = 'values'," - + " 'bounded' = 'false'," - + " 'data-id' = '" - + id - + "' " - + " )"); - - sql(String.format("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props))); - Map tableProperties = new HashMap<>(); - String table = String.format("mixed_catalog.%s.%s", DB, TABLE); - - String sql = - String.format( - "CREATE TABLE IF NOT EXISTS %s (" - + " info int, id bigint, name STRING" - + ", PRIMARY KEY (id) NOT ENFORCED) WITH %s", - table, toWithClause(tableProperties)); - sql(sql); - - 
sql("create table d (op_time timestamp(3), watermark for op_time as op_time) like %s", table); - - TableResult result = - exec( - "select u.name, u.id, dim.info, dim.name dname from `user` as u left join d " - + "/*+OPTIONS('streaming'='true', 'dim-table.enabled'='true')*/ for system_time as of u.op_time as dim" - + " on u.id = dim.id"); - - CommonTestUtils.waitForJobStatus( - result.getJobClient().get(), Lists.newArrayList(JobStatus.RUNNING)); - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (Object[] datum : data) { - Row row = iterator.next(); - actual.add(row); - } - } - result.getJobClient().ifPresent(TestUtil::cancelJob); - - List expected = new LinkedList<>(); - expected.add(new Object[] {"a", 1000004L, null, null}); - expected.add(new Object[] {"b", 1000015L, null, null}); - expected.add(new Object[] {"c", 1000011L, null, null}); - expected.add(new Object[] {"d", 1000022L, null, null}); - expected.add(new Object[] {"e", 1000021L, null, null}); - expected.add(new Object[] {"e", 1000016L, null, null}); - Assert.assertEquals(DataUtil.toRowSet(expected), actual); - } - - @Test - public void testLookupJoin() throws Exception { - getEnv().getCheckpointConfig().disableCheckpointing(); - List data = new LinkedList<>(); - data.add(new Object[] {RowKind.INSERT, 1L, "a", LocalDateTime.now().minusDays(3)}); - data.add(new Object[] {RowKind.INSERT, 2L, "b", LocalDateTime.now()}); - data.add(new Object[] {RowKind.INSERT, 3L, "c", LocalDateTime.now()}); - data.add(new Object[] {RowKind.INSERT, 4L, "d", LocalDateTime.now().plusDays(3)}); - data.add(new Object[] {RowKind.INSERT, 5L, "e", LocalDateTime.now().plusDays(3)}); - data.add(new Object[] {RowKind.INSERT, 3L, "e", LocalDateTime.now()}); - data.add(new Object[] {RowKind.INSERT, 6L, "f", LocalDateTime.now()}); - data.add(new Object[] {RowKind.INSERT, 8L, "g", LocalDateTime.now()}); - data.add(new Object[] {RowKind.INSERT, 9L, "h", LocalDateTime.now()}); - String id = 
registerData(DataUtil.toRowList(data)); - sql( - "CREATE TABLE `user` (id bigint, name string, op_time timestamp(3), watermark for op_time as op_time) " - + "with (" - + " 'connector' = 'values'," - + " 'bounded' = 'false'," - + " 'data-id' = '" - + id - + "' " - + " )"); - - sql(String.format("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props))); - Map tableProperties = new HashMap<>(); - String table = String.format("mixed_catalog.%s.%s", DB, TABLE); - - String sql = - String.format( - "CREATE TABLE IF NOT EXISTS %s (" - + " info int, id bigint, name STRING" - + ", PRIMARY KEY (id) NOT ENFORCED) WITH %s", - table, toWithClause(tableProperties)); - sql(sql); - - TableSchema flinkSchema = - TableSchema.builder() - .field("info", DataTypes.INT()) - .field("id", DataTypes.BIGINT()) - .field("name", DataTypes.STRING()) - .build(); - RowType rowType = (RowType) flinkSchema.toRowDataType().getLogicalType(); - KeyedTable keyedTable = - (KeyedTable) - MixedFormatUtils.loadMixedTable(MixedFormatTableLoader.of(TABLE_ID, catalogBuilder)); - TaskWriter taskWriter = createKeyedTaskWriter(keyedTable, rowType, true); - List baseData = - new ArrayList() { - { - add(GenericRowData.ofKind(RowKind.INSERT, 123, 1L, StringData.fromString("a"))); - add(GenericRowData.ofKind(RowKind.INSERT, 324, 2L, StringData.fromString("b"))); - add(GenericRowData.ofKind(RowKind.INSERT, 456, 3L, StringData.fromString("c"))); - add(GenericRowData.ofKind(RowKind.INSERT, 463, 4L, StringData.fromString("d"))); - } - }; - for (RowData record : baseData) { - taskWriter.write(record); - } - commit(keyedTable, taskWriter.complete(), true); - - writeChange(keyedTable, rowType); - - sql("create table d (op_time timestamp(3), watermark for op_time as op_time) like %s", table); - - TableResult result = - exec( - "select u.name, u.id, dim.info, dim.name dname from `user` as u left join d " - + "/*+OPTIONS('streaming'='true', 'dim-table.enabled'='true')*/ for system_time as of u.op_time as dim" - + " on 
u.id = dim.id"); - - CommonTestUtils.waitForJobStatus( - result.getJobClient().get(), Lists.newArrayList(JobStatus.RUNNING)); - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (Object[] datum : data) { - Row row = iterator.next(); - actual.add(row); - } - } - result.getJobClient().ifPresent(TestUtil::cancelJob); - - List expected = new LinkedList<>(); - expected.add(new Object[] {"a", 1L, 123, "a"}); - expected.add(new Object[] {"b", 2L, 324, "b"}); - expected.add(new Object[] {"c", 3L, null, null}); - expected.add(new Object[] {"d", 4L, 463, "d"}); - expected.add(new Object[] {"e", 5L, 324, "john"}); - expected.add(new Object[] {"e", 3L, null, null}); - expected.add(new Object[] {"f", 6L, 324, "lily"}); - expected.add(new Object[] {"g", 8L, null, null}); - expected.add(new Object[] {"h", 9L, null, null}); - Assert.assertEquals(DataUtil.toRowSet(expected), actual); - } - - @Test - public void testLookupJoinWithPartialFields() throws Exception { - getEnv().getCheckpointConfig().disableCheckpointing(); - List data = new LinkedList<>(); - data.add(new Object[] {RowKind.INSERT, 1L, "a", LocalDateTime.now().minusDays(3)}); - data.add(new Object[] {RowKind.INSERT, 2L, "b", LocalDateTime.now()}); - data.add(new Object[] {RowKind.INSERT, 3L, "c", LocalDateTime.now()}); - data.add(new Object[] {RowKind.INSERT, 4L, "d", LocalDateTime.now().plusDays(3)}); - data.add(new Object[] {RowKind.INSERT, 5L, "e", LocalDateTime.now().plusDays(3)}); - data.add(new Object[] {RowKind.INSERT, 3L, "e", LocalDateTime.now()}); - data.add(new Object[] {RowKind.INSERT, 6L, "f", LocalDateTime.now()}); - data.add(new Object[] {RowKind.INSERT, 8L, "g", LocalDateTime.now()}); - data.add(new Object[] {RowKind.INSERT, 9L, "h", LocalDateTime.now()}); - String id = registerData(DataUtil.toRowList(data)); - sql( - "CREATE TABLE `user` (id bigint, name string, op_time timestamp(3), watermark for op_time as op_time) " - + "with (" - + " 'connector' = 'values'," 
- + " 'bounded' = 'false'," - + " 'data-id' = '" - + id - + "' " - + " )"); - - sql(String.format("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props))); - Map tableProperties = new HashMap<>(); - String table = String.format("mixed_catalog.%s.%s", DB, TABLE); - - String sql = - String.format( - "CREATE TABLE IF NOT EXISTS %s (" - + " info int, id bigint, name STRING" - + ", PRIMARY KEY (id) NOT ENFORCED) WITH %s", - table, toWithClause(tableProperties)); - sql(sql); - - TableSchema flinkSchema = - TableSchema.builder() - .field("info", DataTypes.INT()) - .field("id", DataTypes.BIGINT()) - .field("name", DataTypes.STRING()) - .build(); - RowType rowType = (RowType) flinkSchema.toRowDataType().getLogicalType(); - KeyedTable keyedTable = - (KeyedTable) - MixedFormatUtils.loadMixedTable(MixedFormatTableLoader.of(TABLE_ID, catalogBuilder)); - TaskWriter taskWriter = createKeyedTaskWriter(keyedTable, rowType, true); - List baseData = - new ArrayList() { - { - add(GenericRowData.ofKind(RowKind.INSERT, 123, 1L, StringData.fromString("a"))); - add(GenericRowData.ofKind(RowKind.INSERT, 324, 2L, StringData.fromString("b"))); - add(GenericRowData.ofKind(RowKind.INSERT, 456, 3L, StringData.fromString("c"))); - add(GenericRowData.ofKind(RowKind.INSERT, 463, 4L, StringData.fromString("d"))); - } - }; - for (RowData record : baseData) { - taskWriter.write(record); - } - commit(keyedTable, taskWriter.complete(), true); - - writeChange(keyedTable, rowType); - - sql("create table d (op_time timestamp(3), watermark for op_time as op_time) like %s", table); - - // schema fields:[info, id, name], now only use [id, name] - TableResult result = - exec( - "select u.name, u.id, dim.name dname from `user` as u left join d " - + "/*+OPTIONS('streaming'='true', 'dim-table.enabled'='true')*/ for system_time as of u.op_time as dim" - + " on u.id = dim.id"); - - CommonTestUtils.waitForJobStatus( - result.getJobClient().get(), Lists.newArrayList(JobStatus.RUNNING)); - Set actual = new 
HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (Object[] datum : data) { - Row row = iterator.next(); - actual.add(row); - } - } - result.getJobClient().ifPresent(TestUtil::cancelJob); - - List expected = new LinkedList<>(); - expected.add(new Object[] {"a", 1L, "a"}); - expected.add(new Object[] {"b", 2L, "b"}); - expected.add(new Object[] {"c", 3L, null}); - expected.add(new Object[] {"d", 4L, "d"}); - expected.add(new Object[] {"e", 5L, "john"}); - expected.add(new Object[] {"e", 3L, null}); - expected.add(new Object[] {"f", 6L, "lily"}); - expected.add(new Object[] {"g", 8L, null}); - expected.add(new Object[] {"h", 9L, null}); - Assert.assertEquals(DataUtil.toRowSet(expected), actual); - } - - private void writeChange(KeyedTable keyedTable, RowType rowType) { - TaskWriter taskWriter = createKeyedTaskWriter(keyedTable, rowType, false); - List data = - new ArrayList() { - { - add(GenericRowData.ofKind(RowKind.INSERT, 324, 5L, StringData.fromString("john"))); - add(GenericRowData.ofKind(RowKind.INSERT, 324, 6L, StringData.fromString("lily"))); - add(GenericRowData.ofKind(RowKind.DELETE, 324, 3L, StringData.fromString("jake1"))); - } - }; - try { - for (RowData record : data) { - taskWriter.write(record); - } - commit(keyedTable, taskWriter.complete(), false); - } catch (IOException e) { - throw new RuntimeException(e); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestKeyed.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestKeyed.java deleted file mode 100644 index 05fc24eb23..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestKeyed.java +++ /dev/null @@ -1,1164 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.table; - -import static org.apache.amoro.table.TableProperties.ENABLE_LOG_STORE; -import static org.apache.amoro.table.TableProperties.LOG_STORE_ADDRESS; -import static org.apache.amoro.table.TableProperties.LOG_STORE_MESSAGE_TOPIC; -import static org.apache.amoro.table.TableProperties.LOG_STORE_STORAGE_TYPE_KAFKA; -import static org.apache.amoro.table.TableProperties.LOG_STORE_TYPE; -import static org.apache.flink.table.api.Expressions.$; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.catalog.CatalogTestHelper; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.flink.kafka.testutils.KafkaContainerTest; -import org.apache.amoro.flink.util.DataUtil; -import org.apache.amoro.flink.util.TestUtil; -import org.apache.amoro.hive.TestHMS; -import org.apache.amoro.hive.catalog.HiveCatalogTestHelper; -import org.apache.amoro.hive.catalog.HiveTableTestHelper; -import org.apache.amoro.table.TableProperties; -import org.apache.commons.collections.CollectionUtils; -import org.apache.flink.streaming.api.datastream.DataStream; -import 
org.apache.flink.table.api.ApiExpression; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.Table; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; -import org.apache.flink.types.Row; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.CloseableIterator; -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.ClassRule; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.rules.TestName; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.time.LocalDateTime; -import java.time.ZoneId; -import java.util.Arrays; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; - -@RunWith(Parameterized.class) -public class TestKeyed extends FlinkTestBase { - - public static final Logger LOG = LoggerFactory.getLogger(TestKeyed.class); - - @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); - @Rule public TestName testName = new TestName(); - @ClassRule public static TestHMS TEST_HMS = new TestHMS(); - - private static final String DB = TableTestHelper.TEST_TABLE_ID.getDatabase(); - private static final String TABLE = "test_keyed"; - - private String catalog; - private String db; - private String topic; - private final Map tableProperties = new HashMap<>(); - public boolean isHive; - - public TestKeyed( - CatalogTestHelper catalogTestHelper, TableTestHelper tableTestHelper, boolean isHive) { - super(catalogTestHelper, tableTestHelper); - this.isHive = isHive; - } - - @Parameterized.Parameters(name = "{0}, {1}, 
{2}") - public static Collection parameters() { - return Arrays.asList( - new Object[][] { - { - new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), - new HiveTableTestHelper(true, true), - true - }, - { - new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), - new HiveTableTestHelper(true, true), - true - }, - { - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, true), - false - }, - { - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, true), - false - } - }); - } - - @BeforeClass - public static void beforeClass() throws Exception { - FlinkTestBase.prepare(); - } - - @AfterClass - public static void afterClass() throws Exception { - FlinkTestBase.shutdown(); - } - - @Before - public void before() throws Exception { - if (isHive) { - db = HiveTableTestHelper.TEST_DB_NAME; - } else { - db = DB; - } - super.before(); - prepareLog(); - super.config(); - } - - @After - public void after() { - sql("DROP TABLE IF EXISTS mixed_catalog." + db + "." 
+ TABLE); - } - - private void prepareLog() { - topic = TestUtil.getUtMethodName(testName) + isHive; - tableProperties.clear(); - tableProperties.put(ENABLE_LOG_STORE, "true"); - tableProperties.put(LOG_STORE_MESSAGE_TOPIC, topic); - KafkaContainerTest.createTopics(KAFKA_PARTITION_NUMS, 1, topic); - tableProperties.put(LOG_STORE_TYPE, LOG_STORE_STORAGE_TYPE_KAFKA); - tableProperties.put( - LOG_STORE_ADDRESS, KafkaContainerTest.KAFKA_CONTAINER.getBootstrapServers()); - } - - @Test - public void testSinkSourceFile() throws IOException { - - List data = new LinkedList<>(); - data.add( - new Object[] { - RowKind.INSERT, - 1000004, - "a", - LocalDateTime.parse("2022-06-17T10:10:11.0"), - LocalDateTime.parse("2022-06-17T10:10:11.0").atZone(ZoneId.systemDefault()).toInstant() - }); - data.add( - new Object[] { - RowKind.DELETE, - 1000015, - "b", - LocalDateTime.parse("2022-06-17T10:08:11.0"), - LocalDateTime.parse("2022-06-17T10:08:11.0").atZone(ZoneId.systemDefault()).toInstant() - }); - data.add( - new Object[] { - RowKind.DELETE, - 1000011, - "c", - LocalDateTime.parse("2022-06-18T10:10:11.0"), - LocalDateTime.parse("2022-06-18T10:10:11.0").atZone(ZoneId.systemDefault()).toInstant() - }); - data.add( - new Object[] { - RowKind.UPDATE_BEFORE, - 1000021, - "d", - LocalDateTime.parse("2022-06-17T10:11:11.0"), - LocalDateTime.parse("2022-06-17T10:11:11.0").atZone(ZoneId.systemDefault()).toInstant() - }); - data.add( - new Object[] { - RowKind.UPDATE_AFTER, - 1000021, - "e", - LocalDateTime.parse("2022-06-17T10:11:11.0"), - LocalDateTime.parse("2022-06-17T10:11:11.0").atZone(ZoneId.systemDefault()).toInstant() - }); - data.add( - new Object[] { - RowKind.INSERT, - 1000015, - "e", - LocalDateTime.parse("2022-06-17T10:10:11.0"), - LocalDateTime.parse("2022-06-17T10:10:11.0").atZone(ZoneId.systemDefault()).toInstant() - }); - - DataStream source = - getEnv() - .fromCollection( - DataUtil.toRowData(data), - InternalTypeInfo.ofFields( - DataTypes.INT().getLogicalType(), - 
DataTypes.VARCHAR(100).getLogicalType(), - DataTypes.TIMESTAMP().getLogicalType(), - DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE().getLogicalType())); - - Table input = - getTableEnv().fromDataStream(source, $("id"), $("name"), $("op_time"), $("op_time_tz")); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - sql( - "CREATE TABLE mixed_catalog." - + db - + "." - + TABLE - + " (" - + " id INT," - + " name STRING," - + " op_time_tz TIMESTAMP WITH LOCAL TIME ZONE," - + " op_time TIMESTAMP," - + " PRIMARY KEY (id) NOT ENFORCED " - + ") PARTITIONED BY(op_time) " - + " WITH (" - + " 'connector' = 'arctic'" - + ")"); - - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + "/*+ OPTIONS(" - + "'mixed-format.emit.mode'='file'" - + ")*/ select id, name, op_time_tz, op_time from input"); - - List actual = - sql( - "select id, op_time, op_time_tz from mixed_catalog." - + db - + "." - + TABLE - + "/*+ OPTIONS(" - + "'mixed-format.read.mode'='file'" - + ", 'streaming'='false'" - + ", 'source.parallelism'='2'" - + ")*/"); - - List expected = new LinkedList<>(); - expected.add( - new Object[] { - RowKind.INSERT, - 1000004, - LocalDateTime.parse("2022-06-17T10:10:11.0"), - LocalDateTime.parse("2022-06-17T10:10:11.0").atZone(ZoneId.systemDefault()).toInstant() - }); - expected.add( - new Object[] { - RowKind.INSERT, - 1000021, - LocalDateTime.parse("2022-06-17T10:11:11.0"), - LocalDateTime.parse("2022-06-17T10:11:11.0").atZone(ZoneId.systemDefault()).toInstant() - }); - expected.add( - new Object[] { - RowKind.INSERT, - 1000015, - LocalDateTime.parse("2022-06-17T10:10:11.0"), - LocalDateTime.parse("2022-06-17T10:10:11.0").atZone(ZoneId.systemDefault()).toInstant() - }); - - Assert.assertTrue(CollectionUtils.isEqualCollection(DataUtil.toRowList(expected), actual)); - } - - @Test - public void testUnpartitionLogSinkSource() throws Exception { - List data = new LinkedList<>(); - data.add(new Object[] 
{1000004, "a"}); - data.add(new Object[] {1000015, "b"}); - data.add(new Object[] {1000011, "c"}); - data.add(new Object[] {1000014, "d"}); - data.add(new Object[] {1000021, "d"}); - data.add(new Object[] {1000007, "e"}); - - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, PRIMARY KEY (id) NOT ENFORCED) WITH %s", - toWithClause(tableProperties)); - - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'mixed-format.emit.mode'='log'" - + ", 'log.version'='v1'" - + ") */" - + " select * from input"); - - TableResult result = - exec( - "select * from mixed_catalog." - + db - + "." 
- + TABLE - + "/*+ OPTIONS(" - + "'mixed-format.read.mode'='log'" - + ", 'scan.startup.mode'='earliest'" - + ", 'source.parallelism'='2'" - + ")*/"); - - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (Object[] datum : data) { - Row row = iterator.next(); - actual.add(row); - } - } - Assert.assertEquals(DataUtil.toRowSet(data), actual); - result.getJobClient().ifPresent(TestUtil::cancelJob); - } - - @Test - public void testUnpartitionLogSinkSourceWithSelectedFields() throws Exception { - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000015, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING()), - DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED) WITH %s", - toWithClause(tableProperties)); - - sql( - "insert into mixed_catalog." - + db - + "." 
- + TABLE - + " /*+ OPTIONS(" - + "'mixed-format.emit.mode'='log'" - + ", 'log.version'='v1'" - + ") */" - + " select * from input"); - - TableResult result = - exec( - "select id, op_time from mixed_catalog." - + db - + "." - + TABLE - + "/*+ OPTIONS(" - + "'mixed-format.read.mode'='log'" - + ", 'scan.startup.mode'='earliest'" - + ")*/"); - - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (Object[] datum : data) { - Row row = iterator.next(); - actual.add(row); - } - } - - List expected = new LinkedList<>(); - expected.add(new Object[] {1000004, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add(new Object[] {1000015, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add(new Object[] {1000011, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add(new Object[] {1000014, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - expected.add(new Object[] {1000015, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - expected.add(new Object[] {1000007, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - expected.add(new Object[] {1000007, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - - Assert.assertEquals(DataUtil.toRowSet(expected), actual); - result.getJobClient().ifPresent(TestUtil::cancelJob); - } - - @Test - public void testUnPartitionDoubleSink() throws Exception { - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a"}); - data.add(new Object[] {1000015, "b"}); - data.add(new Object[] {1000011, "c"}); - data.add(new Object[] {1000014, "d"}); - data.add(new Object[] {1000021, "d"}); - data.add(new Object[] {1000007, "e"}); - - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING())), - rows); - getTableEnv().createTemporaryView("input", input); - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - sql( - "CREATE TABLE IF NOT EXISTS 
mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, PRIMARY KEY (id) NOT ENFORCED) WITH %s", - toWithClause(tableProperties)); - - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'mixed-format.emit.mode'='file, log'" - + ") */" - + "select id, name from input"); - - Assert.assertEquals( - DataUtil.toRowSet(data), - new HashSet<>( - sql( - "select * from mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS('streaming'='false') */"))); - - TableResult result = - exec( - "select * from mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS('mixed-format.read.mode'='log', 'scan.startup.mode'='earliest') */"); - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (Object[] datum : data) { - actual.add(iterator.next()); - } - } - Assert.assertEquals(DataUtil.toRowSet(data), actual); - result.getJobClient().ifPresent(TestUtil::cancelJob); - } - - @Test - public void testPartitionSinkFile() throws IOException { - - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000015, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING()), - DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE 
CATALOG mixed_catalog WITH %s", toWithClause(props)); - - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED " - + ") PARTITIONED BY(op_time) WITH ('connector' = 'arctic')"); - - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + "/*+ OPTIONS(" - + "'mixed-format.emit.mode'='file'" - + ")*/" - + " select * from input"); - - Assert.assertEquals( - DataUtil.toRowSet(data), - new HashSet<>( - sql( - "select * from mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'streaming'='false'" - + ") */"))); - } - - @Test - public void testSinkSourceFileWithoutSelectPK() throws Exception { - - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000015, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING()), - DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED " - + ") WITH ('connector' = 'arctic')"); - - sql( - "insert into mixed_catalog." 
- + db - + "." - + TABLE - + "/*+ OPTIONS(" - + "'mixed-format.emit.mode'='file'" - + ")*/" - + " select * from input"); - - TableResult result = - exec( - "select name, op_time from mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'streaming'='false'" - + ") */"); - LinkedList actual = new LinkedList<>(); - try (CloseableIterator iterator = result.collect()) { - while (iterator.hasNext()) { - Row row = iterator.next(); - actual.add(row); - } - } - - List expected = new LinkedList<>(); - expected.add(new Object[] {"a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add(new Object[] {"b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add(new Object[] {"c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add(new Object[] {"d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - expected.add(new Object[] {"d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - expected.add(new Object[] {"e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - - Assert.assertEquals(DataUtil.toRowSet(expected), new HashSet<>(actual)); - } - - @Test - public void testFileUpsert() { - - List data = new LinkedList<>(); - data.add( - new Object[] {RowKind.INSERT, 1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add( - new Object[] {RowKind.DELETE, 1000015, "b", LocalDateTime.parse("2022-06-17T10:08:11.0")}); - data.add( - new Object[] {RowKind.DELETE, 1000011, "c", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add( - new Object[] { - RowKind.UPDATE_BEFORE, 1000021, "d", LocalDateTime.parse("2022-06-17T10:11:11.0") - }); - data.add( - new Object[] { - RowKind.UPDATE_AFTER, 1000021, "e", LocalDateTime.parse("2022-06-17T10:11:11.0") - }); - data.add( - new Object[] { - RowKind.UPDATE_AFTER, 1000021, "e", LocalDateTime.parse("2022-06-17T10:11:11.0") - }); - data.add( - new Object[] {RowKind.INSERT, 1000015, "e", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add( - new Object[] {RowKind.INSERT, 1000021, "e", 
LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add( - new Object[] { - RowKind.UPDATE_BEFORE, 1000021, "e", LocalDateTime.parse("2022-06-17T10:10:11.0") - }); - data.add( - new Object[] { - RowKind.UPDATE_AFTER, 1000021, "d", LocalDateTime.parse("2022-06-17T10:10:11.0") - }); - data.add( - new Object[] { - RowKind.UPDATE_BEFORE, 1000015, "e", LocalDateTime.parse("2022-06-17T10:10:11.0") - }); - data.add( - new Object[] { - RowKind.UPDATE_AFTER, 1000021, "f", LocalDateTime.parse("2022-06-17T10:10:11.0") - }); - DataStream source = - getEnv() - .fromCollection( - DataUtil.toRowData(data), - InternalTypeInfo.ofFields( - DataTypes.INT().getLogicalType(), - DataTypes.VARCHAR(100).getLogicalType(), - DataTypes.TIMESTAMP().getLogicalType())); - - Table input = getTableEnv().fromDataStream(source, $("id"), $("name"), $("op_time")); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - Map tableProperties = new HashMap<>(); - tableProperties.put(TableProperties.UPSERT_ENABLED, "true"); - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED " - + ") PARTITIONED BY(op_time) WITH %s", - toWithClause(tableProperties)); - - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + "/*+ OPTIONS(" - + "'mixed-format.emit.mode'='file'" - + ")*/" - + " select * from input"); - - List expected = new LinkedList<>(); - expected.add( - new Object[] {RowKind.INSERT, 1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - // key = 1000021 locate in two partitions. - expected.add( - new Object[] {RowKind.INSERT, 1000021, "e", LocalDateTime.parse("2022-06-17T10:11:11.0")}); - expected.add( - new Object[] {RowKind.INSERT, 1000021, "f", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - Assert.assertEquals( - DataUtil.toRowSet(expected), - new HashSet<>( - sql( - "select * from mixed_catalog." 
- + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'streaming'='false'" - + ") */"))); - } - - @Test - public void testFileCDC() { - - List data = new LinkedList<>(); - data.add( - new Object[] {RowKind.INSERT, 1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add( - new Object[] {RowKind.DELETE, 1000015, "b", LocalDateTime.parse("2022-06-17T10:08:11.0")}); - data.add( - new Object[] {RowKind.DELETE, 1000011, "c", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add( - new Object[] { - RowKind.UPDATE_BEFORE, 1000021, "d", LocalDateTime.parse("2022-06-17T10:11:11.0") - }); - data.add( - new Object[] { - RowKind.UPDATE_AFTER, 1000021, "e", LocalDateTime.parse("2022-06-17T10:11:11.0") - }); - data.add( - new Object[] { - RowKind.UPDATE_AFTER, 1000021, "e", LocalDateTime.parse("2022-06-17T10:11:11.0") - }); - data.add( - new Object[] {RowKind.INSERT, 1000015, "e", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add( - new Object[] {RowKind.INSERT, 1000021, "e", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add( - new Object[] { - RowKind.UPDATE_BEFORE, 1000021, "e", LocalDateTime.parse("2022-06-17T10:10:11.0") - }); - data.add( - new Object[] { - RowKind.UPDATE_AFTER, 1000021, "d", LocalDateTime.parse("2022-06-17T10:10:11.0") - }); - data.add( - new Object[] { - RowKind.UPDATE_BEFORE, 1000015, "e", LocalDateTime.parse("2022-06-17T10:10:11.0") - }); - data.add( - new Object[] { - RowKind.UPDATE_AFTER, 1000021, "f", LocalDateTime.parse("2022-06-17T10:10:11.0") - }); - data.add( - new Object[] {RowKind.INSERT, 1000031, "g", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add( - new Object[] {RowKind.INSERT, 1000032, "h", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add( - new Object[] { - RowKind.UPDATE_BEFORE, 1000031, "g", LocalDateTime.parse("2022-06-17T10:10:11.0") - }); - data.add( - new Object[] { - RowKind.UPDATE_BEFORE, 1000032, "h", LocalDateTime.parse("2022-06-17T10:10:11.0") - }); - data.add( - new 
Object[] { - RowKind.UPDATE_AFTER, 1000031, "f", LocalDateTime.parse("2022-06-17T10:10:11.0") - }); - data.add( - new Object[] { - RowKind.UPDATE_AFTER, 1000032, "e", LocalDateTime.parse("2022-06-17T10:10:11.0") - }); - DataStream source = - getEnv() - .fromCollection( - DataUtil.toRowData(data), - InternalTypeInfo.ofFields( - DataTypes.INT().getLogicalType(), - DataTypes.VARCHAR(100).getLogicalType(), - DataTypes.TIMESTAMP().getLogicalType())); - - Table input = getTableEnv().fromDataStream(source, $("id"), $("name"), $("op_time")); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - Map tableProperties = new HashMap<>(); - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED " - + ") PARTITIONED BY(op_time) WITH %s", - toWithClause(tableProperties)); - - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + "/*+ OPTIONS(" - + "'mixed-format.emit.mode'='file'" - + ")*/" - + " select * from input"); - - List expected = new LinkedList<>(); - // upsert is disEnabled, key=1000021 locate in two diff partitions. - expected.add( - new Object[] {RowKind.INSERT, 1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add( - new Object[] {RowKind.INSERT, 1000021, "e", LocalDateTime.parse("2022-06-17T10:11:11.0")}); - expected.add( - new Object[] {RowKind.INSERT, 1000021, "d", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add( - new Object[] {RowKind.INSERT, 1000021, "f", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add( - new Object[] {RowKind.INSERT, 1000031, "f", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add( - new Object[] {RowKind.INSERT, 1000032, "e", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - Assert.assertEquals( - DataUtil.toRowSet(expected), - new HashSet<>( - sql( - "select * from mixed_catalog." 
- + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'streaming'='false'" - + ") */"))); - } - - @Test - public void testFileUpsertWithSamePrimaryKey() throws Exception { - - List data = new LinkedList<>(); - data.add( - new Object[] {RowKind.INSERT, 1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add( - new Object[] {RowKind.INSERT, 1000004, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add( - new Object[] {RowKind.INSERT, 1000011, "e", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add( - new Object[] {RowKind.INSERT, 1000011, "f", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - DataStream source = - getEnv() - .fromCollection( - DataUtil.toRowData(data), - InternalTypeInfo.ofFields( - DataTypes.INT().getLogicalType(), - DataTypes.VARCHAR(100).getLogicalType(), - DataTypes.TIMESTAMP().getLogicalType())); - - getEnv().setParallelism(4); - Table input = getTableEnv().fromDataStream(source, $("id"), $("name"), $("op_time")); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - Map tableProperties = new HashMap<>(); - tableProperties.put(TableProperties.UPSERT_ENABLED, "true"); - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED " - + ") PARTITIONED BY(op_time) WITH %s", - toWithClause(tableProperties)); - - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + "/*+ OPTIONS(" - + "'mixed-format.emit.mode'='file'" - + ")*/" - + " select * from input"); - - TableResult result = - exec( - "select * from mixed_catalog." - + db - + "." 
- + TABLE - + " /*+ OPTIONS(" - + "'streaming'='false'" - + ") */"); - LinkedList actual = new LinkedList<>(); - try (CloseableIterator iterator = result.collect()) { - while (iterator.hasNext()) { - Row row = iterator.next(); - actual.add(row); - } - } - - LinkedList expected = new LinkedList<>(); - - expected.add( - new Object[] {RowKind.INSERT, 1000004, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add( - new Object[] {RowKind.INSERT, 1000011, "f", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - - Map> actualMap = DataUtil.groupByPrimaryKey(actual, 0); - Map> expectedMap = - DataUtil.groupByPrimaryKey(DataUtil.toRowList(expected), 0); - - for (Object key : actualMap.keySet()) { - Assert.assertTrue( - CollectionUtils.isEqualCollection(actualMap.get(key), expectedMap.get(key))); - } - } - - @Test - public void testPartitionLogSinkSource() throws Exception { - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000015, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING()), - DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." 
- + TABLE - + "(" - + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED " - + ") PARTITIONED BY(op_time) WITH %s", - toWithClause(tableProperties)); - - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'mixed-format.emit.mode'='log'" - + ", 'log.version'='v1'" - + ") */" - + " select * from input"); - - TableResult result = - exec( - "select * from mixed_catalog." - + db - + "." - + TABLE - + "/*+ OPTIONS(" - + "'mixed-format.read.mode'='log'" - + ", 'scan.startup.mode'='earliest'" - + ")*/"); - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (Object[] datum : data) { - Row row = iterator.next(); - actual.add(row); - } - } - Assert.assertEquals(DataUtil.toRowSet(data), actual); - result.getJobClient().ifPresent(TestUtil::cancelJob); - } - - @Test - public void testPartitionLogSinkSourceWithSelectedFields() throws Exception { - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000015, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING()), - DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - sql( - "CREATE TABLE IF NOT 
EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED " - + ") PARTITIONED BY(op_time) WITH %s", - toWithClause(tableProperties)); - - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'mixed-format.emit.mode'='log'" - + ", 'log.version'='v1'" - + ") */" - + " select * from input"); - - TableResult result = - exec( - "select id, op_time from mixed_catalog." - + db - + "." - + TABLE - + "/*+ OPTIONS(" - + "'mixed-format.read.mode'='log'" - + ", 'scan.startup.mode'='earliest'" - + ")*/"); - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (Object[] datum : data) { - Row row = iterator.next(); - actual.add(row); - } - } - - List expected = new LinkedList<>(); - expected.add(new Object[] {1000004, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add(new Object[] {1000015, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add(new Object[] {1000011, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add(new Object[] {1000014, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - expected.add(new Object[] {1000015, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - expected.add(new Object[] {1000007, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - expected.add(new Object[] {1000007, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - - Assert.assertEquals(DataUtil.toRowSet(expected), actual); - result.getJobClient().ifPresent(TestUtil::cancelJob); - } - - @Test - public void testPartitionDoubleSink() throws Exception { - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - 
data.add(new Object[] {1000015, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING()), - DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), - rows); - getTableEnv().createTemporaryView("input", input); - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, op_time TIMESTAMP, PRIMARY KEY (id) NOT ENFORCED " - + ") PARTITIONED BY(op_time) WITH %s", - toWithClause(tableProperties)); - - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'mixed-format.emit.mode'='file, log'" - + ", 'log.version'='v1'" - + ") */" - + "select * from input"); - - Assert.assertEquals( - DataUtil.toRowSet(data), - new HashSet<>( - sql( - "select * from mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'streaming'='false'" - + ") */"))); - TableResult result = - exec( - "select * from mixed_catalog." - + db - + "." 
- + TABLE - + " /*+ OPTIONS('mixed-format.read.mode'='log', 'scan.startup.mode'='earliest') */"); - - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (Object[] datum : data) { - Row row = iterator.next(); - actual.add(row); - } - } - Assert.assertEquals(DataUtil.toRowSet(data), actual); - - result.getJobClient().ifPresent(TestUtil::cancelJob); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestLookupSecondary.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestLookupSecondary.java deleted file mode 100644 index 659f4e955f..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestLookupSecondary.java +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.table; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.flink.util.DataUtil; -import org.apache.amoro.flink.write.FlinkTaskWriterBaseTest; -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.TableIdentifier; -import org.apache.commons.lang3.ArrayUtils; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.Row; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.io.TaskWriter; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Comparator; -import java.util.LinkedList; -import java.util.List; -import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; - -public class TestLookupSecondary extends CatalogITCaseBase implements FlinkTaskWriterBaseTest { - private String db; - - public TestLookupSecondary() { - super( - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, false)); - } - - @Before - public void setup() throws IOException { - List dbs = getMixedFormatCatalog().listDatabases(); - if (dbs.isEmpty()) { - db = "test_db"; - getMixedFormatCatalog().createDatabase(db); - } else { - db = dbs.get(0); - } - exec( - "create catalog mixed_catalog with ('type'='mixed_iceberg', 'ams.uri'='%s')", - getCatalogUri()); - exec( - "create table mixed_catalog.%s.L (id int) " - + "with ('scan.startup.mode'='earliest', 'monitor-interval'='1 s')", - db); - exec( - "create table mixed_catalog.%s.DIM_2 (id int, name string, 
cls bigint, primary key(id, name) not enforced) " - + "with ('write.upsert.enabled'='true', 'lookup.reloading.interval'='1 s')", - db); - exec("create view vi as select *, PROCTIME() as proc from mixed_catalog.%s.L", db); - - writeAndCommit( - TableIdentifier.of(getCatalogName(), db, "L"), - Lists.newArrayList( - DataUtil.toRowData(1), - DataUtil.toRowData(2), - DataUtil.toRowData(3), - DataUtil.toRowData(4))); - writeToChangeAndCommit( - TableIdentifier.of(getCatalogName(), db, "DIM_2"), - Lists.newArrayList( - DataUtil.toRowData(1, "a", 1L), - DataUtil.toRowData(1, "b", 1L), - DataUtil.toRowData(2, "c", 2L), - DataUtil.toRowData(3, "d", 3L)), - true); - } - - @After - public void drop() { - exec("drop table mixed_catalog.%s.L", db); - exec("drop table mixed_catalog.%s.DIM_2", db); - } - - @Test() - public void testLookup() throws Exception { - TableResult tableResult = - exec( - "select L.id, D.cls from vi L LEFT JOIN mixed_catalog.%s.DIM_2 " - + "for system_time as of L.proc AS D ON L.id = D.id", - db); - - tableResult.await(1, TimeUnit.MINUTES); // wait for the first row. - - List expects = new LinkedList<>(); - expects.add(new Object[] {1, 1L}); - expects.add(new Object[] {1, 1L}); - expects.add(new Object[] {2, 2L}); - expects.add(new Object[] {3, 3L}); - expects.add(new Object[] {4, null}); - int expected = expects.size(), count = 0; - List actual = new ArrayList<>(); - try (CloseableIterator rows = tableResult.collect()) { - while (count < expected && rows.hasNext()) { - Row row = rows.next(); - actual.add(row); - count++; - } - } - - Assert.assertEquals(expected, actual.size()); - List rows = - expects.stream() - .map( - r -> - r[0] instanceof RowKind - ? 
Row.ofKind((RowKind) r[0], ArrayUtils.subarray(r, 1, r.length)) - : Row.of(r)) - .collect(Collectors.toList()); - Assert.assertEquals( - rows.stream().sorted(Comparator.comparing(Row::toString)).collect(Collectors.toList()), - actual.stream().sorted(Comparator.comparing(Row::toString)).collect(Collectors.toList())); - } - - @Override - public String getMetastoreUri() { - return getCatalogUri(); - } - - @Override - public String getCatalogName() { - return getMixedFormatCatalog().name(); - } - - @Override - public boolean upsertEnabled() { - return true; - } - - private void writeAndCommit(TableIdentifier table, List expected) throws IOException { - writeAndCommit(table, expected, true, false); - } - - private void writeToChangeAndCommit( - TableIdentifier table, List expected, boolean upsertEnabled) throws IOException { - writeAndCommit(table, expected, false, upsertEnabled); - } - - private void writeAndCommit( - TableIdentifier table, - List expected, - boolean writeToBaseStore, - boolean upsertEnabled) - throws IOException { - MixedTable mixedTable = getMixedFormatCatalog().loadTable(table); - Assert.assertNotNull(mixedTable); - RowType rowType = FlinkSchemaUtil.convert(mixedTable.schema()); - for (RowData rowData : expected) { - try (TaskWriter taskWriter = - writeToBaseStore - ? 
createBaseTaskWriter(mixedTable, rowType) - : createTaskWriter(mixedTable, rowType)) { - if (writeToBaseStore) { - writeAndCommit(rowData, taskWriter, mixedTable); - } else { - writeAndCommit(rowData, taskWriter, mixedTable, upsertEnabled); - } - } - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestTableRefresh.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestTableRefresh.java deleted file mode 100644 index 5bbc2412e2..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestTableRefresh.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.table; - -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.LOG_STORE_CATCH_UP; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.LOG_STORE_CATCH_UP_TIMESTAMP; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.catalog.CatalogTestHelper; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.hive.TestHMS; -import org.apache.amoro.hive.catalog.HiveCatalogTestHelper; -import org.apache.amoro.hive.catalog.HiveTableTestHelper; -import org.apache.amoro.table.MixedTable; -import org.apache.iceberg.UpdateProperties; -import org.junit.Assert; -import org.junit.ClassRule; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -import java.util.Arrays; -import java.util.Collection; -import java.util.Map; - -@RunWith(Parameterized.class) -public class TestTableRefresh extends FlinkTestBase { - @ClassRule public static TestHMS TEST_HMS = new TestHMS(); - - public TestTableRefresh(CatalogTestHelper catalogTestHelper, TableTestHelper tableTestHelper) { - super(catalogTestHelper, tableTestHelper); - } - - @Parameterized.Parameters(name = "{0}, {1}") - public static Collection parameters() { - return Arrays.asList( - new Object[][] { - { - new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), - new HiveTableTestHelper(true, true) - }, - { - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, true) - } - }); - } - - @Test - public void testRefresh() { - MixedFormatTableLoader tableLoader = - MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); - - tableLoader.open(); - MixedTable mixedTable = tableLoader.loadMixedFormatTable(); - boolean catchUp = true; - String catchUpTs = "1"; - - 
UpdateProperties updateProperties = mixedTable.updateProperties(); - updateProperties.set(LOG_STORE_CATCH_UP.key(), String.valueOf(catchUp)); - updateProperties.set(LOG_STORE_CATCH_UP_TIMESTAMP.key(), catchUpTs); - updateProperties.commit(); - - mixedTable.refresh(); - Map properties = mixedTable.properties(); - Assert.assertEquals(String.valueOf(catchUp), properties.get(LOG_STORE_CATCH_UP.key())); - Assert.assertEquals(catchUpTs, properties.get(LOG_STORE_CATCH_UP_TIMESTAMP.key())); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestUnkeyed.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestUnkeyed.java deleted file mode 100644 index ccab05a761..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestUnkeyed.java +++ /dev/null @@ -1,1052 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.table; - -import static org.apache.amoro.MockAmoroManagementServer.TEST_CATALOG_NAME; -import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.KAFKA_CONTAINER; -import static org.apache.amoro.table.TableProperties.ENABLE_LOG_STORE; -import static org.apache.amoro.table.TableProperties.LOG_STORE_ADDRESS; -import static org.apache.amoro.table.TableProperties.LOG_STORE_MESSAGE_TOPIC; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.catalog.CatalogTestHelper; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.flink.kafka.testutils.KafkaContainerTest; -import org.apache.amoro.flink.util.DataUtil; -import org.apache.amoro.flink.util.TestUtil; -import org.apache.amoro.hive.TestHMS; -import org.apache.amoro.hive.catalog.HiveCatalogTestHelper; -import org.apache.amoro.hive.catalog.HiveTableTestHelper; -import org.apache.amoro.mixed.MixedFormatCatalog; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.TableIdentifier; -import org.apache.flink.table.api.ApiExpression; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.Table; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.types.Row; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.types.Types; -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.ClassRule; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -import java.io.IOException; -import 
java.time.LocalDateTime; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; - -@RunWith(Parameterized.class) -public class TestUnkeyed extends FlinkTestBase { - - @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); - - private static final String TABLE = "test_unkeyed"; - private static final String DB = TableTestHelper.TEST_TABLE_ID.getDatabase(); - - private String catalog; - private MixedFormatCatalog mixedFormatCatalog; - private String db; - private String topic; - - @ClassRule public static TestHMS TEST_HMS = new TestHMS(); - public boolean isHive; - - public TestUnkeyed( - CatalogTestHelper catalogTestHelper, TableTestHelper tableTestHelper, boolean isHive) { - super(catalogTestHelper, tableTestHelper); - this.isHive = isHive; - } - - @Parameterized.Parameters(name = "{0}, {1}, {2}") - public static Collection parameters() { - return Arrays.asList( - new Object[][] { - { - new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), - new HiveTableTestHelper(true, true), - true - }, - { - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, true), - false - }, - { - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, true), - false - } - }); - } - - @BeforeClass - public static void beforeClass() throws Exception { - KAFKA_CONTAINER.start(); - } - - @AfterClass - public static void afterClass() throws Exception { - KAFKA_CONTAINER.close(); - } - - @Before - public void before() throws Exception { - if (isHive) { - catalog = HiveTableTestHelper.TEST_CATALOG_NAME; - db = HiveTableTestHelper.TEST_DB_NAME; - } else { - catalog = TEST_CATALOG_NAME; - db = DB; - } - super.before(); - mixedFormatCatalog = getMixedFormatCatalog(); - topic = String.join(".", catalog, db, TABLE); - 
super.config(); - } - - @After - public void after() { - sql("DROP TABLE IF EXISTS mixed_catalog." + db + "." + TABLE); - } - - @Test - public void testUnPartitionDDL() throws IOException { - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, age SMALLINT, sex TINYINT, score BIGINT, height FLOAT, speed DOUBLE, ts TIMESTAMP)"); - - MixedTable table = - mixedFormatCatalog.loadTable(TableIdentifier.of(catalog, db, TestUnkeyed.TABLE)); - - Schema required = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "name", Types.StringType.get()), - Types.NestedField.optional(3, "age", Types.IntegerType.get()), - Types.NestedField.optional(4, "sex", Types.IntegerType.get()), - Types.NestedField.optional(5, "score", Types.LongType.get()), - Types.NestedField.optional(6, "height", Types.FloatType.get()), - Types.NestedField.optional(7, "speed", Types.DoubleType.get()), - Types.NestedField.optional(8, "ts", Types.TimestampType.withoutZone())); - Assert.assertEquals(required.asStruct(), table.schema().asStruct()); - } - - @Test - public void testPartitionDDL() throws IOException { - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." 
- + TABLE - + "(" - + " id INT, name STRING, age SMALLINT, sex TINYINT, score BIGINT, height FLOAT, speed DOUBLE, ts TIMESTAMP)" - + " PARTITIONED BY (ts)"); - - Schema required = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "name", Types.StringType.get()), - Types.NestedField.optional(3, "age", Types.IntegerType.get()), - Types.NestedField.optional(4, "sex", Types.IntegerType.get()), - Types.NestedField.optional(5, "score", Types.LongType.get()), - Types.NestedField.optional(6, "height", Types.FloatType.get()), - Types.NestedField.optional(7, "speed", Types.DoubleType.get()), - Types.NestedField.optional(8, "ts", Types.TimestampType.withoutZone())); - MixedTable table = mixedFormatCatalog.loadTable(TableIdentifier.of(catalog, db, TABLE)); - Assert.assertEquals(required.asStruct(), table.schema().asStruct()); - - PartitionSpec requiredSpec = PartitionSpec.builderFor(required).identity("ts").build(); - Assert.assertEquals(requiredSpec, table.spec()); - } - - @Test - public void testUnkeyedWatermarkSet() throws Exception { - List data = new LinkedList<>(); - - data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:08:11.0")}); - data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-17T10:11:11.0")}); - data.add(new Object[] {1000021, "d", LocalDateTime.parse("2022-06-17T16:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING()), - DataTypes.FIELD("ts", DataTypes.TIMESTAMP())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG 
mixed_catalog WITH %s", toWithClause(props)); - - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, ts TIMESTAMP)"); - - sql( - "create table user_tb (" - + " rtime as cast(ts as timestamp(3))," - + " WATERMARK FOR rtime as rtime" - + " ) LIKE mixed_catalog." - + db - + "." - + TABLE); - - sql("insert into mixed_catalog." + db + "." + TABLE + " select * from input"); - - TableResult result = - exec( - "select id, name, ts from user_tb" - + "/*+ OPTIONS(" - + "'mixed-format.read.mode'='file'" - + ", 'scan.startup.mode'='earliest'" - + ")*/"); - - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (Object[] datum : data) { - actual.add(iterator.next()); - } - } - Assert.assertEquals(DataUtil.toRowSet(data), actual); - - result.getJobClient().ifPresent(TestUtil::cancelJob); - } - - @Test - public void testSinkBatchRead() throws IOException { - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:08:11.0")}); - data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-17T10:11:11.0")}); - data.add(new Object[] {1000021, "d", LocalDateTime.parse("2022-06-17T16:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING()), - DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." 
- + TABLE - + "(" - + " id INT, name STRING, op_time TIMESTAMP)"); - - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + "/*+ OPTIONS('mixed-format.emit.mode'='file')*/ select * from input"); - - MixedTable table = mixedFormatCatalog.loadTable(TableIdentifier.of(catalog, db, TABLE)); - Iterable snapshots = table.asUnkeyedTable().snapshots(); - Snapshot s = snapshots.iterator().next(); - - Assert.assertEquals( - DataUtil.toRowSet(data), - new HashSet<>( - sql( - "select * from mixed_catalog." - + db - + "." - + TABLE - + "/*+ OPTIONS(" - + "'mixed-format.read.mode'='file'" - + ", 'streaming'='false'" - + ", 'snapshot-id'='" - + s.snapshotId() - + "'" - + ")*/"))); - } - - @Test - public void testSinkStreamRead() throws Exception { - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a"}); - data.add(new Object[] {1000015, "b"}); - data.add(new Object[] {1000011, "c"}); - data.add(new Object[] {1000014, "d"}); - data.add(new Object[] {1000021, "d"}); - data.add(new Object[] {1000007, "e"}); - - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - sql("CREATE TABLE IF NOT EXISTS mixed_catalog." + db + "." + TABLE + "(id INT, name STRING)"); - - sql("insert into mixed_catalog." + db + "." + TABLE + " select * from input"); - - // verify in earliest scan-startup-mode file read - TableResult resultWithEarliestPosition = - exec( - "select * from mixed_catalog." - + db - + "." 
- + TABLE - + "/*+ OPTIONS(" - + "'streaming'='true'" - + ", 'mixed-format.read.mode'='file'" - + ", 'scan.startup.mode'='earliest'" - + ", 'monitor-interval'='1s'" - + ")*/"); - - Set actual = new HashSet<>(); - try (CloseableIterator iterator = resultWithEarliestPosition.collect()) { - for (int i = 0; i < data.size(); i++) { - actual.add(iterator.next()); - } - } - resultWithEarliestPosition.getJobClient().ifPresent(TestUtil::cancelJob); - Assert.assertEquals(DataUtil.toRowSet(data), actual); - - // verify in latest scan-startup-mode file read - TableResult resultWithLatestPosition = - exec( - "select * from mixed_catalog." - + db - + "." - + TABLE - + "/*+ OPTIONS(" - + "'streaming'='true'" - + ", 'mixed-format.read.mode'='file'" - + ", 'scan.startup.mode'='latest'" - + ", 'monitor-interval'='1s'" - + ")*/"); - - List appendData = new LinkedList<>(); - appendData.add(new Object[] {2000004, "a"}); - appendData.add(new Object[] {2000015, "b"}); - appendData.add(new Object[] {2000011, "c"}); - appendData.add(new Object[] {2000014, "d"}); - appendData.add(new Object[] {2000021, "d"}); - appendData.add(new Object[] {2000007, "e"}); - - List appendRows = DataUtil.toRows(appendData); - - Table appendInput = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING())), - appendRows); - getTableEnv().createTemporaryView("appendInput", appendInput); - - actual.clear(); - try (CloseableIterator iterator = resultWithLatestPosition.collect()) { - sql("insert into mixed_catalog." + db + "." 
+ TABLE + " select * from appendInput"); - for (int i = 0; i < appendData.size(); i++) { - Assert.assertTrue("Should have more records", iterator.hasNext()); - actual.add(iterator.next()); - } - } - resultWithLatestPosition.getJobClient().ifPresent(TestUtil::cancelJob); - Assert.assertEquals(DataUtil.toRowSet(appendData), actual); - } - - @Test - public void testLogSinkSource() throws Exception { - String topic = this.topic + "testLogSinkSource"; - KafkaContainerTest.createTopics(KAFKA_PARTITION_NUMS, 1, topic); - - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a"}); - data.add(new Object[] {1000015, "b"}); - data.add(new Object[] {1000011, "c"}); - data.add(new Object[] {1000014, "d"}); - data.add(new Object[] {1000021, "d"}); - data.add(new Object[] {1000007, "e"}); - - List rows = DataUtil.toRows(data); - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - Map tableProperties = new HashMap<>(); - tableProperties.put(ENABLE_LOG_STORE, "true"); - tableProperties.put( - LOG_STORE_ADDRESS, KafkaContainerTest.KAFKA_CONTAINER.getBootstrapServers()); - tableProperties.put(LOG_STORE_MESSAGE_TOPIC, topic); - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING) WITH %s", - toWithClause(tableProperties)); - - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'mixed-format.emit.mode'='log'" - + ", 'log.version'='v1'" - + ") */" - + " select * from input"); - - TableResult result = - exec( - "select * from mixed_catalog." - + db - + "." 
- + TABLE - + "/*+ OPTIONS(" - + "'mixed-format.read.mode'='log'" - + ", 'scan.startup.mode'='earliest'" - + ")*/"); - - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (Object[] datum : data) { - actual.add(iterator.next()); - } - } - Assert.assertEquals(DataUtil.toRowSet(data), actual); - - result.getJobClient().ifPresent(TestUtil::cancelJob); - KafkaContainerTest.deleteTopics(topic); - } - - @Test - public void testUnpartitionLogSinkSourceWithSelectedFields() throws Exception { - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000015, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - - List rows = DataUtil.toRows(data); - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING()), - DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - Map tableProperties = new HashMap<>(); - tableProperties.put(ENABLE_LOG_STORE, "true"); - tableProperties.put( - LOG_STORE_ADDRESS, KafkaContainerTest.KAFKA_CONTAINER.getBootstrapServers()); - tableProperties.put(LOG_STORE_MESSAGE_TOPIC, topic); - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." 
- + TABLE - + "(" - + " id INT, name STRING, op_time TIMESTAMP) WITH %s", - toWithClause(tableProperties)); - - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'mixed-format.emit.mode'='log'" - + ", 'log.version'='v1'" - + ") */" - + " select * from input"); - - TableResult result = - exec( - "select id, op_time from mixed_catalog." - + db - + "." - + TABLE - + "/*+ OPTIONS(" - + "'mixed-format.read.mode'='log'" - + ", 'scan.startup.mode'='earliest'" - + ")*/"); - - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (Object[] datum : data) { - actual.add(iterator.next()); - } - } - - List expected = new LinkedList<>(); - expected.add(new Object[] {1000004, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add(new Object[] {1000015, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add(new Object[] {1000011, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add(new Object[] {1000014, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - expected.add(new Object[] {1000015, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - expected.add(new Object[] {1000007, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - expected.add(new Object[] {1000007, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - - Assert.assertEquals(DataUtil.toRowSet(expected), actual); - - result.getJobClient().ifPresent(TestUtil::cancelJob); - } - - @Test - public void testUnPartitionDoubleSink() throws Exception { - String topic = this.topic + "testUnPartitionDoubleSink"; - KafkaContainerTest.createTopics(KAFKA_PARTITION_NUMS, 1, topic); - - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a"}); - data.add(new Object[] {1000015, "b"}); - data.add(new Object[] {1000011, "c"}); - data.add(new Object[] {1000014, "d"}); - data.add(new Object[] {1000021, "d"}); - data.add(new Object[] {1000007, "e"}); - - List rows = DataUtil.toRows(data); - Table input = - getTableEnv() - .fromValues( - 
DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING())), - rows); - getTableEnv().createTemporaryView("input", input); - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - Map tableProperties = new HashMap<>(); - tableProperties.put(ENABLE_LOG_STORE, "true"); - tableProperties.put( - LOG_STORE_ADDRESS, KafkaContainerTest.KAFKA_CONTAINER.getBootstrapServers()); - tableProperties.put(LOG_STORE_MESSAGE_TOPIC, topic); - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING) WITH %s", - toWithClause(tableProperties)); - - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'mixed-format.emit.mode'='file, log'" - + ", 'log.version'='v1'" - + ") */" - + "select id, name from input"); - - Assert.assertEquals( - DataUtil.toRowSet(data), - sqlSet( - "select * from mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS('mixed-format.read.mode'='file', 'streaming'='false') */")); - - TableResult result = - exec( - "select * from mixed_catalog." - + db - + "." 
- + TABLE - + " /*+ OPTIONS('mixed-format.read.mode'='log', 'scan.startup.mode'='earliest') */"); - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (Object[] datum : data) { - actual.add(iterator.next()); - } - } - Assert.assertEquals(DataUtil.toRowSet(data), actual); - result.getJobClient().ifPresent(TestUtil::cancelJob); - KafkaContainerTest.deleteTopics(topic); - } - - @Test - public void testPartitionSinkBatchRead() throws IOException { - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a", "2022-05-17"}); - data.add(new Object[] {1000015, "b", "2022-05-17"}); - data.add(new Object[] {1000011, "c", "2022-05-17"}); - data.add(new Object[] {1000014, "d", "2022-05-18"}); - data.add(new Object[] {1000021, "d", "2022-05-18"}); - data.add(new Object[] {1000007, "e", "2022-05-18"}); - - List expected = new LinkedList<>(); - expected.add(new Object[] {1000014, "d", "2022-05-18"}); - expected.add(new Object[] {1000021, "d", "2022-05-18"}); - expected.add(new Object[] {1000007, "e", "2022-05-18"}); - - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING()), - DataTypes.FIELD("dt", DataTypes.STRING())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, dt STRING)" - + " PARTITIONED BY (dt)"); - - sql( - "insert into mixed_catalog." - + db - + "." 
- + TABLE - + " PARTITION (dt='2022-05-18') select id, name from input" - + " where dt='2022-05-18' "); - - TableIdentifier identifier = TableIdentifier.of(catalog, db, TABLE); - MixedTable table = mixedFormatCatalog.loadTable(identifier); - Iterable snapshots = table.asUnkeyedTable().snapshots(); - Snapshot s = snapshots.iterator().next(); - - Assert.assertEquals( - DataUtil.toRowSet(expected), - sqlSet( - "select * from mixed_catalog." - + db - + "." - + TestUnkeyed.TABLE - + "/*+ OPTIONS(" - + "'mixed-format.read.mode'='file'" - + ", 'snapshot-id'='" - + s.snapshotId() - + "'" - + ", 'streaming'='false'" - + ")*/")); - Assert.assertEquals( - DataUtil.toRowSet(expected), - sqlSet( - "select * from mixed_catalog." - + db - + "." - + TestUnkeyed.TABLE - + "/*+ OPTIONS(" - + "'mixed-format.read.mode'='file'" - + ", 'as-of-timestamp'='" - + s.timestampMillis() - + "'" - + ", 'streaming'='false'" - + ")*/")); - } - - @Test - public void testPartitionSinkStreamRead() throws Exception { - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a", "2022-05-17"}); - data.add(new Object[] {1000015, "b", "2022-05-17"}); - data.add(new Object[] {1000011, "c", "2022-05-17"}); - data.add(new Object[] {1000014, "d", "2022-05-18"}); - data.add(new Object[] {1000021, "d", "2022-05-18"}); - data.add(new Object[] {1000007, "e", "2022-05-18"}); - - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING()), - DataTypes.FIELD("dt", DataTypes.STRING())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, dt STRING)" - + " PARTITIONED BY (dt)"); - - sql( - "insert into mixed_catalog." - + db - + "." 
- + TABLE - + " PARTITION (dt='2022-05-18') select id, name from input" - + " where dt='2022-05-18' "); - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + " PARTITION (dt='2022-05-18') select id, name from input" - + " where dt='2022-05-18' "); - - TableIdentifier identifier = TableIdentifier.of(catalog, db, TABLE); - MixedTable table = mixedFormatCatalog.loadTable(identifier); - Iterable snapshots = table.asUnkeyedTable().snapshots(); - Snapshot s = snapshots.iterator().next(); - - TableResult result = - exec( - "select * from mixed_catalog." - + db - + "." - + TestUnkeyed.TABLE - + "/*+ OPTIONS(" - + "'mixed-format.read.mode'='file'" - + ", 'start-snapshot-id'='" - + s.snapshotId() - + "'" - + ")*/"); - - List expected = - new ArrayList() { - { - add(Row.of(1000014, "d", "2022-05-18")); - add(Row.of(1000021, "d", "2022-05-18")); - add(Row.of(1000007, "e", "2022-05-18")); - } - }; - - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (int i = 0; i < expected.size(); i++) { - actual.add(iterator.next()); - } - } - result.getJobClient().ifPresent(TestUtil::cancelJob); - Assert.assertEquals(new HashSet<>(expected), actual); - } - - @Test - public void testPartitionLogSinkSource() throws Exception { - String topic = this.topic + "testUnKeyedPartitionLogSinkSource"; - KafkaContainerTest.createTopics(KAFKA_PARTITION_NUMS, 1, topic); - - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a", "2022-05-17"}); - data.add(new Object[] {1000015, "b", "2022-05-17"}); - data.add(new Object[] {1000011, "c", "2022-05-17"}); - data.add(new Object[] {1000014, "d", "2022-05-18"}); - data.add(new Object[] {1000021, "d", "2022-05-18"}); - data.add(new Object[] {1000007, "e", "2022-05-18"}); - - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING()), - DataTypes.FIELD("dt", 
DataTypes.STRING())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - Map tableProperties = new HashMap<>(); - tableProperties.put(ENABLE_LOG_STORE, "true"); - tableProperties.put(LOG_STORE_ADDRESS, KAFKA_CONTAINER.getBootstrapServers()); - tableProperties.put(LOG_STORE_MESSAGE_TOPIC, topic); - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, dt STRING) PARTITIONED BY (dt) WITH %s", - toWithClause(tableProperties)); - - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'mixed-format.emit.mode'='log'" - + ", 'log.version'='v1'" - + ") */" - + " select * from input"); - - TableResult result = - exec( - "select * from mixed_catalog." - + db - + "." - + TABLE - + "/*+ OPTIONS(" - + "'mixed-format.read.mode'='log'" - + ", 'scan.startup.mode'='earliest'" - + ")*/"); - - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (Object[] datum : data) { - actual.add(iterator.next()); - } - } - Assert.assertEquals(DataUtil.toRowSet(data), actual); - - result.getJobClient().ifPresent(TestUtil::cancelJob); - KafkaContainerTest.deleteTopics(topic); - } - - @Test - public void testPartitionLogSinkSourceWithSelectedFields() throws Exception { - String topic = this.topic + "testPartitionLogSinkSourceWithSelectedFields"; - KafkaContainerTest.createTopics(KAFKA_PARTITION_NUMS, 1, topic); - - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000015, "b", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000011, "c", LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {1000014, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000015, "d", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new 
Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {1000007, "e", LocalDateTime.parse("2022-06-18T10:10:11.0")}); - - List rows = DataUtil.toRows(data); - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING()), - DataTypes.FIELD("op_time", DataTypes.TIMESTAMP())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - Map tableProperties = new HashMap<>(); - tableProperties.put(ENABLE_LOG_STORE, "true"); - tableProperties.put(LOG_STORE_ADDRESS, KAFKA_CONTAINER.getBootstrapServers()); - tableProperties.put(LOG_STORE_MESSAGE_TOPIC, topic); - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, op_time TIMESTAMP) PARTITIONED BY (op_time) WITH %s", - toWithClause(tableProperties)); - - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'mixed-format.emit.mode'='log'" - + ", 'log.version'='v1'" - + ") */" - + " select * from input"); - - TableResult result = - exec( - "select id, op_time from mixed_catalog." - + db - + "." 
- + TABLE - + "/*+ OPTIONS(" - + "'mixed-format.read.mode'='log'" - + ", 'scan.startup.mode'='earliest'" - + ")*/"); - - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (Object[] datum : data) { - actual.add(iterator.next()); - } - } - - List expected = new LinkedList<>(); - expected.add(new Object[] {1000004, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add(new Object[] {1000015, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add(new Object[] {1000011, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - expected.add(new Object[] {1000014, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - expected.add(new Object[] {1000015, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - expected.add(new Object[] {1000007, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - expected.add(new Object[] {1000007, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - - Assert.assertEquals(DataUtil.toRowSet(expected), actual); - - result.getJobClient().ifPresent(TestUtil::cancelJob); - KafkaContainerTest.deleteTopics(topic); - } - - @Test - public void testPartitionDoubleSink() throws Exception { - String topic = this.topic + "testUnkeyedPartitionDoubleSink"; - KafkaContainerTest.createTopics(KAFKA_PARTITION_NUMS, 1, topic); - - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a", "2022-05-17"}); - data.add(new Object[] {1000015, "b", "2022-05-17"}); - data.add(new Object[] {1000011, "c", "2022-05-17"}); - data.add(new Object[] {1000014, "d", "2022-05-18"}); - data.add(new Object[] {1000021, "d", "2022-05-18"}); - data.add(new Object[] {1000007, "e", "2022-05-18"}); - - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING()), - DataTypes.FIELD("dt", DataTypes.STRING())), - rows); - getTableEnv().createTemporaryView("input", input); - sql("CREATE CATALOG mixed_catalog WITH %s", 
toWithClause(props)); - - Map tableProperties = new HashMap<>(); - tableProperties.put(ENABLE_LOG_STORE, "true"); - tableProperties.put(LOG_STORE_ADDRESS, KAFKA_CONTAINER.getBootstrapServers()); - tableProperties.put(LOG_STORE_MESSAGE_TOPIC, topic); - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, dt STRING) PARTITIONED BY (dt) WITH %s", - toWithClause(tableProperties)); - sql( - "insert into mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'mixed-format.emit.mode'='file, log'" - + ", 'log.version'='v1'" - + ") */" - + "select * from input"); - - Assert.assertEquals( - DataUtil.toRowSet(data), - sqlSet( - "select * from mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS('mixed-format.read.mode'='file', 'streaming'='false') */")); - TableResult result = - exec( - "select * from mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS('mixed-format.read.mode'='log', 'scan.startup.mode'='earliest') */"); - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - for (Object[] datum : data) { - actual.add(iterator.next()); - } - } - Assert.assertEquals(DataUtil.toRowSet(data), actual); - result.getJobClient().ifPresent(TestUtil::cancelJob); - KafkaContainerTest.deleteTopics(topic); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestUnkeyedOverwrite.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestUnkeyedOverwrite.java deleted file mode 100644 index fcb092e3d6..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestUnkeyedOverwrite.java +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.table; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.catalog.CatalogTestHelper; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.flink.util.DataUtil; -import org.apache.amoro.hive.TestHMS; -import org.apache.amoro.hive.catalog.HiveCatalogTestHelper; -import org.apache.amoro.hive.catalog.HiveTableTestHelper; -import org.apache.flink.table.api.ApiExpression; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.Table; -import org.apache.flink.test.util.MiniClusterWithClientResource; -import org.apache.iceberg.flink.MiniClusterResource; -import org.junit.After; -import org.junit.Assert; -import org.junit.ClassRule; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.LinkedList; -import java.util.List; - -@RunWith(Parameterized.class) -public class TestUnkeyedOverwrite extends FlinkTestBase { - - private static final 
Logger LOGGER = LoggerFactory.getLogger(TestUnkeyedOverwrite.class); - - @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); - - @ClassRule - public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = - MiniClusterResource.createWithClassloaderCheckDisabled(); - - private static final String TABLE = "test_unkeyed"; - private static final String DB = TableTestHelper.TEST_TABLE_ID.getDatabase(); - - private String db; - public boolean isHive; - @ClassRule public static TestHMS TEST_HMS = new TestHMS(); - - public TestUnkeyedOverwrite( - CatalogTestHelper catalogTestHelper, TableTestHelper tableTestHelper, boolean isHive) { - super(catalogTestHelper, tableTestHelper); - this.isHive = isHive; - } - - @Parameterized.Parameters(name = "{0}, {1}, {2}") - public static Object[] parameters() { - return new Object[][] { - { - new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), - new HiveTableTestHelper(true, true), - true - }, - { - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, true), - false - } - }; - } - - public void before() throws Exception { - if (isHive) { - db = HiveTableTestHelper.TEST_DB_NAME; - } else { - db = DB; - } - super.before(); - super.config(); - } - - @After - public void after() { - sql("DROP TABLE IF EXISTS mixed_catalog." + db + "." 
+ TABLE); - } - - @Test - public void testInsertOverwrite() throws IOException { - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a"}); - data.add(new Object[] {1000015, "b"}); - data.add(new Object[] {1000011, "c"}); - data.add(new Object[] {1000014, "d"}); - data.add(new Object[] {1000021, "d"}); - data.add(new Object[] {1000007, "e"}); - - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING())), - rows); - getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING)"); - - sql("insert overwrite mixed_catalog." + db + "." + TABLE + " select * from input"); - - Assert.assertEquals( - DataUtil.toRowSet(data), - sqlSet( - "select * from mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'streaming'='false'" - + ") */")); - } - - @Test - public void testPartitionInsertOverwrite() throws IOException { - List data = new LinkedList<>(); - data.add(new Object[] {1000004, "a", "2022-05-17"}); - data.add(new Object[] {1000015, "b", "2022-05-17"}); - data.add(new Object[] {1000011, "c", "2022-05-17"}); - data.add(new Object[] {1000014, "d", "2022-05-18"}); - data.add(new Object[] {1000021, "d", "2022-05-18"}); - data.add(new Object[] {1000007, "e", "2022-05-18"}); - - List expected = new LinkedList<>(); - expected.add(new Object[] {11, "d", "2022-05-19"}); - expected.add(new Object[] {21, "d", "2022-05-19"}); - expected.add(new Object[] {35, "e", "2022-05-19"}); - - data.addAll(expected); - List rows = DataUtil.toRows(data); - - Table input = - getTableEnv() - .fromValues( - DataTypes.ROW( - DataTypes.FIELD("id", DataTypes.INT()), - DataTypes.FIELD("name", DataTypes.STRING()), - DataTypes.FIELD("dt", DataTypes.STRING())), - rows); - 
getTableEnv().createTemporaryView("input", input); - - sql("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props)); - - sql( - "CREATE TABLE IF NOT EXISTS mixed_catalog." - + db - + "." - + TABLE - + "(" - + " id INT, name STRING, dt STRING) PARTITIONED BY (dt)"); - - sql("insert into mixed_catalog." + db + "." + TABLE + " select * from input"); - sql( - "insert overwrite mixed_catalog." - + db - + "." - + TABLE - + " PARTITION (dt='2022-05-18') select id, name from input where dt = '2022-05-19'"); - - Assert.assertEquals( - DataUtil.toRowSet(expected), - sqlSet( - "select id, name, '2022-05-19' from mixed_catalog." - + db - + "." - + TABLE - + " /*+ OPTIONS(" - + "'streaming'='false'" - + ") */" - + " where dt='2022-05-18'")); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestWatermark.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestWatermark.java deleted file mode 100644 index ae0dbe8c77..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/table/TestWatermark.java +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.table; - -import static org.apache.amoro.MockAmoroManagementServer.TEST_CATALOG_NAME; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.flink.util.DataUtil; -import org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.flink.util.TestUtil; -import org.apache.amoro.table.KeyedTable; -import org.apache.amoro.table.TableIdentifier; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.java.tuple.Tuple2; -import org.apache.flink.runtime.testutils.CommonTestUtils; -import org.apache.flink.streaming.api.operators.AbstractStreamOperator; -import org.apache.flink.streaming.api.operators.ChainingStrategy; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.table.api.DataTypes; -import org.apache.flink.table.api.Table; -import org.apache.flink.table.api.TableResult; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.Row; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.CloseableIterator; -import org.apache.iceberg.io.TaskWriter; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import 
org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.time.LocalDateTime; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutionException; - -public class TestWatermark extends FlinkTestBase { - public static final Logger LOG = LoggerFactory.getLogger(TestWatermark.class); - - @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); - - private static final String DB = TableTestHelper.TEST_TABLE_ID.getDatabase(); - private static final String TABLE = "test_keyed"; - - public TestWatermark() { - super( - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, true)); - } - - @Before - public void before() throws Exception { - super.before(); - super.config(); - } - - @After - public void after() { - sql("DROP TABLE IF EXISTS mixed_catalog." + DB + "." 
+ TABLE); - } - - @Test - public void testWatermark() throws Exception { - sql(String.format("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props))); - Map tableProperties = new HashMap<>(); - String table = String.format("mixed_catalog.%s.%s", DB, TABLE); - - sql( - "CREATE TABLE IF NOT EXISTS %s (" - + " id bigint, user_id int, name STRING, category string, op_time timestamp, is_true boolean" - + ", PRIMARY KEY (id, user_id) NOT ENFORCED) PARTITIONED BY(category, name) WITH %s", - table, toWithClause(tableProperties)); - - TableSchema flinkSchema = - TableSchema.builder() - .field("id", DataTypes.BIGINT()) - .field("user_id", DataTypes.INT()) - .field("name", DataTypes.STRING()) - .field("category", DataTypes.STRING()) - .field("op_time", DataTypes.TIMESTAMP(3)) - .field("is_true", DataTypes.BOOLEAN()) - .build(); - RowType rowType = (RowType) flinkSchema.toRowDataType().getLogicalType(); - KeyedTable keyedTable = - (KeyedTable) - MixedFormatUtils.loadMixedTable( - MixedFormatTableLoader.of( - TableIdentifier.of(TEST_CATALOG_NAME, DB, TABLE), catalogBuilder)); - TaskWriter taskWriter = createKeyedTaskWriter(keyedTable, rowType, true); - List baseData = - new ArrayList() { - { - add( - GenericRowData.ofKind( - RowKind.INSERT, - 2L, - 123, - StringData.fromString("a"), - StringData.fromString("a"), - TimestampData.fromLocalDateTime(LocalDateTime.now().minusMinutes(1)), - true)); - } - }; - for (RowData record : baseData) { - taskWriter.write(record); - } - commit(keyedTable, taskWriter.complete(), true); - - sql( - "create table d (tt as cast(op_time as timestamp(3)), watermark for tt as tt) like %s", - table); - - Table source = getTableEnv().sqlQuery("select is_true from d"); - - WatermarkTestOperator op = new WatermarkTestOperator(); - getTableEnv() - .toRetractStream(source, RowData.class) - .transform("test watermark", TypeInformation.of(RowData.class), op); - getEnv().executeAsync("test watermark"); - - op.waitWatermark(); - - 
Assert.assertTrue(op.watermark > Long.MIN_VALUE); - } - - @Test - public void testSelectWatermarkField() throws Exception { - sql(String.format("CREATE CATALOG mixed_catalog WITH %s", toWithClause(props))); - Map tableProperties = new HashMap<>(); - String table = String.format("mixed_catalog.%s.%s", DB, TABLE); - - sql( - "CREATE TABLE IF NOT EXISTS %s (" - + " id bigint, user_id int, name STRING, category string, op_time timestamp, is_true boolean" - + ", PRIMARY KEY (id, user_id) NOT ENFORCED) PARTITIONED BY(category, name) WITH %s", - table, toWithClause(tableProperties)); - - TableSchema flinkSchema = - TableSchema.builder() - .field("id", DataTypes.BIGINT()) - .field("user_id", DataTypes.INT()) - .field("name", DataTypes.STRING()) - .field("category", DataTypes.STRING()) - .field("op_time", DataTypes.TIMESTAMP(3)) - .field("is_true", DataTypes.BOOLEAN()) - .build(); - RowType rowType = (RowType) flinkSchema.toRowDataType().getLogicalType(); - KeyedTable keyedTable = - (KeyedTable) - MixedFormatUtils.loadMixedTable( - MixedFormatTableLoader.of( - TableIdentifier.of(TEST_CATALOG_NAME, DB, TABLE), catalogBuilder)); - TaskWriter taskWriter = createKeyedTaskWriter(keyedTable, rowType, true); - List baseData = - new ArrayList() { - { - add( - GenericRowData.ofKind( - RowKind.INSERT, - 2L, - 123, - StringData.fromString("a"), - StringData.fromString("a"), - TimestampData.fromLocalDateTime(LocalDateTime.parse("2022-06-17T10:08:11.0")), - true)); - } - }; - for (RowData record : baseData) { - taskWriter.write(record); - } - commit(keyedTable, taskWriter.complete(), true); - - sql( - "create table d (tt as cast(op_time as timestamp(3)), watermark for tt as tt) like %s", - table); - - TableResult result = exec("select is_true, tt from d"); - - CommonTestUtils.waitUntilJobManagerIsInitialized( - () -> result.getJobClient().get().getJobStatus().get()); - Set actual = new HashSet<>(); - try (CloseableIterator iterator = result.collect()) { - Row row = iterator.next(); - 
actual.add(row); - } - result.getJobClient().ifPresent(TestUtil::cancelJob); - - List expected = new LinkedList<>(); - expected.add(new Object[] {true, LocalDateTime.parse("2022-06-17T10:08:11")}); - Assert.assertEquals(DataUtil.toRowSet(expected), actual); - } - - public static class WatermarkTestOperator extends AbstractStreamOperator - implements OneInputStreamOperator, RowData> { - - private static final long serialVersionUID = 1L; - public long watermark; - private static final CompletableFuture waitWatermark = new CompletableFuture<>(); - - public WatermarkTestOperator() { - super(); - chainingStrategy = ChainingStrategy.ALWAYS; - } - - private void waitWatermark() throws InterruptedException, ExecutionException { - waitWatermark.get(); - } - - @Override - public void processElement(StreamRecord> element) throws Exception { - output.collect(element.asRecord()); - } - - @Override - public void processWatermark(Watermark mark) throws Exception { - LOG.info("processWatermark: {}", mark); - watermark = mark.getTimestamp(); - waitWatermark.complete(null); - super.processWatermark(mark); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/ClassLoaderUtils.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/ClassLoaderUtils.java deleted file mode 100644 index 7b90f8a179..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/ClassLoaderUtils.java +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.util; - -import javax.annotation.Nullable; -import javax.tools.JavaCompiler; -import javax.tools.ToolProvider; - -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.io.Serializable; -import java.net.MalformedURLException; -import java.net.URL; -import java.net.URLClassLoader; -import java.nio.file.FileVisitResult; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.SimpleFileVisitor; -import java.nio.file.attribute.BasicFileAttributes; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; - -/** Utilities to create class loaders. 
*/ -public class ClassLoaderUtils { - public static URLClassLoader compileAndLoadJava(File root, String filename, String source) - throws IOException { - return withRoot(root).addClass(filename.replaceAll("\\.java", ""), source).build(); - } - - private static URLClassLoader createClassLoader(File root) throws MalformedURLException { - return new URLClassLoader( - new URL[] {root.toURI().toURL()}, Thread.currentThread().getContextClassLoader()); - } - - private static void writeAndCompile(File root, String filename, String source) - throws IOException { - File file = writeSourceFile(root, filename, source); - - compileClass(file); - } - - private static File writeSourceFile(File root, String filename, String source) - throws IOException { - File file = new File(root, filename); - file.getParentFile().mkdirs(); - FileWriter fileWriter = new FileWriter(file); - - fileWriter.write(source); - fileWriter.close(); - - return file; - } - - public static ClassLoaderBuilder withRoot(File root) { - return new ClassLoaderBuilder(root); - } - - private static int compileClass(File sourceFile) { - JavaCompiler compiler = ToolProvider.getSystemJavaCompiler(); - return compiler.run(null, null, null, "-proc:none", sourceFile.getPath()); - } - - public static URL[] getClasspathURLs() { - final String[] cp = System.getProperty("java.class.path").split(File.pathSeparator); - - return Arrays.stream(cp) - .filter(str -> !str.isEmpty()) - .map(ClassLoaderUtils::parse) - .toArray(URL[]::new); - } - - private static URL parse(String fileName) { - try { - return new File(fileName).toURI().toURL(); - } catch (MalformedURLException e) { - throw new RuntimeException(e); - } - } - - public static class ClassLoaderBuilder { - - private final File root; - private final Map classes; - private final Map resources; - - private ClassLoaderBuilder(File root) { - this.root = root; - this.classes = new HashMap<>(); - this.resources = new HashMap<>(); - } - - public ClassLoaderBuilder addResource(String 
targetPath, String resource) { - String oldValue = resources.putIfAbsent(targetPath, resource); - - if (oldValue != null) { - throw new RuntimeException( - String.format("Resource with path %s already registered.", resource)); - } - - return this; - } - - public ClassLoaderBuilder addClass(String className, String source) { - String oldValue = classes.putIfAbsent(className, source); - - if (oldValue != null) { - throw new RuntimeException( - String.format("Class with name %s already registered.", className)); - } - - return this; - } - - public URLClassLoader build() throws IOException { - for (Map.Entry classInfo : classes.entrySet()) { - writeAndCompile(root, createFileName(classInfo.getKey()), classInfo.getValue()); - } - - for (Map.Entry resource : resources.entrySet()) { - writeSourceFile(root, resource.getKey(), resource.getValue()); - } - - return createClassLoader(root); - } - - private String createFileName(String className) { - return className + ".java"; - } - } - // ------------------------------------------------------------------------ - // Testing of objects not in the application class loader - // ------------------------------------------------------------------------ - - /** - * A new object and the corresponding ClassLoader for that object, as returned by {@link - * #createSerializableObjectFromNewClassLoader()} or {@link - * #createExceptionObjectFromNewClassLoader()}. - */ - public static final class ObjectAndClassLoader { - - private final T object; - private final ClassLoader classLoader; - - private ObjectAndClassLoader(T object, ClassLoader classLoader) { - this.object = object; - this.classLoader = classLoader; - } - - public ClassLoader getClassLoader() { - return classLoader; - } - - public T getObject() { - return object; - } - } - - /** - * Creates a new ClassLoader and a new {@link Serializable} class inside that ClassLoader. 
This is - * useful when unit testing the class loading behavior of code, and needing a class that is - * outside the system class path. - * - *

NOTE: Even though this method may throw IOExceptions, we do not declare those and rather - * wrap them in Runtime Exceptions. While this is generally discouraged, we do this here because - * it is merely a test utility and not production code, and it makes it easier to use this method - * during the initialization of variables and especially static variables. - */ - public static ObjectAndClassLoader createSerializableObjectFromNewClassLoader() { - - final String classSource = - "import java.io.Serializable;" - + "import java.util.Random;" - + "public class TestSerializable implements Serializable {" - + " private static final long serialVersionUID = -3L;" - + " private final long random;" - + " public TestSerializable() {" - + " random = new Random().nextLong();" - + " }" - + " public boolean equals(Object o) {" - + " if (this == o) { return true; }" - + " if ((o == null) || (getClass() != o.getClass())) { return false; }" - + " TestSerializable that = (TestSerializable) o;" - + " return random == random;" - + " }" - + " public int hashCode() {" - + " return (int)(random ^ random >>> 32);" - + " }" - + " public String toString() {" - + " return \"TestSerializable{random=\" + random + '}';" - + " }" - + "}"; - - return createObjectFromNewClassLoader("TestSerializable", Serializable.class, classSource); - } - - /** - * Creates a new ClassLoader and a new {@link Exception} class inside that ClassLoader. This is - * useful when unit testing the class loading behavior of code, and needing a class that is - * outside the system class path. - * - *

NOTE: Even though this method may throw IOExceptions, we do not declare those and rather - * wrap them in Runtime Exceptions. While this is generally discouraged, we do this here because - * it is merely a test utility and not production code, and it makes it easier to use this method - * during the initialization of variables and especially static variables. - */ - public static ObjectAndClassLoader createExceptionObjectFromNewClassLoader() { - - return createObjectFromNewClassLoader( - "TestExceptionForSerialization", - Exception.class, - "public class TestExceptionForSerialization extends java.lang.Exception {}"); - } - - private static ObjectAndClassLoader createObjectFromNewClassLoader( - String testClassName, Class testClass, String source) { - final Path classDirPath = - new File(System.getProperty("java.io.tmpdir"), UUID.randomUUID().toString()).toPath(); - - URLClassLoader classLoader = null; - try { - Files.createDirectories(classDirPath); - classLoader = compileAndLoadJava(classDirPath.toFile(), testClassName, source); - - final Class clazz = classLoader.loadClass(testClassName); - final T object = clazz.asSubclass(testClass).getDeclaredConstructor().newInstance(); - - return new ObjectAndClassLoader<>(object, classLoader); - } catch (Exception e) { - throw new RuntimeException("Cannot create test class outside system class path", e); - } finally { - // we clean up eagerly, because it is fine to delete the class file once the class is - // loaded - // and we have no later life cycle hook here to do the cleanup - tryClose(classLoader); - tryDeleteDirectoryRecursively(classDirPath); - } - } - - // ------------------------------------------------------------------------ - // miscellaneous utils - // ------------------------------------------------------------------------ - - private static void tryClose(@Nullable AutoCloseable closeable) { - if (closeable != null) { - try { - closeable.close(); - } catch (Exception ignored) { - } - } - } - - private static 
void tryDeleteDirectoryRecursively(Path directory) { - final SimpleFileVisitor deletingVisitor = - new SimpleFileVisitor() { - @Override - public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) - throws IOException { - Files.delete(file); - return FileVisitResult.CONTINUE; - } - - @Override - public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { - Files.delete(dir); - return FileVisitResult.CONTINUE; - } - }; - - try { - Files.walkFileTree(directory, deletingVisitor); - } catch (Exception ignored) { - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/DataUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/DataUtil.java deleted file mode 100644 index f71476ad31..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/DataUtil.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.util; - -import static org.apache.flink.table.api.Expressions.row; - -import org.apache.commons.lang3.ArrayUtils; -import org.apache.flink.table.api.ApiExpression; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.types.Row; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.CollectionUtil; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.IcebergGenerics; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.io.CloseableIterable; -import org.junit.Assert; - -import java.time.Instant; -import java.time.LocalDateTime; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -public class DataUtil { - - public static List toRows(Collection data) { - return data.stream() - .map( - i -> { - int size = i.length; - return size == 1 ? row(i[0]) : row(i[0], ArrayUtils.subarray(i, 1, size)); - }) - .collect(Collectors.toList()); - } - - public static Set toRowSet(Collection data) { - return data.stream() - .map( - r -> - r[0] instanceof RowKind - ? Row.ofKind((RowKind) r[0], ArrayUtils.subarray(r, 1, r.length)) - : Row.of(r)) - .collect(Collectors.toSet()); - } - - public static List toRowList(Collection data) { - return data.stream() - .map( - r -> - r[0] instanceof RowKind - ? 
Row.ofKind((RowKind) r[0], ArrayUtils.subarray(r, 1, r.length)) - : Row.of(r)) - .collect(Collectors.toList()); - } - - public static void assertEqual(Collection expected, Collection actual) { - Assert.assertEquals( - CollectionUtil.isNullOrEmpty(expected), CollectionUtil.isNullOrEmpty(actual)); - if (expected == null) { - return; - } - Assert.assertEquals(expected.size(), actual.size()); - for (Iterator i1 = expected.iterator(), i2 = actual.iterator(); i1.hasNext(); ) { - Object[] actualRow = i2.next(); - System.out.println(ArrayUtils.toString(actualRow)); - Assert.assertArrayEquals(i1.next(), actualRow); - } - } - - private static Object[] convertData(Object... values) { - Object[] row = new Object[values.length]; - for (int i = 0; i < values.length; i++) { - if (values[i] instanceof String) { - row[i] = StringData.fromString((String) values[i]); - } else if (values[i] instanceof LocalDateTime) { - row[i] = TimestampData.fromLocalDateTime(((LocalDateTime) values[i])); - } else if (values[i] instanceof Instant) { - row[i] = TimestampData.fromInstant((Instant) values[i]); - } else { - row[i] = values[i]; - } - } - return row; - } - - public static Collection toRowData(List data) { - return data.stream() - .map( - d -> - d[0] instanceof RowKind - ? toRowDataWithKind((RowKind) d[0], ArrayUtils.subarray(d, 1, d.length)) - : toRowData(d)) - .collect(Collectors.toList()); - } - - public static RowData toRowData(Object... values) { - return GenericRowData.of(convertData(values)); - } - - public static RowData toRowDataWithKind(RowKind rowKind, Object... 
values) { - return GenericRowData.ofKind(rowKind, convertData(values)); - } - - public static Set read(Table table) { - table.refresh(); - - Set records = new HashSet<>(); - - try (CloseableIterable iterable = IcebergGenerics.read(table).build()) { - for (Record record : iterable) { - records.add(record); - } - } catch (Exception e) { - throw new RuntimeException(e); - } - return records; - } - - public static Map> groupByPrimaryKey(List rowList, int pkIdx) { - Map> result = new HashMap<>(); - for (Row row : rowList) { - Object pk = row.getField(pkIdx); - List list = result.getOrDefault(pk, new LinkedList<>()); - list.add(row); - result.put(pk, list); - } - return result; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/MixedFormatMockEnvironment.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/MixedFormatMockEnvironment.java deleted file mode 100644 index 18f5e3ea8f..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/MixedFormatMockEnvironment.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.util; - -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.common.JobID; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.runtime.checkpoint.channel.ChannelStateWriteRequestExecutorFactory; -import org.apache.flink.runtime.externalresource.ExternalResourceInfoProvider; -import org.apache.flink.runtime.io.disk.iomanager.IOManager; -import org.apache.flink.runtime.jobgraph.JobVertexID; -import org.apache.flink.runtime.memory.MemoryManager; -import org.apache.flink.runtime.metrics.groups.TaskMetricGroup; -import org.apache.flink.runtime.operators.testutils.MockEnvironment; -import org.apache.flink.runtime.operators.testutils.MockInputSplitProvider; -import org.apache.flink.runtime.state.TaskStateManager; -import org.apache.flink.runtime.taskexecutor.GlobalAggregateManager; -import org.apache.flink.runtime.taskmanager.TaskManagerRuntimeInfo; -import org.apache.flink.util.UserCodeClassLoader; - -public class MixedFormatMockEnvironment extends MockEnvironment { - - protected MixedFormatMockEnvironment( - JobID jobID, - JobVertexID jobVertexID, - String taskName, - MockInputSplitProvider inputSplitProvider, - int bufferSize, - Configuration taskConfiguration, - ExecutionConfig executionConfig, - IOManager ioManager, - TaskStateManager taskStateManager, - GlobalAggregateManager aggregateManager, - int maxParallelism, - int parallelism, - int subtaskIndex, - UserCodeClassLoader userCodeClassLoader, - TaskMetricGroup taskMetricGroup, - TaskManagerRuntimeInfo taskManagerRuntimeInfo, - MemoryManager memManager, - ExternalResourceInfoProvider externalResourceInfoProvider, - ChannelStateWriteRequestExecutorFactory channelStateExecutorFactory) { - super( - jobID, - jobVertexID, - taskName, - inputSplitProvider, - bufferSize, - taskConfiguration, - executionConfig, - ioManager, 
- taskStateManager, - aggregateManager, - maxParallelism, - parallelism, - subtaskIndex, - userCodeClassLoader, - taskMetricGroup, - taskManagerRuntimeInfo, - memManager, - externalResourceInfoProvider, - channelStateExecutorFactory); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/MockEnvironmentBuilder.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/MockEnvironmentBuilder.java deleted file mode 100644 index 0738c98d52..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/MockEnvironmentBuilder.java +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.util; - -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.common.JobID; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.runtime.checkpoint.channel.ChannelStateWriteRequestExecutorFactory; -import org.apache.flink.runtime.externalresource.ExternalResourceInfoProvider; -import org.apache.flink.runtime.io.disk.iomanager.IOManager; -import org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync; -import org.apache.flink.runtime.jobgraph.JobVertexID; -import org.apache.flink.runtime.memory.MemoryManager; -import org.apache.flink.runtime.memory.MemoryManagerBuilder; -import org.apache.flink.runtime.metrics.groups.TaskMetricGroup; -import org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups; -import org.apache.flink.runtime.operators.testutils.MockEnvironment; -import org.apache.flink.runtime.operators.testutils.MockInputSplitProvider; -import org.apache.flink.runtime.state.TaskStateManager; -import org.apache.flink.runtime.state.TestTaskStateManager; -import org.apache.flink.runtime.taskexecutor.GlobalAggregateManager; -import org.apache.flink.runtime.taskmanager.TaskManagerRuntimeInfo; -import org.apache.flink.runtime.util.TestingTaskManagerRuntimeInfo; -import org.apache.flink.runtime.util.TestingUserCodeClassLoader; -import org.apache.flink.util.UserCodeClassLoader; - -public class MockEnvironmentBuilder { - private String taskName = "mock-task"; - private MockInputSplitProvider inputSplitProvider = null; - private int bufferSize = 16; - private TaskStateManager taskStateManager = new TestTaskStateManager(); - private GlobalAggregateManager aggregateManager = new TestGlobalAggregateManager(); - private Configuration taskConfiguration = new Configuration(); - private ExecutionConfig executionConfig = new ExecutionConfig(); - private int maxParallelism = 1; - private int parallelism = 1; - private int subtaskIndex = 0; - private UserCodeClassLoader 
userCodeClassLoader = TestingUserCodeClassLoader.newBuilder().build(); - private JobID jobID = new JobID(); - private JobVertexID jobVertexID = new JobVertexID(); - private TaskMetricGroup taskMetricGroup = - UnregisteredMetricGroups.createUnregisteredTaskMetricGroup(); - private TaskManagerRuntimeInfo taskManagerRuntimeInfo = new TestingTaskManagerRuntimeInfo(); - private IOManager ioManager; - private MemoryManager memoryManager = this.buildMemoryManager(33554432L); - private ExternalResourceInfoProvider externalResourceInfoProvider; - private ChannelStateWriteRequestExecutorFactory channelStateExecutorFactory = - new ChannelStateWriteRequestExecutorFactory(this.jobID); - - public MockEnvironmentBuilder() { - this.externalResourceInfoProvider = ExternalResourceInfoProvider.NO_EXTERNAL_RESOURCES; - } - - private MemoryManager buildMemoryManager(long memorySize) { - return MemoryManagerBuilder.newBuilder().setMemorySize(memorySize).build(); - } - - public MockEnvironmentBuilder setTaskName(String taskName) { - this.taskName = taskName; - return this; - } - - public MockEnvironmentBuilder setManagedMemorySize(long managedMemorySize) { - this.memoryManager = this.buildMemoryManager(managedMemorySize); - return this; - } - - public MockEnvironmentBuilder setInputSplitProvider(MockInputSplitProvider inputSplitProvider) { - this.inputSplitProvider = inputSplitProvider; - return this; - } - - public MockEnvironmentBuilder setBufferSize(int bufferSize) { - this.bufferSize = bufferSize; - return this; - } - - public MockEnvironmentBuilder setTaskStateManager(TaskStateManager taskStateManager) { - this.taskStateManager = taskStateManager; - return this; - } - - public MockEnvironmentBuilder setAggregateManager(GlobalAggregateManager aggregateManager) { - this.aggregateManager = aggregateManager; - return this; - } - - public MockEnvironmentBuilder setTaskConfiguration(Configuration taskConfiguration) { - this.taskConfiguration = taskConfiguration; - return this; - } - - 
public MockEnvironmentBuilder setExecutionConfig(ExecutionConfig executionConfig) { - this.executionConfig = executionConfig; - return this; - } - - public MockEnvironmentBuilder setTaskManagerRuntimeInfo( - TaskManagerRuntimeInfo taskManagerRuntimeInfo) { - this.taskManagerRuntimeInfo = taskManagerRuntimeInfo; - return this; - } - - public MockEnvironmentBuilder setMaxParallelism(int maxParallelism) { - this.maxParallelism = maxParallelism; - return this; - } - - public MockEnvironmentBuilder setParallelism(int parallelism) { - this.parallelism = parallelism; - return this; - } - - public MockEnvironmentBuilder setSubtaskIndex(int subtaskIndex) { - this.subtaskIndex = subtaskIndex; - return this; - } - - public MockEnvironmentBuilder setUserCodeClassLoader(ClassLoader userCodeClassLoader) { - this.userCodeClassLoader = - TestingUserCodeClassLoader.newBuilder().setClassLoader(userCodeClassLoader).build(); - return this; - } - - public MockEnvironmentBuilder setJobID(JobID jobID) { - this.jobID = jobID; - return this; - } - - public MockEnvironmentBuilder setJobVertexID(JobVertexID jobVertexID) { - this.jobVertexID = jobVertexID; - return this; - } - - public MockEnvironmentBuilder setMetricGroup(TaskMetricGroup taskMetricGroup) { - this.taskMetricGroup = taskMetricGroup; - return this; - } - - public MockEnvironmentBuilder setIOManager(IOManager ioManager) { - this.ioManager = ioManager; - return this; - } - - public MockEnvironmentBuilder setMemoryManager(MemoryManager memoryManager) { - this.memoryManager = memoryManager; - return this; - } - - public MockEnvironmentBuilder setExternalResourceInfoProvider( - ExternalResourceInfoProvider externalResourceInfoProvider) { - this.externalResourceInfoProvider = externalResourceInfoProvider; - return this; - } - - public MockEnvironmentBuilder setGlobalAggregateManager( - GlobalAggregateManager globalAggregateManager) { - this.aggregateManager = globalAggregateManager; - return this; - } - - public void 
setChannelStateExecutorFactory( - ChannelStateWriteRequestExecutorFactory channelStateExecutorFactory) { - this.channelStateExecutorFactory = channelStateExecutorFactory; - } - - public MockEnvironment build() { - if (this.ioManager == null) { - this.ioManager = new IOManagerAsync(); - } - - return new MixedFormatMockEnvironment( - this.jobID, - this.jobVertexID, - this.taskName, - this.inputSplitProvider, - this.bufferSize, - this.taskConfiguration, - this.executionConfig, - this.ioManager, - this.taskStateManager, - this.aggregateManager, - this.maxParallelism, - this.parallelism, - this.subtaskIndex, - this.userCodeClassLoader, - this.taskMetricGroup, - this.taskManagerRuntimeInfo, - this.memoryManager, - this.externalResourceInfoProvider, - this.channelStateExecutorFactory); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestCompatibleFlinkPropertyUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestCompatibleFlinkPropertyUtil.java deleted file mode 100644 index dfe12e2f82..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestCompatibleFlinkPropertyUtil.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.util; - -import org.apache.amoro.flink.table.descriptors.MixedFormatValidator; -import org.apache.flink.configuration.Configuration; -import org.junit.Assert; -import org.junit.Test; - -public class TestCompatibleFlinkPropertyUtil { - @Test - public void testGetNewProperty() { - Configuration config = new Configuration(); - Assert.assertEquals( - MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.defaultValue(), - CompatibleFlinkPropertyUtil.propertyAsBoolean( - config, MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE)); - - config.setBoolean(MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE, true); - Assert.assertTrue( - CompatibleFlinkPropertyUtil.propertyAsBoolean( - config, MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE)); - - config.setBoolean( - MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE_LEGACY, false); - Assert.assertTrue( - CompatibleFlinkPropertyUtil.propertyAsBoolean( - config, MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE)); - } - - @Test - public void testGetLegacyProperty() { - Configuration config = new Configuration(); - config.setBoolean( - MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE_LEGACY, true); - Assert.assertTrue( - CompatibleFlinkPropertyUtil.propertyAsBoolean( - config, MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE)); - } -} diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestGlobalAggregateManager.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestGlobalAggregateManager.java deleted file mode 100644 index 0162cd04e1..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestGlobalAggregateManager.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.util; - -import org.apache.flink.api.common.functions.AggregateFunction; -import org.apache.flink.runtime.taskexecutor.GlobalAggregateManager; -import org.apache.flink.runtime.taskexecutor.rpc.RpcGlobalAggregateManager; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -/** - * An util class of global aggregate manager that simulates action as {@link - * RpcGlobalAggregateManager} in the jobMaster. 
- */ -public class TestGlobalAggregateManager implements GlobalAggregateManager { - private final Map accumulators = new HashMap<>(); - - @Override - public OUT updateGlobalAggregate( - String aggregateName, Object aggregand, AggregateFunction aggregateFunction) - throws IOException { - - Object accumulator = accumulators.get(aggregateName); - if (null == accumulator) { - accumulator = aggregateFunction.createAccumulator(); - } - - accumulator = aggregateFunction.add((IN) aggregand, (ACC) accumulator); - accumulators.put(aggregateName, accumulator); - return aggregateFunction.getResult((ACC) accumulator); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestOneInputStreamOperatorIntern.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestOneInputStreamOperatorIntern.java deleted file mode 100644 index e77d4b2e48..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestOneInputStreamOperatorIntern.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.util; - -import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.runtime.checkpoint.PrioritizedOperatorSubtaskState; -import org.apache.flink.runtime.checkpoint.TaskStateSnapshot; -import org.apache.flink.runtime.jobgraph.OperatorID; -import org.apache.flink.runtime.operators.testutils.MockInputSplitProvider; -import org.apache.flink.runtime.state.TestTaskStateManager; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; - -import javax.annotation.Nonnull; - -import java.util.Collections; -import java.util.List; - -public class TestOneInputStreamOperatorIntern - extends OneInputStreamOperatorTestHarness { - public TestOneInputStreamOperatorIntern( - OneInputStreamOperator operator, - int maxParallelism, - int parallelism, - int subtaskIndex, - Long restoredCheckpointId, - TestGlobalAggregateManager testGlobalAggregateManager) - throws Exception { - super( - operator, - (new MockEnvironmentBuilder()) - .setTaskName("MockTask") - .setManagedMemorySize(3145728L) - .setInputSplitProvider(new MockInputSplitProvider()) - .setBufferSize(1024) - .setTaskStateManager(new TestTaskStateManagerIntern(restoredCheckpointId)) - .setAggregateManager(testGlobalAggregateManager) - .setMaxParallelism(maxParallelism) - .setParallelism(parallelism) - .setSubtaskIndex(subtaskIndex) - .build()); - } - - public void notifyOfAbortedCheckpoint(long checkpointId) throws Exception { - this.operator.notifyCheckpointAborted(checkpointId); - } - - static class TestTaskStateManagerIntern extends TestTaskStateManager { - private long reportedCheckpointId = -1L; - private boolean restored = false; - - public TestTaskStateManagerIntern(Long reportedCheckpointId) { - super(); - if (reportedCheckpointId != null) { - this.reportedCheckpointId = reportedCheckpointId; - this.restored = true; - } - } - - @Nonnull - public 
PrioritizedOperatorSubtaskState prioritizedOperatorState(OperatorID operatorID) { - TaskStateSnapshot jmTaskStateSnapshot = this.getLastJobManagerTaskStateSnapshot(); - TaskStateSnapshot tmTaskStateSnapshot = this.getLastTaskManagerTaskStateSnapshot(); - if (jmTaskStateSnapshot == null) { - return PrioritizedOperatorSubtaskState.emptyNotRestored(); - } else { - OperatorSubtaskState jmOpState = - jmTaskStateSnapshot.getSubtaskStateByOperatorID(operatorID); - if (jmOpState == null) { - return PrioritizedOperatorSubtaskState.emptyNotRestored(); - } else { - List tmStateCollection = Collections.emptyList(); - if (tmTaskStateSnapshot != null) { - OperatorSubtaskState tmOpState = - tmTaskStateSnapshot.getSubtaskStateByOperatorID(operatorID); - if (tmOpState != null) { - tmStateCollection = Collections.singletonList(tmOpState); - } - } - - PrioritizedOperatorSubtaskState.Builder builder = - new PrioritizedOperatorSubtaskState.Builder( - jmOpState, tmStateCollection, this.reportedCheckpointId); - return builder.build(); - } - } - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestProjection.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestProjection.java deleted file mode 100644 index 69431378bf..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestProjection.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.util; - -import static org.apache.flink.table.api.DataTypes.BIGINT; -import static org.apache.flink.table.api.DataTypes.BOOLEAN; -import static org.apache.flink.table.api.DataTypes.DOUBLE; -import static org.apache.flink.table.api.DataTypes.FIELD; -import static org.apache.flink.table.api.DataTypes.INT; -import static org.apache.flink.table.api.DataTypes.ROW; -import static org.apache.flink.table.api.DataTypes.STRING; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import org.apache.flink.table.types.DataType; -import org.junit.jupiter.api.Test; - -class TestProjection { - - @Test - void testTopLevelProject() { - assertThat( - Projection.of(new int[] {2, 1}) - .project(ROW(FIELD("f0", BIGINT()), FIELD("f1", STRING()), FIELD("f2", INT())))) - .isEqualTo(ROW(FIELD("f2", INT()), FIELD("f1", STRING()))); - } - - @Test - void testNestedProject() { - final DataType thirdLevelRow = - ROW(FIELD("c0", BOOLEAN()), FIELD("c1", DOUBLE()), FIELD("c2", INT())); - final DataType secondLevelRow = - ROW(FIELD("b0", BOOLEAN()), FIELD("b1", thirdLevelRow), FIELD("b2", INT())); - final DataType topLevelRow = - ROW(FIELD("a0", INT()), FIELD("a1", secondLevelRow), FIELD("a1_b1_c0", INT())); - - assertThat(Projection.of(new int[][] {{0}, {1, 1, 0}}).project(topLevelRow)) - .isEqualTo(ROW(FIELD("a0", INT()), FIELD("a1_b1_c0", BOOLEAN()))); - assertThat(Projection.of(new int[][] {{1, 1}, {0}}).project(topLevelRow)) - .isEqualTo(ROW(FIELD("a1_b1", 
thirdLevelRow), FIELD("a0", INT()))); - assertThat(Projection.of(new int[][] {{1, 1, 2}, {1, 1, 1}, {1, 1, 0}}).project(topLevelRow)) - .isEqualTo( - ROW( - FIELD("a1_b1_c2", INT()), - FIELD("a1_b1_c1", DOUBLE()), - FIELD("a1_b1_c0", BOOLEAN()))); - assertThat(Projection.of(new int[][] {{1, 1, 0}, {2}}).project(topLevelRow)) - .isEqualTo(ROW(FIELD("a1_b1_c0", BOOLEAN()), FIELD("a1_b1_c0_$0", INT()))); - } - - @Test - void testIsNested() { - assertThat(Projection.of(new int[] {2, 1}).isNested()).isFalse(); - assertThat(Projection.of(new int[][] {new int[] {1}, new int[] {3}}).isNested()).isFalse(); - assertThat( - Projection.of(new int[][] {new int[] {1}, new int[] {1, 2}, new int[] {3}}).isNested()) - .isTrue(); - } - - @Test - void testDifference() { - assertThat(Projection.of(new int[] {4, 1, 0, 3, 2}).difference(Projection.of(new int[] {4, 2}))) - .isEqualTo(Projection.of(new int[] {1, 0, 2})); - - assertThat( - Projection.of( - new int[][] { - new int[] {4}, - new int[] {1, 3}, - new int[] {0}, - new int[] {3, 1}, - new int[] {2} - }) - .difference(Projection.of(new int[] {4, 2}))) - .isEqualTo(Projection.of(new int[][] {new int[] {1, 3}, new int[] {0}, new int[] {2, 1}})); - - assertThatThrownBy( - () -> - Projection.of(new int[] {1, 2, 3, 4}) - .difference(Projection.of(new int[][] {new int[] {2}, new int[] {3, 4}}))) - .isInstanceOf(IllegalArgumentException.class); - } - - @Test - void testComplement() { - assertThat(Projection.of(new int[] {4, 1, 2}).complement(5)) - .isEqualTo(Projection.of(new int[] {0, 3})); - - assertThat( - Projection.of(new int[][] {new int[] {4}, new int[] {1}, new int[] {2}}).complement(5)) - .isEqualTo(Projection.of(new int[] {0, 3})); - - assertThatThrownBy( - () -> - Projection.of(new int[][] {new int[] {4}, new int[] {1, 3}, new int[] {2}}) - .complement(10)) - .isInstanceOf(IllegalStateException.class); - } - - @Test - void testToTopLevelIndexes() { - assertThat(Projection.of(new int[] {1, 2, 3, 4}).toTopLevelIndexes()) - 
.isEqualTo(new int[] {1, 2, 3, 4}); - - assertThat( - Projection.of(new int[][] {new int[] {4}, new int[] {1}, new int[] {2}}) - .toTopLevelIndexes()) - .isEqualTo(new int[] {4, 1, 2}); - - assertThatThrownBy( - () -> - Projection.of(new int[][] {new int[] {4}, new int[] {1, 3}, new int[] {2}}) - .toTopLevelIndexes()) - .isInstanceOf(IllegalStateException.class); - } - - @Test - void testToNestedIndexes() { - assertThat(Projection.of(new int[] {1, 2, 3, 4}).toNestedIndexes()) - .isEqualTo(new int[][] {new int[] {1}, new int[] {2}, new int[] {3}, new int[] {4}}); - assertThat( - Projection.of(new int[][] {new int[] {4}, new int[] {1, 3}, new int[] {2}}) - .toNestedIndexes()) - .isEqualTo(new int[][] {new int[] {4}, new int[] {1, 3}, new int[] {2}}); - } - - @Test - void testEquals() { - assertThat(Projection.of(new int[][] {new int[] {1}, new int[] {2}, new int[] {3}})) - .isEqualTo(Projection.of(new int[] {1, 2, 3})); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestUtil.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestUtil.java deleted file mode 100644 index 6888add512..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/util/TestUtil.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.util; - -import org.apache.flink.api.common.JobStatus; -import org.apache.flink.core.execution.JobClient; -import org.junit.rules.TestName; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class TestUtil { - - public static final Logger LOG = LoggerFactory.getLogger(TestUtil.class); - - /** get ut method name without parameters. */ - public static String getUtMethodName(TestName testName) { - int i = testName.getMethodName().indexOf("["); - if (i == -1) { - return testName.getMethodName(); - } - return testName.getMethodName().substring(0, i); - } - - public static void cancelJob(JobClient jobClient) { - if (isJobTerminated(jobClient)) { - return; - } - try { - jobClient.cancel(); - } catch (Exception e) { - LOG.warn("cancel job exception.", e); - } - } - - public static boolean isJobTerminated(JobClient jobClient) { - try { - JobStatus status = jobClient.getJobStatus().get(); - return status.isGloballyTerminalState(); - } catch (Exception e) { - // TODO - // This is sort of hack. - // Currently different execution environment will have different behaviors - // when fetching a finished job status. - // For example, standalone session cluster will return a normal FINISHED, - // while mini cluster will throw IllegalStateException, - // and yarn per job will throw ApplicationNotFoundException. - // We have to assume that job has finished in this case. - // Change this when these behaviors are unified. - LOG.warn( - "Failed to get job status so we assume that the job has terminated. 
Some data might be lost.", - e); - return true; - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/FlinkTaskWriterBaseTest.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/FlinkTaskWriterBaseTest.java deleted file mode 100644 index 610a7854b0..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/FlinkTaskWriterBaseTest.java +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.write; - -import static org.apache.amoro.BasicTableTestHelper.PRIMARY_KEY_SPEC; - -import org.apache.amoro.flink.FlinkTableTestBase; -import org.apache.amoro.flink.read.FlinkSplitPlanner; -import org.apache.amoro.flink.read.hybrid.reader.RowDataReaderFunction; -import org.apache.amoro.flink.read.hybrid.split.MixedFormatSplit; -import org.apache.amoro.flink.read.source.DataIterator; -import org.apache.amoro.io.AuthenticatedFileIO; -import org.apache.amoro.table.KeyedTable; -import org.apache.amoro.table.MixedTable; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.TableLoader; -import org.apache.iceberg.flink.TestHelpers; -import org.apache.iceberg.flink.source.FlinkInputFormat; -import org.apache.iceberg.flink.source.FlinkSource; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.types.TypeUtil; -import org.junit.Assert; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; - -public interface FlinkTaskWriterBaseTest extends FlinkTableTestBase { - Logger LOG = LoggerFactory.getLogger(FlinkTaskWriterBaseTest.class); - - default void testWriteAndReadMixedFormatTable( - MixedTable mixedTable, TableSchema flinkTableSchema, RowData expected) { - - // This is a partial-write schema from Flink engine view. 
- RowType rowType = (RowType) flinkTableSchema.toRowDataType().getLogicalType(); - - try (TaskWriter taskWriter = createTaskWriter(mixedTable, rowType)) { - Assert.assertNotNull(taskWriter); - - writeAndCommit(expected, taskWriter, mixedTable); - - mixedTable.refresh(); - - // This is a partial-read schema from Flink engine view, should reassign schema id to - // selected-schema - Schema selectedSchema = - TypeUtil.reassignIds(FlinkSchemaUtil.convert(flinkTableSchema), mixedTable.schema()); - - assertRecords(mixedTable.schema(), selectedSchema, mixedTable, expected, flinkTableSchema); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - default void assertRecords( - Schema tableSchema, - Schema selectedSchema, - MixedTable mixedTable, - RowData expected, - TableSchema flinkTableSchema) - throws IOException { - List records; - if (mixedTable.isKeyedTable()) { - records = - recordsOfKeyedTable( - mixedTable.asKeyedTable(), tableSchema, selectedSchema, mixedTable.io()); - } else { - records = - recordsOfUnkeyedTable( - getTableLoader(getCatalogName(), getMetastoreUri(), mixedTable), - selectedSchema, - flinkTableSchema); - } - Assert.assertEquals(1, records.size()); - Assert.assertEquals(expected, records.get(0)); - } - - /** For asserting unkeyed table records. */ - String getMetastoreUri(); - - /** For asserting unkeyed table records. */ - String getCatalogName(); - - default void writeAndCommit( - RowData expected, TaskWriter taskWriter, MixedTable mixedTable) throws IOException { - writeAndCommit(expected, taskWriter, mixedTable, false); - } - - default void writeAndCommit( - RowData expected, - TaskWriter taskWriter, - MixedTable mixedTable, - boolean upsertEnabled) - throws IOException { - taskWriter.write(expected); - WriteResult writerResult = taskWriter.complete(); - boolean writeToBase = mixedTable.isUnkeyedTable(); - commit(mixedTable, writerResult, writeToBase); - Assert.assertEquals(upsertEnabled ? 
2 : 1, writerResult.dataFiles().length); - } - - default boolean upsertEnabled() { - return false; - } - - default List recordsOfUnkeyedTable( - TableLoader tableLoader, Schema projectedSchema, TableSchema flinkTableSchema) - throws IOException { - FlinkInputFormat inputFormat = - FlinkSource.forRowData().tableLoader(tableLoader).project(flinkTableSchema).buildFormat(); - return runFormat(inputFormat, FlinkSchemaUtil.convert(projectedSchema)); - } - - default List recordsOfKeyedTable( - KeyedTable table, Schema tableSchema, Schema projectedSchema, AuthenticatedFileIO io) { - List mixedFormatSplits = - FlinkSplitPlanner.planFullTable(table, new AtomicInteger(0)); - - RowDataReaderFunction rowDataReaderFunction = - new RowDataReaderFunction( - new Configuration(), tableSchema, projectedSchema, PRIMARY_KEY_SPEC, null, true, io); - - List actual = new ArrayList<>(); - mixedFormatSplits.forEach( - split -> { - LOG.info("Mixed-format split: {}.", split); - DataIterator dataIterator = rowDataReaderFunction.createDataIterator(split); - while (dataIterator.hasNext()) { - RowData rowData = dataIterator.next(); - LOG.info("{}", rowData); - actual.add(rowData); - } - }); - - return actual; - } - - default List runFormat(FlinkInputFormat inputFormat, RowType readRowType) - throws IOException { - return TestHelpers.readRowData(inputFormat, readRowType); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/MixedFormatFileWriterITCase.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/MixedFormatFileWriterITCase.java deleted file mode 100644 index 133d9acbb6..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/MixedFormatFileWriterITCase.java +++ /dev/null @@ -1,311 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor 
license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.data.FileNameRules; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.flink.read.TestMixedFormatSource; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.table.KeyedTable; -import org.apache.amoro.table.MixedTable; -import org.apache.flink.api.common.RuntimeExecutionMode; -import org.apache.flink.api.common.restartstrategy.RestartStrategies; -import org.apache.flink.api.common.state.ListState; -import org.apache.flink.api.common.state.ListStateDescriptor; -import org.apache.flink.api.common.time.Time; -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.ExecutionOptions; -import org.apache.flink.configuration.RestOptions; -import org.apache.flink.runtime.jobgraph.JobGraph; -import org.apache.flink.runtime.minicluster.MiniCluster; -import org.apache.flink.runtime.minicluster.MiniClusterConfiguration; -import org.apache.flink.runtime.state.CheckpointListener; 
-import org.apache.flink.runtime.state.FunctionInitializationContext; -import org.apache.flink.runtime.state.FunctionSnapshotContext; -import org.apache.flink.streaming.api.CheckpointingMode; -import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; -import org.apache.flink.streaming.api.datastream.DataStreamSource; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction; -import org.apache.flink.streaming.api.graph.StreamGraph; -import org.apache.flink.table.api.TableSchema; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.Snapshot; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.time.LocalDateTime; -import java.time.ZoneOffset; -import java.util.Collections; -import java.util.HashSet; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.Stack; -import java.util.UUID; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.CountDownLatch; - -public class MixedFormatFileWriterITCase extends FlinkTestBase { - - public static final Logger LOG = LoggerFactory.getLogger(MixedFormatFileWriterITCase.class); - - private static final Map LATCH_MAP = new ConcurrentHashMap<>(); - public MixedFormatTableLoader tableLoader; - private String latchId; - private final int NUM_SOURCES = 4; - private final int NUM_RECORDS = 10000; - - public MixedFormatFileWriterITCase() { - super( - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, true)); - } - - @Before - public void setup() { - this.latchId = UUID.randomUUID().toString(); - // We wait for two successful 
checkpoints in sources before shutting down. This ensures that - // the sink can commit its data. - // We need to keep a "static" latch here because all sources need to be kept running - // while we're waiting for the required number of checkpoints. Otherwise, we would lock up - // because we can only do checkpoints while all operators are running. - LATCH_MAP.put(latchId, new CountDownLatch(NUM_SOURCES * 2)); - } - - protected static final double FAILOVER_RATIO = 0.4; - - private static class StreamingExecutionTestSource extends RichParallelSourceFunction - implements CheckpointListener, CheckpointedFunction { - - private final String latchId; - - private final int numberOfRecords; - - /** - * Whether the test is executing in a scenario that induces a failover. This doesn't mean that - * this source induces the failover. - */ - private final boolean isFailoverScenario; - - private ListState nextValueState; - - private int nextValue; - - private volatile boolean isCanceled; - - private volatile boolean snapshottedAfterAllRecordsOutput; - - private volatile boolean isWaitingCheckpointComplete; - - private volatile boolean hasCompletedCheckpoint; - - public StreamingExecutionTestSource( - String latchId, int numberOfRecords, boolean isFailoverScenario) { - this.latchId = latchId; - this.numberOfRecords = numberOfRecords; - this.isFailoverScenario = isFailoverScenario; - } - - @Override - public void initializeState(FunctionInitializationContext context) throws Exception { - nextValueState = - context - .getOperatorStateStore() - .getListState(new ListStateDescriptor<>("nextValue", Integer.class)); - - if (nextValueState.get() != null && nextValueState.get().iterator().hasNext()) { - nextValue = nextValueState.get().iterator().next(); - } - } - - @Override - public void run(SourceContext ctx) throws Exception { - if (isFailoverScenario && getRuntimeContext().getAttemptNumber() == 0) { - // In the first execution, we first send a part of record... 
- sendRecordsUntil((int) (numberOfRecords * FAILOVER_RATIO * 0.5), ctx); - - // Wait till the first part of data is committed. - while (!hasCompletedCheckpoint) { - Thread.sleep(50); - } - - // Then we write the second part of data... - sendRecordsUntil((int) (numberOfRecords * FAILOVER_RATIO), ctx); - - // And then trigger the failover. - if (getRuntimeContext().getIndexOfThisSubtask() == 0) { - throw new RuntimeException("Designated Exception"); - } else { - while (true) { - Thread.sleep(50); - } - } - } else { - // If we are not going to trigger failover or we have already triggered failover, - // run until finished. - sendRecordsUntil(numberOfRecords, ctx); - - // Wait the last checkpoint to commit all the pending records. - isWaitingCheckpointComplete = true; - CountDownLatch latch = LATCH_MAP.get(latchId); - latch.await(); - } - } - - private void sendRecordsUntil(int targetNumber, SourceContext ctx) { - while (!isCanceled && nextValue < targetNumber) { - synchronized (ctx.getCheckpointLock()) { - ctx.collect( - GenericRowData.of( - nextValue++, - StringData.fromString(""), - LocalDateTime.now().toInstant(ZoneOffset.UTC).toEpochMilli(), - TimestampData.fromLocalDateTime(LocalDateTime.now()))); - } - } - } - - @Override - public void snapshotState(FunctionSnapshotContext context) throws Exception { - nextValueState.update(Collections.singletonList(nextValue)); - - if (isWaitingCheckpointComplete) { - snapshottedAfterAllRecordsOutput = true; - } - } - - @Override - public void notifyCheckpointComplete(long checkpointId) throws Exception { - if (isWaitingCheckpointComplete && snapshottedAfterAllRecordsOutput) { - CountDownLatch latch = LATCH_MAP.get(latchId); - latch.countDown(); - } - - hasCompletedCheckpoint = true; - } - - @Override - public void cancel() { - isCanceled = true; - } - } - - protected JobGraph createJobGraph( - MixedFormatTableLoader tableLoader, TableSchema tableSchema, boolean triggerFailover) { - StreamExecutionEnvironment env = 
StreamExecutionEnvironment.getExecutionEnvironment(); - Configuration config = new Configuration(); - config.set(ExecutionOptions.RUNTIME_MODE, RuntimeExecutionMode.STREAMING); - env.configure(config, getClass().getClassLoader()); - - env.enableCheckpointing(10, CheckpointingMode.EXACTLY_ONCE); - - if (triggerFailover) { - env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, Time.milliseconds(100))); - } else { - env.setRestartStrategy(RestartStrategies.noRestart()); - } - - DataStreamSource source = - env.addSource(new StreamingExecutionTestSource(latchId, NUM_RECORDS, triggerFailover)) - .setParallelism(4); - MixedTable table = MixedFormatUtils.loadMixedTable(tableLoader); - FlinkSink.forRowData(source) - .context(Optional::of) - .table(table) - .tableLoader(tableLoader) - .flinkSchema(tableSchema) - .build(); - - StreamGraph streamGraph = env.getStreamGraph(); - return streamGraph.getJobGraph(); - } - - @Test - public void testWrite() throws Exception { - tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); - - JobGraph jobGraph = createJobGraph(tableLoader, FLINK_SCHEMA, true); - final Configuration config = new Configuration(); - config.setString(RestOptions.BIND_PORT, "18081-19000"); - final MiniClusterConfiguration cfg = - new MiniClusterConfiguration.Builder() - .setNumTaskManagers(1) - .setNumSlotsPerTaskManager(NUM_SOURCES) - .setConfiguration(config) - .build(); - - try (MiniCluster miniCluster = new MiniCluster(cfg)) { - miniCluster.start(); - miniCluster.executeJobBlocking(jobGraph); - } - - KeyedTable keyedTable = tableLoader.loadMixedFormatTable().asKeyedTable(); - checkResult(keyedTable, NUM_RECORDS * NUM_SOURCES); - } - - public static void checkResult(KeyedTable keyedTable, int exceptedSize) { - keyedTable.refresh(); - Snapshot crt = keyedTable.changeTable().currentSnapshot(); - - Stack snapshots = new Stack<>(); - while (crt != null) { - snapshots.push(crt); - if (crt.parentId() == null) { - break; - } 
- crt = keyedTable.changeTable().snapshot(crt.parentId()); - } - - Set paths = new HashSet<>(); - long maxTxId = -1; - while (!snapshots.isEmpty()) { - Snapshot snapshot = snapshots.pop(); - long minTxIdInSnapshot = Integer.MAX_VALUE; - long maxTxIdInSnapshot = -1; - for (DataFile addedFile : snapshot.addedDataFiles(keyedTable.io())) { - String path = addedFile.path().toString(); - Assert.assertFalse(paths.contains(path)); - paths.add(path); - LOG.info("add file: {}", addedFile.path()); - - long txId = FileNameRules.parseChange(path, snapshot.sequenceNumber()).transactionId(); - minTxIdInSnapshot = Math.min(minTxIdInSnapshot, txId); - maxTxIdInSnapshot = Math.max(maxTxIdInSnapshot, txId); - } - Assert.assertTrue(maxTxId <= minTxIdInSnapshot); - - maxTxId = maxTxIdInSnapshot; - } - - Assert.assertEquals(exceptedSize, TestMixedFormatSource.tableRecords(keyedTable).size()); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAdaptHiveWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAdaptHiveWriter.java deleted file mode 100644 index 057765df1c..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAdaptHiveWriter.java +++ /dev/null @@ -1,330 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write; - -import static org.apache.amoro.table.TableProperties.FILE_FORMAT_ORC; - -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.CatalogTestHelper; -import org.apache.amoro.catalog.TableTestBase; -import org.apache.amoro.flink.read.AdaptHiveFlinkParquetReaders; -import org.apache.amoro.hive.TestHMS; -import org.apache.amoro.hive.catalog.HiveCatalogTestHelper; -import org.apache.amoro.hive.catalog.HiveTableTestHelper; -import org.apache.amoro.hive.table.HiveLocationKind; -import org.apache.amoro.shade.guava32.com.google.common.collect.Iterators; -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.amoro.table.BaseLocationKind; -import org.apache.amoro.table.ChangeLocationKind; -import org.apache.amoro.table.LocationKind; -import org.apache.amoro.table.MixedTable; -import org.apache.amoro.table.WriteOperationKind; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.data.FlinkOrcReader; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.orc.ORC; -import 
org.apache.iceberg.parquet.AdaptHiveParquet; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.ClassRule; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -import java.io.IOException; -import java.math.BigDecimal; -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -@RunWith(Parameterized.class) -public class TestAdaptHiveWriter extends TableTestBase { - - @ClassRule public static TestHMS TEST_HMS = new TestHMS(); - - public TestAdaptHiveWriter(CatalogTestHelper catalogTestHelper, TableTestHelper tableTestHelper) { - super(catalogTestHelper, tableTestHelper); - } - - @Parameterized.Parameters(name = "{0}, {1}") - public static Object[] parameters() { - return new Object[][] { - { - new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), - new HiveTableTestHelper(true, true) - }, - { - new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), - new HiveTableTestHelper(true, false) - }, - { - new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), - new HiveTableTestHelper(false, true) - }, - { - new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), - new HiveTableTestHelper(false, false) - }, - { - new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), - new HiveTableTestHelper(true, true, FILE_FORMAT_ORC) - }, - { - new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), - new HiveTableTestHelper(true, false, FILE_FORMAT_ORC) - }, - { - new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), - new HiveTableTestHelper(false, true, FILE_FORMAT_ORC) - }, - { - new HiveCatalogTestHelper(TableFormat.MIXED_HIVE, TEST_HMS.getHiveConf()), - new HiveTableTestHelper(false, false, 
FILE_FORMAT_ORC) - } - }; - } - - @Test - public void testKeyedTableWriteTypeFromOperateKind() { - Assume.assumeTrue(isKeyedTable()); - MixedTable testKeyedHiveTable = getMixedTable(); - FlinkTaskWriterBuilder builder = - FlinkTaskWriterBuilder.buildFor(testKeyedHiveTable) - .withFlinkSchema(FlinkSchemaUtil.convert(testKeyedHiveTable.schema())); - - Assert.assertTrue( - builder.buildWriter(ChangeLocationKind.INSTANT) instanceof FlinkChangeTaskWriter); - Assert.assertTrue(builder.buildWriter(BaseLocationKind.INSTANT) instanceof FlinkBaseTaskWriter); - Assert.assertTrue(builder.buildWriter(HiveLocationKind.INSTANT) instanceof FlinkBaseTaskWriter); - - Assert.assertTrue( - builder.buildWriter(WriteOperationKind.APPEND) instanceof FlinkChangeTaskWriter); - Assert.assertTrue( - builder.buildWriter(WriteOperationKind.OVERWRITE) instanceof FlinkBaseTaskWriter); - Assert.assertTrue( - builder.buildWriter(WriteOperationKind.MINOR_OPTIMIZE) instanceof FlinkBaseTaskWriter); - Assert.assertTrue( - builder.buildWriter(WriteOperationKind.MAJOR_OPTIMIZE) instanceof FlinkBaseTaskWriter); - Assert.assertTrue( - builder.buildWriter(WriteOperationKind.FULL_OPTIMIZE) instanceof FlinkBaseTaskWriter); - } - - @Test - public void testUnKeyedTableWriteTypeFromOperateKind() { - Assume.assumeFalse(isKeyedTable()); - MixedTable testHiveTable = getMixedTable(); - FlinkTaskWriterBuilder builder = - FlinkTaskWriterBuilder.buildFor(testHiveTable) - .withFlinkSchema(FlinkSchemaUtil.convert(testHiveTable.schema())); - - Assert.assertTrue(builder.buildWriter(BaseLocationKind.INSTANT) instanceof FlinkBaseTaskWriter); - Assert.assertTrue(builder.buildWriter(HiveLocationKind.INSTANT) instanceof FlinkBaseTaskWriter); - - Assert.assertTrue( - builder.buildWriter(WriteOperationKind.APPEND) instanceof FlinkBaseTaskWriter); - Assert.assertTrue( - builder.buildWriter(WriteOperationKind.OVERWRITE) instanceof FlinkBaseTaskWriter); - Assert.assertTrue( - builder.buildWriter(WriteOperationKind.MAJOR_OPTIMIZE) 
instanceof FlinkBaseTaskWriter); - Assert.assertTrue( - builder.buildWriter(WriteOperationKind.FULL_OPTIMIZE) instanceof FlinkBaseTaskWriter); - } - - @Test - public void testKeyedTableChangeWriteByLocationKind() throws IOException { - Assume.assumeTrue(isKeyedTable()); - Assume.assumeTrue(isPartitionedTable()); - testWrite(getMixedTable(), ChangeLocationKind.INSTANT, geneRowData(), "change"); - } - - @Test - public void testKeyedTableBaseWriteByLocationKind() throws IOException { - Assume.assumeTrue(isKeyedTable()); - Assume.assumeTrue(isPartitionedTable()); - testWrite(getMixedTable(), BaseLocationKind.INSTANT, geneRowData(), "base"); - } - - @Test - public void testKeyedTableHiveWriteByLocationKind() throws IOException { - Assume.assumeTrue(isKeyedTable()); - Assume.assumeTrue(isPartitionedTable()); - testWrite(getMixedTable(), HiveLocationKind.INSTANT, geneRowData(), "hive"); - } - - @Test - public void testUnPartitionKeyedTableChangeWriteByLocationKind() throws IOException { - Assume.assumeTrue(isKeyedTable()); - Assume.assumeFalse(isPartitionedTable()); - testWrite(getMixedTable(), ChangeLocationKind.INSTANT, geneRowData(), "change"); - } - - @Test - public void testUnPartitionKeyedTableBaseWriteByLocationKind() throws IOException { - Assume.assumeTrue(isKeyedTable()); - Assume.assumeFalse(isPartitionedTable()); - testWrite(getMixedTable(), BaseLocationKind.INSTANT, geneRowData(), "base"); - } - - @Test - public void testUnPartitionKeyedTableHiveWriteByLocationKind() throws IOException { - Assume.assumeTrue(isKeyedTable()); - Assume.assumeFalse(isPartitionedTable()); - testWrite(getMixedTable(), HiveLocationKind.INSTANT, geneRowData(), "hive"); - } - - @Test - public void testUnKeyedTableChangeWriteByLocationKind() throws IOException { - Assume.assumeFalse(isKeyedTable()); - Assume.assumeTrue(isPartitionedTable()); - try { - testWrite(getMixedTable(), ChangeLocationKind.INSTANT, geneRowData(), "change"); - } catch (Exception e) { - Assert.assertTrue(e 
instanceof IllegalArgumentException); - } - } - - @Test - public void testUnKeyedTableBaseWriteByLocationKind() throws IOException { - Assume.assumeFalse(isKeyedTable()); - Assume.assumeTrue(isPartitionedTable()); - testWrite(getMixedTable(), BaseLocationKind.INSTANT, geneRowData(), "base"); - } - - @Test - public void testUnKeyedTableHiveWriteByLocationKind() throws IOException { - Assume.assumeFalse(isKeyedTable()); - Assume.assumeTrue(isPartitionedTable()); - testWrite(getMixedTable(), HiveLocationKind.INSTANT, geneRowData(), "hive"); - } - - @Test - public void testUnPartitionUnKeyedTableChangeWriteByLocationKind() throws IOException { - Assume.assumeFalse(isKeyedTable()); - Assume.assumeFalse(isPartitionedTable()); - try { - testWrite(getMixedTable(), ChangeLocationKind.INSTANT, geneRowData(), "change"); - } catch (Exception e) { - Assert.assertTrue(e instanceof IllegalArgumentException); - } - } - - @Test - public void testUnPartitionUnKeyedTableBaseWriteByLocationKind() throws IOException { - Assume.assumeFalse(isKeyedTable()); - Assume.assumeFalse(isPartitionedTable()); - testWrite(getMixedTable(), BaseLocationKind.INSTANT, geneRowData(), "base"); - } - - @Test - public void testUnPartitionUnKeyedTableHiveWriteByLocationKind() throws IOException { - Assume.assumeFalse(isKeyedTable()); - Assume.assumeFalse(isPartitionedTable()); - testWrite(getMixedTable(), HiveLocationKind.INSTANT, geneRowData(), "hive"); - } - - public void testWrite( - MixedTable table, LocationKind locationKind, List records, String pathFeature) - throws IOException { - FlinkTaskWriterBuilder builder = - FlinkTaskWriterBuilder.buildFor(table) - .withFlinkSchema(FlinkSchemaUtil.convert(table.schema())); - - TaskWriter changeWrite = builder.buildWriter(locationKind); - for (RowData record : records) { - changeWrite.write(record); - } - WriteResult complete = changeWrite.complete(); - Arrays.stream(complete.dataFiles()) - .forEach(s -> 
Assert.assertTrue(s.path().toString().contains(pathFeature))); - CloseableIterable concat = - CloseableIterable.concat( - Arrays.stream(complete.dataFiles()) - .map( - s -> { - switch (s.format()) { - case PARQUET: - return readParquet(table.schema(), s.path().toString()); - case ORC: - return readOrc(table.schema(), s.path().toString()); - default: - throw new UnsupportedOperationException( - "Cannot read unknown format: " + s.format()); - } - }) - .collect(Collectors.toList())); - Set result = new HashSet<>(); - Iterators.addAll(result, concat.iterator()); - Assert.assertEquals(result, records.stream().collect(Collectors.toSet())); - } - - private CloseableIterable readParquet(Schema schema, String path) { - AdaptHiveParquet.ReadBuilder builder = - AdaptHiveParquet.read(Files.localInput(path)) - .project(schema) - .createReaderFunc( - fileSchema -> - AdaptHiveFlinkParquetReaders.buildReader(schema, fileSchema, new HashMap<>())) - .caseSensitive(false); - - CloseableIterable iterable = builder.build(); - return iterable; - } - - private CloseableIterable readOrc(Schema schema, String path) { - ORC.ReadBuilder builder = - ORC.read(Files.localInput(path)) - .project(schema) - .createReaderFunc(fileSchema -> new FlinkOrcReader(schema, fileSchema, new HashMap<>())) - .caseSensitive(false); - - CloseableIterable iterable = builder.build(); - return iterable; - } - - private List geneRowData() { - return Lists.newArrayList(geneRowData(1, "lily", 0, "2022-01-02T12:00:00")); - } - - private RowData geneRowData(int id, String name, long ts, String timestamp) { - DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss"); - return GenericRowData.of( - id, - StringData.fromString(name), - ts, - TimestampData.fromLocalDateTime(LocalDateTime.parse(timestamp, formatter)), - TimestampData.fromLocalDateTime(LocalDateTime.parse(timestamp, formatter)), - DecimalData.fromBigDecimal(new BigDecimal("0"), 10, 0), - StringData.fromString(timestamp.substring(0, 
10))); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAutomaticDoubleWriteStatus.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAutomaticDoubleWriteStatus.java deleted file mode 100644 index fbdf6a4b7e..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAutomaticDoubleWriteStatus.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.write; - -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.AUTO_EMIT_LOGSTORE_WATERMARK_GAP; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.LOG_STORE_CATCH_UP; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.table.MixedTable; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.iceberg.UpdateProperties; -import org.junit.Assert; -import org.junit.Test; - -import java.time.Duration; - -public class TestAutomaticDoubleWriteStatus extends FlinkTestBase { - public MixedFormatTableLoader tableLoader; - - public TestAutomaticDoubleWriteStatus() { - super( - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, true)); - } - - @Test - public void testTableProperties() { - tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); - tableLoader.open(); - MixedTable mixedTable = tableLoader.loadMixedFormatTable(); - UpdateProperties up = mixedTable.updateProperties(); - up.set(AUTO_EMIT_LOGSTORE_WATERMARK_GAP.key(), "10"); - up.commit(); - AutomaticDoubleWriteStatus status = - new AutomaticDoubleWriteStatus(tableLoader, Duration.ofSeconds(10)); - status.open(); - - Assert.assertFalse(status.isDoubleWrite()); - status.processWatermark(new Watermark(System.currentTimeMillis() - 11 * 1000)); - Assert.assertFalse(status.isDoubleWrite()); - Assert.assertFalse(Boolean.parseBoolean(mixedTable.properties().get(LOG_STORE_CATCH_UP.key()))); - status.processWatermark(new Watermark(System.currentTimeMillis() - 9 * 1000)); - Assert.assertTrue(status.isDoubleWrite()); - Assert.assertTrue(status.isDoubleWrite()); - - mixedTable.refresh(); - 
Assert.assertTrue(Boolean.parseBoolean(mixedTable.properties().get(LOG_STORE_CATCH_UP.key()))); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAutomaticLogWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAutomaticLogWriter.java deleted file mode 100644 index 37150e3563..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestAutomaticLogWriter.java +++ /dev/null @@ -1,429 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.write; - -import static org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate.getPropertiesWithByteArray; -import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.KAFKA_CONTAINER; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.AUTO_EMIT_LOGSTORE_WATERMARK_GAP; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.LOG_STORE_CATCH_UP; -import static org.apache.amoro.table.TableProperties.ENABLE_LOG_STORE; -import static org.apache.amoro.table.TableProperties.LOG_STORE_ADDRESS; -import static org.apache.amoro.table.TableProperties.LOG_STORE_MESSAGE_TOPIC; -import static org.apache.kafka.clients.CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate; -import org.apache.amoro.flink.kafka.testutils.KafkaContainerTest; -import org.apache.amoro.flink.metric.MetricsGenerator; -import org.apache.amoro.flink.shuffle.LogRecordV1; -import org.apache.amoro.flink.shuffle.ShuffleHelper; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.util.DataUtil; -import org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.flink.util.TestGlobalAggregateManager; -import org.apache.amoro.flink.util.TestOneInputStreamOperatorIntern; -import org.apache.amoro.flink.write.hidden.kafka.HiddenKafkaFactory; -import org.apache.amoro.io.MixedDataTestHelpers; -import org.apache.amoro.log.LogDataJsonDeserialization; -import org.apache.amoro.table.KeyedTable; -import org.apache.amoro.utils.IdGenerator; -import org.apache.flink.streaming.api.CheckpointingMode; -import org.apache.flink.streaming.api.TimeCharacteristic; -import 
org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.CheckpointConfig; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.Schema; -import org.apache.iceberg.UpdateProperties; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.io.WriteResult; -import org.apache.iceberg.types.TypeUtil; -import org.apache.kafka.clients.consumer.ConsumerRecords; -import org.apache.kafka.clients.producer.ProducerConfig; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; -import org.junit.jupiter.api.Assertions; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.time.Duration; -import java.time.LocalDateTime; -import java.time.ZoneOffset; -import java.time.format.DateTimeFormatter; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Comparator; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Optional; -import java.util.Properties; -import java.util.Set; -import java.util.stream.Collectors; - -@RunWith(Parameterized.class) -public class TestAutomaticLogWriter extends FlinkTestBase { - private static final Logger LOG = LoggerFactory.getLogger(TestAutomaticLogWriter.class); - public MixedFormatTableLoader tableLoader; - public static final TestGlobalAggregateManager GLOBAL_AGGREGATE_MANGER = - new TestGlobalAggregateManager(); - - private final boolean isGapNone; - private final boolean logstoreEnabled; - - public 
TestAutomaticLogWriter(boolean isGapNone, boolean logstoreEnabled) { - super( - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, true)); - this.isGapNone = isGapNone; - this.logstoreEnabled = logstoreEnabled; - } - - @Parameterized.Parameters(name = "isGapNone={0}, logstoreEnabled={1}") - public static Object[][] parameters() { - return new Object[][] { - {true, true}, - {false, false}, - {false, true}, - {true, false} - }; - } - - @BeforeClass - public static void prepare() throws Exception { - KAFKA_CONTAINER.start(); - } - - @AfterClass - public static void shutdown() throws Exception { - KAFKA_CONTAINER.close(); - } - - @Before - public void init() { - tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); - tableLoader.open(); - } - - @Test - public void testHasCaughtUp() throws Exception { - String topic = - Thread.currentThread().getStackTrace()[1].getMethodName() + isGapNone + logstoreEnabled; - - final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - - env.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE); - env.getCheckpointConfig() - .enableExternalizedCheckpoints( - CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); - env.getConfig().setAutoWatermarkInterval(10); - - List expects = new LinkedList<>(); - DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); - expects.add( - new Object[] { - 1000004, - "a", - LocalDateTime.parse("2022-06-17 10:10:11", dtf).toEpochSecond(ZoneOffset.UTC), - LocalDateTime.parse("2022-06-17 10:10:11", dtf) - }); - expects.add( - new Object[] { - 1000015, - "b", - LocalDateTime.parse("2022-06-17 10:08:11", dtf).toEpochSecond(ZoneOffset.UTC), - LocalDateTime.parse("2022-06-17 10:08:11", dtf) - }); - expects.add( - new Object[] { - 1000011, - "c", - LocalDateTime.parse("2022-06-18 10:10:11", dtf).toEpochSecond(ZoneOffset.UTC), - LocalDateTime.parse("2022-06-18 
10:10:11", dtf) - }); - List catchUpExpects = new LinkedList<>(); - catchUpExpects.add( - new Object[] { - 1000014, - "d", - LocalDateTime.now().minusSeconds(3).toEpochSecond(ZoneOffset.UTC), - LocalDateTime.now().minusSeconds(3) - }); - catchUpExpects.add( - new Object[] { - 1000021, - "d", - LocalDateTime.now().minusSeconds(2).toEpochSecond(ZoneOffset.UTC), - LocalDateTime.now().minusSeconds(2) - }); - catchUpExpects.add( - new Object[] { - 1000015, - "e", - LocalDateTime.now().minusSeconds(1).toEpochSecond(ZoneOffset.UTC), - LocalDateTime.now().minusSeconds(1) - }); - expects.addAll(catchUpExpects); - - DataStream input = - env.fromElements(expects.stream().map(DataUtil::toRowData).toArray(RowData[]::new)); - - KeyedTable testKeyedTable = getMixedTable().asKeyedTable(); - UpdateProperties up = testKeyedTable.updateProperties(); - up.set(LOG_STORE_ADDRESS, KAFKA_CONTAINER.getBootstrapServers()); - up.set(LOG_STORE_MESSAGE_TOPIC, topic); - if (logstoreEnabled) { - up.set(ENABLE_LOG_STORE, "true"); - } else { - up.set(ENABLE_LOG_STORE, "false"); - } - up.set(LOG_STORE_CATCH_UP.key(), "true"); - up.commit(); - - FlinkSink.forRowData(input) - .context(Optional::of) - .table(testKeyedTable) - .tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) - .flinkSchema(FLINK_SCHEMA) - .producerConfig(getPropertiesByTopic(topic)) - .topic(topic) - .build(); - - env.execute(); - - testKeyedTable.changeTable().refresh(); - List actual = MixedDataTestHelpers.readKeyedTable(testKeyedTable, null); - - Set expected = toRecords(DataUtil.toRowSet(expects)); - Assert.assertEquals(expected, new HashSet<>(actual)); - if (logstoreEnabled) { - checkLogstoreDataAccuracy(topic, expects); - } else { - checkLogstoreDataAccuracy(topic, new ArrayList<>()); - } - } - - @Test - public void testHasNotCaughtUp() throws Exception { - String topic = - Thread.currentThread().getStackTrace()[1].getMethodName() + isGapNone + logstoreEnabled; - byte[] jobId = 
IdGenerator.generateUpstreamId(); - Duration gap; - KeyedTable testKeyedTable = getMixedTable().asKeyedTable(); - UpdateProperties up = testKeyedTable.updateProperties(); - up.set(LOG_STORE_ADDRESS, KAFKA_CONTAINER.getBootstrapServers()); - up.set(LOG_STORE_MESSAGE_TOPIC, topic); - up.set(ENABLE_LOG_STORE, "true"); - if (!isGapNone) { - up.set(AUTO_EMIT_LOGSTORE_WATERMARK_GAP.key(), "20"); - } - up.commit(); - - if (isGapNone) { - gap = null; - } else { - gap = Duration.ofSeconds(20); - } - - List expects = new LinkedList<>(); - List results; - testKeyedTable.refresh(); - Assert.assertFalse( - Boolean.parseBoolean( - testKeyedTable.properties().getOrDefault(LOG_STORE_CATCH_UP.key(), "false"))); - try (TestOneInputStreamOperatorIntern harness = - createSingleProducer(1, jobId, topic, gap)) { - DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); - expects.add( - new Object[] { - 1000004, - "a", - LocalDateTime.parse("2022-06-17 10:10:11", dtf).toEpochSecond(ZoneOffset.UTC), - LocalDateTime.parse("2022-06-17 10:10:11", dtf) - }); - expects.add( - new Object[] { - 1000015, - "b", - LocalDateTime.parse("2022-06-17 10:18:11", dtf).toEpochSecond(ZoneOffset.UTC), - LocalDateTime.parse("2022-06-17 10:18:11", dtf) - }); - expects.add( - new Object[] { - 1000011, - "c", - LocalDateTime.parse("2022-06-18 10:10:11", dtf).toEpochSecond(ZoneOffset.UTC), - LocalDateTime.parse("2022-06-18 10:10:11", dtf) - }); - long checkpoint = 0; - - harness.setup(); - harness.initializeEmptyState(); - harness.open(); - harness.processElement(new StreamRecord<>(createRowData(RowKind.INSERT, expects.get(0)))); - harness.processWatermark(1); - harness.prepareSnapshotPreBarrier(++checkpoint); - harness.snapshot(1, 1); - harness.notifyOfCompletedCheckpoint(checkpoint); - harness.processElement(new StreamRecord<>(createRowData(RowKind.INSERT, expects.get(1)))); - harness.processWatermark(System.currentTimeMillis() - 1000); - harness.prepareSnapshotPreBarrier(++checkpoint); - 
harness.snapshot(2, 1); - harness.notifyOfCompletedCheckpoint(checkpoint); - harness.processElement(new StreamRecord<>(createRowData(RowKind.INSERT, expects.get(2)))); - harness.processWatermark(System.currentTimeMillis()); - harness.prepareSnapshotPreBarrier(++checkpoint); - harness.snapshot(3, 1); - harness.notifyOfCompletedCheckpoint(checkpoint); - - results = harness.extractOutputValues(); - } catch (Throwable e) { - LOG.error("", e); - throw e; - } - - // check expects accuracy. - Assert.assertEquals(3, results.size()); - results.forEach(result -> Assert.assertEquals(1, result.dataFiles().length)); - List expected = isGapNone ? expects : expects.subList(2, expects.size()); - checkLogstoreDataAccuracy(topic, expected); - testKeyedTable.refresh(); - if (!isGapNone) { - Assert.assertTrue( - Boolean.parseBoolean(testKeyedTable.properties().get(LOG_STORE_CATCH_UP.key()))); - } - } - - private void checkLogstoreDataAccuracy(String topic, List expects) { - LogDataJsonDeserialization logDataJsonDeserialization = - new LogDataJsonDeserialization<>( - TABLE_SCHEMA, LogRecordV1.factory, LogRecordV1.arrayFactory, LogRecordV1.mapFactory); - ConsumerRecords consumerRecords = KafkaContainerTest.readRecordsBytes(topic); - Assertions.assertEquals(expects.size(), consumerRecords.count()); - List actual = new ArrayList<>(); - consumerRecords.forEach( - consumerRecord -> { - try { - actual.add( - logDataJsonDeserialization.deserialize(consumerRecord.value()).getActualValue()); - } catch (IOException e) { - e.printStackTrace(); - } - }); - Collection expected = DataUtil.toRowData(expects); - Assertions.assertEquals( - expected.stream() - .sorted(Comparator.comparing(RowData::toString)) - .collect(Collectors.toList()), - actual.stream() - .sorted(Comparator.comparing(RowData::toString)) - .collect(Collectors.toList())); - } - - public TestOneInputStreamOperatorIntern createSingleProducer( - int maxParallelism, byte[] jobId, String topic, Duration writeLogstoreWatermarkGap) - throws 
Exception { - return createProducer( - maxParallelism, - maxParallelism, - 0, - null, - jobId, - GLOBAL_AGGREGATE_MANGER, - topic, - writeLogstoreWatermarkGap); - } - - private TestOneInputStreamOperatorIntern createProducer( - int maxParallelism, - int parallelism, - int subTaskId, - Long restoredCheckpointId, - byte[] jobId, - TestGlobalAggregateManager testGlobalAggregateManager, - String topic, - Duration writeLogstoreWatermarkGap) - throws Exception { - AutomaticLogWriter automaticLogWriter = - new AutomaticLogWriter( - TABLE_SCHEMA, - getPropertiesByTopic(topic), - topic, - new HiddenKafkaFactory<>(), - LogRecordV1.FIELD_GETTER_FACTORY, - jobId, - ShuffleHelper.EMPTY, - tableLoader, - writeLogstoreWatermarkGap); - - KeyedTable testKeyedTable = getMixedTable().asKeyedTable(); - RowType flinkSchemaRowType = (RowType) FLINK_SCHEMA.toRowDataType().getLogicalType(); - Schema writeSchema = - TypeUtil.reassignIds(FlinkSchemaUtil.convert(FLINK_SCHEMA), testKeyedTable.schema()); - MetricsGenerator metricsGenerator = - MixedFormatUtils.getMetricsGenerator( - false, false, testKeyedTable, flinkSchemaRowType, writeSchema); - - MixedFormatFileWriter streamWriter = - FlinkSink.createFileWriter( - testKeyedTable, - null, - false, - (RowType) FLINK_SCHEMA.toRowDataType().getLogicalType(), - tableLoader); - - MixedFormatWriter mixedFormatWriter = - new MixedFormatWriter<>(automaticLogWriter, streamWriter, metricsGenerator); - - TestOneInputStreamOperatorIntern harness = - new TestOneInputStreamOperatorIntern<>( - mixedFormatWriter, - maxParallelism, - parallelism, - subTaskId, - restoredCheckpointId, - testGlobalAggregateManager); - harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime); - return harness; - } - - private static Properties getPropertiesByTopic(String topic) { - Properties properties = new Properties(); - properties.put(BOOTSTRAP_SERVERS_CONFIG, KAFKA_CONTAINER.getBootstrapServers()); - properties = 
getPropertiesWithByteArray(KafkaConfigGenerate.getStandardProperties(properties)); - properties.put(LOG_STORE_MESSAGE_TOPIC, topic); - properties.put(ProducerConfig.ACKS_CONFIG, "all"); - properties.put(ProducerConfig.BATCH_SIZE_CONFIG, "0"); - return properties; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestFlinkSink.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestFlinkSink.java deleted file mode 100644 index 226d721155..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestFlinkSink.java +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.write; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.util.DataUtil; -import org.apache.amoro.io.MixedDataTestHelpers; -import org.apache.amoro.table.KeyedTable; -import org.apache.amoro.table.UnkeyedTable; -import org.apache.flink.streaming.api.CheckpointingMode; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.environment.CheckpointConfig; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.table.data.RowData; -import org.apache.iceberg.data.Record; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -import java.time.LocalDateTime; -import java.time.ZoneOffset; -import java.util.Arrays; -import java.util.Collection; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Optional; -import java.util.Set; - -@RunWith(Parameterized.class) -public class TestFlinkSink extends FlinkTestBase { - - public TestFlinkSink(boolean isKeyed) { - super( - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(isKeyed, false)); - } - - @Parameterized.Parameters(name = "{0}") - public static Collection parameters() { - return Arrays.asList(new Object[][] {{true}, {false}}); - } - - @Test - public void testKeyedSink() throws Exception { - Assume.assumeTrue(isKeyedTable()); - final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - KeyedTable testKeyedTable = getMixedTable().asKeyedTable(); - - env.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE); - 
env.getCheckpointConfig() - .enableExternalizedCheckpoints( - CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); - - List data = new LinkedList<>(); - data.add( - new Object[] { - 1000004, - "a", - LocalDateTime.parse("2022-06-17T10:10:11.0").toEpochSecond(ZoneOffset.UTC), - LocalDateTime.parse("2022-06-17T10:10:11.0") - }); - data.add( - new Object[] { - 1000015, - "b", - LocalDateTime.parse("2022-06-17T10:08:11.0").toEpochSecond(ZoneOffset.UTC), - LocalDateTime.parse("2022-06-17T10:08:11.0") - }); - data.add( - new Object[] { - 1000011, - "c", - LocalDateTime.parse("2022-06-18T10:10:11.0").toEpochSecond(ZoneOffset.UTC), - LocalDateTime.parse("2022-06-18T10:10:11.0") - }); - data.add( - new Object[] { - 1000014, - "d", - LocalDateTime.parse("2022-06-17T10:11:11.0").toEpochSecond(ZoneOffset.UTC), - LocalDateTime.parse("2022-06-17T10:11:11.0") - }); - data.add( - new Object[] { - 1000021, - "d", - LocalDateTime.parse("2022-06-17T16:10:11.0").toEpochSecond(ZoneOffset.UTC), - LocalDateTime.parse("2022-06-17T16:10:11.0") - }); - data.add( - new Object[] { - 1000015, - "e", - LocalDateTime.parse("2022-06-17T10:10:11.0").toEpochSecond(ZoneOffset.UTC), - LocalDateTime.parse("2022-06-17T10:10:11.0") - }); - - DataStream input = - env.fromElements(data.stream().map(DataUtil::toRowData).toArray(RowData[]::new)); - - FlinkSink.forRowData(input) - .context(Optional::of) - .table(testKeyedTable) - .tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) - .flinkSchema(FLINK_SCHEMA) - .build(); - - env.execute(); - - testKeyedTable.changeTable().refresh(); - List actual = MixedDataTestHelpers.readKeyedTable(testKeyedTable, null); - - Set expected = toRecords(DataUtil.toRowSet(data)); - Assert.assertEquals(expected, new HashSet<>(actual)); - } - - @Test - public void testUnkeyedSink() throws Exception { - Assume.assumeFalse(isKeyedTable()); - final StreamExecutionEnvironment env = 
StreamExecutionEnvironment.getExecutionEnvironment(); - UnkeyedTable testTable = getMixedTable().asUnkeyedTable(); - - env.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE); - env.getCheckpointConfig() - .enableExternalizedCheckpoints( - CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); - - List data = new LinkedList<>(); - data.add( - new Object[] {1000004, "a", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add( - new Object[] {1000015, "b", 1655513411000L, LocalDateTime.parse("2022-06-17T10:08:11.0")}); - data.add( - new Object[] {1000011, "c", 1655599811000L, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add( - new Object[] {1000014, "d", 1655513411000L, LocalDateTime.parse("2022-06-17T10:11:11.0")}); - data.add( - new Object[] {1000021, "d", 1655513411000L, LocalDateTime.parse("2022-06-17T16:10:11.0")}); - data.add( - new Object[] {1000015, "e", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - - DataStream input = - env.fromElements(data.stream().map(DataUtil::toRowData).toArray(RowData[]::new)); - - FlinkSink.forRowData(input) - .context(Optional::of) - .table(testTable) - .tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) - .flinkSchema(FLINK_SCHEMA) - .build(); - - env.execute(); - testTable.refresh(); - Set actual = DataUtil.read(testTable); - - Set expected = toRecords(DataUtil.toRowSet(data)); - Assert.assertEquals(expected, actual); - } - - @Test - public void testUnkeyedOverwrite() throws Exception { - Assume.assumeFalse(isKeyedTable()); - final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - UnkeyedTable testTable = getMixedTable().asUnkeyedTable(); - - env.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE); - env.getCheckpointConfig() - .enableExternalizedCheckpoints( - CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); - - List data = new LinkedList<>(); - data.add( - 
new Object[] {1000004, "a", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add( - new Object[] {1000015, "b", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add( - new Object[] {1000011, "c", 1655599811000L, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add( - new Object[] {1000014, "d", 1655599811000L, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add( - new Object[] {1000021, "d", 1655599811000L, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add( - new Object[] {1000015, "e", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - - DataStream input = - env.fromElements(data.stream().map(DataUtil::toRowData).toArray(RowData[]::new)); - - FlinkSink.forRowData(input) - .context(Optional::of) - .table(testTable) - .tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) - .flinkSchema(FLINK_SCHEMA) - .build(); - env.execute(); - - data.clear(); - data.add(new Object[] {12, "d", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {11, "a", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {15, "c", 1655599811000L, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {21, "k", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - data.add(new Object[] {91, "l", 1655599811000L, LocalDateTime.parse("2022-06-18T10:10:11.0")}); - data.add(new Object[] {74, "m", 1655513411000L, LocalDateTime.parse("2022-06-17T10:10:11.0")}); - - DataStream overwrite = - env.fromElements(data.stream().map(DataUtil::toRowData).toArray(RowData[]::new)); - - FlinkSink.forRowData(overwrite) - .context(Optional::of) - .table(testTable) - .tableLoader(MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder)) - .overwrite(true) - .flinkSchema(FLINK_SCHEMA) - .build(); - - env.execute(); - testTable.refresh(); - Set actual = DataUtil.read(testTable); - - Set expected 
= toRecords(DataUtil.toRowSet(data)); - Assert.assertEquals(expected, actual); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestMixedFormatFileCommitter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestMixedFormatFileCommitter.java deleted file mode 100644 index 6b3b572e33..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestMixedFormatFileCommitter.java +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.write; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.util.MixedFormatUtils; -import org.apache.amoro.table.KeyedTable; -import org.apache.amoro.table.MixedTable; -import org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.TableScan; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.WriteResult; -import org.junit.Assert; -import org.junit.Test; - -import java.util.List; - -public class TestMixedFormatFileCommitter extends FlinkTestBase { - public MixedFormatTableLoader tableLoader; - - public TestMixedFormatFileCommitter() { - super( - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(true, true)); - } - - public OneInputStreamOperatorTestHarness createMixedFormatFileCommitter( - MixedFormatTableLoader tableLoader, - MixedTable table, - OperatorSubtaskState operatorSubtaskState) - throws Exception { - OneInputStreamOperator committer = - FlinkSink.createFileCommitter( - table, tableLoader, false, SnapshotRef.MAIN_BRANCH, table.spec()); - OneInputStreamOperatorTestHarness harness = - new OneInputStreamOperatorTestHarness<>(committer, 1, 1, 0); - - harness.setup(); - if (operatorSubtaskState == null) { - harness.initializeEmptyState(); - } else { - 
harness.initializeState(operatorSubtaskState); - } - harness.open(); - - return harness; - } - - public void checkChangeFiles(int fileCnt, int recordCnt, KeyedTable table) { - table.changeTable().refresh(); - TableScan tableScan = table.changeTable().newScan(); - CloseableIterable fileScanTasks = tableScan.planFiles(); - int actualFileCnt = 0; - int actualRecordCnt = 0; - for (FileScanTask fileScanTask : fileScanTasks) { - actualFileCnt++; - actualRecordCnt += fileScanTask.file().recordCount(); - } - Assert.assertEquals(fileCnt, actualFileCnt); - Assert.assertEquals(recordCnt, actualRecordCnt); - } - - @Test - public void testCommit() throws Exception { - tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); - KeyedTable table = MixedFormatUtils.loadMixedTable(tableLoader).asKeyedTable(); - - List completedFiles = prepareChangeFiles(); - OperatorSubtaskState snapshot; - long checkpoint = 1; - try (OneInputStreamOperatorTestHarness testHarness = - createMixedFormatFileCommitter(tableLoader, table, null)) { - - for (WriteResult completedFile : completedFiles) { - testHarness.processElement(new StreamRecord<>(completedFile)); - } - snapshot = testHarness.snapshot(checkpoint, System.currentTimeMillis()); - } - - try (OneInputStreamOperatorTestHarness testHarness = - createMixedFormatFileCommitter(tableLoader, table, snapshot)) { - testHarness.notifyOfCompletedCheckpoint(checkpoint); - } - - checkChangeFiles(7, 9, table); - } - - private List prepareChangeFiles() throws Exception { - List changeFiles; - long checkpointId = 1L; - try (OneInputStreamOperatorTestHarness testHarness = - TestMixedFormatFileWriter.createMixedFormatStreamWriter(tableLoader)) { - // The first checkpoint - testHarness.processElement(createRowData(1, "hello", "2020-10-11T10:10:11.0"), 1); - testHarness.processElement(createRowData(2, "hello", "2020-10-12T10:10:11.0"), 1); - testHarness.processElement(createRowData(3, "hello", "2020-10-13T10:10:11.0"), 1); - - 
testHarness.prepareSnapshotPreBarrier(checkpointId); - Assert.assertEquals(1, testHarness.extractOutputValues().size()); - Assert.assertEquals(3, testHarness.extractOutputValues().get(0).dataFiles().length); - - checkpointId = checkpointId + 1; - - // The second checkpoint - testHarness.processElement(createRowData(1, "hello", "2020-10-12T10:10:11.0"), 1); - testHarness.processElement( - createRowData(2, "hello", "2020-10-12T10:10:11.0", RowKind.UPDATE_BEFORE), 1); - testHarness.processElement( - createRowData(2, "hello0", "2020-10-12T10:10:11.0", RowKind.UPDATE_AFTER), 1); - testHarness.processElement( - createRowData(3, "hello", "2020-10-12T10:10:11.0", RowKind.DELETE), 1); - testHarness.processElement(createRowData(5, "hello", "2020-10-12T10:10:11.0"), 1); - testHarness.processElement(createRowData(6, "hello", "2020-10-12T10:10:11.0"), 1); - - testHarness.prepareSnapshotPreBarrier(checkpointId); - // testHarness.extractOutputValues() compute the sum - Assert.assertEquals(2, testHarness.extractOutputValues().size()); - Assert.assertEquals(4, testHarness.extractOutputValues().get(1).dataFiles().length); - changeFiles = testHarness.extractOutputValues(); - } - return changeFiles; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestMixedFormatFileWriter.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestMixedFormatFileWriter.java deleted file mode 100644 index 889fd74e1b..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/TestMixedFormatFileWriter.java +++ /dev/null @@ -1,327 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write; - -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.SUBMIT_EMPTY_SNAPSHOTS; - -import org.apache.amoro.BasicTableTestHelper; -import org.apache.amoro.TableFormat; -import org.apache.amoro.TableTestHelper; -import org.apache.amoro.catalog.BasicCatalogTestHelper; -import org.apache.amoro.flink.FlinkTestBase; -import org.apache.amoro.flink.table.MixedFormatTableLoader; -import org.apache.amoro.flink.util.TestGlobalAggregateManager; -import org.apache.amoro.flink.util.TestOneInputStreamOperatorIntern; -import org.apache.amoro.table.MixedTable; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.types.RowKind; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory; -import org.apache.iceberg.flink.sink.TaskWriterFactory; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.WriteResult; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -import java.util.HashMap; -import java.util.List; - -@RunWith(Parameterized.class) -public class 
TestMixedFormatFileWriter extends FlinkTestBase { - - public static final long TARGET_FILE_SIZE = 128 * 1024 * 1024; - public MixedFormatTableLoader tableLoader; - private final boolean submitEmptySnapshots; - - @Parameterized.Parameters(name = "{0}, {1}") - public static Object[][] parameters() { - return new Object[][] { - {true, false}, - {true, true}, - {false, false}, - {false, true} - }; - } - - public TestMixedFormatFileWriter(boolean isKeyed, boolean submitEmptySnapshots) { - super( - new BasicCatalogTestHelper(TableFormat.MIXED_ICEBERG), - new BasicTableTestHelper(isKeyed, true)); - this.submitEmptySnapshots = submitEmptySnapshots; - } - - public static OneInputStreamOperatorTestHarness - createMixedFormatStreamWriter(MixedFormatTableLoader tableLoader) throws Exception { - return createMixedFormatStreamWriter(tableLoader, true, null); - } - - public static OneInputStreamOperatorTestHarness - createMixedFormatStreamWriter( - MixedFormatTableLoader tableLoader, - boolean submitEmptySnapshots, - Long restoredCheckpointId) - throws Exception { - OneInputStreamOperatorTestHarness harness = - doCreateMixedFormatStreamWriter(tableLoader, submitEmptySnapshots, restoredCheckpointId); - - harness.setup(); - harness.open(); - - return harness; - } - - public static OneInputStreamOperatorTestHarness - doCreateMixedFormatStreamWriter( - MixedFormatTableLoader tableLoader, - boolean submitEmptySnapshots, - Long restoredCheckpointId) - throws Exception { - tableLoader.open(); - MixedTable mixedTable = tableLoader.loadMixedFormatTable(); - mixedTable.properties().put(SUBMIT_EMPTY_SNAPSHOTS.key(), String.valueOf(submitEmptySnapshots)); - - MixedFormatFileWriter streamWriter = - FlinkSink.createFileWriter( - mixedTable, - null, - false, - (RowType) FLINK_SCHEMA.toRowDataType().getLogicalType(), - tableLoader); - TestOneInputStreamOperatorIntern harness = - new TestOneInputStreamOperatorIntern<>( - streamWriter, 1, 1, 0, restoredCheckpointId, new 
TestGlobalAggregateManager()); - - return harness; - } - - public static TaskWriter createUnkeyedTaskWriter( - Table table, long targetFileSize, FileFormat format, RowType rowType) { - TaskWriterFactory taskWriterFactory = - new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - rowType, - targetFileSize, - format, - new HashMap<>(), - null, - false); - taskWriterFactory.initialize(1, 1); - return taskWriterFactory.create(); - } - - @Test - public void testInsertWrite() throws Exception { - Assume.assumeTrue(isKeyedTable()); - tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); - long checkpointId = 1L; - try (OneInputStreamOperatorTestHarness testHarness = - createMixedFormatStreamWriter(tableLoader)) { - MixedFormatFileWriter fileWriter = (MixedFormatFileWriter) testHarness.getOneInputOperator(); - Assert.assertNotNull(fileWriter.getWriter()); - // The first checkpoint - testHarness.processElement(createRowData(1, "hello", "2020-10-11T10:10:11.0"), 1); - testHarness.processElement(createRowData(2, "hello", "2020-10-12T10:10:11.0"), 1); - testHarness.processElement(createRowData(3, "hello", "2020-10-13T10:10:11.0"), 1); - - testHarness.prepareSnapshotPreBarrier(checkpointId); - Assert.assertNull(fileWriter.getWriter()); - Assert.assertEquals(1, testHarness.extractOutputValues().size()); - Assert.assertEquals(3, testHarness.extractOutputValues().get(0).dataFiles().length); - - checkpointId = checkpointId + 1; - - // The second checkpoint - testHarness.processElement(createRowData(1, "hello", "2020-10-12T10:10:11.0"), 1); - Assert.assertNotNull(fileWriter.getWriter()); - testHarness.processElement(createRowData(2, "hello", "2020-10-12T10:10:11.0"), 1); - testHarness.processElement(createRowData(3, "hello", "2020-10-12T10:10:11.0"), 1); - - testHarness.prepareSnapshotPreBarrier(checkpointId); - // testHarness.extractOutputValues() calculates the cumulative value - List completedFiles = testHarness.extractOutputValues(); 
- Assert.assertEquals(2, completedFiles.size()); - Assert.assertEquals(3, completedFiles.get(1).dataFiles().length); - } - } - - @Test - public void testSnapshotMultipleTimes() throws Exception { - long checkpointId = 1; - long timestamp = 1; - - tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); - try (OneInputStreamOperatorTestHarness testHarness = - createMixedFormatStreamWriter(tableLoader)) { - testHarness.processElement(createRowData(1, "hello", "2020-10-11T10:10:11.0"), timestamp++); - testHarness.processElement(createRowData(2, "hello", "2020-10-12T10:10:11.0"), timestamp); - testHarness.processElement(createRowData(3, "hello", "2020-10-13T10:10:11.0"), timestamp); - - testHarness.prepareSnapshotPreBarrier(checkpointId++); - long expectedDataFiles = 3; - WriteResult result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - Assert.assertEquals(0, result.deleteFiles().length); - Assert.assertEquals(expectedDataFiles, result.dataFiles().length); - - // snapshot again immediately. 
- for (int i = 0; i < 5; i++) { - testHarness.prepareSnapshotPreBarrier(checkpointId++); - - result = WriteResult.builder().addAll(testHarness.extractOutputValues()).build(); - Assert.assertEquals(0, result.deleteFiles().length); - Assert.assertEquals(expectedDataFiles, result.dataFiles().length); - } - } - } - - @Test - public void testInsertWriteWithoutPk() throws Exception { - Assume.assumeFalse(isKeyedTable()); - tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); - long checkpointId = 1L; - try (OneInputStreamOperatorTestHarness testHarness = - createMixedFormatStreamWriter(tableLoader)) { - // The first checkpoint - testHarness.processElement(createRowData(1, "hello", "2020-10-11T10:10:11.0"), 1); - testHarness.processElement(createRowData(2, "hello", "2020-10-12T10:10:11.0"), 1); - testHarness.processElement(createRowData(3, "hello", "2020-10-13T10:10:11.0"), 1); - - testHarness.prepareSnapshotPreBarrier(checkpointId); - Assert.assertEquals(1, testHarness.extractOutputValues().size()); - Assert.assertEquals(3, testHarness.extractOutputValues().get(0).dataFiles().length); - - checkpointId = checkpointId + 1; - - // The second checkpoint - testHarness.processElement(createRowData(1, "hello", "2020-10-12T10:10:11.0"), 1); - testHarness.processElement(createRowData(2, "hello", "2020-10-12T10:10:11.0"), 1); - testHarness.processElement(createRowData(3, "hello", "2020-10-12T10:10:11.0"), 1); - - testHarness.prepareSnapshotPreBarrier(checkpointId); - // testHarness.extractOutputValues() calculates the cumulative value - List completedFiles = testHarness.extractOutputValues(); - Assert.assertEquals(2, completedFiles.size()); - Assert.assertEquals(1, completedFiles.get(1).dataFiles().length); - } - } - - @Test - public void testDeleteWrite() throws Exception { - Assume.assumeTrue(isKeyedTable()); - tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); - long checkpointId = 1L; - try 
(OneInputStreamOperatorTestHarness testHarness = - createMixedFormatStreamWriter(tableLoader)) { - // The first checkpoint - testHarness.processElement( - createRowData(1, "hello", "2020-10-11T10:10:11.0", RowKind.INSERT), 1); - testHarness.processElement( - createRowData(2, "hello", "2020-10-12T10:10:11.0", RowKind.INSERT), 1); - testHarness.processElement( - createRowData(1, "hello", "2020-10-11T10:10:11.0", RowKind.DELETE), 1); - testHarness.processElement( - createRowData(1, "hello", "2020-10-11T10:10:11.0", RowKind.DELETE), 1); - - testHarness.prepareSnapshotPreBarrier(checkpointId); - Assert.assertEquals(1, testHarness.extractOutputValues().size()); - Assert.assertEquals(3, testHarness.extractOutputValues().get(0).dataFiles().length); - - checkpointId = checkpointId + 1; - - // The second checkpoint - testHarness.processElement( - createRowData(1, "hello", "2020-10-12T10:10:11.0", RowKind.INSERT), 1); - testHarness.processElement( - createRowData(2, "hello", "2020-10-12T10:10:11.0", RowKind.DELETE), 1); - testHarness.processElement( - createRowData(3, "hello", "2020-10-12T10:10:11.0", RowKind.DELETE), 1); - - testHarness.prepareSnapshotPreBarrier(checkpointId); - // testHarness.extractOutputValues() calculates the cumulative value - Assert.assertEquals(2, testHarness.extractOutputValues().size()); - Assert.assertEquals(3, testHarness.extractOutputValues().get(1).dataFiles().length); - } - } - - @Test - public void testUpdateWrite() throws Exception { - Assume.assumeTrue(isKeyedTable()); - tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); - long checkpointId = 1L; - try (OneInputStreamOperatorTestHarness testHarness = - createMixedFormatStreamWriter(tableLoader)) { - // The first checkpoint - testHarness.processElement( - createRowData(1, "hello", "2020-10-11T10:10:11.0", RowKind.INSERT), 1); - testHarness.processElement( - createRowData(1, "hello", "2020-10-11T10:10:11.0", RowKind.UPDATE_BEFORE), 1); - 
testHarness.processElement( - createRowData(1, "hi", "2020-10-11T10:10:11.0", RowKind.UPDATE_AFTER), 1); - testHarness.processElement( - createRowData(1, "hello", "2020-10-13T10:10:11.0", RowKind.UPDATE_AFTER), 1); - - testHarness.prepareSnapshotPreBarrier(checkpointId); - Assert.assertEquals(1, testHarness.extractOutputValues().size()); - Assert.assertEquals(3, testHarness.extractOutputValues().get(0).dataFiles().length); - - checkpointId = checkpointId + 1; - - // The second checkpoint - testHarness.processElement( - createRowData(1, "hello", "2020-10-12T10:10:11.0", RowKind.UPDATE_AFTER), 1); - testHarness.processElement(createRowData(2, "h", "2020-10-12T10:10:11.0"), 1); - testHarness.processElement( - createRowData(2, "hello", "2020-10-12T10:10:11.0", RowKind.UPDATE_AFTER), 1); - testHarness.processElement( - createRowData(2, "hello", "2020-10-12T10:10:11.0", RowKind.DELETE), 1); - - testHarness.prepareSnapshotPreBarrier(checkpointId); - // testHarness.extractOutputValues() calculates the cumulative value - Assert.assertEquals(2, testHarness.extractOutputValues().size()); - Assert.assertEquals(3, testHarness.extractOutputValues().get(1).dataFiles().length); - } - } - - @Test - public void testEmitEmptyResults() throws Exception { - Assume.assumeTrue(isKeyedTable()); - tableLoader = MixedFormatTableLoader.of(TableTestHelper.TEST_TABLE_ID, catalogBuilder); - long checkpointId = 1L; - long excepted = submitEmptySnapshots ? 
1 : 0; - try (OneInputStreamOperatorTestHarness testHarness = - createMixedFormatStreamWriter(tableLoader, submitEmptySnapshots, null)) { - // The first checkpoint - - testHarness.prepareSnapshotPreBarrier(checkpointId); - Assert.assertEquals(excepted, testHarness.extractOutputValues().size()); - - checkpointId = checkpointId + 1; - - // The second checkpoint - testHarness.prepareSnapshotPreBarrier(checkpointId); - Assert.assertEquals(excepted, testHarness.extractOutputValues().size()); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestBaseLog.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestBaseLog.java deleted file mode 100644 index 76fc10446d..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestBaseLog.java +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.amoro.flink.write.hidden.kafka; - -import static org.apache.amoro.flink.shuffle.LogRecordV1.arrayFactory; -import static org.apache.amoro.flink.shuffle.LogRecordV1.mapFactory; - -import org.apache.amoro.data.ChangeAction; -import org.apache.amoro.flink.shuffle.LogRecordV1; -import org.apache.amoro.log.FormatVersion; -import org.apache.amoro.log.LogData; -import org.apache.amoro.log.LogDataJsonDeserialization; -import org.apache.amoro.table.PrimaryKeySpec; -import org.apache.amoro.utils.IdGenerator; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.Schema; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.types.Types; - -import java.util.ArrayList; - -public class TestBaseLog { - public static final Schema USER_SCHEMA = - new Schema( - new ArrayList() { - { - add(Types.NestedField.optional(0, "f_boolean", Types.BooleanType.get())); - add(Types.NestedField.optional(1, "f_int", Types.IntegerType.get())); - add(Types.NestedField.optional(2, "f_long", Types.LongType.get())); - add( - Types.NestedField.optional( - 3, - "f_struct", - Types.StructType.of( - Types.NestedField.optional(4, "f_sub_boolean", Types.BooleanType.get()), - Types.NestedField.optional(5, "f_sub_int", Types.IntegerType.get()), - Types.NestedField.optional(6, "f_sub_long", Types.LongType.get()), - Types.NestedField.optional(7, "f_sub_string", Types.StringType.get()), - Types.NestedField.optional(8, "f_sub_time", Types.TimeType.get()), - Types.NestedField.optional( - 9, "f_sub_decimal", Types.DecimalType.of(38, 18)), - Types.NestedField.optional(10, "f_sub_float", Types.FloatType.get()), - Types.NestedField.optional(11, "f_sub_double", Types.DoubleType.get()), - Types.NestedField.optional(12, "f_sub_date", Types.DateType.get()), - Types.NestedField.optional( - 13, "f_sub_timestamp_local", Types.TimestampType.withoutZone()), 
- Types.NestedField.optional( - 14, "f_sub_timestamp_tz", Types.TimestampType.withZone()), - Types.NestedField.optional(15, "f_sub_uuid", Types.UUIDType.get()), - Types.NestedField.optional( - 16, "f_sub_fixed", Types.FixedType.ofLength(18)), - Types.NestedField.optional(17, "f_sub_binary", Types.BinaryType.get()), - Types.NestedField.optional( - 18, - "f_sub_list", - Types.ListType.ofOptional(19, Types.LongType.get())), - Types.NestedField.optional( - 20, - "f_list2", - Types.ListType.ofOptional(21, Types.IntegerType.get())), - Types.NestedField.optional( - 22, - "f_list3", - Types.ListType.ofOptional( - 23, - Types.StructType.of( - Types.NestedField.optional( - 24, "f_sub_boolean", Types.BooleanType.get()), - Types.NestedField.optional( - 25, "f_sub_int", Types.IntegerType.get()), - Types.NestedField.optional( - 26, "f_sub_long", Types.LongType.get())))), - Types.NestedField.optional( - 27, - "f_map", - Types.MapType.ofOptional( - 28, 29, Types.StringType.get(), Types.StringType.get()))))); - } - }); - - public static final Schema USER_SCHEMA_WITH_ALL_DATA_TYPE = - new Schema( - new ArrayList() { - { - add(Types.NestedField.optional(0, "f_boolean", Types.BooleanType.get())); - add(Types.NestedField.optional(1, "f_int", Types.IntegerType.get())); - add(Types.NestedField.optional(2, "f_date", Types.DateType.get())); - add(Types.NestedField.optional(3, "f_long", Types.LongType.get())); - add(Types.NestedField.optional(4, "f_time", Types.TimeType.get())); - add(Types.NestedField.optional(5, "f_float", Types.FloatType.get())); - add(Types.NestedField.optional(6, "f_double", Types.DoubleType.get())); - add( - Types.NestedField.optional( - 7, "f_timestamp_local", Types.TimestampType.withoutZone())); - add(Types.NestedField.optional(8, "f_timestamp_tz", Types.TimestampType.withZone())); - add(Types.NestedField.optional(9, "f_string", Types.StringType.get())); - add(Types.NestedField.optional(10, "f_uuid", Types.UUIDType.get())); - add(Types.NestedField.optional(11, 
"f_fixed", Types.FixedType.ofLength(18))); - add(Types.NestedField.optional(12, "f_binary", Types.BinaryType.get())); - add(Types.NestedField.optional(13, "f_decimal", Types.DecimalType.of(38, 18))); - add( - Types.NestedField.optional( - 14, "f_list", Types.ListType.ofOptional(15, Types.LongType.get()))); - add( - Types.NestedField.optional( - 16, - "f_map", - Types.MapType.ofOptional( - 17, 18, Types.StringType.get(), Types.StringType.get()))); - add( - Types.NestedField.optional( - 19, - "f_struct", - Types.StructType.of( - Types.NestedField.optional(20, "f_sub_boolean", Types.BooleanType.get()), - Types.NestedField.optional(21, "f_sub_int", Types.IntegerType.get()), - Types.NestedField.optional(22, "f_sub_long", Types.LongType.get()), - Types.NestedField.optional(23, "f_sub_string", Types.StringType.get()), - Types.NestedField.optional(24, "f_sub_time", Types.TimeType.get()), - Types.NestedField.optional( - 25, "f_sub_decimal", Types.DecimalType.of(36, 18)), - Types.NestedField.optional(26, "f_sub_float", Types.FloatType.get()), - Types.NestedField.optional(27, "f_sub_double", Types.DoubleType.get()), - Types.NestedField.optional(28, "f_sub_date", Types.DateType.get()), - Types.NestedField.optional( - 29, "f_sub_timestamp_local", Types.TimestampType.withoutZone()), - Types.NestedField.optional( - 30, "f_sub_timestamp_tz", Types.TimestampType.withZone()), - Types.NestedField.optional(31, "f_sub_uuid", Types.UUIDType.get()), - Types.NestedField.optional( - 32, "f_sub_fixed", Types.FixedType.ofLength(18)), - Types.NestedField.optional(33, "f_sub_binary", Types.BinaryType.get()), - Types.NestedField.optional( - 34, - "f_sub_list", - Types.ListType.ofOptional(35, Types.LongType.get())), - Types.NestedField.optional( - 36, - "f_list2", - Types.ListType.ofOptional(37, Types.IntegerType.get())), - Types.NestedField.optional( - 38, - "f_list3", - Types.ListType.ofOptional( - 39, - Types.StructType.of( - Types.NestedField.optional( - 40, "f_sub_boolean", 
Types.BooleanType.get()), - Types.NestedField.optional( - 41, "f_sub_int", Types.IntegerType.get()), - Types.NestedField.optional( - 42, "f_sub_long", Types.LongType.get())))), - Types.NestedField.optional( - 43, - "f_map", - Types.MapType.ofOptional( - 44, 45, Types.StringType.get(), Types.StringType.get()))))); - } - }); - - private final PrimaryKeySpec primaryKeySpec = - PrimaryKeySpec.builderFor(USER_SCHEMA).addColumn(1).build(); - - public final RowType flinkUserSchema = FlinkSchemaUtil.convert(USER_SCHEMA); - - public final LogData FLIP_LOG = - new LogRecordV1( - FormatVersion.FORMAT_VERSION_V1, - IdGenerator.generateUpstreamId(), - 1L, - true, - ChangeAction.INSERT, - new GenericRowData(0)); - - public static LogDataJsonDeserialization createLogDataDeserialization() { - return new LogDataJsonDeserialization<>( - USER_SCHEMA, LogRecordV1.factory, arrayFactory, mapFactory); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestHiddenKafkaProducer.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestHiddenKafkaProducer.java deleted file mode 100644 index 845b79eb16..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestHiddenKafkaProducer.java +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write.hidden.kafka; - -import static org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate.getProperties; -import static org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate.getPropertiesWithByteArray; -import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.KAFKA_CONTAINER; -import static org.apache.amoro.shade.guava32.com.google.common.base.Preconditions.checkNotNull; -import static org.apache.kafka.clients.CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG; -import static org.apache.kafka.clients.producer.ProducerConfig.TRANSACTIONAL_ID_CONFIG; -import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.assertEquals; - -import org.apache.amoro.data.ChangeAction; -import org.apache.amoro.flink.kafka.testutils.KafkaConfigGenerate; -import org.apache.amoro.flink.kafka.testutils.KafkaContainerTest; -import org.apache.amoro.flink.shuffle.LogRecordV1; -import org.apache.amoro.flink.write.hidden.LogMsgFactory; -import org.apache.amoro.log.Bytes; -import org.apache.amoro.log.FormatVersion; -import org.apache.amoro.log.LogData; -import org.apache.amoro.log.LogDataJsonDeserialization; -import org.apache.amoro.log.LogDataJsonSerialization; -import org.apache.amoro.utils.IdGenerator; -import org.apache.flink.streaming.connectors.kafka.internals.FlinkKafkaInternalProducer; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.types.RowKind; -import org.apache.flink.util.InstantiationUtil; -import 
org.apache.kafka.clients.producer.ProducerRecord; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.BeforeClass; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.time.Duration; -import java.util.Properties; -import java.util.UUID; - -public class TestHiddenKafkaProducer extends TestBaseLog { - private static final Logger LOG = LoggerFactory.getLogger(TestHiddenKafkaProducer.class); - - @BeforeClass - public static void prepare() throws Exception { - KAFKA_CONTAINER.start(); - } - - @AfterClass - public static void shutdown() throws Exception { - KAFKA_CONTAINER.close(); - } - - @Test - public void testInitTransactionId() { - final String topic = "test-init-transactions"; - KafkaContainerTest.createTopics(1, 1, topic); - FlinkKafkaInternalProducer reuse = null; - final String transactionalIdPrefix = UUID.randomUUID().toString(); - try { - int numTransactions = 20; - for (int i = 1; i <= numTransactions; i++) { - Properties properties = new Properties(); - properties.put(BOOTSTRAP_SERVERS_CONFIG, KAFKA_CONTAINER.getBootstrapServers()); - properties = getProperties(KafkaConfigGenerate.getStandardProperties(properties)); - properties.put(TRANSACTIONAL_ID_CONFIG, transactionalIdPrefix + i); - reuse = new FlinkKafkaInternalProducer<>(properties); - reuse.initTransactions(); - reuse.beginTransaction(); - reuse.send(new ProducerRecord<>(topic, "test-value-" + i)); - if (i % 2 == 0) { - reuse.commitTransaction(); - } else { - reuse.flush(); - reuse.abortTransaction(); - } - int count = KafkaContainerTest.countAllRecords(topic, properties); - LOG.info("consumption = {}", count); - assertThat(count).isEqualTo(i / 2); - } - } catch (Throwable e) { - LOG.error("error:", e); - if (reuse != null) { - reuse.abortTransaction(); - } - } finally { - assert reuse != null; - reuse.close(Duration.ofMillis(1000)); - } - } - - @Test - public void testLogProducerSendFlip() throws Exception 
{ - final String topic = "test-recover-transactions"; - int numPartitions = 3; - KafkaContainerTest.createTopics(numPartitions, 1, topic); - LogData.FieldGetterFactory fieldGetterFactory = LogRecordV1.FIELD_GETTER_FACTORY; - LogDataJsonSerialization logDataJsonSerialization = - new LogDataJsonSerialization<>(checkNotNull(USER_SCHEMA), checkNotNull(fieldGetterFactory)); - Properties properties = new Properties(); - properties.put(BOOTSTRAP_SERVERS_CONFIG, KAFKA_CONTAINER.getBootstrapServers()); - properties = getPropertiesWithByteArray(KafkaConfigGenerate.getStandardProperties(properties)); - LogMsgFactory.Producer producer = - new HiddenKafkaFactory() - .createProducer(properties, topic, logDataJsonSerialization, null); - producer.open(); - - int recoverNum = 3; - for (int i = 0; i < recoverNum; i++) { - producer.sendToAllPartitions(FLIP_LOG); - } - producer.close(); - int count = KafkaContainerTest.countAllRecords(topic, properties); - assertThat(count).isEqualTo(numPartitions * recoverNum); - } - - @Test - public void testLogDataNullValueSerialize() throws IOException { - - LogDataJsonSerialization logDataJsonSerialization = - new LogDataJsonSerialization<>( - USER_SCHEMA_WITH_ALL_DATA_TYPE, LogRecordV1.FIELD_GETTER_FACTORY); - - GenericRowData rowData = new GenericRowData(17); - rowData.setRowKind(RowKind.INSERT); - rowData.setField(0, null); - rowData.setField(1, null); - rowData.setField(2, null); - rowData.setField(3, null); - rowData.setField(4, null); - rowData.setField(5, null); - rowData.setField(6, null); - rowData.setField(7, null); - rowData.setField(8, null); - rowData.setField(9, null); - rowData.setField(10, null); - rowData.setField(11, null); - rowData.setField(12, null); - rowData.setField(13, null); - rowData.setField(14, null); - rowData.setField(15, null); - rowData.setField(16, null); - - LogData logData = - new LogRecordV1( - FormatVersion.FORMAT_VERSION_V1, - IdGenerator.generateUpstreamId(), - 1L, - false, - ChangeAction.INSERT, - 
rowData); - - byte[] bytes = logDataJsonSerialization.serialize(logData); - - Assert.assertNotNull(bytes); - String actualJson = new String(Bytes.subByte(bytes, 18, bytes.length - 18)); - - String expected = - "{\"f_boolean\":null,\"f_int\":null,\"f_date\":null,\"f_long\":null,\"f_time\":null,\"f_float\":null,\"f_double\":null,\"f_timestamp_local\":null,\"f_timestamp_tz\":null,\"f_string\":null,\"f_uuid\":null,\"f_fixed\":null,\"f_binary\":null,\"f_decimal\":null,\"f_list\":null,\"f_map\":null,\"f_struct\":null}"; - assertEquals(expected, actualJson); - - LogDataJsonDeserialization logDataDeserialization = createLogDataDeserialization(); - LogData result = logDataDeserialization.deserialize(bytes); - Assert.assertNotNull(result); - } - - @Test - public void testLogDataJsonSerializationClassSerialize() - throws IOException, ClassNotFoundException { - LogDataJsonSerialization actual = - new LogDataJsonSerialization<>(USER_SCHEMA, LogRecordV1.FIELD_GETTER_FACTORY); - byte[] bytes = InstantiationUtil.serializeObject(actual); - LogDataJsonSerialization result = - InstantiationUtil.deserializeObject(bytes, actual.getClass().getClassLoader()); - Assert.assertNotNull(result); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestHiddenLogOperators.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestHiddenLogOperators.java deleted file mode 100644 index 9a42fd37fb..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/amoro/flink/write/hidden/kafka/TestHiddenLogOperators.java +++ /dev/null @@ -1,475 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.amoro.flink.write.hidden.kafka; - -import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.KAFKA_CONTAINER; -import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.getPropertiesByTopic; -import static org.apache.amoro.flink.kafka.testutils.KafkaContainerTest.readRecordsBytes; -import static org.apache.amoro.flink.table.descriptors.MixedFormatValidator.MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE; -import static org.apache.amoro.flink.write.hidden.kafka.TestBaseLog.USER_SCHEMA; -import static org.apache.amoro.flink.write.hidden.kafka.TestBaseLog.createLogDataDeserialization; - -import org.apache.amoro.flink.read.source.log.kafka.LogKafkaSource; -import org.apache.amoro.flink.shuffle.LogRecordV1; -import org.apache.amoro.flink.shuffle.ShuffleHelper; -import org.apache.amoro.flink.util.TestGlobalAggregateManager; -import org.apache.amoro.flink.util.TestOneInputStreamOperatorIntern; -import org.apache.amoro.flink.write.hidden.HiddenLogWriter; -import org.apache.amoro.log.LogDataJsonDeserialization; -import org.apache.amoro.utils.IdGenerator; -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.api.common.restartstrategy.RestartStrategies; -import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer; -import org.apache.flink.core.execution.JobClient; -import 
org.apache.flink.runtime.checkpoint.OperatorSubtaskState; -import org.apache.flink.streaming.api.TimeCharacteristic; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamUtils; -import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.operators.collect.ClientAndIterator; -import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericArrayData; -import org.apache.flink.table.data.GenericMapData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.util.CloseableIterator; -import org.apache.kafka.clients.consumer.ConsumerRecords; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.Test; -import org.junit.jupiter.api.Assertions; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.math.BigDecimal; -import java.time.Instant; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Properties; - -/** Hidden log operator tests. 
*/ -public class TestHiddenLogOperators { - private static final Logger LOG = LoggerFactory.getLogger(TestHiddenLogOperators.class); - public static final String TOPIC = "produce-consume-topic"; - public static final TestGlobalAggregateManager GLOBAL_AGGREGATE_MANGER = - new TestGlobalAggregateManager(); - - @BeforeClass - public static void prepare() throws Exception { - KAFKA_CONTAINER.start(); - } - - @AfterClass - public static void shutdown() throws Exception { - KAFKA_CONTAINER.close(); - } - - @Test - public void testProduceAndConsume() throws Exception { - String topic = "testProduceAndConsume"; - final int count = 20; - - String[] expect = new String[count]; - try (OneInputStreamOperatorTestHarness harness = - createProducer(null, topic)) { - harness.setup(); - harness.initializeEmptyState(); - harness.open(); - for (int i = 0; i < count; i++) { - RowData row = createRowData(i); - expect[i] = row.toString(); - harness.processElement(row, 0); - } - harness.snapshot(1, 1); - harness.notifyOfCompletedCheckpoint(1); - List output = collect(harness); - Assertions.assertEquals(count, output.size()); - Assertions.assertArrayEquals(expect, output.toArray(new String[0])); - - createConsumerWithoutRetract(true, count, "test-gid", topic); - } catch (Exception e) { - e.printStackTrace(); - throw e; - } - } - - @Test - public void testProducerFailoverWithoutRetract() throws Exception { - String topic = "testProducerFailoverWithoutRetract"; - OperatorSubtaskState state; - try { - OneInputStreamOperatorTestHarness harness = createProducer(null, topic); - harness.setup(); - harness.initializeEmptyState(); - harness.open(); - harness.processElement(createRowData(1), 0); - harness.processElement(createRowData(2), 0); - harness.processElement(createRowData(3), 0); - state = harness.snapshot(1, 1); - harness.processElement(createRowData(4), 0); - harness.processElement(createRowData(5), 0); - harness.notifyOfCompletedCheckpoint(1); - List output = collect(harness); - 
Assertions.assertEquals(5, output.size()); - } catch (Exception e) { - e.printStackTrace(); - throw e; - } - - // failover happen 1 time - try { - OneInputStreamOperatorTestHarness harness = createProducer(1L, topic); - harness.setup(); - harness.initializeState(state); - harness.open(); - harness.processElement(createRowData(4), 0); - harness.processElement(createRowData(5), 0); - harness.processElement(createRowData(6), 0); - harness.snapshot(2, 1); - harness.processElement(createRowData(7), 0); - harness.processElement(createRowData(8), 0); - harness.notifyOfCompletedCheckpoint(2); - List output = collect(harness); - Assertions.assertEquals(5, output.size()); - } catch (Exception e) { - e.printStackTrace(); - throw e; - } - - createConsumerWithoutRetract(true, 10, "test-gid", topic); - } - - @Test - public void testMultiParallelismFailoverConsistencyRead() throws Exception { - String topic = "testMultiParallelismFailoverConsistencyRead"; - OperatorSubtaskState state0; - OperatorSubtaskState state1; - OperatorSubtaskState state2; - byte[] jobId = IdGenerator.generateUpstreamId(); - try (TestOneInputStreamOperatorIntern harness0 = - createProducer(3, 0, jobId, topic); - TestOneInputStreamOperatorIntern harness1 = - createProducer(3, 1, jobId, topic); - TestOneInputStreamOperatorIntern harness2 = - createProducer(3, 2, jobId, topic)) { - harness0.setup(); - harness0.initializeEmptyState(); - harness0.open(); - harness1.setup(); - harness1.initializeEmptyState(); - harness1.open(); - harness2.setup(); - harness2.initializeEmptyState(); - harness2.open(); - - harness0.processElement(createRowData(1), 0); - - state0 = harness0.snapshot(1, 1); - - harness1.processElement(createRowData(11), 0); - harness2.processElement(createRowData(21), 0); - - // chp-1 success. 
- state1 = harness1.snapshot(1, 1); - state2 = harness2.snapshot(1, 1); - - harness0.processElement(createRowData(2), 0); - harness1.processElement(createRowData(12), 0); - harness2.processElement(createRowData(22), 0); - harness0.notifyOfCompletedCheckpoint(1); - harness1.notifyOfCompletedCheckpoint(1); - harness2.notifyOfCompletedCheckpoint(1); - harness0.processElement(createRowData(3), 0); - // after 3, harness0 happen timeout - harness1.processElement(createRowData(13), 0); - harness2.processElement(createRowData(23), 0); - - // harness0 snapshot chp-2 failed. - harness1.snapshot(2, 1); - harness2.snapshot(2, 1); - - harness1.processElement(createRowData(14), 0); - harness2.processElement(createRowData(24), 0); - // notify chp-2 aborted - harness1.notifyOfAbortedCheckpoint(2); - harness2.notifyOfAbortedCheckpoint(2); - - List output = collect(harness0); - output.addAll(collect(harness1)); - output.addAll(collect(harness2)); - Assertions.assertEquals(11, output.size()); - ConsumerRecords consumerRecords = readRecordsBytes(topic); - Assertions.assertEquals(11, consumerRecords.count()); - LogDataJsonDeserialization deserialization = createLogDataDeserialization(); - consumerRecords.forEach( - consumerRecord -> { - try { - System.out.println(deserialization.deserialize(consumerRecord.value())); - } catch (IOException e) { - e.printStackTrace(); - } - }); - } catch (Exception e) { - e.printStackTrace(); - throw e; - } - - // failover restore from chp-1 - try (TestOneInputStreamOperatorIntern harness0 = - createProducer(3, 0, jobId, 1L, topic); - TestOneInputStreamOperatorIntern harness1 = - createProducer(3, 1, jobId, 1L, topic); - TestOneInputStreamOperatorIntern harness2 = - createProducer(3, 2, jobId, 1L, topic)) { - harness0.setup(); - harness0.initializeState(state0); - harness0.open(); - harness1.setup(); - harness1.initializeState(state1); - harness1.open(); - harness2.setup(); - harness2.initializeState(state2); - harness2.open(); - - 
harness0.processElement(createRowData(2), 0); - harness1.processElement(createRowData(12), 0); - harness2.processElement(createRowData(22), 0); - // chp-2 - state1 = harness1.snapshot(3, 1); - state2 = harness2.snapshot(3, 1); - - harness0.processElement(createRowData(3), 0); - // after 3, harness0 happen timeout - harness1.processElement(createRowData(13), 0); - harness2.processElement(createRowData(23), 0); - - harness1.processElement(createRowData(14), 0); - harness2.processElement(createRowData(24), 0); - - harness1.notifyOfAbortedCheckpoint(2); - harness2.notifyOfAbortedCheckpoint(2); - - List output = collect(harness0); - output.addAll(collect(harness1)); - output.addAll(collect(harness2)); - Assertions.assertEquals(8, output.size()); - ConsumerRecords consumerRecords = readRecordsBytes(topic); - LogDataJsonDeserialization deserialization = createLogDataDeserialization(); - consumerRecords.forEach( - consumerRecord -> { - try { - System.out.println(deserialization.deserialize(consumerRecord.value())); - } catch (IOException e) { - e.printStackTrace(); - } - }); - Assertions.assertEquals(20, consumerRecords.count()); - } catch (Exception e) { - e.printStackTrace(); - throw e; - } - createConsumerWithoutRetract(true, 19, "test-gid", topic); - createConsumerWithRetract(true, 27, "test-gid-2", topic); - } - - public static RowData createRowData(int i) { - GenericRowData rowData = new GenericRowData(USER_SCHEMA.columns().size()); - rowData.setField(0, true); - rowData.setField(1, i); - rowData.setField(2, 1L); - GenericRowData sub = new GenericRowData(18); - sub.setField(0, true); - sub.setField(1, 1); - sub.setField(2, 1L); - sub.setField(3, StringData.fromString("sssss")); - sub.setField(4, LocalTime.of(13, 23, 23, 98766545).toNanoOfDay()); - sub.setField( - 5, DecimalData.fromBigDecimal(new BigDecimal("123456789.123456789123456789"), 30, 18)); - sub.setField(6, 123.12345f); - sub.setField(7, 123.12345d); - sub.setField(8, (int) LocalDate.of(2022, 5, 
5).toEpochDay()); - sub.setField( - 9, TimestampData.fromLocalDateTime(LocalDateTime.of(2022, 12, 12, 13, 14, 14, 987654234))); - sub.setField(10, TimestampData.fromInstant(Instant.parse("2022-12-13T13:33:44.98765432Z"))); - sub.setField(11, new byte[] {1}); - sub.setField(12, new byte[] {'1'}); - sub.setField(13, new byte[] {2}); - - GenericArrayData fSubList = new GenericArrayData(new long[] {112L, 123L}); - sub.setField(14, fSubList); - - GenericArrayData fSubList2 = new GenericArrayData(new int[] {112, 123}); - sub.setField(15, fSubList2); - - GenericRowData subStruct = new GenericRowData(3); - subStruct.setField(0, false); - subStruct.setField(1, 112); - subStruct.setField(2, 123L); - GenericArrayData structList = new GenericArrayData(new GenericRowData[] {subStruct}); - sub.setField(16, structList); - - GenericMapData map = - new GenericMapData( - new HashMap() { - { - put(StringData.fromString("Key_123"), StringData.fromString("Str_123")); - put(StringData.fromString("Key_124"), StringData.fromString("Str_123")); - put(StringData.fromString("Key_125"), StringData.fromString("Str_123")); - } - }); - sub.setField(17, map); - - rowData.setField(3, sub); - return rowData; - } - - private static List collect(OneInputStreamOperatorTestHarness harness) { - List parts = new ArrayList<>(); - harness.extractOutputValues().forEach(m -> parts.add(m.toString())); - return parts; - } - - private void createConsumerWithRetract( - boolean print, int count, final String groupId, String topic) throws Exception { - createConsumer(print, count, groupId, true, topic); - } - - private void createConsumerWithoutRetract( - boolean print, int count, final String groupId, String topic) throws Exception { - createConsumer(print, count, groupId, false, topic); - } - - private void createConsumer( - boolean print, int count, final String groupId, boolean retract, String topic) - throws Exception { - StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); - 
env.setParallelism(1); - env.enableCheckpointing(10000); - env.getConfig().setRestartStrategy(RestartStrategies.noRestart()); - List topics = new ArrayList<>(); - topics.add(topic); - Properties properties = getPropertiesByTopic(topic); - properties.put("group.id", groupId); - properties.put("auto.offset.reset", "earliest"); - - Map configuration = new HashMap<>(); - configuration.put(MIXED_FORMAT_LOG_CONSISTENCY_GUARANTEE_ENABLE.key(), String.valueOf(retract)); - - DataStream streamWithTimestamps = - env.fromSource( - LogKafkaSource.builder(USER_SCHEMA, configuration) - .setTopics(topics) - .setStartingOffsets(OffsetsInitializer.earliest()) - .setProperties(properties) - .build(), - WatermarkStrategy.noWatermarks(), - "Log Source"); - if (print) { - streamWithTimestamps.print("log-hidden"); - } - - ClientAndIterator clientAndIterator = - DataStreamUtils.collectWithClient(streamWithTimestamps, "testLog"); - - JobClient jobClient = clientAndIterator.client; - CloseableIterator iterator = clientAndIterator.iterator; - - List actualResult = new ArrayList<>(); - - while (iterator.hasNext()) { - RowData row = iterator.next(); - actualResult.add(row); - LOG.info("size {}, {}, {}.", actualResult.size(), row.getRowKind(), row.getInt(1)); - if (actualResult.size() == count) { - break; - } - } - } - - public static OneInputStreamOperatorTestHarness createProducer( - Long restoredCheckpoint, String topic) throws Exception { - return createProducer( - 1, - 1, - 0, - restoredCheckpoint, - IdGenerator.generateUpstreamId(), - new TestGlobalAggregateManager(), - topic); - } - - public static TestOneInputStreamOperatorIntern createProducer( - int maxParallelism, int subTaskId, byte[] jobId, Long restoredCheckpointId, String topic) - throws Exception { - return createProducer( - maxParallelism, - maxParallelism, - subTaskId, - restoredCheckpointId, - jobId, - GLOBAL_AGGREGATE_MANGER, - topic); - } - - public static TestOneInputStreamOperatorIntern createProducer( - int 
maxParallelism, int subTaskId, byte[] jobId, String topic) throws Exception { - return createProducer( - maxParallelism, maxParallelism, subTaskId, null, jobId, GLOBAL_AGGREGATE_MANGER, topic); - } - - private static TestOneInputStreamOperatorIntern createProducer( - int maxParallelism, - int parallelism, - int subTaskId, - Long restoredCheckpointId, - byte[] jobId, - TestGlobalAggregateManager testGlobalAggregateManager, - String topic) - throws Exception { - HiddenLogWriter writer = - new HiddenLogWriter( - USER_SCHEMA, - getPropertiesByTopic(topic), - topic, - new HiddenKafkaFactory<>(), - LogRecordV1.FIELD_GETTER_FACTORY, - jobId, - ShuffleHelper.EMPTY); - - TestOneInputStreamOperatorIntern harness = - new TestOneInputStreamOperatorIntern<>( - writer, - maxParallelism, - parallelism, - subTaskId, - restoredCheckpointId, - testGlobalAggregateManager); - harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime); - return harness; - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java deleted file mode 100644 index 5a1e3d85b9..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-1.17/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.iceberg.flink; - -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.test.util.MiniClusterWithClientResource; - -/** - * Compatibility shim for tests that previously used Iceberg's removed MiniClusterResource helper. - */ -public class MiniClusterResource { - private static final int DEFAULT_TM_NUM = 1; - private static final int DEFAULT_PARALLELISM = 4; - - public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = - new Configuration().set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - - private MiniClusterResource() {} - - public static MiniClusterWithClientResource createWithClassloaderCheckDisabled() { - return new MiniClusterWithClientResource( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(DEFAULT_TM_NUM) - .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) - .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) - .build()); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/pom.xml b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/pom.xml deleted file mode 100644 index 182ca4d1c8..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/pom.xml +++ /dev/null @@ -1,349 +0,0 @@ - - - - 4.0.0 - - org.apache.amoro - amoro-mixed-flink - 0.9-SNAPSHOT - ../pom.xml - - - 
amoro-format-mixed-flink-common-iceberg-bridge-1.17 - Amoro Project Mixed Format Flink Iceberg Bridge - https://amoro.apache.org - - - 3.21.0 - 1.17.2 - 1.17.2 - 3.0.2-1.17 - 1.6.1 - - - - - - org.apache.amoro - amoro-format-iceberg - - - org.ow2.asm - asm - - - - - - org.apache.amoro - amoro-mixed-hive - ${project.version} - - - - org.apache.iceberg - iceberg-flink-1.17 - ${iceberg.version} - provided - - - org.slf4j - slf4j-api - - - org.apache.parquet - parquet-column - - - org.apache.parquet - parquet-avro - - - - - - cglib - cglib - - - - com.google.code.gson - gson - ${gson.version} - - - - - org.apache.flink - flink-connector-files - ${flink.version} - provided - - - - org.apache.flink - flink-connector-kafka - ${flink-kafka.version} - provided - - - - org.apache.flink - flink-json - ${flink.version} - provided - - - - org.apache.flink - flink-hadoop-compatibility_${flink.scala.binary.version} - ${flink.version} - provided - - - - org.apache.flink - flink-table-api-java-bridge - ${flink.version} - provided - - - org.slf4j - slf4j-api - - - - - - - org.apache.flink - flink-orc - ${flink.version} - provided - - - org.slf4j - slf4j-api - - - - - - org.apache.flink - flink-parquet - ${flink.version} - provided - - - org.apache.parquet - parquet-hadoop - - - - - - org.apache.flink - flink-table-runtime - ${flink.version} - provided - - - org.slf4j - slf4j-api - - - - - org.apache.flink - flink-table-planner_${flink.scala.binary.version} - ${flink.version} - provided - - - org.slf4j - slf4j-api - - - - - - org.apache.iceberg - iceberg-flink-1.17 - ${iceberg.version} - tests - test - - - org.slf4j - slf4j-api - - - - - - org.apache.flink - flink-runtime - ${flink.version} - tests - test - - - org.slf4j - slf4j-api - - - - - - - org.apache.flink - flink-streaming-java - ${flink.version} - tests - test - - - org.slf4j - slf4j-api - - - - - - org.apache.flink - flink-clients - ${flink.version} - test - - - org.slf4j - slf4j-api - - - - - - org.apache.flink - 
flink-test-utils - ${flink.version} - test - - - org.apache.logging.log4j - log4j-slf4j-impl - - - org.slf4j - slf4j-api - - - com.google.guava - guava - - - - - org.apache.flink - flink-connector-test-utils - ${flink.version} - test - - - - org.apache.iceberg - iceberg-hive-metastore - ${iceberg.version} - tests - test - - - - - com.fasterxml.jackson.core - jackson-databind - ${jackson.vesion} - provided - - - - org.apache.amoro - amoro-common - ${project.version} - tests - test - - - - org.apache.flink - flink-metrics-jmx - ${flink.version} - test - - - org.apache.flink - flink-runtime-web - ${flink.version} - test - - - - - org.apache.flink - flink-table-planner_${flink.scala.binary.version} - ${flink.version} - test-jar - test - - - org.slf4j - slf4j-api - - - - - - - org.apache.curator - curator-test - 2.12.0 - test - - - - org.testcontainers - kafka - ${testcontainers.version} - test - - - - org.testcontainers - junit-jupiter - ${testcontainers.version} - test - - - org.assertj - assertj-core - ${assertj.version} - test - - - diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetReaders.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetReaders.java deleted file mode 100644 index 6b984b1b5d..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetReaders.java +++ /dev/null @@ -1,873 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.iceberg.flink.data; - -import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableList; -import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableMap; -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.amoro.shade.guava32.com.google.common.collect.Maps; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.GenericRowData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RawValueData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.Schema; -import org.apache.iceberg.parquet.ParquetValueReader; -import org.apache.iceberg.parquet.ParquetValueReaders; -import org.apache.iceberg.parquet.TypeWithSchemaVisitor; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.io.api.Binary; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import 
org.apache.parquet.schema.Type; - -import java.math.BigDecimal; -import java.math.BigInteger; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.time.Instant; -import java.time.ZoneId; -import java.time.ZoneOffset; -import java.util.List; -import java.util.Map; -import java.util.concurrent.TimeUnit; - -public class AdaptHiveFlinkParquetReaders { - private AdaptHiveFlinkParquetReaders() {} - - public static ParquetValueReader buildReader( - Schema expectedSchema, MessageType fileSchema) { - return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); - } - - @SuppressWarnings("unchecked") - public static ParquetValueReader buildReader( - Schema expectedSchema, MessageType fileSchema, Map idToConstant) { - return (ParquetValueReader) - TypeWithSchemaVisitor.visit( - expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); - } - - private static class ReadBuilder extends TypeWithSchemaVisitor> { - private final MessageType type; - private final Map idToConstant; - - ReadBuilder(MessageType type, Map idToConstant) { - this.type = type; - this.idToConstant = idToConstant; - } - - @Override - public ParquetValueReader message( - Types.StructType expected, MessageType message, List> fieldReaders) { - return struct(expected, message.asGroupType(), fieldReaders); - } - - @Override - public ParquetValueReader struct( - Types.StructType expected, GroupType struct, List> fieldReaders) { - // match the expected struct's order - Map> readersById = Maps.newHashMap(); - Map typesById = Maps.newHashMap(); - List fields = struct.getFields(); - for (int i = 0; i < fields.size(); i += 1) { - Type fieldType = fields.get(i); - if (fieldReaders.get(i) != null) { - int fieldD = type.getMaxDefinitionLevel(path(fieldType.getName())) - 1; - if (fieldType.getId() != null) { - int id = fieldType.getId().intValue(); - readersById.put(id, ParquetValueReaders.option(fieldType, fieldD, fieldReaders.get(i))); - typesById.put(id, fieldType); - } - } 
- } - - List expectedFields = - expected != null ? expected.fields() : ImmutableList.of(); - List> reorderedFields = - Lists.newArrayListWithExpectedSize(expectedFields.size()); - List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); - for (Types.NestedField field : expectedFields) { - int id = field.fieldId(); - if (idToConstant.containsKey(id)) { - // containsKey is used because the constant may be null - reorderedFields.add(ParquetValueReaders.constant(idToConstant.get(id))); - types.add(null); - } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { - reorderedFields.add(ParquetValueReaders.position()); - types.add(null); - } else if (id == MetadataColumns.IS_DELETED.fieldId()) { - reorderedFields.add(ParquetValueReaders.constant(false)); - types.add(null); - } else { - ParquetValueReader reader = readersById.get(id); - if (reader != null) { - reorderedFields.add(reader); - types.add(typesById.get(id)); - } else { - reorderedFields.add(ParquetValueReaders.nulls()); - types.add(null); - } - } - } - - return new RowDataReader(types, reorderedFields); - } - - @Override - public ParquetValueReader list( - Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { - if (expectedList == null) { - return null; - } - - GroupType repeated = array.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; - int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; - - Type elementType = repeated.getType(0); - int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - - return new ArrayReader<>( - repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); - } - - @Override - public ParquetValueReader map( - Types.MapType expectedMap, - GroupType map, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { - if (expectedMap == null) { - return null; - } - - GroupType repeatedKeyValue 
= map.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; - int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; - - Type keyType = repeatedKeyValue.getType(0); - int keyD = type.getMaxDefinitionLevel(path(keyType.getName())) - 1; - Type valueType = repeatedKeyValue.getType(1); - int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - - return new MapReader<>( - repeatedD, - repeatedR, - ParquetValueReaders.option(keyType, keyD, keyReader), - ParquetValueReaders.option(valueType, valueD, valueReader)); - } - - @Override - @SuppressWarnings("CyclomaticComplexity") - public ParquetValueReader primitive( - org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { - if (expected == null) { - return null; - } - - ColumnDescriptor desc = type.getColumnDescription(currentPath()); - - if (primitive.getOriginalType() != null) { - switch (primitive.getOriginalType()) { - case ENUM: - case JSON: - case UTF8: - return new StringReader(desc); - case INT_8: - case INT_16: - case INT_32: - if (expected.typeId() == Types.LongType.get().typeId()) { - return new ParquetValueReaders.IntAsLongReader(desc); - } else { - return new ParquetValueReaders.UnboxedReader<>(desc); - } - case TIME_MICROS: - return new LossyMicrosToMillisTimeReader(desc); - case TIME_MILLIS: - return new MillisTimeReader(desc); - case DATE: - case INT_64: - return new ParquetValueReaders.UnboxedReader<>(desc); - case TIMESTAMP_MICROS: - if (((Types.TimestampType) expected).shouldAdjustToUTC()) { - return new MicrosToTimestampTzReader(desc); - } else { - return new MicrosToTimestampReader(desc); - } - case TIMESTAMP_MILLIS: - if (((Types.TimestampType) expected).shouldAdjustToUTC()) { - return new MillisToTimestampTzReader(desc); - } else { - return new MillisToTimestampReader(desc); - } - case DECIMAL: - DecimalLogicalTypeAnnotation decimal = - (DecimalLogicalTypeAnnotation) 
primitive.getLogicalTypeAnnotation(); - switch (primitive.getPrimitiveTypeName()) { - case BINARY: - case FIXED_LEN_BYTE_ARRAY: - return new BinaryDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); - case INT64: - return new LongDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); - case INT32: - return new IntegerDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); - default: - throw new UnsupportedOperationException( - "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); - } - case BSON: - return new ParquetValueReaders.ByteArrayReader(desc); - default: - throw new UnsupportedOperationException( - "Unsupported logical type: " + primitive.getOriginalType()); - } - } - - switch (primitive.getPrimitiveTypeName()) { - case FIXED_LEN_BYTE_ARRAY: - case BINARY: - return new ParquetValueReaders.ByteArrayReader(desc); - case INT32: - if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.LONG) { - return new ParquetValueReaders.IntAsLongReader(desc); - } else { - return new ParquetValueReaders.UnboxedReader<>(desc); - } - case FLOAT: - if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.DOUBLE) { - return new ParquetValueReaders.FloatAsDoubleReader(desc); - } else { - return new ParquetValueReaders.UnboxedReader<>(desc); - } - case BOOLEAN: - case INT64: - case DOUBLE: - return new ParquetValueReaders.UnboxedReader<>(desc); - case INT96: - Types.TimestampType tsMicrosType = (Types.TimestampType) expected; - if (tsMicrosType.shouldAdjustToUTC()) { - return new TimestampIntWithTZ96Reader(desc); - } else { - return new TimestampIntWithOutTZ96Reader(desc); - } - default: - throw new UnsupportedOperationException("Unsupported type: " + primitive); - } - } - } - - private static class TimestampIntWithOutTZ96Reader - extends ParquetValueReaders.PrimitiveReader { - private static final long UNIX_EPOCH_JULIAN = 2_440_588L; - - TimestampIntWithOutTZ96Reader(ColumnDescriptor desc) { - super(desc); - } - - 
@Override - public TimestampData read(TimestampData reuse) { - final ByteBuffer byteBuffer = - column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); - final long timeOfDayNanos = byteBuffer.getLong(); - final int julianDay = byteBuffer.getInt(); - - return TimestampData.fromLocalDateTime( - Instant.ofEpochMilli(TimeUnit.DAYS.toMillis(julianDay - UNIX_EPOCH_JULIAN)) - .plusNanos(timeOfDayNanos) - .atZone(ZoneId.systemDefault()) - .toLocalDateTime()); - } - } - - private static class TimestampIntWithTZ96Reader - extends ParquetValueReaders.PrimitiveReader { - private static final long UNIX_EPOCH_JULIAN = 2_440_588L; - - private TimestampIntWithTZ96Reader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData reuse) { - final ByteBuffer byteBuffer = - column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); - final long timeOfDayNanos = byteBuffer.getLong(); - final int julianDay = byteBuffer.getInt(); - - return TimestampData.fromInstant( - Instant.ofEpochMilli(TimeUnit.DAYS.toMillis(julianDay - UNIX_EPOCH_JULIAN)) - .plusNanos(timeOfDayNanos)); - } - } - - private static class BinaryDecimalReader - extends ParquetValueReaders.PrimitiveReader { - private final int precision; - private final int scale; - - BinaryDecimalReader(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(DecimalData ignored) { - Binary binary = column.nextBinary(); - BigDecimal bigDecimal = new BigDecimal(new BigInteger(binary.getBytes()), scale); - // TODO: need a unit test to write-read-validate decimal via FlinkParquetWrite/Reader - return DecimalData.fromBigDecimal(bigDecimal, precision, scale); - } - } - - private static class IntegerDecimalReader - extends ParquetValueReaders.PrimitiveReader { - private final int precision; - private final int scale; - - IntegerDecimalReader(ColumnDescriptor desc, int precision, 
int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(DecimalData ignored) { - return DecimalData.fromUnscaledLong(column.nextInteger(), precision, scale); - } - } - - private static class LongDecimalReader extends ParquetValueReaders.PrimitiveReader { - private final int precision; - private final int scale; - - LongDecimalReader(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public DecimalData read(DecimalData ignored) { - return DecimalData.fromUnscaledLong(column.nextLong(), precision, scale); - } - } - - private static class MicrosToTimestampTzReader - extends ParquetValueReaders.UnboxedReader { - MicrosToTimestampTzReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long value = readLong(); - return TimestampData.fromLocalDateTime( - Instant.ofEpochSecond( - Math.floorDiv(value, 1000_000L), Math.floorMod(value, 1000_000L) * 1000) - .atOffset(ZoneOffset.UTC) - .toLocalDateTime()); - } - - @Override - public long readLong() { - return column.nextLong(); - } - } - - private static class MicrosToTimestampReader - extends ParquetValueReaders.UnboxedReader { - MicrosToTimestampReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long value = readLong(); - return TimestampData.fromInstant( - Instant.ofEpochSecond( - Math.floorDiv(value, 1000_000L), Math.floorMod(value, 1000_000L) * 1000)); - } - - @Override - public long readLong() { - return column.nextLong(); - } - } - - private static class MillisToTimestampReader - extends ParquetValueReaders.UnboxedReader { - MillisToTimestampReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long millis = readLong(); - return TimestampData.fromEpochMillis(millis); - } 
- - @Override - public long readLong() { - return column.nextLong(); - } - } - - private static class MillisToTimestampTzReader - extends ParquetValueReaders.UnboxedReader { - MillisToTimestampTzReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public TimestampData read(TimestampData ignored) { - long millis = readLong(); - return TimestampData.fromLocalDateTime( - Instant.ofEpochMilli(millis).atOffset(ZoneOffset.UTC).toLocalDateTime()); - } - - @Override - public long readLong() { - return column.nextLong(); - } - } - - private static class StringReader extends ParquetValueReaders.PrimitiveReader { - StringReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public StringData read(StringData ignored) { - Binary binary = column.nextBinary(); - ByteBuffer buffer = binary.toByteBuffer(); - if (buffer.hasArray()) { - return StringData.fromBytes( - buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining()); - } else { - return StringData.fromBytes(binary.getBytes()); - } - } - } - - private static class LossyMicrosToMillisTimeReader - extends ParquetValueReaders.PrimitiveReader { - LossyMicrosToMillisTimeReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public Integer read(Integer reuse) { - // Discard microseconds since Flink uses millisecond unit for TIME type. 
- return (int) Math.floorDiv(column.nextLong(), 1000L); - } - } - - private static class MillisTimeReader extends ParquetValueReaders.PrimitiveReader { - MillisTimeReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public Integer read(Integer reuse) { - return (int) column.nextLong(); - } - } - - private static class ArrayReader - extends ParquetValueReaders.RepeatedReader { - private int readPos = 0; - private int writePos = 0; - - ArrayReader(int definitionLevel, int repetitionLevel, ParquetValueReader reader) { - super(definitionLevel, repetitionLevel, reader); - } - - @Override - protected ReusableArrayData newListData(ArrayData reuse) { - this.readPos = 0; - this.writePos = 0; - - if (reuse instanceof ReusableArrayData) { - return (ReusableArrayData) reuse; - } else { - return new ReusableArrayData(); - } - } - - @Override - @SuppressWarnings("unchecked") - protected E getElement(ReusableArrayData list) { - E value = null; - if (readPos < list.capacity()) { - value = (E) list.values[readPos]; - } - - readPos += 1; - - return value; - } - - @Override - protected void addElement(ReusableArrayData reused, E element) { - if (writePos >= reused.capacity()) { - reused.grow(); - } - - reused.values[writePos] = element; - - writePos += 1; - } - - @Override - protected ArrayData buildList(ReusableArrayData list) { - list.setNumElements(writePos); - return list; - } - } - - private static class MapReader - extends ParquetValueReaders.RepeatedKeyValueReader { - private int readPos = 0; - private int writePos = 0; - - private final ParquetValueReaders.ReusableEntry entry = - new ParquetValueReaders.ReusableEntry<>(); - private final ParquetValueReaders.ReusableEntry nullEntry = - new ParquetValueReaders.ReusableEntry<>(); - - MapReader( - int definitionLevel, - int repetitionLevel, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { - super(definitionLevel, repetitionLevel, keyReader, valueReader); - } - - @Override - protected 
ReusableMapData newMapData(MapData reuse) { - this.readPos = 0; - this.writePos = 0; - - if (reuse instanceof ReusableMapData) { - return (ReusableMapData) reuse; - } else { - return new ReusableMapData(); - } - } - - @Override - @SuppressWarnings("unchecked") - protected Map.Entry getPair(ReusableMapData map) { - Map.Entry kv = nullEntry; - if (readPos < map.capacity()) { - entry.set((K) map.keys.values[readPos], (V) map.values.values[readPos]); - kv = entry; - } - - readPos += 1; - - return kv; - } - - @Override - protected void addPair(ReusableMapData map, K key, V value) { - if (writePos >= map.capacity()) { - map.grow(); - } - - map.keys.values[writePos] = key; - map.values.values[writePos] = value; - - writePos += 1; - } - - @Override - protected MapData buildMap(ReusableMapData map) { - map.setNumElements(writePos); - return map; - } - } - - private static class RowDataReader - extends ParquetValueReaders.StructReader { - private final int numFields; - - RowDataReader(List types, List> readers) { - super(types, readers); - this.numFields = readers.size(); - } - - @Override - protected GenericRowData newStructData(RowData reuse) { - if (reuse instanceof GenericRowData) { - return (GenericRowData) reuse; - } else { - return new GenericRowData(numFields); - } - } - - @Override - protected Object getField(GenericRowData intermediate, int pos) { - return intermediate.getField(pos); - } - - @Override - protected RowData buildStruct(GenericRowData struct) { - return struct; - } - - @Override - protected void set(GenericRowData row, int pos, Object value) { - row.setField(pos, value); - } - - @Override - protected void setNull(GenericRowData row, int pos) { - row.setField(pos, null); - } - - @Override - protected void setBoolean(GenericRowData row, int pos, boolean value) { - row.setField(pos, value); - } - - @Override - protected void setInteger(GenericRowData row, int pos, int value) { - row.setField(pos, value); - } - - @Override - protected void 
setLong(GenericRowData row, int pos, long value) { - row.setField(pos, value); - } - - @Override - protected void setFloat(GenericRowData row, int pos, float value) { - row.setField(pos, value); - } - - @Override - protected void setDouble(GenericRowData row, int pos, double value) { - row.setField(pos, value); - } - } - - private static class ReusableMapData implements MapData { - private final ReusableArrayData keys; - private final ReusableArrayData values; - - private int numElements; - - private ReusableMapData() { - this.keys = new ReusableArrayData(); - this.values = new ReusableArrayData(); - } - - private void grow() { - keys.grow(); - values.grow(); - } - - private int capacity() { - return keys.capacity(); - } - - public void setNumElements(int numElements) { - this.numElements = numElements; - keys.setNumElements(numElements); - values.setNumElements(numElements); - } - - @Override - public int size() { - return numElements; - } - - @Override - public ReusableArrayData keyArray() { - return keys; - } - - @Override - public ReusableArrayData valueArray() { - return values; - } - } - - private static class ReusableArrayData implements ArrayData { - private static final Object[] EMPTY = new Object[0]; - - private Object[] values = EMPTY; - private int numElements = 0; - - private void grow() { - if (values.length == 0) { - this.values = new Object[20]; - } else { - Object[] old = values; - this.values = new Object[old.length << 1]; - // copy the old array in case it has values that can be reused - System.arraycopy(old, 0, values, 0, old.length); - } - } - - private int capacity() { - return values.length; - } - - public void setNumElements(int numElements) { - this.numElements = numElements; - } - - @Override - public int size() { - return numElements; - } - - @Override - public boolean isNullAt(int ordinal) { - return null == values[ordinal]; - } - - @Override - public boolean getBoolean(int ordinal) { - return (boolean) values[ordinal]; - } - - @Override 
- public byte getByte(int ordinal) { - return (byte) values[ordinal]; - } - - @Override - public short getShort(int ordinal) { - return (short) values[ordinal]; - } - - @Override - public int getInt(int ordinal) { - return (int) values[ordinal]; - } - - @Override - public long getLong(int ordinal) { - return (long) values[ordinal]; - } - - @Override - public float getFloat(int ordinal) { - return (float) values[ordinal]; - } - - @Override - public double getDouble(int ordinal) { - return (double) values[ordinal]; - } - - @Override - public StringData getString(int pos) { - return (StringData) values[pos]; - } - - @Override - public DecimalData getDecimal(int pos, int precision, int scale) { - return (DecimalData) values[pos]; - } - - @Override - public TimestampData getTimestamp(int pos, int precision) { - return (TimestampData) values[pos]; - } - - @SuppressWarnings("unchecked") - @Override - public RawValueData getRawValue(int pos) { - return (RawValueData) values[pos]; - } - - @Override - public byte[] getBinary(int ordinal) { - return (byte[]) values[ordinal]; - } - - @Override - public ArrayData getArray(int ordinal) { - return (ArrayData) values[ordinal]; - } - - @Override - public MapData getMap(int ordinal) { - return (MapData) values[ordinal]; - } - - @Override - public RowData getRow(int pos, int numFields) { - return (RowData) values[pos]; - } - - @Override - public boolean[] toBooleanArray() { - return ArrayUtil.toPrimitive((Boolean[]) values); - } - - @Override - public byte[] toByteArray() { - return ArrayUtil.toPrimitive((Byte[]) values); - } - - @Override - public short[] toShortArray() { - return ArrayUtil.toPrimitive((Short[]) values); - } - - @Override - public int[] toIntArray() { - return ArrayUtil.toPrimitive((Integer[]) values); - } - - @Override - public long[] toLongArray() { - return ArrayUtil.toPrimitive((Long[]) values); - } - - @Override - public float[] toFloatArray() { - return ArrayUtil.toPrimitive((Float[]) values); - } - - 
@Override - public double[] toDoubleArray() { - return ArrayUtil.toPrimitive((Double[]) values); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetWriters.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetWriters.java deleted file mode 100644 index 6407265d89..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetWriters.java +++ /dev/null @@ -1,599 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.iceberg.flink.data; - -import static org.apache.flink.table.types.logical.LogicalTypeRoot.TIMESTAMP_WITHOUT_TIME_ZONE; - -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.LogicalTypeRoot; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.RowType.RowField; -import org.apache.flink.table.types.logical.SmallIntType; -import org.apache.flink.table.types.logical.TinyIntType; -import org.apache.iceberg.parquet.AdaptHivePrimitiveWriter; -import org.apache.iceberg.parquet.ParquetValueReaders; -import org.apache.iceberg.parquet.ParquetValueWriter; -import org.apache.iceberg.parquet.ParquetValueWriters; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.DecimalUtil; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.io.api.Binary; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; - -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.time.Instant; -import java.time.ZoneId; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.NoSuchElementException; -import java.util.concurrent.TimeUnit; - -/** - 
* Copy from iceberg {@link FlinkParquetWriters} to support int96 type and use {@link - * AdaptHiveParquetWithFlinkSchemaVisitor}. - */ -public class AdaptHiveFlinkParquetWriters { - private AdaptHiveFlinkParquetWriters() {} - - @SuppressWarnings("unchecked") - public static ParquetValueWriter buildWriter(LogicalType schema, MessageType type) { - return (ParquetValueWriter) - AdaptHiveParquetWithFlinkSchemaVisitor.visit(schema, type, new WriteBuilder(type)); - } - - private static class WriteBuilder - extends AdaptHiveParquetWithFlinkSchemaVisitor> { - private final MessageType type; - - WriteBuilder(MessageType type) { - this.type = type; - } - - @Override - public ParquetValueWriter message( - RowType rowType, MessageType message, List> fields) { - return struct(rowType, message.asGroupType(), fields); - } - - @Override - public ParquetValueWriter struct( - RowType rowType, GroupType struct, List> fieldWriters) { - List fields = struct.getFields(); - List flinkFields = rowType.getFields(); - List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); - List flinkTypes = Lists.newArrayList(); - for (int i = 0; i < fields.size(); i += 1) { - writers.add(newOption(struct.getType(i), fieldWriters.get(i))); - flinkTypes.add(flinkFields.get(i).getType()); - } - - return new RowDataWriter(writers, flinkTypes); - } - - @Override - public ParquetValueWriter list( - ArrayType arrayType, GroupType array, ParquetValueWriter elementWriter) { - GroupType repeated = array.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath); - int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - - return new ArrayDataWriter<>( - repeatedD, - repeatedR, - newOption(repeated.getType(0), elementWriter), - arrayType.getElementType()); - } - - @Override - public ParquetValueWriter map( - MapType mapType, - GroupType map, - ParquetValueWriter keyWriter, - ParquetValueWriter valueWriter) { - GroupType 
repeatedKeyValue = map.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath); - int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - - return new MapDataWriter<>( - repeatedD, - repeatedR, - newOption(repeatedKeyValue.getType(0), keyWriter), - newOption(repeatedKeyValue.getType(1), valueWriter), - mapType.getKeyType(), - mapType.getValueType()); - } - - private ParquetValueWriter newOption(Type fieldType, ParquetValueWriter writer) { - int maxD = type.getMaxDefinitionLevel(path(fieldType.getName())); - return ParquetValueWriters.option(fieldType, maxD, writer); - } - - @Override - public ParquetValueWriter primitive(LogicalType logicalType, PrimitiveType primitive) { - ColumnDescriptor desc = type.getColumnDescription(currentPath()); - - if (primitive.getOriginalType() != null) { - switch (primitive.getOriginalType()) { - case ENUM: - case JSON: - case UTF8: - return strings(desc); - case DATE: - case INT_8: - case INT_16: - case INT_32: - return ints(logicalType, desc); - case INT_64: - return ParquetValueWriters.longs(desc); - case TIME_MICROS: - return timeMicros(desc); - case TIMESTAMP_MICROS: - return timestamps(desc); - case DECIMAL: - DecimalLogicalTypeAnnotation decimal = - (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); - switch (primitive.getPrimitiveTypeName()) { - case INT32: - return decimalAsInteger(desc, decimal.getPrecision(), decimal.getScale()); - case INT64: - return decimalAsLong(desc, decimal.getPrecision(), decimal.getScale()); - case BINARY: - case FIXED_LEN_BYTE_ARRAY: - return decimalAsFixed(desc, decimal.getPrecision(), decimal.getScale()); - default: - throw new UnsupportedOperationException( - "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); - } - case BSON: - return byteArrays(desc); - default: - throw new UnsupportedOperationException( - "Unsupported logical type: " + primitive.getOriginalType()); - } 
- } - - switch (primitive.getPrimitiveTypeName()) { - case FIXED_LEN_BYTE_ARRAY: - case BINARY: - return byteArrays(desc); - case BOOLEAN: - return ParquetValueWriters.booleans(desc); - case INT32: - return ints(logicalType, desc); - case INT64: - return ParquetValueWriters.longs(desc); - case INT96: - LogicalTypeRoot typeRoot = logicalType.getTypeRoot(); - if (typeRoot == TIMESTAMP_WITHOUT_TIME_ZONE) { - return new TimestampInt96Writer(desc); - } else { - return new TimestampTZInt96Writer(desc); - } - case FLOAT: - return ParquetValueWriters.floats(desc); - case DOUBLE: - return ParquetValueWriters.doubles(desc); - default: - throw new UnsupportedOperationException("Unsupported type: " + primitive); - } - } - } - - private static class TimestampTZInt96Writer extends AdaptHivePrimitiveWriter { - - private static final long JULIAN_DAY_OF_EPOCH = 2440588L; - private static final long MICROS_PER_DAY = 86400000000L; - private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1); - private static final long NANOS_PER_MILLISECOND = TimeUnit.MILLISECONDS.toNanos(1); - - public TimestampTZInt96Writer(ColumnDescriptor descriptor) { - super(descriptor); - } - - /** Writes nano timestamps to parquet int96 */ - void writeBinary(int repetitionLevel, int julianDay, long nanosOfDay) { - ByteBuffer buf = ByteBuffer.allocate(12); - buf.order(ByteOrder.LITTLE_ENDIAN); - buf.putLong(nanosOfDay); - buf.putInt(julianDay); - buf.flip(); - column.writeBinary(repetitionLevel, Binary.fromConstantByteBuffer(buf)); - } - - void writeInstant(int repetitionLevel, Instant instant) { - long timestamp = instant.toEpochMilli(); - int julianDay = (int) (timestamp / MILLIS_IN_DAY + 2440588L); - long nanosOfDay = - timestamp % MILLIS_IN_DAY * NANOS_PER_MILLISECOND - + instant.getNano() % NANOS_PER_MILLISECOND; - writeBinary(repetitionLevel, julianDay, nanosOfDay); - } - - @Override - public void write(int repetitionLevel, TimestampData value) { - writeInstant(repetitionLevel, 
value.toInstant()); - } - } - - private static class TimestampInt96Writer extends AdaptHivePrimitiveWriter { - - private static final long JULIAN_DAY_OF_EPOCH = 2440588L; - private static final long MICROS_PER_DAY = 86400000000L; - private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1); - private static final long NANOS_PER_MILLISECOND = TimeUnit.MILLISECONDS.toNanos(1); - - public TimestampInt96Writer(ColumnDescriptor descriptor) { - super(descriptor); - } - - /** Writes nano timestamps to parquet int96 */ - void writeBinary(int repetitionLevel, int julianDay, long nanosOfDay) { - ByteBuffer buf = ByteBuffer.allocate(12); - buf.order(ByteOrder.LITTLE_ENDIAN); - buf.putLong(nanosOfDay); - buf.putInt(julianDay); - buf.flip(); - column.writeBinary(repetitionLevel, Binary.fromConstantByteBuffer(buf)); - } - - void writeInstant(int repetitionLevel, Instant instant) { - long timestamp = instant.toEpochMilli(); - int julianDay = (int) (timestamp / MILLIS_IN_DAY + 2440588L); - long nanosOfDay = - timestamp % MILLIS_IN_DAY * NANOS_PER_MILLISECOND - + instant.getNano() % NANOS_PER_MILLISECOND; - writeBinary(repetitionLevel, julianDay, nanosOfDay); - } - - @Override - public void write(int repetitionLevel, TimestampData value) { - writeInstant( - repetitionLevel, value.toLocalDateTime().atZone(ZoneId.systemDefault()).toInstant()); - } - } - - private static ParquetValueWriters.PrimitiveWriter ints( - LogicalType type, ColumnDescriptor desc) { - if (type instanceof TinyIntType) { - return ParquetValueWriters.tinyints(desc); - } else if (type instanceof SmallIntType) { - return ParquetValueWriters.shorts(desc); - } - return ParquetValueWriters.ints(desc); - } - - private static ParquetValueWriters.PrimitiveWriter strings(ColumnDescriptor desc) { - return new StringDataWriter(desc); - } - - private static ParquetValueWriters.PrimitiveWriter timeMicros(ColumnDescriptor desc) { - return new TimeMicrosWriter(desc); - } - - private static 
ParquetValueWriters.PrimitiveWriter decimalAsInteger( - ColumnDescriptor desc, int precision, int scale) { - Preconditions.checkArgument( - precision <= 9, - "Cannot write decimal value as integer with precision larger than 9," - + " wrong precision %s", - precision); - return new IntegerDecimalWriter(desc, precision, scale); - } - - private static ParquetValueWriters.PrimitiveWriter decimalAsLong( - ColumnDescriptor desc, int precision, int scale) { - Preconditions.checkArgument( - precision <= 18, - "Cannot write decimal value as long with precision larger than 18, " - + " wrong precision %s", - precision); - return new LongDecimalWriter(desc, precision, scale); - } - - private static ParquetValueWriters.PrimitiveWriter decimalAsFixed( - ColumnDescriptor desc, int precision, int scale) { - return new FixedDecimalWriter(desc, precision, scale); - } - - private static ParquetValueWriters.PrimitiveWriter timestamps( - ColumnDescriptor desc) { - return new TimestampDataWriter(desc); - } - - private static ParquetValueWriters.PrimitiveWriter byteArrays(ColumnDescriptor desc) { - return new ByteArrayWriter(desc); - } - - private static class StringDataWriter extends ParquetValueWriters.PrimitiveWriter { - private StringDataWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, StringData value) { - column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(value.toBytes())); - } - } - - private static class TimeMicrosWriter extends ParquetValueWriters.PrimitiveWriter { - private TimeMicrosWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, Integer value) { - long micros = value.longValue() * 1000; - column.writeLong(repetitionLevel, micros); - } - } - - private static class IntegerDecimalWriter - extends ParquetValueWriters.PrimitiveWriter { - private final int precision; - private final int scale; - - private IntegerDecimalWriter(ColumnDescriptor desc, int 
precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public void write(int repetitionLevel, DecimalData decimal) { - Preconditions.checkArgument( - decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", - precision, - scale, - decimal); - Preconditions.checkArgument( - decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", - precision, - scale, - decimal); - - column.writeInteger(repetitionLevel, (int) decimal.toUnscaledLong()); - } - } - - private static class LongDecimalWriter extends ParquetValueWriters.PrimitiveWriter { - private final int precision; - private final int scale; - - private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public void write(int repetitionLevel, DecimalData decimal) { - Preconditions.checkArgument( - decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", - precision, - scale, - decimal); - Preconditions.checkArgument( - decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", - precision, - scale, - decimal); - - column.writeLong(repetitionLevel, decimal.toUnscaledLong()); - } - } - - private static class FixedDecimalWriter extends ParquetValueWriters.PrimitiveWriter { - private final int precision; - private final int scale; - private final ThreadLocal bytes; - - private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - this.bytes = - ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); - } - - @Override - public void write(int repetitionLevel, DecimalData decimal) { - byte[] binary = - DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toBigDecimal(), bytes.get()); - column.writeBinary(repetitionLevel, 
Binary.fromReusedByteArray(binary)); - } - } - - private static class TimestampDataWriter - extends ParquetValueWriters.PrimitiveWriter { - private TimestampDataWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, TimestampData value) { - column.writeLong( - repetitionLevel, value.getMillisecond() * 1000 + value.getNanoOfMillisecond() / 1000); - } - } - - private static class ByteArrayWriter extends ParquetValueWriters.PrimitiveWriter { - private ByteArrayWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, byte[] bytes) { - column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(bytes)); - } - } - - private static class ArrayDataWriter extends ParquetValueWriters.RepeatedWriter { - private final LogicalType elementType; - - private ArrayDataWriter( - int definitionLevel, - int repetitionLevel, - ParquetValueWriter writer, - LogicalType elementType) { - super(definitionLevel, repetitionLevel, writer); - this.elementType = elementType; - } - - @Override - protected Iterator elements(ArrayData list) { - return new ElementIterator<>(list); - } - - private class ElementIterator implements Iterator { - private final int size; - private final ArrayData list; - private final ArrayData.ElementGetter getter; - private int index; - - private ElementIterator(ArrayData list) { - this.list = list; - size = list.size(); - getter = ArrayData.createElementGetter(elementType); - index = 0; - } - - @Override - public boolean hasNext() { - return index != size; - } - - @Override - @SuppressWarnings("unchecked") - public E next() { - if (index >= size) { - throw new NoSuchElementException(); - } - - E element = (E) getter.getElementOrNull(list, index); - index += 1; - - return element; - } - } - } - - private static class MapDataWriter - extends ParquetValueWriters.RepeatedKeyValueWriter { - private final LogicalType keyType; - private final LogicalType valueType; - - 
private MapDataWriter( - int definitionLevel, - int repetitionLevel, - ParquetValueWriter keyWriter, - ParquetValueWriter valueWriter, - LogicalType keyType, - LogicalType valueType) { - super(definitionLevel, repetitionLevel, keyWriter, valueWriter); - this.keyType = keyType; - this.valueType = valueType; - } - - @Override - protected Iterator> pairs(MapData map) { - return new EntryIterator<>(map); - } - - private class EntryIterator implements Iterator> { - private final int size; - private final ArrayData keys; - private final ArrayData values; - private final ParquetValueReaders.ReusableEntry entry; - private final ArrayData.ElementGetter keyGetter; - private final ArrayData.ElementGetter valueGetter; - private int index; - - private EntryIterator(MapData map) { - size = map.size(); - keys = map.keyArray(); - values = map.valueArray(); - entry = new ParquetValueReaders.ReusableEntry<>(); - keyGetter = ArrayData.createElementGetter(keyType); - valueGetter = ArrayData.createElementGetter(valueType); - index = 0; - } - - @Override - public boolean hasNext() { - return index != size; - } - - @Override - @SuppressWarnings("unchecked") - public Map.Entry next() { - if (index >= size) { - throw new NoSuchElementException(); - } - - entry.set( - (K) keyGetter.getElementOrNull(keys, index), - (V) valueGetter.getElementOrNull(values, index)); - index += 1; - - return entry; - } - } - } - - private static class RowDataWriter extends ParquetValueWriters.StructWriter { - private final RowData.FieldGetter[] fieldGetter; - - RowDataWriter(List> writers, List types) { - super(writers); - fieldGetter = new RowData.FieldGetter[types.size()]; - for (int i = 0; i < types.size(); i += 1) { - fieldGetter[i] = RowData.createFieldGetter(types.get(i), i); - } - } - - @Override - protected Object get(RowData struct, int index) { - return fieldGetter[index].getFieldOrNull(struct); - } - } -} diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveParquetWithFlinkSchemaVisitor.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveParquetWithFlinkSchemaVisitor.java deleted file mode 100644 index 34099c47d3..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/data/AdaptHiveParquetWithFlinkSchemaVisitor.java +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.iceberg.flink.data; - -import org.apache.amoro.shade.guava32.com.google.common.base.Preconditions; -import org.apache.amoro.shade.guava32.com.google.common.collect.Lists; -import org.apache.flink.table.types.logical.ArrayType; -import org.apache.flink.table.types.logical.LogicalType; -import org.apache.flink.table.types.logical.MapType; -import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.RowType.RowField; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.OriginalType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; - -import java.util.Deque; -import java.util.List; - -/** - * Copy from iceberg {@link ParquetWithFlinkSchemaVisitor}. see annotation "Change For mixed-format" - */ -public class AdaptHiveParquetWithFlinkSchemaVisitor { - private final Deque fieldNames = Lists.newLinkedList(); - - public static T visit( - LogicalType sType, Type type, AdaptHiveParquetWithFlinkSchemaVisitor visitor) { - Preconditions.checkArgument(sType != null, "Invalid DataType: null"); - if (type instanceof MessageType) { - Preconditions.checkArgument( - sType instanceof RowType, "Invalid struct: %s is not a struct", sType); - RowType struct = (RowType) sType; - return visitor.message( - struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); - } else if (type.isPrimitive()) { - return visitor.primitive(sType, type.asPrimitiveType()); - } else { - // if not a primitive, the typeId must be a group - GroupType group = type.asGroupType(); - OriginalType annotation = group.getOriginalType(); - if (annotation != null) { - switch (annotation) { - case LIST: - Preconditions.checkArgument( - !group.isRepetition(Type.Repetition.REPEATED), - "Invalid list: top-level group is repeated: %s", - group); - Preconditions.checkArgument( - group.getFieldCount() == 1, - "Invalid list: does not 
contain single repeated field: %s", - group); - - GroupType repeatedElement = group.getFields().get(0).asGroupType(); - Preconditions.checkArgument( - repeatedElement.isRepetition(Type.Repetition.REPEATED), - "Invalid list: inner group is not repeated"); - Preconditions.checkArgument( - repeatedElement.getFieldCount() <= 1, - "Invalid list: repeated group is not a single field: %s", - group); - - Preconditions.checkArgument( - sType instanceof ArrayType, "Invalid list: %s is not an array", sType); - ArrayType array = (ArrayType) sType; - RowField element = - new RowField( - "element", array.getElementType(), "element of " + array.asSummaryString()); - - visitor.fieldNames.push(repeatedElement.getName()); - try { - T elementResult = null; - if (repeatedElement.getFieldCount() > 0) { - elementResult = visitField(element, repeatedElement.getType(0), visitor); - } - - return visitor.list(array, group, elementResult); - - } finally { - visitor.fieldNames.pop(); - } - - case MAP: - Preconditions.checkArgument( - !group.isRepetition(Type.Repetition.REPEATED), - "Invalid map: top-level group is repeated: %s", - group); - Preconditions.checkArgument( - group.getFieldCount() == 1, - "Invalid map: does not contain single repeated field: %s", - group); - - GroupType repeatedKeyValue = group.getType(0).asGroupType(); - Preconditions.checkArgument( - repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), - "Invalid map: inner group is not repeated"); - Preconditions.checkArgument( - repeatedKeyValue.getFieldCount() <= 2, - "Invalid map: repeated group does not have 2 fields"); - - Preconditions.checkArgument( - sType instanceof MapType, "Invalid map: %s is not a map", sType); - MapType map = (MapType) sType; - RowField keyField = - new RowField("key", map.getKeyType(), "key of " + map.asSummaryString()); - RowField valueField = - new RowField("value", map.getValueType(), "value of " + map.asSummaryString()); - - visitor.fieldNames.push(repeatedKeyValue.getName()); - try { - T 
keyResult = null; - T valueResult = null; - switch (repeatedKeyValue.getFieldCount()) { - case 2: - // if there are 2 fields, both key and value are projected - keyResult = visitField(keyField, repeatedKeyValue.getType(0), visitor); - valueResult = visitField(valueField, repeatedKeyValue.getType(1), visitor); - break; - case 1: - // if there is just one, use the name to determine what it is - Type keyOrValue = repeatedKeyValue.getType(0); - if (keyOrValue.getName().equalsIgnoreCase("key")) { - keyResult = visitField(keyField, keyOrValue, visitor); - // value result remains null - } else { - valueResult = visitField(valueField, keyOrValue, visitor); - // key result remains null - } - break; - default: - // both results will remain null - } - - return visitor.map(map, group, keyResult, valueResult); - - } finally { - visitor.fieldNames.pop(); - } - - default: - } - } - Preconditions.checkArgument( - sType instanceof RowType, "Invalid struct: %s is not a struct", sType); - RowType struct = (RowType) sType; - return visitor.struct(struct, group, visitFields(struct, group, visitor)); - } - } - - private static T visitField( - RowField sField, Type field, AdaptHiveParquetWithFlinkSchemaVisitor visitor) { - visitor.fieldNames.push(field.getName()); - try { - return visit(sField.getType(), field, visitor); - } finally { - visitor.fieldNames.pop(); - } - } - - private static List visitFields( - RowType struct, GroupType group, AdaptHiveParquetWithFlinkSchemaVisitor visitor) { - List sFields = struct.getFields(); - Preconditions.checkArgument( - sFields.size() == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); - List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); - for (int i = 0; i < sFields.size(); i += 1) { - Type field = group.getFields().get(i); - RowField sField = sFields.get(i); - - // Change for mixed-format table ⬇ - // 
Preconditions.checkArgument(field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.getName())), - // "Structs do not match: field %s != %s", field.getName(), sField.getName()); - Preconditions.checkArgument( - field.getName().equals(sField.getName()), - "Structs do not match: field %s != %s", - field.getName(), - sField.getName()); - // Change for mixed-format table ⬆ - - results.add(visitField(sField, field, visitor)); - } - - return results; - } - - public T message(RowType sStruct, MessageType message, List fields) { - return null; - } - - public T struct(RowType sStruct, GroupType struct, List fields) { - return null; - } - - public T list(ArrayType sArray, GroupType array, T element) { - return null; - } - - public T map(MapType sMap, GroupType map, T key, T value) { - return null; - } - - public T primitive(LogicalType sPrimitive, PrimitiveType primitive) { - return null; - } - - protected String[] currentPath() { - return Lists.newArrayList(fieldNames.descendingIterator()).toArray(new String[0]); - } - - protected String[] path(String name) { - List list = Lists.newArrayList(fieldNames.descendingIterator()); - list.add(name); - return list.toArray(new String[0]); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java deleted file mode 100644 index 599d4cbb2a..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java +++ /dev/null @@ -1,247 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.iceberg.flink.source; - -import org.apache.amoro.shade.guava32.com.google.common.collect.ImmutableMap; -import org.apache.amoro.shade.guava32.com.google.common.collect.Sets; -import org.apache.flink.annotation.Internal; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.data.DeleteFilter; -import org.apache.iceberg.encryption.InputFilesDecryptor; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.FlinkSourceFilter; -import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.data.AdaptHiveFlinkParquetReaders; -import org.apache.iceberg.flink.data.FlinkAvroReader; -import org.apache.iceberg.flink.data.FlinkOrcReader; -import org.apache.iceberg.flink.data.RowDataProjection; -import org.apache.iceberg.flink.data.RowDataUtil; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.CloseableIterator; -import 
org.apache.iceberg.io.InputFile; -import org.apache.iceberg.mapping.NameMappingParser; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.PartitionUtil; - -import java.util.List; -import java.util.Map; - -/** Copy from iceberg. Adopt AdaptHiveFlinkParquetReaders to adapt hive flink parquet readers. */ -@Internal -public class RowDataFileScanTaskReader implements FileScanTaskReader { - - private final Schema tableSchema; - private final Schema projectedSchema; - private final String nameMapping; - private final boolean caseSensitive; - private final FlinkSourceFilter rowFilter; - - public RowDataFileScanTaskReader( - Schema tableSchema, - Schema projectedSchema, - String nameMapping, - boolean caseSensitive, - List filters) { - this.tableSchema = tableSchema; - this.projectedSchema = projectedSchema; - this.nameMapping = nameMapping; - this.caseSensitive = caseSensitive; - - if (filters != null && !filters.isEmpty()) { - Expression combinedExpression = - filters.stream().reduce(Expressions.alwaysTrue(), Expressions::and); - this.rowFilter = - new FlinkSourceFilter(this.projectedSchema, combinedExpression, this.caseSensitive); - } else { - this.rowFilter = null; - } - } - - @Override - public CloseableIterator open( - FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { - Schema partitionSchema = TypeUtil.select(projectedSchema, task.spec().identitySourceIds()); - - Map idToConstant = - partitionSchema.columns().isEmpty() - ? ImmutableMap.of() - : PartitionUtil.constantsMap(task, RowDataUtil::convertConstant); - - FlinkDeleteFilter deletes = - new FlinkDeleteFilter(task, tableSchema, projectedSchema, inputFilesDecryptor); - CloseableIterable iterable = - deletes.filter( - newIterable(task, deletes.requiredSchema(), idToConstant, inputFilesDecryptor)); - - // Project the RowData to remove the extra meta columns. 
- if (!projectedSchema.sameSchema(deletes.requiredSchema())) { - RowDataProjection rowDataProjection = - RowDataProjection.create( - deletes.requiredRowType(), - deletes.requiredSchema().asStruct(), - projectedSchema.asStruct()); - iterable = CloseableIterable.transform(iterable, rowDataProjection::wrap); - } - - return iterable.iterator(); - } - - private CloseableIterable newIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - CloseableIterable iter; - if (task.isDataTask()) { - throw new UnsupportedOperationException("Cannot read data task."); - } else { - switch (task.file().format()) { - case PARQUET: - iter = newParquetIterable(task, schema, idToConstant, inputFilesDecryptor); - break; - - case AVRO: - iter = newAvroIterable(task, schema, idToConstant, inputFilesDecryptor); - break; - - case ORC: - iter = newOrcIterable(task, schema, idToConstant, inputFilesDecryptor); - break; - - default: - throw new UnsupportedOperationException( - "Cannot read unknown format: " + task.file().format()); - } - } - - if (rowFilter != null) { - return CloseableIterable.filter(iter, rowFilter::filter); - } - return iter; - } - - private CloseableIterable newAvroIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - Avro.ReadBuilder builder = - Avro.read(inputFilesDecryptor.getInputFile(task)) - .reuseContainers() - .project(schema) - .split(task.start(), task.length()) - .createReaderFunc(readSchema -> new FlinkAvroReader(schema, readSchema, idToConstant)); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private CloseableIterable newParquetIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - Parquet.ReadBuilder builder = - Parquet.read(inputFilesDecryptor.getInputFile(task)) - .split(task.start(), 
task.length()) - .project(schema) - // Change for mixed-format table - .createReaderFunc( - fileSchema -> - AdaptHiveFlinkParquetReaders.buildReader(schema, fileSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive) - .reuseContainers(); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private CloseableIterable newOrcIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - Schema readSchemaWithoutConstantAndMetadataFields = - TypeUtil.selectNot( - schema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); - - ORC.ReadBuilder builder = - ORC.read(inputFilesDecryptor.getInputFile(task)) - .project(readSchemaWithoutConstantAndMetadataFields) - .split(task.start(), task.length()) - .createReaderFunc( - readOrcSchema -> new FlinkOrcReader(schema, readOrcSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private static class FlinkDeleteFilter extends DeleteFilter { - private final RowType requiredRowType; - private final RowDataWrapper asStructLike; - private final InputFilesDecryptor inputFilesDecryptor; - - FlinkDeleteFilter( - FileScanTask task, - Schema tableSchema, - Schema requestedSchema, - InputFilesDecryptor inputFilesDecryptor) { - super(task.file().path().toString(), task.deletes(), tableSchema, requestedSchema); - this.requiredRowType = FlinkSchemaUtil.convert(requiredSchema()); - this.asStructLike = new RowDataWrapper(requiredRowType, requiredSchema().asStruct()); - this.inputFilesDecryptor = inputFilesDecryptor; - } - - public RowType requiredRowType() { - return requiredRowType; - } - - @Override - protected StructLike asStructLike(RowData row) { - return asStructLike.wrap(row); - } - - @Override 
- protected InputFile getInputFile(String location) { - return inputFilesDecryptor.getInputFile(location); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/source/ScanContext.java deleted file mode 100644 index c3ff9bea62..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge-1.17/src/main/java/org/apache/iceberg/flink/source/ScanContext.java +++ /dev/null @@ -1,707 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.iceberg.flink.source; - -import org.apache.flink.configuration.ConfigOption; -import org.apache.flink.configuration.ConfigOptions; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.util.Preconditions; -import org.apache.flink.util.TimeUtils; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.flink.FlinkConfigOptions; -import org.apache.iceberg.flink.FlinkReadConf; -import org.apache.iceberg.flink.FlinkReadOptions; - -import java.io.Serializable; -import java.time.Duration; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.concurrent.TimeUnit; - -/** - * Copy from Iceberg {@link ScanContext}. only change line 115 and expand the modifier. Context - * object with optional arguments for a Flink Scan. - */ -public class ScanContext implements Serializable { - - private static final long serialVersionUID = 1L; - - public static final ConfigOption SNAPSHOT_ID = - ConfigOptions.key("snapshot-id") - .longType() - .defaultValue(null) - .withDescription( - "Retrieve the full data of the specified snapshot by ID, used for batch scan mode"); - - public static final ConfigOption TAG = - ConfigOptions.key("tag").stringType().defaultValue(null); - - public static final ConfigOption BRANCH = - ConfigOptions.key("branch").stringType().defaultValue(null); - - public static final ConfigOption START_TAG = - ConfigOptions.key("start-tag").stringType().defaultValue(null); - - public static final ConfigOption END_TAG = - ConfigOptions.key("end-tag").stringType().defaultValue(null); - - public static final ConfigOption CASE_SENSITIVE = - ConfigOptions.key("case-sensitive") - .booleanType() - .defaultValue(false) - .withDescription("Set if column names are case-sensitive"); - - public static final ConfigOption AS_OF_TIMESTAMP = - ConfigOptions.key("as-of-timestamp") - .longType() - 
.defaultValue(null) - .withDescription( - "Retrieve the full data of the specified snapshot at the given timestamp, " - + "used for batch scan mode"); - - public static final ConfigOption STARTING_STRATEGY = - ConfigOptions.key("starting-strategy") - .enumType(StreamingStartingStrategy.class) - .defaultValue(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .withDescription("Specific the starting strategy for streaming execution"); - - public static final ConfigOption START_SNAPSHOT_TIMESTAMP = - ConfigOptions.key("start-snapshot-timestamp") - .longType() - .defaultValue(null) - .withDescription("Specific the snapshot timestamp that streaming job starts from"); - - public static final ConfigOption START_SNAPSHOT_ID = - ConfigOptions.key("start-snapshot-id") - .longType() - .defaultValue(null) - .withDescription("Specific the snapshot id that streaming job starts from"); - - public static final ConfigOption END_SNAPSHOT_ID = - ConfigOptions.key("end-snapshot-id") - .longType() - .defaultValue(null) - .withDescription("Specific the snapshot id that streaming job to end"); - - public static final ConfigOption SPLIT_SIZE = - ConfigOptions.key("split-size") - .longType() - .defaultValue(null) - .withDescription("Specific the target size when combining data input splits"); - - public static final ConfigOption SPLIT_LOOKBACK = - ConfigOptions.key("split-lookback") - .intType() - .defaultValue(null) - .withDescription("Specify the number of bins to consider when combining input splits"); - - public static final ConfigOption SPLIT_FILE_OPEN_COST = - ConfigOptions.key("split-file-open-cost") - .longType() - .defaultValue(null) - .withDescription( - "The estimated cost to open a file, used as a minimum weight when combining splits"); - - public static final ConfigOption STREAMING = - ConfigOptions.key("streaming") - .booleanType() - .defaultValue(true) - .withDescription("Set if job is bounded or unbounded"); - - public static final ConfigOption MONITOR_INTERVAL = 
- ConfigOptions.key("monitor-interval") - .durationType() - .defaultValue(Duration.ofSeconds(10)) - .withDescription( - "Specify the time interval for consecutively monitoring newly committed data files"); - - public static final ConfigOption INCLUDE_COLUMN_STATS = - ConfigOptions.key("include-column-stats") - .booleanType() - .defaultValue(false) - .withDescription("Set if loads the column stats with each file"); - - public static final ConfigOption MAX_PLANNING_SNAPSHOT_COUNT = - ConfigOptions.key("max-planning-snapshot-count") - .intType() - .defaultValue(Integer.MAX_VALUE) - .withDescription("Specify the max planning snapshot count"); - - public static final ConfigOption LIMIT_OPTION = - ConfigOptions.key("limit").longType().defaultValue(-1L); - - public static final ConfigOption MAX_ALLOWED_PLANNING_FAILURES_OPTION = - ConfigOptions.key("max-allowed-planning-failures").intType().defaultValue(3); - - protected final boolean caseSensitive; - protected final boolean exposeLocality; - protected final Long snapshotId; - protected final String branch; - protected final String tag; - protected final StreamingStartingStrategy startingStrategy; - protected final Long startSnapshotId; - protected final Long startSnapshotTimestamp; - protected final Long endSnapshotId; - protected final Long asOfTimestamp; - protected final String startTag; - protected final String endTag; - protected final Long splitSize; - protected final Integer splitLookback; - protected final Long splitOpenFileCost; - protected final boolean isStreaming; - protected final Duration monitorInterval; - - protected final String nameMapping; - protected final Schema schema; - protected final List filters; - protected final long limit; - protected final boolean includeColumnStats; - protected final Collection includeStatsForColumns; - protected final Integer planParallelism; - protected final int maxPlanningSnapshotCount; - protected final int maxAllowedPlanningFailures; - protected final String 
watermarkColumn; - protected final TimeUnit watermarkColumnTimeUnit; - - protected ScanContext( - boolean caseSensitive, - Long snapshotId, - StreamingStartingStrategy startingStrategy, - Long startSnapshotTimestamp, - Long startSnapshotId, - Long endSnapshotId, - Long asOfTimestamp, - Long splitSize, - Integer splitLookback, - Long splitOpenFileCost, - boolean isStreaming, - Duration monitorInterval, - String nameMapping, - Schema schema, - List filters, - long limit, - boolean includeColumnStats, - Collection includeStatsForColumns, - boolean exposeLocality, - Integer planParallelism, - int maxPlanningSnapshotCount, - int maxAllowedPlanningFailures, - String watermarkColumn, - TimeUnit watermarkColumnTimeUnit, - String branch, - String tag, - String startTag, - String endTag) { - this.caseSensitive = caseSensitive; - this.snapshotId = snapshotId; - this.tag = tag; - this.branch = branch; - this.startingStrategy = startingStrategy; - this.startSnapshotTimestamp = startSnapshotTimestamp; - this.startSnapshotId = startSnapshotId; - this.endSnapshotId = endSnapshotId; - this.asOfTimestamp = asOfTimestamp; - this.startTag = startTag; - this.endTag = endTag; - this.splitSize = splitSize; - this.splitLookback = splitLookback; - this.splitOpenFileCost = splitOpenFileCost; - this.isStreaming = isStreaming; - this.monitorInterval = monitorInterval; - - this.nameMapping = nameMapping; - this.schema = schema; - this.filters = filters; - this.limit = limit; - this.includeColumnStats = includeColumnStats; - this.includeStatsForColumns = includeStatsForColumns; - this.exposeLocality = exposeLocality; - this.planParallelism = planParallelism; - this.maxPlanningSnapshotCount = maxPlanningSnapshotCount; - this.maxAllowedPlanningFailures = maxAllowedPlanningFailures; - this.watermarkColumn = watermarkColumn; - this.watermarkColumnTimeUnit = watermarkColumnTimeUnit; - } - - void validate() { - if (isStreaming) { - if (startingStrategy == 
StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) { - Preconditions.checkArgument( - startSnapshotId != null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); - Preconditions.checkArgument( - startSnapshotTimestamp == null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); - } - if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) { - Preconditions.checkArgument( - startSnapshotTimestamp != null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); - Preconditions.checkArgument( - startSnapshotId == null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); - } - - Preconditions.checkArgument( - tag == null, - String.format("Cannot scan table using ref %s configured for streaming reader", tag)); - Preconditions.checkArgument( - snapshotId == null, "Cannot set snapshot-id option for streaming reader"); - Preconditions.checkArgument( - asOfTimestamp == null, "Cannot set as-of-timestamp option for streaming reader"); - Preconditions.checkArgument( - endSnapshotId == null, "Cannot set end-snapshot-id option for streaming reader"); - Preconditions.checkArgument(endTag == null, "Cannot set end-tag option for streaming reader"); - } - Preconditions.checkArgument( - !(startTag != null && startSnapshotId() != null), - "START_SNAPSHOT_ID and START_TAG cannot both be set."); - - Preconditions.checkArgument( - !(endTag != null && endSnapshotId() != null), - "END_SNAPSHOT_ID and END_TAG cannot both be set."); - - Preconditions.checkArgument( - maxAllowedPlanningFailures >= -1, - "Cannot set maxAllowedPlanningFailures to a negative number other than -1."); - } - - public boolean caseSensitive() { - return caseSensitive; - } - - public Long snapshotId() { - return snapshotId; - } - - public String branch() { - return branch; - } - - public String tag() { - return tag; - } - - public String 
startTag() { - return startTag; - } - - public String endTag() { - return endTag; - } - - public StreamingStartingStrategy streamingStartingStrategy() { - return startingStrategy; - } - - public Long startSnapshotTimestamp() { - return startSnapshotTimestamp; - } - - public Long startSnapshotId() { - return startSnapshotId; - } - - public Long endSnapshotId() { - return endSnapshotId; - } - - public Long asOfTimestamp() { - return asOfTimestamp; - } - - public Long splitSize() { - return splitSize; - } - - public Integer splitLookback() { - return splitLookback; - } - - public Long splitOpenFileCost() { - return splitOpenFileCost; - } - - public boolean isStreaming() { - return isStreaming; - } - - public Duration monitorInterval() { - return monitorInterval; - } - - public String nameMapping() { - return nameMapping; - } - - public Schema project() { - return schema; - } - - public List filters() { - return filters; - } - - public long limit() { - return limit; - } - - public boolean includeColumnStats() { - return includeColumnStats; - } - - public Collection includeStatsForColumns() { - return includeStatsForColumns; - } - - public boolean exposeLocality() { - return exposeLocality; - } - - public Integer planParallelism() { - return planParallelism; - } - - public int maxPlanningSnapshotCount() { - return maxPlanningSnapshotCount; - } - - public int maxAllowedPlanningFailures() { - return maxAllowedPlanningFailures; - } - - public String watermarkColumn() { - return watermarkColumn; - } - - public TimeUnit watermarkColumnTimeUnit() { - return watermarkColumnTimeUnit; - } - - public ScanContext copyWithAppendsBetween(Long newStartSnapshotId, long newEndSnapshotId) { - return ScanContext.builder() - .caseSensitive(caseSensitive) - .useSnapshotId(null) - .useBranch(branch) - .useTag(null) - .startSnapshotId(newStartSnapshotId) - .endSnapshotId(newEndSnapshotId) - .startTag(null) - .endTag(null) - .asOfTimestamp(null) - .splitSize(splitSize) - 
.splitLookback(splitLookback) - .splitOpenFileCost(splitOpenFileCost) - .streaming(isStreaming) - .monitorInterval(monitorInterval) - .nameMapping(nameMapping) - .project(schema) - .filters(filters) - .limit(limit) - .includeColumnStats(includeColumnStats) - .includeColumnStats(includeStatsForColumns) - .exposeLocality(exposeLocality) - .planParallelism(planParallelism) - .maxPlanningSnapshotCount(maxPlanningSnapshotCount) - .maxAllowedPlanningFailures(maxAllowedPlanningFailures) - .watermarkColumn(watermarkColumn) - .watermarkColumnTimeUnit(watermarkColumnTimeUnit) - .build(); - } - - public ScanContext copyWithSnapshotId(long newSnapshotId) { - return ScanContext.builder() - .caseSensitive(caseSensitive) - .useSnapshotId(newSnapshotId) - .useBranch(branch) - .useTag(tag) - .startSnapshotId(null) - .endSnapshotId(null) - .startTag(null) - .endTag(null) - .asOfTimestamp(null) - .splitSize(splitSize) - .splitLookback(splitLookback) - .splitOpenFileCost(splitOpenFileCost) - .streaming(isStreaming) - .monitorInterval(monitorInterval) - .nameMapping(nameMapping) - .project(schema) - .filters(filters) - .limit(limit) - .includeColumnStats(includeColumnStats) - .includeColumnStats(includeStatsForColumns) - .exposeLocality(exposeLocality) - .planParallelism(planParallelism) - .maxPlanningSnapshotCount(maxPlanningSnapshotCount) - .maxAllowedPlanningFailures(maxAllowedPlanningFailures) - .watermarkColumn(watermarkColumn) - .watermarkColumnTimeUnit(watermarkColumnTimeUnit) - .build(); - } - - public static Builder builder() { - return new Builder(); - } - - public static class Builder { - private boolean caseSensitive = FlinkReadOptions.CASE_SENSITIVE_OPTION.defaultValue(); - private Long snapshotId = FlinkReadOptions.SNAPSHOT_ID.defaultValue(); - private String branch = FlinkReadOptions.BRANCH.defaultValue(); - private String tag = FlinkReadOptions.TAG.defaultValue(); - private String startTag = FlinkReadOptions.START_TAG.defaultValue(); - private String endTag = 
FlinkReadOptions.END_TAG.defaultValue(); - private StreamingStartingStrategy startingStrategy = - FlinkReadOptions.STARTING_STRATEGY_OPTION.defaultValue(); - private Long startSnapshotTimestamp = FlinkReadOptions.START_SNAPSHOT_TIMESTAMP.defaultValue(); - private Long startSnapshotId = FlinkReadOptions.START_SNAPSHOT_ID.defaultValue(); - private Long endSnapshotId = FlinkReadOptions.END_SNAPSHOT_ID.defaultValue(); - private Long asOfTimestamp = FlinkReadOptions.AS_OF_TIMESTAMP.defaultValue(); - private Long splitSize = FlinkReadOptions.SPLIT_SIZE_OPTION.defaultValue(); - private Integer splitLookback = FlinkReadOptions.SPLIT_LOOKBACK_OPTION.defaultValue(); - private Long splitOpenFileCost = FlinkReadOptions.SPLIT_FILE_OPEN_COST_OPTION.defaultValue(); - private boolean isStreaming = FlinkReadOptions.STREAMING_OPTION.defaultValue(); - private Duration monitorInterval = - TimeUtils.parseDuration(FlinkReadOptions.MONITOR_INTERVAL_OPTION.defaultValue()); - private String nameMapping; - private Schema projectedSchema; - private List filters; - private long limit = FlinkReadOptions.LIMIT_OPTION.defaultValue(); - private boolean includeColumnStats = - FlinkReadOptions.INCLUDE_COLUMN_STATS_OPTION.defaultValue(); - private Collection includeStatsForColumns = null; - private boolean exposeLocality; - private Integer planParallelism = - FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue(); - private int maxPlanningSnapshotCount = - FlinkReadOptions.MAX_PLANNING_SNAPSHOT_COUNT_OPTION.defaultValue(); - private int maxAllowedPlanningFailures = - FlinkReadOptions.MAX_ALLOWED_PLANNING_FAILURES_OPTION.defaultValue(); - private String watermarkColumn = FlinkReadOptions.WATERMARK_COLUMN_OPTION.defaultValue(); - private TimeUnit watermarkColumnTimeUnit = - FlinkReadOptions.WATERMARK_COLUMN_TIME_UNIT_OPTION.defaultValue(); - - private Builder() {} - - public Builder caseSensitive(boolean newCaseSensitive) { - this.caseSensitive = newCaseSensitive; - return this; - } - - 
public Builder useSnapshotId(Long newSnapshotId) { - this.snapshotId = newSnapshotId; - return this; - } - - public Builder useTag(String newTag) { - this.tag = newTag; - return this; - } - - public Builder useBranch(String newBranch) { - this.branch = newBranch; - return this; - } - - public Builder startingStrategy(StreamingStartingStrategy newStartingStrategy) { - this.startingStrategy = newStartingStrategy; - return this; - } - - public Builder startSnapshotTimestamp(Long newStartSnapshotTimestamp) { - this.startSnapshotTimestamp = newStartSnapshotTimestamp; - return this; - } - - public Builder startSnapshotId(Long newStartSnapshotId) { - this.startSnapshotId = newStartSnapshotId; - return this; - } - - public Builder endSnapshotId(Long newEndSnapshotId) { - this.endSnapshotId = newEndSnapshotId; - return this; - } - - public Builder startTag(String newStartTag) { - this.startTag = newStartTag; - return this; - } - - public Builder endTag(String newEndTag) { - this.endTag = newEndTag; - return this; - } - - public Builder asOfTimestamp(Long newAsOfTimestamp) { - this.asOfTimestamp = newAsOfTimestamp; - return this; - } - - public Builder splitSize(Long newSplitSize) { - this.splitSize = newSplitSize; - return this; - } - - public Builder splitLookback(Integer newSplitLookback) { - this.splitLookback = newSplitLookback; - return this; - } - - public Builder splitOpenFileCost(Long newSplitOpenFileCost) { - this.splitOpenFileCost = newSplitOpenFileCost; - return this; - } - - public Builder streaming(boolean streaming) { - this.isStreaming = streaming; - return this; - } - - public Builder monitorInterval(Duration newMonitorInterval) { - this.monitorInterval = newMonitorInterval; - return this; - } - - public Builder nameMapping(String newNameMapping) { - this.nameMapping = newNameMapping; - return this; - } - - public Builder project(Schema newProjectedSchema) { - this.projectedSchema = newProjectedSchema; - return this; - } - - public Builder filters(List 
newFilters) { - this.filters = newFilters; - return this; - } - - public Builder limit(long newLimit) { - this.limit = newLimit; - return this; - } - - public Builder includeColumnStats(boolean newIncludeColumnStats) { - this.includeColumnStats = newIncludeColumnStats; - return this; - } - - public Builder includeColumnStats(Collection newIncludeStatsForColumns) { - this.includeStatsForColumns = newIncludeStatsForColumns; - return this; - } - - public Builder exposeLocality(boolean newExposeLocality) { - this.exposeLocality = newExposeLocality; - return this; - } - - public Builder planParallelism(Integer parallelism) { - this.planParallelism = parallelism; - return this; - } - - public Builder maxPlanningSnapshotCount(int newMaxPlanningSnapshotCount) { - this.maxPlanningSnapshotCount = newMaxPlanningSnapshotCount; - return this; - } - - public Builder maxAllowedPlanningFailures(int newMaxAllowedPlanningFailures) { - this.maxAllowedPlanningFailures = newMaxAllowedPlanningFailures; - return this; - } - - public Builder watermarkColumn(String newWatermarkColumn) { - this.watermarkColumn = newWatermarkColumn; - return this; - } - - public Builder watermarkColumnTimeUnit(TimeUnit newWatermarkTimeUnit) { - this.watermarkColumnTimeUnit = newWatermarkTimeUnit; - return this; - } - - public Builder resolveConfig( - Table table, Map readOptions, ReadableConfig readableConfig) { - FlinkReadConf flinkReadConf = new FlinkReadConf(table, readOptions, readableConfig); - - return this.useSnapshotId(flinkReadConf.snapshotId()) - .useTag(flinkReadConf.tag()) - .useBranch(flinkReadConf.branch()) - .startTag(flinkReadConf.startTag()) - .endTag(flinkReadConf.endTag()) - .caseSensitive(flinkReadConf.caseSensitive()) - .asOfTimestamp(flinkReadConf.asOfTimestamp()) - .startingStrategy(flinkReadConf.startingStrategy()) - .startSnapshotTimestamp(flinkReadConf.startSnapshotTimestamp()) - .startSnapshotId(flinkReadConf.startSnapshotId()) - .endSnapshotId(flinkReadConf.endSnapshotId()) - 
.splitSize(flinkReadConf.splitSize()) - .splitLookback(flinkReadConf.splitLookback()) - .splitOpenFileCost(flinkReadConf.splitFileOpenCost()) - .streaming(flinkReadConf.streaming()) - .monitorInterval(flinkReadConf.monitorInterval()) - .nameMapping(flinkReadConf.nameMapping()) - .limit(flinkReadConf.limit()) - .planParallelism(flinkReadConf.workerPoolSize()) - .includeColumnStats(flinkReadConf.includeColumnStats()) - .maxPlanningSnapshotCount(flinkReadConf.maxPlanningSnapshotCount()) - .maxAllowedPlanningFailures(maxAllowedPlanningFailures) - .watermarkColumn(flinkReadConf.watermarkColumn()) - .watermarkColumnTimeUnit(flinkReadConf.watermarkColumnTimeUnit()); - } - - public ScanContext build() { - return new ScanContext( - caseSensitive, - snapshotId, - startingStrategy, - startSnapshotTimestamp, - startSnapshotId, - endSnapshotId, - asOfTimestamp, - splitSize, - splitLookback, - splitOpenFileCost, - isStreaming, - monitorInterval, - nameMapping, - projectedSchema, - filters, - limit, - includeColumnStats, - includeStatsForColumns, - exposeLocality, - planParallelism, - maxPlanningSnapshotCount, - maxAllowedPlanningFailures, - watermarkColumn, - watermarkColumnTimeUnit, - branch, - tag, - startTag, - endTag); - } - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetReaders.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetReaders.java index ab0b36a2c2..6b984b1b5d 100644 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetReaders.java +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge/src/main/java/org/apache/iceberg/flink/data/AdaptHiveFlinkParquetReaders.java @@ -135,7 +135,7 @@ public ParquetValueReader struct( } } - return new 
RowDataReader(reorderedFields); + return new RowDataReader(types, reorderedFields); } @Override @@ -622,8 +622,8 @@ private static class RowDataReader extends ParquetValueReaders.StructReader { private final int numFields; - RowDataReader(List> readers) { - super(readers); + RowDataReader(List types, List> readers) { + super(types, readers); this.numFields = readers.size(); } diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java index 8b99a33621..599d4cbb2a 100644 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common-iceberg-bridge/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java @@ -36,8 +36,8 @@ import org.apache.iceberg.flink.FlinkSourceFilter; import org.apache.iceberg.flink.RowDataWrapper; import org.apache.iceberg.flink.data.AdaptHiveFlinkParquetReaders; +import org.apache.iceberg.flink.data.FlinkAvroReader; import org.apache.iceberg.flink.data.FlinkOrcReader; -import org.apache.iceberg.flink.data.FlinkPlannedAvroReader; import org.apache.iceberg.flink.data.RowDataProjection; import org.apache.iceberg.flink.data.RowDataUtil; import org.apache.iceberg.io.CloseableIterable; @@ -156,7 +156,7 @@ private CloseableIterable newAvroIterable( .reuseContainers() .project(schema) .split(task.start(), task.length()) - .createReaderFunc(ignore -> FlinkPlannedAvroReader.create(schema, idToConstant)); + .createReaderFunc(readSchema -> new FlinkAvroReader(schema, readSchema, idToConstant)); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); diff --git 
a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/main/java/org/apache/amoro/flink/read/AdaptHiveFlinkParquetReaders.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/main/java/org/apache/amoro/flink/read/AdaptHiveFlinkParquetReaders.java index a1cf6f5741..155bda30ad 100644 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/main/java/org/apache/amoro/flink/read/AdaptHiveFlinkParquetReaders.java +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/main/java/org/apache/amoro/flink/read/AdaptHiveFlinkParquetReaders.java @@ -135,7 +135,7 @@ public ParquetValueReader struct( } } - return new RowDataReader(reorderedFields); + return new RowDataReader(types, reorderedFields); } @Override @@ -622,8 +622,8 @@ private static class RowDataReader extends ParquetValueReaders.StructReader { private final int numFields; - RowDataReader(List> readers) { - super(readers); + RowDataReader(List types, List> readers) { + super(types, readers); this.numFields = readers.size(); } diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataRecordFactory.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataRecordFactory.java index 0e922e50dc..024c65f80d 100644 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataRecordFactory.java +++ b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/main/java/org/apache/amoro/flink/read/hybrid/reader/RowDataRecordFactory.java @@ -29,12 +29,10 @@ class RowDataRecordFactory implements RecordFactory { private final RowType rowType; private final TypeSerializer[] fieldSerializers; - private final RowData.FieldGetter[] fieldGetters; RowDataRecordFactory(RowType rowType) { this.rowType = rowType; this.fieldSerializers = 
createFieldSerializers(rowType); - this.fieldGetters = createFieldGetters(rowType); } static TypeSerializer[] createFieldSerializers(RowType rowType) { @@ -43,14 +41,6 @@ static TypeSerializer[] createFieldSerializers(RowType rowType) { .toArray(TypeSerializer[]::new); } - static RowData.FieldGetter[] createFieldGetters(RowType rowType) { - RowData.FieldGetter[] getters = new RowData.FieldGetter[rowType.getFieldCount()]; - for (int i = 0; i < rowType.getFieldCount(); i++) { - getters[i] = RowData.createFieldGetter(rowType.getTypeAt(i), i); - } - return getters; - } - @Override public RowData[] createBatch(int batchSize) { RowData[] arr = new RowData[batchSize]; @@ -67,7 +57,6 @@ public void clone(RowData from, RowData[] batch, int position) { // Clone method will allocate a new GenericRowData object // if the target object is NOT a GenericRowData. // So we should always set the clone return value back to the array. - batch[position] = - RowDataUtil.clone(from, batch[position], rowType, fieldSerializers, fieldGetters); + batch[position] = RowDataUtil.clone(from, batch[position], rowType, fieldSerializers); } } diff --git a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java b/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java deleted file mode 100644 index 5a1e3d85b9..0000000000 --- a/amoro-format-mixed/amoro-mixed-flink/amoro-mixed-flink-common/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.iceberg.flink; - -import org.apache.flink.configuration.Configuration; -import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; -import org.apache.flink.test.util.MiniClusterWithClientResource; - -/** - * Compatibility shim for tests that previously used Iceberg's removed MiniClusterResource helper. - */ -public class MiniClusterResource { - private static final int DEFAULT_TM_NUM = 1; - private static final int DEFAULT_PARALLELISM = 4; - - public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = - new Configuration().set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - - private MiniClusterResource() {} - - public static MiniClusterWithClientResource createWithClassloaderCheckDisabled() { - return new MiniClusterWithClientResource( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(DEFAULT_TM_NUM) - .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) - .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) - .build()); - } -} diff --git a/amoro-format-mixed/amoro-mixed-flink/pom.xml b/amoro-format-mixed/amoro-mixed-flink/pom.xml index e4d98a6534..4216ef937a 100644 --- a/amoro-format-mixed/amoro-mixed-flink/pom.xml +++ b/amoro-format-mixed/amoro-mixed-flink/pom.xml @@ -34,10 +34,8 @@ amoro-mixed-flink-common - amoro-mixed-flink-common-1.17 
amoro-mixed-flink-common-format amoro-mixed-flink-common-iceberg-bridge - amoro-mixed-flink-common-iceberg-bridge-1.17 v1.16/amoro-mixed-flink-1.16 v1.16/amoro-mixed-flink-runtime-1.16 v1.17/amoro-mixed-flink-1.17 @@ -49,7 +47,6 @@ 1.18.1 - 1.9.2 3.2.0-1.18 3.4.0 2.9.0 diff --git a/amoro-format-mixed/amoro-mixed-flink/v1.17/amoro-mixed-flink-1.17/pom.xml b/amoro-format-mixed/amoro-mixed-flink/v1.17/amoro-mixed-flink-1.17/pom.xml index 7d7e935233..2e349a5a5d 100644 --- a/amoro-format-mixed/amoro-mixed-flink/v1.17/amoro-mixed-flink-1.17/pom.xml +++ b/amoro-format-mixed/amoro-mixed-flink/v1.17/amoro-mixed-flink-1.17/pom.xml @@ -33,7 +33,6 @@ https://amoro.apache.org - 1.6.1 3.2.3 3.21.0 1.17.2 @@ -42,7 +41,7 @@ org.apache.amoro - amoro-mixed-flink-common-1.17 + amoro-mixed-flink-common ${project.parent.version} @@ -88,7 +87,7 @@ - org.apache.amoro:amoro-mixed-flink-common-1.17 + org.apache.amoro:amoro-mixed-flink-common diff --git a/amoro-format-mixed/amoro-mixed-flink/v1.17/amoro-mixed-flink-runtime-1.17/pom.xml b/amoro-format-mixed/amoro-mixed-flink/v1.17/amoro-mixed-flink-runtime-1.17/pom.xml index 4169e0660d..4d5186dc36 100644 --- a/amoro-format-mixed/amoro-mixed-flink/v1.17/amoro-mixed-flink-runtime-1.17/pom.xml +++ b/amoro-format-mixed/amoro-mixed-flink/v1.17/amoro-mixed-flink-runtime-1.17/pom.xml @@ -31,7 +31,6 @@ https://amoro.apache.org - 1.6.1 1.17.2 diff --git a/amoro-format-mixed/amoro-mixed-flink/v1.18/amoro-mixed-flink-1.18/pom.xml b/amoro-format-mixed/amoro-mixed-flink/v1.18/amoro-mixed-flink-1.18/pom.xml index 95f82a7e1d..69d90fc205 100644 --- a/amoro-format-mixed/amoro-mixed-flink/v1.18/amoro-mixed-flink-1.18/pom.xml +++ b/amoro-format-mixed/amoro-mixed-flink/v1.18/amoro-mixed-flink-1.18/pom.xml @@ -33,7 +33,6 @@ https://amoro.apache.org - 1.9.2 3.2.3 3.21.0 1.18.1 diff --git a/amoro-format-mixed/amoro-mixed-flink/v1.18/amoro-mixed-flink-runtime-1.18/pom.xml b/amoro-format-mixed/amoro-mixed-flink/v1.18/amoro-mixed-flink-runtime-1.18/pom.xml
index 64c3b2e116..d2e7155222 100644 --- a/amoro-format-mixed/amoro-mixed-flink/v1.18/amoro-mixed-flink-runtime-1.18/pom.xml +++ b/amoro-format-mixed/amoro-mixed-flink/v1.18/amoro-mixed-flink-runtime-1.18/pom.xml @@ -31,7 +31,6 @@ https://amoro.apache.org - 1.9.2 1.18.1 3.2.0-1.18 diff --git a/amoro-format-mixed/amoro-mixed-hive/pom.xml b/amoro-format-mixed/amoro-mixed-hive/pom.xml index 40a9030cd0..64a0396782 100644 --- a/amoro-format-mixed/amoro-mixed-hive/pom.xml +++ b/amoro-format-mixed/amoro-mixed-hive/pom.xml @@ -30,6 +30,12 @@ Amoro Project Mixed Hive Format https://amoro.apache.org + + 1.10.1 + 1.16.0 + 1.16.0 + + org.apache.amoro diff --git a/amoro-format-mixed/amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml b/amoro-format-mixed/amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml index 0032a96ad0..ad5668c54d 100644 --- a/amoro-format-mixed/amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml +++ b/amoro-format-mixed/amoro-mixed-spark/amoro-mixed-spark-3-common/pom.xml @@ -45,13 +45,6 @@ ${project.version} - - - org.apache.avro - avro - 1.12.0 - - org.apache.spark spark-sql_${scala.binary.version} diff --git a/amoro-format-mixed/amoro-mixed-spark/v3.3/amoro-mixed-spark-3.3/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java b/amoro-format-mixed/amoro-mixed-spark/v3.3/amoro-mixed-spark-3.3/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java index ef33a3d301..46f3e6f4fa 100644 --- a/amoro-format-mixed/amoro-mixed-spark/v3.3/amoro-mixed-spark-3.3/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java +++ b/amoro-format-mixed/amoro-mixed-spark/v3.3/amoro-mixed-spark-3.3/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java @@ -104,7 +104,7 @@ public ParquetValueReader struct( types.add(fieldType); } - return new InternalRowReader(newFields); + return new InternalRowReader(types, newFields); } } @@ -169,7 +169,7 @@ public ParquetValueReader struct( } } - return new 
InternalRowReader(reorderedFields); + return new InternalRowReader(types, reorderedFields); } @Override @@ -526,8 +526,8 @@ private static class InternalRowReader extends ParquetValueReaders.StructReader { private final int numFields; - InternalRowReader(List> readers) { - super(readers); + InternalRowReader(List types, List> readers) { + super(types, readers); this.numFields = readers.size(); } diff --git a/amoro-format-mixed/amoro-mixed-spark/v3.4/amoro-mixed-spark-3.4/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java b/amoro-format-mixed/amoro-mixed-spark/v3.4/amoro-mixed-spark-3.4/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java index ef33a3d301..46f3e6f4fa 100644 --- a/amoro-format-mixed/amoro-mixed-spark/v3.4/amoro-mixed-spark-3.4/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java +++ b/amoro-format-mixed/amoro-mixed-spark/v3.4/amoro-mixed-spark-3.4/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java @@ -104,7 +104,7 @@ public ParquetValueReader struct( types.add(fieldType); } - return new InternalRowReader(newFields); + return new InternalRowReader(types, newFields); } } @@ -169,7 +169,7 @@ public ParquetValueReader struct( } } - return new InternalRowReader(reorderedFields); + return new InternalRowReader(types, reorderedFields); } @Override @@ -526,8 +526,8 @@ private static class InternalRowReader extends ParquetValueReaders.StructReader { private final int numFields; - InternalRowReader(List> readers) { - super(readers); + InternalRowReader(List types, List> readers) { + super(types, readers); this.numFields = readers.size(); } diff --git a/amoro-format-mixed/amoro-mixed-spark/v3.4/amoro-mixed-spark-3.4/src/main/scala/org/apache/amoro/spark/MixedFormatSparkExtensions.scala b/amoro-format-mixed/amoro-mixed-spark/v3.4/amoro-mixed-spark-3.4/src/main/scala/org/apache/amoro/spark/MixedFormatSparkExtensions.scala index 2db295c827..acf956f241 100644 --- 
a/amoro-format-mixed/amoro-mixed-spark/v3.4/amoro-mixed-spark-3.4/src/main/scala/org/apache/amoro/spark/MixedFormatSparkExtensions.scala +++ b/amoro-format-mixed/amoro-mixed-spark/v3.4/amoro-mixed-spark-3.4/src/main/scala/org/apache/amoro/spark/MixedFormatSparkExtensions.scala @@ -43,17 +43,11 @@ class MixedFormatSparkExtensions extends (SparkSessionExtensions => Unit) { extensions.injectPostHocResolutionRule(spark => RewriteMixedFormatCommand(spark)) - // mixed-format row-level operation rewrite rules - // These must be resolution rules (not optimizer rules) so they run BEFORE Iceberg 1.10.x's - // RewriteUpdateTableForRowLineage and RewriteMergeIntoTableForRowLineage rules. Those Iceberg - // rules do pattern matching on the table and throw scala.MatchError for non-SparkTable types - // (i.e., MixedSparkTable). - extensions.injectResolutionRule { spark => RewriteUpdateMixedFormatTable(spark) } - extensions.injectResolutionRule { spark => RewriteDeleteFromMixedFormatTable(spark) } - // mixed-format optimizer rules extensions.injectPostHocResolutionRule { spark => QueryWithConstraintCheck(spark) } extensions.injectOptimizerRule { spark => RewriteAppendMixedFormatTable(spark) } + extensions.injectOptimizerRule { spark => RewriteDeleteFromMixedFormatTable(spark) } + extensions.injectOptimizerRule { spark => RewriteUpdateMixedFormatTable(spark) } // planner extensions extensions.injectPlannerStrategy { spark => MixedFormatExtendedDataSourceV2Strategy(spark) } diff --git a/amoro-format-mixed/amoro-mixed-spark/v3.5/amoro-mixed-spark-3.5/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java b/amoro-format-mixed/amoro-mixed-spark/v3.5/amoro-mixed-spark-3.5/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java index ef33a3d301..46f3e6f4fa 100644 --- a/amoro-format-mixed/amoro-mixed-spark/v3.5/amoro-mixed-spark-3.5/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java +++ 
b/amoro-format-mixed/amoro-mixed-spark/v3.5/amoro-mixed-spark-3.5/src/main/java/org/apache/amoro/spark/reader/SparkParquetReaders.java @@ -104,7 +104,7 @@ public ParquetValueReader struct( types.add(fieldType); } - return new InternalRowReader(newFields); + return new InternalRowReader(types, newFields); } } @@ -169,7 +169,7 @@ public ParquetValueReader struct( } } - return new InternalRowReader(reorderedFields); + return new InternalRowReader(types, reorderedFields); } @Override @@ -526,8 +526,8 @@ private static class InternalRowReader extends ParquetValueReaders.StructReader { private final int numFields; - InternalRowReader(List> readers) { - super(readers); + InternalRowReader(List types, List> readers) { + super(types, readers); this.numFields = readers.size(); } diff --git a/amoro-format-mixed/amoro-mixed-spark/v3.5/amoro-mixed-spark-3.5/src/main/scala/org/apache/amoro/spark/MixedFormatSparkExtensions.scala b/amoro-format-mixed/amoro-mixed-spark/v3.5/amoro-mixed-spark-3.5/src/main/scala/org/apache/amoro/spark/MixedFormatSparkExtensions.scala index 2db295c827..acf956f241 100644 --- a/amoro-format-mixed/amoro-mixed-spark/v3.5/amoro-mixed-spark-3.5/src/main/scala/org/apache/amoro/spark/MixedFormatSparkExtensions.scala +++ b/amoro-format-mixed/amoro-mixed-spark/v3.5/amoro-mixed-spark-3.5/src/main/scala/org/apache/amoro/spark/MixedFormatSparkExtensions.scala @@ -43,17 +43,11 @@ class MixedFormatSparkExtensions extends (SparkSessionExtensions => Unit) { extensions.injectPostHocResolutionRule(spark => RewriteMixedFormatCommand(spark)) - // mixed-format row-level operation rewrite rules - // These must be resolution rules (not optimizer rules) so they run BEFORE Iceberg 1.10.x's - // RewriteUpdateTableForRowLineage and RewriteMergeIntoTableForRowLineage rules. Those Iceberg - // rules do pattern matching on the table and throw scala.MatchError for non-SparkTable types - // (i.e., MixedSparkTable). 
- extensions.injectResolutionRule { spark => RewriteUpdateMixedFormatTable(spark) } - extensions.injectResolutionRule { spark => RewriteDeleteFromMixedFormatTable(spark) } - // mixed-format optimizer rules extensions.injectPostHocResolutionRule { spark => QueryWithConstraintCheck(spark) } extensions.injectOptimizerRule { spark => RewriteAppendMixedFormatTable(spark) } + extensions.injectOptimizerRule { spark => RewriteDeleteFromMixedFormatTable(spark) } + extensions.injectOptimizerRule { spark => RewriteUpdateMixedFormatTable(spark) } // planner extensions extensions.injectPlannerStrategy { spark => MixedFormatExtendedDataSourceV2Strategy(spark) } diff --git a/dev/deps/dependencies-hadoop-2-spark-3.3 b/dev/deps/dependencies-hadoop-2-spark-3.3 index 8e6f46a0cd..cd86dfc6f8 100644 --- a/dev/deps/dependencies-hadoop-2-spark-3.3 +++ b/dev/deps/dependencies-hadoop-2-spark-3.3 @@ -30,7 +30,7 @@ async-profiler/2.9//async-profiler-2.9.jar auth/2.24.12//auth-2.24.12.jar avro-ipc/1.11.0//avro-ipc-1.11.0.jar avro-mapred/1.11.0//avro-mapred-1.11.0.jar -avro/1.12.0//avro-1.12.0.jar +avro/1.11.3//avro-1.11.3.jar aws-core/2.24.12//aws-core-2.24.12.jar aws-json-protocol/2.24.12//aws-json-protocol-2.24.12.jar aws-query-protocol/2.24.12//aws-query-protocol-2.24.12.jar @@ -80,7 +80,6 @@ ehcache/3.3.1//ehcache-3.3.1.jar endpoints-spi/2.24.12//endpoints-spi-2.24.12.jar error_prone_annotations/2.18.0//error_prone_annotations-2.18.0.jar eventstream/1.0.1//eventstream-1.0.1.jar -failsafe/3.3.2//failsafe-3.3.2.jar failureaccess/1.0.1//failureaccess-1.0.1.jar flatbuffers-java/23.5.26//flatbuffers-java-23.5.26.jar flink-annotations/1.20.3//flink-annotations-1.20.3.jar @@ -144,23 +143,23 @@ http-auth-spi/2.24.12//http-auth-spi-2.24.12.jar http-auth/2.24.12//http-auth-2.24.12.jar http-client-spi/2.24.12//http-client-spi-2.24.12.jar httpclient/4.5.13//httpclient-4.5.13.jar -httpclient5/5.4.1//httpclient5-5.4.1.jar +httpclient5/5.3.1//httpclient5-5.3.1.jar httpcore/4.4.13//httpcore-4.4.13.jar 
-httpcore5-h2/5.3.1//httpcore5-h2-5.3.1.jar -httpcore5/5.3.1//httpcore5-5.3.1.jar -iceberg-aliyun/1.8.1//iceberg-aliyun-1.8.1.jar -iceberg-api/1.8.1//iceberg-api-1.8.1.jar -iceberg-arrow/1.8.1//iceberg-arrow-1.8.1.jar -iceberg-aws/1.8.1//iceberg-aws-1.8.1.jar -iceberg-bundled-guava/1.8.1//iceberg-bundled-guava-1.8.1.jar -iceberg-common/1.8.1//iceberg-common-1.8.1.jar -iceberg-core/1.8.1//iceberg-core-1.8.1.jar -iceberg-data/1.8.1//iceberg-data-1.8.1.jar -iceberg-hive-metastore/1.8.1//iceberg-hive-metastore-1.8.1.jar -iceberg-orc/1.8.1//iceberg-orc-1.8.1.jar -iceberg-parquet/1.8.1//iceberg-parquet-1.8.1.jar -iceberg-spark-3.3_2.12/1.8.1//iceberg-spark-3.3_2.12-1.8.1.jar -iceberg-spark-extensions-3.3_2.12/1.8.1//iceberg-spark-extensions-3.3_2.12-1.8.1.jar +httpcore5-h2/5.2.4//httpcore5-h2-5.2.4.jar +httpcore5/5.2.4//httpcore5-5.2.4.jar +iceberg-aliyun/1.6.1//iceberg-aliyun-1.6.1.jar +iceberg-api/1.6.1//iceberg-api-1.6.1.jar +iceberg-arrow/1.6.1//iceberg-arrow-1.6.1.jar +iceberg-aws/1.6.1//iceberg-aws-1.6.1.jar +iceberg-bundled-guava/1.6.1//iceberg-bundled-guava-1.6.1.jar +iceberg-common/1.6.1//iceberg-common-1.6.1.jar +iceberg-core/1.6.1//iceberg-core-1.6.1.jar +iceberg-data/1.6.1//iceberg-data-1.6.1.jar +iceberg-hive-metastore/1.6.1//iceberg-hive-metastore-1.6.1.jar +iceberg-orc/1.6.1//iceberg-orc-1.6.1.jar +iceberg-parquet/1.6.1//iceberg-parquet-1.6.1.jar +iceberg-spark-3.3_2.12/1.6.1//iceberg-spark-3.3_2.12-1.6.1.jar +iceberg-spark-extensions-3.3_2.12/1.6.1//iceberg-spark-extensions-3.3_2.12-1.6.1.jar icu4j/69.1//icu4j-69.1.jar identity-spi/2.24.12//identity-spi-2.24.12.jar ivy/2.5.1//ivy-2.5.1.jar @@ -359,7 +358,7 @@ parquet-common/1.15.2//parquet-common-1.15.2.jar parquet-encoding/1.15.2//parquet-encoding-1.15.2.jar parquet-format-structures/1.15.2//parquet-format-structures-1.15.2.jar parquet-hadoop/1.15.2//parquet-hadoop-1.15.2.jar -parquet-jackson/1.15.2//parquet-jackson-1.15.2.jar +parquet-jackson/1.13.1//parquet-jackson-1.13.1.jar pickle/1.2//pickle-1.2.jar 
postgresql/42.7.2//postgresql-42.7.2.jar profiles/2.24.12//profiles-2.24.12.jar diff --git a/dev/deps/dependencies-hadoop-3-spark-3.5 b/dev/deps/dependencies-hadoop-3-spark-3.5 index 24a4eb5064..2facef79f9 100644 --- a/dev/deps/dependencies-hadoop-3-spark-3.5 +++ b/dev/deps/dependencies-hadoop-3-spark-3.5 @@ -25,7 +25,7 @@ async-profiler/2.9//async-profiler-2.9.jar auth/2.24.12//auth-2.24.12.jar avro-ipc/1.11.4//avro-ipc-1.11.4.jar avro-mapred/1.11.4//avro-mapred-1.11.4.jar -avro/1.12.0//avro-1.12.0.jar +avro/1.11.3//avro-1.11.3.jar aws-core/2.24.12//aws-core-2.24.12.jar aws-json-protocol/2.24.12//aws-json-protocol-2.24.12.jar aws-query-protocol/2.24.12//aws-query-protocol-2.24.12.jar @@ -73,7 +73,6 @@ eclipse-collections/11.1.0//eclipse-collections-11.1.0.jar endpoints-spi/2.24.12//endpoints-spi-2.24.12.jar error_prone_annotations/2.18.0//error_prone_annotations-2.18.0.jar eventstream/1.0.1//eventstream-1.0.1.jar -failsafe/3.3.2//failsafe-3.3.2.jar failureaccess/1.0.1//failureaccess-1.0.1.jar flatbuffers-java/23.5.26//flatbuffers-java-23.5.26.jar flatbuffers/1.2.0-3f79e055//flatbuffers-1.2.0-3f79e055.jar @@ -127,23 +126,23 @@ http-auth-spi/2.24.12//http-auth-spi-2.24.12.jar http-auth/2.24.12//http-auth-2.24.12.jar http-client-spi/2.24.12//http-client-spi-2.24.12.jar httpclient/4.5.14//httpclient-4.5.14.jar -httpclient5/5.5//httpclient5-5.5.jar +httpclient5/5.3.1//httpclient5-5.3.1.jar httpcore/4.4.16//httpcore-4.4.16.jar -httpcore5-h2/5.3.4//httpcore5-h2-5.3.4.jar -httpcore5/5.3.4//httpcore5-5.3.4.jar -iceberg-aliyun/1.10.1//iceberg-aliyun-1.10.1.jar -iceberg-api/1.10.1//iceberg-api-1.10.1.jar -iceberg-arrow/1.10.1//iceberg-arrow-1.10.1.jar -iceberg-aws/1.10.1//iceberg-aws-1.10.1.jar -iceberg-bundled-guava/1.10.1//iceberg-bundled-guava-1.10.1.jar -iceberg-common/1.10.1//iceberg-common-1.10.1.jar -iceberg-core/1.10.1//iceberg-core-1.10.1.jar -iceberg-data/1.10.1//iceberg-data-1.10.1.jar -iceberg-hive-metastore/1.10.1//iceberg-hive-metastore-1.10.1.jar 
-iceberg-orc/1.10.1//iceberg-orc-1.10.1.jar -iceberg-parquet/1.10.1//iceberg-parquet-1.10.1.jar -iceberg-spark-3.5_2.12/1.10.1//iceberg-spark-3.5_2.12-1.10.1.jar -iceberg-spark-extensions-3.5_2.12/1.10.1//iceberg-spark-extensions-3.5_2.12-1.10.1.jar +httpcore5-h2/5.2.4//httpcore5-h2-5.2.4.jar +httpcore5/5.2.4//httpcore5-5.2.4.jar +iceberg-aliyun/1.6.1//iceberg-aliyun-1.6.1.jar +iceberg-api/1.6.1//iceberg-api-1.6.1.jar +iceberg-arrow/1.6.1//iceberg-arrow-1.6.1.jar +iceberg-aws/1.6.1//iceberg-aws-1.6.1.jar +iceberg-bundled-guava/1.6.1//iceberg-bundled-guava-1.6.1.jar +iceberg-common/1.6.1//iceberg-common-1.6.1.jar +iceberg-core/1.6.1//iceberg-core-1.6.1.jar +iceberg-data/1.6.1//iceberg-data-1.6.1.jar +iceberg-hive-metastore/1.6.1//iceberg-hive-metastore-1.6.1.jar +iceberg-orc/1.6.1//iceberg-orc-1.6.1.jar +iceberg-parquet/1.6.1//iceberg-parquet-1.6.1.jar +iceberg-spark-3.5_2.12/1.6.1//iceberg-spark-3.5_2.12-1.6.1.jar +iceberg-spark-extensions-3.5_2.12/1.6.1//iceberg-spark-extensions-3.5_2.12-1.6.1.jar icu4j/69.1//icu4j-69.1.jar identity-spi/2.24.12//identity-spi-2.24.12.jar ivy/2.5.1//ivy-2.5.1.jar @@ -198,7 +197,6 @@ json4s-scalap_2.12/3.7.0-M11//json4s-scalap_2.12-3.7.0-M11.jar jsqlparser/4.7//jsqlparser-4.7.jar jsr305/3.0.0//jsr305-3.0.0.jar jta/1.1//jta-1.1.jar -jts-core/1.20.0//jts-core-1.20.0.jar junit-jupiter-api/5.9.1//junit-jupiter-api-5.9.1.jar junit-jupiter-engine/5.9.1//junit-jupiter-engine-5.9.1.jar junit-jupiter-params/5.9.1//junit-jupiter-params-5.9.1.jar @@ -322,14 +320,13 @@ oro/2.0.8//oro-2.0.8.jar osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar pagehelper/6.1.0//pagehelper-6.1.0.jar paranamer/2.8//paranamer-2.8.jar -parquet-avro/1.16.0//parquet-avro-1.16.0.jar -parquet-column/1.16.0//parquet-column-1.16.0.jar -parquet-common/1.16.0//parquet-common-1.16.0.jar -parquet-encoding/1.16.0//parquet-encoding-1.16.0.jar -parquet-format-structures/1.16.0//parquet-format-structures-1.16.0.jar -parquet-hadoop/1.16.0//parquet-hadoop-1.16.0.jar 
-parquet-jackson/1.16.0//parquet-jackson-1.16.0.jar -parquet-variant/1.16.0//parquet-variant-1.16.0.jar +parquet-avro/1.15.2//parquet-avro-1.15.2.jar +parquet-column/1.15.2//parquet-column-1.15.2.jar +parquet-common/1.15.2//parquet-common-1.15.2.jar +parquet-encoding/1.15.2//parquet-encoding-1.15.2.jar +parquet-format-structures/1.15.2//parquet-format-structures-1.15.2.jar +parquet-hadoop/1.15.2//parquet-hadoop-1.15.2.jar +parquet-jackson/1.13.1//parquet-jackson-1.13.1.jar pickle/1.3//pickle-1.3.jar postgresql/42.7.2//postgresql-42.7.2.jar profiles/2.24.12//profiles-2.24.12.jar diff --git a/pom.xml b/pom.xml index c16d2b11ca..5c5279f421 100644 --- a/pom.xml +++ b/pom.xml @@ -102,7 +102,7 @@ 3.3.2 3.3.1 - 1.10.1 + 1.6.1 1.2.0 3.1.3 3.4.2 @@ -126,8 +126,8 @@ 5.7.0 4.11.0 1.21.4 - 1.16.0 - 1.16.0 + 1.13.1 + 1.15.2 8.0.33 1.9.7 2.24.12 @@ -396,7 +396,7 @@ parquet-avro ${parquet-avro.version} - + org.apache.parquet parquet-jackson @@ -1839,9 +1839,6 @@ 2.3.8 2.10.2 - 1.8.1 - 1.15.2 - 1.15.2 3.3.4 3.3 hadoop-client @@ -1876,7 +1873,6 @@ 3.3.4 3.3 - 1.8.1