From 16c72bee1b91c08cb5bb64f05855da485c985fc6 Mon Sep 17 00:00:00 2001 From: Maria Dubyaga Date: Thu, 12 Feb 2026 20:45:44 -0500 Subject: [PATCH 1/8] add iceberg --- Cargo.lock | 1860 +++++++++++++++++++++++++++++++++++++------ Cargo.toml | 18 +- config.example.yaml | 12 + src/catalog.rs | 63 ++ src/config.rs | 47 ++ src/iceberg.rs | 129 +++ src/main.rs | 2 + src/sync.rs | 141 ++-- 8 files changed, 1984 insertions(+), 288 deletions(-) create mode 100644 src/catalog.rs create mode 100644 src/iceberg.rs diff --git a/Cargo.lock b/Cargo.lock index 7a89ce2..eb4f312 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,6 +8,17 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "ahash" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9" +dependencies = [ + "getrandom 0.2.17", + "once_cell", + "version_check", +] + [[package]] name = "ahash" version = "0.8.12" @@ -52,12 +63,6 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" -[[package]] -name = "android-tzdata" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" - [[package]] name = "android_system_properties" version = "0.1.5" @@ -123,11 +128,48 @@ version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f0e0fee31ef5ed1ba1316088939cea399010ed7731dba877ed44aeb407a75ea" +[[package]] +name = "apache-avro" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36fa98bc79671c7981272d91a8753a928ff6a1cd8e4f20a44c45bd5d313840bf" +dependencies = [ + "bigdecimal", + "bon", + "digest", + 
"log", + "miniz_oxide", + "num-bigint", + "quad-rand", + "rand 0.9.2", + "regex-lite", + "serde", + "serde_bytes", + "serde_json", + "strum", + "strum_macros", + "thiserror", + "uuid", + "zstd", +] + +[[package]] +name = "array-init" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d62b7694a562cdf5a74227903507c56ab2cc8bdd1f781ed5cb4cf9c9f810bfc" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + [[package]] name = "arrow" -version = "53.4.1" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3a3ec4fe573f9d1f59d99c085197ef669b00b088ba1d7bb75224732d9357a74" +checksum = "e4754a624e5ae42081f464514be454b39711daae0458906dacde5f4c632f33a8" dependencies = [ "arrow-arith", "arrow-array", @@ -146,56 +188,59 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "53.4.1" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dcf19f07792d8c7f91086c67b574a79301e367029b17fcf63fb854332246a10" +checksum = "f7b3141e0ec5145a22d8694ea8b6d6f69305971c4fa1c1a13ef0195aef2d678b" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "half", - "num", + "num-traits", ] [[package]] name = "arrow-array" -version = "53.4.1" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7845c32b41f7053e37a075b3c2f29c6f5ea1b3ca6e5df7a2d325ee6e1b4a63cf" +checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef" dependencies = [ - "ahash", + "ahash 0.8.12", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", "chrono-tz", "half", - "hashbrown 0.15.5", - "num", + "hashbrown 0.16.1", + "num-complex", + "num-integer", + "num-traits", ] [[package]] name = "arrow-buffer" -version = "53.4.1" +version = "57.3.0" source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b5c681a99606f3316f2a99d9c8b6fa3aad0b1d34d8f6d7a1b471893940219d8" +checksum = "c697ddca96183182f35b3a18e50b9110b11e916d7b7799cbfd4d34662f2c56c2" dependencies = [ "bytes", "half", - "num", + "num-bigint", + "num-traits", ] [[package]] name = "arrow-cast" -version = "53.4.1" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6365f8527d4f87b133eeb862f9b8093c009d41a210b8f101f91aa2392f61daac" +checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", + "arrow-ord", "arrow-schema", "arrow-select", "atoi", @@ -203,60 +248,57 @@ dependencies = [ "chrono", "half", "lexical-core", - "num", + "num-traits", "ryu", ] [[package]] name = "arrow-csv" -version = "53.4.1" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30dac4d23ac769300349197b845e0fd18c7f9f15d260d4659ae6b5a9ca06f586" +checksum = "8da746f4180004e3ce7b83c977daf6394d768332349d3d913998b10a120b790a" dependencies = [ "arrow-array", - "arrow-buffer", "arrow-cast", - "arrow-data", "arrow-schema", "chrono", "csv", "csv-core", - "lazy_static", - "lexical-core", "regex", ] [[package]] name = "arrow-data" -version = "53.4.1" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd962fc3bf7f60705b25bcaa8eb3318b2545aa1d528656525ebdd6a17a6cd6fb" +checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304" dependencies = [ "arrow-buffer", "arrow-schema", "half", - "num", + "num-integer", + "num-traits", ] [[package]] name = "arrow-ipc" -version = "53.4.1" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3527365b24372f9c948f16e53738eb098720eea2093ae73c7af04ac5e30a39b" +checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1" dependencies = [ "arrow-array", 
"arrow-buffer", - "arrow-cast", "arrow-data", "arrow-schema", + "arrow-select", "flatbuffers", ] [[package]] name = "arrow-json" -version = "53.4.1" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acdec0024749fc0d95e025c0b0266d78613727b3b3a5d4cf8ea47eb6d38afdd1" +checksum = "0ff8357658bedc49792b13e2e862b80df908171275f8e6e075c460da5ee4bf86" dependencies = [ "arrow-array", "arrow-buffer", @@ -265,35 +307,36 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap", + "indexmap 2.13.0", + "itoa", "lexical-core", - "num", - "serde", + "memchr", + "num-traits", + "ryu", + "serde_core", "serde_json", + "simdutf8", ] [[package]] name = "arrow-ord" -version = "53.4.1" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79af2db0e62a508d34ddf4f76bfd6109b6ecc845257c9cba6f939653668f89ac" +checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", - "half", - "num", ] [[package]] name = "arrow-row" -version = "53.4.1" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da30e9d10e9c52f09ea0cf15086d6d785c11ae8dcc3ea5f16d402221b6ac7735" +checksum = "18228633bad92bff92a95746bbeb16e5fc318e8382b75619dec26db79e4de4c0" dependencies = [ - "ahash", "arrow-array", "arrow-buffer", "arrow-data", @@ -303,29 +346,29 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.4.1" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35b0f9c0c3582dd55db0f136d3b44bfa0189df07adcf7dc7f2f2e74db0f52eb8" +checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68" [[package]] name = "arrow-select" -version = "53.4.1" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"92fc337f01635218493c23da81a364daf38c694b05fc20569c3193c11c561984" +checksum = "68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b" dependencies = [ - "ahash", + "ahash 0.8.12", "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", - "num", + "num-traits", ] [[package]] name = "arrow-string" -version = "53.4.1" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d596a9fc25dae556672d5069b090331aca8acb93cae426d8b7dcdf1c558fa0ce" +checksum = "85e968097061b3c0e9fe3079cf2e703e487890700546b5b0647f60fca1b5a8d8" dependencies = [ "arrow-array", "arrow-buffer", @@ -333,11 +376,28 @@ dependencies = [ "arrow-schema", "arrow-select", "memchr", - "num", + "num-traits", "regex", "regex-syntax", ] +[[package]] +name = "as-any" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0f477b951e452a0b6b4a10b53ccd569042d1d01729b519e02074a9c0958a063" + +[[package]] +name = "async-lock" +version = "3.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290f7f2596bd5b78a9fec8088ccd89180d7f9f55b94b0576823bbbdc72ee8311" +dependencies = [ + "event-listener", + "event-listener-strategy", + "pin-project-lite", +] + [[package]] name = "async-trait" version = "0.1.89" @@ -346,7 +406,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -461,6 +521,30 @@ dependencies = [ "uuid", ] +[[package]] +name = "aws-sdk-glue" +version = "1.137.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "034b1d0fa165c64bf825b8ba5e5567b115f7439be1f13e51c5c754de98b896d9" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + 
"http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + [[package]] name = "aws-sdk-s3" version = "1.122.0" @@ -811,6 +895,17 @@ dependencies = [ "tracing", ] +[[package]] +name = "backon" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cffb0e931875b666fc4fcb20fee52e9bbd1ef836fd9e9e04ec21555f9f85f7ef" +dependencies = [ + "fastrand", + "gloo-timers", + "tokio", +] + [[package]] name = "base16ct" version = "0.1.1" @@ -840,10 +935,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" [[package]] -name = "bitflags" -version = "1.3.2" +name = "bigdecimal" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", + "serde", +] + +[[package]] +name = "bimap" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +checksum = "230c5f1ca6a325a32553f8640d31ac9b49f2411e901e427570154868b46da4f7" [[package]] name = "bitflags" @@ -851,6 +960,18 @@ version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" +[[package]] +name = "bitvec" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +dependencies = [ + "funty", + "radium", + "tap", + "wyz", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -860,11 +981,59 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bon" +version = "3.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2d13a61f2963b88eef9c1be03df65d42f6996dfeac1054870d950fcf66686f83" +dependencies = [ + "bon-macros", + "rustversion", +] + +[[package]] +name = "bon-macros" +version = "3.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d314cc62af2b6b0c65780555abb4d02a03dd3b799cd42419044f0c38d99738c0" +dependencies = [ + "darling 0.23.0", + "ident_case", + "prettyplease", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.114", +] + +[[package]] +name = "borsh" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1da5ab77c1437701eeff7c88d968729e7766172279eab0676857b3d63af7a6f" +dependencies = [ + "borsh-derive", + "cfg_aliases", +] + +[[package]] +name = "borsh-derive" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0686c856aa6aac0c4498f936d7d6a02df690f614c03e4d906d1018062b5c5e2c" +dependencies = [ + "once_cell", + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.114", +] + [[package]] name = "brotli" -version = "7.0.0" +version = "8.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc97b8f16f944bba54f0433f07e30be199b6dc2bd25937444bbad560bcea29bd" +checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -873,9 +1042,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "4.0.3" +version = "5.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a334ef7c9e23abf0ce748e8cd309037da93e606ad52eb372e4ce327a0dcfbdfd" +checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -887,6 +1056,34 @@ version = "3.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" +[[package]] +name = "bytecheck" +version = "0.6.12" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "23cdc57ce23ac53c931e88a43d06d070a6fd142f2617be5855eb75efc9beb1c2" +dependencies = [ + "bytecheck_derive", + "ptr_meta", + "simdutf8", +] + +[[package]] +name = "bytecheck_derive" +version = "0.6.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3db406d29fbcd95542e92559bed4d8ad92636d1ca8b3b72ede10b4bcc010e659" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" + [[package]] name = "byteorder" version = "1.5.0" @@ -927,19 +1124,24 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "chrono" -version = "0.4.39" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" +checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" dependencies = [ - "android-tzdata", "iana-time-zone", "js-sys", "num-traits", "serde", "wasm-bindgen", - "windows-targets 0.52.6", + "windows-link", ] [[package]] @@ -983,7 +1185,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -1007,6 +1209,15 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "concurrent-queue" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "const-oid" version = "0.9.6" @@ -1085,6 +1296,15 @@ dependencies = [ "spin", ] +[[package]] +name = "crc32c" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" +dependencies = [ + "rustc_version", +] + [[package]] name = "crc32fast" version = "1.5.0" @@ -1094,6 +1314,30 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "crunchy" version = "0.2.4" @@ -1154,96 +1398,254 @@ dependencies = [ ] [[package]] -name = "der" -version = "0.6.1" +name = "darling" +version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1a467a65c5e759bce6e65eaf91cc29f466cdc57cb65777bd646872a8a1fd4de" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" dependencies = [ - "const-oid", - "zeroize", + "darling_core 0.20.11", + "darling_macro 0.20.11", ] [[package]] -name = "deranged" -version = "0.5.5" +name = "darling" +version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" +checksum = 
"9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" dependencies = [ - "powerfmt", + "darling_core 0.21.3", + "darling_macro 0.21.3", ] [[package]] -name = "digest" -version = "0.10.7" +name = "darling" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" dependencies = [ - "block-buffer", - "crypto-common", - "subtle", + "darling_core 0.23.0", + "darling_macro 0.23.0", ] [[package]] -name = "displaydoc" -version = "0.2.5" +name = "darling_core" +version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" dependencies = [ + "fnv", + "ident_case", "proc-macro2", "quote", - "syn", + "strsim", + "syn 2.0.114", ] [[package]] -name = "dunce" -version = "1.0.5" +name = "darling_core" +version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" +checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.114", +] [[package]] -name = "ecdsa" -version = "0.14.8" +name = "darling_core" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" dependencies = [ - "der", - "elliptic-curve", - "rfc6979", - "signature", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.114", ] [[package]] -name = "either" -version = "1.15.0" +name = "darling_macro" +version = "0.20.11" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core 0.20.11", + "quote", + "syn 2.0.114", +] [[package]] -name = "elliptic-curve" -version = "0.12.3" +name = "darling_macro" +version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" +checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" dependencies = [ - "base16ct", - "crypto-bigint 0.4.9", - "der", - "digest", - "ff", - "generic-array", - "group", - "pkcs8", - "rand_core 0.6.4", - "sec1", - "subtle", - "zeroize", + "darling_core 0.21.3", + "quote", + "syn 2.0.114", ] [[package]] -name = "equivalent" -version = "1.0.2" +name = "darling_macro" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" - +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" +dependencies = [ + "darling_core 0.23.0", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "der" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1a467a65c5e759bce6e65eaf91cc29f466cdc57cb65777bd646872a8a1fd4de" +dependencies = [ + "const-oid", + "zeroize", +] + +[[package]] +name = "deranged" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" +dependencies = [ + "powerfmt", + "serde_core", +] + +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + 
+[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling 0.20.11", + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn 2.0.114", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "const-oid", + "crypto-common", + "subtle", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "dissimilar" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8975ffdaa0ef3661bfe02dbdcc06c9f829dfafe6a3c474de366a8d5e44276921" + +[[package]] +name = "dlv-list" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f" +dependencies = [ + "const-random", +] + +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + +[[package]] +name = "dyn-clone" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" + +[[package]] +name = "ecdsa" +version = 
"0.14.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" +dependencies = [ + "der", + "elliptic-curve", + "rfc6979", + "signature", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "elliptic-curve" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" +dependencies = [ + "base16ct", + "crypto-bigint 0.4.9", + "der", + "digest", + "ff", + "generic-array", + "group", + "pkcs8", + "rand_core 0.6.4", + "sec1", + "subtle", + "zeroize", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + [[package]] name = "errno" version = "0.3.14" @@ -1254,6 +1656,37 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "event-listener" +version = "5.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13b66accf52311f30a0db42147dadea9850cb48cd070028831ae5f5d4b856ab" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" +dependencies = [ + "event-listener", + "pin-project-lite", +] + +[[package]] +name = "expect-test" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63af43ff4431e848fb47472a920f14fa71c24de13255a5692e93d4e90302acb0" +dependencies = [ + "dissimilar", + "once_cell", +] + [[package]] name = "fallible-iterator" version = "0.2.0" 
@@ -1296,11 +1729,11 @@ checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" [[package]] name = "flatbuffers" -version = "24.12.23" +version = "25.12.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" +checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" dependencies = [ - "bitflags 1.3.2", + "bitflags", "rustc_version", ] @@ -1312,6 +1745,7 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", "miniz_oxide", + "zlib-rs", ] [[package]] @@ -1341,6 +1775,27 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" +[[package]] +name = "funty" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" + +[[package]] +name = "futures" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + [[package]] name = "futures-channel" version = "0.3.31" @@ -1357,6 +1812,34 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.31" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + [[package]] name = "futures-sink" version = "0.3.31" @@ -1375,11 +1858,16 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ + "futures-channel", "futures-core", + "futures-io", + "futures-macro", "futures-sink", "futures-task", + "memchr", "pin-project-lite", "pin-utils", + "slab", ] [[package]] @@ -1399,8 +1887,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi 0.11.1+wasi-snapshot-preview1", + "wasm-bindgen", ] [[package]] @@ -1410,9 +1900,23 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", + "js-sys", "libc", "r-efi", "wasip2", + "wasm-bindgen", +] + +[[package]] +name = "gloo-timers" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb143cf96099802033e0d4f4963b19fd2e0b728bcf076cd9cf7f6634f092994" +dependencies = [ + "futures-channel", + "futures-core", + "js-sys", + "wasm-bindgen", ] [[package]] @@ -1438,7 +1942,7 @@ dependencies = [ "futures-sink", "futures-util", "http 0.2.12", - "indexmap", + "indexmap 2.13.0", "slab", "tokio", "tokio-util", @@ -1457,7 +1961,7 @@ dependencies = [ "futures-core", "futures-sink", "http 1.4.0", - "indexmap", + "indexmap 2.13.0", "slab", "tokio", "tokio-util", @@ -1478,18 +1982,21 
@@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.5" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" dependencies = [ - "ahash", + "ahash 0.7.8", ] [[package]] name = "hashbrown" -version = "0.15.5" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash 0.8.12", +] [[package]] name = "hashbrown" @@ -1532,6 +2039,15 @@ dependencies = [ "digest", ] +[[package]] +name = "home" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "http" version = "0.2.12" @@ -1675,6 +2191,7 @@ dependencies = [ "tokio", "tokio-rustls 0.26.4", "tower-service", + "webpki-roots", ] [[package]] @@ -1725,31 +2242,124 @@ dependencies = [ ] [[package]] -name = "icu_collections" -version = "2.1.1" +name = "iceberg" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" +checksum = "e65918e701cf610ab0cea57f7f31db5bf4f973230c2c160244067bce01f7c5fa" dependencies = [ - "displaydoc", - "potential_utf", - "yoke", - "zerofrom", - "zerovec", + "anyhow", + "apache-avro", + "array-init", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-ord", + "arrow-schema", + "arrow-select", + "arrow-string", + "as-any", + "async-trait", + "backon", + "base64", + "bimap", + "bytes", + "chrono", + "derive_builder", + "expect-test", + "flate2", + "fnv", + "futures", + "itertools", + "moka", + 
"murmur3", + "num-bigint", + "once_cell", + "opendal", + "ordered-float 4.6.0", + "parquet", + "rand 0.8.5", + "reqsign", + "reqwest", + "roaring", + "rust_decimal", + "serde", + "serde_bytes", + "serde_derive", + "serde_json", + "serde_repr", + "serde_with", + "strum", + "tokio", + "typed-builder", + "url", + "uuid", + "zstd", ] [[package]] -name = "icu_locale_core" -version = "2.1.1" +name = "iceberg-catalog-glue" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" +checksum = "abf46eabd76b154e569627ce7642933df7c2450eb770111241f4b40735a2644a" dependencies = [ - "displaydoc", - "litemap", - "tinystr", - "writeable", - "zerovec", -] - + "anyhow", + "async-trait", + "aws-config", + "aws-sdk-glue", + "iceberg", + "serde_json", + "tokio", + "tracing", +] + +[[package]] +name = "iceberg-catalog-rest" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6d5e120317ab88a3af332c17166aad101f2aee9bfb098d63d4525bdd5cc2da7" +dependencies = [ + "async-trait", + "chrono", + "http 1.4.0", + "iceberg", + "itertools", + "reqwest", + "serde", + "serde_derive", + "serde_json", + "tokio", + "tracing", + "typed-builder", + "uuid", +] + +[[package]] +name = "icu_collections" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + [[package]] name = "icu_normalizer" version = "2.1.1" @@ -1805,6 +2415,12 @@ dependencies = [ "zerovec", ] 
+[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.1.0" @@ -1826,6 +2442,17 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", + "serde", +] + [[package]] name = "indexmap" version = "2.13.0" @@ -1834,6 +2461,8 @@ checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" dependencies = [ "equivalent", "hashbrown 0.16.1", + "serde", + "serde_core", ] [[package]] @@ -1848,18 +2477,78 @@ version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" +[[package]] +name = "iri-string" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +[[package]] +name = "jiff" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c867c356cc096b33f4981825ab281ecba3db0acefe60329f044c1789d94c6543" +dependencies = [ + "jiff-static", + "jiff-tzdb-platform", + "log", + "portable-atomic", + "portable-atomic-util", + "serde_core", + "windows-sys 0.61.2", +] + +[[package]] +name = "jiff-static" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7946b4325269738f270bb55b3c19ab5c5040525f83fd625259422a9d25d9be5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "jiff-tzdb" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68971ebff725b9e2ca27a601c5eb38a4c5d64422c4cbab0c535f248087eda5c2" + +[[package]] +name = "jiff-tzdb-platform" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8" +dependencies = [ + "jiff-tzdb", +] + [[package]] name = "jobserver" version = "0.1.34" @@ -1961,7 +2650,7 @@ version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" dependencies = [ - "bitflags 2.10.0", + "bitflags", "libc", ] @@ -2006,13 +2695,19 @@ dependencies = [ "hashbrown 0.16.1", ] +[[package]] +name = "lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" + [[package]] name = "lz4_flex" -version = "0.11.5" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08ab2867e3eeeca90e844d1940eab391c9dc5228783db2ed999acbc0a9ed375a" +checksum = "ab6473172471198271ff72e9379150e9dfd70d8e533e0752a27e515b48dd375e" dependencies = [ - "twox-hash 2.1.2", + "twox-hash", ] [[package]] @@ -2062,26 +2757,38 @@ dependencies = [ ] [[package]] -name = "nu-ansi-term" -version = "0.50.3" +name = "moka" +version = "0.12.13" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +checksum = "b4ac832c50ced444ef6be0767a008b02c106a909ba79d1d830501e94b96f6b7e" dependencies = [ - "windows-sys 0.61.2", + "async-lock", + "crossbeam-channel", + "crossbeam-epoch", + "crossbeam-utils", + "equivalent", + "event-listener", + "futures-util", + "parking_lot", + "portable-atomic", + "smallvec", + "tagptr", + "uuid", ] [[package]] -name = "num" -version = "0.4.3" +name = "murmur3" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" + +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "num-bigint", - "num-complex", - "num-integer", - "num-iter", - "num-rational", - "num-traits", + "windows-sys 0.61.2", ] [[package]] @@ -2092,6 +2799,7 @@ checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ "num-integer", "num-traits", + "serde", ] [[package]] @@ -2118,28 +2826,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "num-iter" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" -dependencies = [ - "autocfg", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-rational" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" -dependencies = [ - "num-bigint", - "num-integer", - "num-traits", -] - [[package]] name = "num-traits" version = "0.2.19" @@ -2162,6 +2848,35 @@ version = "1.70.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "opendal" +version = "0.55.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d075ab8a203a6ab4bc1bce0a4b9fe486a72bf8b939037f4b78d95386384bc80a" +dependencies = [ + "anyhow", + "backon", + "base64", + "bytes", + "crc32c", + "futures", + "getrandom 0.2.17", + "http 1.4.0", + "http-body 1.0.1", + "jiff", + "log", + "md-5", + "percent-encoding", + "quick-xml 0.38.4", + "reqsign", + "reqwest", + "serde", + "serde_json", + "tokio", + "url", + "uuid", +] + [[package]] name = "openssl-probe" version = "0.2.1" @@ -2177,6 +2892,25 @@ dependencies = [ "num-traits", ] +[[package]] +name = "ordered-float" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951" +dependencies = [ + "num-traits", +] + +[[package]] +name = "ordered-multimap" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79" +dependencies = [ + "dlv-list", + "hashbrown 0.14.5", +] + [[package]] name = "outref" version = "0.5.2" @@ -2194,6 +2928,12 @@ dependencies = [ "sha2", ] +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + [[package]] name = "parking_lot" version = "0.12.5" @@ -2219,11 +2959,11 @@ dependencies = [ [[package]] name = "parquet" -version = "53.4.1" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f8cf58b29782a7add991f655ff42929e31a7859f5319e53db9e39a714cb113c" +checksum = "6ee96b29972a257b855ff2341b37e61af5f12d6af1158b6dcdb5b31ea07bb3cb" dependencies = [ - "ahash", + "ahash 0.8.12", "arrow-array", 
"arrow-buffer", "arrow-cast", @@ -2236,18 +2976,21 @@ dependencies = [ "bytes", "chrono", "flate2", + "futures", "half", - "hashbrown 0.15.5", + "hashbrown 0.16.1", "lz4_flex", - "num", "num-bigint", + "num-integer", + "num-traits", "paste", "seq-macro", + "simdutf8", "snap", "thrift", - "twox-hash 1.6.3", + "tokio", + "twox-hash", "zstd", - "zstd-sys", ] [[package]] @@ -2327,6 +3070,21 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "portable-atomic-util" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a9db96d7fa8782dd8c15ce32ffe8680bbd1e978a43bf51a34d39483540495f5" +dependencies = [ + "portable-atomic", +] + [[package]] name = "postgres-protocol" version = "0.6.10" @@ -2340,7 +3098,7 @@ dependencies = [ "hmac", "md-5", "memchr", - "rand", + "rand 0.9.2", "sha2", "stringprep", ] @@ -2384,6 +3142,25 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn 2.0.114", +] + +[[package]] +name = "proc-macro-crate" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" +dependencies = [ + "toml_edit", +] + [[package]] name = "proc-macro2" version = "1.0.106" @@ -2393,6 +3170,107 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "ptr_meta" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0738ccf7ea06b608c10564b31debd4f5bc5e197fc8bfe088f68ae5ce81e7a4f1" +dependencies = [ + "ptr_meta_derive", +] + +[[package]] +name = "ptr_meta_derive" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16b845dbfca988fa33db069c0e230574d15a3088f147a87b64c7589eb662c9ac" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "quad-rand" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" + +[[package]] +name = "quick-xml" +version = "0.37.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quick-xml" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls 0.23.36", + "socket2 0.6.2", + "thiserror", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" +dependencies = [ + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand 0.9.2", + "ring", + "rustc-hash", + "rustls 0.23.36", + "rustls-pki-types", + "slab", + "thiserror", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2 0.6.2", + "tracing", + "windows-sys 0.60.2", +] + [[package]] name = "quote" version = "1.0.44" @@ -2408,16 +3286,43 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "radium" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + [[package]] name = "rand" version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ - "rand_chacha", + "rand_chacha 0.9.0", "rand_core 0.9.5", ] +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + [[package]] name = "rand_chacha" version = "0.9.0" @@ -2452,7 +3357,27 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 2.10.0", + "bitflags", +] + +[[package]] +name = "ref-cast" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" +dependencies = [ + 
"ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", ] [[package]] @@ -2490,29 +3415,147 @@ version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c" +[[package]] +name = "rend" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71fe3824f5629716b1589be05dacd749f6aa084c87e00e016714a8cdfccc997c" +dependencies = [ + "bytecheck", +] + +[[package]] +name = "reqsign" +version = "0.16.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43451dbf3590a7590684c25fb8d12ecdcc90ed3ac123433e500447c7d77ed701" +dependencies = [ + "anyhow", + "async-trait", + "base64", + "chrono", + "form_urlencoded", + "getrandom 0.2.17", + "hex", + "hmac", + "home", + "http 1.4.0", + "log", + "percent-encoding", + "quick-xml 0.37.5", + "rand 0.8.5", + "reqwest", + "rust-ini", + "serde", + "serde_json", + "sha1", + "sha2", + "tokio", +] + +[[package]] +name = "reqwest" +version = "0.12.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +dependencies = [ + "base64", + "bytes", + "futures-core", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "hyper 1.8.1", + "hyper-rustls 0.27.7", + "hyper-util", + "js-sys", + "log", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls 0.23.36", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls 0.26.4", + "tokio-util", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + 
"webpki-roots", +] + [[package]] name = "rfc6979" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7743f17af12fa0b03b803ba12cd6a8d9483a587e89c69445e3909655c0b9fabb" +checksum = "7743f17af12fa0b03b803ba12cd6a8d9483a587e89c69445e3909655c0b9fabb" +dependencies = [ + "crypto-bigint 0.4.9", + "hmac", + "zeroize", +] + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "rkyv" +version = "0.7.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2297bf9c81a3f0dc96bc9521370b88f054168c29826a75e89c55ff196e7ed6a1" +dependencies = [ + "bitvec", + "bytecheck", + "bytes", + "hashbrown 0.12.3", + "ptr_meta", + "rend", + "rkyv_derive", + "seahash", + "tinyvec", + "uuid", +] + +[[package]] +name = "rkyv_derive" +version = "0.7.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84d7b42d4b8d06048d3ac8db0eb31bcb942cbeb709f0b5f2b2ebde398d3038f5" dependencies = [ - "crypto-bigint 0.4.9", - "hmac", - "zeroize", + "proc-macro2", + "quote", + "syn 1.0.109", ] [[package]] -name = "ring" -version = "0.17.14" +name = "roaring" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +checksum = "8ba9ce64a8f45d7fc86358410bb1a82e8c987504c0d4900e9141d69a9f26c885" dependencies = [ - "cc", - "cfg-if", - "getrandom 0.2.17", - "libc", - "untrusted", - "windows-sys 0.52.0", + "bytemuck", + "byteorder", ] [[package]] @@ -2521,7 +3564,7 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7753b721174eb8ff87a9a0e799e2d7bc3749323e773db92e0984debb00019d6e" dependencies = [ - 
"bitflags 2.10.0", + "bitflags", "fallible-iterator 0.3.0", "fallible-streaming-iterator", "hashlink", @@ -2529,6 +3572,38 @@ dependencies = [ "smallvec", ] +[[package]] +name = "rust-ini" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "796e8d2b6696392a43bea58116b667fb4c29727dc5abd27d6acf338bb4f688c7" +dependencies = [ + "cfg-if", + "ordered-multimap", +] + +[[package]] +name = "rust_decimal" +version = "1.40.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61f703d19852dbf87cbc513643fa81428361eb6940f1ac14fd58155d295a3eb0" +dependencies = [ + "arrayvec", + "borsh", + "bytes", + "num-traits", + "rand 0.8.5", + "rkyv", + "serde", + "serde_json", +] + +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + [[package]] name = "rustc_version" version = "0.4.1" @@ -2558,6 +3633,7 @@ checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" dependencies = [ "aws-lc-rs", "once_cell", + "ring", "rustls-pki-types", "rustls-webpki 0.103.9", "subtle", @@ -2582,6 +3658,7 @@ version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" dependencies = [ + "web-time", "zeroize", ] @@ -2618,6 +3695,9 @@ dependencies = [ "bytes", "chrono", "clap", + "iceberg", + "iceberg-catalog-glue", + "iceberg-catalog-rest", "parquet", "postgres-types", "rusqlite", @@ -2652,6 +3732,30 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "schemars" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd191f9397d57d581cddd31014772520aa448f65ef991055d7f61582c65165f" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + "serde_json", +] + +[[package]] +name = "schemars" +version = "1.2.1" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + "serde_json", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -2668,6 +3772,12 @@ dependencies = [ "untrusted", ] +[[package]] +name = "seahash" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" + [[package]] name = "sec1" version = "0.3.0" @@ -2688,7 +3798,7 @@ version = "3.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" dependencies = [ - "bitflags 2.10.0", + "bitflags", "core-foundation", "core-foundation-sys", "libc", @@ -2727,6 +3837,16 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde_bytes" +version = "0.11.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5d440709e79d88e51ac01c4b72fc6cb7314017bb7da9eeff678aa94c10e3ea8" +dependencies = [ + "serde", + "serde_core", +] + [[package]] name = "serde_core" version = "1.0.228" @@ -2744,7 +3864,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -2760,13 +3880,67 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_repr" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] 
+name = "serde_with" +version = "3.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fa237f2807440d238e0364a218270b98f767a00d3dada77b1c53ae88940e2e7" +dependencies = [ + "base64", + "chrono", + "hex", + "indexmap 1.9.3", + "indexmap 2.13.0", + "schemars 0.9.0", + "schemars 1.2.1", + "serde_core", + "serde_json", + "serde_with_macros", + "time", +] + +[[package]] +name = "serde_with_macros" +version = "3.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a8e3ca0ca629121f70ab50f95249e5a6f925cc0f6ffe8256c45b728875706c" +dependencies = [ + "darling 0.21.3", + "proc-macro2", + "quote", + "syn 2.0.114", +] + [[package]] name = "serde_yaml" version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ - "indexmap", + "indexmap 2.13.0", "itoa", "ryu", "serde", @@ -2836,6 +4010,12 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "siphasher" version = "1.0.2" @@ -2902,12 +4082,6 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - [[package]] name = "stringprep" version = "0.1.5" @@ -2925,12 +4099,44 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "strum" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.114", +] + [[package]] name = "subtle" version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.114" @@ -2942,6 +4148,15 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + [[package]] name = "synstructure" version = "0.13.2" @@ -2950,7 +4165,39 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", +] + +[[package]] +name = "tagptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" + +[[package]] +name = "tap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + 
+[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", ] [[package]] @@ -2970,7 +4217,7 @@ checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" dependencies = [ "byteorder", "integer-encoding", - "ordered-float", + "ordered-float 2.10.1", ] [[package]] @@ -2980,6 +4227,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", + "itoa", "num-conv", "powerfmt", "serde_core", @@ -3062,7 +4310,7 @@ checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -3084,7 +4332,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "postgres-types", - "rand", + "rand 0.9.2", "socket2 0.6.2", "tokio", "tokio-util", @@ -3124,12 +4372,65 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml_datetime" +version = "0.7.5+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347" +dependencies = [ + "serde_core", +] + +[[package]] +name = "toml_edit" +version = "0.23.10+spec-1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269" +dependencies = [ + "indexmap 2.13.0", + "toml_datetime", + "toml_parser", + "winnow", +] + +[[package]] +name = "toml_parser" +version = "1.0.8+spec-1.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0742ff5ff03ea7e67c8ae6c93cac239e0d9784833362da3f9a9c1da8dfefcbdc" +dependencies = [ + "winnow", +] + [[package]] name = "tower" version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-http" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" +dependencies = [ + "bitflags", + "bytes", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "iri-string", + "pin-project-lite", + "tower", "tower-layer", "tower-service", ] @@ -3165,7 +4466,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -3215,19 +4516,29 @@ checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "twox-hash" -version = "1.6.3" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" + +[[package]] +name = "typed-builder" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +checksum = "cd9d30e3a08026c78f246b173243cf07b3696d274debd26680773b6773c2afc7" dependencies = [ - "cfg-if", - "static_assertions", + "typed-builder-macro", ] [[package]] -name = "twox-hash" -version = "2.1.2" +name = "typed-builder-macro" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" +checksum = 
"3c36781cc0e46a83726d9879608e4cf6c2505237e263a8eb8c24502989cfdb28" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] [[package]] name = "typenum" @@ -3310,6 +4621,7 @@ version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee48d38b119b0cd71fe4141b30f5ba9c7c5d9f4e7a3a8b4a674e4b6ef789976f" dependencies = [ + "getrandom 0.3.4", "js-sys", "serde_core", "wasm-bindgen", @@ -3394,6 +4706,20 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" +dependencies = [ + "cfg-if", + "futures-util", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.108" @@ -3413,7 +4739,7 @@ dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn", + "syn 2.0.114", "wasm-bindgen-shared", ] @@ -3426,6 +4752,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "web-sys" version = "0.3.85" @@ -3436,6 +4775,25 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki-roots" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "whoami" version 
= "2.1.0" @@ -3468,7 +4826,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -3479,7 +4837,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -3662,6 +5020,15 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winnow" +version = "0.7.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +dependencies = [ + "memchr", +] + [[package]] name = "wit-bindgen" version = "0.51.0" @@ -3674,6 +5041,15 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" +[[package]] +name = "wyz" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" +dependencies = [ + "tap", +] + [[package]] name = "xmlparser" version = "0.13.6" @@ -3699,7 +5075,7 @@ checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", "synstructure", ] @@ -3720,7 +5096,7 @@ checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -3740,7 +5116,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", "synstructure", ] @@ -3780,9 +5156,15 @@ checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" dependencies = [ "proc-macro2", "quote", - "syn", + 
"syn 2.0.114", ] +[[package]] +name = "zlib-rs" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7948af682ccbc3342b6e9420e8c51c1fe5d7bf7756002b4a3c6cabfe96a7e3c" + [[package]] name = "zmij" version = "1.0.20" diff --git a/Cargo.toml b/Cargo.toml index 1f87524..859b41d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,9 +17,9 @@ tokio = { version = "1", features = ["full"] } tokio-postgres = { version = "0.7", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } postgres-types = { version = "0.2", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } -# Arrow / Parquet -arrow = { version = "53", features = ["chrono-tz"] } -parquet = { version = "53", features = ["arrow"] } +# Arrow / Parquet (v57 to match iceberg 0.8) +arrow = { version = "57", features = ["chrono-tz"] } +parquet = { version = "57", features = ["arrow"] } # CLI clap = { version = "4", features = ["derive"] } @@ -39,11 +39,23 @@ rusqlite = { version = "0.32", features = ["bundled"] } tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } +# Iceberg +iceberg = "0.8" +iceberg-catalog-rest = "0.8" + # Utils chrono = { version = "0.4", features = ["serde"] } uuid = { version = "1", features = ["serde"] } anyhow = "1" serde_json = "1" +[features] +default = [] +glue = ["iceberg-catalog-glue"] + +[dependencies.iceberg-catalog-glue] +version = "0.8" +optional = true + [dev-dependencies] bytes = "1" diff --git a/config.example.yaml b/config.example.yaml index 51f1d9b..b4ba02f 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -24,6 +24,18 @@ output: # prefix: raw/postgres # region: us-east-1 +# Output format: parquet (default) or iceberg +# format: parquet + +# Iceberg output (uncomment to use instead of standalone Parquet) +# Writes proper Iceberg table metadata so Spark, Trino, Athena can query it. 
+# format: iceberg +# warehouse: s3://my-bucket/warehouse # or ./local_warehouse +# catalog: +# type: filesystem # default, zero setup +# # type: glue # for Athena (requires --features glue) +# # glue_database: my_db # required when type=glue + # Batch size for reading rows from Postgres batch_size: 10000 diff --git a/src/catalog.rs b/src/catalog.rs new file mode 100644 index 0000000..995fb53 --- /dev/null +++ b/src/catalog.rs @@ -0,0 +1,63 @@ +use anyhow::{Context, Result}; +use iceberg::memory::{MemoryCatalogBuilder, MEMORY_CATALOG_WAREHOUSE}; +use iceberg::{Catalog, CatalogBuilder}; +use std::collections::HashMap; +use std::sync::Arc; + +use crate::config::{CatalogConfig, CatalogType}; + +/// Build a catalog from the config. Returns an Arc so callers +/// don't need to know the concrete type. +/// +/// - **filesystem** (default): Uses a MemoryCatalog backed by FileIO pointed +/// at the warehouse path. Metadata JSON and Parquet live side-by-side. +/// - **glue**: Requires the `glue` feature flag. 
+pub async fn build_catalog(
+    warehouse: &str,
+    catalog_config: Option<&CatalogConfig>,
+) -> Result<Arc<dyn Catalog>> {
+    let catalog_type = catalog_config
+        .map(|c| &c.catalog_type)
+        .unwrap_or(&CatalogType::Filesystem);
+
+    match catalog_type {
+        CatalogType::Filesystem => build_filesystem_catalog(warehouse).await,
+        CatalogType::Glue => build_glue_catalog(warehouse, catalog_config.unwrap()).await,
+    }
+}
+
+async fn build_filesystem_catalog(warehouse: &str) -> Result<Arc<dyn Catalog>> {
+    let mut props = HashMap::new();
+    props.insert(MEMORY_CATALOG_WAREHOUSE.to_string(), warehouse.to_string());
+
+    let catalog = MemoryCatalogBuilder::default()
+        .load("rustream", props)
+        .await
+        .context("building filesystem catalog")?;
+
+    tracing::info!(warehouse = %warehouse, "created filesystem catalog");
+    Ok(Arc::new(catalog))
+}
+
+async fn build_glue_catalog(
+    _warehouse: &str,
+    catalog_config: &CatalogConfig,
+) -> Result<Arc<dyn Catalog>> {
+    let _db = catalog_config
+        .glue_database
+        .as_deref()
+        .expect("glue_database validated at config load");
+
+    #[cfg(feature = "glue")]
+    {
+        anyhow::bail!("Glue catalog support is compiled but not yet wired up. Coming soon.");
+    }
+
+    #[cfg(not(feature = "glue"))]
+    {
+        anyhow::bail!(
+            "Glue catalog requires the 'glue' feature. 
\ + Rebuild with: cargo build --features glue" + ); + } +} diff --git a/src/config.rs b/src/config.rs index bb73260..f79b2cc 100644 --- a/src/config.rs +++ b/src/config.rs @@ -16,6 +16,36 @@ pub struct Config { pub batch_size: usize, #[serde(default)] pub state_dir: Option, + #[serde(default)] + pub format: OutputFormat, + #[serde(default)] + pub catalog: Option, + #[serde(default)] + pub warehouse: Option, +} + +#[derive(Debug, Deserialize, Clone, Default, PartialEq)] +#[serde(rename_all = "snake_case")] +pub enum OutputFormat { + #[default] + Parquet, + Iceberg, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct CatalogConfig { + #[serde(default = "default_catalog_type", rename = "type")] + pub catalog_type: CatalogType, + #[serde(default)] + pub glue_database: Option, +} + +#[derive(Debug, Deserialize, Clone, Default)] +#[serde(rename_all = "snake_case")] +pub enum CatalogType { + #[default] + Filesystem, + Glue, } #[derive(Debug, Deserialize, Clone)] @@ -106,11 +136,28 @@ fn default_batch_size() -> usize { 10_000 } +fn default_catalog_type() -> CatalogType { + CatalogType::Filesystem +} + pub fn load(path: &str) -> Result { let content = std::fs::read_to_string(Path::new(path)) .with_context(|| format!("reading config from {path}"))?; let config: Config = serde_yaml::from_str(&content).with_context(|| format!("parsing config from {path}"))?; + + // Validate: iceberg format requires a warehouse path + if config.format == OutputFormat::Iceberg && config.warehouse.is_none() { + anyhow::bail!("'warehouse' is required when format is 'iceberg'"); + } + + // Validate: glue catalog requires glue_database + if let Some(ref cat) = config.catalog { + if matches!(cat.catalog_type, CatalogType::Glue) && cat.glue_database.is_none() { + anyhow::bail!("'glue_database' is required when catalog type is 'glue'"); + } + } + Ok(config) } diff --git a/src/iceberg.rs b/src/iceberg.rs new file mode 100644 index 0000000..77d03bb --- /dev/null +++ b/src/iceberg.rs @@ -0,0 +1,129 @@ 
+use anyhow::{Context, Result}; +use arrow::record_batch::RecordBatch; +use iceberg::arrow::arrow_schema_to_schema; +use iceberg::spec::DataFileFormat; +use iceberg::transaction::{ApplyTransactionAction, Transaction}; +use iceberg::writer::base_writer::data_file_writer::DataFileWriterBuilder; +use iceberg::writer::file_writer::location_generator::{ + DefaultFileNameGenerator, DefaultLocationGenerator, +}; +use iceberg::writer::file_writer::rolling_writer::RollingFileWriterBuilder; +use iceberg::writer::file_writer::ParquetWriterBuilder; +use iceberg::writer::{IcebergWriter, IcebergWriterBuilder}; +use iceberg::{Catalog, NamespaceIdent, TableCreation, TableIdent}; +use parquet::file::properties::WriterProperties; +use std::sync::Arc; + +/// Write RecordBatches to an Iceberg table. +/// +/// - Loads the table if it exists, otherwise creates it from the Arrow schema. +/// - Writes each batch through the Iceberg writer pipeline. +/// - Commits all data files in a single FastAppend transaction. 
+pub async fn write_iceberg( + catalog: &Arc, + table_name: &str, + batches: &[RecordBatch], +) -> Result<()> { + if batches.is_empty() { + return Ok(()); + } + + let arrow_schema = batches[0].schema(); + let namespace = NamespaceIdent::new("default".to_string()); + let table_ident = TableIdent::new(namespace.clone(), table_name.to_string()); + + // Load or create the table + let table = match catalog.table_exists(&table_ident).await { + Ok(true) => catalog + .load_table(&table_ident) + .await + .with_context(|| format!("loading Iceberg table {table_name}"))?, + _ => { + // Ensure namespace exists + if catalog + .list_namespaces(None) + .await + .map_or(true, |ns| !ns.iter().any(|n| n == &namespace)) + { + let _ = catalog + .create_namespace(&namespace, Default::default()) + .await; + } + + let iceberg_schema = arrow_schema_to_schema(&arrow_schema) + .context("converting Arrow schema to Iceberg schema")?; + + let creation = TableCreation::builder() + .name(table_name.to_string()) + .schema(iceberg_schema) + .build(); + + catalog + .create_table(&namespace, creation) + .await + .with_context(|| format!("creating Iceberg table {table_name}"))? 
+ } + }; + + tracing::info!(table = %table_name, "writing to Iceberg table"); + + // Build the writer pipeline: + // ParquetWriterBuilder → RollingFileWriterBuilder → DataFileWriterBuilder + let file_io = table.file_io().clone(); + let location_gen = + DefaultLocationGenerator::new(table.metadata().clone()).context("location generator")?; + let file_name_gen = + DefaultFileNameGenerator::new("data".to_string(), None, DataFileFormat::Parquet); + + let props = WriterProperties::builder().build(); + let iceberg_schema = table.metadata().current_schema().clone(); + let parquet_builder = ParquetWriterBuilder::new(props, iceberg_schema); + + let rolling_builder = RollingFileWriterBuilder::new_with_default_file_size( + parquet_builder, + file_io, + location_gen, + file_name_gen, + ); + + let data_file_builder = DataFileWriterBuilder::new(rolling_builder); + let mut writer = data_file_builder + .build(None) + .await + .context("building Iceberg data file writer")?; + + // Write each batch + for batch in batches { + writer + .write(batch.clone()) + .await + .context("writing RecordBatch to Iceberg")?; + } + + // Close writer to get data files + let data_files = writer.close().await.context("closing Iceberg writer")?; + + if data_files.is_empty() { + tracing::warn!(table = %table_name, "no data files produced"); + return Ok(()); + } + + tracing::info!( + table = %table_name, + files = data_files.len(), + "committing data files" + ); + + // Commit via FastAppend transaction + let tx = Transaction::new(&table); + let action = tx.fast_append().add_data_files(data_files); + let tx = action + .apply(tx) + .context("applying fast append to transaction")?; + tx.commit(catalog.as_ref()) + .await + .context("committing Iceberg transaction")?; + + tracing::info!(table = %table_name, "Iceberg commit complete"); + Ok(()) +} diff --git a/src/main.rs b/src/main.rs index dd7c1be..2ee5a04 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,6 @@ +mod catalog; mod config; +mod iceberg; mod 
output; mod reader; mod schema; diff --git a/src/sync.rs b/src/sync.rs index 4482c8d..4deb46a 100644 --- a/src/sync.rs +++ b/src/sync.rs @@ -1,8 +1,13 @@ use anyhow::{anyhow, Context, Result}; +use arrow::record_batch::RecordBatch; use chrono::Utc; +use iceberg::Catalog; +use std::sync::Arc; use tokio_postgres::NoTls; -use crate::config::{Config, PartitionBy, TableConfig}; +use crate::catalog; +use crate::config::{Config, OutputFormat, PartitionBy, TableConfig}; +use crate::iceberg as iceberg_write; use crate::output; use crate::reader; use crate::schema::{self, ColumnInfo}; @@ -118,8 +123,18 @@ pub async fn run(config: Config) -> Result<()> { .unwrap_or_else(|| ".rustream_state".to_string()); let state = StateStore::open(&state_dir)?; + // Build Iceberg catalog if format is iceberg + let iceberg_catalog: Option> = if config.format == OutputFormat::Iceberg { + let warehouse = config.warehouse.as_deref().expect("validated at load"); + let cat = catalog::build_catalog(warehouse, config.catalog.as_ref()).await?; + Some(cat) + } else { + None + }; + for table in &tables { - if let Err(e) = sync_table(&client, &config, table, &state).await { + if let Err(e) = sync_table(&client, &config, table, &state, iceberg_catalog.as_ref()).await + { tracing::error!(table = %table.full_name(), error = %e, "failed to sync table"); } } @@ -133,6 +148,7 @@ async fn sync_table( config: &Config, table: &TableConfig, state: &StateStore, + iceberg_catalog: Option<&Arc>, ) -> Result<()> { let table_name = table.full_name(); tracing::info!(table = %table_name, "starting sync"); @@ -220,6 +236,8 @@ async fn sync_table( let mut total_rows = 0u64; let mut batch_num = 0u32; + // Collect batches for Iceberg (needs all batches for a single commit) + let mut iceberg_batches: Vec = Vec::new(); loop { let batch = reader::read_batch( @@ -266,53 +284,69 @@ async fn sync_table( None => None, }; - // Write parquet to buffer - let mut buf = Vec::new(); - writer::write_parquet(&mut buf, &[batch])?; - - // 
Generate output filename - let now = Utc::now(); - let filename = match &table.partition_by { - Some(PartitionBy::Date) => format!( - "{}/year={}/month={:02}/day={:02}/{}_{:04}.parquet", - table.name, - now.format("%Y"), - now.format("%m"), - now.format("%d"), - now.format("%H%M%S"), - batch_num - ), - Some(PartitionBy::Month) => format!( - "{}/year={}/month={:02}/{}_{:04}.parquet", - table.name, - now.format("%Y"), - now.format("%m"), - now.format("%d_%H%M%S"), - batch_num - ), - Some(PartitionBy::Year) => format!( - "{}/year={}/{}_{:04}.parquet", - table.name, - now.format("%Y"), - now.format("%m%d_%H%M%S"), - batch_num - ), - None => format!( - "{}/{}_{:04}.parquet", - table.name, - now.format("%Y%m%d_%H%M%S"), - batch_num - ), - }; - - output::write_output(&config.output, &filename, buf).await?; + match config.format { + OutputFormat::Parquet => { + // Write parquet to buffer + let mut buf = Vec::new(); + writer::write_parquet(&mut buf, &[batch])?; + + // Generate output filename + let now = Utc::now(); + let filename = match &table.partition_by { + Some(PartitionBy::Date) => format!( + "{}/year={}/month={:02}/day={:02}/{}_{:04}.parquet", + table.name, + now.format("%Y"), + now.format("%m"), + now.format("%d"), + now.format("%H%M%S"), + batch_num + ), + Some(PartitionBy::Month) => format!( + "{}/year={}/month={:02}/{}_{:04}.parquet", + table.name, + now.format("%Y"), + now.format("%m"), + now.format("%d_%H%M%S"), + batch_num + ), + Some(PartitionBy::Year) => format!( + "{}/year={}/{}_{:04}.parquet", + table.name, + now.format("%Y"), + now.format("%m%d_%H%M%S"), + batch_num + ), + None => format!( + "{}/{}_{:04}.parquet", + table.name, + now.format("%Y%m%d_%H%M%S"), + batch_num + ), + }; + + output::write_output(&config.output, &filename, buf).await?; + } + OutputFormat::Iceberg => { + iceberg_batches.push(batch); + } + } - // Persist incremental progress batch-by-batch so reruns can resume safely. + // Always advance in-memory cursor for next batch query. 
if let Some(ref wm) = new_watermark { - state.set_progress(&table_name, wm, new_cursor.as_deref())?; watermark_val = Some(wm.clone()); - cursor_val = new_cursor; - tracing::debug!(table = %table_name, watermark = %wm, cursor = ?cursor_val, "checkpointed watermark"); + cursor_val = new_cursor.clone(); + + // For Parquet we can checkpoint immediately after each successful file write. + if config.format == OutputFormat::Parquet { + state.set_progress(&table_name, wm, new_cursor.as_deref())?; + tracing::debug!( + table = %table_name, + watermark = %wm, + cursor = ?cursor_val, + "checkpointed watermark" + ); + } } batch_num += 1; @@ -323,6 +357,21 @@ async fn sync_table( } } + // For Iceberg format, write all batches in a single commit + if config.format == OutputFormat::Iceberg && !iceberg_batches.is_empty() { + let catalog = iceberg_catalog.expect("iceberg catalog required for iceberg format"); + iceberg_write::write_iceberg(catalog, &table_name, &iceberg_batches).await?; + if let Some(ref wm) = watermark_val { + state.set_progress(&table_name, wm, cursor_val.as_deref())?; + tracing::debug!( + table = %table_name, + watermark = %wm, + cursor = ?cursor_val, + "checkpointed watermark after iceberg commit" + ); + } + } + if let Some(ref wm) = watermark_val { tracing::info!(table = %table_name, watermark = %wm, "final watermark"); } From fd444b6778bcf252776483ad1602ca641e3c8b3b Mon Sep 17 00:00:00 2001 From: Maria Dubyaga Date: Fri, 20 Feb 2026 20:08:59 -0500 Subject: [PATCH 2/8] =?UTF-8?q?Add=20S3/local=20=E2=86=92=20Postgres=20ing?= =?UTF-8?q?est=20subcommand?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New `rustream ingest` subcommand reads Parquet/CSV files from local filesystem or S3 and writes them to Postgres via batch INSERT/UPSERT. Supports write modes (insert, upsert, truncate_insert), auto table creation from file schema, glob-based file discovery, and SQLite-based ingestion tracking to avoid reprocessing. 
--- Cargo.lock | 41 ++++ Cargo.toml | 5 +- config.example.yaml | 35 ++++ src/config.rs | 222 ++++++++++++++++++++- src/ingest.rs | 291 +++++++++++++++++++++++++++ src/input.rs | 365 ++++++++++++++++++++++++++++++++++ src/main.rs | 25 +++ src/pg_writer.rs | 467 ++++++++++++++++++++++++++++++++++++++++++++ src/state.rs | 73 ++++++- src/sync.rs | 16 +- src/types.rs | 82 +++++++- 11 files changed, 1608 insertions(+), 14 deletions(-) create mode 100644 src/ingest.rs create mode 100644 src/input.rs create mode 100644 src/pg_writer.rs diff --git a/Cargo.lock b/Cargo.lock index eb4f312..9325e15 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1907,6 +1907,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + [[package]] name = "gloo-timers" version = "0.3.0" @@ -2665,6 +2671,12 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "linux-raw-sys" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" + [[package]] name = "litemap" version = "0.8.1" @@ -3613,6 +3625,19 @@ dependencies = [ "semver", ] +[[package]] +name = "rustix" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", +] + [[package]] name = "rustls" version = "0.21.12" @@ -3690,11 +3715,13 @@ version = "0.1.2" dependencies = [ "anyhow", "arrow", + "arrow-csv", "aws-config", "aws-sdk-s3", "bytes", "chrono", "clap", + "glob", "iceberg", "iceberg-catalog-glue", "iceberg-catalog-rest", @@ -3704,6 +3731,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", + "tempfile", "tokio", "tokio-postgres", "tracing", @@ -4180,6 +4208,19 
@@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" +[[package]] +name = "tempfile" +version = "3.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1" +dependencies = [ + "fastrand", + "getrandom 0.3.4", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + [[package]] name = "thiserror" version = "2.0.18" diff --git a/Cargo.toml b/Cargo.toml index 859b41d..6134f9a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,9 @@ chrono = { version = "0.4", features = ["serde"] } uuid = { version = "1", features = ["serde"] } anyhow = "1" serde_json = "1" +bytes = "1" +glob = "0.3" +arrow-csv = "57" [features] default = [] @@ -58,4 +61,4 @@ version = "0.8" optional = true [dev-dependencies] -bytes = "1" +tempfile = "3" diff --git a/config.example.yaml b/config.example.yaml index b4ba02f..bff6e5e 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -73,3 +73,38 @@ tables: # exclude: # skip these tables # - schema_migrations # - ar_internal_metadata + +# ─── Ingest: load Parquet/CSV files into Postgres ─────────────── +# Preview before ingesting: +# rustream ingest --config config.yaml --dry-run +# +# ingest: +# input: +# type: local +# path: ./parquet_files +# pattern: "**/*.parquet" +# +# # S3 input (uncomment to use instead of local): +# # input: +# # type: s3 +# # bucket: my-data-lake +# # prefix: raw/postgres +# # region: us-east-1 +# # pattern: "**/*.parquet" +# +# file_format: parquet # "parquet" or "csv" +# write_mode: insert # "insert" | "upsert" | "truncate_insert" +# batch_size: 5000 +# target_schema: public # Postgres schema for target tables +# +# tables: +# - file_pattern: "users/*.parquet" +# target_table: users +# key_columns: [id] # required for upsert mode +# create_if_missing: true # auto-CREATE TABLE from file schema +# +# - 
file_pattern: "orders/**/*.parquet" +# target_table: orders +# key_columns: [id] +# +# # If no tables listed, table name is inferred from directory/filename diff --git a/src/config.rs b/src/config.rs index f79b2cc..17c78b6 100644 --- a/src/config.rs +++ b/src/config.rs @@ -5,7 +5,8 @@ use std::path::Path; #[derive(Debug, Deserialize, Clone)] pub struct Config { pub postgres: PostgresConfig, - pub output: OutputConfig, + #[serde(default)] + pub output: Option, #[serde(default)] pub tables: Option>, #[serde(default)] @@ -22,6 +23,8 @@ pub struct Config { pub catalog: Option, #[serde(default)] pub warehouse: Option, + #[serde(default)] + pub ingest: Option, } #[derive(Debug, Deserialize, Clone, Default, PartialEq)] @@ -111,6 +114,83 @@ impl TableConfig { } } +// --- Ingest config --- + +#[derive(Debug, Deserialize, Clone)] +pub struct IngestConfig { + pub input: InputConfig, + #[serde(default = "default_ingest_file_format")] + pub file_format: FileFormat, + #[serde(default)] + pub write_mode: WriteMode, + #[serde(default = "default_ingest_batch_size")] + pub batch_size: usize, + #[serde(default)] + pub tables: Vec, + #[serde(default = "default_schema")] + pub target_schema: String, +} + +#[derive(Debug, Deserialize, Clone)] +#[serde(tag = "type")] +pub enum InputConfig { + #[serde(rename = "local")] + Local { + path: String, + #[serde(default = "default_pattern")] + pattern: String, + }, + #[serde(rename = "s3")] + S3 { + bucket: String, + #[serde(default)] + prefix: Option, + #[serde(default)] + region: Option, + #[serde(default = "default_pattern")] + pattern: String, + }, +} + +#[derive(Debug, Deserialize, Clone, Default, PartialEq)] +#[serde(rename_all = "snake_case")] +pub enum FileFormat { + #[default] + Parquet, + Csv, +} + +#[derive(Debug, Deserialize, Clone, Default, PartialEq)] +#[serde(rename_all = "snake_case")] +pub enum WriteMode { + #[default] + Insert, + Upsert, + TruncateInsert, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct IngestTableConfig 
{ + pub file_pattern: String, + pub target_table: String, + #[serde(default)] + pub key_columns: Vec, + #[serde(default)] + pub create_if_missing: bool, +} + +fn default_pattern() -> String { + "**/*.parquet".to_string() +} + +fn default_ingest_file_format() -> FileFormat { + FileFormat::Parquet +} + +fn default_ingest_batch_size() -> usize { + 5000 +} + impl PostgresConfig { pub fn connection_string(&self) -> String { let mut s = format!( @@ -158,6 +238,20 @@ pub fn load(path: &str) -> Result { } } + // Validate: upsert mode requires key_columns on each table mapping + if let Some(ref ingest) = config.ingest { + if ingest.write_mode == WriteMode::Upsert { + for table in &ingest.tables { + if table.key_columns.is_empty() { + anyhow::bail!( + "ingest table '{}' requires 'key_columns' when write_mode is 'upsert'", + table.target_table + ); + } + } + } + } + Ok(config) } @@ -238,7 +332,7 @@ exclude: assert_eq!(config.exclude, vec!["migrations"]); - match config.output { + match config.output.unwrap() { OutputConfig::S3 { bucket, prefix, @@ -333,4 +427,128 @@ exclude: let result = load("/tmp/nonexistent_rustream_config_xyz.yaml"); assert!(result.is_err()); } + + #[test] + fn parse_ingest_config_local() { + let yaml = r#" +postgres: + host: localhost + database: testdb + user: postgres +output: + type: local + path: ./output +ingest: + input: + type: local + path: ./parquet_files + pattern: "**/*.parquet" + file_format: parquet + write_mode: upsert + batch_size: 3000 + tables: + - file_pattern: "users/*.parquet" + target_table: users + key_columns: [id] + create_if_missing: true +"#; + let config: Config = serde_yaml::from_str(yaml).unwrap(); + let ingest = config.ingest.unwrap(); + assert!(matches!(ingest.input, InputConfig::Local { .. 
})); + assert_eq!(ingest.file_format, FileFormat::Parquet); + assert_eq!(ingest.write_mode, WriteMode::Upsert); + assert_eq!(ingest.batch_size, 3000); + assert_eq!(ingest.tables.len(), 1); + assert_eq!(ingest.tables[0].target_table, "users"); + assert_eq!(ingest.tables[0].key_columns, vec!["id"]); + assert!(ingest.tables[0].create_if_missing); + } + + #[test] + fn parse_ingest_config_s3() { + let yaml = r#" +postgres: + host: localhost + database: testdb + user: postgres +ingest: + input: + type: s3 + bucket: my-bucket + prefix: raw/data + region: us-east-1 + file_format: csv + write_mode: truncate_insert +"#; + let config: Config = serde_yaml::from_str(yaml).unwrap(); + let ingest = config.ingest.unwrap(); + match &ingest.input { + InputConfig::S3 { + bucket, + prefix, + region, + .. + } => { + assert_eq!(bucket, "my-bucket"); + assert_eq!(prefix.as_deref(), Some("raw/data")); + assert_eq!(region.as_deref(), Some("us-east-1")); + } + _ => panic!("expected S3 input config"), + } + assert_eq!(ingest.file_format, FileFormat::Csv); + assert_eq!(ingest.write_mode, WriteMode::TruncateInsert); + } + + #[test] + fn ingest_config_defaults() { + let yaml = r#" +postgres: + host: localhost + database: testdb + user: postgres +ingest: + input: + type: local + path: ./data +"#; + let config: Config = serde_yaml::from_str(yaml).unwrap(); + let ingest = config.ingest.unwrap(); + assert_eq!(ingest.file_format, FileFormat::Parquet); + assert_eq!(ingest.write_mode, WriteMode::Insert); + assert_eq!(ingest.batch_size, 5000); + assert_eq!(ingest.target_schema, "public"); + assert!(ingest.tables.is_empty()); + } + + #[test] + fn config_without_ingest() { + let yaml = r#" +postgres: + host: localhost + database: testdb + user: postgres +output: + type: local + path: ./output +"#; + let config: Config = serde_yaml::from_str(yaml).unwrap(); + assert!(config.ingest.is_none()); + } + + #[test] + fn config_without_output() { + let yaml = r#" +postgres: + host: localhost + database: testdb + 
user: postgres +ingest: + input: + type: local + path: ./data +"#; + let config: Config = serde_yaml::from_str(yaml).unwrap(); + assert!(config.output.is_none()); + assert!(config.ingest.is_some()); + } } diff --git a/src/ingest.rs b/src/ingest.rs new file mode 100644 index 0000000..9a28562 --- /dev/null +++ b/src/ingest.rs @@ -0,0 +1,291 @@ +use anyhow::{Context, Result}; + +use crate::config::{Config, IngestConfig, IngestTableConfig, WriteMode}; +use crate::input; +use crate::pg_writer; +use crate::state::StateStore; +use crate::sync; + +/// Resolve which target table a file should be ingested into. +/// First checks explicit table mappings, then infers from the file path. +fn resolve_target_table(file_key: &str, ingest: &IngestConfig) -> Option { + // Check explicit table mappings + for table_cfg in &ingest.tables { + if let Ok(pattern) = glob::Pattern::new(&table_cfg.file_pattern) { + if pattern.matches(file_key) { + return Some(table_cfg.target_table.clone()); + } + } + } + + // Infer from directory name (parent directory = table name) + let path = std::path::Path::new(file_key); + if let Some(parent) = path.parent() { + let dir_name = parent + .components() + .next() + .map(|c| c.as_os_str().to_string_lossy().to_string()); + if let Some(name) = dir_name { + if !name.is_empty() { + return Some(name); + } + } + } + + // Last resort: use filename without extension + path.file_stem().map(|s| s.to_string_lossy().to_string()) +} + +/// Find the IngestTableConfig for a given file, if any. +fn find_table_config<'a>( + file_key: &str, + ingest: &'a IngestConfig, +) -> Option<&'a IngestTableConfig> { + ingest.tables.iter().find(|t| { + glob::Pattern::new(&t.file_pattern) + .map(|p| p.matches(file_key)) + .unwrap_or(false) + }) +} + +/// Dry run: discover files and show what would be ingested. 
+pub async fn dry_run(config: Config) -> Result<()> { + let ingest = config + .ingest + .as_ref() + .context("'ingest' section required in config")?; + + let files = input::discover_files(&ingest.input).await?; + + if files.is_empty() { + println!("No files found matching the configured pattern."); + return Ok(()); + } + + println!("Would ingest {} files:\n", files.len()); + + for file_key in &files { + let target = + resolve_target_table(file_key, ingest).unwrap_or_else(|| "(unknown)".to_string()); + let table_cfg = find_table_config(file_key, ingest); + let create = table_cfg.is_some_and(|t| t.create_if_missing); + + println!( + " {} → {}{}", + file_key, + target, + if create { " (create if missing)" } else { "" } + ); + } + + println!("\nWrite mode: {:?}", ingest.write_mode); + println!("File format: {:?}", ingest.file_format); + println!("Batch size: {}", ingest.batch_size); + + Ok(()) +} + +/// Run the ingest process: read files → write to Postgres. +pub async fn run(config: Config) -> Result<()> { + let ingest = config + .ingest + .clone() + .context("'ingest' section required in config")?; + + let client = sync::connect(&config).await?; + + let state_dir = config + .state_dir + .clone() + .unwrap_or_else(|| ".rustream_state".to_string()); + let state = StateStore::open(&state_dir)?; + + let files = input::discover_files(&ingest.input).await?; + + if files.is_empty() { + tracing::info!("no files found matching pattern"); + return Ok(()); + } + + tracing::info!(files = files.len(), "discovered files for ingestion"); + + let mut total_files = 0u64; + let mut total_rows = 0u64; + + for file_key in &files { + // Skip already-ingested files + if state.is_file_ingested(file_key)? 
{ + tracing::debug!(file = %file_key, "skipping already-ingested file"); + continue; + } + + let target_table = match resolve_target_table(file_key, &ingest) { + Some(t) => t, + None => { + tracing::warn!(file = %file_key, "could not determine target table, skipping"); + continue; + } + }; + + let table_cfg = find_table_config(file_key, &ingest); + + tracing::info!(file = %file_key, table = %target_table, "ingesting file"); + + // Read the file + let (schema, batches) = input::read_file( + &ingest.input, + file_key, + &ingest.file_format, + ingest.batch_size, + ) + .await + .with_context(|| format!("reading file {file_key}"))?; + + if batches.is_empty() { + tracing::debug!(file = %file_key, "file is empty, skipping"); + continue; + } + + // Ensure table exists if configured + let create_if_missing = table_cfg.is_some_and(|t| t.create_if_missing); + if create_if_missing { + pg_writer::ensure_table(&client, &schema, &target_table, &ingest.target_schema).await?; + } + + // Truncate if write mode is truncate_insert (only once per table per run) + if ingest.write_mode == WriteMode::TruncateInsert { + pg_writer::truncate_table(&client, &target_table, &ingest.target_schema).await?; + } + + // Write batches + let key_columns = table_cfg.map(|t| t.key_columns.clone()).unwrap_or_default(); + + let mut file_rows = 0u64; + for batch in &batches { + let rows = pg_writer::write_batch( + &client, + batch, + &target_table, + &ingest.target_schema, + &ingest.write_mode, + &key_columns, + ingest.batch_size, + ) + .await + .with_context(|| format!("writing batch to {target_table}"))?; + file_rows += rows; + } + + // Mark file as ingested + state.mark_file_ingested(file_key, &target_table, file_rows)?; + + tracing::info!( + file = %file_key, + table = %target_table, + rows = file_rows, + "ingested file" + ); + + total_files += 1; + total_rows += file_rows; + } + + tracing::info!(files = total_files, rows = total_rows, "ingest complete"); + + Ok(()) +} + +#[cfg(test)] +mod tests { + 
use super::*;
+    use crate::config::{FileFormat, InputConfig};
+
+    fn make_ingest_config(tables: Vec<IngestTableConfig>) -> IngestConfig {
+        IngestConfig {
+            input: InputConfig::Local {
+                path: "./data".to_string(),
+                pattern: "**/*.parquet".to_string(),
+            },
+            file_format: FileFormat::Parquet,
+            write_mode: WriteMode::Insert,
+            batch_size: 5000,
+            tables,
+            target_schema: "public".to_string(),
+        }
+    }
+
+    #[test]
+    fn test_resolve_target_table_explicit_mapping() {
+        let ingest = make_ingest_config(vec![IngestTableConfig {
+            file_pattern: "users/*.parquet".to_string(),
+            target_table: "users".to_string(),
+            key_columns: vec![],
+            create_if_missing: false,
+        }]);
+
+        assert_eq!(
+            resolve_target_table("users/part-0.parquet", &ingest),
+            Some("users".to_string())
+        );
+    }
+
+    #[test]
+    fn test_resolve_target_table_infer_from_directory() {
+        let ingest = make_ingest_config(vec![]);
+
+        assert_eq!(
+            resolve_target_table("orders/2024/data.parquet", &ingest),
+            Some("orders".to_string())
+        );
+    }
+
+    #[test]
+    fn test_resolve_target_table_infer_from_filename() {
+        let ingest = make_ingest_config(vec![]);
+
+        assert_eq!(
+            resolve_target_table("products.parquet", &ingest),
+            Some("products".to_string())
+        );
+    }
+
+    #[test]
+    fn test_resolve_target_table_explicit_takes_priority() {
+        let ingest = make_ingest_config(vec![IngestTableConfig {
+            file_pattern: "raw/*.parquet".to_string(),
+            target_table: "customers".to_string(),
+            key_columns: vec![],
+            create_if_missing: false,
+        }]);
+
+        assert_eq!(
+            resolve_target_table("raw/data.parquet", &ingest),
+            Some("customers".to_string())
+        );
+    }
+
+    #[test]
+    fn test_find_table_config() {
+        let ingest = make_ingest_config(vec![
+            IngestTableConfig {
+                file_pattern: "users/*.parquet".to_string(),
+                target_table: "users".to_string(),
+                key_columns: vec!["id".to_string()],
+                create_if_missing: true,
+            },
+            IngestTableConfig {
+                file_pattern: "orders/**/*.parquet".to_string(),
+                target_table: "orders".to_string(),
+                key_columns:
vec!["id".to_string()], + create_if_missing: false, + }, + ]); + + let cfg = find_table_config("users/part-0.parquet", &ingest); + assert!(cfg.is_some()); + assert_eq!(cfg.unwrap().target_table, "users"); + + let cfg = find_table_config("unknown/file.parquet", &ingest); + assert!(cfg.is_none()); + } +} diff --git a/src/input.rs b/src/input.rs new file mode 100644 index 0000000..04fbf17 --- /dev/null +++ b/src/input.rs @@ -0,0 +1,365 @@ +use anyhow::{Context, Result}; +use arrow::datatypes::Schema; +use arrow::record_batch::RecordBatch; +use std::sync::Arc; + +use crate::config::{FileFormat, InputConfig}; + +/// Discover files matching the configured pattern from local filesystem or S3. +pub async fn discover_files(input: &InputConfig) -> Result> { + match input { + InputConfig::Local { path, pattern } => discover_local(path, pattern), + InputConfig::S3 { + bucket, + prefix, + region, + pattern, + } => discover_s3(bucket, prefix.as_deref(), region.as_deref(), pattern).await, + } +} + +/// Read a file into Arrow RecordBatches. +pub async fn read_file( + input: &InputConfig, + file_key: &str, + format: &FileFormat, + batch_size: usize, +) -> Result<(Arc, Vec)> { + match input { + InputConfig::Local { path, .. } => { + let full_path = std::path::Path::new(path).join(file_key); + let full_path_str = full_path.to_str().context("invalid file path")?; + match format { + FileFormat::Parquet => read_parquet_file(full_path_str, batch_size), + FileFormat::Csv => read_csv_file(full_path_str, batch_size), + } + } + InputConfig::S3 { + bucket, + prefix, + region, + .. 
+ } => { + let s3_key = match prefix { + Some(p) => format!("{}/{}", p.trim_end_matches('/'), file_key), + None => file_key.to_string(), + }; + let data = read_s3_object(bucket, &s3_key, region.as_deref()).await?; + match format { + FileFormat::Parquet => read_parquet_bytes(&data, batch_size), + FileFormat::Csv => read_csv_bytes(&data, batch_size), + } + } + } +} + +// --- Local file discovery --- + +fn discover_local(base_path: &str, pattern: &str) -> Result> { + let full_pattern = format!( + "{}/{}", + base_path.trim_end_matches('/'), + pattern.trim_start_matches('/') + ); + + let mut files: Vec = Vec::new(); + for entry in glob::glob(&full_pattern) + .with_context(|| format!("invalid glob pattern: {full_pattern}"))? + { + let path = entry.context("reading glob entry")?; + if path.is_file() { + // Store relative path from base_path + let relative = path + .strip_prefix(base_path) + .unwrap_or(&path) + .to_string_lossy() + .to_string(); + files.push(relative); + } + } + + files.sort(); + Ok(files) +} + +// --- S3 file discovery --- + +async fn discover_s3( + bucket: &str, + prefix: Option<&str>, + region: Option<&str>, + pattern: &str, +) -> Result> { + let mut config_loader = aws_config::from_env(); + if let Some(r) = region { + config_loader = config_loader.region(aws_config::Region::new(r.to_string())); + } + let aws_config = config_loader.load().await; + let client = aws_sdk_s3::Client::new(&aws_config); + + let prefix_str = prefix.unwrap_or(""); + let mut files = Vec::new(); + let mut continuation_token: Option = None; + + let glob_pattern = + glob::Pattern::new(pattern).with_context(|| format!("invalid glob pattern: {pattern}"))?; + + loop { + let mut req = client.list_objects_v2().bucket(bucket).prefix(prefix_str); + if let Some(token) = &continuation_token { + req = req.continuation_token(token); + } + + let resp = req + .send() + .await + .with_context(|| format!("listing objects in s3://{bucket}/{prefix_str}"))?; + + for obj in resp.contents() { + if let 
Some(key) = obj.key() { + // Strip prefix to get relative key + let relative = if !prefix_str.is_empty() { + key.strip_prefix(prefix_str) + .unwrap_or(key) + .trim_start_matches('/') + } else { + key + }; + + if glob_pattern.matches(relative) { + files.push(relative.to_string()); + } + } + } + + if resp.is_truncated() == Some(true) { + continuation_token = resp.next_continuation_token().map(|s| s.to_string()); + } else { + break; + } + } + + files.sort(); + Ok(files) +} + +async fn read_s3_object(bucket: &str, key: &str, region: Option<&str>) -> Result> { + let mut config_loader = aws_config::from_env(); + if let Some(r) = region { + config_loader = config_loader.region(aws_config::Region::new(r.to_string())); + } + let aws_config = config_loader.load().await; + let client = aws_sdk_s3::Client::new(&aws_config); + + let resp = client + .get_object() + .bucket(bucket) + .key(key) + .send() + .await + .with_context(|| format!("reading s3://{bucket}/{key}"))?; + + let data = resp + .body + .collect() + .await + .with_context(|| format!("reading body of s3://{bucket}/{key}"))?; + + Ok(data.into_bytes().to_vec()) +} + +// --- Parquet reading --- + +fn read_parquet_file(path: &str, batch_size: usize) -> Result<(Arc, Vec)> { + let file = std::fs::File::open(path).with_context(|| format!("opening {path}"))?; + let builder = parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder::try_new(file) + .with_context(|| format!("reading Parquet metadata from {path}"))? 
+ .with_batch_size(batch_size); + + let schema = Arc::new(builder.schema().as_ref().clone()); + let reader = builder + .build() + .with_context(|| format!("building Parquet reader for {path}"))?; + + let mut batches = Vec::new(); + for batch in reader { + batches.push(batch.with_context(|| format!("reading batch from {path}"))?); + } + + Ok((schema, batches)) +} + +fn read_parquet_bytes(data: &[u8], batch_size: usize) -> Result<(Arc, Vec)> { + let cursor = bytes::Bytes::from(data.to_vec()); + let builder = parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder::try_new(cursor) + .context("reading Parquet metadata from bytes")? + .with_batch_size(batch_size); + + let schema = Arc::new(builder.schema().as_ref().clone()); + let reader = builder + .build() + .context("building Parquet reader from bytes")?; + + let mut batches = Vec::new(); + for batch in reader { + batches.push(batch.context("reading batch from Parquet bytes")?); + } + + Ok((schema, batches)) +} + +// --- CSV reading --- + +fn read_csv_file(path: &str, batch_size: usize) -> Result<(Arc, Vec)> { + // First pass: infer schema + let mut file = std::fs::File::open(path).with_context(|| format!("opening {path}"))?; + let format = arrow_csv::reader::Format::default().with_header(true); + let (inferred_schema, _) = format + .infer_schema(&mut file, None) + .with_context(|| format!("inferring CSV schema from {path}"))?; + let schema = Arc::new(inferred_schema); + + // Second pass: read data + let file = std::fs::File::open(path).with_context(|| format!("reopening {path}"))?; + let reader = arrow_csv::ReaderBuilder::new(schema.clone()) + .with_batch_size(batch_size) + .with_header(true) + .build(file) + .with_context(|| format!("building CSV reader for {path}"))?; + + let mut batches = Vec::new(); + for batch in reader { + batches.push(batch.with_context(|| format!("reading batch from {path}"))?); + } + + Ok((schema, batches)) +} + +fn read_csv_bytes(data: &[u8], batch_size: usize) -> Result<(Arc, Vec)> 
{ + // First pass: infer schema + let mut cursor = std::io::Cursor::new(data); + let format = arrow_csv::reader::Format::default().with_header(true); + let (inferred_schema, _) = format + .infer_schema(&mut cursor, None) + .context("inferring CSV schema from bytes")?; + let schema = Arc::new(inferred_schema); + + // Second pass: read data + let cursor = std::io::Cursor::new(data); + let reader = arrow_csv::ReaderBuilder::new(schema.clone()) + .with_batch_size(batch_size) + .with_header(true) + .build(cursor) + .context("building CSV reader from bytes")?; + + let mut batches = Vec::new(); + for batch in reader { + batches.push(batch.context("reading batch from CSV bytes")?); + } + + Ok((schema, batches)) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{Int32Array, StringArray}; + use arrow::datatypes::{DataType, Field}; + + #[test] + fn test_discover_local_files() { + let dir = tempfile::TempDir::new().unwrap(); + let base = dir.path(); + + // Create subdirectories and files + std::fs::create_dir_all(base.join("users")).unwrap(); + std::fs::create_dir_all(base.join("orders")).unwrap(); + std::fs::File::create(base.join("users/part-0.parquet")).unwrap(); + std::fs::File::create(base.join("users/part-1.parquet")).unwrap(); + std::fs::File::create(base.join("orders/data.parquet")).unwrap(); + std::fs::File::create(base.join("orders/data.csv")).unwrap(); + + let files = discover_local(base.to_str().unwrap(), "**/*.parquet").unwrap(); + assert_eq!(files.len(), 3); + assert!(files.contains(&"orders/data.parquet".to_string())); + assert!(files.contains(&"users/part-0.parquet".to_string())); + assert!(files.contains(&"users/part-1.parquet".to_string())); + } + + #[test] + fn test_discover_local_csv_pattern() { + let dir = tempfile::TempDir::new().unwrap(); + let base = dir.path(); + + std::fs::create_dir_all(base.join("data")).unwrap(); + std::fs::File::create(base.join("data/file.csv")).unwrap(); + 
std::fs::File::create(base.join("data/file.parquet")).unwrap();
+
+        let files = discover_local(base.to_str().unwrap(), "**/*.csv").unwrap();
+        assert_eq!(files.len(), 1);
+        assert_eq!(files[0], "data/file.csv");
+    }
+
+    #[test]
+    fn test_read_parquet_roundtrip() {
+        let dir = tempfile::TempDir::new().unwrap();
+        let path = dir.path().join("test.parquet");
+
+        // Write a test Parquet file
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("name", DataType::Utf8, true),
+        ]));
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2, 3])),
+                Arc::new(StringArray::from(vec!["alice", "bob", "charlie"])),
+            ],
+        )
+        .unwrap();
+
+        let file = std::fs::File::create(&path).unwrap();
+        let props = parquet::file::properties::WriterProperties::builder().build();
+        let mut writer =
+            parquet::arrow::ArrowWriter::try_new(file, schema.clone(), Some(props)).unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+
+        // Read it back
+        let (read_schema, batches) = read_parquet_file(path.to_str().unwrap(), 1024).unwrap();
+        assert_eq!(read_schema.fields().len(), 2);
+        assert_eq!(batches.len(), 1);
+        assert_eq!(batches[0].num_rows(), 3);
+
+        let ids = batches[0]
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        assert_eq!(ids.value(0), 1);
+        assert_eq!(ids.value(2), 3);
+    }
+
+    #[test]
+    fn test_read_csv_roundtrip() {
+        let dir = tempfile::TempDir::new().unwrap();
+        let path = dir.path().join("test.csv");
+
+        std::fs::write(&path, "id,name\n1,alice\n2,bob\n3,charlie\n").unwrap();
+
+        let (schema, batches) = read_csv_file(path.to_str().unwrap(), 1024).unwrap();
+        assert_eq!(schema.fields().len(), 2);
+        assert_eq!(batches.len(), 1);
+        assert_eq!(batches[0].num_rows(), 3);
+    }
+
+    #[test]
+    fn test_read_csv_bytes_roundtrip() {
+        let data = b"x,y\n10,hello\n20,world\n";
+        let (schema, batches) = read_csv_bytes(data, 1024).unwrap();
+        assert_eq!(schema.fields().len(), 2);
+
assert_eq!(batches.len(), 1); + assert_eq!(batches[0].num_rows(), 2); + } +} diff --git a/src/main.rs b/src/main.rs index 2ee5a04..b57fb6d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,10 @@ mod catalog; mod config; mod iceberg; +mod ingest; +mod input; mod output; +mod pg_writer; mod reader; mod schema; mod state; @@ -32,6 +35,17 @@ enum Commands { #[arg(long)] dry_run: bool, }, + + /// Ingest Parquet/CSV files from local filesystem or S3 into Postgres + Ingest { + /// Path to config YAML file + #[arg(short, long)] + config: String, + + /// Show what would be ingested without actually doing it + #[arg(long)] + dry_run: bool, + }, } #[tokio::main] @@ -54,6 +68,17 @@ async fn main() -> Result<()> { sync::run(cfg).await?; } } + Commands::Ingest { + config: config_path, + dry_run, + } => { + let cfg = config::load(&config_path)?; + if dry_run { + ingest::dry_run(cfg).await?; + } else { + ingest::run(cfg).await?; + } + } } Ok(()) diff --git a/src/pg_writer.rs b/src/pg_writer.rs new file mode 100644 index 0000000..169076b --- /dev/null +++ b/src/pg_writer.rs @@ -0,0 +1,467 @@ +use anyhow::{bail, Context, Result}; +use arrow::array::*; +use arrow::datatypes::{DataType, Schema, TimeUnit}; +use arrow::record_batch::RecordBatch; +use chrono::NaiveDate; +use tokio_postgres::types::ToSql; +use tokio_postgres::Client; + +use crate::config::WriteMode; +use crate::types::arrow_type_to_pg; + +/// Check if a table exists in the given schema. +pub async fn table_exists(client: &Client, table: &str, schema: &str) -> Result { + let row = client + .query_one( + "SELECT EXISTS ( + SELECT 1 FROM information_schema.tables + WHERE table_schema = $1 AND table_name = $2 + )", + &[&schema, &table], + ) + .await + .context("checking if table exists")?; + + Ok(row.get(0)) +} + +/// Create a table from an Arrow schema if it doesn't exist. 
+pub async fn ensure_table(
+    client: &Client,
+    arrow_schema: &Schema,
+    table: &str,
+    schema: &str,
+) -> Result<()> {
+    if table_exists(client, table, schema).await? {
+        return Ok(());
+    }
+
+    let ddl = create_table_ddl(arrow_schema, table, schema);
+    tracing::info!(table, schema, "creating table");
+    client
+        .execute(&ddl, &[])
+        .await
+        .with_context(|| format!("creating table {schema}.{table}"))?;
+
+    Ok(())
+}
+
+/// Truncate a table.
+pub async fn truncate_table(client: &Client, table: &str, schema: &str) -> Result<()> {
+    let sql = format!("TRUNCATE TABLE \"{schema}\".\"{table}\"");
+    client
+        .execute(&sql, &[])
+        .await
+        .with_context(|| format!("truncating table {schema}.{table}"))?;
+    Ok(())
+}
+
+/// Generate a CREATE TABLE DDL from an Arrow schema.
+pub fn create_table_ddl(arrow_schema: &Schema, table: &str, schema: &str) -> String {
+    let columns: Vec<String> = arrow_schema
+        .fields()
+        .iter()
+        .map(|field| {
+            let pg_type = arrow_type_to_pg(field.data_type());
+            let nullable = if field.is_nullable() { "" } else { " NOT NULL" };
+            format!(" \"{}\" {}{}", field.name(), pg_type, nullable)
+        })
+        .collect();
+
+    format!(
+        "CREATE TABLE \"{}\".\"{}\" (\n{}\n)",
+        schema,
+        table,
+        columns.join(",\n")
+    )
+}
+
+/// Build a multi-row INSERT SQL statement.
+pub fn build_insert_sql(table: &str, schema: &str, columns: &[&str], num_rows: usize) -> String {
+    let col_list: String = columns
+        .iter()
+        .map(|c| format!("\"{}\"", c))
+        .collect::<Vec<_>>()
+        .join(", ");
+
+    let num_cols = columns.len();
+    let mut value_rows = Vec::with_capacity(num_rows);
+    for row_idx in 0..num_rows {
+        let placeholders: Vec<String> = (0..num_cols)
+            .map(|col_idx| format!("${}", row_idx * num_cols + col_idx + 1))
+            .collect();
+        value_rows.push(format!("({})", placeholders.join(", ")));
+    }
+
+    format!(
+        "INSERT INTO \"{}\".\"{}\" ({}) VALUES {}",
+        schema,
+        table,
+        col_list,
+        value_rows.join(", ")
+    )
+}
+
+/// Build a multi-row UPSERT (INSERT ... ON CONFLICT DO UPDATE) SQL statement.
+pub fn build_upsert_sql(
+    table: &str,
+    schema: &str,
+    columns: &[&str],
+    key_columns: &[String],
+    num_rows: usize,
+) -> String {
+    let insert_sql = build_insert_sql(table, schema, columns, num_rows);
+
+    let key_list: String = key_columns
+        .iter()
+        .map(|c| format!("\"{}\"", c))
+        .collect::<Vec<_>>()
+        .join(", ");
+
+    let update_set: String = columns
+        .iter()
+        .filter(|c| !key_columns.iter().any(|k| k == *c))
+        .map(|c| format!("\"{}\" = EXCLUDED.\"{}\"", c, c))
+        .collect::<Vec<_>>()
+        .join(", ");
+
+    if update_set.is_empty() {
+        // All columns are keys, nothing to update — just do nothing
+        format!("{insert_sql} ON CONFLICT ({key_list}) DO NOTHING")
+    } else {
+        format!("{insert_sql} ON CONFLICT ({key_list}) DO UPDATE SET {update_set}")
+    }
+}
+
+/// Write a RecordBatch to Postgres using batch INSERT or UPSERT.
+pub async fn write_batch(
+    client: &Client,
+    batch: &RecordBatch,
+    table: &str,
+    schema: &str,
+    write_mode: &WriteMode,
+    key_columns: &[String],
+    batch_size: usize,
+) -> Result<u64> {
+    let arrow_schema = batch.schema();
+    let columns: Vec<&str> = arrow_schema
+        .fields()
+        .iter()
+        .map(|f| f.name().as_str())
+        .collect();
+    let num_cols = columns.len();
+    let total_rows = batch.num_rows();
+    let mut rows_written = 0u64;
+
+    // Process in chunks of batch_size
+    let mut offset = 0;
+    while offset < total_rows {
+        let chunk_size = (total_rows - offset).min(batch_size);
+        let chunk = batch.slice(offset, chunk_size);
+
+        let sql = match write_mode {
+            WriteMode::Insert | WriteMode::TruncateInsert => {
+                build_insert_sql(table, schema, &columns, chunk_size)
+            }
+            WriteMode::Upsert => build_upsert_sql(table, schema, &columns, key_columns, chunk_size),
+        };
+
+        // Extract all values into a flat Vec
+        let params = extract_params(&chunk)?;
+
+        // Build the references for tokio-postgres
+        let param_refs: Vec<&(dyn ToSql + Sync)> = params
+            .iter()
+            .map(|p| p.as_ref() as &(dyn ToSql + Sync))
+            .collect();
+
+        // Validate param count
+        let expected =
chunk_size * num_cols; + if param_refs.len() != expected { + bail!( + "parameter count mismatch: expected {expected}, got {}", + param_refs.len() + ); + } + + client + .execute(&sql, ¶m_refs) + .await + .with_context(|| format!("executing batch insert into {schema}.{table}"))?; + + rows_written += chunk_size as u64; + offset += chunk_size; + } + + Ok(rows_written) +} + +/// Extract all values from a RecordBatch as a flat Vec of boxed ToSql values. +/// Values are in row-major order: row0_col0, row0_col1, ..., row1_col0, ... +fn extract_params(batch: &RecordBatch) -> Result>> { + let num_rows = batch.num_rows(); + let num_cols = batch.num_columns(); + let mut params: Vec> = Vec::with_capacity(num_rows * num_cols); + + for row_idx in 0..num_rows { + for col_idx in 0..num_cols { + let array = batch.column(col_idx); + let value = extract_value(array, row_idx)?; + params.push(value); + } + } + + Ok(params) +} + +/// Extract a single value from an Arrow array at the given row index. +fn extract_value(array: &dyn Array, row_idx: usize) -> Result> { + if array.is_null(row_idx) { + return Ok(Box::new(None::)); + } + + let dt = array.data_type(); + match dt { + DataType::Boolean => { + let arr = array.as_any().downcast_ref::().unwrap(); + Ok(Box::new(arr.value(row_idx))) + } + DataType::Int8 => { + let arr = array.as_any().downcast_ref::().unwrap(); + Ok(Box::new(arr.value(row_idx) as i16)) + } + DataType::Int16 => { + let arr = array.as_any().downcast_ref::().unwrap(); + Ok(Box::new(arr.value(row_idx))) + } + DataType::Int32 => { + let arr = array.as_any().downcast_ref::().unwrap(); + Ok(Box::new(arr.value(row_idx))) + } + DataType::Int64 => { + let arr = array.as_any().downcast_ref::().unwrap(); + Ok(Box::new(arr.value(row_idx))) + } + DataType::UInt8 => { + let arr = array.as_any().downcast_ref::().unwrap(); + Ok(Box::new(arr.value(row_idx) as i16)) + } + DataType::UInt16 => { + let arr = array.as_any().downcast_ref::().unwrap(); + Ok(Box::new(arr.value(row_idx) as i32)) 
+ } + DataType::UInt32 => { + let arr = array.as_any().downcast_ref::().unwrap(); + Ok(Box::new(arr.value(row_idx) as i64)) + } + DataType::UInt64 => { + let arr = array.as_any().downcast_ref::().unwrap(); + Ok(Box::new(arr.value(row_idx) as i64)) + } + DataType::Float32 => { + let arr = array.as_any().downcast_ref::().unwrap(); + Ok(Box::new(arr.value(row_idx))) + } + DataType::Float64 => { + let arr = array.as_any().downcast_ref::().unwrap(); + Ok(Box::new(arr.value(row_idx))) + } + DataType::Utf8 => { + let arr = array.as_any().downcast_ref::().unwrap(); + Ok(Box::new(arr.value(row_idx).to_string())) + } + DataType::LargeUtf8 => { + let arr = array.as_any().downcast_ref::().unwrap(); + Ok(Box::new(arr.value(row_idx).to_string())) + } + DataType::Binary => { + let arr = array.as_any().downcast_ref::().unwrap(); + Ok(Box::new(arr.value(row_idx).to_vec())) + } + DataType::LargeBinary => { + let arr = array.as_any().downcast_ref::().unwrap(); + Ok(Box::new(arr.value(row_idx).to_vec())) + } + DataType::Date32 => { + let arr = array.as_any().downcast_ref::().unwrap(); + let days = arr.value(row_idx); + let date = NaiveDate::from_num_days_from_ce_opt(days + 719_163) + .context("invalid Date32 value")?; + Ok(Box::new(date)) + } + DataType::Timestamp(TimeUnit::Microsecond, None) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap(); + let micros = arr.value(row_idx); + let secs = micros / 1_000_000; + let nsecs = ((micros % 1_000_000) * 1000) as u32; + let dt = chrono::DateTime::from_timestamp(secs, nsecs) + .map(|dt| dt.naive_utc()) + .context("invalid timestamp value")?; + Ok(Box::new(dt)) + } + DataType::Timestamp(TimeUnit::Microsecond, Some(_tz)) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap(); + let micros = arr.value(row_idx); + let dt = chrono::DateTime::from_timestamp_micros(micros) + .context("invalid timestamptz value")?; + Ok(Box::new(dt)) + } + DataType::Timestamp(TimeUnit::Millisecond, None) => { + let arr = array + .as_any() 
+ .downcast_ref::() + .unwrap(); + let millis = arr.value(row_idx); + let secs = millis / 1000; + let nsecs = ((millis % 1000) * 1_000_000) as u32; + let dt = chrono::DateTime::from_timestamp(secs, nsecs) + .map(|dt| dt.naive_utc()) + .context("invalid timestamp value")?; + Ok(Box::new(dt)) + } + DataType::Timestamp(TimeUnit::Millisecond, Some(_tz)) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap(); + let millis = arr.value(row_idx); + let dt = chrono::DateTime::from_timestamp_millis(millis) + .context("invalid timestamptz value")?; + Ok(Box::new(dt)) + } + DataType::Timestamp(TimeUnit::Second, None) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap(); + let secs = arr.value(row_idx); + let dt = chrono::DateTime::from_timestamp(secs, 0) + .map(|dt| dt.naive_utc()) + .context("invalid timestamp value")?; + Ok(Box::new(dt)) + } + DataType::Timestamp(TimeUnit::Second, Some(_tz)) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap(); + let secs = arr.value(row_idx); + let dt = + chrono::DateTime::from_timestamp(secs, 0).context("invalid timestamptz value")?; + Ok(Box::new(dt)) + } + DataType::Timestamp(TimeUnit::Nanosecond, None) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap(); + let nanos = arr.value(row_idx); + let secs = nanos / 1_000_000_000; + let nsecs = (nanos % 1_000_000_000) as u32; + let dt = chrono::DateTime::from_timestamp(secs, nsecs) + .map(|dt| dt.naive_utc()) + .context("invalid timestamp value")?; + Ok(Box::new(dt)) + } + DataType::Timestamp(TimeUnit::Nanosecond, Some(_tz)) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap(); + let nanos = arr.value(row_idx); + let secs = nanos / 1_000_000_000; + let nsecs = (nanos % 1_000_000_000) as u32; + let dt = chrono::DateTime::from_timestamp(secs, nsecs) + .context("invalid timestamptz value")?; + Ok(Box::new(dt)) + } + // Fallback: convert to string + _ => { + let formatter = arrow::util::display::ArrayFormatter::try_new( + array, + 
&arrow::util::display::FormatOptions::default(), + ) + .context("creating array formatter")?; + Ok(Box::new(formatter.value(row_idx).to_string())) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_build_insert_sql() { + let sql = build_insert_sql("users", "public", &["id", "name", "email"], 2); + assert_eq!( + sql, + "INSERT INTO \"public\".\"users\" (\"id\", \"name\", \"email\") VALUES ($1, $2, $3), ($4, $5, $6)" + ); + } + + #[test] + fn test_build_insert_sql_single_row() { + let sql = build_insert_sql("users", "public", &["id"], 1); + assert_eq!(sql, "INSERT INTO \"public\".\"users\" (\"id\") VALUES ($1)"); + } + + #[test] + fn test_build_upsert_sql() { + let sql = build_upsert_sql( + "users", + "public", + &["id", "name", "email"], + &["id".to_string()], + 2, + ); + assert_eq!( + sql, + "INSERT INTO \"public\".\"users\" (\"id\", \"name\", \"email\") \ + VALUES ($1, $2, $3), ($4, $5, $6) \ + ON CONFLICT (\"id\") DO UPDATE SET \"name\" = EXCLUDED.\"name\", \"email\" = EXCLUDED.\"email\"" + ); + } + + #[test] + fn test_build_upsert_sql_composite_key() { + let sql = build_upsert_sql( + "order_items", + "public", + &["order_id", "item_id", "quantity"], + &["order_id".to_string(), "item_id".to_string()], + 1, + ); + assert!(sql.contains("ON CONFLICT (\"order_id\", \"item_id\")")); + assert!(sql.contains("\"quantity\" = EXCLUDED.\"quantity\"")); + } + + #[test] + fn test_build_upsert_all_keys() { + let sql = build_upsert_sql("tags", "public", &["id"], &["id".to_string()], 1); + assert!(sql.contains("ON CONFLICT (\"id\") DO NOTHING")); + } + + #[test] + fn test_create_table_ddl() { + use arrow::datatypes::Field; + + let schema = Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, true), + Field::new("active", DataType::Boolean, true), + ]); + + let ddl = create_table_ddl(&schema, "users", "public"); + assert!(ddl.contains("\"id\" BIGINT NOT NULL")); + assert!(ddl.contains("\"name\" TEXT")); + 
assert!(ddl.contains("\"active\" BOOLEAN")); + assert!(ddl.starts_with("CREATE TABLE \"public\".\"users\"")); + } +} diff --git a/src/state.rs b/src/state.rs index 410be2b..8e72fbc 100644 --- a/src/state.rs +++ b/src/state.rs @@ -23,9 +23,15 @@ impl StateStore { watermark_value TEXT NOT NULL, cursor_value TEXT, updated_at TEXT NOT NULL DEFAULT (datetime('now')) - )", + ); + CREATE TABLE IF NOT EXISTS ingested_files ( + file_key TEXT PRIMARY KEY, + target_table TEXT NOT NULL, + rows_ingested INTEGER NOT NULL, + ingested_at TEXT NOT NULL DEFAULT (datetime('now')) + );", ) - .context("creating watermarks table")?; + .context("creating state tables")?; // Backward-compatible migration for existing state DBs. let has_cursor_col = conn @@ -91,6 +97,39 @@ impl StateStore { Ok(()) } + + /// Check if a file has already been ingested. + pub fn is_file_ingested(&self, file_key: &str) -> Result { + let mut stmt = self + .conn + .prepare("SELECT 1 FROM ingested_files WHERE file_key = ?1") + .context("preparing ingested_files select")?; + + let exists = stmt.exists([file_key]).context("checking ingested file")?; + Ok(exists) + } + + /// Mark a file as ingested. 
+ pub fn mark_file_ingested( + &self, + file_key: &str, + target_table: &str, + rows_ingested: u64, + ) -> Result<()> { + self.conn + .execute( + "INSERT INTO ingested_files (file_key, target_table, rows_ingested, ingested_at) + VALUES (?1, ?2, ?3, datetime('now')) + ON CONFLICT(file_key) DO UPDATE SET + target_table = excluded.target_table, + rows_ingested = excluded.rows_ingested, + ingested_at = excluded.ingested_at", + rusqlite::params![file_key, target_table, rows_ingested as i64], + ) + .with_context(|| format!("marking file as ingested: {file_key}"))?; + + Ok(()) + } } #[cfg(test)] @@ -177,6 +216,36 @@ mod tests { let _ = fs::remove_dir_all(&dir); } + #[test] + fn ingested_files_tracking() { + let dir = temp_state_dir(); + let store = StateStore::open(&dir).unwrap(); + + assert!(!store.is_file_ingested("users/part-0.parquet").unwrap()); + + store + .mark_file_ingested("users/part-0.parquet", "users", 1000) + .unwrap(); + assert!(store.is_file_ingested("users/part-0.parquet").unwrap()); + assert!(!store.is_file_ingested("users/part-1.parquet").unwrap()); + let _ = fs::remove_dir_all(&dir); + } + + #[test] + fn ingested_file_update() { + let dir = temp_state_dir(); + let store = StateStore::open(&dir).unwrap(); + + store + .mark_file_ingested("orders/data.parquet", "orders", 500) + .unwrap(); + store + .mark_file_ingested("orders/data.parquet", "orders", 750) + .unwrap(); + assert!(store.is_file_ingested("orders/data.parquet").unwrap()); + let _ = fs::remove_dir_all(&dir); + } + #[test] fn state_persists_across_opens() { let dir = temp_state_dir(); diff --git a/src/sync.rs b/src/sync.rs index 4deb46a..9e98aea 100644 --- a/src/sync.rs +++ b/src/sync.rs @@ -14,7 +14,7 @@ use crate::schema::{self, ColumnInfo}; use crate::state::StateStore; use crate::writer; -async fn connect(config: &Config) -> Result { +pub(crate) async fn connect(config: &Config) -> Result { let conn_str = config.postgres.connection_string(); let (client, connection) = 
tokio_postgres::connect(&conn_str, NoTls) .await @@ -105,7 +105,13 @@ pub async fn dry_run(config: Config) -> Result<()> { } } - println!("\nOutput: {:?}", config.output); + println!( + "\nOutput: {:?}", + config + .output + .as_ref() + .expect("output config required for sync") + ); Ok(()) } @@ -325,7 +331,11 @@ async fn sync_table( ), }; - output::write_output(&config.output, &filename, buf).await?; + let output = config + .output + .as_ref() + .expect("output config required for sync"); + output::write_output(output, &filename, buf).await?; } OutputFormat::Iceberg => { iceberg_batches.push(batch); diff --git a/src/types.rs b/src/types.rs index 6cc6b81..6ba51a9 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1,4 +1,30 @@ -use arrow::datatypes::DataType; +use arrow::datatypes::{DataType, TimeUnit}; + +/// Map an Arrow DataType to a Postgres type name (for CREATE TABLE DDL). +pub fn arrow_type_to_pg(dt: &DataType) -> &'static str { + match dt { + DataType::Boolean => "BOOLEAN", + DataType::Int8 => "SMALLINT", + DataType::Int16 => "SMALLINT", + DataType::Int32 => "INTEGER", + DataType::Int64 => "BIGINT", + DataType::UInt8 => "SMALLINT", + DataType::UInt16 => "INTEGER", + DataType::UInt32 => "BIGINT", + DataType::UInt64 => "BIGINT", + DataType::Float16 => "REAL", + DataType::Float32 => "REAL", + DataType::Float64 => "DOUBLE PRECISION", + DataType::Utf8 | DataType::LargeUtf8 => "TEXT", + DataType::Binary | DataType::LargeBinary => "BYTEA", + DataType::Date32 | DataType::Date64 => "DATE", + DataType::Timestamp(_, Some(_)) => "TIMESTAMPTZ", + DataType::Timestamp(_, None) => "TIMESTAMP", + DataType::Time32(_) | DataType::Time64(_) => "TIME", + DataType::Duration(_) => "INTERVAL", + _ => "TEXT", + } +} /// Map a Postgres type name (from information_schema or pg_type) to an Arrow DataType. 
pub fn pg_type_to_arrow(pg_type: &str) -> DataType { @@ -29,14 +55,12 @@ pub fn pg_type_to_arrow(pg_type: &str) -> DataType { // Date / Time "date" => DataType::Date32, "timestamp" | "timestamp without time zone" => { - DataType::Timestamp(arrow::datatypes::TimeUnit::Microsecond, None) + DataType::Timestamp(TimeUnit::Microsecond, None) } "timestamp with time zone" | "timestamptz" => { - DataType::Timestamp(arrow::datatypes::TimeUnit::Microsecond, Some("UTC".into())) - } - "time" | "time without time zone" => { - DataType::Time64(arrow::datatypes::TimeUnit::Microsecond) + DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())) } + "time" | "time without time zone" => DataType::Time64(TimeUnit::Microsecond), "interval" => DataType::Utf8, // intervals as string // UUID @@ -173,4 +197,50 @@ mod tests { fn unknown_type_falls_back_to_utf8() { assert_eq!(pg_type_to_arrow("some_custom_type"), DataType::Utf8); } + + #[test] + fn arrow_type_to_pg_primitives() { + assert_eq!(arrow_type_to_pg(&DataType::Boolean), "BOOLEAN"); + assert_eq!(arrow_type_to_pg(&DataType::Int16), "SMALLINT"); + assert_eq!(arrow_type_to_pg(&DataType::Int32), "INTEGER"); + assert_eq!(arrow_type_to_pg(&DataType::Int64), "BIGINT"); + assert_eq!(arrow_type_to_pg(&DataType::Float32), "REAL"); + assert_eq!(arrow_type_to_pg(&DataType::Float64), "DOUBLE PRECISION"); + assert_eq!(arrow_type_to_pg(&DataType::Utf8), "TEXT"); + assert_eq!(arrow_type_to_pg(&DataType::Binary), "BYTEA"); + assert_eq!(arrow_type_to_pg(&DataType::Date32), "DATE"); + } + + #[test] + fn arrow_type_to_pg_timestamps() { + assert_eq!( + arrow_type_to_pg(&DataType::Timestamp(TimeUnit::Microsecond, None)), + "TIMESTAMP" + ); + assert_eq!( + arrow_type_to_pg(&DataType::Timestamp( + TimeUnit::Microsecond, + Some(std::sync::Arc::from("UTC")) + )), + "TIMESTAMPTZ" + ); + } + + #[test] + fn arrow_type_to_pg_unsigned() { + assert_eq!(arrow_type_to_pg(&DataType::UInt8), "SMALLINT"); + assert_eq!(arrow_type_to_pg(&DataType::UInt16), 
"INTEGER"); + assert_eq!(arrow_type_to_pg(&DataType::UInt32), "BIGINT"); + assert_eq!(arrow_type_to_pg(&DataType::UInt64), "BIGINT"); + } + + #[test] + fn arrow_type_to_pg_fallback() { + assert_eq!( + arrow_type_to_pg(&DataType::List(std::sync::Arc::new( + arrow::datatypes::Field::new("item", DataType::Int32, true) + ))), + "TEXT" + ); + } } From 71766b0850179e0c8e829258ca701c984edc0746 Mon Sep 17 00:00:00 2001 From: Maria Dubyaga Date: Fri, 20 Feb 2026 20:39:18 -0500 Subject: [PATCH 3/8] Update README with ingest and Iceberg documentation --- README.md | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 13950e1..7ae5685 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # rustream -Fast Postgres to Parquet sync tool. Reads tables from Postgres, writes Parquet files to local disk or S3. Supports incremental sync via `updated_at` watermark tracking. +Bidirectional Postgres sync tool. Reads tables from Postgres and writes Parquet/Iceberg files to local disk or S3, or ingests Parquet/CSV files from local disk or S3 back into Postgres. Supports incremental sync via watermark tracking and upsert-based ingestion. 
## Installation @@ -31,6 +31,8 @@ maturin develop --release ## Usage +### Sync (Postgres → Parquet/S3) + ```bash # Copy and edit the example config cp config.example.yaml config.yaml @@ -42,10 +44,21 @@ rustream sync --config config.yaml --dry-run rustream sync --config config.yaml ``` +### Ingest (S3/local → Postgres) + +```bash +# Preview what would be ingested +rustream ingest --config ingest_config.yaml --dry-run + +# Run ingest +rustream ingest --config ingest_config.yaml +``` + Enable debug logging with `RUST_LOG`: ```bash RUST_LOG=rustream=debug rustream sync --config config.yaml +RUST_LOG=rustream=debug rustream ingest --config ingest_config.yaml ``` ## Configuration @@ -116,7 +129,72 @@ output: AWS credentials come from environment variables, `~/.aws/credentials`, or IAM role. -### Config reference +### Iceberg output + +```yaml +output: + type: s3 + bucket: my-data-lake + prefix: warehouse + region: us-east-1 + +format: iceberg +warehouse: s3://my-data-lake/warehouse +catalog: + type: filesystem # or "glue" (requires --features glue) + # glue_database: my_db # required when type=glue +``` + +### Ingest (S3 → Postgres) + +```yaml +postgres: + host: localhost + database: mydb + user: postgres + password: secret + +ingest: + input: + type: s3 + bucket: my-data-lake + prefix: raw/postgres/ + region: us-east-1 + pattern: "**/*.parquet" + + file_format: parquet # "parquet" or "csv" + write_mode: upsert # "insert" | "upsert" | "truncate_insert" + batch_size: 5000 + target_schema: public + + tables: + - file_pattern: "users/*.parquet" + target_table: users + key_columns: [id] + create_if_missing: true + + - file_pattern: "orders/*.parquet" + target_table: orders + key_columns: [id] +``` + +### Ingest from local files + +```yaml +ingest: + input: + type: local + path: ./parquet_files + pattern: "**/*.parquet" + + file_format: parquet + write_mode: insert + batch_size: 5000 +``` + +If no `tables` are listed, the target table name is inferred from the parent 
directory or filename. + +### Config reference (sync) | Field | Description | |---|---| @@ -141,9 +219,33 @@ AWS credentials come from environment variables, `~/.aws/credentials`, or IAM ro | `tables[].incremental_tiebreaker_column` | Stable cursor column for duplicate-safe incremental paging (required when `incremental_column` is set; recommended: primary key) | | `tables[].incremental_column_is_unique` | Allow watermark-only incremental mode when incremental column is strictly unique/monotonic (e.g. append-only `id`) | | `tables[].partition_by` | Partition output files: `date`, `month`, or `year` | +| `format` | Output format: `parquet` (default) or `iceberg` | +| `warehouse` | Warehouse path for Iceberg (required when format=iceberg) | +| `catalog.type` | Iceberg catalog: `filesystem` (default) or `glue` | + +### Config reference (ingest) + +| Field | Description | +|---|---| +| `ingest.input.type` | `local` or `s3` | +| `ingest.input.path` | Local directory (when type=local) | +| `ingest.input.bucket` | S3 bucket (when type=s3) | +| `ingest.input.prefix` | S3 key prefix (when type=s3) | +| `ingest.input.region` | AWS region (when type=s3, optional) | +| `ingest.input.pattern` | Glob pattern for file matching (default: `**/*.parquet`) | +| `ingest.file_format` | `parquet` (default) or `csv` | +| `ingest.write_mode` | `insert` (default), `upsert`, or `truncate_insert` | +| `ingest.batch_size` | Rows per INSERT statement (default: 5000) | +| `ingest.target_schema` | Postgres schema for target tables (default: `public`) | +| `ingest.tables[].file_pattern` | Glob pattern to match files to this table | +| `ingest.tables[].target_table` | Postgres table to write to | +| `ingest.tables[].key_columns` | Primary key columns (required for upsert mode) | +| `ingest.tables[].create_if_missing` | Auto-CREATE TABLE from file schema (default: false) | ## How it works +### Sync (Postgres → Parquet) + 1. 
Connects to Postgres and introspects each table's schema via `information_schema` 2. Maps Postgres column types to Arrow types automatically 3. Reads rows in batches, converting to Arrow RecordBatches @@ -154,6 +256,15 @@ AWS credentials come from environment variables, `~/.aws/credentials`, or IAM ro Tables without `incremental_column` do a full sync every run. +### Ingest (Parquet/CSV → Postgres) + +1. Discovers files matching the glob pattern from local disk or S3 +2. Skips files already ingested (tracked in local SQLite) +3. Reads each file into Arrow RecordBatches (Parquet or CSV with schema inference) +4. Creates the target table if `create_if_missing: true` (DDL from Arrow schema) +5. Writes rows via multi-row parameterized INSERT or INSERT...ON CONFLICT (upsert) +6. Marks each file as ingested in SQLite to avoid reprocessing on next run + ## Supported Postgres types | Postgres | Arrow | From 2e27eed7e2905d8d0060c9d400900b3c851fb157 Mon Sep 17 00:00:00 2001 From: Maria Dubyaga Date: Wed, 25 Feb 2026 19:10:55 -0500 Subject: [PATCH 4/8] Add duplicate-watermark cursor paging integration test --- src/reader.rs | 135 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/src/reader.rs b/src/reader.rs index 578a2db..f4c552b 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -237,6 +237,10 @@ fn get_as_string(row: &Row, idx: usize) -> Option { #[cfg(test)] mod tests { use super::*; + use crate::schema; + use crate::sync::extract_watermark; + use std::time::{SystemTime, UNIX_EPOCH}; + use tokio_postgres::NoTls; fn make_table(name: &str) -> TableConfig { TableConfig { @@ -349,4 +353,135 @@ mod tests { assert!(q.contains("ORDER BY \"updated_at\" ASC, \"id\" ASC")); assert!(q.ends_with("LIMIT 1000")); } + + #[tokio::test] + async fn read_batch_cursor_paging_handles_duplicate_watermarks() { + let db_url = match std::env::var("RUSTREAM_IT_DB_URL") { + Ok(v) => v, + Err(_) => return, // Optional integration-style test; set env var 
to run. + }; + + let (client, connection) = tokio_postgres::connect(&db_url, NoTls).await.unwrap(); + tokio::spawn(async move { + let _ = connection.await; + }); + + let suffix = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + let table_name = format!("rustream_it_reader_{suffix}"); + + client + .execute( + &format!( + "CREATE TABLE public.{table_name} ( + id INTEGER PRIMARY KEY, + updated_at TIMESTAMPTZ NOT NULL, + payload TEXT + )" + ), + &[], + ) + .await + .unwrap(); + + client + .execute( + &format!( + "INSERT INTO public.{table_name} (id, updated_at, payload) VALUES + (1, '2026-01-01T00:00:00Z', 'a'), + (2, '2026-01-01T00:00:00Z', 'b'), + (3, '2026-01-01T00:00:00Z', 'c'), + (4, '2026-01-01T00:01:00Z', 'd')" + ), + &[], + ) + .await + .unwrap(); + + let table = TableConfig { + name: table_name.clone(), + schema: Some("public".into()), + columns: None, + incremental_column: Some("updated_at".into()), + incremental_tiebreaker_column: Some("id".into()), + incremental_column_is_unique: false, + partition_by: None, + }; + let columns = schema::introspect_table(&client, &table).await.unwrap(); + let arrow_schema = schema::build_arrow_schema(&columns); + + let batch1 = read_batch( + &client, + &table, + &columns, + &arrow_schema, + ReadBatchOptions { + watermark_col: Some("updated_at"), + cursor_col: Some("id"), + watermark_val: None, + cursor_val: None, + batch_size: 2, + }, + ) + .await + .unwrap() + .unwrap(); + assert_eq!(batch1.num_rows(), 2); + let wm1 = extract_watermark(&columns, &batch1, "updated_at").unwrap(); + let cur1 = extract_watermark(&columns, &batch1, "id").unwrap(); + + let batch2 = read_batch( + &client, + &table, + &columns, + &arrow_schema, + ReadBatchOptions { + watermark_col: Some("updated_at"), + cursor_col: Some("id"), + watermark_val: Some(&wm1), + cursor_val: Some(&cur1), + batch_size: 2, + }, + ) + .await + .unwrap() + .unwrap(); + assert_eq!(batch2.num_rows(), 2); + + let id_col = columns.iter().position(|c| 
c.name == "id").unwrap(); + let ids = batch2 + .column(id_col) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(ids.value(0), 3); + assert_eq!(ids.value(1), 4); + + let wm2 = extract_watermark(&columns, &batch2, "updated_at").unwrap(); + let cur2 = extract_watermark(&columns, &batch2, "id").unwrap(); + + let batch3 = read_batch( + &client, + &table, + &columns, + &arrow_schema, + ReadBatchOptions { + watermark_col: Some("updated_at"), + cursor_col: Some("id"), + watermark_val: Some(&wm2), + cursor_val: Some(&cur2), + batch_size: 2, + }, + ) + .await + .unwrap(); + assert!(batch3.is_none()); + + client + .execute(&format!("DROP TABLE public.{table_name}"), &[]) + .await + .unwrap(); + } } From 5c131aabbf10f40d65529f0c9e5124db7f135716 Mon Sep 17 00:00:00 2001 From: Maria Dubyaga Date: Wed, 25 Feb 2026 19:13:51 -0500 Subject: [PATCH 5/8] Add state migration and sync resume integration tests --- src/state.rs | 47 +++++++++++++++ src/sync.rs | 157 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 204 insertions(+) diff --git a/src/state.rs b/src/state.rs index 8e72fbc..fbd71fc 100644 --- a/src/state.rs +++ b/src/state.rs @@ -135,6 +135,7 @@ impl StateStore { #[cfg(test)] mod tests { use super::*; + use rusqlite::Connection; use std::fs; use std::sync::atomic::{AtomicU32, Ordering}; @@ -282,4 +283,50 @@ mod tests { let _ = fs::remove_dir_all(&dir); } + + #[test] + fn open_migrates_legacy_watermarks_table() { + let dir = temp_state_dir(); + fs::create_dir_all(&dir).unwrap(); + let db_path = Path::new(&dir).join("rustream_state.db"); + + // Simulate an old state db that predates cursor_value. 
+ let conn = Connection::open(&db_path).unwrap(); + conn.execute_batch( + "CREATE TABLE watermarks ( + table_name TEXT PRIMARY KEY, + watermark_value TEXT NOT NULL, + updated_at TEXT NOT NULL DEFAULT (datetime('now')) + );", + ) + .unwrap(); + conn.execute( + "INSERT INTO watermarks (table_name, watermark_value) VALUES (?1, ?2)", + ["users", "2026-01-01 00:00:00"], + ) + .unwrap(); + drop(conn); + + let store = StateStore::open(&dir).unwrap(); + assert_eq!( + store.get_progress("users").unwrap(), + Some(("2026-01-01 00:00:00".to_string(), None)) + ); + + // Verify migrated schema contains cursor_value. + let conn = Connection::open(&db_path).unwrap(); + let mut stmt = conn.prepare("PRAGMA table_info(watermarks)").unwrap(); + let mut rows = stmt.query([]).unwrap(); + let mut has_cursor = false; + while let Some(row) = rows.next().unwrap() { + let col_name: String = row.get(1).unwrap(); + if col_name == "cursor_value" { + has_cursor = true; + break; + } + } + assert!(has_cursor); + + let _ = fs::remove_dir_all(&dir); + } } diff --git a/src/sync.rs b/src/sync.rs index 9e98aea..9e9daf1 100644 --- a/src/sync.rs +++ b/src/sync.rs @@ -478,10 +478,19 @@ pub(crate) fn extract_watermark( #[cfg(test)] mod tests { use super::*; + use crate::config::{OutputConfig, PostgresConfig}; + use crate::reader; + use crate::schema; + use crate::state::StateStore; + use anyhow::Result; use arrow::array::{Int32Array, Int64Array, StringArray, TimestampMicrosecondArray}; use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use arrow::record_batch::RecordBatch; + use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; + use std::fs::File; use std::sync::Arc; + use std::time::{SystemTime, UNIX_EPOCH}; + use tokio_postgres::NoTls; #[test] fn extract_watermark_from_timestamp() { @@ -600,4 +609,152 @@ mod tests { let wm = extract_watermark(&columns, &batch, "updated_at"); assert_eq!(wm, None); } + + #[tokio::test] + async fn sync_table_resumes_from_saved_progress() -> Result<()> 
{ + let db_url = match std::env::var("RUSTREAM_IT_DB_URL") { + Ok(v) => v, + Err(_) => return Ok(()), // Optional integration-style test. + }; + + let (client, connection) = tokio_postgres::connect(&db_url, NoTls).await?; + tokio::spawn(async move { + let _ = connection.await; + }); + + let suffix = SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos(); + let table_name = format!("rustream_it_sync_resume_{suffix}"); + let full_table = format!("public.{table_name}"); + + client + .execute( + &format!( + "CREATE TABLE {full_table} ( + id INTEGER PRIMARY KEY, + updated_at TIMESTAMPTZ NOT NULL, + payload TEXT + )" + ), + &[], + ) + .await?; + + client + .execute( + &format!( + "INSERT INTO {full_table} (id, updated_at, payload) VALUES + (1, '2026-01-01T00:00:00Z', 'a'), + (2, '2026-01-01T00:00:00Z', 'b'), + (3, '2026-01-01T00:00:00Z', 'c'), + (4, '2026-01-01T00:01:00Z', 'd')" + ), + &[], + ) + .await?; + + let tmp = tempfile::tempdir()?; + let out_dir = tmp.path().join("out"); + let state_dir = tmp.path().join("state"); + std::fs::create_dir_all(&out_dir)?; + std::fs::create_dir_all(&state_dir)?; + + let table = TableConfig { + name: table_name.clone(), + schema: Some("public".to_string()), + columns: None, + incremental_column: Some("updated_at".to_string()), + incremental_tiebreaker_column: Some("id".to_string()), + incremental_column_is_unique: false, + partition_by: None, + }; + + // Derive a realistic saved progress point from first page (ids 1,2). + let columns = schema::introspect_table(&client, &table).await?; + let arrow_schema = schema::build_arrow_schema(&columns); + let first_batch = reader::read_batch( + &client, + &table, + &columns, + &arrow_schema, + reader::ReadBatchOptions { + watermark_col: Some("updated_at"), + cursor_col: Some("id"), + watermark_val: None, + cursor_val: None, + batch_size: 2, + }, + ) + .await? 
+ .expect("first batch expected"); + + let saved_wm = extract_watermark(&columns, &first_batch, "updated_at").unwrap(); + let saved_cursor = extract_watermark(&columns, &first_batch, "id").unwrap(); + + let state = StateStore::open(state_dir.to_str().unwrap())?; + state.set_progress(&full_table, &saved_wm, Some(&saved_cursor))?; + + let config = Config { + postgres: PostgresConfig { + host: "localhost".to_string(), + port: 5432, + database: "ignored_for_sync_table_test".to_string(), + user: "ignored".to_string(), + password: None, + }, + output: Some(OutputConfig::Local { + path: out_dir.to_string_lossy().to_string(), + }), + tables: None, + exclude: vec![], + schema: "public".to_string(), + batch_size: 2, + state_dir: Some(state_dir.to_string_lossy().to_string()), + format: OutputFormat::Parquet, + catalog: None, + warehouse: None, + ingest: None, + }; + + sync_table(&client, &config, &table, &state, None).await?; + + // Validate only remaining ids (3,4) are written. + let mut seen_ids = Vec::new(); + for entry in std::fs::read_dir(out_dir.join(&table_name))? 
{ + let path = entry?.path(); + if path.extension().and_then(|s| s.to_str()) != Some("parquet") { + continue; + } + let file = File::open(path)?; + let builder = ParquetRecordBatchReaderBuilder::try_new(file)?; + let reader = builder.build()?; + for batch in reader { + let batch = batch?; + let id_idx = batch + .schema() + .fields() + .iter() + .position(|f| f.name() == "id") + .unwrap(); + let arr = batch + .column(id_idx) + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..arr.len() { + seen_ids.push(arr.value(i)); + } + } + } + seen_ids.sort_unstable(); + assert_eq!(seen_ids, vec![3, 4]); + + let progress = state.get_progress(&full_table)?.unwrap(); + assert_eq!(progress.1.as_deref(), Some("4")); + + client + .execute(&format!("DROP TABLE {full_table}"), &[]) + .await?; + + Ok(()) + } } From a3874e1f9952d758c8d8207a6390ad0ccab826bb Mon Sep 17 00:00:00 2001 From: Maria Dubyaga Date: Wed, 25 Feb 2026 19:21:35 -0500 Subject: [PATCH 6/8] Add incremental state edge-case integration tests --- src/sync.rs | 185 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) diff --git a/src/sync.rs b/src/sync.rs index 9e9daf1..dc886fd 100644 --- a/src/sync.rs +++ b/src/sync.rs @@ -757,4 +757,189 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn sync_table_fails_when_cursor_state_missing_in_cursor_mode() -> Result<()> { + let db_url = match std::env::var("RUSTREAM_IT_DB_URL") { + Ok(v) => v, + Err(_) => return Ok(()), // Optional integration-style test. 
+ }; + + let (client, connection) = tokio_postgres::connect(&db_url, NoTls).await?; + tokio::spawn(async move { + let _ = connection.await; + }); + + let suffix = SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos(); + let table_name = format!("rustream_it_sync_missing_cursor_{suffix}"); + let full_table = format!("public.{table_name}"); + + client + .execute( + &format!( + "CREATE TABLE {full_table} ( + id INTEGER PRIMARY KEY, + updated_at TIMESTAMPTZ NOT NULL + )" + ), + &[], + ) + .await?; + + client + .execute( + &format!( + "INSERT INTO {full_table} (id, updated_at) VALUES + (1, '2026-01-01T00:00:00Z'), + (2, '2026-01-01T00:00:00Z')" + ), + &[], + ) + .await?; + + let tmp = tempfile::tempdir()?; + let out_dir = tmp.path().join("out"); + let state_dir = tmp.path().join("state"); + std::fs::create_dir_all(&out_dir)?; + std::fs::create_dir_all(&state_dir)?; + + let table = TableConfig { + name: table_name.clone(), + schema: Some("public".to_string()), + columns: None, + incremental_column: Some("updated_at".to_string()), + incremental_tiebreaker_column: Some("id".to_string()), + incremental_column_is_unique: false, + partition_by: None, + }; + + let state = StateStore::open(state_dir.to_str().unwrap())?; + // Seed legacy-like state with watermark only, no cursor. 
+ state.set_progress(&full_table, "2026-01-01 00:00:00.000000", None)?; + + let config = Config { + postgres: PostgresConfig { + host: "localhost".to_string(), + port: 5432, + database: "ignored_for_sync_table_test".to_string(), + user: "ignored".to_string(), + password: None, + }, + output: Some(OutputConfig::Local { + path: out_dir.to_string_lossy().to_string(), + }), + tables: None, + exclude: vec![], + schema: "public".to_string(), + batch_size: 2, + state_dir: Some(state_dir.to_string_lossy().to_string()), + format: OutputFormat::Parquet, + catalog: None, + warehouse: None, + ingest: None, + }; + + let err = sync_table(&client, &config, &table, &state, None) + .await + .expect_err("cursor mode should fail when saved cursor is missing"); + assert!(err.to_string().contains("has watermark but no cursor")); + + client + .execute(&format!("DROP TABLE {full_table}"), &[]) + .await?; + + Ok(()) + } + + #[tokio::test] + async fn sync_table_supports_unique_watermark_without_tiebreaker() -> Result<()> { + let db_url = match std::env::var("RUSTREAM_IT_DB_URL") { + Ok(v) => v, + Err(_) => return Ok(()), // Optional integration-style test. 
+ }; + + let (client, connection) = tokio_postgres::connect(&db_url, NoTls).await?; + tokio::spawn(async move { + let _ = connection.await; + }); + + let suffix = SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos(); + let table_name = format!("rustream_it_sync_unique_wm_{suffix}"); + let full_table = format!("public.{table_name}"); + + client + .execute( + &format!( + "CREATE TABLE {full_table} ( + id INTEGER PRIMARY KEY, + payload TEXT + )" + ), + &[], + ) + .await?; + + client + .execute( + &format!( + "INSERT INTO {full_table} (id, payload) VALUES + (1, 'a'), + (2, 'b'), + (3, 'c'), + (4, 'd')" + ), + &[], + ) + .await?; + + let tmp = tempfile::tempdir()?; + let out_dir = tmp.path().join("out"); + let state_dir = tmp.path().join("state"); + std::fs::create_dir_all(&out_dir)?; + std::fs::create_dir_all(&state_dir)?; + + let table = TableConfig { + name: table_name.clone(), + schema: Some("public".to_string()), + columns: None, + incremental_column: Some("id".to_string()), + incremental_tiebreaker_column: None, + incremental_column_is_unique: true, + partition_by: None, + }; + + let state = StateStore::open(state_dir.to_str().unwrap())?; + let config = Config { + postgres: PostgresConfig { + host: "localhost".to_string(), + port: 5432, + database: "ignored_for_sync_table_test".to_string(), + user: "ignored".to_string(), + password: None, + }, + output: Some(OutputConfig::Local { + path: out_dir.to_string_lossy().to_string(), + }), + tables: None, + exclude: vec![], + schema: "public".to_string(), + batch_size: 2, + state_dir: Some(state_dir.to_string_lossy().to_string()), + format: OutputFormat::Parquet, + catalog: None, + warehouse: None, + ingest: None, + }; + + sync_table(&client, &config, &table, &state, None).await?; + let progress = state.get_progress(&full_table)?.unwrap(); + assert_eq!(progress.0, "4"); + assert_eq!(progress.1, None); + + client + .execute(&format!("DROP TABLE {full_table}"), &[]) + .await?; + + Ok(()) + } } From 
d6e5bbf3f11331189b0f41b29f3d84b8c98ae531 Mon Sep 17 00:00:00 2001 From: Maria Dubyaga Date: Wed, 25 Feb 2026 19:42:06 -0500 Subject: [PATCH 7/8] Document incremental integration test intent --- src/reader.rs | 1 + src/state.rs | 1 + src/sync.rs | 3 +++ 3 files changed, 5 insertions(+) diff --git a/src/reader.rs b/src/reader.rs index f4c552b..0a37eb1 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -354,6 +354,7 @@ mod tests { assert!(q.ends_with("LIMIT 1000")); } + /// Ensures duplicate watermark values are paged safely using the cursor tiebreaker. #[tokio::test] async fn read_batch_cursor_paging_handles_duplicate_watermarks() { let db_url = match std::env::var("RUSTREAM_IT_DB_URL") { diff --git a/src/state.rs b/src/state.rs index fbd71fc..8a5b072 100644 --- a/src/state.rs +++ b/src/state.rs @@ -284,6 +284,7 @@ mod tests { let _ = fs::remove_dir_all(&dir); } + /// Verifies opening state migrates legacy watermark schema and preserves existing values. #[test] fn open_migrates_legacy_watermarks_table() { let dir = temp_state_dir(); diff --git a/src/sync.rs b/src/sync.rs index dc886fd..b4d602d 100644 --- a/src/sync.rs +++ b/src/sync.rs @@ -610,6 +610,7 @@ mod tests { assert_eq!(wm, None); } + /// Confirms sync resumes from saved progress and writes only remaining rows. #[tokio::test] async fn sync_table_resumes_from_saved_progress() -> Result<()> { let db_url = match std::env::var("RUSTREAM_IT_DB_URL") { @@ -758,6 +759,7 @@ mod tests { Ok(()) } + /// Ensures cursor mode fails fast when saved state is missing cursor_value. #[tokio::test] async fn sync_table_fails_when_cursor_state_missing_in_cursor_mode() -> Result<()> { let db_url = match std::env::var("RUSTREAM_IT_DB_URL") { @@ -851,6 +853,7 @@ mod tests { Ok(()) } + /// Verifies unique-watermark mode works without a tiebreaker and persists progress. 
#[tokio::test] async fn sync_table_supports_unique_watermark_without_tiebreaker() -> Result<()> { let db_url = match std::env::var("RUSTREAM_IT_DB_URL") { From c8249cd6bb73bdd8c1dd4e3b9caa72c5ece02e90 Mon Sep 17 00:00:00 2001 From: Maria Dubyaga Date: Wed, 25 Feb 2026 19:46:12 -0500 Subject: [PATCH 8/8] Document optional DB integration test setup --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 7ae5685..d8cb79a 100644 --- a/README.md +++ b/README.md @@ -242,6 +242,16 @@ If no `tables` are listed, the target table name is inferred from the parent dir | `ingest.tables[].key_columns` | Primary key columns (required for upsert mode) | | `ingest.tables[].create_if_missing` | Auto-CREATE TABLE from file schema (default: false) | +## Running Integration Tests + +Some DB-backed tests are optional and run only when `RUSTREAM_IT_DB_URL` is set. +Without this env var, those tests no-op/return early. + +```bash +export RUSTREAM_IT_DB_URL="host=localhost port=5432 dbname=mydb user=postgres password=secret" +cargo test +``` + ## How it works ### Sync (Postgres → Parquet)