diff --git a/Cargo.lock b/Cargo.lock index ba141f6fa1..018e1487be 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -495,63 +495,63 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "607e64bb911ee4f90483e044fe78f175989148c2892e659a2cd25429e782ec54" +checksum = "378530e55cd479eda3c14eb345310799717e6f76d0c332041e8487022166b471" dependencies = [ - "arrow-arith 58.2.0", - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-cast 58.2.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", "arrow-csv", - "arrow-data 58.2.0", - "arrow-ipc 58.2.0", - "arrow-json 58.2.0", - "arrow-ord 58.2.0", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", + "arrow-json 58.3.0", + "arrow-ord 58.3.0", "arrow-row", - "arrow-schema 58.2.0", - "arrow-select 58.2.0", - "arrow-string 58.2.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", + "arrow-string 58.3.0", ] [[package]] name = "arrow-arith" -version = "57.3.0" +version = "57.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7b3141e0ec5145a22d8694ea8b6d6f69305971c4fa1c1a13ef0195aef2d678b" +checksum = "7c7bbd679c5418b8639b92be01f361d60013c4906574b578b77b63c78356594c" dependencies = [ - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", "chrono", "num-traits", ] [[package]] name = "arrow-arith" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e754319ed8a85d817fe7adf183227e0b5308b82790a737b426c1124626b48118" +checksum = "a0ab212d2c1886e802f51c5212d78ebbcbb0bec980fff9dadc1eb8d45cd0b738" dependencies = [ - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-schema 
58.2.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "chrono", "num-traits", ] [[package]] name = "arrow-array" -version = "57.3.0" +version = "57.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef" +checksum = "c8a4ab47b3f3eac60f7fd31b81e9028fda018607bcc63451aca4f2b755269862" dependencies = [ "ahash 0.8.12", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", "chrono", "half", "hashbrown 0.16.1", @@ -562,18 +562,18 @@ dependencies = [ [[package]] name = "arrow-array" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841321891f247aa86c6112c80d83d89cb36e0addd020fa2425085b8eb6c3f579" +checksum = "cfd33d3e92f207444098c75b42de99d329562be0cf686b307b097cc52b4e999e" dependencies = [ "ahash 0.8.12", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-schema 58.2.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "chrono", "chrono-tz", "half", - "hashbrown 0.17.0", + "hashbrown 0.17.1", "num-complex", "num-integer", "num-traits", @@ -581,9 +581,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "57.3.0" +version = "57.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c697ddca96183182f35b3a18e50b9110b11e916d7b7799cbfd4d34662f2c56c2" +checksum = "0d18b89b4c4f4811d0858175e79541fe98e33e18db3b011708bc287b1240593f" dependencies = [ "bytes", "half", @@ -593,9 +593,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f955dfb73fae000425f49c8226d2044dab60fb7ad4af1e24f961756354d996c9" +checksum = "0c6cd424c2693bcdbc150d843dc9d4d137dd2de4782ce6df491ad11a3a0416c0" dependencies = [ 
"bytes", "half", @@ -605,16 +605,16 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "57.3.0" +version = "57.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5" +checksum = "722b5c41dd1d14d0a879a1bce92c6fe33f546101bb2acce57a209825edd075b3" dependencies = [ - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-ord 57.3.0", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-ord 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", "atoi", "base64", "chrono", @@ -626,16 +626,16 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca5e686972523798f76bef355145bc1ae25a84c731e650268d31ab763c701663" +checksum = "4c5aefb56a2c02e9e2b30746241058b85f8983f0fcff2ba0c6d09006e1cded7f" dependencies = [ - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-ord 58.2.0", - "arrow-schema 58.2.0", - "arrow-select 58.2.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "atoi", "base64", "chrono", @@ -647,13 +647,13 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86c276756867fc8186ec380c72c290e6e3b23a1d4fb05df6b1d62d2e62666d48" +checksum = "e94e8cf7e517657a52b91ea1263acf38c4ca62a84655d72458a3359b12ab97de" dependencies = [ - "arrow-array 58.2.0", - "arrow-cast 58.2.0", - "arrow-schema 58.2.0", + "arrow-array 58.3.0", + "arrow-cast 58.3.0", + "arrow-schema 58.3.0", "chrono", "csv", "csv-core", @@ -662,12 +662,12 @@ dependencies = [ [[package]] name = "arrow-data" -version = "57.3.0" +version = "57.3.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304" +checksum = "c1683705c63dcf0d18972759eda48489028cbbff67af7d6bef2c6b7b74ab778a" dependencies = [ - "arrow-buffer 57.3.0", - "arrow-schema 57.3.0", + "arrow-buffer 57.3.1", + "arrow-schema 57.3.1", "half", "num-integer", "num-traits", @@ -675,12 +675,12 @@ dependencies = [ [[package]] name = "arrow-data" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db3b5846209775b6dc8056d77ff9a032b27043383dd5488abd0b663e265b9373" +checksum = "3c88210023a2bfee1896af366309a3028fc3bcbd6515fa29a7990ee1baa08ee0" dependencies = [ - "arrow-buffer 58.2.0", - "arrow-schema 58.2.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "half", "num-integer", "num-traits", @@ -688,43 +688,43 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "57.3.0" +version = "57.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1" +checksum = "8cf72d04c07229fbf4dbebe7145cac37d7cf7ec582fe705c6b92cb314af096ab" dependencies = [ - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", "flatbuffers", ] [[package]] name = "arrow-ipc" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd8907ddd8f9fbabf91ec2c85c1d81fe2874e336d2443eb36373595e28b98dd5" +checksum = "238438f0834483703d88896db6fe5a7138b2230debc31b34c0336c2996e3c64f" dependencies = [ - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-schema 58.2.0", - "arrow-select 58.2.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + 
"arrow-select 58.3.0", "flatbuffers", ] [[package]] name = "arrow-json" -version = "57.3.0" +version = "57.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ff8357658bedc49792b13e2e862b80df908171275f8e6e075c460da5ee4bf86" +checksum = "a84a905f41fedfcd7679813c89a61dc369c0f932b27aa8dcc6aa051cc781a97d" dependencies = [ - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-cast 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-cast 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", "chrono", "half", "indexmap 2.14.0", @@ -740,16 +740,16 @@ dependencies = [ [[package]] name = "arrow-json" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4518c59acc501f10d7dcae397fe12b8db3d81bc7de94456f8a58f9165d6f502" +checksum = "205ca2119e6d679d5c133c6f30e68f027738d95ed948cf77677ea69c7800036b" dependencies = [ - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-cast 58.2.0", - "arrow-ord 58.2.0", - "arrow-schema 58.2.0", - "arrow-select 58.2.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "chrono", "half", "indexmap 2.14.0", @@ -765,54 +765,54 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "57.3.0" +version = "57.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b" +checksum = "082342947d4e5a2bcccf029a0a0397e21cb3bb8421edd9571d34fb5dd2670256" dependencies = [ - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", ] [[package]] name = "arrow-ord" -version = "58.2.0" +version = "58.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "efa70d9d6b1356f1fb9f1f651b84a725b7e0abb93f188cf7d31f14abfa2f2e6f" +checksum = "1bffd8fd2579286a5d63bac898159873e5094a79009940bcb42bbfce4f19f1d0" dependencies = [ - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-schema 58.2.0", - "arrow-select 58.2.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", ] [[package]] name = "arrow-row" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "faec88a945338192beffbbd4be0def70135422930caa244ac3cec0cd213b26b4" +checksum = "bab5994731204603c73ba69267616c50f80780774c6bb0476f1f830625115e0c" dependencies = [ - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-schema 58.2.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "half", ] [[package]] name = "arrow-schema" -version = "57.3.0" +version = "57.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68" +checksum = "e4cf0d4a6609679e03002167a61074a21d7b1ad9ea65e462b2c0a97f8a3b2bc6" [[package]] name = "arrow-schema" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18aa020f6bc8e5201dcd2d4b7f98c68f8a410ef37128263243e6ff2a47a67d4f" +checksum = "f633dbfdf39c039ada1bf9e34c694816eb71fbb7dc78f613993b7245e078a1ed" dependencies = [ "bitflags 2.11.1", "serde", @@ -821,43 +821,43 @@ dependencies = [ [[package]] name = "arrow-select" -version = "57.3.0" +version = "57.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b" +checksum = "0b320d86a9806923663bb0fd9baa65ecaba81cb0cd77ff8c1768b9716b4ef891" dependencies = [ "ahash 0.8.12", 
- "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", "num-traits", ] [[package]] name = "arrow-select" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a657ab5132e9c8ca3b24eb15a823d0ced38017fe3930ff50167466b02e2d592c" +checksum = "8cd065c54172ac787cf3f2f8d4107e0d3fdc26edba76fdf4f4cc170258942222" dependencies = [ "ahash 0.8.12", - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-schema 58.2.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "num-traits", ] [[package]] name = "arrow-string" -version = "57.3.0" +version = "57.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85e968097061b3c0e9fe3079cf2e703e487890700546b5b0647f60fca1b5a8d8" +checksum = "b493e99162e5764077e7823e50ba284858d365922631c7aaefe9487b1abd02c2" dependencies = [ - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", "memchr", "num-traits", "regex", @@ -866,15 +866,15 @@ dependencies = [ [[package]] name = "arrow-string" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6de2efbbd1a9f9780ceb8d1ff5d20421b35863b361e3386b4f571f1fc69fcb8" +checksum = "29dd7cda3ab9692f43a2e4acc444d760cc17b12bb6d8232ddf64e9bab7c06b42" dependencies = [ - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-schema 58.2.0", - "arrow-select 58.2.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "memchr", "num-traits", "regex", @@ -2222,7 +2222,7 @@ 
dependencies = [ "indexmap 2.14.0", "itertools 0.14.0", "object_store", - "parquet 58.2.0", + "parquet 58.3.0", "rand 0.9.4", "reqwest 0.13.3", "roaring", @@ -3710,16 +3710,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b6f1d41164959efaaae6d77fc00f9609ec59159ad4d0278924e79a0738f61b0" dependencies = [ "arrow", - "arrow-arith 58.2.0", - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-cast 58.2.0", - "arrow-ipc 58.2.0", - "arrow-json 58.2.0", - "arrow-ord 58.2.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-ipc 58.3.0", + "arrow-json 58.3.0", + "arrow-ord 58.3.0", "arrow-row", - "arrow-schema 58.2.0", - "arrow-select 58.2.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-trait", "buoyant_kernel", "bytes", @@ -3736,7 +3736,7 @@ dependencies = [ "num_cpus", "object_store", "parking_lot", - "parquet 58.2.0", + "parquet 58.3.0", "percent-encoding", "percent-encoding-rfc3986", "pin-project-lite", @@ -4646,9 +4646,9 @@ dependencies = [ [[package]] name = "filetime" -version = "0.2.28" +version = "0.2.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d5b2eef6fafbf69f877e55509ce5b11a760690ac9700a2921be067aa6afaef6" +checksum = "5c287a33c7f0a620c38e641e7f60827713987b3c0f26e8ddc9462cc69cf75759" dependencies = [ "cfg-if", "libc", @@ -5679,9 +5679,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.17.0" +version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" [[package]] name = "hashlink" @@ -5938,9 +5938,9 @@ dependencies = [ [[package]] name = "hybrid-array" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"08d46837a0ed51fe95bd3b05de33cd64a1ee88fc797477ca48446872504507c5" +checksum = "9155a582abd142abc056962c29e3ce5ff2ad5469f4246b537ed42c5deba857da" dependencies = [ "typenum", ] @@ -6086,14 +6086,14 @@ dependencies = [ "anyhow", "apache-avro", "array-init", - "arrow-arith 57.3.0", - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-cast 57.3.0", - "arrow-ord 57.3.0", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", - "arrow-string 57.3.0", + "arrow-arith 57.3.1", + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-cast 57.3.1", + "arrow-ord 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", + "arrow-string 57.3.1", "as-any", "async-trait", "backon", @@ -6112,7 +6112,7 @@ dependencies = [ "murmur3", "once_cell", "ordered-float 4.6.0", - "parquet 57.3.0", + "parquet 57.3.1", "rand 0.9.4", "reqwest 0.12.28", "roaring", @@ -6640,7 +6640,7 @@ dependencies = [ name = "iggy_connector_iceberg_sink" version = "0.4.1-edge.1" dependencies = [ - "arrow-json 57.3.0", + "arrow-json 57.3.1", "async-trait", "dashmap", "iceberg", @@ -6648,7 +6648,7 @@ dependencies = [ "iceberg-storage-opendal", "iggy_connector_sdk", "once_cell", - "parquet 57.3.0", + "parquet 57.3.1", "serde", "simd-json", "strum 0.28.0", @@ -6661,20 +6661,18 @@ name = "iggy_connector_influxdb_sink" version = "0.4.1-edge.1" dependencies = [ "async-trait", + "axum", "base64", "bytes", - "dashmap", - "futures", "iggy_common", "iggy_connector_sdk", - "once_cell", "reqwest 0.13.3", "reqwest-middleware", "secrecy", "serde", "serde_json", - "simd-json", "tokio", + "toml 1.1.2+spec-1.1.0", "tracing", ] @@ -6683,10 +6681,11 @@ name = "iggy_connector_influxdb_source" version = "0.4.1-edge.1" dependencies = [ "async-trait", + "axum", "base64", + "chrono", "csv", "dashmap", - "futures", "iggy_common", "iggy_connector_sdk", "once_cell", @@ -6696,7 +6695,9 @@ dependencies = [ "secrecy", "serde", "serde_json", + "simd-json", "tokio", + "toml 1.1.2+spec-1.1.0", "tracing", "uuid", ] @@ -6969,7 +6970,7 @@ source 
= "registry+https://github.com/rust-lang/crates.io-index" checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown 0.17.0", + "hashbrown 0.17.1", "serde", "serde_core", ] @@ -7386,9 +7387,9 @@ dependencies = [ [[package]] name = "kqueue-sys" -version = "1.1.0" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7b65860415f949f23fa882e669f2dbd4a0f0eeb1acdd56790b30494afd7da2f" +checksum = "285efcf12ef41bec907b3000d5ffaeb54191d4d9d83c0d6157e6cbc2db255e64" dependencies = [ "bitflags 2.11.1", "libc", @@ -7527,9 +7528,9 @@ dependencies = [ [[package]] name = "libbz2-rs-sys" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3a6a8c165077efc8f3a971534c50ea6a1a18b329ef4a66e897a7e3a1494565f" +checksum = "f8fc329e1457d97a9d58a4e2ca49e3be572431a7e096008efc2e3a3c19d428f4" [[package]] name = "libc" @@ -7787,9 +7788,9 @@ checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" [[package]] name = "lz4_flex" -version = "0.12.1" +version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98c23545df7ecf1b16c303910a69b079e8e251d60f7dd2cc9b4177f2afaf1746" +checksum = "90071f8077f8e40adfc4b7fe9cd495ce316263f19e75c2211eeff3fdf475a3d9" dependencies = [ "twox-hash", ] @@ -8944,18 +8945,18 @@ dependencies = [ [[package]] name = "parquet" -version = "57.3.0" +version = "57.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ee96b29972a257b855ff2341b37e61af5f12d6af1158b6dcdb5b31ea07bb3cb" +checksum = "2e832c6aa20310fc6de7ea5a3f4e20d34fd83e3b43229d32b81ffe5c14d74692" dependencies = [ "ahash 0.8.12", - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-cast 57.3.0", - "arrow-data 57.3.0", - "arrow-ipc 57.3.0", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-cast 
57.3.1", + "arrow-data 57.3.1", + "arrow-ipc 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", "base64", "brotli", "bytes", @@ -8964,7 +8965,7 @@ dependencies = [ "futures", "half", "hashbrown 0.16.1", - "lz4_flex 0.12.1", + "lz4_flex 0.12.2", "num-bigint", "num-integer", "num-traits", @@ -8980,17 +8981,17 @@ dependencies = [ [[package]] name = "parquet" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d7efd3052f7d6ef601085559a246bc991e9a8cc77e02753737df6322ce35f1" +checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" dependencies = [ "ahash 0.8.12", - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-ipc 58.2.0", - "arrow-schema 58.2.0", - "arrow-select 58.2.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "base64", "brotli", "bytes", @@ -8998,7 +8999,7 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.17.0", + "hashbrown 0.17.1", "lz4_flex 0.13.1", "num-bigint", "num-integer", @@ -10383,11 +10384,11 @@ dependencies = [ [[package]] name = "retry-policies" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46a4bd6027df676bcb752d3724db0ea3c0c5fc1dd0376fec51ac7dcaf9cc69be" +checksum = "dc05fbf560421a0357a750cbe78c7ca19d4923918490daabba313d5dbc871e47" dependencies = [ - "rand 0.9.4", + "rand 0.10.1", ] [[package]] @@ -14707,9 +14708,9 @@ dependencies = [ [[package]] name = "zerofrom" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df" +checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272" dependencies = [ "zerofrom-derive", ] diff --git a/Cargo.toml b/Cargo.toml index fcaae3c4de..1a302ecdce 100644 --- 
a/Cargo.toml +++ b/Cargo.toml @@ -166,7 +166,7 @@ governor = "0.10.4" harness_derive = { path = "core/harness_derive" } hash32 = "1.0.0" hostname = "0.4.2" -http = "1" +http = "1.4.0" human-repr = "1.1.0" humantime = "2.3.0" hwlocality = "1.0.0-alpha.12" diff --git a/core/connectors/influxdb_v3_architecture.md b/core/connectors/influxdb_v3_architecture.md new file mode 100644 index 0000000000..368e9a5426 --- /dev/null +++ b/core/connectors/influxdb_v3_architecture.md @@ -0,0 +1,202 @@ +# InfluxDB V2/V3 Connector — Layered Architecture + +## InfluxDB V2 vs V3 — API Delta (Feasibility Checklist) + +| Concern | InfluxDB V2 | InfluxDB V3 | Shared? | +| --- | --- | --- | --- | +| **Write body** | Line Protocol | Line Protocol | **Yes — identical** | +| **Write endpoint** | `POST /api/v2/write` | `POST /api/v3/write_lp` | No (URL differs) | +| **Write params** | `?org=X&bucket=Y&precision=P` | `?db=X&precision=P` | Partial | +| **Auth header** | `Authorization: Token {t}` | `Authorization: Bearer {t}` | No | +| **Query endpoint** | `POST /api/v2/query` | `POST /api/v3/query_sql` | No | +| **Query language** | Flux | SQL or InfluxQL | No | +| **Query response** | Annotated CSV | JSONL / JSON / CSV / Parquet | No | +| **Health check** | `GET /health` | `GET /health` | **Yes** | +| **Retry/backoff** | 429 / 5xx transient | 429 / 5xx transient | **Yes** | +| **Circuit breaker** | Per batch | Per batch | **Yes** | +| **Line Protocol builder** | Escaping, precision | Escaping, precision | **Yes** | +| **Cursor state mgmt** | Timestamp-based | Timestamp-based | **Yes** | +| **Data org concept** | `org` + `bucket` | `db` (org optional) | No | +| **V3 compat write** | — | `/api/v2/write` still works | Migration bridge | + +**Verdict:** ~70% of code is version-independent and can live in a shared common layer. Only URL construction, auth headers, query language, and response parsing diverge. 
+ +--- + +## Layered Architecture Diagram + +```text +╔══════════════════════════════════════════════════════════════════════════════════╗ +║ LAYER 1 — PUBLIC INTERFACE ║ +║ ║ +║ ┌──────────────────────────────────┐ ┌──────────────────────────────────┐ ║ +║ │ InfluxDbSinkConfig │ │ InfluxDbSourceConfig │ ║ +║ │ url, token, api_version │ │ url, token, api_version │ ║ +║ │ org, bucket (V2) │ db (V3) │ │ org, bucket (V2) │ db (V3) │ ║ +║ │ measurement, precision │ │ query, query_language │ ║ +║ │ batch_size, payload_format │ │ poll_interval, cursor_field │ ║ +║ │ ── resilience fields ── │ │ ── resilience fields ── │ ║ +║ │ retry_delay, max_retries │ │ retry_delay, max_retries │ ║ +║ │ circuit_breaker_threshold │ │ circuit_breaker_threshold │ ║ +║ └──────────────────────────────────┘ └──────────────────────────────────┘ ║ +║ ║ +║ ┌──────────────────────────────────┐ ┌──────────────────────────────────┐ ║ +║ │ impl Sink for InfluxDbSink │ │ impl Source for InfluxDbSource │ ║ +║ │ open() → health + retry │ │ open() → health + retry │ ║ +║ │ consume() → batch + write │ │ poll() → query + parse + emit │ ║ +║ │ close() → metrics log │ │ close() → flush state │ ║ +║ └──────────────────────────────────┘ └──────────────────────────────────┘ ║ +╚══════════════════════════════════════════════════════════════════════════════════╝ + │ + ▼ +╔══════════════════════════════════════════════════════════════════════════════════╗ +║ LAYER 2 — SHARED ORCHESTRATION (version-agnostic) ║ +║ ║ +║ ┌─────────────────────┐ ┌──────────────────────┐ ┌──────────────────────┐ ║ +║ │ Batch Accumulator │ │ Retry + Exp Backoff │ │ Circuit Breaker │ ║ +║ │ (sink) │ │ (write + query) │ │ open/half-open/ │ ║ +║ │ Vec buffer │ │ transient: 429/5xx │ │ closed state │ ║ +║ │ flush at batch_size │ │ max_delay cap │ │ consecutive fail │ ║ +║ └─────────────────────┘ └──────────────────────┘ └──────────────────────┘ ║ +║ ║ +║ ┌─────────────────────┐ ┌──────────────────────┐ ┌──────────────────────┐ ║ +║ │ Line 
Protocol │ │ Payload Format │ │ Cursor State │ ║ +║ │ Builder (shared) │ │ Handler (shared) │ │ Manager (source) │ ║ +║ │ escape_measurement │ │ JSON / Text / Base64 │ │ persist last_time │ ║ +║ │ escape_tag_value │ │ (sink encoding) │ │ cursor_row_count │ ║ +║ │ to_precision_ts │ │ JSON / Text / Raw │ │ serde forward-compat│ ║ +║ └─────────────────────┘ │ (source decode) │ └──────────────────────┘ ║ +║ └──────────────────────┘ ║ +║ ║ +║ ┌─────────────────────────────────────────────────────────────────────────┐ ║ +║ │ Metrics (AtomicU64) │ ║ +║ │ messages_attempted write_success write_errors messages_dropped │ ║ +║ └─────────────────────────────────────────────────────────────────────────┘ ║ +╚══════════════════════════════════════════════════════════════════════════════════╝ + │ + ▼ +╔══════════════════════════════════════════════════════════════════════════════════╗ +║ LAYER 3 — VERSION ADAPTER TRAIT (InfluxDbAdapter) ║ +║ ║ +║ trait InfluxDbAdapter: Send + Sync { ║ +║ fn auth_header(token: &SecretString) -> (HeaderName, HeaderValue); ║ +║ fn write_request(lines: &str, cfg: &Config) -> RequestBuilder; ║ +║ fn query_request(cursor: &str, limit: u32, cfg: &Config) -> RequestBuilder║ +║ fn parse_rows(response_body: &str) -> Result, Error>; ║ +║ fn health_url(base: &Url) -> Url; ║ +║ } ║ +║ ║ +║ ┌──────────────────────────────────────────────────────────────────────────┐ ║ +║ │ ApiVersion enum { V2, V3, Auto } │ ║ +║ │ fn make_adapter(cfg) → Box │ ║ +║ │ Auto → GET /ping → parse X-Influxdb-Version header → pick V2 or V3 │ ║ +║ └──────────────────────────────────────────────────────────────────────────┘ ║ +╚══════════════════════════════════════════════════════════════════════════════════╝ + │ │ + ▼ ▼ +╔═════════════════════════════╗ ╔══════════════════════════════════╗ +║ V2Adapter ║ ║ V3Adapter ║ +║ ║ ║ ║ +║ auth_header → ║ ║ auth_header → ║ +║ "Token {token}" ║ ║ "Bearer {token}" ║ +║ ║ ║ ║ +║ write_request → ║ ║ write_request → ║ +║ POST /api/v2/write ║ ║ POST 
/api/v3/write_lp ║ +║ ?org=X&bucket=Y ║ ║ ?db=X ║ +║ &precision=P ║ ║ &precision=P ║ +║ ║ ║ (Content-Encoding: gzip opt) ║ +║ query_request → ║ ║ ║ +║ POST /api/v2/query ║ ║ query_request → ║ +║ ?org=X ║ ║ POST /api/v3/query_sql ║ +║ body: Flux template ║ ║ body: SQL template ─ OR ─ ║ +║ $cursor → timestamp ║ ║ POST /api/v3/query_influxql ║ +║ $limit → row count ║ ║ ?db=X format=jsonl ║ +║ ║ ║ $cursor / $limit substituted ║ +║ parse_rows → ║ ║ ║ +║ RFC 4180 annotated CSV ║ ║ parse_rows → ║ +║ skip #datatype header ║ ║ JSONL (one JSON obj / line) ║ +║ extract payload_column ║ ║ extract payload_column ║ +║ ║ ║ ║ +║ health_url → /health ║ ║ health_url → /health or /ping ║ +╚═════════════════════════════╝ ╚══════════════════════════════════╝ + │ │ + └───────────────────┬──────────────────────────┘ + ▼ +╔══════════════════════════════════════════════════════════════════════════════════╗ +║ LAYER 4 — HTTP CLIENT (shared) ║ +║ ║ +║ reqwest::ClientWithMiddleware ║ +║ ┌───────────────────┐ ┌───────────────────┐ ┌───────────────────────────┐ ║ +║ │ RetryMiddleware │ │ Timeout Policy │ │ Connection Pool │ ║ +║ │ ExponentialBackoff│ │ per-request (cfg)│ │ keep-alive, max-idle │ ║ +║ │ max_retries (cfg) │ │ 30s default │ │ │ ║ +║ └───────────────────┘ └───────────────────┘ └───────────────────────────┘ ║ +╚══════════════════════════════════════════════════════════════════════════════════╝ + │ + ▼ +╔══════════════════════════════════════════════════════════════════════════════════╗ +║ INFLUXDB SERVER ║ +║ ║ +║ InfluxDB OSS 2.x / Cloud 2.x InfluxDB 3.x Core / Enterprise ║ +║ (TSM engine, Flux, org+bucket) (IOx engine, SQL, db, ∞ cardinality)║ +╚══════════════════════════════════════════════════════════════════════════════════╝ +``` + +--- + +## Config Schema Design + +```toml +# Common to all versions +url = "http://localhost:8086" +token = "my_token" +api_version = "auto" # "v2" | "v3" | "auto" (detects via /ping) + +# V2 identity fields (required when api_version = "v2") +org = 
"my_org" +bucket = "events" # sink +# bucket = "events" # source (also used for query from-clause) + +# V3 identity fields (required when api_version = "v3") +# db = "events" # replaces org+bucket + +# Source query — user provides version-appropriate template +# V2 (Flux): +query = ''' + from(bucket: "$bucket") + |> range(start: time(v: "$cursor")) + |> filter(fn: (r) => r._measurement == "iggy") + |> limit(n: $limit) +''' + +# V3 (SQL) — shown commented out because TOML allows `query` to be defined only once; use it instead of the Flux template above: +# query = ''' +# SELECT _time, payload FROM iggy +# WHERE _time > '$cursor' +# ORDER BY _time +# LIMIT $limit +# ''' + +query_language = "flux" # "flux" | "sql" | "influxql" (V3 only) +response_format = "jsonl" # "jsonl" | "json" | "csv" (V3 only) +``` + +--- + +## Code Reuse Summary + +| Component | Reuse | Notes | +| --- | --- | --- | +| `Sink` / `Source` trait impls | 100% | Same `open/consume/poll/close` logic | +| Line Protocol builder | 100% | Body format identical in V2 and V3 | +| Batch accumulator | 100% | Flush logic unchanged | +| Retry + circuit breaker | 100% | Same HTTP status codes trigger retries | +| Metrics counters | 100% | Atomic counters are version-agnostic | +| Cursor state management | 100% | RFC 3339 timestamp cursors work in both | +| Payload format handling | 100% | Encoding/decoding is connector-internal | +| Auth header construction | 0% | `Token` vs `Bearer` — adapter handles | +| Write URL + params | ~20% | Precision param shared; endpoint & org/bucket vs db differ | +| Query URL + body | 0% | Flux vs SQL — fully different languages | +| Response parsing | 0% | Annotated CSV vs JSONL — different parsers | + +**Conclusion:** The approach is fully feasible. The adapter trait boundary is clean — only three concerns diverge (auth, write URL, query+parse). Everything above that layer compiles once and serves both versions. The `Auto` mode via `/ping` header detection means zero config burden on users running standard installations. 
diff --git a/core/connectors/sinks/influxdb_sink/Cargo.toml b/core/connectors/sinks/influxdb_sink/Cargo.toml index 9cc6c85b6d..086a5c6d22 100644 --- a/core/connectors/sinks/influxdb_sink/Cargo.toml +++ b/core/connectors/sinks/influxdb_sink/Cargo.toml @@ -29,9 +29,6 @@ repository = "https://github.com/apache/iggy" readme = "../../README.md" publish = false -[package.metadata.cargo-machete] -ignored = ["dashmap", "once_cell", "futures"] - [lib] crate-type = ["cdylib", "lib"] @@ -39,16 +36,16 @@ crate-type = ["cdylib", "lib"] async-trait = { workspace = true } base64 = { workspace = true } bytes = { workspace = true } -dashmap = { workspace = true } -futures = { workspace = true } iggy_common = { workspace = true } iggy_connector_sdk = { workspace = true } -once_cell = { workspace = true } reqwest = { workspace = true } reqwest-middleware = { workspace = true } secrecy = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } -simd-json = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } + +[dev-dependencies] +axum = { workspace = true } +toml = { workspace = true } diff --git a/core/connectors/sinks/influxdb_sink/config.toml b/core/connectors/sinks/influxdb_sink/config.toml index 621c6e7f77..7d32488c5f 100644 --- a/core/connectors/sinks/influxdb_sink/config.toml +++ b/core/connectors/sinks/influxdb_sink/config.toml @@ -32,6 +32,7 @@ poll_interval = "5ms" consumer_group = "influxdb_sink" [plugin_config] +version = "v2" url = "http://localhost:8086" org = "iggy" bucket = "events" diff --git a/core/connectors/sinks/influxdb_sink/src/lib.rs b/core/connectors/sinks/influxdb_sink/src/lib.rs index 815e03f7ed..eb74403040 100644 --- a/core/connectors/sinks/influxdb_sink/src/lib.rs +++ b/core/connectors/sinks/influxdb_sink/src/lib.rs @@ -16,6 +16,9 @@ * under the License. 
*/ +mod protocol; + +use crate::protocol::{write_field_string, write_measurement, write_tag_value}; use async_trait::async_trait; use base64::{Engine as _, engine::general_purpose}; use bytes::Bytes; @@ -29,90 +32,332 @@ use iggy_connector_sdk::{ }; use reqwest::Url; use reqwest_middleware::ClientWithMiddleware; -use secrecy::{ExposeSecret, SecretString}; +use secrecy::{ExposeSecret, SecretBox, SecretString}; use serde::{Deserialize, Serialize}; +use std::fmt::Write as _; use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; -use std::time::Duration; -use std::time::SystemTime; -use std::time::UNIX_EPOCH; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; use tracing::{debug, error, info, warn}; + sink_connector!(InfluxDbSink); const DEFAULT_MAX_RETRIES: u32 = 3; const DEFAULT_RETRY_DELAY: &str = "1s"; const DEFAULT_TIMEOUT: &str = "30s"; const DEFAULT_PRECISION: &str = "us"; -// Maximum attempts for open() connectivity retries const DEFAULT_MAX_OPEN_RETRIES: u32 = 10; -// Cap for exponential backoff in open() — never wait longer than this const DEFAULT_OPEN_RETRY_MAX_DELAY: &str = "60s"; -// Cap for exponential backoff on per-write retries — kept short so a -// transient InfluxDB blip does not stall message delivery for too long const DEFAULT_RETRY_MAX_DELAY: &str = "5s"; -// How many consecutive batch failures open the circuit breaker const DEFAULT_CIRCUIT_BREAKER_THRESHOLD: u32 = 5; -// How long the circuit stays open before allowing a probe attempt const DEFAULT_CIRCUIT_COOL_DOWN: &str = "30s"; -// --------------------------------------------------------------------------- -// Main connector structs -// --------------------------------------------------------------------------- +// ── Configuration ───────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct V2SinkConfig { + pub(crate) url: String, + pub(crate) org: String, + pub(crate) bucket: String, + #[serde(serialize_with = 
"serialize_secret")] + pub(crate) token: SecretString, + pub(crate) measurement: Option, + pub(crate) precision: Option, + pub(crate) batch_size: Option, + pub(crate) include_metadata: Option, + pub(crate) include_checksum: Option, + pub(crate) include_origin_timestamp: Option, + pub(crate) include_stream_tag: Option, + pub(crate) include_topic_tag: Option, + pub(crate) include_partition_tag: Option, + pub(crate) payload_format: Option, + pub(crate) verbose_logging: Option, + pub(crate) max_retries: Option, + pub(crate) retry_delay: Option, + pub(crate) timeout: Option, + pub(crate) max_open_retries: Option, + pub(crate) open_retry_max_delay: Option, + pub(crate) retry_max_delay: Option, + pub(crate) circuit_breaker_threshold: Option, + pub(crate) circuit_breaker_cool_down: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct V3SinkConfig { + pub(crate) url: String, + pub(crate) db: String, + #[serde(serialize_with = "serialize_secret")] + pub(crate) token: SecretString, + pub(crate) measurement: Option, + pub(crate) precision: Option, + pub(crate) batch_size: Option, + pub(crate) include_metadata: Option, + pub(crate) include_checksum: Option, + pub(crate) include_origin_timestamp: Option, + pub(crate) include_stream_tag: Option, + pub(crate) include_topic_tag: Option, + pub(crate) include_partition_tag: Option, + pub(crate) payload_format: Option, + pub(crate) verbose_logging: Option, + pub(crate) max_retries: Option, + pub(crate) retry_delay: Option, + pub(crate) timeout: Option, + pub(crate) max_open_retries: Option, + pub(crate) open_retry_max_delay: Option, + pub(crate) retry_max_delay: Option, + pub(crate) circuit_breaker_threshold: Option, + pub(crate) circuit_breaker_cool_down: Option, +} + +#[derive(Debug, Clone, Serialize)] +#[serde(tag = "version")] +pub enum InfluxDbSinkConfig { + #[serde(rename = "v2")] + V2(V2SinkConfig), + #[serde(rename = "v3")] + V3(V3SinkConfig), +} + +impl<'de> serde::Deserialize<'de> for InfluxDbSinkConfig 
{ + fn deserialize>(d: D) -> Result { + let raw = serde_json::Value::deserialize(d)?; + let version = match raw.get("version") { + None => "v2", // absent key → backward compat default + Some(v) => v.as_str().ok_or_else(|| { + serde::de::Error::custom(format!( + "\"version\" must be a string (e.g. \"v2\" or \"v3\"), got: {v}" + )) + })?, + }; + match version { + "v2" => serde_json::from_value::(raw) + .map(Self::V2) + .map_err(serde::de::Error::custom), + "v3" => serde_json::from_value::(raw) + .map(Self::V3) + .map_err(serde::de::Error::custom), + other => Err(serde::de::Error::custom(format!( + "unknown InfluxDB version {other:?}; expected \"v2\" or \"v3\"" + ))), + } + } +} + +/// Map a short precision string to InfluxDB 3's long-form equivalent. +/// +/// InfluxDB 3 rejects the short forms (`"ns"`, `"us"`, `"ms"`, `"s"`) on the +/// `/api/v3/write_lp` endpoint and expects full English words. Returns an error +/// for unrecognised values rather than silently defaulting. +#[must_use = "precision mapping errors must be propagated — ignoring this silently corrupts timestamps"] +fn map_precision_v3(p: &str) -> Result<&'static str, Error> { + match p { + "ns" => Ok("nanosecond"), + "us" => Ok("microsecond"), + "ms" => Ok("millisecond"), + "s" => Ok("second"), + other => Err(Error::InvalidConfigValue(format!( + "unknown precision {other:?}; valid values are \"ns\", \"us\", \"ms\", \"s\"" + ))), + } +} + +// Eliminates the repetitive "match self { V2(c) => …, V3(c) => … }" pattern for +// fields that are identical across all config variants. Methods with version-specific +// logic (auth_header, build_write_url, build_health_url, version_label) remain explicit. 
+// +// Supported patterns: +// delegate!(ref self.url) → &String (borrow) +// delegate!(opt self.measurement) → Option<&str> +// delegate!(str_or self.precision, "us") → &str with string fallback +// delegate!(unwrap self.batch_size, 500) → T: Copy with value fallback +// +// Not supported (use explicit match arms instead): +// Fields with version-specific defaults (e.g. cursor_field: "_time" vs "time") +// Fields with chained transformations (e.g. .max(1)) +// Fields requiring complex construction (e.g. auth_header building) +macro_rules! delegate { + // &T field reference → fn foo(&self) -> &T + (ref $self:ident . $field:ident) => { + match $self { + Self::V2(c) => &c.$field, + Self::V3(c) => &c.$field, + } + }; + // Option → Option<&str> + (opt $self:ident . $field:ident) => { + match $self { + Self::V2(c) => c.$field.as_deref(), + Self::V3(c) => c.$field.as_deref(), + } + }; + // Option → &str with fallback + (str_or $self:ident . $field:ident, $default:expr) => { + match $self { + Self::V2(c) => c.$field.as_deref().unwrap_or($default), + Self::V3(c) => c.$field.as_deref().unwrap_or($default), + } + }; + // Option → T with fallback + (unwrap $self:ident . 
$field:ident, $default:expr) => { + match $self { + Self::V2(c) => c.$field.unwrap_or($default), + Self::V3(c) => c.$field.unwrap_or($default), + } + }; +} + +impl InfluxDbSinkConfig { + fn url(&self) -> &str { + delegate!(ref self.url) + } + fn base_url(&self) -> &str { + self.url().trim_end_matches('/') + } + fn measurement(&self) -> Option<&str> { + delegate!(opt self.measurement) + } + fn precision(&self) -> &str { + delegate!(str_or self.precision, DEFAULT_PRECISION) + } + fn batch_size(&self) -> u32 { + delegate!(unwrap self.batch_size, 500) + } + fn include_metadata(&self) -> bool { + delegate!(unwrap self.include_metadata, true) + } + fn include_checksum(&self) -> bool { + delegate!(unwrap self.include_checksum, true) + } + fn include_origin_timestamp(&self) -> bool { + delegate!(unwrap self.include_origin_timestamp, true) + } + fn include_stream_tag(&self) -> bool { + delegate!(unwrap self.include_stream_tag, true) + } + fn include_topic_tag(&self) -> bool { + delegate!(unwrap self.include_topic_tag, true) + } + fn include_partition_tag(&self) -> bool { + delegate!(unwrap self.include_partition_tag, true) + } + fn payload_format(&self) -> Option<&str> { + delegate!(opt self.payload_format) + } + fn verbose_logging(&self) -> bool { + delegate!(unwrap self.verbose_logging, false) + } + fn max_retries(&self) -> u32 { + delegate!(unwrap self.max_retries, DEFAULT_MAX_RETRIES) + } + fn retry_delay(&self) -> Option<&str> { + delegate!(opt self.retry_delay) + } + fn timeout(&self) -> Option<&str> { + delegate!(opt self.timeout) + } + fn max_open_retries(&self) -> u32 { + delegate!(unwrap self.max_open_retries, DEFAULT_MAX_OPEN_RETRIES) + } + fn open_retry_max_delay(&self) -> Option<&str> { + delegate!(opt self.open_retry_max_delay) + } + fn retry_max_delay(&self) -> Option<&str> { + delegate!(opt self.retry_max_delay) + } + fn circuit_breaker_threshold(&self) -> u32 { + delegate!(unwrap self.circuit_breaker_threshold, DEFAULT_CIRCUIT_BREAKER_THRESHOLD) + } + fn 
circuit_breaker_cool_down(&self) -> Option<&str> { + delegate!(opt self.circuit_breaker_cool_down) + } + + fn auth_header(&self) -> String { + match self { + Self::V2(c) => format!("Token {}", c.token.expose_secret()), + Self::V3(c) => format!("Bearer {}", c.token.expose_secret()), + } + } + + #[must_use = "URL construction can fail; the error must be propagated or open() will silently use a stale URL"] + fn build_write_url(&self) -> Result { + let precision = self.precision(); + match self { + Self::V2(c) => { + if c.org.trim().is_empty() { + return Err(Error::InvalidConfigValue( + "InfluxDB V2 'org' must not be empty".into(), + )); + } + let mut url = Url::parse(&format!("{}/api/v2/write", self.base_url())) + .map_err(|e| Error::InvalidConfigValue(format!("Invalid InfluxDB URL: {e}")))?; + url.query_pairs_mut() + .append_pair("org", &c.org) + .append_pair("bucket", &c.bucket) + .append_pair("precision", precision); + Ok(url) + } + Self::V3(c) => { + let mut url = Url::parse(&format!("{}/api/v3/write_lp", self.base_url())) + .map_err(|e| Error::InvalidConfigValue(format!("Invalid InfluxDB URL: {e}")))?; + url.query_pairs_mut() + .append_pair("db", &c.db) + .append_pair("precision", map_precision_v3(precision)?); + Ok(url) + } + } + } + + #[must_use = "URL construction can fail; the error must be propagated"] + fn build_health_url(&self) -> Result { + Url::parse(&format!("{}/health", self.base_url())) + .map_err(|e| Error::InvalidConfigValue(format!("Invalid InfluxDB URL: {e}"))) + } + + fn version_label(&self) -> &'static str { + match self { + Self::V2(_) => "v2", + Self::V3(_) => "v3", + } + } +} + +// ── Sink struct ─────────────────────────────────────────────────────────────── +/// InfluxDB sink connector state. +/// +/// **Init-time fields** (populated in `new()` from config, never `None`): +/// `id`, `config`, `circuit_breaker`, `verbose`, `retry_delay`, `payload_format`, +/// `measurement`, `precision`, `include_*`, `batch_size_limit`. 
+/// +/// **Open-time fields** (populated in `open()`, guarded by `Option`): +/// `client`, `write_url`, `auth_header` — callers must invoke `open()` before +/// any `process_batch()` call; `get_client()` returns an error otherwise. #[derive(Debug)] pub struct InfluxDbSink { - pub id: u32, + id: u32, config: InfluxDbSinkConfig, - /// `None` until `open()` is called. Wraps `reqwest::Client` with - /// [`HttpRetryMiddleware`] so retry/back-off/jitter is handled - /// transparently by the middleware stack instead of a hand-rolled loop. client: Option, - /// Cached once in `open()` — config fields never change at runtime. write_url: Option, + auth_header: Option>, + circuit_breaker: Arc, messages_attempted: AtomicU64, write_success: AtomicU64, write_errors: AtomicU64, verbose: bool, retry_delay: Duration, - /// Resolved once in `new()` — avoids a `to_ascii_lowercase()` allocation - /// on every message in the hot path. payload_format: PayloadFormat, - circuit_breaker: Arc, + measurement: String, + precision: String, + include_metadata: bool, + include_checksum: bool, + include_origin_timestamp: bool, + include_stream_tag: bool, + include_topic_tag: bool, + include_partition_tag: bool, + batch_size_limit: usize, } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct InfluxDbSinkConfig { - pub url: String, - pub org: String, - pub bucket: String, - #[serde(serialize_with = "serialize_secret")] - pub token: SecretString, - pub measurement: Option, - pub precision: Option, - pub batch_size: Option, - pub include_metadata: Option, - pub include_checksum: Option, - pub include_origin_timestamp: Option, - pub include_stream_tag: Option, - pub include_topic_tag: Option, - pub include_partition_tag: Option, - pub payload_format: Option, - pub verbose_logging: Option, - pub max_retries: Option, - pub retry_delay: Option, - pub timeout: Option, - // How many times open() will retry before giving up - pub max_open_retries: Option, - // Upper cap on open() backoff delay — 
can be set high (e.g. "60s") for - // patient startup without affecting per-write retry behaviour - pub open_retry_max_delay: Option, - // Upper cap on per-write retry backoff — kept short so a transient blip - // does not stall message delivery; independent of open_retry_max_delay - pub retry_max_delay: Option, - // Circuit breaker configuration - pub circuit_breaker_threshold: Option, - pub circuit_breaker_cool_down: Option, -} +// ── PayloadFormat ───────────────────────────────────────────────────────────── #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] enum PayloadFormat { @@ -125,198 +370,87 @@ enum PayloadFormat { impl PayloadFormat { fn from_config(value: Option<&str>) -> Self { match value.map(|v| v.to_ascii_lowercase()).as_deref() { - Some("text") | Some("utf8") => PayloadFormat::Text, - Some("base64") | Some("raw") => PayloadFormat::Base64, - Some("json") => PayloadFormat::Json, - other => { - warn!( - "Unrecognized payload_format value {:?}, falling back to JSON. \ - Valid values are: \"json\", \"text\", \"utf8\", \"base64\", \"raw\".", - other - ); - PayloadFormat::Json - } - } - } -} - -// --------------------------------------------------------------------------- -// Helpers -// --------------------------------------------------------------------------- - -/// Write an escaped measurement name into `buf`. -/// Escapes: `\` → `\\`, `,` → `\,`, ` ` → `\ `, `\n` → `\\n`, `\r` → `\\r` -/// -/// Newline (`\n`) and carriage-return (`\r`) are the InfluxDB line-protocol -/// record delimiters; a literal newline inside a measurement name would split -/// the line and corrupt the batch. -fn write_measurement(buf: &mut String, value: &str) { - for ch in value.chars() { - match ch { - '\\' => buf.push_str("\\\\"), - ',' => buf.push_str("\\,"), - ' ' => buf.push_str("\\ "), - '\n' => buf.push_str("\\n"), - '\r' => buf.push_str("\\r"), - _ => buf.push(ch), - } - } -} - -/// Write an escaped tag key/value into `buf`. 
-/// Escapes: `\` → `\\`, `,` → `\,`, `=` → `\=`, ` ` → `\ `, `\n` → `\\n`, `\r` → `\\r` -/// -/// Newline and carriage-return are escaped for the same reason as in -/// [`write_measurement`]: they are InfluxDB line-protocol record delimiters. -fn write_tag_value(buf: &mut String, value: &str) { - for ch in value.chars() { - match ch { - '\\' => buf.push_str("\\\\"), - ',' => buf.push_str("\\,"), - '=' => buf.push_str("\\="), - ' ' => buf.push_str("\\ "), - '\n' => buf.push_str("\\n"), - '\r' => buf.push_str("\\r"), - _ => buf.push(ch), - } - } -} - -/// Write an escaped string field value (without surrounding quotes) into `buf`. -/// Escapes: `\` → `\\`, `"` → `\"`, `\n` → `\\n`, `\r` → `\\r` -/// -/// Newline and carriage-return are the InfluxDB line-protocol record -/// delimiters; a literal newline inside a string field value (e.g. from a -/// multi-line text payload) would split the line and corrupt the batch. -fn write_field_string(buf: &mut String, value: &str) { - for ch in value.chars() { - match ch { - '\\' => buf.push_str("\\\\"), - '"' => buf.push_str("\\\""), - '\n' => buf.push_str("\\n"), - '\r' => buf.push_str("\\r"), - _ => buf.push(ch), + Some("text") | Some("utf8") => Self::Text, + Some("base64") | Some("raw") => Self::Base64, + _ => Self::Json, } } } -// --------------------------------------------------------------------------- -// InfluxDbSink implementation -// --------------------------------------------------------------------------- +// ── InfluxDbSink impl ───────────────────────────────────────────────────────── impl InfluxDbSink { pub fn new(id: u32, config: InfluxDbSinkConfig) -> Self { - let verbose = config.verbose_logging.unwrap_or(false); - let retry_delay = parse_duration(config.retry_delay.as_deref(), DEFAULT_RETRY_DELAY); - let payload_format = PayloadFormat::from_config(config.payload_format.as_deref()); - - // Build circuit breaker from config - let cb_threshold = config - .circuit_breaker_threshold - 
.unwrap_or(DEFAULT_CIRCUIT_BREAKER_THRESHOLD); - let cb_cool_down = parse_duration( - config.circuit_breaker_cool_down.as_deref(), - DEFAULT_CIRCUIT_COOL_DOWN, - ); - - InfluxDbSink { + let verbose = config.verbose_logging(); + let retry_delay = parse_duration(config.retry_delay(), DEFAULT_RETRY_DELAY); + let payload_format = PayloadFormat::from_config(config.payload_format()); + let circuit_breaker = Arc::new(CircuitBreaker::new( + config.circuit_breaker_threshold(), + parse_duration( + config.circuit_breaker_cool_down(), + DEFAULT_CIRCUIT_COOL_DOWN, + ), + )); + let measurement = config.measurement().unwrap_or("iggy_messages").to_string(); + let precision = config.precision().to_string(); + let include_metadata = config.include_metadata(); + let include_checksum = config.include_checksum(); + let include_origin_timestamp = config.include_origin_timestamp(); + let include_stream_tag = config.include_stream_tag(); + let include_topic_tag = config.include_topic_tag(); + let include_partition_tag = config.include_partition_tag(); + let batch_size_limit = config.batch_size().max(1) as usize; + + Self { id, config, client: None, write_url: None, + auth_header: None, + circuit_breaker, messages_attempted: AtomicU64::new(0), write_success: AtomicU64::new(0), write_errors: AtomicU64::new(0), verbose, retry_delay, payload_format, - circuit_breaker: Arc::new(CircuitBreaker::new(cb_threshold, cb_cool_down)), + measurement, + precision, + include_metadata, + include_checksum, + include_origin_timestamp, + include_stream_tag, + include_topic_tag, + include_partition_tag, + batch_size_limit, } } fn build_raw_client(&self) -> Result { - let timeout = parse_duration(self.config.timeout.as_deref(), DEFAULT_TIMEOUT); + let timeout = parse_duration(self.config.timeout(), DEFAULT_TIMEOUT); reqwest::Client::builder() .timeout(timeout) .build() .map_err(|e| Error::InitError(format!("Failed to create HTTP client: {e}"))) } - fn build_write_url(&self) -> Result { - let base = 
self.config.url.trim_end_matches('/'); - let mut url = Url::parse(&format!("{base}/api/v2/write")) - .map_err(|e| Error::InvalidConfigValue(format!("Invalid InfluxDB URL: {e}")))?; - - let precision = self - .config - .precision - .as_deref() - .unwrap_or(DEFAULT_PRECISION); - url.query_pairs_mut() - .append_pair("org", &self.config.org) - .append_pair("bucket", &self.config.bucket) - .append_pair("precision", precision); - - Ok(url) - } - - fn build_health_url(&self) -> Result { - let base = self.config.url.trim_end_matches('/'); - Url::parse(&format!("{base}/health")) - .map_err(|e| Error::InvalidConfigValue(format!("Invalid InfluxDB URL: {e}"))) - } - fn get_client(&self) -> Result<&ClientWithMiddleware, Error> { - self.client - .as_ref() - .ok_or_else(|| Error::Connection("InfluxDB client is not initialized".to_string())) - } - - fn measurement(&self) -> &str { - self.config - .measurement - .as_deref() - .unwrap_or("iggy_messages") - } - - fn payload_format(&self) -> PayloadFormat { - self.payload_format - } - - fn timestamp_precision(&self) -> &str { - self.config - .precision - .as_deref() - .unwrap_or(DEFAULT_PRECISION) - } - - fn get_max_retries(&self) -> u32 { - self.config - .max_retries - .unwrap_or(DEFAULT_MAX_RETRIES) - .max(1) + self.client.as_ref().ok_or_else(|| { + Error::Connection("InfluxDB client not initialized — call open() first".to_string()) + }) } + #[inline] fn to_precision_timestamp(&self, micros: u64) -> u64 { - match self.timestamp_precision() { + match self.precision.as_str() { "ns" => micros.saturating_mul(1_000), "us" => micros, "ms" => micros / 1_000, "s" => micros / 1_000_000, - _ => micros, + _ => micros, // unreachable if open() validated precision as the precision is validated in the config validation } } - /// Serialise one message as a line-protocol line, appending directly into - /// `buf` with no intermediate `Vec` for tags or fields. 
- /// - /// # Allocation budget (per message, happy path) - /// - Zero `Vec` allocations for tags or fields. - /// - Zero per-tag/per-field `format!` allocations. - /// - One `Vec` for `payload_bytes` (unavoidable — payload must be - /// decoded/serialised before it can be escaped into the buffer). - /// - The caller's `buf` grows in place; if it was pre-allocated with - /// `with_capacity` it will not reallocate for typical message sizes. fn append_line( &self, buf: &mut String, @@ -324,117 +458,85 @@ impl InfluxDbSink { messages_metadata: &MessagesMetadata, message: &ConsumedMessage, ) -> Result<(), Error> { - let include_metadata = self.config.include_metadata.unwrap_or(true); - let include_checksum = self.config.include_checksum.unwrap_or(true); - let include_origin_timestamp = self.config.include_origin_timestamp.unwrap_or(true); - let include_stream_tag = self.config.include_stream_tag.unwrap_or(true); - let include_topic_tag = self.config.include_topic_tag.unwrap_or(true); - let include_partition_tag = self.config.include_partition_tag.unwrap_or(true); - - // ── Measurement ────────────────────────────────────────────────────── - write_measurement(buf, self.measurement()); - - // ── Tag set ────────────────────────────────────────────────────────── - // Tags are written as ",key=value" pairs directly into buf. - // The offset tag is always present — it makes every point unique in - // InfluxDB's deduplication key (measurement + tag set + timestamp), - // regardless of precision or how many messages share a timestamp. - if include_metadata && include_stream_tag { + write_measurement(buf, &self.measurement); + + // Tag *key* strings below ("stream", "topic", "partition", "offset", etc.) are + // static ASCII literals — they contain no InfluxDB line-protocol special chars + // (comma, equals, space, backslash, newline) and therefore do not need escaping. + // Only the tag *values* (user-supplied stream/topic names) are escaped via + // `write_tag_value`. 
The user-supplied `measurement` is escaped via + // `write_measurement`. + if self.include_metadata && self.include_stream_tag { buf.push_str(",stream="); write_tag_value(buf, &topic_metadata.stream); } - if include_metadata && include_topic_tag { + if self.include_metadata && self.include_topic_tag { buf.push_str(",topic="); write_tag_value(buf, &topic_metadata.topic); } - if include_metadata && include_partition_tag { - use std::fmt::Write as _; - write!(buf, ",partition={}", messages_metadata.partition_id) - .expect("write to String is infallible"); - } - // offset tag — always written, ensures point uniqueness - { - use std::fmt::Write as _; - write!(buf, ",offset={}", message.offset).expect("write to String is infallible"); + if self.include_metadata && self.include_partition_tag { + write!(buf, ",partition={}", messages_metadata.partition_id).expect("infallible"); } + // `offset` is always written as a tag regardless of `include_metadata`. + // It forms the deduplication key for idempotent writes: without it, two + // messages at the same timestamp in the same measurement+tag-set would + // silently overwrite each other in InfluxDB's last-write-wins model. + write!(buf, ",offset={}", message.offset).expect("infallible"); - // ── Field set ──────────────────────────────────────────────────────── - // First field: no leading comma. All subsequent fields: leading comma. buf.push(' '); - buf.push_str("message_id=\""); - write_field_string(buf, &message.id.to_string()); + let _ = write!(buf, "{}", message.id); buf.push('"'); - // offset as a numeric field (queryable in Flux) in addition to the tag - { - use std::fmt::Write as _; - write!(buf, ",offset={}u", message.offset).expect("write to String is infallible"); - } - - // Optional metadata fields written when the corresponding tag is - // disabled (so the value is still queryable as a field). 
- if include_metadata && !include_stream_tag { + if self.include_metadata && !self.include_stream_tag { buf.push_str(",iggy_stream=\""); write_field_string(buf, &topic_metadata.stream); buf.push('"'); } - if include_metadata && !include_topic_tag { + if self.include_metadata && !self.include_topic_tag { buf.push_str(",iggy_topic=\""); write_field_string(buf, &topic_metadata.topic); buf.push('"'); } - if include_metadata && !include_partition_tag { - use std::fmt::Write as _; + if self.include_metadata && !self.include_partition_tag { write!( buf, ",iggy_partition={}u", messages_metadata.partition_id as u64 ) - .expect("write to String is infallible"); + .expect("infallible"); } - if include_checksum { - use std::fmt::Write as _; - write!(buf, ",iggy_checksum={}u", message.checksum) - .expect("write to String is infallible"); + if self.include_checksum { + write!(buf, ",iggy_checksum={}u", message.checksum).expect("infallible"); } - if include_origin_timestamp { - use std::fmt::Write as _; + if self.include_origin_timestamp { write!(buf, ",iggy_origin_timestamp={}u", message.origin_timestamp) - .expect("write to String is infallible"); + .expect("infallible"); } - // ── Payload field ──────────────────────────────────────────────────── - match self.payload_format() { + match self.payload_format { PayloadFormat::Json => { - // Fast path: if the payload is already a parsed simd_json value, - // serialise directly to a compact string — one pass, no bytes - // round-trip. Avoids: simd_json→bytes, bytes→serde_json::Value, - // serde_json::Value→string (three allocating passes per message). - // - // Fallback: any other Payload variant (Raw bytes that happen to - // contain JSON, Text, etc.) goes through try_to_bytes() first. + // simd_json::to_string applies SIMD only on the *parse* path, not + // the serialize path — no throughput advantage over serde_json here. 
+ // The Json variant is kept for API compatibility; the hot path goes + // through the fallback branch for non-Json payloads. let compact = match &message.payload { - iggy_connector_sdk::Payload::Json(value) => simd_json::to_string(value) + iggy_connector_sdk::Payload::Json(value) => serde_json::to_string(value) .map_err(|e| { - Error::CannotStoreData(format!("Failed to serialize JSON payload: {e}")) + Error::CannotStoreData(format!("JSON serialization failed: {e}")) })?, _ => { let bytes = message.payload.try_to_bytes().map_err(|e| { - Error::CannotStoreData(format!( - "Failed to convert payload to bytes: {e}" - )) + Error::CannotStoreData(format!("Payload conversion failed: {e}")) })?; - // Validate that the bytes are actually JSON before - // writing them into the line-protocol field. + // Parse to validate and normalize (compact) the JSON; + // preserves correct output for pretty-printed inputs. let value: serde_json::Value = serde_json::from_slice(&bytes).map_err(|e| { - Error::CannotStoreData(format!( - "Payload format is json but payload is invalid JSON: {e}" - )) + Error::CannotStoreData(format!("Payload is not valid JSON: {e}")) })?; serde_json::to_string(&value).map_err(|e| { - Error::CannotStoreData(format!("Failed to serialize JSON payload: {e}")) + Error::CannotStoreData(format!("JSON serialization failed: {e}")) })? 
} }; @@ -443,34 +545,36 @@ impl InfluxDbSink { buf.push('"'); } PayloadFormat::Text => { - let payload_bytes = message.payload.try_to_bytes().map_err(|e| { - Error::CannotStoreData(format!("Failed to convert payload to bytes: {e}")) + let bytes = message.payload.try_to_bytes().map_err(|e| { + Error::CannotStoreData(format!("Payload conversion failed: {e}")) })?; - let text = String::from_utf8(payload_bytes).map_err(|e| { - Error::CannotStoreData(format!( - "Payload format is text but payload is invalid UTF-8: {e}" - )) + let text = String::from_utf8(bytes).map_err(|e| { + Error::CannotStoreData(format!("Payload is not valid UTF-8: {e}")) })?; buf.push_str(",payload_text=\""); write_field_string(buf, &text); buf.push('"'); } PayloadFormat::Base64 => { - let payload_bytes = message.payload.try_to_bytes().map_err(|e| { - Error::CannotStoreData(format!("Failed to convert payload to bytes: {e}")) + let bytes = message.payload.try_to_bytes().map_err(|e| { + Error::CannotStoreData(format!("Payload conversion failed: {e}")) })?; - let encoded = general_purpose::STANDARD.encode(&payload_bytes); + let encoded = general_purpose::STANDARD.encode(&bytes); buf.push_str(",payload_base64=\""); write_field_string(buf, &encoded); buf.push('"'); } } - // ── Timestamp ──────────────────────────────────────────────────────── - // message.timestamp is microseconds since Unix epoch. - // Fall back to now() when unset (0) so points are not stored at the - // Unix epoch (year 1970), which falls outside every range(start:-1h). let base_micros = if message.timestamp == 0 { + // timestamp == 0 is treated as "not set" — the iggy server overwrites + // the header timestamp on ingest, so live traffic never reaches here. + // This path fires only for externally-imported data stamped at UNIX_EPOCH. 
+ warn!( + "sink ID: {} — message offset={} has timestamp=0 (Unix epoch or unset); \ + substituting current wall-clock time", + self.id, message.offset + ); SystemTime::now() .duration_since(UNIX_EPOCH) .unwrap_or_default() @@ -479,20 +583,34 @@ impl InfluxDbSink { message.timestamp }; let ts = self.to_precision_timestamp(base_micros); + write!(buf, " {ts}").expect("infallible"); - { - use std::fmt::Write as _; - write!(buf, " {ts}").expect("write to String is infallible"); - } - - debug!( - "InfluxDB sink ID: {} point — offset={}, raw_ts={}, influx_ts={ts}", - self.id, message.offset, message.timestamp - ); - + debug!("sink ID: {} — offset={}, ts={ts}", self.id, message.offset); Ok(()) } + /// Build the newline-separated line-protocol body for a batch of messages. + /// Pure function — no I/O; extracted for testability. The empty-slice path is + /// unreachable in production (process_batch returns early when messages is empty) + /// but is exercised by unit tests for defensive completeness. + fn build_body( + &self, + topic_metadata: &TopicMetadata, + messages_metadata: &MessagesMetadata, + messages: &[ConsumedMessage], + ) -> Result { + // 1 KiB per message is a conservative estimate that accommodates JSON + // payloads without excessive reallocation. + let mut body = String::with_capacity(messages.len() * 1024); + for (i, msg) in messages.iter().enumerate() { + self.append_line(&mut body, topic_metadata, messages_metadata, msg)?; + if i + 1 < messages.len() { + body.push('\n'); + } + } + Ok(body) + } + async fn process_batch( &self, topic_metadata: &TopicMetadata, @@ -503,35 +621,26 @@ impl InfluxDbSink { return Ok(()); } - // Single buffer for the entire batch — reused across all messages. - // Pre-allocate a generous estimate (256 bytes per message) to avoid - // reallocation in the common case. The buffer is passed into - // append_line() which writes each line directly, with '\n' separators - // between lines. No per-message String is allocated. 
- let mut body = String::with_capacity(messages.len() * 256); - - for (i, message) in messages.iter().enumerate() { - if i > 0 { - body.push('\n'); - } - self.append_line(&mut body, topic_metadata, messages_metadata, message)?; - } + let body = self.build_body(topic_metadata, messages_metadata, messages)?; let client = self.get_client()?; - let url = self.write_url.clone().ok_or_else(|| { - Error::Connection("write_url not initialised — was open() called?".to_string()) + let url = self.write_url.as_ref().ok_or_else(|| { + Error::Connection("write_url not initialized — call open() first".to_string()) })?; - let token = self.config.token.expose_secret().to_owned(); - - // Convert once before sending — Bytes is reference-counted so any - // retry inside the middleware clones the pointer, not the payload data. - let body: Bytes = Bytes::from(body); + let auth = self + .auth_header + .as_ref() + .map(|s| s.expose_secret().as_str()) + .ok_or_else(|| { + Error::Connection("auth_header not initialised — was open() called?".to_string()) + })?; let response = client - .post(url) - .header("Authorization", format!("Token {token}")) + .post(url.as_str()) + .header("Authorization", auth) .header("Content-Type", "text/plain; charset=utf-8") - .body(body) + // into_bytes() hands the Vec directly to Bytes without copying. + .body(Bytes::from(body.into_bytes())) .send() .await .map_err(|e| Error::CannotStoreData(format!("InfluxDB write failed: {e}")))?; @@ -546,52 +655,55 @@ impl InfluxDbSink { .await .unwrap_or_else(|_| "failed to read response body".to_string()); - // Use PermanentHttpError for non-transient 4xx (400 Bad Request, 422 - // schema conflict, etc.) so consume() can skip the circuit breaker for - // these — they indicate a data/schema issue, not an infrastructure one. 
if iggy_connector_sdk::retry::is_transient_status(status) { Err(Error::CannotStoreData(format!( - "InfluxDB write failed with status {status}: {body_text}" + "InfluxDB write failed {status}: {body_text}" ))) } else { Err(Error::PermanentHttpError(format!( - "InfluxDB write failed with status {status}: {body_text}" + "InfluxDB write failed {status}: {body_text}" ))) } } } -// --------------------------------------------------------------------------- -// Sink trait implementation -// --------------------------------------------------------------------------- +// ── Sink trait ──────────────────────────────────────────────────────────────── #[async_trait] impl Sink for InfluxDbSink { async fn open(&mut self) -> Result<(), Error> { + const VALID_PRECISIONS: &[&str] = &["ns", "us", "ms", "s"]; + if !VALID_PRECISIONS.contains(&self.precision.as_str()) { + return Err(Error::InvalidConfigValue(format!( + "unknown precision {:?}; valid values are \"ns\", \"us\", \"ms\", \"s\"", + self.precision + ))); + } + + if let InfluxDbSinkConfig::V2(c) = &self.config + && c.org.trim().is_empty() + { + return Err(Error::InvalidConfigValue( + "V2 sink config requires a non-empty 'org'".to_string(), + )); + } + info!( - "Opening InfluxDB sink connector with ID: {}. Bucket: {}, org: {}", - self.id, self.config.bucket, self.config.org + "Opening InfluxDB sink ID: {} (version={})", + self.id, + self.config.version_label() ); - // Build the raw client first and use it for the startup connectivity - // check. The connectivity retry loop uses separate delay bounds - // (open_retry_max_delay) from the per-write middleware retries, so - // we keep them independent rather than routing health checks through - // the write-tuned middleware. 
let raw_client = self.build_raw_client()?; - let health_url = self.build_health_url()?; check_connectivity_with_retry( &raw_client, - health_url, + self.config.build_health_url()?, "InfluxDB sink", self.id, &ConnectivityConfig { - max_open_retries: self - .config - .max_open_retries - .unwrap_or(DEFAULT_MAX_OPEN_RETRIES), + max_open_retries: self.config.max_open_retries(), open_retry_max_delay: parse_duration( - self.config.open_retry_max_delay.as_deref(), + self.config.open_retry_max_delay(), DEFAULT_OPEN_RETRY_MAX_DELAY, ), retry_delay: self.retry_delay, @@ -599,30 +711,18 @@ impl Sink for InfluxDbSink { ) .await?; - // Wrap in the retry middleware for all subsequent write operations. - // The middleware handles transient 429 / 5xx retries with - // exponential back-off, jitter, and Retry-After header support. - let max_retries = self.get_max_retries(); - let write_retry_max_delay = parse_duration( - self.config.retry_max_delay.as_deref(), - DEFAULT_RETRY_MAX_DELAY, - ); self.client = Some(build_retry_client( raw_client, - max_retries, + self.config.max_retries().max(1), self.retry_delay, - write_retry_max_delay, + parse_duration(self.config.retry_max_delay(), DEFAULT_RETRY_MAX_DELAY), "InfluxDB", )); - // Cache once — both are derived purely from config fields that - // never change at runtime. 
- self.write_url = Some(self.build_write_url()?); + self.write_url = Some(self.config.build_write_url()?); + self.auth_header = Some(SecretBox::new(Box::new(self.config.auth_header()))); - info!( - "InfluxDB sink connector with ID: {} opened successfully", - self.id - ); + info!("InfluxDB sink ID: {} opened successfully", self.id); Ok(()) } @@ -632,52 +732,41 @@ impl Sink for InfluxDbSink { messages_metadata: MessagesMetadata, messages: Vec, ) -> Result<(), Error> { - let batch_size = self.config.batch_size.unwrap_or(500) as usize; - let total_messages = messages.len(); + let total = messages.len(); - // Skip writes entirely if circuit breaker is open if self.circuit_breaker.is_open().await { warn!( - "InfluxDB sink ID: {} — circuit breaker is OPEN. \ - Skipping {} messages to avoid hammering a down InfluxDB.", - self.id, total_messages + "InfluxDB sink ID: {} — circuit breaker OPEN, skipping {} messages", + self.id, total ); - // Return an error so the runtime knows messages were not written return Err(Error::CannotStoreData( - "Circuit breaker is open — InfluxDB write skipped".to_string(), + "Circuit breaker is open".to_string(), )); } - // Collect the first batch error rather than silently dropping let mut first_error: Option = None; - for batch in messages.chunks(batch_size.max(1)) { + for batch in messages.chunks(self.batch_size_limit) { match self .process_batch(topic_metadata, &messages_metadata, batch) .await { Ok(()) => { + self.circuit_breaker.record_success(); self.write_success - .fetch_add(batch.len() as u64, Ordering::Relaxed); + .fetch_add(batch.len() as u64, Ordering::AcqRel); } Err(e) => { - // Only count transient/connectivity failures toward the - // circuit breaker. PermanentHttpError (400, 422, etc.) are - // data/schema issues that retrying will not fix; tripping - // the circuit on them would block valid subsequent messages. 
if !matches!(e, Error::PermanentHttpError(_)) { self.circuit_breaker.record_failure().await; } self.write_errors - .fetch_add(batch.len() as u64, Ordering::Relaxed); + .fetch_add(batch.len() as u64, Ordering::AcqRel); error!( - "InfluxDB sink ID: {} failed to write batch of {} messages: {e}", + "InfluxDB sink ID: {} failed batch of {}: {e}", self.id, batch.len() ); - - // Capture first error; continue attempting remaining - // batches to maximise data delivery, but record the failure. if first_error.is_none() { first_error = Some(e); } @@ -685,76 +774,54 @@ impl Sink for InfluxDbSink { } } - // Only reset the circuit breaker if every batch in this consume() call - // succeeded. Resetting inside the loop means a later successful batch - // would clear the failure counter accumulated by an earlier failed one, - // masking repeated partial failures and preventing the circuit from - // ever tripping. - if first_error.is_none() { - self.circuit_breaker.record_success(); - } - let total_processed = self .messages_attempted - .fetch_add(total_messages as u64, Ordering::Relaxed) - + total_messages as u64; + .fetch_add(total as u64, Ordering::AcqRel) + + total as u64; if self.verbose { info!( - "InfluxDB sink ID: {} processed {} messages. \ - Total processed: {}, Success: {}, write errors: {}", + "InfluxDB sink ID: {} — processed={total}, cumulative={total_processed}, \ + success={}, errors={}", self.id, - total_messages, - total_processed, - self.write_success.load(Ordering::Relaxed), - self.write_errors.load(Ordering::Relaxed), + self.write_success.load(Ordering::Acquire), + self.write_errors.load(Ordering::Acquire), ); } else { debug!( - "InfluxDB sink ID: {} processed {} messages. 
\ - Total processed: {}, Success: {}, write errors: {}", + "InfluxDB sink ID: {} — processed={total}, cumulative={total_processed}, \ + success={}, errors={}", self.id, - total_messages, - total_processed, - self.write_success.load(Ordering::Relaxed), - self.write_errors.load(Ordering::Relaxed), + self.write_success.load(Ordering::Acquire), + self.write_errors.load(Ordering::Acquire), ); } - // Propagate the first batch error to the runtime so it can - // decide whether to retry, halt, or dead-letter — instead of returning Ok(()) - // and silently losing messages. - if let Some(err) = first_error { - return Err(err); - } - - Ok(()) + first_error.map_or(Ok(()), Err) } async fn close(&mut self) -> Result<(), Error> { - self.client = None; // release connection pool + self.client = None; info!( - "InfluxDB sink connector with ID: {} closed. Processed: {}, Success: {}, errors: {}", + "InfluxDB sink ID: {} closed — processed={}, success={}, errors={}", self.id, - self.messages_attempted.load(Ordering::Relaxed), - self.write_success.load(Ordering::Relaxed), - self.write_errors.load(Ordering::Relaxed), + self.messages_attempted.load(Ordering::Acquire), + self.write_success.load(Ordering::Acquire), + self.write_errors.load(Ordering::Acquire), ); Ok(()) } } -// --------------------------------------------------------------------------- -// Unit tests -// --------------------------------------------------------------------------- +// ── Tests ───────────────────────────────────────────────────────────────────── #[cfg(test)] mod tests { use super::*; use iggy_connector_sdk::{MessagesMetadata, Schema, TopicMetadata}; - fn make_config() -> InfluxDbSinkConfig { - InfluxDbSinkConfig { + fn make_v2_config() -> InfluxDbSinkConfig { + InfluxDbSinkConfig::V2(V2SinkConfig { url: "http://localhost:8086".to_string(), org: "test_org".to_string(), bucket: "test_bucket".to_string(), @@ -778,11 +845,38 @@ mod tests { retry_max_delay: Some("1s".to_string()), circuit_breaker_threshold: 
Some(5), circuit_breaker_cool_down: Some("30s".to_string()), - } + }) + } + + fn make_v3_config() -> InfluxDbSinkConfig { + InfluxDbSinkConfig::V3(V3SinkConfig { + url: "http://localhost:8181".to_string(), + db: "test_db".to_string(), + token: SecretString::from("test_token"), + measurement: Some("test_measurement".to_string()), + precision: Some("us".to_string()), + batch_size: None, + include_metadata: Some(true), + include_checksum: Some(true), + include_origin_timestamp: Some(true), + include_stream_tag: Some(true), + include_topic_tag: Some(true), + include_partition_tag: Some(true), + payload_format: Some("json".to_string()), + verbose_logging: None, + max_retries: Some(3), + retry_delay: Some("100ms".to_string()), + timeout: Some("5s".to_string()), + max_open_retries: Some(3), + open_retry_max_delay: Some("5s".to_string()), + retry_max_delay: Some("1s".to_string()), + circuit_breaker_threshold: Some(5), + circuit_breaker_cool_down: Some("30s".to_string()), + }) } fn make_sink() -> InfluxDbSink { - InfluxDbSink::new(1, make_config()) + InfluxDbSink::new(1, make_v2_config()) } fn make_topic_metadata() -> TopicMetadata { @@ -812,49 +906,196 @@ mod tests { } } - // ── to_precision_timestamp ─────────────────────────────────────────── + // ── config deserialization ──────────────────────────────────────────── + + #[test] + fn sink_config_without_version_defaults_to_v2() { + let json = r#"{"url":"http://localhost:8086","org":"o","bucket":"b","token":"t"}"#; + let raw: serde_json::Value = serde_json::from_str(json).unwrap(); + let config: InfluxDbSinkConfig = serde_json::from_value(raw).unwrap(); + assert!(matches!(config, InfluxDbSinkConfig::V2(_))); + } + + #[test] + fn sink_config_with_explicit_v2_version_deserializes_v2() { + let json = + r#"{"version":"v2","url":"http://localhost:8086","org":"o","bucket":"b","token":"t"}"#; + let config: InfluxDbSinkConfig = serde_json::from_str(json).unwrap(); + assert!(matches!(config, InfluxDbSinkConfig::V2(_))); + } + + 
#[test] + fn sink_config_with_version_v3_deserializes_v3() { + let json = r#"{"version":"v3","url":"http://localhost:8181","db":"d","token":"t"}"#; + let config: InfluxDbSinkConfig = serde_json::from_str(json).unwrap(); + assert!(matches!(config, InfluxDbSinkConfig::V3(_))); + } + + #[test] + fn sink_config_unknown_version_returns_error() { + let json = r#"{"version":"v9","url":"http://x","org":"o","bucket":"b","token":"t"}"#; + let result = serde_json::from_str::(json); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("unknown InfluxDB version") + ); + } + + #[test] + fn sink_config_toml_without_version_defaults_to_v2() { + // Connectors load config from TOML files in production. Verify the + // backward-compat path works with TOML, not just JSON. + let toml_str = r#" +url = "http://localhost:8086" +org = "myorg" +bucket = "mybucket" +token = "t" +"#; + let cfg: InfluxDbSinkConfig = toml::from_str(toml_str).unwrap(); + assert!( + matches!(cfg, InfluxDbSinkConfig::V2(_)), + "TOML config without version= must default to V2" + ); + } + + #[test] + fn sink_config_toml_with_version_v3_deserializes_v3() { + let toml_str = r#" +version = "v3" +url = "http://localhost:8181" +db = "mydb" +token = "t" +"#; + let cfg: InfluxDbSinkConfig = toml::from_str(toml_str).unwrap(); + assert!(matches!(cfg, InfluxDbSinkConfig::V3(_))); + } + + // ── config ──────────────────────────────────────────────────────────── + + #[test] + fn v2_auth_header_uses_token_scheme() { + let config = make_v2_config(); + assert_eq!(config.auth_header(), "Token test_token"); + } + + #[test] + fn v3_auth_header_uses_bearer_scheme() { + let config = make_v3_config(); + assert_eq!(config.auth_header(), "Bearer test_token"); + } + + #[test] + fn v2_write_url_contains_org_bucket_precision() { + let config = make_v2_config(); + let url = config.build_write_url().unwrap(); + let q = url.query().unwrap_or(""); + assert!(url.path().ends_with("/api/v2/write")); + 
assert!(q.contains("org=test_org")); + assert!(q.contains("bucket=test_bucket")); + assert!(q.contains("precision=us")); + } + + #[test] + fn v3_write_url_contains_db_and_mapped_precision() { + let config = make_v3_config(); + let url = config.build_write_url().unwrap(); + let q = url.query().unwrap_or(""); + assert!(url.path().ends_with("/api/v3/write_lp")); + assert!(q.contains("db=test_db")); + assert!(q.contains("precision=microsecond")); + assert!(!q.contains("org=")); + assert!(!q.contains("bucket=")); + } + + // ── to_precision_timestamp ──────────────────────────────────────────── #[test] fn precision_ns_multiplies_by_1000() { - let mut config = make_config(); - config.precision = Some("ns".to_string()); + let config = InfluxDbSinkConfig::V2(V2SinkConfig { + precision: Some("ns".to_string()), + ..make_v2_config().into_v2().unwrap() + }); let sink = InfluxDbSink::new(1, config); assert_eq!(sink.to_precision_timestamp(1_000_000), 1_000_000_000); } #[test] fn precision_us_is_identity() { - let mut config = make_config(); - config.precision = Some("us".to_string()); - let sink = InfluxDbSink::new(1, config); - assert_eq!(sink.to_precision_timestamp(1_234_567), 1_234_567); + assert_eq!(make_sink().to_precision_timestamp(1_234_567), 1_234_567); } #[test] fn precision_ms_divides_by_1000() { - let mut config = make_config(); - config.precision = Some("ms".to_string()); + let config = InfluxDbSinkConfig::V2(V2SinkConfig { + precision: Some("ms".to_string()), + ..make_v2_config().into_v2().unwrap() + }); let sink = InfluxDbSink::new(1, config); assert_eq!(sink.to_precision_timestamp(5_000_000), 5_000); } #[test] fn precision_s_divides_by_1_000_000() { - let mut config = make_config(); - config.precision = Some("s".to_string()); + let config = InfluxDbSinkConfig::V2(V2SinkConfig { + precision: Some("s".to_string()), + ..make_v2_config().into_v2().unwrap() + }); let sink = InfluxDbSink::new(1, config); assert_eq!(sink.to_precision_timestamp(7_000_000), 7); } - #[test] 
- fn precision_unknown_falls_back_to_us() { - let mut config = make_config(); - config.precision = Some("xx".to_string()); - let sink = InfluxDbSink::new(1, config); - assert_eq!(sink.to_precision_timestamp(999), 999); + #[tokio::test] + async fn open_rejects_unknown_precision() { + // Unknown precision must fail at open() rather than silently defaulting + // to microseconds, which would timestamp data at the wrong precision. + let config = InfluxDbSinkConfig::V2(V2SinkConfig { + url: "http://localhost:18086".to_string(), + precision: Some("xx".to_string()), + ..make_v2_config().into_v2().unwrap() + }); + let mut sink = InfluxDbSink::new(1, config); + let err = sink.open().await.unwrap_err(); + assert!( + matches!(err, Error::InvalidConfigValue(_)), + "expected InvalidConfigValue, got {err:?}" + ); + } + + #[tokio::test] + async fn open_rejects_empty_org_in_v2() { + // An empty org generates `?org=` in the write URL, which InfluxDB V2 + // rejects at runtime with a 400. Catch it eagerly at open(). 
+ let config = InfluxDbSinkConfig::V2(V2SinkConfig { + url: "http://localhost:18086".to_string(), + org: "".to_string(), + ..make_v2_config().into_v2().unwrap() + }); + let mut sink = InfluxDbSink::new(1, config); + let err = sink.open().await.unwrap_err(); + assert!( + matches!(err, Error::InvalidConfigValue(_)), + "expected InvalidConfigValue for empty org, got {err:?}" + ); + } + + #[tokio::test] + async fn open_rejects_whitespace_only_org_in_v2() { + let config = InfluxDbSinkConfig::V2(V2SinkConfig { + url: "http://localhost:18086".to_string(), + org: " ".to_string(), + ..make_v2_config().into_v2().unwrap() + }); + let mut sink = InfluxDbSink::new(1, config); + assert!(matches!( + sink.open().await, + Err(Error::InvalidConfigValue(_)) + )); } - // ── line-protocol escaping ─────────────────────────────────────────── + // ── line-protocol escaping ──────────────────────────────────────────── #[test] fn measurement_escapes_comma_space_backslash() { @@ -869,6 +1110,7 @@ mod tests { write_measurement(&mut buf, "meas\nurea\rment"); assert_eq!(buf, "meas\\nurea\\rment"); } + #[test] fn tag_value_escapes_equals_sign() { let mut buf = String::new(); @@ -892,733 +1134,788 @@ mod tests { #[test] fn field_string_escapes_newlines() { - // A \n inside a string field value would split the line-protocol record. 
let mut buf = String::new(); write_field_string(&mut buf, "line1\nline2\r"); assert_eq!(buf, "line1\\nline2\\r"); } - // ── append_line error paths ────────────────────────────────────────── + + // ── append_line ─────────────────────────────────────────────────────── #[test] fn append_line_invalid_json_payload_returns_error() { - let mut config = make_config(); - config.payload_format = Some("json".to_string()); - let sink = InfluxDbSink::new(1, config); - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - // Raw bytes that are not valid JSON - let msg = make_message(iggy_connector_sdk::Payload::Raw(b"not json!".to_vec())); - + let sink = make_sink(); let mut buf = String::new(); - let result = sink.append_line(&mut buf, &topic, &meta, &msg); - assert!(result.is_err(), "invalid JSON payload should fail"); - let err = result.unwrap_err().to_string(); + let result = sink.append_line( + &mut buf, + &make_topic_metadata(), + &make_messages_metadata(), + &make_message(iggy_connector_sdk::Payload::Raw(b"not json!".to_vec())), + ); + assert!(result.is_err()); assert!( - err.contains("invalid JSON") || err.contains("JSON"), - "error should mention JSON: {err}" + result + .unwrap_err() + .to_string() + .to_lowercase() + .contains("json") ); } #[test] fn append_line_invalid_utf8_text_payload_returns_error() { - let mut config = make_config(); - config.payload_format = Some("text".to_string()); + let config = InfluxDbSinkConfig::V2(V2SinkConfig { + payload_format: Some("text".to_string()), + ..make_v2_config().into_v2().unwrap() + }); let sink = InfluxDbSink::new(1, config); - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - // Invalid UTF-8 sequence - let msg = make_message(iggy_connector_sdk::Payload::Raw(vec![0xff, 0xfe, 0xfd])); - let mut buf = String::new(); - let result = sink.append_line(&mut buf, &topic, &meta, &msg); - assert!(result.is_err(), "invalid UTF-8 payload should fail"); - let err = 
result.unwrap_err().to_string(); + let result = sink.append_line( + &mut buf, + &make_topic_metadata(), + &make_messages_metadata(), + &make_message(iggy_connector_sdk::Payload::Raw(vec![0xff, 0xfe, 0xfd])), + ); + assert!(result.is_err()); assert!( - err.contains("UTF-8") || err.contains("utf"), - "error should mention UTF-8: {err}" + result + .unwrap_err() + .to_string() + .to_uppercase() + .contains("UTF") ); } #[test] fn append_line_valid_json_payload_succeeds() { let sink = make_sink(); - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - let msg = make_message(iggy_connector_sdk::Payload::Raw(b"{\"k\":1}".to_vec())); - let mut buf = String::new(); - assert!(sink.append_line(&mut buf, &topic, &meta, &msg).is_ok()); - assert!(buf.contains("payload_json="), "should have json field"); + assert!( + sink.append_line( + &mut buf, + &make_topic_metadata(), + &make_messages_metadata(), + &make_message(iggy_connector_sdk::Payload::Raw(b"{\"k\":1}".to_vec())), + ) + .is_ok() + ); + assert!(buf.contains("payload_json=")); } #[test] fn append_line_base64_payload_succeeds() { - let mut config = make_config(); - config.payload_format = Some("base64".to_string()); + let config = InfluxDbSinkConfig::V2(V2SinkConfig { + payload_format: Some("base64".to_string()), + ..make_v2_config().into_v2().unwrap() + }); let sink = InfluxDbSink::new(1, config); - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - let msg = make_message(iggy_connector_sdk::Payload::Raw(b"binary data".to_vec())); - let mut buf = String::new(); - assert!(sink.append_line(&mut buf, &topic, &meta, &msg).is_ok()); - assert!(buf.contains("payload_base64="), "should have base64 field"); + assert!( + sink.append_line( + &mut buf, + &make_topic_metadata(), + &make_messages_metadata(), + &make_message(iggy_connector_sdk::Payload::Raw(b"binary data".to_vec())), + ) + .is_ok() + ); + assert!(buf.contains("payload_base64=")); } #[test] fn 
append_line_offset_tag_always_present() { let sink = make_sink(); - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - let msg = make_message(iggy_connector_sdk::Payload::Raw(b"{\"x\":1}".to_vec())); - let mut buf = String::new(); - sink.append_line(&mut buf, &topic, &meta, &msg).unwrap(); - // offset=7 should appear as a tag - assert!( - buf.contains(",offset=7"), - "offset tag should always be present" - ); + sink.append_line( + &mut buf, + &make_topic_metadata(), + &make_messages_metadata(), + &make_message(iggy_connector_sdk::Payload::Raw(b"{\"x\":1}".to_vec())), + ) + .unwrap(); + assert!(buf.contains(",offset=7")); } #[test] fn append_line_includes_measurement_name() { - let sink = make_sink(); // measurement = "test_measurement" - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - let msg = make_message(iggy_connector_sdk::Payload::Raw(b"{\"x\":1}".to_vec())); - - let mut buf = String::new(); - sink.append_line(&mut buf, &topic, &meta, &msg).unwrap(); - assert!( - buf.starts_with("test_measurement"), - "line should start with measurement name" - ); - } - - #[test] - fn append_line_partial_metadata_tags_suppressed() { - let mut config = make_config(); - config.include_stream_tag = Some(false); - config.include_topic_tag = Some(false); - config.include_partition_tag = Some(false); - let sink = InfluxDbSink::new(1, config); - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - let msg = make_message(iggy_connector_sdk::Payload::Raw(b"{\"x\":1}".to_vec())); - + let sink = make_sink(); let mut buf = String::new(); - sink.append_line(&mut buf, &topic, &meta, &msg).unwrap(); - assert!(!buf.contains(",stream="), "stream tag should be suppressed"); - assert!(!buf.contains(",topic="), "topic tag should be suppressed"); - assert!( - !buf.contains(",partition="), - "partition tag should be suppressed" - ); - // Values should appear as fields instead - assert!( - buf.contains("iggy_stream="), - 
"stream should appear as field" - ); - assert!(buf.contains("iggy_topic="), "topic should appear as field"); + sink.append_line( + &mut buf, + &make_topic_metadata(), + &make_messages_metadata(), + &make_message(iggy_connector_sdk::Payload::Raw(b"{\"x\":1}".to_vec())), + ) + .unwrap(); + assert!(buf.starts_with("test_measurement")); } - // ── circuit breaker integration ────────────────────────────────────── - - #[tokio::test] - async fn consume_returns_error_when_circuit_is_open() { - let mut config = make_config(); - // Threshold of 1 means circuit opens after first failure - config.circuit_breaker_threshold = Some(1); - config.circuit_breaker_cool_down = Some("60s".to_string()); - let sink = InfluxDbSink::new(1, config); - - // Force the circuit open - sink.circuit_breaker.record_failure().await; - assert!(sink.circuit_breaker.is_open().await); + // ── V3 append_line parity ───────────────────────────────────────────── + #[test] + fn v3_append_line_produces_same_line_protocol_as_v2() { + // Only URL, auth header, and write endpoint differ between V2 and V3. + // The line-protocol body itself must be identical so existing tests + // cover both configs implicitly. + let v2_sink = InfluxDbSink::new(1, make_v2_config()); + let v3_sink = InfluxDbSink::new(2, make_v3_config()); + let msg = make_message(iggy_connector_sdk::Payload::Raw(b"{\"k\":1}".to_vec())); let topic = make_topic_metadata(); let meta = make_messages_metadata(); - let result = sink.consume(&topic, meta, vec![]).await; - - assert!(result.is_err(), "consume should fail when circuit is open"); - let err = result.unwrap_err().to_string(); - assert!( - err.to_lowercase().contains("circuit breaker"), - "error should mention circuit breaker: {err}" - ); - } - - #[tokio::test] - async fn consume_succeeds_with_empty_messages_when_circuit_closed() { - // Open the connector so write_url is set (needed if non-empty batch) - // With empty messages, process_batch returns Ok(()) immediately. 
- let sink = make_sink(); - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - // Empty message list — no HTTP call needed, should succeed even without open() - let result = sink.consume(&topic, meta, vec![]).await; - assert!(result.is_ok(), "empty consume should succeed: {:?}", result); - } - // ── close() ────────────────────────────────────────────────────────── + let mut v2_buf = String::new(); + let mut v3_buf = String::new(); + v2_sink + .append_line(&mut v2_buf, &topic, &meta, &msg) + .unwrap(); + v3_sink + .append_line(&mut v3_buf, &topic, &meta, &msg) + .unwrap(); - #[tokio::test] - async fn close_drops_client_and_logs() { - let mut sink = make_sink(); - // close() before open() should be safe - let result = sink.close().await; - assert!(result.is_ok()); - assert!(sink.client.is_none(), "client should be None after close"); + assert_eq!(v2_buf, v3_buf); } - // ── payload_format fallback ────────────────────────────────────────── + // ── build_body batching logic ───────────────────────────────────────── #[test] - fn unknown_payload_format_falls_back_to_json() { - assert_eq!( - PayloadFormat::from_config(Some("unknown_format")), - PayloadFormat::Json - ); + fn build_body_empty_messages_returns_empty_string() { + let sink = make_sink(); + let body = sink + .build_body(&make_topic_metadata(), &make_messages_metadata(), &[]) + .unwrap(); + assert!(body.is_empty()); } #[test] - fn payload_format_aliases() { - assert_eq!( - PayloadFormat::from_config(Some("utf8")), - PayloadFormat::Text - ); - assert_eq!( - PayloadFormat::from_config(Some("raw")), - PayloadFormat::Base64 - ); - assert_eq!(PayloadFormat::from_config(None), PayloadFormat::Json); + fn build_body_single_message_no_leading_or_trailing_newline() { + let sink = make_sink(); + let msg = make_message(iggy_connector_sdk::Payload::Raw(b"{\"k\":1}".to_vec())); + let body = sink + .build_body(&make_topic_metadata(), &make_messages_metadata(), &[msg]) + .unwrap(); + 
assert!(!body.is_empty()); + assert!(!body.starts_with('\n')); + assert!(!body.ends_with('\n')); + assert_eq!(body.lines().count(), 1); } - // ── payload_format cached at construction ──────────────────────────────────── - #[test] - fn payload_format_resolved_at_construction_text() { - let mut config = make_config(); - config.payload_format = Some("text".to_string()); - let sink = InfluxDbSink::new(1, config); - // The cached field must reflect what was in the config at new() time. - assert_eq!(sink.payload_format(), PayloadFormat::Text); + fn build_body_multiple_messages_newline_separated() { + let sink = make_sink(); + let msgs: Vec<_> = (0..3) + .map(|_| make_message(iggy_connector_sdk::Payload::Raw(b"{\"k\":1}".to_vec()))) + .collect(); + let body = sink + .build_body(&make_topic_metadata(), &make_messages_metadata(), &msgs) + .unwrap(); + // 3 records → 2 separating newlines + assert_eq!(body.lines().count(), 3); + assert_eq!(body.chars().filter(|&c| c == '\n').count(), 2); } #[test] - fn payload_format_resolved_at_construction_base64() { - let mut config = make_config(); - config.payload_format = Some("base64".to_string()); + fn build_body_batch_size_one_produces_single_line() { + // When batch_size=1, consume() calls process_batch with a single-element + // slice. Verify that build_body returns exactly one line (no newlines). 
+ let config = InfluxDbSinkConfig::V2(V2SinkConfig { + batch_size: Some(1), + ..make_v2_config().into_v2().unwrap() + }); let sink = InfluxDbSink::new(1, config); - assert_eq!(sink.payload_format(), PayloadFormat::Base64); - } + assert_eq!(sink.batch_size_limit, 1); - #[test] - fn payload_format_resolved_at_construction_none_defaults_to_json() { - let mut config = make_config(); - config.payload_format = None; - let sink = InfluxDbSink::new(1, config); - assert_eq!(sink.payload_format(), PayloadFormat::Json); + let msg = make_message(iggy_connector_sdk::Payload::Raw(b"{\"k\":1}".to_vec())); + let body = sink + .build_body(&make_topic_metadata(), &make_messages_metadata(), &[msg]) + .unwrap(); + assert_eq!(body.lines().count(), 1); + assert!(!body.contains('\n')); } #[test] - fn payload_format_resolved_at_construction_unknown_defaults_to_json() { - let mut config = make_config(); - config.payload_format = Some("bogus".to_string()); + fn build_body_exactly_batch_size_limit_messages() { + // Edge case: exactly batch_size messages in one call. + let config = InfluxDbSinkConfig::V2(V2SinkConfig { + batch_size: Some(3), + ..make_v2_config().into_v2().unwrap() + }); let sink = InfluxDbSink::new(1, config); - assert_eq!(sink.payload_format(), PayloadFormat::Json); + let msgs: Vec<_> = (0..3) + .map(|_| make_message(iggy_connector_sdk::Payload::Raw(b"{\"k\":1}".to_vec()))) + .collect(); + let body = sink + .build_body(&make_topic_metadata(), &make_messages_metadata(), &msgs) + .unwrap(); + assert_eq!(body.lines().count(), 3); } +} - // ── append_line — Payload::Json fast path ─────────────────────────────────── - - #[test] - fn append_line_native_json_payload_uses_fast_path() { - // Payload::Json is the new fast path (single simd_json serialisation pass). 
- let sink = make_sink(); // payload_format = "json" - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - - let value = simd_json::json!({"sensor": "temp", "value": 23.5_f64}); - let msg = make_message(iggy_connector_sdk::Payload::Json(value)); - - let mut buf = String::new(); - sink.append_line(&mut buf, &topic, &meta, &msg).unwrap(); +// ── Helper for tests: destructure config variants ───────────────────────────── - assert!(buf.contains("payload_json="), "field name must be present"); - // The compact JSON must contain the key and value somewhere in the field. - assert!( - buf.contains("sensor") && buf.contains("temp"), - "JSON content must survive serialisation: {buf}" - ); +impl InfluxDbSinkConfig { + #[cfg(test)] + fn into_v2(self) -> Option { + match self { + Self::V2(c) => Some(c), + Self::V3(_) => None, + } } +} - #[test] - fn append_line_native_json_and_raw_json_produce_equivalent_output() { - // Both Payload::Json and Payload::Raw(valid_json_bytes) must produce the - // same logical JSON value in the line-protocol field, even though they - // travel through different code paths. 
- let mut config = make_config(); - config.payload_format = Some("json".to_string()); - let sink = InfluxDbSink::new(1, config); - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - - let raw_bytes = b"{\"k\":1}".to_vec(); - - // Fast path: already-parsed OwnedValue - let native_val = simd_json::owned::to_value(&mut raw_bytes.clone()).unwrap(); - let msg_native = make_message(iggy_connector_sdk::Payload::Json(native_val)); - let mut buf_native = String::new(); - sink.append_line(&mut buf_native, &topic, &meta, &msg_native) - .unwrap(); +// ── HTTP integration tests ──────────────────────────────────────────────────── - // Fallback path: raw bytes - let msg_raw = make_message(iggy_connector_sdk::Payload::Raw(raw_bytes)); - let mut buf_raw = String::new(); - sink.append_line(&mut buf_raw, &topic, &meta, &msg_raw) - .unwrap(); - - // Extract just the payload_json field value from each line. - // Both should encode {"k":1} (possibly different key ordering, both valid). - let extract_json_field = |line: &str| -> serde_json::Value { - // Find the payload_json="..." section and parse it. - let start = line.find("payload_json=\"").unwrap() + "payload_json=\"".len(); - // Walk forward to the closing unescaped quote. 
- let remainder = &line[start..]; - let mut end = 0; - let chars: Vec = remainder.chars().collect(); - while end < chars.len() { - if chars[end] == '"' && (end == 0 || chars[end - 1] != '\\') { - break; - } - end += 1; - } - let json_str = &remainder[..end].replace("\\\"", "\"").replace("\\\\", "\\"); - serde_json::from_str(json_str).unwrap() - }; - - let val_native = extract_json_field(&buf_native); - let val_raw = extract_json_field(&buf_raw); - assert_eq!(val_native, val_raw, "fast-path and fallback must agree"); +#[cfg(test)] +mod http_tests { + use super::*; + use axum::Router; + use axum::extract::Request; + use axum::http::{HeaderMap, StatusCode}; + use axum::routing::{get, post}; + use iggy_connector_sdk::{MessagesMetadata, Schema, Sink, TopicMetadata}; + use secrecy::SecretString; + use std::sync::Arc; + use std::sync::atomic::{AtomicU32, Ordering}; + use tokio::sync::Mutex; + + // ── test helpers ───────────────────────────────────────────────────────── + + async fn start_server(router: Router) -> String { + let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap(); + let port = listener.local_addr().unwrap().port(); + tokio::spawn(async move { + axum::serve(listener, router).await.unwrap(); + }); + format!("http://127.0.0.1:{port}") + } + + /// Minimal V2 config that points at `url`, has batch_size=2, + /// and uses 1-retry / fast timeouts to keep tests quick. 
+ fn v2_config(url: &str) -> InfluxDbSinkConfig { + InfluxDbSinkConfig::V2(V2SinkConfig { + url: url.to_string(), + org: "org".to_string(), + bucket: "bucket".to_string(), + token: SecretString::from("tok"), + measurement: Some("m".to_string()), + precision: Some("us".to_string()), + batch_size: Some(2), + include_metadata: Some(false), + include_checksum: Some(false), + include_origin_timestamp: Some(false), + include_stream_tag: Some(false), + include_topic_tag: Some(false), + include_partition_tag: Some(false), + payload_format: Some("json".to_string()), + verbose_logging: None, + max_retries: Some(1), + retry_delay: Some("1ms".to_string()), + timeout: Some("5s".to_string()), + max_open_retries: Some(1), + open_retry_max_delay: Some("10ms".to_string()), + retry_max_delay: Some("10ms".to_string()), + circuit_breaker_threshold: Some(5), + circuit_breaker_cool_down: Some("30s".to_string()), + }) } - #[test] - fn append_line_text_payload_uses_payload_text_field() { - let mut config = make_config(); - config.payload_format = Some("text".to_string()); - let sink = InfluxDbSink::new(1, config); - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - let msg = make_message(iggy_connector_sdk::Payload::Text( - "hello influx".to_string(), - )); - - let mut buf = String::new(); - sink.append_line(&mut buf, &topic, &meta, &msg).unwrap(); - assert!(buf.contains("payload_text="), "field name must be present"); - assert!(buf.contains("hello influx"), "content must be preserved"); + fn v3_config(url: &str) -> InfluxDbSinkConfig { + InfluxDbSinkConfig::V3(V3SinkConfig { + url: url.to_string(), + db: "db".to_string(), + token: SecretString::from("tok"), + measurement: Some("m".to_string()), + precision: Some("us".to_string()), + batch_size: Some(2), + include_metadata: Some(false), + include_checksum: Some(false), + include_origin_timestamp: Some(false), + include_stream_tag: Some(false), + include_topic_tag: Some(false), + include_partition_tag: 
Some(false), + payload_format: Some("json".to_string()), + verbose_logging: None, + max_retries: Some(1), + retry_delay: Some("1ms".to_string()), + timeout: Some("5s".to_string()), + max_open_retries: Some(1), + open_retry_max_delay: Some("10ms".to_string()), + retry_max_delay: Some("10ms".to_string()), + circuit_breaker_threshold: Some(5), + circuit_breaker_cool_down: Some("30s".to_string()), + }) } - #[test] - fn append_line_text_payload_from_raw_bytes() { - let mut config = make_config(); - config.payload_format = Some("text".to_string()); - let sink = InfluxDbSink::new(1, config); - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - let msg = make_message(iggy_connector_sdk::Payload::Raw(b"raw_as_text".to_vec())); - - let mut buf = String::new(); - sink.append_line(&mut buf, &topic, &meta, &msg).unwrap(); - assert!(buf.contains("payload_text=")); - assert!(buf.contains("raw_as_text")); + /// Build a mock app that responds 200 to GET /health and `write_status` to + /// POST on the V2 write endpoint. + fn v2_app(write_status: StatusCode) -> Router { + Router::new() + .route("/health", get(|| async { StatusCode::OK })) + .route("/api/v2/write", post(move || async move { write_status })) } - // ── append_line — timestamp zero fallback ─────────────────────────────────── - - #[test] - fn append_line_zero_timestamp_falls_back_to_now() { - let sink = make_sink(); - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - // message.timestamp == 0 triggers the now() fallback. - let mut msg = make_message(iggy_connector_sdk::Payload::Raw(b"{\"x\":1}".to_vec())); - msg.timestamp = 0; - - let mut buf = String::new(); - sink.append_line(&mut buf, &topic, &meta, &msg).unwrap(); - - // Extract the trailing timestamp from the line-protocol line. 
- // Format: "measurement,...,tag=v field=v timestamp\n" - let ts_str = buf.trim().rsplit(' ').next().unwrap(); - let ts: u64 = ts_str.parse().expect("timestamp should be a u64"); - - // Must be after Unix epoch (year 1970) and before year 2100. - let year_2100_us = 4_102_444_800_000_000u64; // approx - assert!(ts > 0, "zero timestamp must produce a positive fallback"); - assert!( - ts < year_2100_us, - "fallback timestamp is unreasonably large: {ts}" - ); + fn v3_app(write_status: StatusCode) -> Router { + Router::new() + .route("/health", get(|| async { StatusCode::OK })) + .route( + "/api/v3/write_lp", + post(move || async move { write_status }), + ) } - #[test] - fn append_line_nonzero_timestamp_preserved() { - let sink = make_sink(); // precision = "us" (identity transform) - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - let mut msg = make_message(iggy_connector_sdk::Payload::Raw(b"{\"x\":1}".to_vec())); - msg.timestamp = 1_700_000_000_000_000; // 2023-11-14 in µs - - let mut buf = String::new(); - sink.append_line(&mut buf, &topic, &meta, &msg).unwrap(); - - let ts_str = buf.trim().rsplit(' ').next().unwrap(); - let ts: u64 = ts_str.parse().unwrap(); - assert_eq!( - ts, 1_700_000_000_000_000, - "timestamp must pass through unchanged" - ); + async fn open_sink(config: InfluxDbSinkConfig) -> InfluxDbSink { + let mut sink = InfluxDbSink::new(1, config); + sink.open() + .await + .expect("open() should succeed against mock"); + sink } - // ── append_line — checksum / origin_timestamp fields ──────────────────────── - - #[test] - fn append_line_checksum_field_present_by_default() { - let sink = make_sink(); // include_checksum = Some(true) - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - let msg = make_message(iggy_connector_sdk::Payload::Raw(b"{\"x\":1}".to_vec())); - - let mut buf = String::new(); - sink.append_line(&mut buf, &topic, &meta, &msg).unwrap(); - // make_message sets checksum = 12345 - 
assert!( - buf.contains("iggy_checksum=12345u"), - "checksum field missing: {buf}" - ); + fn topic() -> TopicMetadata { + TopicMetadata { + stream: "s".to_string(), + topic: "t".to_string(), + } } - #[test] - fn append_line_checksum_suppressed_when_disabled() { - let mut config = make_config(); - config.include_checksum = Some(false); - let sink = InfluxDbSink::new(1, config); - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - let msg = make_message(iggy_connector_sdk::Payload::Raw(b"{\"x\":1}".to_vec())); - - let mut buf = String::new(); - sink.append_line(&mut buf, &topic, &meta, &msg).unwrap(); - assert!( - !buf.contains("iggy_checksum"), - "checksum must be absent: {buf}" - ); + fn meta() -> MessagesMetadata { + MessagesMetadata { + partition_id: 0, + current_offset: 0, + schema: Schema::Json, + } } - #[test] - fn append_line_origin_timestamp_present_by_default() { - let sink = make_sink(); - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - // make_message sets origin_timestamp = 1_000_000 - let msg = make_message(iggy_connector_sdk::Payload::Raw(b"{\"x\":1}".to_vec())); - - let mut buf = String::new(); - sink.append_line(&mut buf, &topic, &meta, &msg).unwrap(); - assert!( - buf.contains("iggy_origin_timestamp=1000000u"), - "origin_timestamp missing: {buf}" - ); + fn msg() -> ConsumedMessage { + ConsumedMessage { + id: 1, + offset: 0, + checksum: 0, + timestamp: 1_000_000, + origin_timestamp: 1_000_000, + headers: None, + payload: iggy_connector_sdk::Payload::Raw(b"{\"k\":1}".to_vec()), + } } - #[test] - fn append_line_origin_timestamp_suppressed_when_disabled() { - let mut config = make_config(); - config.include_origin_timestamp = Some(false); - let sink = InfluxDbSink::new(1, config); - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - let msg = make_message(iggy_connector_sdk::Payload::Raw(b"{\"x\":1}".to_vec())); + // ── open() 
─────────────────────────────────────────────────────────────── - let mut buf = String::new(); - sink.append_line(&mut buf, &topic, &meta, &msg).unwrap(); - assert!( - !buf.contains("iggy_origin_timestamp"), - "must be absent: {buf}" - ); + #[tokio::test] + async fn open_v2_succeeds_when_health_returns_200() { + let base = start_server(v2_app(StatusCode::NO_CONTENT)).await; + let mut sink = InfluxDbSink::new(1, v2_config(&base)); + assert!(sink.open().await.is_ok()); } - // ── append_line — metadata disabled entirely ───────────────────────────────── - - #[test] - fn append_line_no_metadata_at_all() { - let mut config = make_config(); - config.include_metadata = Some(false); - let sink = InfluxDbSink::new(1, config); - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - let msg = make_message(iggy_connector_sdk::Payload::Raw(b"{\"x\":1}".to_vec())); - - let mut buf = String::new(); - sink.append_line(&mut buf, &topic, &meta, &msg).unwrap(); - - // With include_metadata=false, stream/topic/partition tags AND their - // field fallbacks must all be absent. 
- assert!(!buf.contains(",stream="), "stream tag must be absent"); - assert!(!buf.contains(",topic="), "topic tag must be absent"); - assert!(!buf.contains(",partition="), "partition tag must be absent"); - assert!(!buf.contains("iggy_stream="), "stream field must be absent"); - assert!(!buf.contains("iggy_topic="), "topic field must be absent"); - assert!( - !buf.contains("iggy_partition="), - "partition field must be absent" - ); + #[tokio::test] + async fn open_v3_succeeds_when_health_returns_200() { + let base = start_server(v3_app(StatusCode::NO_CONTENT)).await; + let mut sink = InfluxDbSink::new(1, v3_config(&base)); + assert!(sink.open().await.is_ok()); } - // ── build_write_url ─────────────────────────────────────────────────────────── - - #[test] - fn build_write_url_contains_org_bucket_precision() { - let sink = make_sink(); // org=test_org, bucket=test_bucket, precision=us - let url = sink.build_write_url().unwrap(); - let query = url.query().unwrap_or(""); - assert!(query.contains("org=test_org"), "org missing: {query}"); - assert!( - query.contains("bucket=test_bucket"), - "bucket missing: {query}" - ); - assert!(query.contains("precision=us"), "precision missing: {query}"); + #[tokio::test] + async fn open_fails_when_health_returns_503() { + let app = Router::new().route("/health", get(|| async { StatusCode::SERVICE_UNAVAILABLE })); + let base = start_server(app).await; + let mut sink = InfluxDbSink::new(1, v2_config(&base)); + assert!(sink.open().await.is_err()); } - #[test] - fn build_write_url_path_is_api_v2_write() { - let sink = make_sink(); - let url = sink.build_write_url().unwrap(); - assert_eq!(url.path(), "/api/v2/write"); - } + // ── process_batch() ────────────────────────────────────────────────────── - #[test] - fn build_write_url_trailing_slash_stripped() { - let mut config = make_config(); - config.url = "http://localhost:8086/".to_string(); - let sink = InfluxDbSink::new(1, config); - let url = sink.build_write_url().unwrap(); - // 
Must not produce a double-slash path like //api/v2/write + #[tokio::test] + async fn process_batch_204_returns_ok() { + let base = start_server(v2_app(StatusCode::NO_CONTENT)).await; + let sink = open_sink(v2_config(&base)).await; assert!( - !url.path().starts_with("//"), - "double slash: {}", - url.path() + sink.process_batch(&topic(), &meta(), &[msg()]) + .await + .is_ok() ); - assert_eq!(url.path(), "/api/v2/write"); } - #[test] - fn build_write_url_invalid_base_url_returns_error() { - let mut config = make_config(); - config.url = "not_a_url".to_string(); - let sink = InfluxDbSink::new(1, config); + #[tokio::test] + async fn process_batch_v3_204_returns_ok() { + let base = start_server(v3_app(StatusCode::NO_CONTENT)).await; + let sink = open_sink(v3_config(&base)).await; assert!( - sink.build_write_url().is_err(), - "invalid URL must return error" + sink.process_batch(&topic(), &meta(), &[msg()]) + .await + .is_ok() ); } - #[test] - fn build_health_url_path_is_health() { - let sink = make_sink(); - let url = sink.build_health_url().unwrap(); - assert_eq!(url.path(), "/health"); + #[tokio::test] + async fn process_batch_500_returns_can_not_store_data_error() { + let base = start_server(v2_app(StatusCode::INTERNAL_SERVER_ERROR)).await; + let sink = open_sink(v2_config(&base)).await; + let err = sink + .process_batch(&topic(), &meta(), &[msg()]) + .await + .unwrap_err(); + assert!(matches!(err, Error::CannotStoreData(_))); } - // ── PermanentHttpError does not trip circuit breaker ───────────────────────── - #[tokio::test] - async fn permanent_http_error_does_not_open_circuit_breaker() { - // threshold=1 means any transient error would open the circuit immediately. 
- let mut config = make_config(); - config.circuit_breaker_threshold = Some(1); - config.circuit_breaker_cool_down = Some("60s".to_string()); - let sink = InfluxDbSink::new(1, config); - - // Simulate what process_batch does when it gets a 400 response: - // it returns PermanentHttpError, which consume() must NOT count. - let e = Error::PermanentHttpError("400 Bad Request: malformed line protocol".to_string()); - if !matches!(e, Error::PermanentHttpError(_)) { - sink.circuit_breaker.record_failure().await; - } - - // Circuit must remain closed — no failure was recorded. - assert!( - !sink.circuit_breaker.is_open().await, - "PermanentHttpError must not open the circuit breaker" - ); + async fn process_batch_400_returns_permanent_http_error() { + let base = start_server(v2_app(StatusCode::BAD_REQUEST)).await; + let sink = open_sink(v2_config(&base)).await; + let err = sink + .process_batch(&topic(), &meta(), &[msg()]) + .await + .unwrap_err(); + assert!(matches!(err, Error::PermanentHttpError(_))); } #[tokio::test] - async fn transient_error_does_open_circuit_breaker() { - let mut config = make_config(); - config.circuit_breaker_threshold = Some(1); - config.circuit_breaker_cool_down = Some("60s".to_string()); - let sink = InfluxDbSink::new(1, config); - - // Simulate what process_batch does for a 503 response: CannotStoreData. 
- let e = Error::CannotStoreData("503 Service Unavailable".to_string()); - if !matches!(e, Error::PermanentHttpError(_)) { - sink.circuit_breaker.record_failure().await; - } + async fn process_batch_sends_token_authorization_header() { + let captured: Arc> = Arc::new(Mutex::new(String::new())); + let cap2 = captured.clone(); + let app = Router::new() + .route("/health", get(|| async { StatusCode::OK })) + .route( + "/api/v2/write", + post(move |headers: HeaderMap| { + let cap = cap2.clone(); + async move { + *cap.lock().await = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .unwrap_or("") + .to_string(); + StatusCode::NO_CONTENT + } + }), + ); + let base = start_server(app).await; + let sink = open_sink(v2_config(&base)).await; + sink.process_batch(&topic(), &meta(), &[msg()]) + .await + .unwrap(); + assert_eq!(*captured.lock().await, "Token tok"); + } - assert!( - sink.circuit_breaker.is_open().await, - "transient error must open the circuit breaker" - ); + #[tokio::test] + async fn process_batch_v3_sends_bearer_authorization_header() { + let captured: Arc> = Arc::new(Mutex::new(String::new())); + let cap2 = captured.clone(); + let app = Router::new() + .route("/health", get(|| async { StatusCode::OK })) + .route( + "/api/v3/write_lp", + post(move |headers: HeaderMap| { + let cap = cap2.clone(); + async move { + *cap.lock().await = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .unwrap_or("") + .to_string(); + StatusCode::NO_CONTENT + } + }), + ); + let base = start_server(app).await; + let sink = open_sink(v3_config(&base)).await; + sink.process_batch(&topic(), &meta(), &[msg()]) + .await + .unwrap(); + assert_eq!(*captured.lock().await, "Bearer tok"); } - // ── partial-batch failure does not reset circuit breaker (fix) ──────── + #[tokio::test] + async fn process_batch_sends_line_protocol_content_type() { + let captured: Arc> = Arc::new(Mutex::new(String::new())); + let cap2 = captured.clone(); + let app = Router::new() + 
.route("/health", get(|| async { StatusCode::OK })) + .route( + "/api/v2/write", + post(move |headers: HeaderMap| { + let cap = cap2.clone(); + async move { + *cap.lock().await = headers + .get("content-type") + .and_then(|v| v.to_str().ok()) + .unwrap_or("") + .to_string(); + StatusCode::NO_CONTENT + } + }), + ); + let base = start_server(app).await; + let sink = open_sink(v2_config(&base)).await; + sink.process_batch(&topic(), &meta(), &[msg()]) + .await + .unwrap(); + assert!(captured.lock().await.starts_with("text/plain")); + } #[tokio::test] - async fn partial_batch_failure_does_not_reset_circuit_failure_counter() { - // With threshold=2, two consume() calls each containing one failed batch - // followed by one successful batch must still open the circuit. - // - // With the old code (record_success inside the loop), the successful - // second batch would reset the consecutive-failure counter after every - // call, so the circuit would never trip regardless of how many partial - // failures occurred. The fix moves record_success after the loop, - // guarded by first_error.is_none(), so the counter accumulates across - // calls that had any failure. 
- let mut config = make_config(); - config.circuit_breaker_threshold = Some(2); - config.circuit_breaker_cool_down = Some("60s".to_string()); - let sink = InfluxDbSink::new(1, config); + async fn process_batch_body_is_valid_line_protocol() { + let captured: Arc>> = Arc::new(Mutex::new(Vec::new())); + let cap2 = captured.clone(); + let app = Router::new() + .route("/health", get(|| async { StatusCode::OK })) + .route( + "/api/v2/write", + post(move |request: Request| { + let cap = cap2.clone(); + async move { + let b = axum::body::to_bytes(request.into_body(), usize::MAX) + .await + .unwrap(); + *cap.lock().await = b.to_vec(); + StatusCode::NO_CONTENT + } + }), + ); + let base = start_server(app).await; + let sink = open_sink(v2_config(&base)).await; + sink.process_batch(&topic(), &meta(), &[msg()]) + .await + .unwrap(); - // Simulate consume() call 1: batch 1 fails (record_failure), batch 2 - // succeeds but first_error.is_some() → record_success NOT called. - sink.circuit_breaker.record_failure().await; + let body = String::from_utf8(captured.lock().await.clone()).unwrap(); + // measurement name is "m" from config + assert!(body.starts_with("m,"), "expected measurement tag: {body}"); + // offset tag is always written + assert!(body.contains(",offset=0"), "expected offset tag: {body}"); + // JSON payload field assert!( - !sink.circuit_breaker.is_open().await, - "circuit must still be closed after one failure at threshold 2" + body.contains("payload_json="), + "expected payload field: {body}" ); - - // Simulate consume() call 2: same pattern. 
- sink.circuit_breaker.record_failure().await; - + // ends with a timestamp + let last_token = body.split_whitespace().last().unwrap(); assert!( - sink.circuit_breaker.is_open().await, - "circuit must open after threshold failures – a later successful \ - batch in the same call must not reset the consecutive-failure counter" + last_token.parse::().is_ok(), + "expected numeric ts: {body}" ); } + // ── consume() chunking ─────────────────────────────────────────────────── + #[tokio::test] - async fn all_batches_succeeding_resets_circuit_failure_counter() { - // Contrast: when ALL batches in a consume() succeed, first_error is None - // and record_success IS called after the loop. The consecutive-failure - // counter must reset so a subsequent isolated failure does not - // immediately re-open the circuit. - let mut config = make_config(); - config.circuit_breaker_threshold = Some(2); - config.circuit_breaker_cool_down = Some("60s".to_string()); - let sink = InfluxDbSink::new(1, config); + async fn consume_chunks_into_batches_of_batch_size() { + // batch_size=2 with 5 messages → 3 HTTP calls: (2, 2, 1) + let call_count = Arc::new(AtomicU32::new(0)); + let cc2 = call_count.clone(); + let app = Router::new() + .route("/health", get(|| async { StatusCode::OK })) + .route( + "/api/v2/write", + post(move || { + let cc = cc2.clone(); + async move { + cc.fetch_add(1, Ordering::AcqRel); + StatusCode::NO_CONTENT + } + }), + ); + let base = start_server(app).await; + let sink = open_sink(v2_config(&base)).await; + let msgs: Vec<_> = (0..5).map(|_| msg()).collect(); + sink.consume(&topic(), meta(), msgs).await.unwrap(); + assert_eq!(call_count.load(Ordering::Acquire), 3); + } - // Accumulate one failure. 
- sink.circuit_breaker.record_failure().await; - assert!(!sink.circuit_breaker.is_open().await); + #[tokio::test] + async fn consume_single_message_batch_size_one_makes_one_call() { + let call_count = Arc::new(AtomicU32::new(0)); + let cc2 = call_count.clone(); + // Override batch_size to 1 + let config = InfluxDbSinkConfig::V2(V2SinkConfig { + batch_size: Some(1), + ..v2_config("placeholder").into_v2().unwrap() + }); + let app = Router::new() + .route("/health", get(|| async { StatusCode::OK })) + .route( + "/api/v2/write", + post(move || { + let cc = cc2.clone(); + async move { + cc.fetch_add(1, Ordering::AcqRel); + StatusCode::NO_CONTENT + } + }), + ); + let base = start_server(app).await; + // Patch the url in after server started + let config = InfluxDbSinkConfig::V2(V2SinkConfig { + url: base.clone(), + ..config.into_v2().unwrap() + }); + let sink = open_sink(config).await; + sink.consume(&topic(), meta(), vec![msg()]).await.unwrap(); + assert_eq!(call_count.load(Ordering::Acquire), 1); + } - // A fully successful consume() → record_success resets the counter. - sink.circuit_breaker.record_success(); + #[tokio::test] + async fn consume_returns_first_error_after_all_batches_attempt() { + // First batch fails (500), second batch succeeds. + // consume() should return an error but still attempt the second batch. 
+ let call_count = Arc::new(AtomicU32::new(0)); + let cc2 = call_count.clone(); + let app = Router::new() + .route("/health", get(|| async { StatusCode::OK })) + .route( + "/api/v2/write", + post(move || { + let cc = cc2.clone(); + async move { + let n = cc.fetch_add(1, Ordering::AcqRel); + if n == 0 { + StatusCode::INTERNAL_SERVER_ERROR + } else { + StatusCode::NO_CONTENT + } + } + }), + ); + let base = start_server(app).await; + let sink = open_sink(v2_config(&base)).await; + // 4 messages, batch_size=2 → 2 batches; first fails, second succeeds + let msgs: Vec<_> = (0..4).map(|_| msg()).collect(); + let result = sink.consume(&topic(), meta(), msgs).await; + assert!(result.is_err()); // error from the first batch is returned + assert_eq!(call_count.load(Ordering::Acquire), 2); // both batches were attempted + } - // A single subsequent failure should restart from 1 (threshold not reached). - sink.circuit_breaker.record_failure().await; + #[tokio::test] + async fn consume_records_success_per_successful_batch() { + // With 2 batches where the first fails and the second succeeds, the circuit + // breaker must record 1 failure AND 1 success — not 1 failure and 0 successes. + // If only failures are recorded, the breaker will trip after enough intermittent + // errors even when most batches succeed. + let call_count = Arc::new(AtomicU32::new(0)); + let cc2 = call_count.clone(); + let app = Router::new() + .route("/health", get(|| async { StatusCode::OK })) + .route( + "/api/v2/write", + post(move || { + let cc = cc2.clone(); + async move { + let n = cc.fetch_add(1, Ordering::AcqRel); + if n == 0 { + StatusCode::INTERNAL_SERVER_ERROR + } else { + StatusCode::NO_CONTENT + } + } + }), + ); + let base = start_server(app).await; + // Use a threshold of 2 so the breaker trips after 2 failures. + // If the success of batch 2 is not recorded, successive calls that have + // 1 failure each would trip the breaker after 2 invocations. 
+ let config = InfluxDbSinkConfig::V2(V2SinkConfig { + circuit_breaker_threshold: Some(2), + circuit_breaker_cool_down: Some("60s".to_string()), + ..v2_config(&base).into_v2().unwrap() + }); + let sink = open_sink(config).await; + let msgs: Vec<_> = (0..4).map(|_| msg()).collect(); + let _ = sink.consume(&topic(), meta(), msgs).await; // first fails, second succeeds + // Circuit breaker should NOT be open: 1 failure + 1 success → not tripped. assert!( !sink.circuit_breaker.is_open().await, - "failure counter must restart from zero after a fully successful consume() call" + "circuit breaker must not trip when at least one batch succeeded" ); } - // ── measurement default ─────────────────────────────────────────────────────── + // ── write URL routing ───────────────────────────────────────────────────── - #[test] - fn measurement_defaults_to_iggy_messages_when_not_configured() { - let mut config = make_config(); - config.measurement = None; - let sink = InfluxDbSink::new(1, config); - assert_eq!(sink.measurement(), "iggy_messages"); + #[tokio::test] + async fn v2_writes_to_api_v2_write_endpoint() { + let hit = Arc::new(AtomicU32::new(0)); + let hit2 = hit.clone(); + let app = Router::new() + .route("/health", get(|| async { StatusCode::OK })) + .route( + "/api/v2/write", + post(move || { + let h = hit2.clone(); + async move { + h.fetch_add(1, Ordering::AcqRel); + StatusCode::NO_CONTENT + } + }), + ); + let base = start_server(app).await; + let sink = open_sink(v2_config(&base)).await; + sink.process_batch(&topic(), &meta(), &[msg()]) + .await + .unwrap(); + assert_eq!(hit.load(Ordering::Acquire), 1); } - #[test] - fn measurement_uses_configured_value() { - let sink = make_sink(); // measurement = "test_measurement" - assert_eq!(sink.measurement(), "test_measurement"); + #[tokio::test] + async fn v3_writes_to_api_v3_write_lp_endpoint() { + let hit = Arc::new(AtomicU32::new(0)); + let hit2 = hit.clone(); + let app = Router::new() + .route("/health", get(|| async { 
StatusCode::OK })) + .route( + "/api/v3/write_lp", + post(move || { + let h = hit2.clone(); + async move { + h.fetch_add(1, Ordering::AcqRel); + StatusCode::NO_CONTENT + } + }), + ); + let base = start_server(app).await; + let sink = open_sink(v3_config(&base)).await; + sink.process_batch(&topic(), &meta(), &[msg()]) + .await + .unwrap(); + assert_eq!(hit.load(Ordering::Acquire), 1); } - // ── get_client before open() ────────────────────────────────────────────────── + // ── map_precision_v3 ────────────────────────────────────────────────── #[test] - fn get_client_before_open_returns_error() { - let sink = make_sink(); - assert!( - sink.get_client().is_err(), - "get_client before open() must return an error" - ); + fn map_precision_v3_maps_all_short_forms() { + assert_eq!(map_precision_v3("ns").unwrap(), "nanosecond"); + assert_eq!(map_precision_v3("us").unwrap(), "microsecond"); + assert_eq!(map_precision_v3("ms").unwrap(), "millisecond"); + assert_eq!(map_precision_v3("s").unwrap(), "second"); } - // ── batch chunking: multiple messages produce newline-separated lines ───────── - #[test] - fn process_batch_body_has_newline_between_lines() { - let sink = make_sink(); - let topic = make_topic_metadata(); - let meta = make_messages_metadata(); - - let msg1 = make_message(iggy_connector_sdk::Payload::Raw(b"{\"a\":1}".to_vec())); - let msg2 = make_message(iggy_connector_sdk::Payload::Raw(b"{\"b\":2}".to_vec())); - - // Build the body the same way process_batch does. 
- let mut body = String::new(); - sink.append_line(&mut body, &topic, &meta, &msg1).unwrap(); - body.push('\n'); - sink.append_line(&mut body, &topic, &meta, &msg2).unwrap(); - - let lines: Vec<&str> = body.lines().collect(); - assert_eq!( - lines.len(), - 2, - "two messages must produce exactly two lines" - ); - assert!( - lines[0].starts_with("test_measurement"), - "line 1: {}", - lines[0] - ); - assert!( - lines[1].starts_with("test_measurement"), - "line 2: {}", - lines[1] - ); + fn map_precision_v3_rejects_unknown_values() { + assert!(map_precision_v3("xx").is_err()); + assert!(map_precision_v3("").is_err()); + assert!(map_precision_v3("nanosecond").is_err()); } - // ── to_precision_timestamp edge cases ──────────────────────────────────────── - #[test] - fn precision_ns_does_not_overflow_large_micros() { - // 1e15 µs × 1000 = 1e18 ns, within u64::MAX (~1.8e19). - let mut config = make_config(); - config.precision = Some("ns".to_string()); - let sink = InfluxDbSink::new(1, config); - let micros = 1_000_000_000_000_000u64; // 1e15 µs ≈ year 2001 in ns terms - let ns = sink.to_precision_timestamp(micros); - assert_eq!(ns, micros * 1_000); + fn v3_write_url_invalid_base_returns_error() { + let config = InfluxDbSinkConfig::V3(V3SinkConfig { + url: "not-a-url".to_string(), + ..v3_config("http://placeholder").into_v3().unwrap() + }); + assert!(config.build_write_url().is_err()); } #[test] - fn precision_ns_saturates_on_overflow() { - // u64::MAX µs × 1000 overflows; saturating_mul must not panic. 
- let mut config = make_config(); - config.precision = Some("ns".to_string()); - let sink = InfluxDbSink::new(1, config); - let result = sink.to_precision_timestamp(u64::MAX); - assert_eq!(result, u64::MAX, "saturating_mul must clamp at u64::MAX"); + fn v2_write_url_invalid_base_returns_error() { + let config = InfluxDbSinkConfig::V2(V2SinkConfig { + url: "not-a-url".to_string(), + ..v2_config("http://placeholder").into_v2().unwrap() + }); + assert!(config.build_write_url().is_err()); + } + + impl InfluxDbSinkConfig { + fn into_v3(self) -> Option { + match self { + Self::V3(c) => Some(c), + Self::V2(_) => None, + } + } } } diff --git a/core/connectors/sinks/influxdb_sink/src/protocol.rs b/core/connectors/sinks/influxdb_sink/src/protocol.rs new file mode 100644 index 0000000000..8c8d231841 --- /dev/null +++ b/core/connectors/sinks/influxdb_sink/src/protocol.rs @@ -0,0 +1,217 @@ +/* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +//! InfluxDB line-protocol escaping helpers. +//! +//! Both InfluxDB V2 and V3 use the same line-protocol format for writes, so +//! these functions are shared by both connector versions. + +/// Write an escaped measurement name into `buf`. 
+/// +/// Escapes: `\` → `\\`, `,` → `\,`, ` ` → `\ `, `\t` → `\\t`, `\n` → `\\n`, `\r` → `\\r` +/// +/// Newline, carriage-return, and tab are the InfluxDB line-protocol record +/// delimiters or whitespace that can corrupt parsing; a literal newline inside +/// a measurement name would split the line and corrupt the batch. +#[inline] +pub(crate) fn write_measurement(buf: &mut String, value: &str) { + for ch in value.chars() { + match ch { + '\\' => buf.push_str("\\\\"), + ',' => buf.push_str("\\,"), + ' ' => buf.push_str("\\ "), + '\t' => buf.push_str("\\t"), + '\n' => buf.push_str("\\n"), + '\r' => buf.push_str("\\r"), + _ => buf.push(ch), + } + } +} + +/// Write an escaped tag key/value into `buf`. +/// +/// Escapes: `\` → `\\`, `,` → `\,`, `=` → `\=`, ` ` → `\ `, `\t` → `\\t`, `\n` → `\\n`, `\r` → `\\r` +/// +/// Newline, carriage-return, and tab are escaped for the same reason as in +/// [`write_measurement`]: they are InfluxDB line-protocol record delimiters or +/// whitespace that can corrupt tag-set parsing. +#[inline] +pub(crate) fn write_tag_value(buf: &mut String, value: &str) { + for ch in value.chars() { + match ch { + '\\' => buf.push_str("\\\\"), + ',' => buf.push_str("\\,"), + '=' => buf.push_str("\\="), + ' ' => buf.push_str("\\ "), + '\t' => buf.push_str("\\t"), + '\n' => buf.push_str("\\n"), + '\r' => buf.push_str("\\r"), + _ => buf.push(ch), + } + } +} + +/// Write an escaped string field value (without surrounding quotes) into `buf`. +/// +/// Escapes: `\` → `\\`, `"` → `\"`, `\n` → `\\n`, `\r` → `\\r` +/// +/// Newline and carriage-return are the InfluxDB line-protocol record +/// delimiters; a literal newline inside a string field value would split the +/// line and corrupt the batch. +/// +/// Tab (`\t`) is intentionally NOT escaped here. String field values are +/// double-quoted in line protocol, and the spec permits literal tabs inside +/// quoted strings. 
Measurement names and tag values (see [`write_measurement`] +/// and [`write_tag_value`]) are unquoted, so tabs must be escaped there to +/// avoid misparsing the tag set. +#[inline] +pub(crate) fn write_field_string(buf: &mut String, value: &str) { + for ch in value.chars() { + match ch { + '\\' => buf.push_str("\\\\"), + '"' => buf.push_str("\\\""), + '\n' => buf.push_str("\\n"), + '\r' => buf.push_str("\\r"), + _ => buf.push(ch), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn measurement_escapes_comma_space_backslash() { + let mut buf = String::new(); + write_measurement(&mut buf, "m\\eas,urea meant"); + assert_eq!(buf, "m\\\\eas\\,urea\\ meant"); + } + + #[test] + fn measurement_escapes_newlines() { + let mut buf = String::new(); + write_measurement(&mut buf, "meas\nurea\rment"); + assert_eq!(buf, "meas\\nurea\\rment"); + } + + #[test] + fn tag_value_escapes_equals_sign() { + let mut buf = String::new(); + write_tag_value(&mut buf, "a=b,c d\\e"); + assert_eq!(buf, "a\\=b\\,c\\ d\\\\e"); + } + + #[test] + fn tag_value_escapes_newlines() { + let mut buf = String::new(); + write_tag_value(&mut buf, "line1\nline2\r"); + assert_eq!(buf, "line1\\nline2\\r"); + } + + #[test] + fn field_string_escapes_quote_and_backslash() { + let mut buf = String::new(); + write_field_string(&mut buf, r#"say "hello" \world\"#); + assert_eq!(buf, r#"say \"hello\" \\world\\"#); + } + + #[test] + fn field_string_escapes_newlines() { + let mut buf = String::new(); + write_field_string(&mut buf, "line1\nline2\r"); + assert_eq!(buf, "line1\\nline2\\r"); + } + + #[test] + fn measurement_plain_ascii_unchanged() { + let mut buf = String::new(); + write_measurement(&mut buf, "cpu_usage"); + assert_eq!(buf, "cpu_usage"); + } + + #[test] + fn tag_value_plain_ascii_unchanged() { + let mut buf = String::new(); + write_tag_value(&mut buf, "server01"); + assert_eq!(buf, "server01"); + } + + #[test] + fn field_string_plain_ascii_unchanged() { + let mut buf = String::new(); + 
write_field_string(&mut buf, "hello world"); + assert_eq!(buf, "hello world"); + } + + #[test] + fn measurement_empty_string_produces_empty_output() { + let mut buf = String::new(); + write_measurement(&mut buf, ""); + assert!(buf.is_empty()); + } + + #[test] + fn tag_value_empty_string_produces_empty_output() { + let mut buf = String::new(); + write_tag_value(&mut buf, ""); + assert!(buf.is_empty()); + } + + #[test] + fn field_string_empty_string_produces_empty_output() { + let mut buf = String::new(); + write_field_string(&mut buf, ""); + assert!(buf.is_empty()); + } + + #[test] + fn measurement_escapes_tab() { + let mut buf = String::new(); + write_measurement(&mut buf, "m\teasure"); + assert_eq!(buf, "m\\teasure"); + } + + #[test] + fn tag_value_escapes_tab() { + let mut buf = String::new(); + write_tag_value(&mut buf, "val\tue"); + assert_eq!(buf, "val\\tue"); + } + + #[test] + fn measurement_unicode_passthrough() { + let mut buf = String::new(); + write_measurement(&mut buf, "温度"); + assert_eq!(buf, "温度"); + } + + #[test] + fn tag_value_unicode_passthrough() { + let mut buf = String::new(); + write_tag_value(&mut buf, "µ-sensor"); + assert_eq!(buf, "µ-sensor"); + } + + #[test] + fn field_string_unicode_passthrough() { + let mut buf = String::new(); + write_field_string(&mut buf, "café"); + assert_eq!(buf, "café"); + } +} diff --git a/core/connectors/sources/influxdb_source/Cargo.toml b/core/connectors/sources/influxdb_source/Cargo.toml index dcf7bb7caa..8e534265d3 100644 --- a/core/connectors/sources/influxdb_source/Cargo.toml +++ b/core/connectors/sources/influxdb_source/Cargo.toml @@ -29,8 +29,14 @@ repository = "https://github.com/apache/iggy" readme = "../../README.md" publish = false +# dashmap and once_cell are not imported directly in this crate's source, but +# the source_connector! macro (in iggy_connector_sdk::source) expands bare +# `use dashmap::DashMap` and `use once_cell::sync::Lazy` into this crate's +# namespace, so they must be listed here. 
Remove them only after the SDK macro +# is updated to use `$crate::connector_macro_support::{DashMap, Lazy}` (the +# same fix already applied to sink_connector!). [package.metadata.cargo-machete] -ignored = ["dashmap", "once_cell", "futures"] +ignored = ["dashmap", "once_cell"] [lib] crate-type = ["cdylib", "lib"] @@ -38,9 +44,9 @@ crate-type = ["cdylib", "lib"] [dependencies] async-trait = { workspace = true } base64 = { workspace = true } +chrono = "0.4.44" csv = { workspace = true } dashmap = { workspace = true } -futures = { workspace = true } iggy_common = { workspace = true } iggy_connector_sdk = { workspace = true } once_cell = { workspace = true } @@ -50,6 +56,11 @@ reqwest-middleware = { workspace = true } secrecy = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } +simd-json = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } uuid = { workspace = true } + +[dev-dependencies] +axum = { workspace = true } +toml = { workspace = true } diff --git a/core/connectors/sources/influxdb_source/config.toml b/core/connectors/sources/influxdb_source/config.toml index 3f7cd14980..0a726035ae 100644 --- a/core/connectors/sources/influxdb_source/config.toml +++ b/core/connectors/sources/influxdb_source/config.toml @@ -30,6 +30,7 @@ schema = "json" batch_length = 100 [plugin_config] +version = "v2" url = "http://localhost:8086" org = "iggy" token = "replace-with-token" diff --git a/core/connectors/sources/influxdb_source/src/common.rs b/core/connectors/sources/influxdb_source/src/common.rs new file mode 100644 index 0000000000..f29f6d2be8 --- /dev/null +++ b/core/connectors/sources/influxdb_source/src/common.rs @@ -0,0 +1,1027 @@ +/* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use iggy_common::serde_secret::serialize_secret; +use iggy_common::{DateTime, Utc}; +use iggy_connector_sdk::{Error, Schema}; +use secrecy::SecretString; +use serde::{Deserialize, Serialize}; +use std::sync::OnceLock; +use tracing::warn; + +pub(crate) use crate::row::{Row, parse_csv_rows, parse_jsonl_rows}; + +// ── Constants ───────────────────────────────────────────────────────────────── + +/// Default cursor column for V2 (Flux annotated-CSV timestamp annotation). +pub(crate) const DEFAULT_V2_CURSOR_FIELD: &str = "_time"; +/// Default cursor column for V3 (SQL timestamp column name). +pub(crate) const DEFAULT_V3_CURSOR_FIELD: &str = "time"; + +// ── Config ──────────────────────────────────────────────────────────────────── +// +// Uses `#[serde(tag = "version")]` instead of `#[serde(flatten)]` because +// serde's flatten interacts poorly with tagged enums — the tag field can be +// consumed before the variant content is parsed, causing deserialization to fail. + +#[derive(Debug, Clone, Serialize)] +#[serde(tag = "version")] +pub enum InfluxDbSourceConfig { + #[serde(rename = "v2")] + V2(V2SourceConfig), + #[serde(rename = "v3")] + V3(V3SourceConfig), +} + +/// Deserializes `InfluxDbSourceConfig` with backward-compatible version defaulting. 
+/// +/// Existing V2 configs that omit the `version` field are treated as `"v2"` so +/// deployments can upgrade without touching their config files. Explicitly +/// unknown version strings are rejected with a clear error. +impl<'de> serde::Deserialize<'de> for InfluxDbSourceConfig { + fn deserialize>(d: D) -> Result { + let raw = serde_json::Value::deserialize(d)?; + let version = match raw.get("version") { + None => "v2", // absent key → backward compat default + Some(v) => v.as_str().ok_or_else(|| { + serde::de::Error::custom(format!( + "\"version\" must be a string (e.g. \"v2\" or \"v3\"), got: {v}" + )) + })?, + }; + match version { + "v2" => serde_json::from_value::(raw) + .map(Self::V2) + .map_err(serde::de::Error::custom), + "v3" => serde_json::from_value::(raw) + .map(Self::V3) + .map_err(serde::de::Error::custom), + other => Err(serde::de::Error::custom(format!( + "unknown InfluxDB version {other:?}; expected \"v2\" or \"v3\"" + ))), + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct V2SourceConfig { + pub(crate) url: String, + pub(crate) org: String, + #[serde(serialize_with = "serialize_secret")] + pub(crate) token: SecretString, + pub(crate) query: String, + pub(crate) poll_interval: Option, + pub(crate) batch_size: Option, + pub(crate) cursor_field: Option, + pub(crate) initial_offset: Option, + pub(crate) payload_column: Option, + pub(crate) payload_format: Option, + pub(crate) include_metadata: Option, + pub(crate) verbose_logging: Option, + pub(crate) max_retries: Option, + pub(crate) retry_delay: Option, + pub(crate) timeout: Option, + pub(crate) max_open_retries: Option, + pub(crate) open_retry_max_delay: Option, + pub(crate) retry_max_delay: Option, + pub(crate) circuit_breaker_threshold: Option, + pub(crate) circuit_breaker_cool_down: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct V3SourceConfig { + pub(crate) url: String, + pub(crate) db: String, + #[serde(serialize_with = "serialize_secret")] 
+ pub(crate) token: SecretString, + pub(crate) query: String, + pub(crate) poll_interval: Option, + pub(crate) batch_size: Option, + pub(crate) cursor_field: Option, + pub(crate) initial_offset: Option, + pub(crate) payload_column: Option, + pub(crate) payload_format: Option, + /// When `false`, the cursor column (`time` by default) is excluded from the + /// emitted JSON payload. Useful when consumers don't need the timestamp in + /// the message body since it's available as message metadata. + pub(crate) include_metadata: Option, + pub(crate) verbose_logging: Option, + pub(crate) max_retries: Option, + pub(crate) retry_delay: Option, + pub(crate) timeout: Option, + pub(crate) max_open_retries: Option, + pub(crate) open_retry_max_delay: Option, + pub(crate) retry_max_delay: Option, + pub(crate) circuit_breaker_threshold: Option, + pub(crate) circuit_breaker_cool_down: Option, + /// Maximum factor by which batch_size may be inflated before the stuck-timestamp + /// circuit breaker trips. Defaults to 10 (i.e. up to 10× the configured batch_size). + /// Maximum accepted value is 100; higher values risk OOM-inducing queries. + pub(crate) stuck_batch_cap_factor: Option, +} + +// Eliminates the repetitive "match self { V2(c) => …, V3(c) => … }" pattern for +// fields that are identical across all config variants. Methods with version-specific +// logic (cursor_field, max_retries, version_label) remain explicit. +// +// Supported patterns: +// delegate!(ref self.url) → &String (borrow) +// delegate!(opt self.poll_interval) → Option<&str> +// delegate!(unwrap self.batch_size, 500) → T: Copy with value fallback +// +// Not supported (use explicit match arms instead): +// Fields with version-specific defaults (e.g. cursor_field: "_time" vs "time") +// Fields with chained transformations (e.g. max_retries + .max(1)) +// Fields that only exist on one variant (e.g. V3's stuck_batch_cap_factor) +macro_rules! 
delegate { + // &T field reference → fn foo(&self) -> &T + (ref $self:ident . $field:ident) => { + match $self { + Self::V2(c) => &c.$field, + Self::V3(c) => &c.$field, + } + }; + // Option → Option<&str> + (opt $self:ident . $field:ident) => { + match $self { + Self::V2(c) => c.$field.as_deref(), + Self::V3(c) => c.$field.as_deref(), + } + }; + // Option → T with fallback + (unwrap $self:ident . $field:ident, $default:expr) => { + match $self { + Self::V2(c) => c.$field.unwrap_or($default), + Self::V3(c) => c.$field.unwrap_or($default), + } + }; +} + +impl InfluxDbSourceConfig { + pub fn url(&self) -> &str { + delegate!(ref self.url) + } + pub fn token_secret(&self) -> &SecretString { + delegate!(ref self.token) + } + pub fn poll_interval(&self) -> Option<&str> { + delegate!(opt self.poll_interval) + } + pub fn batch_size(&self) -> u32 { + // Floor at 1 — callers build LIMIT $limit queries; LIMIT 0 stalls silently. + // open() also rejects 0 explicitly, but defense-in-depth here costs nothing. 
+ delegate!(unwrap self.batch_size, 500).max(1) + } + pub fn initial_offset(&self) -> Option<&str> { + delegate!(opt self.initial_offset) + } + pub fn payload_column(&self) -> Option<&str> { + delegate!(opt self.payload_column) + } + pub fn payload_format(&self) -> Option<&str> { + delegate!(opt self.payload_format) + } + pub fn verbose_logging(&self) -> bool { + delegate!(unwrap self.verbose_logging, false) + } + pub fn retry_delay(&self) -> Option<&str> { + delegate!(opt self.retry_delay) + } + pub fn timeout(&self) -> Option<&str> { + delegate!(opt self.timeout) + } + pub fn max_open_retries(&self) -> u32 { + delegate!(unwrap self.max_open_retries, 10) + } + pub fn open_retry_max_delay(&self) -> Option<&str> { + delegate!(opt self.open_retry_max_delay) + } + pub fn retry_max_delay(&self) -> Option<&str> { + delegate!(opt self.retry_max_delay) + } + pub fn circuit_breaker_threshold(&self) -> u32 { + delegate!(unwrap self.circuit_breaker_threshold, 5) + } + pub fn circuit_breaker_cool_down(&self) -> Option<&str> { + delegate!(opt self.circuit_breaker_cool_down) + } + + // V2 and V3 use different default cursor column names. + pub fn cursor_field(&self) -> &str { + match self { + Self::V2(c) => c.cursor_field.as_deref().unwrap_or(DEFAULT_V2_CURSOR_FIELD), + Self::V3(c) => c.cursor_field.as_deref().unwrap_or(DEFAULT_V3_CURSOR_FIELD), + } + } + + pub fn include_metadata(&self) -> bool { + delegate!(unwrap self.include_metadata, true) + } + + // Both arms are identical; `delegate!` is not used because the `.max(1)` chain + // cannot be expressed in the macro without adding a new variant. + pub fn max_retries(&self) -> u32 { + match self { + Self::V2(c) => c.max_retries.unwrap_or(3).max(1), + Self::V3(c) => c.max_retries.unwrap_or(3).max(1), + } + } + + pub fn version_label(&self) -> &'static str { + match self { + Self::V2(_) => "v2", + Self::V3(_) => "v3", + } + } + + /// URL with any trailing slash stripped — used as the base for all endpoint URLs. 
+ pub(crate) fn base_url(&self) -> &str { + self.url().trim_end_matches('/') + } +} + +// ── Row processing context ──────────────────────────────────────────────────── + +/// Per-poll fields that are constant across all rows in a batch. +/// Passed by reference to `process_rows` so the function signature stays at ≤ 3 parameters. +#[derive(Debug, Clone, Copy)] +pub(crate) struct RowContext<'a> { + pub cursor_field: &'a str, + pub current_cursor: &'a str, + pub include_metadata: bool, + pub payload_col: Option<&'a str>, + pub payload_format: PayloadFormat, + pub now_micros: u64, +} + +// ── Persisted state ─────────────────────────────────────────────────────────── + +#[derive(Debug, Serialize, Deserialize)] +#[serde(tag = "version")] +pub enum PersistedState { + #[serde(rename = "v2")] + V2(V2State), + #[serde(rename = "v3")] + V3(V3State), +} + +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct V2State { + pub last_timestamp: Option, + pub processed_rows: u64, + /// Rows at `last_timestamp` already delivered; used to skip them when the + /// Flux query uses `>= $cursor` and a batch boundary lands mid-timestamp. + pub cursor_row_count: u64, +} + +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct V3State { + pub last_timestamp: Option, + pub processed_rows: u64, + /// Current effective batch size after stuck-timestamp inflation. + /// Reset to the configured base value when the cursor advances. + pub effective_batch_size: u32, + /// Row offset within the last timestamp group — used as a tiebreaker + /// so that siblings at the same timestamp are not silently dropped. 
+ pub last_timestamp_row_offset: u64, +} + +// ── Payload format ──────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum PayloadFormat { + #[default] + Json, + Text, + Raw, +} + +impl PayloadFormat { + pub fn from_config(value: Option<&str>) -> Self { + match value.map(|v| v.to_ascii_lowercase()).as_deref() { + Some("text") | Some("utf8") => PayloadFormat::Text, + Some("raw") | Some("base64") => PayloadFormat::Raw, + Some("json") => PayloadFormat::Json, + other => { + if other.is_some() { + warn!( + "Unrecognized payload_format {:?}, falling back to JSON", + other + ); + } + PayloadFormat::Json + } + } + } + + pub fn schema(self) -> Schema { + match self { + PayloadFormat::Json => Schema::Json, + PayloadFormat::Text => Schema::Text, + PayloadFormat::Raw => Schema::Raw, + } + } +} + +// ── Cursor validation ───────────────────────────────────────────────────────── + +static CURSOR_RE: OnceLock = OnceLock::new(); + +pub fn cursor_re() -> &'static regex::Regex { + CURSOR_RE.get_or_init(|| { + // Validates RFC 3339 timestamp structure with proper field ranges: + // month 01-12, day 01-31, hour 00-23, minute/second 00-59. + // Timezone suffix is required: a naive timestamp without Z or +HH:MM + // is rejected to prevent silent UTC-vs-local ambiguity between V2 (Flux + // always treats timestamps as UTC) and V3 (SQL engine timezone depends + // on server config). + // Note: day 29-31 validity for a given month is not checked by the regex; + // chrono parsing inside validate_cursor handles that for tz-aware timestamps. 
+ regex::Regex::new( + r"(?-u)^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])T([01]\d|2[0-3]):[0-5]\d:[0-5]\d(\.\d+)?(Z|[+-]\d{2}:\d{2})$" + ) + .expect("hardcoded regex is valid") + }) +} + +pub fn validate_cursor(cursor: &str) -> Result<(), Error> { + if !cursor_re().is_match(cursor) { + return Err(Error::InvalidConfigValue(format!( + "cursor value {cursor:?} is not a valid RFC 3339 timestamp" + ))); + } + // Chain chrono parse to catch calendar-invalid dates (e.g. Feb 30) + chrono::DateTime::parse_from_rfc3339(cursor).map_err(|e| { + Error::InvalidConfigValue(format!( + "cursor value {cursor:?} failed chrono validation: {e}" + )) + })?; + Ok(()) +} + +/// Validate `cursor_field` for the given connector version. +/// +/// `version` should be `"v2"` or `"v3"`. The function is version-strict: `"_time"` +/// is only valid for V2 (Flux annotation column) and `"time"` is only valid for V3 +/// (SQL timestamp column). Swapping them silently would produce empty result sets +/// or query errors at the InfluxDB level. +pub fn validate_cursor_field(field: &str, version: &str) -> Result<(), Error> { + if field.is_empty() { + return Err(Error::InvalidConfigValue(format!( + "cursor_field must not be empty for {version} — \ + use \"_time\" for v2 or \"time\" for v3" + ))); + } + match (field, version) { + ("time", "v2") => Err(Error::InvalidConfigValue( + "cursor_field \"time\" is not valid for v2 — use \"_time\" \ + (the Flux annotated-CSV timestamp column)" + .into(), + )), + ("_time", "v3") => Err(Error::InvalidConfigValue( + "cursor_field \"_time\" is not valid for v3 — use \"time\" \ + (the SQL timestamp column)" + .into(), + )), + // Allow everything else — custom column names are valid + _ => Ok(()), + } +} + +// ── Timestamp helpers ───────────────────────────────────────────────────────── + +/// Return `true` if timestamp string `a` is strictly after the pre-parsed `b`. 
+/// +/// `b` is accepted as an already-parsed `DateTime` so callers that compare +/// against the same cursor on every row in a batch parse it once, not O(n) times. +/// `a` is parsed on each call. Returns `false` conservatively when `a` fails to +/// parse — do NOT advance the cursor when comparison is ambiguous. Lexicographic +/// comparison is incorrect for timestamps with different timezone offsets +/// (e.g. `+05:30` vs `Z`) and would silently produce wrong cursor advancement. +pub fn is_timestamp_after(a: &str, b_parsed: DateTime) -> bool { + match a.parse::>() { + Ok(dt_a) => dt_a > b_parsed, + Err(_) => { + warn!( + "is_timestamp_after: could not parse {a:?} as RFC 3339; \ + refusing to advance cursor" + ); + false + } + } +} + +// ── Scalar parsing ──────────────────────────────────────────────────────────── + +/// Parse a string value from InfluxDB into the most specific JSON scalar type. +/// +/// Tries `bool`, then `i64`, then `f64`; falls back to `String`. An empty +/// string becomes `null`. `NaN` and `±Infinity` are emitted as strings because +/// JSON has no representation for non-finite floats +/// (`serde_json::Number::from_f64` returns `None` for them). +pub fn parse_scalar(value: &str) -> serde_json::Value { + if value.is_empty() { + return serde_json::Value::Null; + } + if let Ok(v) = value.parse::() { + return serde_json::Value::Bool(v); + } + if let Ok(v) = value.parse::() { + return serde_json::Value::Number(v.into()); + } + if let Ok(v) = value.parse::() + && let Some(number) = serde_json::Number::from_f64(v) + { + return serde_json::Value::Number(number); + } + serde_json::Value::String(value.to_string()) +} + +// ── Query template substitution ─────────────────────────────────────────────── + +/// Substitute `$cursor` and `$limit` placeholders in a query template in a +/// single pass, avoiding the two intermediate `String` allocations that +/// `clone() + replace() + replace()` would produce. 
+pub(crate) fn apply_query_params( + template: &str, + cursor: &str, + limit: &str, + offset: &str, +) -> String { + let capacity = template.len() + cursor.len() + limit.len(); + let mut result = String::with_capacity(capacity); + let mut remaining = template; + while let Some(pos) = remaining.find('$') { + result.push_str(&remaining[..pos]); + let after = &remaining[pos..]; + if after.starts_with("$cursor") { + result.push_str(cursor); + remaining = &remaining[pos + "$cursor".len()..]; + } else if after.starts_with("$limit") { + result.push_str(limit); + remaining = &remaining[pos + "$limit".len()..]; + } else if after.starts_with("$offset") { + result.push_str(offset); + remaining = &remaining[pos + "$offset".len()..]; + } else { + result.push('$'); + remaining = &remaining[pos + 1..]; + } + } + result.push_str(remaining); + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn validate_cursor_accepts_rfc3339() { + assert!(validate_cursor("2024-01-15T10:30:00Z").is_ok()); + assert!(validate_cursor("2024-01-15T10:30:00.123456789Z").is_ok()); + assert!(validate_cursor("2024-01-15T10:30:00+05:30").is_ok()); + assert!(validate_cursor("1970-01-01T00:00:00Z").is_ok()); + } + + #[test] + fn validate_cursor_rejects_timezone_free_timestamp() { + // A naive timestamp without a timezone suffix is rejected to prevent + // silent UTC-vs-local ambiguity between V2 (always UTC) and V3 + // (SQL engine may apply a different default timezone). 
+ assert!( + validate_cursor("2026-04-12T11:28:25.180749").is_err(), + "no timezone suffix must be rejected" + ); + assert!( + validate_cursor("2024-01-15T10:30:00").is_err(), + "bare datetime without tz must be rejected" + ); + } + + #[test] + fn validate_cursor_rejects_invalid() { + assert!(validate_cursor(r#"") |> drop()"#).is_err()); + assert!(validate_cursor("2024-01-15 10:30:00Z").is_err()); + assert!(validate_cursor("not-a-timestamp").is_err()); + assert!(validate_cursor("").is_err()); + assert!(validate_cursor("2024-01-15").is_err()); + } + + #[test] + fn validate_cursor_rejects_out_of_range_date_parts() { + assert!(validate_cursor("2024-13-01T00:00:00Z").is_err(), "month 13"); + assert!(validate_cursor("2024-00-01T00:00:00Z").is_err(), "month 0"); + assert!(validate_cursor("2024-01-00T00:00:00Z").is_err(), "day 0"); + assert!(validate_cursor("2024-01-32T00:00:00Z").is_err(), "day 32"); + assert!(validate_cursor("2024-01-01T24:00:00Z").is_err(), "hour 24"); + assert!( + validate_cursor("2024-01-01T00:60:00Z").is_err(), + "minute 60" + ); + assert!( + validate_cursor("2024-01-01T00:00:60Z").is_err(), + "second 60" + ); + } + + #[test] + fn validate_cursor_field_accepts_time_columns() { + assert!(validate_cursor_field("_time", "v2").is_ok()); + assert!(validate_cursor_field("time", "v3").is_ok()); + } + + #[test] + fn validate_cursor_field_rejects_empty() { + // Empty cursor field must always be rejected — it produces no results. + assert!(validate_cursor_field("", "v2").is_err()); + assert!(validate_cursor_field("", "v3").is_err()); + } + + #[test] + fn validate_cursor_field_is_version_strict() { + // Swapping column names across versions is an error, not a silent passthrough: + // using "time" with v2 or "_time" with v3 produces empty results at the DB level. 
+ assert!( + validate_cursor_field("time", "v2").is_err(), + "\"time\" must be rejected for v2 — correct column is \"_time\"" + ); + assert!( + validate_cursor_field("_time", "v3").is_err(), + "\"_time\" must be rejected for v3 — correct column is \"time\"" + ); + } + + #[test] + fn validate_cursor_field_error_is_version_specific() { + // The cross-version error messages reference the wrong column and hint at + // the correct one, so users can fix config without reading the docs. + let v2_err = validate_cursor_field("time", "v2").unwrap_err().to_string(); + assert!( + v2_err.contains("v2"), + "v2 error should mention v2, got: {v2_err}" + ); + assert!( + v2_err.contains("\"_time\""), + "v2 error should suggest _time, got: {v2_err}" + ); + + let v3_err = validate_cursor_field("_time", "v3") + .unwrap_err() + .to_string(); + assert!( + v3_err.contains("v3"), + "v3 error should mention v3, got: {v3_err}" + ); + assert!( + v3_err.contains("\"time\""), + "v3 error should suggest time, got: {v3_err}" + ); + } + #[test] + fn parse_scalar_types() { + assert_eq!(parse_scalar(""), serde_json::Value::Null); + assert_eq!(parse_scalar("true"), serde_json::Value::Bool(true)); + assert_eq!(parse_scalar("42"), serde_json::Value::Number(42.into())); + assert_eq!( + parse_scalar("hello"), + serde_json::Value::String("hello".to_string()) + ); + } + + #[test] + fn is_timestamp_after_chronological() { + let earlier = "2026-03-18T12:00:00.60952Z" + .parse::>() + .unwrap(); + let later = "2026-03-18T12:00:00.609521Z" + .parse::>() + .unwrap(); + assert!(is_timestamp_after("2026-03-18T12:00:00.609521Z", earlier)); + assert!(!is_timestamp_after("2026-03-18T12:00:00.60952Z", later)); + assert!(!is_timestamp_after("2026-03-18T12:00:00.609521Z", later)); + } + + #[test] + fn is_timestamp_after_fallback_is_conservative() { + // Unparsable `a` must NOT advance the cursor. 
+ let sentinel = "2024-01-01T00:00:00Z".parse::>().unwrap(); + assert!(!is_timestamp_after("not-a-timestamp", sentinel)); + // Valid `a` that is older than `b` must also return false. + assert!(!is_timestamp_after("2023-01-01T00:00:00Z", sentinel)); + } + + #[test] + fn apply_query_params_substitutes_both_placeholders() { + let tmpl = "SELECT * FROM t WHERE time > '$cursor' LIMIT $limit"; + let out = apply_query_params(tmpl, "2024-01-01T00:00:00Z", "100", ""); + assert_eq!( + out, + "SELECT * FROM t WHERE time > '2024-01-01T00:00:00Z' LIMIT 100" + ); + } + + #[test] + fn apply_query_params_no_placeholders() { + let tmpl = "SELECT 1"; + assert_eq!( + apply_query_params(tmpl, "ignored", "ignored", ""), + "SELECT 1" + ); + } + + #[test] + fn apply_query_params_repeated_placeholders() { + let tmpl = "$cursor $cursor $limit"; + let out = apply_query_params(tmpl, "T", "5", ""); + assert_eq!(out, "T T 5"); + } + + // ── V2State / V3State ───────────────────────────────────────────────── + + #[test] + fn v2_state_default_is_zeroed() { + let s = V2State::default(); + assert!(s.last_timestamp.is_none()); + assert_eq!(s.processed_rows, 0); + assert_eq!(s.cursor_row_count, 0); + } + + #[test] + fn v3_state_default_is_zeroed() { + let s = V3State::default(); + assert!(s.last_timestamp.is_none()); + assert_eq!(s.processed_rows, 0); + assert_eq!(s.effective_batch_size, 0); + } + + #[test] + fn v2_state_clone_preserves_all_fields() { + let original = V2State { + last_timestamp: Some("2024-01-01T00:00:00Z".to_string()), + processed_rows: 42, + cursor_row_count: 3, + }; + let cloned = original.clone(); + assert_eq!(cloned.last_timestamp, original.last_timestamp); + assert_eq!(cloned.processed_rows, original.processed_rows); + assert_eq!(cloned.cursor_row_count, original.cursor_row_count); + } + + #[test] + fn v3_state_clone_preserves_all_fields() { + let original = V3State { + last_timestamp: Some("2024-06-15T12:30:00Z".to_string()), + processed_rows: 100, + effective_batch_size: 
1000, + last_timestamp_row_offset: 0, + }; + let cloned = original.clone(); + assert_eq!(cloned.last_timestamp, original.last_timestamp); + assert_eq!(cloned.processed_rows, original.processed_rows); + assert_eq!(cloned.effective_batch_size, original.effective_batch_size); + } + + #[test] + fn v2_state_serde_round_trip() { + let original = V2State { + last_timestamp: Some("2024-06-15T12:30:00Z".to_string()), + processed_rows: 999, + cursor_row_count: 7, + }; + let json = serde_json::to_string(&original).unwrap(); + let restored: V2State = serde_json::from_str(&json).unwrap(); + assert_eq!(restored.last_timestamp, original.last_timestamp); + assert_eq!(restored.processed_rows, original.processed_rows); + assert_eq!(restored.cursor_row_count, original.cursor_row_count); + } + + #[test] + fn v3_state_serde_round_trip() { + let original = V3State { + last_timestamp: Some("2024-06-15T12:30:00Z".to_string()), + processed_rows: 500, + effective_batch_size: 2000, + last_timestamp_row_offset: 0, + }; + let json = serde_json::to_string(&original).unwrap(); + let restored: V3State = serde_json::from_str(&json).unwrap(); + assert_eq!(restored.last_timestamp, original.last_timestamp); + assert_eq!(restored.processed_rows, original.processed_rows); + assert_eq!(restored.effective_batch_size, original.effective_batch_size); + } + + #[test] + fn persisted_state_v2_serde_includes_version_tag() { + let state = PersistedState::V2(V2State { + last_timestamp: Some("2024-01-01T00:00:00Z".to_string()), + processed_rows: 1, + cursor_row_count: 0, + }); + let json = serde_json::to_string(&state).unwrap(); + assert!(json.contains(r#""version":"v2""#)); + let restored: PersistedState = serde_json::from_str(&json).unwrap(); + assert!(matches!(restored, PersistedState::V2(_))); + } + + #[test] + fn persisted_state_v3_serde_includes_version_tag() { + let state = PersistedState::V3(V3State { + last_timestamp: Some("2024-01-01T00:00:00Z".to_string()), + processed_rows: 1, + effective_batch_size: 
500, + last_timestamp_row_offset: 0, + }); + let json = serde_json::to_string(&state).unwrap(); + assert!(json.contains(r#""version":"v3""#)); + let restored: PersistedState = serde_json::from_str(&json).unwrap(); + assert!(matches!(restored, PersistedState::V3(_))); + } + + #[test] + fn persisted_state_wrong_version_tag_fails_to_deserialize() { + let json = r#"{"version":"v9","last_timestamp":null,"processed_rows":0}"#; + let result: Result = serde_json::from_str(json); + assert!(result.is_err()); + } + + // ── InfluxDbSourceConfig backward-compat deserialization ───────────────── + + #[test] + fn source_config_without_version_defaults_to_v2() { + // Existing V2 configs that pre-date the version field must deserialize + // as V2 without any modification to the config file. + let json = + r#"{"url":"http://localhost:8086","org":"myorg","token":"t","query":"SELECT 1"}"#; + let cfg: InfluxDbSourceConfig = serde_json::from_str(json).unwrap(); + assert!( + matches!(cfg, InfluxDbSourceConfig::V2(_)), + "missing version must default to v2" + ); + } + + #[test] + fn source_config_with_explicit_v2_version_deserializes_v2() { + let json = r#"{"version":"v2","url":"http://localhost:8086","org":"o","token":"t","query":"SELECT 1"}"#; + let cfg: InfluxDbSourceConfig = serde_json::from_str(json).unwrap(); + assert!(matches!(cfg, InfluxDbSourceConfig::V2(_))); + } + + #[test] + fn source_config_with_version_v3_deserializes_v3() { + let json = r#"{"version":"v3","url":"http://localhost:8181","db":"d","token":"t","query":"SELECT 1"}"#; + let cfg: InfluxDbSourceConfig = serde_json::from_str(json).unwrap(); + assert!(matches!(cfg, InfluxDbSourceConfig::V3(_))); + } + + #[test] + fn source_config_unknown_version_returns_error() { + let json = + r#"{"version":"v9","url":"http://localhost","org":"o","token":"t","query":"SELECT 1"}"#; + let result: Result = serde_json::from_str(json); + assert!(result.is_err(), "unknown version must be rejected"); + } + + #[test] + fn 
source_config_serializes_with_version_tag() { + // Round-trip: serialize produces the version tag so the output can be + // loaded back by a version-aware deserializer. + let cfg = InfluxDbSourceConfig::V2(V2SourceConfig { + url: "http://localhost".to_string(), + org: "o".to_string(), + token: SecretString::from("t"), + query: "q".to_string(), + poll_interval: None, + batch_size: None, + cursor_field: None, + initial_offset: None, + payload_column: None, + payload_format: None, + include_metadata: None, + verbose_logging: None, + max_retries: None, + retry_delay: None, + timeout: None, + max_open_retries: None, + open_retry_max_delay: None, + retry_max_delay: None, + circuit_breaker_threshold: None, + circuit_breaker_cool_down: None, + }); + let json = serde_json::to_string(&cfg).unwrap(); + assert!( + json.contains(r#""version":"v2""#), + "serialized form must include version tag" + ); + let restored: InfluxDbSourceConfig = serde_json::from_str(&json).unwrap(); + assert!(matches!(restored, InfluxDbSourceConfig::V2(_))); + } + + #[test] + fn source_config_toml_without_version_defaults_to_v2() { + // Connectors load config from TOML files in production. Verify the + // backward-compat path works with TOML, not just JSON. 
+ let toml_str = r#" +url = "http://localhost:8086" +org = "myorg" +token = "t" +query = "SELECT 1" +"#; + let cfg: InfluxDbSourceConfig = toml::from_str(toml_str).unwrap(); + assert!( + matches!(cfg, InfluxDbSourceConfig::V2(_)), + "TOML config without version= must default to V2" + ); + } + + #[test] + fn source_config_toml_with_version_v3_deserializes_v3() { + let toml_str = r#" +version = "v3" +url = "http://localhost:8181" +db = "mydb" +token = "t" +query = "SELECT 1" +"#; + let cfg: InfluxDbSourceConfig = toml::from_str(toml_str).unwrap(); + assert!(matches!(cfg, InfluxDbSourceConfig::V3(_))); + } + + // ── InfluxDbSourceConfig accessors ─────────────────────────────────────── + + fn make_v2_cfg() -> InfluxDbSourceConfig { + let json = r#"{"version":"v2","url":"http://host:8086/","org":"o","token":"t","query":"q", + "poll_interval":"5s","batch_size":200,"cursor_field":"_time","initial_offset":"1970-01-01T00:00:00Z", + "payload_column":"data","payload_format":"json","include_metadata":false, + "verbose_logging":true,"retry_delay":"1s","timeout":"10s","max_open_retries":5, + "open_retry_max_delay":"30s","retry_max_delay":"2s","circuit_breaker_threshold":3, + "circuit_breaker_cool_down":"60s","max_retries":4}"#; + serde_json::from_str(json).unwrap() + } + + fn make_v3_cfg() -> InfluxDbSourceConfig { + let json = r#"{"version":"v3","url":"http://host:8181/","db":"mydb","token":"t","query":"q", + "batch_size":300,"payload_format":"text","include_metadata":true,"max_retries":2}"#; + serde_json::from_str(json).unwrap() + } + + #[test] + fn config_accessors_v2_all_fields() { + let cfg = make_v2_cfg(); + assert_eq!(cfg.url(), "http://host:8086/"); + assert_eq!(cfg.poll_interval(), Some("5s")); + assert_eq!(cfg.batch_size(), 200); + assert_eq!(cfg.initial_offset(), Some("1970-01-01T00:00:00Z")); + assert_eq!(cfg.payload_column(), Some("data")); + assert_eq!(cfg.payload_format(), Some("json")); + assert!(!cfg.include_metadata()); + assert!(cfg.verbose_logging()); + 
assert_eq!(cfg.retry_delay(), Some("1s")); + assert_eq!(cfg.timeout(), Some("10s")); + assert_eq!(cfg.max_open_retries(), 5); + assert_eq!(cfg.open_retry_max_delay(), Some("30s")); + assert_eq!(cfg.retry_max_delay(), Some("2s")); + assert_eq!(cfg.circuit_breaker_threshold(), 3); + assert_eq!(cfg.circuit_breaker_cool_down(), Some("60s")); + assert_eq!(cfg.max_retries(), 4); + assert_eq!(cfg.base_url(), "http://host:8086"); + assert_eq!(cfg.version_label(), "v2"); + assert_eq!(cfg.cursor_field(), "_time"); + } + + #[test] + fn config_accessors_v3_all_fields() { + let cfg = make_v3_cfg(); + assert_eq!(cfg.url(), "http://host:8181/"); + assert_eq!(cfg.batch_size(), 300); + assert_eq!(cfg.payload_format(), Some("text")); + assert!(cfg.include_metadata()); + assert_eq!(cfg.max_retries(), 2); + assert_eq!(cfg.base_url(), "http://host:8181"); + assert_eq!(cfg.version_label(), "v3"); + assert_eq!(cfg.cursor_field(), "time"); // V3 default + } + + #[test] + fn config_accessor_batch_size_zero_is_floored_to_one() { + // batch_size: 0 would produce LIMIT 0 queries; the accessor floors it to 1. 
+ let json = + r#"{"version":"v2","url":"http://h","org":"o","token":"t","query":"q","batch_size":0}"#; + let cfg: InfluxDbSourceConfig = serde_json::from_str(json).unwrap(); + assert_eq!(cfg.batch_size(), 1); + } + + #[test] + fn config_accessor_defaults_when_fields_absent() { + let json = r#"{"version":"v2","url":"http://h","org":"o","token":"t","query":"q"}"#; + let cfg: InfluxDbSourceConfig = serde_json::from_str(json).unwrap(); + assert_eq!(cfg.batch_size(), 500); + assert!(cfg.poll_interval().is_none()); + assert!(cfg.initial_offset().is_none()); + assert!(cfg.payload_column().is_none()); + assert!(cfg.payload_format().is_none()); + assert!(cfg.include_metadata()); // default true + assert!(!cfg.verbose_logging()); // default false + assert!(cfg.retry_delay().is_none()); + assert!(cfg.timeout().is_none()); + assert_eq!(cfg.max_open_retries(), 10); + assert!(cfg.open_retry_max_delay().is_none()); + assert!(cfg.retry_max_delay().is_none()); + assert_eq!(cfg.circuit_breaker_threshold(), 5); + assert!(cfg.circuit_breaker_cool_down().is_none()); + assert_eq!(cfg.max_retries(), 3); + } + + #[test] + fn source_config_version_not_a_string_returns_error() { + // version must be a string — numeric or null version must be rejected. 
+ let json = r#"{"version":42,"url":"http://h","org":"o","token":"t","query":"q"}"#; + let result: Result = serde_json::from_str(json); + assert!(result.is_err(), "numeric version must be rejected"); + } + + // ── PayloadFormat ──────────────────────────────────────────────────────── + + #[test] + fn payload_format_from_config_all_variants() { + assert_eq!( + PayloadFormat::from_config(Some("text")), + PayloadFormat::Text + ); + assert_eq!( + PayloadFormat::from_config(Some("utf8")), + PayloadFormat::Text + ); + assert_eq!(PayloadFormat::from_config(Some("raw")), PayloadFormat::Raw); + assert_eq!( + PayloadFormat::from_config(Some("base64")), + PayloadFormat::Raw + ); + assert_eq!( + PayloadFormat::from_config(Some("json")), + PayloadFormat::Json + ); + assert_eq!(PayloadFormat::from_config(None), PayloadFormat::Json); + } + + #[test] + fn payload_format_from_config_unrecognized_falls_back_to_json() { + assert_eq!(PayloadFormat::from_config(Some("xml")), PayloadFormat::Json); + } + + #[test] + fn payload_format_schema_all_variants() { + use crate::common::Schema; + assert_eq!(PayloadFormat::Json.schema(), Schema::Json); + assert_eq!(PayloadFormat::Text.schema(), Schema::Text); + assert_eq!(PayloadFormat::Raw.schema(), Schema::Raw); + } + + // ── parse_scalar float ─────────────────────────────────────────────────── + + #[test] + fn parse_scalar_float_values() { + // Finite f64 — can be represented as JSON number. + assert_eq!( + parse_scalar("1.23456"), + serde_json::Value::Number(serde_json::Number::from_f64(1.23456).unwrap()) + ); + // NaN is not representable in JSON — falls back to String. 
+ assert_eq!( + parse_scalar("NaN"), + serde_json::Value::String("NaN".to_string()) + ); + } + + // ── apply_query_params $offset and unknown $ ───────────────────────────── + + #[test] + fn apply_query_params_substitutes_offset() { + let tmpl = "SELECT * FROM t WHERE time > '$cursor' LIMIT $limit OFFSET $offset"; + let out = apply_query_params(tmpl, "T", "10", "5"); + assert_eq!(out, "SELECT * FROM t WHERE time > 'T' LIMIT 10 OFFSET 5"); + } + + #[test] + fn apply_query_params_unknown_dollar_passthrough() { + // An unrecognized $-placeholder is passed through literally. + let tmpl = "SELECT $unknown FROM t"; + let out = apply_query_params(tmpl, "T", "10", "0"); + assert_eq!(out, "SELECT $unknown FROM t"); + } +} diff --git a/core/connectors/sources/influxdb_source/src/lib.rs b/core/connectors/sources/influxdb_source/src/lib.rs index 8ef4c75b15..a444be4f69 100644 --- a/core/connectors/sources/influxdb_source/src/lib.rs +++ b/core/connectors/sources/influxdb_source/src/lib.rs @@ -16,696 +16,287 @@ * under the License. 
*/ +mod common; +mod row; +mod v2; +mod v3; + +use crate::common::is_timestamp_after; use async_trait::async_trait; -use base64::{Engine as _, engine::general_purpose}; -use csv::StringRecord; -use iggy_common::serde_secret::serialize_secret; -use iggy_common::{DateTime, Utc}; +use chrono::{DateTime, Utc}; +use common::{ + InfluxDbSourceConfig, PayloadFormat, PersistedState, V2State, V3State, validate_cursor, + validate_cursor_field, +}; use iggy_connector_sdk::retry::{ CircuitBreaker, ConnectivityConfig, build_retry_client, check_connectivity_with_retry, parse_duration, }; use iggy_connector_sdk::{ - ConnectorState, Error, ProducedMessage, ProducedMessages, Schema, Source, source_connector, + ConnectorState, Error, ProducedMessages, Schema, Source, source_connector, }; -use regex::Regex; use reqwest::Url; use reqwest_middleware::ClientWithMiddleware; -use secrecy::{ExposeSecret, SecretString}; -use serde::{Deserialize, Serialize}; -use serde_json::json; -use std::collections::HashMap; +use secrecy::{ExposeSecret, SecretBox}; use std::sync::Arc; -use std::sync::OnceLock; use std::time::Duration; use tokio::sync::Mutex; use tracing::{debug, error, info, warn}; -use uuid::Uuid; source_connector!(InfluxDbSource); const CONNECTOR_NAME: &str = "InfluxDB source"; -const DEFAULT_MAX_RETRIES: u32 = 3; const DEFAULT_RETRY_DELAY: &str = "1s"; const DEFAULT_POLL_INTERVAL: &str = "5s"; const DEFAULT_TIMEOUT: &str = "10s"; -const DEFAULT_CURSOR: &str = "1970-01-01T00:00:00Z"; -// Maximum attempts for open() connectivity retries -const DEFAULT_MAX_OPEN_RETRIES: u32 = 10; -// Cap for exponential backoff in open() — never wait longer than this const DEFAULT_OPEN_RETRY_MAX_DELAY: &str = "60s"; -// Cap for exponential backoff on per-query retries — kept short so a -// transient InfluxDB blip does not stall polling for too long const DEFAULT_RETRY_MAX_DELAY: &str = "5s"; -// How many consecutive poll failures open the circuit breaker -const DEFAULT_CIRCUIT_BREAKER_THRESHOLD: u32 = 5; 
-// How long the circuit stays open before allowing a probe attempt const DEFAULT_CIRCUIT_COOL_DOWN: &str = "30s"; -/// RFC 3339 / ISO 8601 datetime pattern. -/// Matches the forms InfluxDB stores in `_time`: -/// "2024-01-15T10:30:00Z" -/// "2024-01-15T10:30:00.123456789Z" -/// "2024-01-15T10:30:00+05:30" -/// Intentionally strict: only digits, T, Z, colon, dot, plus, hyphen. -/// Any Flux syntax character (pipe, quote, paren, space, slash) is rejected. -static CURSOR_RE: OnceLock = OnceLock::new(); +// ── Connector state ─────────────────────────────────────────────────────────── -// --------------------------------------------------------------------------- -// Main connector structs -// --------------------------------------------------------------------------- +#[derive(Debug)] +enum VersionState { + V2(Mutex), + V3(Mutex), +} + +// ── Connector struct ────────────────────────────────────────────────────────── #[derive(Debug)] pub struct InfluxDbSource { pub id: u32, config: InfluxDbSourceConfig, - /// `None` until `open()` is called. Wraps `reqwest::Client` with - /// [`HttpRetryMiddleware`] so retry/back-off/jitter is handled - /// transparently by the middleware stack instead of a hand-rolled loop. client: Option, - state: Mutex, - verbose: bool, - retry_delay: Duration, - poll_interval: Duration, - /// Resolved once in `new()` — avoids a `to_ascii_lowercase()` allocation - /// on every message in the hot path. + version_state: VersionState, payload_format: PayloadFormat, + poll_interval: Duration, + retry_delay: Duration, circuit_breaker: Arc, - /// Set when a persisted `ConnectorState` was provided to `new()` but could - /// not be deserialized into `State` (e.g. schema changed after an upgrade). - /// `open()` refuses to start when this is `true` so operators are not - /// surprised by a silent cursor reset and full re-delivery. 
+ auth_header: Option>, state_restore_failed: bool, } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct InfluxDbSourceConfig { - pub url: String, - pub org: String, - #[serde(serialize_with = "serialize_secret")] - pub token: SecretString, - pub query: String, - pub poll_interval: Option, - pub batch_size: Option, - pub cursor_field: Option, - pub initial_offset: Option, - pub payload_column: Option, - pub payload_format: Option, - pub include_metadata: Option, - pub verbose_logging: Option, - pub max_retries: Option, - pub retry_delay: Option, - pub timeout: Option, - // How many times open() will retry before giving up - pub max_open_retries: Option, - // Upper cap on open() backoff delay — can be set high (e.g. "60s") for - // patient startup without affecting per-query retry behaviour - pub open_retry_max_delay: Option, - // Upper cap on per-query retry backoff — kept short so a transient blip - // does not stall polling; independent of open_retry_max_delay - pub retry_max_delay: Option, - // Circuit breaker configuration - pub circuit_breaker_threshold: Option, - pub circuit_breaker_cool_down: Option, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] -enum PayloadFormat { - #[default] - Json, - Text, - Raw, -} - -impl PayloadFormat { - fn from_config(value: Option<&str>) -> Self { - match value.map(|v| v.to_ascii_lowercase()).as_deref() { - Some("text") | Some("utf8") => PayloadFormat::Text, - Some("raw") | Some("base64") => PayloadFormat::Raw, - Some("json") => PayloadFormat::Json, - other => { - warn!( - "Unrecognized payload_format value {:?}, falling back to JSON. 
\ - Valid values are: \"json\", \"text\", \"utf8\", \"base64\", \"raw\".", - other - ); - PayloadFormat::Json - } - } - } - - fn schema(self) -> Schema { - match self { - PayloadFormat::Json => Schema::Json, - PayloadFormat::Text => Schema::Text, - PayloadFormat::Raw => Schema::Raw, - } - } -} - -#[derive(Debug, Serialize, Deserialize)] -struct State { - last_poll_time: DateTime, - last_timestamp: Option, - processed_rows: u64, - /// How many rows at `last_timestamp` have already been delivered downstream. - /// - /// When the user's Flux query uses `>= $cursor`, consecutive polls may - /// return the same rows for the current cursor timestamp. This counter - /// lets `poll_messages` skip those already-delivered rows and inflate - /// `$limit` accordingly, preventing both duplicates and data loss at - /// batch boundaries where multiple rows share the same timestamp. - /// - /// `#[serde(default)]` keeps existing persisted state files forward-compatible: - /// the field defaults to 0 when the state was saved by an older version. - #[serde(default)] - cursor_row_count: u64, -} - -// --------------------------------------------------------------------------- -// Helpers -// --------------------------------------------------------------------------- - -fn parse_scalar(value: &str) -> serde_json::Value { - if value.is_empty() { - return serde_json::Value::Null; - } - if let Ok(v) = value.parse::() { - return serde_json::Value::Bool(v); - } - if let Ok(v) = value.parse::() { - return serde_json::Value::Number(v.into()); - } - if let Ok(v) = value.parse::() - && let Some(number) = serde_json::Number::from_f64(v) - { - return serde_json::Value::Number(number); - } - serde_json::Value::String(value.to_string()) -} - -/// Recognise an InfluxDB CSV header row. -/// -/// A header row must contain a `_time` column. 
The `_value` column is -/// intentionally **not** required: Flux aggregation queries (`count()`, -/// `mean()`, `group()`) produce result tables with columns like `_count` or -/// `_mean` instead of `_value`. Requiring `_value` would cause those header -/// rows to be missed, silently skipping all subsequent data rows until the -/// next recognised header. -/// -/// InfluxDB annotation rows (`#group`, `#datatype`, `#default`) are already -/// filtered out earlier in `parse_csv_rows` by the leading-`#` check, so -/// they will never reach this function. -fn is_header_record(record: &StringRecord) -> bool { - record.iter().any(|v| v == "_time") -} - -/// Compare two RFC 3339 timestamp strings chronologically. -/// -/// InfluxDB strips trailing fractional-second zeros, producing timestamps like -/// `"2026-03-18T12:00:00.60952Z"` (= 609520µs). A naïve `>` string comparison -/// treats this as *greater* than `"2026-03-18T12:00:00.609521Z"` because `'Z'` -/// (ASCII 90) > `'1'` (ASCII 49), even though the former is chronologically -/// *earlier*. Always parse to `DateTime` so the comparison is correct. 
-fn is_timestamp_after(a: &str, b: &str) -> bool { - match (a.parse::>(), b.parse::>()) { - (Ok(dt_a), Ok(dt_b)) => dt_a > dt_b, - _ => a > b, - } -} - -// --------------------------------------------------------------------------- -// InfluxDbSource implementation -// --------------------------------------------------------------------------- - impl InfluxDbSource { pub fn new(id: u32, config: InfluxDbSourceConfig, state: Option) -> Self { - let verbose = config.verbose_logging.unwrap_or(false); - let retry_delay = parse_duration(config.retry_delay.as_deref(), DEFAULT_RETRY_DELAY); - let poll_interval = parse_duration(config.poll_interval.as_deref(), DEFAULT_POLL_INTERVAL); - let payload_format = PayloadFormat::from_config(config.payload_format.as_deref()); - - // Build circuit breaker from config - let cb_threshold = config - .circuit_breaker_threshold - .unwrap_or(DEFAULT_CIRCUIT_BREAKER_THRESHOLD); + let retry_delay = parse_duration(config.retry_delay(), DEFAULT_RETRY_DELAY); + let poll_interval = parse_duration(config.poll_interval(), DEFAULT_POLL_INTERVAL); + let payload_format = PayloadFormat::from_config(config.payload_format()); + + let cb_threshold = config.circuit_breaker_threshold(); let cb_cool_down = parse_duration( - config.circuit_breaker_cool_down.as_deref(), + config.circuit_breaker_cool_down(), DEFAULT_CIRCUIT_COOL_DOWN, ); + let circuit_breaker = Arc::new(CircuitBreaker::new(cb_threshold, cb_cool_down)); - // Distinguish "no prior state" (fresh start, expected) from "state - // existed but could not be deserialized" (schema mismatch after an - // upgrade, unexpected). Collapsing both into None via and_then() would - // silently reset the cursor to the epoch and cause a full re-delivery. - let (restored_state, state_restore_failed) = match state { - None => (None, false), - Some(s) => match s.deserialize::(CONNECTOR_NAME, id) { - Some(state) => { - info!( - "Restored state for {CONNECTOR_NAME} connector with ID: {id}. 
\ - Last timestamp: {:?}, processed rows: {}", - state.last_timestamp, state.processed_rows - ); - (Some(state), false) - } - None => { - // ConnectorState::deserialize already logs at warn level; - // escalate to error here so the operator sees the connector - // ID and understands the cursor will NOT be silently reset. - error!( - "InfluxDB source ID: {id} — persisted state exists but could not \ - be deserialized (possible schema change after upgrade). \ - Refusing to start to prevent silent cursor reset and full \ - re-delivery. Clear or migrate the connector state to proceed." - ); - (None, true) - } - }, + let (version_state, state_restore_failed) = match &config { + InfluxDbSourceConfig::V2(_) => { + let (s, failed) = restore_v2_state(id, state); + (VersionState::V2(Mutex::new(s)), failed) + } + InfluxDbSourceConfig::V3(_) => { + let (s, failed) = restore_v3_state(id, state); + (VersionState::V3(Mutex::new(s)), failed) + } }; InfluxDbSource { id, config, client: None, - state: Mutex::new(restored_state.unwrap_or(State { - last_poll_time: Utc::now(), - last_timestamp: None, - processed_rows: 0, - cursor_row_count: 0, - })), - verbose, - retry_delay, - poll_interval, + version_state, payload_format, - circuit_breaker: Arc::new(CircuitBreaker::new(cb_threshold, cb_cool_down)), + poll_interval, + retry_delay, + circuit_breaker, + auth_header: None, state_restore_failed, } } - fn serialize_state(&self, state: &State) -> Option { - ConnectorState::serialize(state, CONNECTOR_NAME, self.id) - } - - fn payload_format(&self) -> PayloadFormat { - self.payload_format - } - - fn cursor_field(&self) -> &str { - self.config.cursor_field.as_deref().unwrap_or("_time") - } - - fn get_max_retries(&self) -> u32 { - self.config - .max_retries - .unwrap_or(DEFAULT_MAX_RETRIES) - .max(1) - } - - fn build_raw_client(&self) -> Result { - let timeout = parse_duration(self.config.timeout.as_deref(), DEFAULT_TIMEOUT); - reqwest::Client::builder() - .timeout(timeout) - .build() - 
.map_err(|e| Error::InitError(format!("Failed to create HTTP client: {e}"))) - } - fn get_client(&self) -> Result<&ClientWithMiddleware, Error> { self.client .as_ref() - .ok_or_else(|| Error::Connection("InfluxDB client is not initialized".to_string())) - } - - fn build_health_url(&self) -> Result { - let base = self.config.url.trim_end_matches('/'); - Url::parse(&format!("{base}/health")) - .map_err(|e| Error::InvalidConfigValue(format!("Invalid InfluxDB URL: {e}"))) - } - - fn build_query_url(&self) -> Result { - let base = self.config.url.trim_end_matches('/'); - let mut url = Url::parse(&format!("{base}/api/v2/query")) - .map_err(|e| Error::InvalidConfigValue(format!("Invalid InfluxDB URL: {e}")))?; - url.query_pairs_mut().append_pair("org", &self.config.org); - Ok(url) - } - - fn cursor_re() -> &'static Regex { - CURSOR_RE.get_or_init(|| { - Regex::new(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})$") - .expect("hardcoded regex is valid") - }) - } - - fn validate_cursor(cursor: &str) -> Result<(), Error> { - if Self::cursor_re().is_match(cursor) { - Ok(()) - } else { - Err(Error::InvalidConfigValue(format!( - "cursor value {:?} is not a valid RFC 3339 timestamp; \ - refusing substitution to prevent Flux query injection", - cursor - ))) - } + .ok_or_else(|| Error::Connection("InfluxDB client not initialized".to_string())) } +} - /// Reject cursor fields that would produce incorrect results. - /// - /// Cursor advancement compares values as `String`s (lexicographic order). - /// This is correct for ISO 8601 / RFC 3339 timestamps — the default - /// `cursor_field` of `"_time"` — because their fixed-width format makes - /// lexicographic and chronological order identical. 
- fn validate_cursor_field(field: &str) -> Result<(), Error> { - match field { - "_time" | "time" => Ok(()), - other => Err(Error::InvalidConfigValue(format!( - "cursor_field {:?} is not supported — cursor values are compared as strings \ - (lexicographic order), which is only correct for ISO 8601 timestamp columns. \ - Use the default \"_time\" column, or omit cursor_field entirely.", - other - ))), +// ── State restore helpers ───────────────────────────────────────────────────── + +fn restore_v2_state(id: u32, state: Option) -> (V2State, bool) { + let Some(cs) = state else { + return (V2State::default(), false); + }; + match cs.deserialize::(CONNECTOR_NAME, id) { + Some(PersistedState::V2(s)) => { + info!( + "{CONNECTOR_NAME} ID {id}: restored V2 state — \ + last_timestamp={:?}, processed_rows={}", + s.last_timestamp, s.processed_rows + ); + (s, false) } - } - - fn query_with_params(&self, cursor: &str, already_seen: u64) -> Result { - // Reject anything that is not a well-formed RFC 3339 timestamp. - // This prevents a crafted or corrupted _time value (e.g. containing - // Flux syntax like `") |> drop() //`) from being injected into the - // query string before it is sent to /api/v2/query. - // Note: InfluxDB OSS v2 does not support the `params` JSON field for - // parameterized queries (Cloud-only feature), so substitution is - // unavoidable for OSS — validation is the correct mitigation here. - Self::validate_cursor(cursor)?; - // Inflate the limit so that after skipping `already_seen` rows at the - // cursor timestamp we still return a full batch of new rows. This is - // a no-op when `already_seen == 0` (first poll or `>` queries). 
- let batch_size = self.config.batch_size.unwrap_or(500) as u64; - let limit = batch_size.saturating_add(already_seen).to_string(); - let mut query = self.config.query.clone(); - if query.contains("$cursor") { - query = query.replace("$cursor", cursor); + Some(PersistedState::V3(_)) => { + error!( + "{CONNECTOR_NAME} ID {id}: persisted state is V3 but connector is configured \ + as V2. Refusing to start to prevent cursor reset. \ + Clear or migrate the connector state to proceed." + ); + (V2State::default(), true) } - if query.contains("$limit") { - query = query.replace("$limit", &limit); + None => { + error!( + "{CONNECTOR_NAME} ID {id}: persisted state exists but could not be deserialized. \ + Refusing to start to prevent silent cursor reset." + ); + (V2State::default(), true) } - Ok(query) } +} - /// Execute a Flux query against `/api/v2/query` and return the raw CSV - /// response body. Retry/back-off is handled transparently by the - /// `ClientWithMiddleware` stack (see `build_retry_client`). 
- async fn run_query(&self, query: &str) -> Result { - let client = self.get_client()?; - let url = self.build_query_url()?; - let token = self.config.token.expose_secret().to_owned(); - - let body = json!({ - "query": query, - "dialect": { - "annotations": [], - "delimiter": ",", - "header": true, - "commentPrefix": "#" - } - }); - - let response = client - .post(url) - .header("Authorization", format!("Token {token}")) - .header("Content-Type", "application/json") - .header("Accept", "text/csv") - .json(&body) - .send() - .await - .map_err(|e| Error::Storage(format!("InfluxDB query failed: {e}")))?; - - let status = response.status(); - if status.is_success() { - return response - .text() - .await - .map_err(|e| Error::Storage(format!("Failed to read query response: {e}"))); +fn restore_v3_state(id: u32, state: Option) -> (V3State, bool) { + let Some(cs) = state else { + return (V3State::default(), false); + }; + match cs.deserialize::(CONNECTOR_NAME, id) { + Some(PersistedState::V3(s)) => { + info!( + "{CONNECTOR_NAME} ID {id}: restored V3 state — \ + last_timestamp={:?}, processed_rows={}", + s.last_timestamp, s.processed_rows + ); + (s, false) } - - let body_text = response - .text() - .await - .unwrap_or_else(|_| "failed to read response body".to_string()); - - // Use PermanentHttpError for non-transient 4xx (400 Bad Request, 401 - // Unauthorized, etc.) so poll() can skip the circuit breaker for these - // — they indicate a config/data issue, not an infrastructure failure. - if iggy_connector_sdk::retry::is_transient_status(status) { - Err(Error::Storage(format!( - "InfluxDB query failed with status {status}: {body_text}" - ))) - } else { - Err(Error::PermanentHttpError(format!( - "InfluxDB query failed with status {status}: {body_text}" - ))) + Some(PersistedState::V2(_)) => { + error!( + "{CONNECTOR_NAME} ID {id}: persisted state is V2 but connector is configured \ + as V3. Refusing to start to prevent cursor reset. 
\ + Clear or migrate the connector state to proceed." + ); + (V3State::default(), true) + } + None => { + error!( + "{CONNECTOR_NAME} ID {id}: persisted state exists but could not be deserialized. \ + Refusing to start to prevent silent cursor reset." + ); + (V3State::default(), true) } } +} - fn parse_csv_rows(&self, csv_text: &str) -> Result>, Error> { - let mut reader = csv::ReaderBuilder::new() - .has_headers(false) - .from_reader(csv_text.as_bytes()); - - let mut headers: Option = None; - let mut rows = Vec::new(); - - for result in reader.records() { - let record = result - .map_err(|e| Error::InvalidRecordValue(format!("Invalid CSV record: {e}")))?; - - if record.is_empty() { - continue; - } - - if let Some(first) = record.get(0) - && first.starts_with('#') - { - continue; - } - - if is_header_record(&record) { - headers = Some(record.clone()); - continue; - } - - let Some(active_headers) = headers.as_ref() else { - continue; - }; +// ── Source trait ────────────────────────────────────────────────────────────── - if record == *active_headers { - continue; - } +#[async_trait] +impl Source for InfluxDbSource { + async fn open(&mut self) -> Result<(), Error> { + if self.state_restore_failed { + return Err(Error::InvalidState); + } - let mut mapped = HashMap::new(); - for (idx, key) in active_headers.iter().enumerate() { - if key.is_empty() { - continue; - } - let value = record.get(idx).unwrap_or("").to_string(); - mapped.insert(key.to_string(), value); - } + let ver = self.config.version_label(); + info!( + "Opening {CONNECTOR_NAME} with ID: {} (version={ver})", + self.id + ); - if !mapped.is_empty() { - rows.push(mapped); - } + validate_cursor_field(self.config.cursor_field(), self.config.version_label())?; + if let Some(offset) = self.config.initial_offset() { + validate_cursor(offset)?; } - Ok(rows) - } - - fn build_payload( - &self, - row: &HashMap, - include_metadata: bool, - ) -> Result, Error> { - if let Some(payload_column) = 
self.config.payload_column.as_deref() { - let raw_value = row.get(payload_column).cloned().ok_or_else(|| { - Error::InvalidRecordValue(format!("Missing payload column '{payload_column}'")) - })?; + if let InfluxDbSourceConfig::V3(cfg) = &self.config + && let Some(cap) = cfg.stuck_batch_cap_factor + && cap > v3::MAX_STUCK_CAP_FACTOR + { + return Err(Error::InvalidConfigValue(format!( + "stuck_batch_cap_factor {cap} exceeds maximum of {}; \ + reduce it to avoid querying up to {}×batch_size rows per poll.", + v3::MAX_STUCK_CAP_FACTOR, + v3::MAX_STUCK_CAP_FACTOR + ))); + } - return match self.payload_format() { - PayloadFormat::Json => { - let value: serde_json::Value = - serde_json::from_str(&raw_value).map_err(|e| { - Error::InvalidRecordValue(format!( - "Payload column '{payload_column}' is not valid JSON: {e}" - )) - })?; - serde_json::to_vec(&value).map_err(|e| { - Error::Serialization(format!("JSON serialization failed: {e}")) - }) - } - PayloadFormat::Text => Ok(raw_value.into_bytes()), - PayloadFormat::Raw => general_purpose::STANDARD - .decode(raw_value.as_bytes()) - .map_err(|e| { - Error::InvalidRecordValue(format!( - "Failed to decode payload as base64: {e}" - )) - }), - }; + if let InfluxDbSourceConfig::V3(_) = &self.config + && self.config.batch_size() == 0 + { + return Err(Error::InvalidConfigValue( + "batch_size must be >= 1; got 0. \ + A LIMIT 0 query would return no rows and stall the connector." + .into(), + )); } - let mut json_row = serde_json::Map::new(); - for (key, value) in row { - if include_metadata || key == "_value" || key == "_time" || key == "_measurement" { - json_row.insert(key.clone(), parse_scalar(value)); + // V3 stuck-batch inflation writes last_timestamp_row_offset to state and + // passes it as `$offset` on the next poll so already-seen rows at the same + // timestamp are skipped. 
If the query template lacks `$offset`, apply_query_params + // silently no-ops and the same head rows are re-fetched and re-emitted on + // every poll — duplicate delivery with no error. + if let InfluxDbSourceConfig::V3(cfg) = &self.config { + let cap = cfg + .stuck_batch_cap_factor + .unwrap_or(v3::DEFAULT_STUCK_CAP_FACTOR); + if cap > 0 && !cfg.query.contains("$offset") { + return Err(Error::InvalidConfigValue( + "V3 source query must contain the '$offset' placeholder when \ + stuck_batch_cap_factor > 0 (the default). Add \ + 'OFFSET $offset' to your query to prevent duplicate delivery \ + during stuck-batch inflation. Example: \ + \"WHERE time > '$cursor' LIMIT $limit OFFSET $offset\"" + .into(), + )); } } - let wrapped = json!({ - "measurement": row.get("_measurement").cloned().unwrap_or_default(), - "field": row.get("_field").cloned().unwrap_or_default(), - "timestamp": row.get("_time").cloned().unwrap_or_default(), - "value": row.get("_value").map(|v| parse_scalar(v)).unwrap_or(serde_json::Value::Null), - "row": json_row, - }); - - serde_json::to_vec(&wrapped) - .map_err(|e| Error::Serialization(format!("JSON serialization failed: {e}"))) - } - - /// Returns `(messages, max_cursor, rows_at_max_cursor, skipped)`. - /// - /// `rows_at_max_cursor` is the count of delivered messages whose cursor - /// field value equals `max_cursor`. The caller stores this in - /// [`State::cursor_row_count`] so the next poll can skip those rows when - /// the query uses `>= $cursor`. - /// - /// `skipped` is the number of rows that were elided because they fell - /// within the already-seen window. When the caller observes zero - /// delivered messages but `skipped > 0`, it means every row the query - /// returned was at the current cursor timestamp and had already been - /// delivered. In that case `skipped` equals the true row count at that - /// timestamp, so the caller can correct any over-inflated - /// `cursor_row_count` rather than getting permanently stuck. 
- async fn poll_messages( - &self, - ) -> Result<(Vec, Option, u64, u64), Error> { - // Read cursor and already_seen atomically from the same lock acquisition - // so the two values are always consistent with each other. - let (cursor, already_seen) = { - let state = self.state.lock().await; - let c = state - .last_timestamp - .clone() - .or_else(|| self.config.initial_offset.clone()) - .unwrap_or_else(|| DEFAULT_CURSOR.to_string()); - (c, state.cursor_row_count) - }; - - let query = self.query_with_params(&cursor, already_seen).map_err(|e| { - error!( - "InfluxDB source ID: {} — invalid cursor, skipping poll: {e}", + // Skip-N dedup for V2 requires rows to arrive sorted by time. If the Flux + // query uses `>=` semantics (inclusive cursor) without an explicit sort, + // InfluxDB may return rows in storage order, causing skip-N to silently + // skip the wrong rows and produce incorrect output. Hard-error so operators + // don't discover this only after data loss. Queries using strict `>` do not + // need skip-N and are not affected. + if let InfluxDbSourceConfig::V2(cfg) = &self.config + && cfg.query.contains(">=") + && !query_has_sort_call(&cfg.query) + { + return Err(Error::InvalidConfigValue(format!( + "{CONNECTOR_NAME} ID: {}: V2 query uses '>=' (inclusive cursor) but does \ + not contain `|> sort(columns: [\"_time\"])`. Skip-N dedup is \ + order-dependent; without sorting, InfluxDB may return rows in storage \ + order and the wrong rows will be silently skipped. 
\ + Add `|> sort(columns: [\"_time\"])` before `|> limit(...)` in your query.", self.id - ); - e - })?; - let csv_data = self.run_query(&query).await?; - - let rows = self.parse_csv_rows(&csv_data)?; - let include_metadata = self.config.include_metadata.unwrap_or(true); - let cursor_field = self.cursor_field().to_string(); - - let mut messages = Vec::with_capacity(rows.len()); - let mut max_cursor: Option = None; - let mut rows_at_max_cursor = 0u64; - let mut skipped = 0u64; - - for row in rows { - // Skip rows at the current cursor that were already delivered in a - // previous batch. This deduplicate rows when the query uses - // `>= $cursor` and a batch boundary landed inside a group of rows - // sharing the same timestamp. - if let Some(cv) = row.get(&cursor_field) - && cv == &cursor - && skipped < already_seen - { - skipped += 1; - continue; - } - - // Track the new max cursor and how many delivered rows share it. - if let Some(cv) = row.get(&cursor_field) { - match &max_cursor { - None => { - max_cursor = Some(cv.clone()); - rows_at_max_cursor = 1; - } - Some(current) => { - if is_timestamp_after(cv, current) { - max_cursor = Some(cv.clone()); - rows_at_max_cursor = 1; - } else if cv == current { - rows_at_max_cursor += 1; - } - } - } - } - - let payload = self.build_payload(&row, include_metadata)?; - // Capture once so timestamp and origin_timestamp are guaranteed identical - // and we make exactly one syscall regardless of how many fields use it. 
- let now_micros = Utc::now().timestamp_micros() as u64; - - messages.push(ProducedMessage { - id: Some(Uuid::new_v4().as_u128()), - checksum: None, - timestamp: Some(now_micros), - origin_timestamp: Some(now_micros), - headers: None, - payload, - }); + ))); } - Ok((messages, max_cursor, rows_at_max_cursor, skipped)) - } -} - -// --------------------------------------------------------------------------- -// Source trait implementation -// --------------------------------------------------------------------------- - -#[async_trait] -impl Source for InfluxDbSource { - async fn open(&mut self) -> Result<(), Error> { - if self.state_restore_failed { - return Err(Error::InvalidState); + if let InfluxDbSourceConfig::V2(_) = &self.config + && self.config.batch_size() == 0 + { + return Err(Error::InvalidConfigValue( + "batch_size must be >= 1; got 0. \ + A LIMIT 0 query would return no rows and stall the connector." + .into(), + )); } - info!( - "Opening InfluxDB source connector with ID: {}. Org: {}", - self.id, self.config.org - ); - - // Build the raw client first and use it for the startup connectivity - // check. The connectivity retry loop uses separate delay bounds - // (open_retry_max_delay) from the per-query middleware retries, so - // we keep them independent. - let raw_client = self.build_raw_client()?; + let timeout = parse_duration(self.config.timeout(), DEFAULT_TIMEOUT); + let raw_client = reqwest::Client::builder() + .timeout(timeout) + .build() + .map_err(|e| Error::InitError(format!("Failed to create HTTP client: {e}")))?; - // Validate cursor_field before touching the network: string comparison - // is only safe for timestamp columns. See validate_cursor_field for details. 
- Self::validate_cursor_field(self.cursor_field())?; + let health_url = Url::parse(&format!("{}/health", self.config.base_url())) + .map_err(|e| Error::InvalidConfigValue(format!("Invalid InfluxDB URL: {e}")))?; - let health_url = self.build_health_url()?; check_connectivity_with_retry( &raw_client, health_url, - "InfluxDB source", + CONNECTOR_NAME, self.id, &ConnectivityConfig { - max_open_retries: self - .config - .max_open_retries - .unwrap_or(DEFAULT_MAX_OPEN_RETRIES), + max_open_retries: self.config.max_open_retries(), open_retry_max_delay: parse_duration( - self.config.open_retry_max_delay.as_deref(), + self.config.open_retry_max_delay(), DEFAULT_OPEN_RETRY_MAX_DELAY, ), retry_delay: self.retry_delay, @@ -713,35 +304,33 @@ impl Source for InfluxDbSource { ) .await?; - // Wrap in the retry middleware for all subsequent query operations. - // The middleware handles transient 429 / 5xx retries with - // exponential back-off, jitter, and Retry-After header support. - let max_retries = self.get_max_retries(); - let query_retry_max_delay = parse_duration( - self.config.retry_max_delay.as_deref(), - DEFAULT_RETRY_MAX_DELAY, - ); + let query_retry_max_delay = + parse_duration(self.config.retry_max_delay(), DEFAULT_RETRY_MAX_DELAY); self.client = Some(build_retry_client( raw_client, - max_retries, + self.config.max_retries(), self.retry_delay, query_retry_max_delay, "InfluxDB", )); + let token = self.config.token_secret().expose_secret(); + self.auth_header = Some(SecretBox::new(Box::new(match &self.config { + InfluxDbSourceConfig::V2(_) => format!("Token {token}"), + InfluxDbSourceConfig::V3(_) => format!("Bearer {token}"), + }))); + info!( - "InfluxDB source connector with ID: {} opened successfully", + "{CONNECTOR_NAME} ID: {} opened successfully (version={ver})", self.id ); Ok(()) } async fn poll(&self) -> Result { - // Skip query if circuit breaker is open; sleep so the runtime does not - // spin-call poll() in a hot loop while the circuit is held open. 
if self.circuit_breaker.is_open().await { warn!( - "InfluxDB source ID: {} — circuit breaker is OPEN. Skipping poll.", + "{CONNECTOR_NAME} ID: {} — circuit breaker is OPEN. Skipping poll.", self.id ); tokio::time::sleep(self.poll_interval).await; @@ -752,118 +341,258 @@ impl Source for InfluxDbSource { }); } - match self.poll_messages().await { - Ok((messages, max_cursor, rows_at_max_cursor, skipped)) => { - // Successful poll — reset circuit breaker - self.circuit_breaker.record_success(); - - let mut state = self.state.lock().await; - state.last_poll_time = Utc::now(); - state.processed_rows += messages.len() as u64; - match max_cursor { - Some(ref new_cursor) - if state.last_timestamp.as_deref() != Some(new_cursor.as_str()) => - { - // Cursor advanced to a new timestamp — reset the row counter. - state.last_timestamp = max_cursor.clone(); - state.cursor_row_count = rows_at_max_cursor; - } - Some(_) => { - // Cursor stayed at the same timestamp — accumulate so the - // next poll skips all already-delivered rows at this timestamp. - state.cursor_row_count = - state.cursor_row_count.saturating_add(rows_at_max_cursor); - } - None => { - // No rows delivered. If we skipped some rows it means - // every row in the result was at the current cursor - // timestamp and had already been seen. `skipped` is - // therefore the true row count at that timestamp for - // this query result, so we correct cursor_row_count to - // that value. This prevents a permanently-inflated - // counter (e.g. after rows are deleted or compacted in - // InfluxDB) from causing the skip logic to over-skip on - // every subsequent poll and stall the connector. 
- if skipped > 0 { - state.cursor_row_count = skipped; + let client = self.get_client()?; + let auth = self + .auth_header + .as_ref() + .map(|s| s.expose_secret().as_str()) + .ok_or_else(|| { + Error::Connection("auth_header not initialised — was open() called?".to_string()) + })?; + match &self.version_state { + VersionState::V2(state_mu) => { + let InfluxDbSourceConfig::V2(cfg) = &self.config else { + return Err(Error::InvalidState); + }; + + let state_snap = state_mu.lock().await.clone(); + match v2::poll( + client, + cfg, + auth, + &state_snap, + self.payload_format, + self.config.include_metadata(), + ) + .await + { + Ok(result) => { + self.circuit_breaker.record_success(); + let mut state = state_mu.lock().await; + state.processed_rows += result.messages.len() as u64; + apply_v2_cursor_advance( + &mut state, + result.max_cursor, + result.rows_at_max_cursor, + result.skipped, + ); + + if self.config.verbose_logging() { + info!( + "{CONNECTOR_NAME} ID: {} produced {} messages (V2). \ + Total: {}. Cursor: {:?}", + self.id, + result.messages.len(), + state.processed_rows, + state.last_timestamp + ); + } else { + debug!( + "{CONNECTOR_NAME} ID: {} produced {} messages (V2). \ + Total: {}. Cursor: {:?}", + self.id, + result.messages.len(), + state.processed_rows, + state.last_timestamp + ); } - } - } - if self.verbose { - info!( - "InfluxDB source ID: {} produced {} messages. \ - Total processed: {}. Cursor: {:?}", - self.id, - messages.len(), - state.processed_rows, - state.last_timestamp - ); - } else { - debug!( - "InfluxDB source ID: {} produced {} messages. \ - Total processed: {}. 
Cursor: {:?}", - self.id, - messages.len(), - state.processed_rows, - state.last_timestamp - ); + let persisted = ConnectorState::serialize( + &PersistedState::V2(V2State { + last_timestamp: state.last_timestamp.clone(), + processed_rows: state.processed_rows, + cursor_row_count: state.cursor_row_count, + }), + CONNECTOR_NAME, + self.id, + ); + + Ok(ProducedMessages { + schema: result.schema, + messages: result.messages, + state: persisted, + }) + } + Err(e) => self.handle_poll_error(e).await, } + } - let schema = if self.config.payload_column.is_some() { - self.payload_format().schema() - } else { - Schema::Json + VersionState::V3(state_mu) => { + let InfluxDbSourceConfig::V3(cfg) = &self.config else { + return Err(Error::InvalidState); }; - let persisted_state = self.serialize_state(&state); + let state_snap = state_mu.lock().await.clone(); + match v3::poll( + client, + cfg, + auth, + &state_snap, + self.payload_format, + self.config.include_metadata(), + ) + .await + { + Ok(result) => { + if result.trip_circuit_breaker { + self.circuit_breaker.record_failure().await; + } else { + self.circuit_breaker.record_success(); + } - Ok(ProducedMessages { - schema, - messages, - state: persisted_state, - }) - } - Err(e) => { - // Only count transient/connectivity failures toward the - // circuit breaker. PermanentHttpError (400, 401, etc.) are - // config/data issues that retrying will not fix. - if !matches!(e, Error::PermanentHttpError(_)) { - self.circuit_breaker.record_failure().await; + let new = result.new_state; + let msg_count = result.messages.len(); + let mut state = state_mu.lock().await; + *state = new; + + if self.config.verbose_logging() { + info!( + "{CONNECTOR_NAME} ID: {} produced {} messages (V3). \ + Total: {}. Cursor: {:?}", + self.id, msg_count, state.processed_rows, state.last_timestamp + ); + } else { + debug!( + "{CONNECTOR_NAME} ID: {} produced {} messages (V3). \ + Total: {}. 
Cursor: {:?}", + self.id, msg_count, state.processed_rows, state.last_timestamp + ); + } + + let persisted = ConnectorState::serialize( + &PersistedState::V3(V3State { + last_timestamp: state.last_timestamp.clone(), + processed_rows: state.processed_rows, + effective_batch_size: state.effective_batch_size, + last_timestamp_row_offset: state.last_timestamp_row_offset, + }), + CONNECTOR_NAME, + self.id, + ); + + Ok(ProducedMessages { + schema: result.schema, + messages: result.messages, + state: persisted, + }) + } + Err(e) => self.handle_poll_error(e).await, } - error!( - "InfluxDB source ID: {} poll failed: {e}. \ - Consecutive failures tracked by circuit breaker.", - self.id - ); - tokio::time::sleep(self.poll_interval).await; - Err(e) } } } async fn close(&mut self) -> Result<(), Error> { self.client = None; - let state = self.state.lock().await; + let processed = match &self.version_state { + VersionState::V2(mu) => mu.lock().await.processed_rows, + VersionState::V3(mu) => mu.lock().await.processed_rows, + }; info!( - "InfluxDB source connector ID: {} closed. Total rows processed: {}", - self.id, state.processed_rows + "{CONNECTOR_NAME} ID: {} closed. Total rows processed: {processed}", + self.id ); Ok(()) } } -// --------------------------------------------------------------------------- -// Unit tests -// --------------------------------------------------------------------------- +impl InfluxDbSource { + async fn handle_poll_error(&self, e: Error) -> Result { + if !matches!(e, Error::PermanentHttpError(_)) { + self.circuit_breaker.record_failure().await; + } + error!("{CONNECTOR_NAME} ID: {} poll failed: {e}", self.id); + tokio::time::sleep(self.poll_interval).await; + Err(e) + } +} + +// ── Sort heuristic ──────────────────────────────────────────────────────────── + +/// Return `true` if `query` contains a `sort(` call that is not part of a longer +/// identifier (e.g. `mysort(` is excluded; `|> sort(` and bare `sort(` are included). 
+/// Best-effort check: warns if `sort(` does not appear outside +/// line comments. Not a full parser; documents its limitations. +fn query_has_sort_call(query: &str) -> bool { + query.lines().any(|line| { + // Strip line comments before checking for sort( + let code = match line.find("//") { + Some(pos) => &line[..pos], + None => line, + }; + // Find all occurrences of "sort(" and verify none are preceded by a + // word character (letter, digit, underscore) — that would mean the + // "sort" is part of a longer identifier like "mysort" or "do_sort". + let mut search = code; + while let Some(pos) = search.find("sort(") { + let preceded_by_word_char = search[..pos] + .chars() + .last() + .is_some_and(|c| c.is_alphanumeric() || c == '_'); + if !preceded_by_word_char { + return true; + } + search = &search[pos + 5..]; // skip past "sort(" + } + false + }) +} + +// ── V2 cursor advance logic ─────────────────────────────────────────────────── + +/// Update V2 polling state after a successful poll. +/// +/// V2 uses `>= $cursor` semantics, so the first batch after a cursor advance +/// will include rows already delivered at the previous max timestamp. The +/// `cursor_row_count` tracks how many such rows to skip on the next poll. +/// +/// - New cursor → store it with the count of rows that landed at that timestamp. +/// - Same cursor → accumulate: more rows at this timestamp were delivered. +/// - No new cursor (all skipped) → correct `cursor_row_count` to `skipped` +/// so the skip counter reflects reality rather than a stale inflated value. 
+fn apply_v2_cursor_advance( + state: &mut V2State, + max_cursor: Option, + rows_at_max_cursor: u64, + skipped: u64, +) { + if let Some(ref new_cursor) = max_cursor { + let should_advance = match state.last_timestamp.as_deref() { + None => true, + Some(old) => match old.parse::>() { + Ok(dt) => is_timestamp_after(new_cursor, dt), + Err(e) => { + error!( + "V2 source: persisted cursor {old:?} failed RFC 3339 parse ({e}); \ + cannot advance cursor — connector state may be corrupt. \ + Clear or migrate the connector state to recover." + ); + false + } + }, + }; + if should_advance { + state.last_timestamp = Some(new_cursor.clone()); + state.cursor_row_count = rows_at_max_cursor; + } else { + // Cursor stayed at same timestamp — accumulate new rows for the offset tiebreaker. + state.cursor_row_count += rows_at_max_cursor; + } + } else if skipped > 0 { + // max_cursor is None (all rows were at or before the current cursor and were + // skipped). Reset the counter to `skipped` to correct an over-inflated offset. 
+ state.cursor_row_count = skipped; + } +} #[cfg(test)] mod tests { use super::*; - use std::collections::HashMap; + use common::{V2SourceConfig, V3SourceConfig}; + use secrecy::SecretString; - fn make_config() -> InfluxDbSourceConfig { - InfluxDbSourceConfig { + fn make_v2_config() -> InfluxDbSourceConfig { + InfluxDbSourceConfig::V2(V2SourceConfig { url: "http://localhost:8086".to_string(), org: "test_org".to_string(), token: SecretString::from("test_token"), @@ -884,516 +613,517 @@ mod tests { retry_max_delay: Some("1s".to_string()), circuit_breaker_threshold: Some(5), circuit_breaker_cool_down: Some("30s".to_string()), - } - } - - fn make_source() -> InfluxDbSource { - InfluxDbSource::new(1, make_config(), None) - } - - // ── validate_cursor ────────────────────────────────────────────────── - - #[test] - fn validate_cursor_accepts_valid_rfc3339() { - assert!(InfluxDbSource::validate_cursor("2024-01-15T10:30:00Z").is_ok()); - assert!(InfluxDbSource::validate_cursor("2024-01-15T10:30:00.123456789Z").is_ok()); - assert!(InfluxDbSource::validate_cursor("2024-01-15T10:30:00+05:30").is_ok()); - assert!(InfluxDbSource::validate_cursor("1970-01-01T00:00:00Z").is_ok()); - } - - #[test] - fn validate_cursor_rejects_flux_injection_characters() { - // pipe, quote, parenthesis, space, slash are Flux syntax characters - assert!(InfluxDbSource::validate_cursor(r#"") |> drop() //"#).is_err()); - assert!(InfluxDbSource::validate_cursor("2024-01-15 10:30:00Z").is_err()); - assert!(InfluxDbSource::validate_cursor("2024/01/15T10:30:00Z").is_err()); - assert!(InfluxDbSource::validate_cursor("not-a-timestamp").is_err()); - } - - #[test] - fn validate_cursor_rejects_empty_string() { - assert!(InfluxDbSource::validate_cursor("").is_err()); - } - - #[test] - fn validate_cursor_rejects_date_only() { - // Missing time component - assert!(InfluxDbSource::validate_cursor("2024-01-15").is_err()); - } - - // ── validate_cursor_field ──────────────────────────────────────────── - - #[test] - 
fn validate_cursor_field_accepts_time_columns() { - assert!(InfluxDbSource::validate_cursor_field("_time").is_ok()); - assert!(InfluxDbSource::validate_cursor_field("time").is_ok()); + }) } - #[test] - fn validate_cursor_field_rejects_non_timestamp_columns() { - assert!(InfluxDbSource::validate_cursor_field("_value").is_err()); - assert!(InfluxDbSource::validate_cursor_field("sensor_id").is_err()); - assert!(InfluxDbSource::validate_cursor_field("temperature").is_err()); - assert!(InfluxDbSource::validate_cursor_field("").is_err()); + fn make_v3_config() -> InfluxDbSourceConfig { + InfluxDbSourceConfig::V3(V3SourceConfig { + url: "http://localhost:8181".to_string(), + db: "test_db".to_string(), + token: SecretString::from("test_token"), + query: "SELECT time, val FROM tbl WHERE time > '$cursor' ORDER BY time LIMIT $limit OFFSET $offset" + .to_string(), + poll_interval: Some("1s".to_string()), + batch_size: Some(100), + cursor_field: None, + initial_offset: None, + payload_column: None, + payload_format: None, + include_metadata: None, + verbose_logging: None, + max_retries: Some(3), + retry_delay: Some("100ms".to_string()), + timeout: Some("5s".to_string()), + max_open_retries: Some(3), + open_retry_max_delay: Some("5s".to_string()), + retry_max_delay: Some("1s".to_string()), + circuit_breaker_threshold: Some(5), + circuit_breaker_cool_down: Some("30s".to_string()), + stuck_batch_cap_factor: Some(10), + }) } - // ── parse_scalar ───────────────────────────────────────────────────── - #[test] - fn parse_scalar_empty_is_null() { - assert_eq!(parse_scalar(""), serde_json::Value::Null); + fn v2_source_new_creates_v2_state() { + let source = InfluxDbSource::new(1, make_v2_config(), None); + assert!(matches!(source.version_state, VersionState::V2(_))); + assert!(!source.state_restore_failed); } #[test] - fn parse_scalar_booleans() { - assert_eq!(parse_scalar("true"), serde_json::Value::Bool(true)); - assert_eq!(parse_scalar("false"), serde_json::Value::Bool(false)); + fn 
v3_source_new_creates_v3_state() { + let source = InfluxDbSource::new(1, make_v3_config(), None); + assert!(matches!(source.version_state, VersionState::V3(_))); + assert!(!source.state_restore_failed); } - #[test] - fn parse_scalar_integers() { - assert_eq!(parse_scalar("42"), serde_json::Value::Number(42.into())); - assert_eq!( - parse_scalar("-7"), - serde_json::Value::Number((-7i64).into()) + #[tokio::test] + async fn state_restore_fails_on_version_mismatch() { + // Persist a V2 state, then try to open a V3 connector + let v2_state = PersistedState::V2(V2State { + last_timestamp: Some("2024-01-01T00:00:00Z".to_string()), + processed_rows: 42, + cursor_row_count: 0, + }); + let persisted = ConnectorState::serialize(&v2_state, CONNECTOR_NAME, 1).unwrap(); + let source = InfluxDbSource::new(1, make_v3_config(), Some(persisted)); + assert!( + source.state_restore_failed, + "V3 connector must refuse V2 persisted state" ); } - #[test] - fn parse_scalar_floats() { - match parse_scalar("1.5") { - serde_json::Value::Number(n) => { - let v = n.as_f64().unwrap(); - assert!((v - 1.5).abs() < 1e-10); - } - other => panic!("expected Number, got {other:?}"), - } - } - - #[test] - fn parse_scalar_strings() { - assert_eq!( - parse_scalar("hello"), - serde_json::Value::String("hello".to_string()) - ); - // "True" is not a bool (case-sensitive) - assert_eq!( - parse_scalar("True"), - serde_json::Value::String("True".to_string()) + #[tokio::test] + async fn open_returns_invalid_state_when_restore_failed() { + let garbage = ConnectorState(vec![0xFF, 0xFE, 0xFD]); + let mut source = InfluxDbSource::new(1, make_v2_config(), Some(garbage)); + assert!(source.state_restore_failed); + let result = source.open().await; + assert!( + matches!(result, Err(Error::InvalidState)), + "open() must fail fast on restore failure" ); } - // ── is_timestamp_after ─────────────────────────────────────────────── - - #[test] - fn is_timestamp_after_compares_chronologically_not_lexicographically() { - // 
"2026-03-18T12:00:00.60952Z" = 609520µs (chronologically earlier) - // "2026-03-18T12:00:00.609521Z" = 609521µs (chronologically later) - // A naive string compare would say the first is > second (Z > 1). - let earlier = "2026-03-18T12:00:00.60952Z"; - let later = "2026-03-18T12:00:00.609521Z"; + #[tokio::test] + async fn open_rejects_invalid_initial_offset() { + // Validates initial_offset before attempting any network connection. + let config = InfluxDbSourceConfig::V2(V2SourceConfig { + url: "http://localhost:18086".to_string(), + initial_offset: Some("not-a-timestamp".to_string()), + org: "o".to_string(), + token: SecretString::from("t"), + query: "SELECT 1".to_string(), + poll_interval: None, + batch_size: None, + cursor_field: None, + payload_column: None, + payload_format: None, + include_metadata: None, + verbose_logging: None, + max_retries: Some(1), + retry_delay: None, + timeout: Some("1s".to_string()), + max_open_retries: Some(1), + open_retry_max_delay: Some("1ms".to_string()), + retry_max_delay: None, + circuit_breaker_threshold: None, + circuit_breaker_cool_down: None, + }); + let mut source = InfluxDbSource::new(1, config, None); + let err = source.open().await.unwrap_err(); assert!( - is_timestamp_after(later, earlier), - "later timestamp should be after earlier" + matches!(err, Error::InvalidConfigValue(_)), + "expected InvalidConfigValue for bad initial_offset, got {err:?}" ); + } + + #[tokio::test] + async fn open_rejects_timezone_free_initial_offset() { + let config = InfluxDbSourceConfig::V2(V2SourceConfig { + url: "http://localhost:18086".to_string(), + initial_offset: Some("2024-01-15T10:30:00".to_string()), + org: "o".to_string(), + token: SecretString::from("t"), + query: "SELECT 1".to_string(), + poll_interval: None, + batch_size: None, + cursor_field: None, + payload_column: None, + payload_format: None, + include_metadata: None, + verbose_logging: None, + max_retries: Some(1), + retry_delay: None, + timeout: Some("1s".to_string()), + 
max_open_retries: Some(1), + open_retry_max_delay: Some("1ms".to_string()), + retry_max_delay: None, + circuit_breaker_threshold: None, + circuit_breaker_cool_down: None, + }); + let mut source = InfluxDbSource::new(1, config, None); + let err = source.open().await.unwrap_err(); assert!( - !is_timestamp_after(earlier, later), - "earlier should not be after later" + matches!(err, Error::InvalidConfigValue(_)), + "initial_offset without timezone must be rejected" ); } - #[test] - fn is_timestamp_after_equal_timestamps() { - let ts = "2024-01-15T10:30:00Z"; - assert!(!is_timestamp_after(ts, ts)); - } + #[tokio::test] + async fn poll_returns_empty_when_circuit_is_open() { + let config = match make_v2_config() { + InfluxDbSourceConfig::V2(mut c) => { + c.circuit_breaker_threshold = Some(1); + c.circuit_breaker_cool_down = Some("60s".to_string()); + c.poll_interval = Some("1ms".to_string()); + InfluxDbSourceConfig::V2(c) + } + other => other, + }; + let source = InfluxDbSource::new(1, config, None); + source.circuit_breaker.record_failure().await; + assert!(source.circuit_breaker.is_open().await); - // ── parse_csv_rows ─────────────────────────────────────────────────── + let result = source.poll().await; + assert!(result.is_ok()); + assert!(result.unwrap().messages.is_empty()); + } - #[test] - fn parse_csv_rows_empty_string_returns_empty() { - let source = make_source(); - let rows = source.parse_csv_rows("").unwrap(); - assert!(rows.is_empty()); + #[tokio::test] + async fn close_clears_client() { + let mut source = InfluxDbSource::new(1, make_v2_config(), None); + let result = source.close().await; + assert!(result.is_ok()); + assert!(source.client.is_none()); } #[test] - fn parse_csv_rows_skips_annotation_rows() { - let source = make_source(); - // Annotation rows must have the same field count as data rows for the CSV - // reader to accept them. InfluxDB always emits uniformly-wide rows. 
- let csv = "#group,false\n#datatype,string\n_time,_value\n2024-01-01T00:00:00Z,42\n"; - let rows = source.parse_csv_rows(csv).unwrap(); - assert_eq!(rows.len(), 1); - assert_eq!(rows[0].get("_value").map(String::as_str), Some("42")); + fn apply_v2_cursor_advance_moves_cursor() { + let mut state = V2State::default(); + apply_v2_cursor_advance(&mut state, Some("2024-01-01T00:00:01Z".to_string()), 3, 0); + assert_eq!( + state.last_timestamp.as_deref(), + Some("2024-01-01T00:00:01Z") + ); + assert_eq!(state.cursor_row_count, 3); } #[test] - fn parse_csv_rows_skips_blank_lines() { - let source = make_source(); - // Two data records separated by a blank line (multi-table CSV format) - let csv = "_time,_value\n2024-01-01T00:00:00Z,1\n\n_time,_value\n2024-01-01T00:00:01Z,2\n"; - let rows = source.parse_csv_rows(csv).unwrap(); - // Both data rows should be parsed (second header line is skipped) - assert_eq!(rows.len(), 2, "expected 2 data rows, got {}", rows.len()); + fn apply_v2_cursor_advance_accumulates_same_cursor() { + let mut state = V2State { + last_timestamp: Some("2024-01-01T00:00:00Z".to_string()), + cursor_row_count: 3, + processed_rows: 0, + }; + apply_v2_cursor_advance(&mut state, Some("2024-01-01T00:00:00Z".to_string()), 2, 0); + assert_eq!(state.cursor_row_count, 5); } #[test] - fn parse_csv_rows_skips_repeated_header_rows() { - let source = make_source(); - // Same header appears twice (InfluxDB multi-table result format) - let csv = "_time,_value\n2024-01-01T00:00:00Z,10\n_time,_value\n2024-01-01T00:00:01Z,20\n"; - let rows = source.parse_csv_rows(csv).unwrap(); - assert_eq!(rows.len(), 2); + fn apply_v2_cursor_advance_corrects_inflated_counter() { + let mut state = V2State { + last_timestamp: Some("2024-01-01T00:00:00Z".to_string()), + cursor_row_count: 10, + processed_rows: 0, + }; + // None + skipped=3 → correction + apply_v2_cursor_advance(&mut state, None, 0, 3); + assert_eq!(state.cursor_row_count, 3); } - #[test] - fn 
parse_csv_rows_handles_empty_value_columns() { - let source = make_source(); - // Data row with an empty field value (column present but blank). - // The CSV reader requires uniform field counts, so we keep all 3 columns. - let csv = "_time,_value,_measurement\n2024-01-01T00:00:00Z,42,\n"; - let rows = source.parse_csv_rows(csv).unwrap(); - assert_eq!(rows.len(), 1); - // _measurement is present but empty - assert_eq!( - rows[0].get("_measurement").map(String::as_str), - Some(""), - "empty column value should be stored as empty string" + #[tokio::test] + async fn open_v3_rejects_query_without_offset_when_stuck_cap_active() { + // Default stuck_batch_cap_factor is 10 (> 0), so any V3 query without + // '$offset' must be rejected at open() to prevent duplicate delivery. + let config = InfluxDbSourceConfig::V3(V3SourceConfig { + url: "http://localhost:18181".to_string(), + db: "db".to_string(), + token: SecretString::from("t"), + // deliberately missing $offset + query: "SELECT * FROM t WHERE time > '$cursor' LIMIT $limit".to_string(), + poll_interval: None, + batch_size: None, + cursor_field: None, + initial_offset: None, + payload_column: None, + payload_format: None, + include_metadata: None, + verbose_logging: None, + max_retries: Some(1), + retry_delay: None, + timeout: Some("1s".to_string()), + max_open_retries: Some(1), + open_retry_max_delay: Some("1ms".to_string()), + retry_max_delay: None, + circuit_breaker_threshold: None, + circuit_breaker_cool_down: None, + stuck_batch_cap_factor: None, // uses default (10 > 0) + }); + let mut source = InfluxDbSource::new(1, config, None); + let err = source.open().await.unwrap_err(); + assert!( + matches!(err, Error::InvalidConfigValue(_)), + "expected InvalidConfigValue when $offset missing in V3 query, got {err:?}" ); } - // ── build_payload ──────────────────────────────────────────────────── - - #[test] - fn build_payload_missing_column_returns_error() { - let mut config = make_config(); - config.payload_column = 
Some("data".to_string()); - let source = InfluxDbSource::new(1, config, None); - - let row: HashMap = - [("_time".to_string(), "2024-01-01T00:00:00Z".to_string())] - .into_iter() - .collect(); - - let result = source.build_payload(&row, true); - assert!(result.is_err()); - let err = result.unwrap_err().to_string(); + #[tokio::test] + async fn open_v3_accepts_query_with_offset_placeholder() { + // A query with $offset (and a URL that fails health check) must NOT be + // rejected for the offset reason — it must proceed to the connectivity check. + let config = InfluxDbSourceConfig::V3(V3SourceConfig { + url: "http://localhost:18181".to_string(), + db: "db".to_string(), + token: SecretString::from("t"), + query: "SELECT * FROM t WHERE time > '$cursor' LIMIT $limit OFFSET $offset".to_string(), + poll_interval: None, + batch_size: None, + cursor_field: None, + initial_offset: None, + payload_column: None, + payload_format: None, + include_metadata: None, + verbose_logging: None, + max_retries: Some(1), + retry_delay: None, + timeout: Some("1s".to_string()), + max_open_retries: Some(1), + open_retry_max_delay: Some("1ms".to_string()), + retry_max_delay: None, + circuit_breaker_threshold: None, + circuit_breaker_cool_down: None, + stuck_batch_cap_factor: None, + }); + let mut source = InfluxDbSource::new(1, config, None); + let err = source.open().await.unwrap_err(); + // Must NOT be InvalidConfigValue for the offset reason; connectivity fails instead. 
assert!( - err.contains("data") || err.contains("Missing"), - "error should mention missing column: {err}" + !matches!(err, Error::InvalidConfigValue(ref msg) if msg.contains("$offset")), + "open() must not reject a query that contains $offset; got {err:?}" ); } - #[test] - fn build_payload_invalid_base64_returns_error() { - let mut config = make_config(); - config.payload_column = Some("data".to_string()); - config.payload_format = Some("raw".to_string()); // raw = base64 decode - let source = InfluxDbSource::new(1, config, None); - - let row: HashMap = - [("data".to_string(), "not-valid-base64!!!".to_string())] - .into_iter() - .collect(); - - let result = source.build_payload(&row, true); - assert!(result.is_err()); - let err = result.unwrap_err().to_string(); + #[tokio::test] + async fn open_v3_with_zero_stuck_cap_skips_offset_check() { + // stuck_batch_cap_factor = 0 disables the stuck-cap feature; $offset is + // not required because no inflation will ever happen. + let config = InfluxDbSourceConfig::V3(V3SourceConfig { + url: "http://localhost:18181".to_string(), + db: "db".to_string(), + token: SecretString::from("t"), + query: "SELECT * FROM t WHERE time > '$cursor' LIMIT $limit".to_string(), + poll_interval: None, + batch_size: None, + cursor_field: None, + initial_offset: None, + payload_column: None, + payload_format: None, + include_metadata: None, + verbose_logging: None, + max_retries: Some(1), + retry_delay: None, + timeout: Some("1s".to_string()), + max_open_retries: Some(1), + open_retry_max_delay: Some("1ms".to_string()), + retry_max_delay: None, + circuit_breaker_threshold: None, + circuit_breaker_cool_down: None, + stuck_batch_cap_factor: Some(0), // explicitly disabled + }); + let mut source = InfluxDbSource::new(1, config, None); + let err = source.open().await.unwrap_err(); assert!( - err.contains("base64") || err.contains("decode"), - "error should mention base64: {err}" + !matches!(err, Error::InvalidConfigValue(ref msg) if 
msg.contains("$offset")), + "open() must not check $offset when stuck_batch_cap_factor=0; got {err:?}" ); } - #[test] - fn build_payload_invalid_json_returns_error() { - let mut config = make_config(); - config.payload_column = Some("data".to_string()); - config.payload_format = Some("json".to_string()); - let source = InfluxDbSource::new(1, config, None); - - let row: HashMap = [("data".to_string(), "{{not valid json}}".to_string())] - .into_iter() - .collect(); - - let result = source.build_payload(&row, true); - assert!(result.is_err()); - let err = result.unwrap_err().to_string(); + #[tokio::test] + async fn open_rejects_stuck_batch_cap_factor_above_max() { + let config = InfluxDbSourceConfig::V3(V3SourceConfig { + url: "http://localhost:18181".to_string(), + db: "db".to_string(), + token: SecretString::from("t"), + query: "SELECT 1".to_string(), + poll_interval: None, + batch_size: None, + cursor_field: None, + initial_offset: None, + payload_column: None, + payload_format: None, + include_metadata: None, + verbose_logging: None, + max_retries: Some(1), + retry_delay: None, + timeout: Some("1s".to_string()), + max_open_retries: Some(1), + open_retry_max_delay: Some("1ms".to_string()), + retry_max_delay: None, + circuit_breaker_threshold: None, + circuit_breaker_cool_down: None, + stuck_batch_cap_factor: Some(v3::MAX_STUCK_CAP_FACTOR + 1), + }); + let mut source = InfluxDbSource::new(1, config, None); + let err = source.open().await.unwrap_err(); assert!( - err.contains("JSON") || err.contains("json"), - "error should mention JSON: {err}" + matches!(err, Error::InvalidConfigValue(_)), + "expected InvalidConfigValue for oversized stuck_batch_cap_factor, got {err:?}" ); } #[test] - fn build_payload_valid_base64_decodes_correctly() { - let mut config = make_config(); - config.payload_column = Some("data".to_string()); - config.payload_format = Some("raw".to_string()); - let source = InfluxDbSource::new(1, config, None); - - // base64("hello") = "aGVsbG8=" - let 
row: HashMap = [("data".to_string(), "aGVsbG8=".to_string())] - .into_iter() - .collect(); - - let result = source.build_payload(&row, true).unwrap(); - assert_eq!(result, b"hello"); + fn config_accessors_v2() { + let cfg = make_v2_config(); + assert_eq!(cfg.version_label(), "v2"); + assert_eq!(cfg.cursor_field(), "_time"); + assert_eq!(cfg.batch_size(), 100); } #[test] - fn build_payload_text_column_returns_bytes() { - let mut config = make_config(); - config.payload_column = Some("data".to_string()); - config.payload_format = Some("text".to_string()); - let source = InfluxDbSource::new(1, config, None); + fn config_accessors_v3() { + let cfg = make_v3_config(); + assert_eq!(cfg.version_label(), "v3"); + assert_eq!(cfg.cursor_field(), "time"); + assert_eq!(cfg.batch_size(), 100); + } - let row: HashMap = [("data".to_string(), "hello world".to_string())] - .into_iter() - .collect(); + // ── query_has_sort_call heuristic ───────────────────────────────────────── - let result = source.build_payload(&row, true).unwrap(); - assert_eq!(result, b"hello world"); + #[test] + fn sort_call_detected_in_flux_pipeline() { + assert!(query_has_sort_call( + r#"from(bucket:"b") |> range(start: -1h) |> sort(columns: ["_time"])"# + )); } #[test] - fn build_payload_whole_row_wraps_measurement_and_value() { - let source = make_source(); // no payload_column - let row: HashMap = [ - ("_measurement".to_string(), "temperature".to_string()), - ("_field".to_string(), "v".to_string()), - ("_time".to_string(), "2024-01-01T00:00:00Z".to_string()), - ("_value".to_string(), "21.5".to_string()), - ] - .into_iter() - .collect(); - - let bytes = source.build_payload(&row, true).unwrap(); - let parsed: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); - assert_eq!(parsed["measurement"], "temperature"); - assert_eq!(parsed["timestamp"], "2024-01-01T00:00:00Z"); - // _value "21.5" → parsed as f64 - assert!(parsed["value"].is_number()); + fn sort_call_detected_without_pipe() { + 
assert!(query_has_sort_call("sort(columns: [\"_time\"])")); } #[test] - fn build_payload_include_metadata_false_filters_fields() { - let source = make_source(); - let row: HashMap = [ - ("_measurement".to_string(), "temp".to_string()), - ("_field".to_string(), "v".to_string()), - ("_time".to_string(), "2024-01-01T00:00:00Z".to_string()), - ("_value".to_string(), "42".to_string()), - ("host".to_string(), "server1".to_string()), // extra annotation column - ] - .into_iter() - .collect(); - - let bytes = source.build_payload(&row, false).unwrap(); - let parsed: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); - // With include_metadata=false, only _value/_time/_measurement go into row - let row_obj = parsed["row"].as_object().unwrap(); - // "host" is an annotation column — should be excluded - assert!( - !row_obj.contains_key("host"), - "annotation columns should be excluded when include_metadata=false" - ); - // Core columns should still be present - assert!(row_obj.contains_key("_value") || row_obj.contains_key("_time")); + fn sort_call_not_detected_when_absent() { + assert!(!query_has_sort_call( + r#"from(bucket:"b") |> range(start: $cursor) |> limit(n: $limit)"# + )); } - // ── circuit breaker integration ────────────────────────────────────── - - #[tokio::test] - async fn poll_returns_empty_when_circuit_is_open() { - let mut config = make_config(); - config.circuit_breaker_threshold = Some(1); - config.circuit_breaker_cool_down = Some("60s".to_string()); - // Use a short poll_interval so the circuit-open sleep does not stall - // the test suite for a full second. 
- config.poll_interval = Some("1ms".to_string()); - let source = InfluxDbSource::new(1, config, None); - - // Force the circuit open - source.circuit_breaker.record_failure().await; - assert!(source.circuit_breaker.is_open().await); - - let result = source.poll().await; - assert!(result.is_ok(), "poll should return Ok when circuit is open"); - let produced = result.unwrap(); - assert!( - produced.messages.is_empty(), - "no messages should be produced when circuit is open" - ); + #[test] + fn sort_call_not_false_positive_on_identifier_prefix() { + // `mysort(` must NOT be treated as a sort call — it is a different function name. + assert!(!query_has_sort_call("mysort(columns: [\"_time\"])")); + assert!(!query_has_sort_call("do_sort(x)")); } - // ── query_with_params — limit inflation ────────────────────────────── - #[test] - fn query_with_params_inflates_limit_by_already_seen() { - let mut config = make_config(); - config.batch_size = Some(10); - config.query = - "from(bucket:\"b\") |> range(start: $cursor) |> limit(n: $limit)".to_string(); - let source = InfluxDbSource::new(1, config, None); - - // With already_seen=5, limit should be 10+5=15 - let q = source.query_with_params("2024-01-01T00:00:00Z", 5).unwrap(); - assert!(q.contains("limit(n: 15)"), "inflated limit not found: {q}"); + fn sort_call_detected_at_start_of_string() { + assert!(query_has_sort_call( + "sort(columns: [\"_time\"]) |> limit(n: 10)" + )); } #[test] - fn query_with_params_no_inflation_when_already_seen_is_zero() { - let mut config = make_config(); - config.batch_size = Some(100); - config.query = - "from(bucket:\"b\") |> range(start: $cursor) |> limit(n: $limit)".to_string(); - let source = InfluxDbSource::new(1, config, None); - - let q = source.query_with_params("2024-01-01T00:00:00Z", 0).unwrap(); - assert!( - q.contains("limit(n: 100)"), - "limit should be batch_size: {q}" - ); + fn sort_call_not_detected_with_space_before_paren() { + // `sort (` with a space is not valid Flux syntax; 
the heuristic searches + // for the literal token `sort(` and does not match this form. The warning + // is therefore not emitted, which is acceptable: a query written this way + // would fail at the InfluxDB level for a different reason. + assert!(!query_has_sort_call("sort (columns: [\"_time\"])")); } - // ── close() ────────────────────────────────────────────────────────── - - #[tokio::test] - async fn close_drops_client() { - let mut source = make_source(); - let result = source.close().await; - assert!(result.is_ok()); - assert!(source.client.is_none(), "client should be None after close"); + #[test] + fn sort_call_in_line_comment_is_ignored() { + // sort( appearing only in a // comment must NOT trigger the heuristic. + assert!(!query_has_sort_call( + "from(bucket:\"b\") // sort(columns:[\"_time\"]) not real" + )); + // But sort( before the comment on the same line is still detected. + assert!(query_has_sort_call( + "sort(columns:[\"_time\"]) // also present" + )); } - // ── cursor_row_count correction (fix: inflated counter reset) ──────── - - #[tokio::test] - async fn cursor_row_count_corrected_when_inflated_above_actual_row_count_at_cursor() { - use std::io::{Read, Write}; - use std::net::TcpListener as StdTcpListener; - - // The server returns 3 rows, all at the cursor timestamp. - // The source starts with cursor_row_count = 5 (inflated – e.g. rows - // deleted from InfluxDB after delivery). All 3 returned rows will be - // skipped (skipped=3 < already_seen=5). The None branch must correct - // cursor_row_count to 3 instead of leaving it at the inflated 5. 
- let t = "2024-01-01T00:00:00Z"; - let csv = format!("_time,_value\n{t},1\n{t},2\n{t},3\n"); - let http_response = format!( - "HTTP/1.1 200 OK\r\nContent-Type: text/csv\r\n\ - Content-Length: {}\r\nConnection: close\r\n\r\n{}", - csv.len(), - csv - ); - - let listener = StdTcpListener::bind("127.0.0.1:0").unwrap(); - let port = listener.local_addr().unwrap().port(); - std::thread::spawn(move || { - if let Ok((mut stream, _)) = listener.accept() { - let mut buf = [0u8; 8192]; - let _ = stream.read(&mut buf); - let _ = stream.write_all(http_response.as_bytes()); - } - }); - - let initial_state = State { - last_poll_time: Utc::now(), - last_timestamp: Some(t.to_string()), + #[test] + fn apply_v2_cursor_advance_no_new_cursor_no_skipped_is_noop() { + let mut state = V2State { + last_timestamp: Some("2024-01-01T00:00:00Z".to_string()), + cursor_row_count: 7, processed_rows: 0, - cursor_row_count: 5, // inflated: actual rows at T are only 3 }; - let persisted = ConnectorState::serialize(&initial_state, CONNECTOR_NAME, 1).unwrap(); - - let mut config = make_config(); - config.url = format!("http://127.0.0.1:{port}"); - config.batch_size = Some(10); - let mut source = InfluxDbSource::new(1, config, Some(persisted)); - - // Inject a real HTTP client directly, bypassing open()'s health check. - let raw = source.build_raw_client().unwrap(); - source.client = Some(build_retry_client( - raw, - 0, - Duration::from_millis(0), - Duration::from_millis(0), - "InfluxDB", - )); - + apply_v2_cursor_advance(&mut state, None, 0, 0); + // Neither max_cursor nor skipped — state must not change. 
+ assert_eq!(state.cursor_row_count, 7); assert_eq!( - source.state.lock().await.cursor_row_count, - 5, - "pre-condition: cursor_row_count starts at inflated value" + state.last_timestamp.as_deref(), + Some("2024-01-01T00:00:00Z") ); + } - // poll() → server returns 3 rows at T, all skipped (already_seen=5 > 3) - // → (messages=[], max_cursor=None, rows_at_max_cursor=0, skipped=3) - // → None branch corrects cursor_row_count to skipped (3). - let result = source.poll().await; - assert!(result.is_ok(), "poll should succeed: {:?}", result); - assert!( - result.unwrap().messages.is_empty(), - "all rows were already seen – no messages expected" - ); + // ── restore_v2_state / restore_v3_state paths ───────────────────────────── - assert_eq!( - source.state.lock().await.cursor_row_count, - 3, - "cursor_row_count must be corrected to actual row count (3), not left at inflated (5)" + #[test] + fn restore_v2_state_with_v2_persisted_restores_successfully() { + // V2 config + V2 persisted state → state is restored with state_restore_failed=false. + let v2_state = PersistedState::V2(V2State { + last_timestamp: Some("2024-06-01T00:00:00Z".to_string()), + processed_rows: 99, + cursor_row_count: 3, + }); + let persisted = ConnectorState::serialize(&v2_state, CONNECTOR_NAME, 1).unwrap(); + let source = InfluxDbSource::new(1, make_v2_config(), Some(persisted)); + assert!( + !source.state_restore_failed, + "V2 state on V2 connector must succeed" ); + if let VersionState::V2(mu) = &source.version_state { + let state = mu.blocking_lock(); + assert_eq!( + state.last_timestamp.as_deref(), + Some("2024-06-01T00:00:00Z") + ); + assert_eq!(state.processed_rows, 99); + } else { + panic!("expected V2 version state"); + } } - // ── state restore failure (fix: deserialization failure fails open) ── - - #[tokio::test] - async fn open_returns_invalid_state_when_persisted_state_cannot_be_deserialized() { - // Garbage bytes will cause ConnectorState::deserialize to fail. 
- // new() must set state_restore_failed=true, and open() must return - // Err(InvalidState) before attempting any network calls, so the - // operator sees a hard failure instead of a silent cursor reset. - let garbage = ConnectorState(vec![0xFF, 0xFE, 0xFD, 0xAA, 0xBB]); - let mut source = InfluxDbSource::new(1, make_config(), Some(garbage)); - - let result = source.open().await; + #[test] + fn restore_v2_state_with_v3_persisted_marks_restore_failed() { + // V2 config + V3 persisted state → mismatch must be rejected. + let v3_state = PersistedState::V3(V3State { + last_timestamp: Some("2024-06-01T00:00:00Z".to_string()), + processed_rows: 10, + effective_batch_size: 500, + last_timestamp_row_offset: 0, + }); + let persisted = ConnectorState::serialize(&v3_state, CONNECTOR_NAME, 1).unwrap(); + let source = InfluxDbSource::new(1, make_v2_config(), Some(persisted)); assert!( - matches!(result, Err(Error::InvalidState)), - "open() must return Err(InvalidState) on state deserialization failure, got: {:?}", - result + source.state_restore_failed, + "V3 state on V2 connector must set state_restore_failed" ); } #[test] - fn fresh_start_with_no_prior_state_does_not_set_restore_failed() { - // When no prior ConnectorState is supplied (first boot), state_restore_failed - // must be false so that open() is not blocked on a normal first run. - let source = InfluxDbSource::new(1, make_config(), None); + fn restore_v3_state_with_v3_persisted_restores_successfully() { + // V3 config + V3 persisted state → state is restored with state_restore_failed=false. 
+ let v3_state = PersistedState::V3(V3State { + last_timestamp: Some("2024-07-15T12:00:00Z".to_string()), + processed_rows: 500, + effective_batch_size: 1000, + last_timestamp_row_offset: 5, + }); + let persisted = ConnectorState::serialize(&v3_state, CONNECTOR_NAME, 1).unwrap(); + let source = InfluxDbSource::new(1, make_v3_config(), Some(persisted)); assert!( !source.state_restore_failed, - "state_restore_failed must be false when no prior state exists" + "V3 state on V3 connector must succeed" ); + if let VersionState::V3(mu) = &source.version_state { + let state = mu.blocking_lock(); + assert_eq!( + state.last_timestamp.as_deref(), + Some("2024-07-15T12:00:00Z") + ); + assert_eq!(state.processed_rows, 500); + assert_eq!(state.effective_batch_size, 1000); + } else { + panic!("expected V3 version state"); + } } - // ── payload_format ─────────────────────────────────────────────────── + // ── get_client() uncalled open() ───────────────────────────────────────── - #[test] - fn payload_format_aliases() { - assert_eq!( - PayloadFormat::from_config(Some("utf8")), - PayloadFormat::Text - ); - assert_eq!( - PayloadFormat::from_config(Some("base64")), - PayloadFormat::Raw + #[tokio::test] + async fn poll_returns_connection_error_when_client_not_initialized() { + // Calling poll() without open() means client is None; get_client() must + // return a Connection error rather than panicking. 
+ let source = InfluxDbSource::new(1, make_v2_config(), None); + let result = source.poll().await; + assert!( + matches!(result, Err(Error::Connection(_))), + "expected Connection error when client not initialized, got {result:?}" ); - assert_eq!(PayloadFormat::from_config(None), PayloadFormat::Json); - } - - #[test] - fn payload_format_schema_mapping() { - assert_eq!(PayloadFormat::Json.schema(), Schema::Json); - assert_eq!(PayloadFormat::Text.schema(), Schema::Text); - assert_eq!(PayloadFormat::Raw.schema(), Schema::Raw); } } diff --git a/core/connectors/sources/influxdb_source/src/row.rs b/core/connectors/sources/influxdb_source/src/row.rs new file mode 100644 index 0000000000..b1e7e541db --- /dev/null +++ b/core/connectors/sources/influxdb_source/src/row.rs @@ -0,0 +1,420 @@ +/* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +//! Query-response parsers for InfluxDB V2 (annotated CSV) and V3 (JSONL). +//! +//! Both parsers produce `Vec` — a list of field-name → string-value maps. +//! The cursor-tracking and payload-building logic in the source connector +//! operates on this common representation so it runs unchanged regardless of +//! which InfluxDB version is in use. 
+ +use csv::StringRecord; +use iggy_connector_sdk::Error; +use simd_json::BorrowedValue; +use std::collections::HashMap; + +/// A single row returned by a query, field name → typed JSON value. +/// +/// V2 (annotated CSV) stores all values as `Value::String` since CSV has no +/// type information; `parse_scalar` in `build_payload` converts them to typed +/// values when building the message payload. V3 (JSONL) stores typed values +/// directly — numbers, booleans, and nulls arrive pre-typed from SQL, so no +/// string-round-trip parse is needed. +pub(crate) type Row = HashMap; + +// ── InfluxDB V2 — annotated CSV ─────────────────────────────────────────────── + +/// Return `true` if `record` is a CSV header row. +/// +/// Checks for any of the standard InfluxDB temporal column names: +/// `_time`, `_start`, or `_stop`. Regular time-series queries include `_time`; +/// Flux window-aggregate queries (`count()`, `mean()`, `distinct()`) produce +/// result tables with `_start` and `_stop` but no `_time`. Requiring only +/// `_time` would cause those header rows to be missed, silently dropping all +/// subsequent data rows until the next recognised header. +/// +/// InfluxDB annotation rows (`#group`, `#datatype`, `#default`) are already +/// filtered out earlier in [`parse_csv_rows`] by the leading-`#` check, so +/// they will never reach this function. +fn is_header_record(record: &StringRecord) -> bool { + record + .iter() + .any(|v| v == "_time" || v == "_start" || v == "_stop") +} + +/// Parse an InfluxDB V2 annotated-CSV response body into a list of rows. +/// +/// - Annotation rows (first field starts with `#`) are skipped. +/// - Blank lines are skipped. +/// - The first non-annotation row containing `_time`, `_start`, or `_stop` becomes the header. +/// - Repeated identical header rows (multi-table result format) are skipped. +/// - Each subsequent data row is mapped `header[i] → row[i]`. 
+pub(crate) fn parse_csv_rows(csv_text: &str) -> Result, Error> { + let mut reader = csv::ReaderBuilder::new() + .has_headers(false) + .flexible(true) // multi-table results have variable column counts per table + .from_reader(csv_text.as_bytes()); + + let mut headers: Option = None; + let mut rows = Vec::new(); + + for result in reader.records() { + let record = + result.map_err(|e| Error::InvalidRecordValue(format!("Invalid CSV record: {e}")))?; + + if record.is_empty() { + continue; + } + + if let Some(first) = record.get(0) + && first.starts_with('#') + { + continue; + } + + if is_header_record(&record) { + headers = Some(record.clone()); + continue; + } + + let Some(active_headers) = headers.as_ref() else { + continue; + }; + + // Skip repeated header rows (multi-table result format) + if record == *active_headers { + continue; + } + + let mut mapped = Row::with_capacity(active_headers.len()); + for (idx, key) in active_headers.iter().enumerate() { + if key.is_empty() { + continue; + } + let value = record.get(idx).unwrap_or("").to_string(); + mapped.insert(key.to_string(), serde_json::Value::String(value)); + } + + if !mapped.is_empty() { + rows.push(mapped); + } + } + + Ok(rows) +} + +// ── InfluxDB V3 — JSONL (newline-delimited JSON) ────────────────────────────── + +/// Parse an InfluxDB V3 JSONL response body into a list of rows. +/// +/// Each non-empty line must be a JSON object. Field values of any JSON type +/// are stringified to `String`: +/// - `null` → `"null"` +/// - `bool` → `"true"` / `"false"` +/// - `number` → decimal representation +/// - `string` → value as-is (no extra quotes) +/// - `array` / `object` → compact JSON representation +/// +/// Blank lines are silently skipped. Lines that fail to parse as JSON objects +/// return an error. +/// +/// Uses `simd_json` for accelerated JSON tokenization in the hot path. 
+/// `simd_json::from_slice` requires `&mut [u8]` and modifies the bytes in +/// place for zero-copy SIMD parsing; we clone each line into a `Vec` to +/// satisfy the mutability requirement without borrowing the original string. +fn parse_object(value: BorrowedValue<'_>) -> Result { + let BorrowedValue::Object(map) = value else { + return Err(Error::InvalidRecordValue( + "expected a JSON object in JSONL response".to_string(), + )); + }; + let mut row = Row::with_capacity(map.len()); + for (k, v) in map.iter() { + // Convert simd_json BorrowedValue → serde_json::Value directly, preserving + // the original type. Numbers and booleans are kept typed so build_payload + // can emit them without a string-round-trip through parse_scalar. + let json_val = match v { + BorrowedValue::Static(simd_json::StaticNode::Null) => serde_json::Value::Null, + BorrowedValue::Static(simd_json::StaticNode::Bool(b)) => serde_json::Value::Bool(*b), + BorrowedValue::Static(simd_json::StaticNode::I64(n)) => { + serde_json::Value::Number((*n).into()) + } + BorrowedValue::Static(simd_json::StaticNode::U64(n)) => { + serde_json::Value::Number((*n).into()) + } + BorrowedValue::Static(simd_json::StaticNode::F64(n)) => { + serde_json::Number::from_f64(*n) + .map(serde_json::Value::Number) + .unwrap_or(serde_json::Value::Null) + } + BorrowedValue::String(s) => serde_json::Value::String(s.to_string()), + // Arrays/objects: serialize via simd_json then re-parse as serde_json. + // InfluxDB V3 SQL results are flat, so this path is rarely hit. 
+ other => serde_json::to_value(other) + .map_err(|e| Error::InvalidRecordValue(format!("JSON conversion error: {e}")))?, + }; + row.insert(k.to_string(), json_val); + } + Ok(row) +} + +pub fn parse_jsonl_rows(data: &str) -> Result, Error> { + let mut rows = Vec::new(); + let mut scratch = Vec::new(); // reused across lines + for line in data.lines() { + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + scratch.clear(); + scratch.extend_from_slice(trimmed.as_bytes()); + let obj: simd_json::BorrowedValue = simd_json::to_borrowed_value(&mut scratch) + .map_err(|e| Error::InvalidRecordValue(format!("JSON parse error: {e}")))?; + rows.push(parse_object(obj)?); + } + Ok(rows) +} +#[cfg(test)] +mod tests { + use super::*; + + // ── parse_csv_rows ─────────────────────────────────────────────────────── + + #[test] + fn csv_empty_string_returns_empty() { + assert!(parse_csv_rows("").unwrap().is_empty()); + } + + #[test] + fn csv_skips_annotation_rows() { + let csv = "#group,false\n#datatype,string\n_time,_value\n2024-01-01T00:00:00Z,42\n"; + let rows = parse_csv_rows(csv).unwrap(); + assert_eq!(rows.len(), 1); + assert_eq!(rows[0].get("_value").and_then(|v| v.as_str()), Some("42")); + } + + #[test] + fn csv_skips_blank_lines() { + let csv = "_time,_value\n2024-01-01T00:00:00Z,1\n\n_time,_value\n2024-01-01T00:00:01Z,2\n"; + let rows = parse_csv_rows(csv).unwrap(); + assert_eq!(rows.len(), 2, "expected 2 data rows, got {}", rows.len()); + } + + #[test] + fn csv_skips_repeated_header_rows() { + let csv = "_time,_value\n2024-01-01T00:00:00Z,10\n_time,_value\n2024-01-01T00:00:01Z,20\n"; + let rows = parse_csv_rows(csv).unwrap(); + assert_eq!(rows.len(), 2); + } + + #[test] + fn csv_new_table_different_columns_updates_headers() { + // Multi-table result: second table has an extra _measurement column. + // The parser should recognise the new header row and update accordingly. 
+ let csv = "_time,_value\n\ + 2024-01-01T00:00:00Z,10\n\ + _time,_measurement,_value\n\ + 2024-01-01T00:00:01Z,cpu,20\n"; + let rows = parse_csv_rows(csv).unwrap(); + assert_eq!(rows.len(), 2); + //assert!(rows[0].contains_key("_measurement")); + assert!(!rows[0].contains_key("_measurement")); + assert_eq!( + rows[1].get("_measurement").and_then(|v| v.as_str()), + Some("cpu") + ); + } + + #[test] + fn csv_maps_all_columns() { + let csv = "_time,_measurement,_field,_value\n2024-01-01T00:00:00Z,cpu,usage,75.0\n"; + let rows = parse_csv_rows(csv).unwrap(); + assert_eq!(rows.len(), 1); + let row = &rows[0]; + assert_eq!( + row.get("_measurement").and_then(|v| v.as_str()), + Some("cpu") + ); + assert_eq!(row.get("_field").and_then(|v| v.as_str()), Some("usage")); + assert_eq!(row.get("_value").and_then(|v| v.as_str()), Some("75.0")); + } + + #[test] + fn csv_no_data_rows_returns_empty() { + let csv = "_time,_value\n"; // header only + let rows = parse_csv_rows(csv).unwrap(); + assert!(rows.is_empty()); + } + + #[test] + fn csv_aggregation_query_without_time_column_parses_rows() { + // Flux window-aggregate queries (count(), mean(), etc.) produce result + // tables with _start and _stop but no _time. Before the _start/_stop fix, + // is_header_record returned false, headers stayed None, and all data rows + // were silently dropped. 
+ let csv = "_start,_stop,_field,_value\n\ + 2024-01-01T00:00:00Z,2024-01-01T01:00:00Z,usage,42\n\ + 2024-01-01T01:00:00Z,2024-01-01T02:00:00Z,usage,55\n"; + let rows = parse_csv_rows(csv).unwrap(); + assert_eq!(rows.len(), 2, "rows must not be silently dropped"); + assert_eq!(rows[0].get("_value").and_then(|v| v.as_str()), Some("42")); + assert_eq!(rows[1].get("_value").and_then(|v| v.as_str()), Some("55")); + } + + #[test] + fn csv_stop_only_header_is_recognised() { + let csv = "_stop,_count\n2024-01-01T01:00:00Z,7\n"; + let rows = parse_csv_rows(csv).unwrap(); + assert_eq!(rows.len(), 1); + assert_eq!(rows[0].get("_count").and_then(|v| v.as_str()), Some("7")); + } + + // ── parse_jsonl_rows ───────────────────────────────────────────────────── + + #[test] + fn jsonl_empty_string_returns_empty() { + assert!(parse_jsonl_rows("").unwrap().is_empty()); + } + + #[test] + fn jsonl_single_row() { + let jsonl = r#"{"_time":"2024-01-01T00:00:00Z","_measurement":"cpu","_value":75.5}"#; + let rows = parse_jsonl_rows(jsonl).unwrap(); + assert_eq!(rows.len(), 1); + assert_eq!( + rows[0].get("_measurement").and_then(|v| v.as_str()), + Some("cpu") + ); + // Numbers remain typed — no string round-trip. 
+ assert_eq!(rows[0].get("_value").and_then(|v| v.as_f64()), Some(75.5)); + } + + #[test] + fn jsonl_multiple_rows() { + let jsonl = "{\"_time\":\"2024-01-01T00:00:00Z\",\"v\":1}\n{\"_time\":\"2024-01-01T00:00:01Z\",\"v\":2}\n"; + let rows = parse_jsonl_rows(jsonl).unwrap(); + assert_eq!(rows.len(), 2); + assert_eq!(rows[0].get("v").and_then(|v| v.as_i64()), Some(1)); + assert_eq!(rows[1].get("v").and_then(|v| v.as_i64()), Some(2)); + } + + #[test] + fn jsonl_skips_blank_lines() { + let jsonl = "{\"v\":1}\n\n{\"v\":2}\n"; + let rows = parse_jsonl_rows(jsonl).unwrap(); + assert_eq!(rows.len(), 2); + } + + #[test] + fn jsonl_bool_values_remain_typed() { + let jsonl = r#"{"active":true,"disabled":false}"#; + let rows = parse_jsonl_rows(jsonl).unwrap(); + assert_eq!(rows[0].get("active"), Some(&serde_json::Value::Bool(true))); + assert_eq!( + rows[0].get("disabled"), + Some(&serde_json::Value::Bool(false)) + ); + } + + #[test] + fn jsonl_null_value_remains_typed() { + let jsonl = r#"{"field":null}"#; + let rows = parse_jsonl_rows(jsonl).unwrap(); + assert_eq!(rows[0].get("field"), Some(&serde_json::Value::Null)); + } + + #[test] + fn jsonl_string_values_unquoted() { + let jsonl = r#"{"host":"server1"}"#; + let rows = parse_jsonl_rows(jsonl).unwrap(); + assert_eq!( + rows[0].get("host").and_then(|v| v.as_str()), + Some("server1") + ); + } + + #[test] + fn jsonl_invalid_json_returns_error() { + let jsonl = "not json\n"; + assert!(parse_jsonl_rows(jsonl).is_err()); + } + + #[test] + fn jsonl_trailing_newline_ok() { + let jsonl = "{\"v\":42}\n"; + let rows = parse_jsonl_rows(jsonl).unwrap(); + assert_eq!(rows.len(), 1); + } + + // ── parse_jsonl_rows — additional type coverage ────────────────────────── + + #[test] + fn jsonl_negative_integer_uses_i64_branch() { + // Negative integers are stored as I64 by simd_json; positive ones are U64. 
+ let jsonl = r#"{"delta":-42,"count":7}"#; + let rows = parse_jsonl_rows(jsonl).unwrap(); + assert_eq!(rows[0].get("delta").and_then(|v| v.as_i64()), Some(-42)); + assert_eq!(rows[0].get("count").and_then(|v| v.as_u64()), Some(7)); + } + + #[test] + fn jsonl_nested_array_value_is_serialized_as_json() { + // The "other" arm in parse_object handles arrays and nested objects. + let jsonl = r#"{"tags":["a","b"]}"#; + let rows = parse_jsonl_rows(jsonl).unwrap(); + let tags = rows[0].get("tags").unwrap(); + assert!(tags.is_array(), "expected array value, got {tags:?}"); + } + + #[test] + fn jsonl_non_object_line_returns_error() { + // A JSONL line that is not a JSON object (e.g. a bare array) must be rejected. + let jsonl = "[1,2,3]\n"; + assert!(parse_jsonl_rows(jsonl).is_err()); + } + + // ── parse_csv_rows — path coverage ────────────────────────────────────── + + #[test] + fn csv_data_row_before_any_header_is_skipped() { + // A data row that appears before any _time/_start/_stop header row must be + // silently ignored (headers is None → continue). + let csv = "random,data\n_time,_value\n2024-01-01T00:00:00Z,1\n"; + let rows = parse_csv_rows(csv).unwrap(); + assert_eq!( + rows.len(), + 1, + "only the row after the header should be returned" + ); + assert_eq!(rows[0].get("_value").and_then(|v| v.as_str()), Some("1")); + } + + #[test] + fn csv_empty_header_column_name_is_skipped() { + // InfluxDB annotated CSV sometimes has a leading empty column (annotation prefix). + // Empty column names must be skipped so the row map stays clean. 
+ let csv = "_time,,_value\n2024-01-01T00:00:00Z,extra,42\n"; + let rows = parse_csv_rows(csv).unwrap(); + assert_eq!(rows.len(), 1); + assert!( + !rows[0].contains_key(""), + "empty key must not appear in row" + ); + assert_eq!(rows[0].get("_value").and_then(|v| v.as_str()), Some("42")); + } +} diff --git a/core/connectors/sources/influxdb_source/src/v2.rs b/core/connectors/sources/influxdb_source/src/v2.rs new file mode 100644 index 0000000000..7690072254 --- /dev/null +++ b/core/connectors/sources/influxdb_source/src/v2.rs @@ -0,0 +1,968 @@ +/* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +//! InfluxDB V2 source — Flux queries, annotated-CSV responses, Token auth. 
+ +use crate::common::{ + PayloadFormat, Row, RowContext, V2SourceConfig, V2State, apply_query_params, + is_timestamp_after, parse_csv_rows, parse_scalar, validate_cursor, +}; +use base64::{Engine as _, engine::general_purpose}; +use chrono::{DateTime, Utc}; +use iggy_connector_sdk::{Error, ProducedMessage, Schema}; +use reqwest::Url; +use reqwest_middleware::ClientWithMiddleware; +use serde_json::json; +use uuid::Uuid; + +fn build_query( + base: &str, + query: &str, + org: Option<&str>, +) -> Result<(Url, serde_json::Value), Error> { + let mut url = Url::parse(&format!("{base}/api/v2/query")) + .map_err(|e| Error::InvalidConfigValue(format!("Invalid InfluxDB URL: {e}")))?; + if let Some(o) = org { + url.query_pairs_mut().append_pair("org", o); + } + let body = json!({ + "query": query, + "dialect": { + "annotations": ["datatype", "group", "default"], + "delimiter": ",", + "header": true, + "commentPrefix": "#" + } + }); + Ok((url, body)) +} + +/// Maximum multiple of `batch_size` by which `already_seen` may inflate the +/// query limit. Prevents an unbounded request to InfluxDB when the cursor +/// is stuck at the same timestamp for many consecutive polls (analogous to +/// V3's `stuck_batch_cap_factor`). +const MAX_SKIP_INFLATION_FACTOR: u64 = 10; +const MAX_RESPONSE_BODY_BYTES: usize = 256 * 1024 * 1024; // 256 MiB + +/// Render the final Flux query by substituting `$cursor` and `$limit`. +/// +/// The limit is inflated by `already_seen` (rows at the current cursor +/// timestamp that were delivered in a previous batch) so that re-fetching +/// with `>= cursor` returns enough rows to skip them and still fill a full +/// batch. Inflation is capped at `MAX_SKIP_INFLATION_FACTOR × batch_size` +/// to prevent excessively large queries when the cursor is stuck. 
+fn render_query(config: &V2SourceConfig, cursor: &str, already_seen: u64) -> Result { + validate_cursor(cursor)?; + let batch = config.batch_size.unwrap_or(500) as u64; + // Cap inflation so a stuck cursor cannot issue arbitrarily large queries. + let capped_seen = already_seen.min(batch.saturating_mul(MAX_SKIP_INFLATION_FACTOR)); + let limit = batch.saturating_add(capped_seen).to_string(); + Ok(apply_query_params(&config.query, cursor, &limit, "")) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::common::{Row, RowContext}; + + fn row(pairs: &[(&str, &str)]) -> Row { + pairs + .iter() + .map(|(k, v)| (k.to_string(), serde_json::Value::String(v.to_string()))) + .collect() + } + + const BASE_CURSOR: &str = "1970-01-01T00:00:00Z"; + const T1: &str = "2024-01-01T00:00:00Z"; + const T2: &str = "2024-01-01T00:00:01Z"; + const T3: &str = "2024-01-01T00:00:02Z"; + + fn ctx(current_cursor: &str, now_micros: u64) -> RowContext<'_> { + RowContext { + cursor_field: "_time", + current_cursor, + include_metadata: true, + payload_col: None, + payload_format: PayloadFormat::Json, + now_micros, + } + } + + #[test] + fn process_rows_empty_returns_empty() { + let result = process_rows(&[], &ctx(BASE_CURSOR, 1000), 0).unwrap(); + assert!(result.messages.is_empty()); + assert!(result.max_cursor.is_none()); + assert_eq!(result.skipped, 0); + assert_eq!(result.rows_at_max_cursor, 0); + } + + #[test] + fn process_rows_single_row_produces_one_message() { + let rows = vec![row(&[("_time", T1), ("_value", "42")])]; + let result = process_rows(&rows, &ctx(BASE_CURSOR, 1000), 0).unwrap(); + assert_eq!(result.messages.len(), 1); + assert_eq!(result.max_cursor.as_deref(), Some(T1)); + assert_eq!(result.rows_at_max_cursor, 1); + assert_eq!(result.skipped, 0); + } + + #[test] + fn process_rows_skips_already_seen_at_cursor() { + // Three rows all at T1, cursor=T1, already_seen=1 → skip first, produce two. 
+ let rows = vec![ + row(&[("_time", T1), ("_value", "1")]), + row(&[("_time", T1), ("_value", "2")]), + row(&[("_time", T1), ("_value", "3")]), + ]; + let result = process_rows(&rows, &ctx(T1, 1000), 1).unwrap(); + assert_eq!(result.skipped, 1); + assert_eq!(result.messages.len(), 2); + } + + #[test] + fn process_rows_does_not_skip_beyond_already_seen() { + // already_seen=1 but there are 3 rows at cursor; only the first should be skipped. + let rows = vec![ + row(&[("_time", T1)]), + row(&[("_time", T1)]), + row(&[("_time", T1)]), + ]; + let result = process_rows(&rows, &ctx(T1, 1000), 1).unwrap(); + assert_eq!(result.skipped, 1); + assert_eq!(result.messages.len(), 2); + } + + #[test] + fn process_rows_tracks_latest_max_cursor() { + let rows = vec![ + row(&[("_time", T1)]), + row(&[("_time", T3)]), + row(&[("_time", T2)]), + ]; + let result = process_rows(&rows, &ctx(BASE_CURSOR, 1000), 0).unwrap(); + assert_eq!(result.max_cursor.as_deref(), Some(T3)); + assert_eq!(result.rows_at_max_cursor, 1); + } + + #[test] + fn process_rows_counts_rows_at_max_cursor() { + let rows = vec![ + row(&[("_time", T1)]), + row(&[("_time", T2)]), + row(&[("_time", T2)]), + ]; + let result = process_rows(&rows, &ctx(BASE_CURSOR, 1000), 0).unwrap(); + assert_eq!(result.max_cursor.as_deref(), Some(T2)); + assert_eq!(result.rows_at_max_cursor, 2); + } + + #[test] + fn process_rows_message_ids_are_some_and_unique() { + let rows = vec![row(&[("_time", T1)]), row(&[("_time", T2)])]; + let result = process_rows(&rows, &ctx(BASE_CURSOR, 1000), 0).unwrap(); + assert!(result.messages[0].id.is_some()); + assert!(result.messages[1].id.is_some()); + assert_ne!(result.messages[0].id, result.messages[1].id); + } + + #[test] + fn process_rows_message_timestamps_use_now_micros() { + let rows = vec![row(&[("_time", T1)])]; + let result = process_rows(&rows, &ctx(BASE_CURSOR, 999_999), 0).unwrap(); + assert_eq!(result.messages[0].timestamp, Some(999_999)); + 
assert_eq!(result.messages[0].origin_timestamp, Some(999_999)); + } + + #[test] + fn process_rows_row_without_cursor_field_still_produces_message() { + let rows = vec![row(&[("_value", "42")])]; // no _time field + let result = process_rows(&rows, &ctx(BASE_CURSOR, 1000), 0).unwrap(); + assert_eq!(result.messages.len(), 1); + assert!(result.max_cursor.is_none()); + } +} + +// ── Query execution ─────────────────────────────────────────────────────────── + +pub(crate) async fn run_query( + client: &ClientWithMiddleware, + config: &V2SourceConfig, + auth: &str, + cursor: &str, + already_seen: u64, +) -> Result { + let query = render_query(config, cursor, already_seen)?; + let base = config.url.trim_end_matches('/'); + let (url, body) = build_query(base, &query, Some(&config.org))?; + + let mut response = client + .post(url) + .header("Authorization", auth) + .header("Content-Type", "application/json") + .header("Accept", "text/csv") + .json(&body) + .send() + .await + .map_err(|e| Error::Storage(format!("InfluxDB V2 query failed: {e}")))?; + + let status = response.status(); + if status.is_success() { + // Stream chunk-by-chunk with a hard byte cap to mirror the V3 path and + // prevent OOM when MAX_SKIP_INFLATION_FACTOR inflates the effective batch. + if response + .content_length() + .is_some_and(|n| n as usize > MAX_RESPONSE_BODY_BYTES) + { + return Err(Error::Storage(format!( + "InfluxDB V2 response body exceeds {MAX_RESPONSE_BODY_BYTES} byte cap; \ + reduce batch_size to avoid OOM" + ))); + } + let mut buf: Vec = Vec::new(); + while let Some(chunk) = response + .chunk() + .await + .map_err(|e| Error::Storage(format!("Failed to read V2 response: {e}")))? 
+ { + buf.extend_from_slice(&chunk); + if buf.len() > MAX_RESPONSE_BODY_BYTES { + return Err(Error::Storage(format!( + "InfluxDB V2 response body exceeded {MAX_RESPONSE_BODY_BYTES} byte cap \ + while streaming; reduce batch_size to avoid OOM" + ))); + } + } + return String::from_utf8(buf) + .map_err(|e| Error::Storage(format!("V2 response body is not valid UTF-8: {e}"))); + } + + let body_text = response + .text() + .await + .unwrap_or_else(|_| "failed to read response body".to_string()); + + if iggy_connector_sdk::retry::is_transient_status(status) { + Err(Error::Storage(format!( + "InfluxDB V2 query failed with status {status}: {body_text}" + ))) + } else { + Err(Error::PermanentHttpError(format!( + "InfluxDB V2 query failed with status {status}: {body_text}" + ))) + } +} + +// ── Message building ────────────────────────────────────────────────────────── + +fn build_payload( + row: &Row, + payload_column: Option<&str>, + payload_format: PayloadFormat, + include_metadata: bool, +) -> Result, Error> { + if let Some(col) = payload_column { + // V2 CSV values are always Value::String; extract once and reuse. 
+ let raw = row + .get(col) + .and_then(|v| v.as_str()) + .ok_or_else(|| Error::InvalidRecordValue(format!("Missing payload column '{col}'")))?; + return match payload_format { + PayloadFormat::Json => { + let v: serde_json::Value = serde_json::from_str(raw).map_err(|e| { + Error::InvalidRecordValue(format!( + "Payload column '{col}' is not valid JSON: {e}" + )) + })?; + serde_json::to_vec(&v) + .map_err(|e| Error::Serialization(format!("JSON serialization failed: {e}"))) + } + PayloadFormat::Text => Ok(raw.as_bytes().to_vec()), + PayloadFormat::Raw => general_purpose::STANDARD + .decode(raw.as_bytes()) + .map_err(|e| { + Error::InvalidRecordValue(format!("Failed to decode payload as base64: {e}")) + }), + }; + } + + // Single pass over the row: extract envelope fields and build json_row + // simultaneously, avoiding the second HashMap lookups that the two-pass + // approach required. + // parse_scalar is called only when the result will actually be used — + // skipping it for metadata fields when include_metadata=false avoids + // three failed parse attempts (bool, i64, f64) per discarded field. + let mut json_row = serde_json::Map::new(); + let mut measurement: &str = ""; + let mut field_name: &str = ""; + let mut timestamp_str: &str = ""; + let mut field_value = serde_json::Value::Null; + + // V2 CSV values arrive as Value::String; extract the &str once and call + // parse_scalar to infer bool / i64 / f64 / string type from the raw text. 
+ for (key, val) in row { + let val_str = val.as_str().unwrap_or(""); + match key.as_str() { + "_measurement" => { + measurement = val_str; + if include_metadata { + json_row.insert(key.clone(), parse_scalar(val_str)); + } + } + "_field" => { + field_name = val_str; + if include_metadata { + json_row.insert(key.clone(), parse_scalar(val_str)); + } + } + "_time" => { + timestamp_str = val_str; + // _time always included (needed for cursor tracking by consumers) + json_row.insert(key.clone(), parse_scalar(val_str)); + } + "_value" => { + let parsed = parse_scalar(val_str); + field_value = parsed.clone(); + json_row.insert(key.clone(), parsed); + } + _ => { + if include_metadata { + json_row.insert(key.clone(), parse_scalar(val_str)); + } + } + } + } + + let wrapped = json!({ + "measurement": measurement, + "field": field_name, + "timestamp": timestamp_str, + "value": field_value, + "row": json_row, + }); + + serde_json::to_vec(&wrapped) + .map_err(|e| Error::Serialization(format!("JSON serialization failed: {e}"))) +} + +pub(crate) struct PollResult { + pub messages: Vec, + pub max_cursor: Option, + pub rows_at_max_cursor: u64, + pub skipped: u64, + pub schema: Schema, +} + +// ── Row processing (pure, testable without HTTP) ────────────────────────────── + +/// Result of processing a batch of V2 rows into Iggy messages. +pub(crate) struct RowProcessingResult { + pub messages: Vec, + pub max_cursor: Option, + pub rows_at_max_cursor: u64, + pub skipped: u64, +} + +/// Converts a slice of V2 query rows into Iggy messages. +/// +/// ## Cursor semantics and deduplication +/// +/// InfluxDB V2 Flux queries use `>= $cursor` (inclusive), so the first batch after +/// a cursor advance will re-include any rows whose timestamp equals the new cursor. +/// `already_seen` is the count of such rows delivered in the previous batch; this +/// function skips exactly that many leading rows that match `ctx.current_cursor`, +/// preventing duplicate delivery across batch boundaries. 
+/// +/// `already_seen` is a separate parameter rather than part of [`RowContext`] because +/// it is V2-specific: V3 uses strict `> cursor` and never needs to skip rows. +/// +/// ## Cursor tracking +/// +/// Each row's cursor field is compared as a timestamp. The highest timestamp seen +/// among emitted rows becomes `max_cursor` in the result. `rows_at_max_cursor` +/// counts how many emitted rows share that timestamp — the caller uses this to +/// detect when a batch is stuck (all rows share the same timestamp and fill the +/// entire batch), at which point the effective batch size is inflated. +/// +/// Rows that are missing the cursor field still produce messages; they do not +/// contribute to cursor tracking and are excluded from skip logic. +/// +/// ## Message identity +/// +/// A single random UUID is generated per call; per-message IDs are derived by +/// adding the message's position to that base, keeping PRNG work O(1) per batch. +/// +/// ## Parameters +/// +/// - `rows`: Rows returned by the Flux query for this poll. +/// - `ctx`: Shared context (cursor field name, current cursor value, payload config, +/// wall-clock time in microseconds). +/// - `already_seen`: Number of rows at `ctx.current_cursor` to skip — rows already +/// delivered in the previous batch that the `>=` query re-included. +/// +/// ## Returns +/// +/// A [`RowProcessingResult`] containing: +/// - `messages`: One [`ProducedMessage`] per non-skipped row. +/// - `max_cursor`: Highest cursor timestamp seen among emitted rows, if any. +/// - `rows_at_max_cursor`: Count of emitted rows sharing `max_cursor`. +/// - `skipped`: Number of rows skipped due to `already_seen` deduplication. 
+pub(crate) fn process_rows( + rows: &[Row], + ctx: &RowContext<'_>, + already_seen: u64, +) -> Result { + let mut messages = Vec::with_capacity(rows.len()); + let mut max_cursor: Option = None; + let mut max_cursor_parsed: Option> = None; + let mut rows_at_max_cursor = 0u64; + let mut skipped = 0u64; + // Generate the base UUID once per poll; derive per-message IDs by addition. + // This is O(1) PRNG calls per batch instead of O(n), measurable at batch ≥ 100. + let id_base = Uuid::new_v4().as_u128(); + + for row in rows.iter() { + // Single lookup for cursor_field — used for both skip logic and max-cursor tracking. + // V2 CSV rows store all values as Value::String; .as_str() is always Some. + let cv = row.get(ctx.cursor_field).and_then(|v| v.as_str()); + if cv == Some(ctx.current_cursor) && skipped < already_seen { + skipped += 1; + continue; + } + + if let Some(cv) = cv { + match max_cursor_parsed { + None => { + max_cursor = Some(cv.to_string()); + max_cursor_parsed = cv.parse::>().ok(); + rows_at_max_cursor = 1; + } + Some(current_dt) => { + if is_timestamp_after(cv, current_dt) { + max_cursor = Some(cv.to_string()); + max_cursor_parsed = cv.parse::>().ok(); + rows_at_max_cursor = 1; + } else if max_cursor.as_deref() == Some(cv) { + rows_at_max_cursor += 1; + } + } + } + } + + let payload = build_payload( + row, + ctx.payload_col, + ctx.payload_format, + ctx.include_metadata, + )?; + messages.push(ProducedMessage { + id: Some(id_base.wrapping_add(messages.len() as u128)), + checksum: None, + timestamp: Some(ctx.now_micros), + origin_timestamp: Some(ctx.now_micros), + headers: None, + payload, + }); + } + + Ok(RowProcessingResult { + messages, + max_cursor, + rows_at_max_cursor, + skipped, + }) +} + +pub(crate) async fn poll( + client: &ClientWithMiddleware, + config: &V2SourceConfig, + auth: &str, + state: &V2State, + payload_format: PayloadFormat, + include_metadata: bool, +) -> Result { + let cursor = state + .last_timestamp + .clone() + .or_else(|| 
config.initial_offset.clone()) + .unwrap_or_else(|| "1970-01-01T00:00:00Z".to_string()); + + let already_seen = state.cursor_row_count; + let response_data = run_query(client, config, auth, &cursor, already_seen).await?; + let rows = parse_csv_rows(&response_data)?; + + let ctx = RowContext { + cursor_field: config.cursor_field.as_deref().unwrap_or("_time"), + current_cursor: &cursor, + include_metadata, + payload_col: config.payload_column.as_deref(), + payload_format, + now_micros: iggy_common::Utc::now().timestamp_micros() as u64, + }; + + let result = process_rows(&rows, &ctx, already_seen)?; + + let schema = if ctx.payload_col.is_some() { + ctx.payload_format.schema() + } else { + Schema::Json + }; + + Ok(PollResult { + messages: result.messages, + max_cursor: result.max_cursor, + rows_at_max_cursor: result.rows_at_max_cursor, + skipped: result.skipped, + schema, + }) +} + +#[cfg(test)] +mod http_tests { + use super::*; + use axum::Router; + use axum::extract::Request; + use axum::http::{HeaderMap, StatusCode}; + use axum::routing::post; + use secrecy::SecretString; + use std::sync::Arc; + use std::time::Duration; + use tokio::sync::Mutex; + + // ── helpers ─────────────────────────────────────────────────────────────── + + async fn start_server(router: Router) -> String { + let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap(); + let port = listener.local_addr().unwrap().port(); + tokio::spawn(async move { + axum::serve(listener, router).await.unwrap(); + }); + format!("http://127.0.0.1:{port}") + } + + fn make_client() -> ClientWithMiddleware { + let raw = reqwest::Client::builder() + .timeout(Duration::from_secs(5)) + .build() + .unwrap(); + iggy_connector_sdk::retry::build_retry_client( + raw, + 1, + Duration::from_millis(1), + Duration::from_millis(10), + "test", + ) + } + + fn make_config(url: &str) -> V2SourceConfig { + V2SourceConfig { + url: url.to_string(), + org: "test_org".to_string(), + token: SecretString::from("test_token"), 
+ query: "SELECT * FROM t WHERE time >= '$cursor' LIMIT $limit".to_string(), + poll_interval: None, + batch_size: Some(10), + cursor_field: None, + initial_offset: None, + payload_column: None, + payload_format: None, + include_metadata: None, + verbose_logging: None, + max_retries: Some(1), + retry_delay: Some("1ms".to_string()), + timeout: Some("5s".to_string()), + max_open_retries: Some(1), + open_retry_max_delay: Some("10ms".to_string()), + retry_max_delay: Some("10ms".to_string()), + circuit_breaker_threshold: None, + circuit_breaker_cool_down: None, + } + } + + const CURSOR: &str = "1970-01-01T00:00:00Z"; + + // ── run_query ───────────────────────────────────────────────────────────── + + #[tokio::test] + async fn run_query_returns_body_on_200() { + let csv = "_time,_value\n2024-01-01T00:00:00Z,42\n"; + let app = Router::new().route( + "/api/v2/query", + post(move || async move { (StatusCode::OK, csv) }), + ); + let base = start_server(app).await; + let result = run_query(&make_client(), &make_config(&base), "Token tok", CURSOR, 0) + .await + .unwrap(); + assert!(result.contains("_value")); + assert!(result.contains("42")); + } + + #[tokio::test] + async fn run_query_empty_body_on_200() { + let app = Router::new().route("/api/v2/query", post(|| async { (StatusCode::OK, "") })); + let base = start_server(app).await; + let result = run_query(&make_client(), &make_config(&base), "Token tok", CURSOR, 0) + .await + .unwrap(); + assert!(result.is_empty()); + } + + #[tokio::test] + async fn run_query_500_returns_transient_error() { + let app = Router::new().route( + "/api/v2/query", + post(|| async { StatusCode::INTERNAL_SERVER_ERROR }), + ); + let base = start_server(app).await; + let result = run_query(&make_client(), &make_config(&base), "Token tok", CURSOR, 0).await; + assert!(matches!(result, Err(Error::Storage(_)))); + } + + #[tokio::test] + async fn run_query_400_returns_permanent_error() { + let app = Router::new().route("/api/v2/query", post(|| async { 
StatusCode::BAD_REQUEST })); + let base = start_server(app).await; + let result = run_query(&make_client(), &make_config(&base), "Token tok", CURSOR, 0).await; + assert!(matches!(result, Err(Error::PermanentHttpError(_)))); + } + + #[tokio::test] + async fn run_query_sends_token_authorization_header() { + let captured: Arc> = Arc::new(Mutex::new(String::new())); + let cap2 = captured.clone(); + let app = Router::new().route( + "/api/v2/query", + post(move |headers: HeaderMap| { + let cap = cap2.clone(); + async move { + *cap.lock().await = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .unwrap_or("") + .to_string(); + StatusCode::OK + } + }), + ); + let base = start_server(app).await; + let _ = run_query( + &make_client(), + &make_config(&base), + "Token my_token", + CURSOR, + 0, + ) + .await; + assert_eq!(*captured.lock().await, "Token my_token"); + } + + #[tokio::test] + async fn run_query_sends_org_in_query_params() { + let captured_uri: Arc> = Arc::new(Mutex::new(String::new())); + let cap2 = captured_uri.clone(); + let app = Router::new().route( + "/api/v2/query", + post(move |request: Request| { + let cap = cap2.clone(); + async move { + *cap.lock().await = request.uri().to_string(); + StatusCode::OK + } + }), + ); + let base = start_server(app).await; + let _ = run_query(&make_client(), &make_config(&base), "Token tok", CURSOR, 0).await; + assert!(captured_uri.lock().await.contains("org=test_org")); + } + + #[tokio::test] + async fn run_query_request_body_contains_substituted_query() { + let captured_body: Arc> = Arc::new(Mutex::new(String::new())); + let cap2 = captured_body.clone(); + let app = Router::new().route( + "/api/v2/query", + post(move |request: Request| { + let cap = cap2.clone(); + async move { + let bytes = axum::body::to_bytes(request.into_body(), usize::MAX) + .await + .unwrap(); + *cap.lock().await = String::from_utf8_lossy(&bytes).to_string(); + StatusCode::OK + } + }), + ); + let base = start_server(app).await; + let 
cursor = "2024-01-01T00:00:00Z"; + let _ = run_query(&make_client(), &make_config(&base), "Token tok", cursor, 0).await; + let body = captured_body.lock().await; + // The $cursor placeholder should be replaced with the cursor value + assert!(body.contains(cursor)); + // $limit should be replaced with the batch size (10) + assert!(body.contains("10")); + // The raw placeholders must NOT appear in the sent query + assert!(!body.contains("$cursor")); + assert!(!body.contains("$limit")); + } + + // ── poll() end-to-end ───────────────────────────────────────────────────── + + #[tokio::test] + async fn poll_returns_messages_for_csv_response() { + let csv = "_time,_value\n\ + 2024-01-01T00:00:01Z,42\n\ + 2024-01-01T00:00:02Z,43\n"; + let app = Router::new().route( + "/api/v2/query", + post(move || async move { (StatusCode::OK, csv) }), + ); + let base = start_server(app).await; + let state = V2State::default(); + let result = poll( + &make_client(), + &make_config(&base), + "Token tok", + &state, + PayloadFormat::Json, + true, + ) + .await + .unwrap(); + assert_eq!(result.messages.len(), 2); + assert_eq!(result.max_cursor.as_deref(), Some("2024-01-01T00:00:02Z")); + assert_eq!(result.rows_at_max_cursor, 1); + assert_eq!(result.skipped, 0); + assert_eq!(result.schema, Schema::Json); + } + + #[tokio::test] + async fn poll_advances_cursor_to_latest_out_of_order_timestamp() { + // Rows arrive in non-chronological order; max_cursor must still be the latest. 
+ let csv = "_time,_value\n\ + 2024-01-01T00:00:01Z,10\n\ + 2024-01-01T00:00:03Z,30\n\ + 2024-01-01T00:00:02Z,20\n"; + let app = Router::new().route( + "/api/v2/query", + post(move || async move { (StatusCode::OK, csv) }), + ); + let base = start_server(app).await; + let state = V2State::default(); + let result = poll( + &make_client(), + &make_config(&base), + "Token tok", + &state, + PayloadFormat::Json, + true, + ) + .await + .unwrap(); + assert_eq!(result.messages.len(), 3); + assert_eq!(result.max_cursor.as_deref(), Some("2024-01-01T00:00:03Z")); + } + + #[tokio::test] + async fn poll_skips_already_seen_rows_at_cursor() { + // State says we already delivered 1 row at T1. + // Server returns 3 rows all at T1 → first must be skipped. + let t1 = "2024-01-01T00:00:01Z"; + let csv = format!("_time,_value\n{t1},1\n{t1},2\n{t1},3\n"); + let app = Router::new().route( + "/api/v2/query", + post(move || async move { (StatusCode::OK, csv) }), + ); + let base = start_server(app).await; + let state = V2State { + last_timestamp: Some(t1.to_string()), + cursor_row_count: 1, + processed_rows: 5, + }; + let result = poll( + &make_client(), + &make_config(&base), + "Token tok", + &state, + PayloadFormat::Json, + true, + ) + .await + .unwrap(); + assert_eq!(result.skipped, 1); + assert_eq!(result.messages.len(), 2); + assert_eq!(result.rows_at_max_cursor, 2); + } + + #[tokio::test] + async fn poll_empty_csv_returns_no_messages() { + let app = Router::new().route("/api/v2/query", post(|| async { (StatusCode::OK, "") })); + let base = start_server(app).await; + let state = V2State { + last_timestamp: Some("2024-01-01T00:00:00Z".to_string()), + ..V2State::default() + }; + let result = poll( + &make_client(), + &make_config(&base), + "Token tok", + &state, + PayloadFormat::Json, + true, + ) + .await + .unwrap(); + assert!(result.messages.is_empty()); + assert!(result.max_cursor.is_none()); + assert_eq!(result.skipped, 0); + } + + #[tokio::test] + async fn 
poll_propagates_http_error() { + let app = Router::new().route( + "/api/v2/query", + post(|| async { StatusCode::INTERNAL_SERVER_ERROR }), + ); + let base = start_server(app).await; + let state = V2State::default(); + let result = poll( + &make_client(), + &make_config(&base), + "Token tok", + &state, + PayloadFormat::Json, + true, + ) + .await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn poll_counts_rows_at_same_max_cursor() { + // Two rows share the latest timestamp; rows_at_max_cursor must be 2. + let t1 = "2024-01-01T00:00:01Z"; + let t2 = "2024-01-01T00:00:02Z"; + let csv = format!("_time,_value\n{t1},1\n{t2},2\n{t2},3\n"); + let app = Router::new().route( + "/api/v2/query", + post(move || async move { (StatusCode::OK, csv) }), + ); + let base = start_server(app).await; + let state = V2State::default(); + let result = poll( + &make_client(), + &make_config(&base), + "Token tok", + &state, + PayloadFormat::Json, + true, + ) + .await + .unwrap(); + assert_eq!(result.max_cursor.as_deref(), Some(t2)); + assert_eq!(result.rows_at_max_cursor, 2); + assert_eq!(result.messages.len(), 3); + } + + #[tokio::test] + async fn poll_schema_matches_payload_format() { + // When a payload_column is configured the schema should reflect + // the format (Text here), not always Json. 
+ let csv = "_time,data\n2024-01-01T00:00:01Z,hello\n"; + let app = Router::new().route( + "/api/v2/query", + post(move || async move { (StatusCode::OK, csv) }), + ); + let base = start_server(app).await; + let config = V2SourceConfig { + payload_column: Some("data".to_string()), + ..make_config(&base) + }; + let state = V2State::default(); + let result = poll( + &make_client(), + &config, + "Token tok", + &state, + PayloadFormat::Text, + true, + ) + .await + .unwrap(); + assert_eq!(result.messages.len(), 1); + assert_eq!(result.schema, Schema::Text); + // The raw text should be the payload bytes + assert_eq!(result.messages[0].payload, b"hello"); + } + + #[tokio::test] + async fn poll_permanent_http_error_propagates() { + let app = Router::new().route("/api/v2/query", post(|| async { StatusCode::BAD_REQUEST })); + let base = start_server(app).await; + let state = V2State::default(); + let result = poll( + &make_client(), + &make_config(&base), + "Token tok", + &state, + PayloadFormat::Json, + true, + ) + .await; + assert!(matches!(result, Err(Error::PermanentHttpError(_)))); + } + + // ── build_query ────────────────────────────────────────────────────────── + + const BASE: &str = "http://localhost:8086"; + + #[test] + fn build_query_url_path_and_org_param() { + let (url, body) = build_query( + BASE, + "from(bucket:\"b\") |> range(start:-1h)", + Some("myorg"), + ) + .unwrap(); + assert!( + url.path().ends_with("/api/v2/query"), + "wrong path: {}", + url.path() + ); + assert!( + url.query().unwrap_or("").contains("org=myorg"), + "missing org param" + ); + assert!(body["query"].is_string()); + let annotations = body["dialect"]["annotations"].as_array().unwrap(); + assert!(annotations.iter().any(|v| v.as_str() == Some("datatype"))); + assert!(annotations.iter().any(|v| v.as_str() == Some("group"))); + assert!(annotations.iter().any(|v| v.as_str() == Some("default"))); + } + + #[test] + fn build_query_without_org_omits_param() { + let (url, _) = build_query(BASE, 
"SELECT 1", None).unwrap(); + assert!(url.query().is_none_or(|q| !q.contains("org="))); + } + + #[test] + fn build_query_invalid_base_returns_error() { + assert!(build_query("not-a-url", "SELECT 1", None).is_err()); + } +} diff --git a/core/connectors/sources/influxdb_source/src/v3.rs b/core/connectors/sources/influxdb_source/src/v3.rs new file mode 100644 index 0000000000..b81800c7a2 --- /dev/null +++ b/core/connectors/sources/influxdb_source/src/v3.rs @@ -0,0 +1,1356 @@ +/* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +//! InfluxDB V3 source — SQL queries, JSONL responses, Bearer auth. +//! +//! V3 uses strict `> cursor` semantics. DataFusion/Parquet does not guarantee +//! stable ordering for rows that share the same timestamp, so the V2 skip-N +//! approach is not safe here. If all rows in a batch share the same timestamp, +//! the cursor cannot advance — the effective batch size is doubled each poll +//! up to `stuck_batch_cap_factor × batch_size`. If the cap is reached, the +//! circuit breaker is tripped. 
+ +use crate::common::{ + DEFAULT_V3_CURSOR_FIELD, PayloadFormat, Row, RowContext, V3SourceConfig, V3State, + apply_query_params, is_timestamp_after, parse_jsonl_rows, validate_cursor, +}; +use base64::{Engine as _, engine::general_purpose}; +use chrono::{DateTime, Utc}; +use iggy_connector_sdk::{Error, ProducedMessage, Schema}; +use reqwest::Url; +use reqwest_middleware::ClientWithMiddleware; +use serde_json::json; +use tracing::warn; +use uuid::Uuid; + +pub(crate) const DEFAULT_STUCK_CAP_FACTOR: u32 = 10; +/// Upper bound for `stuck_batch_cap_factor`. A value of 1000 with batch_size=1000 +/// would issue 1,000,000-row queries before tripping the circuit breaker. +pub(crate) const MAX_STUCK_CAP_FACTOR: u32 = 100; + +/// Hard cap on buffered JSONL response body size. +/// +/// `MAX_STUCK_CAP_FACTOR` can inflate the effective batch to 100 × `batch_size`, +/// making unbounded `response.text()` a real OOM vector under misconfiguration. +/// Streaming stops and returns an error once this many bytes have been read. +const MAX_RESPONSE_BODY_BYTES: usize = 256 * 1024 * 1024; // 256 MiB + +/// InfluxDB V3 query endpoint expects this exact string for JSONL response format. 
+const QUERY_FORMAT_JSONL: &str = "jsonl"; + +fn build_query(base: &str, query: &str, db: &str) -> Result<(Url, serde_json::Value), Error> { + let url = Url::parse(&format!("{base}/api/v3/query_sql")) + .map_err(|e| Error::InvalidConfigValue(format!("Invalid InfluxDB URL: {e}")))?; + let body = json!({ + "db": db, + "q": query, + "format": QUERY_FORMAT_JSONL + }); + Ok((url, body)) +} + +// ── Query execution ─────────────────────────────────────────────────────────── + +pub(crate) async fn run_query( + client: &ClientWithMiddleware, + config: &V3SourceConfig, + auth: &str, + cursor: &str, + effective_batch: u32, + offset: u64, +) -> Result { + validate_cursor(cursor)?; + let q = apply_query_params( + &config.query, + cursor, + &effective_batch.to_string(), + &offset.to_string(), /* &str */ + ); + let base = config.url.trim_end_matches('/'); + let (url, body) = build_query(base, &q, &config.db)?; + + let mut response = client + .post(url) + .header("Authorization", auth) + .header("Content-Type", "application/json") + .header("Accept", "application/json") + .json(&body) + .send() + .await + .map_err(|e| Error::Storage(format!("InfluxDB V3 query failed: {e}")))?; + + let status = response.status(); + if status.is_success() { + // Stream chunk-by-chunk with a hard byte cap to prevent OOM when + // MAX_STUCK_CAP_FACTOR inflates the effective batch to 100 × batch_size. + if response + .content_length() + .is_some_and(|n| n as usize > MAX_RESPONSE_BODY_BYTES) + { + return Err(Error::Storage(format!( + "InfluxDB V3 response body exceeds {MAX_RESPONSE_BODY_BYTES} byte cap; \ + reduce batch_size to avoid OOM" + ))); + } + let mut buf: Vec = Vec::new(); + while let Some(chunk) = response + .chunk() + .await + .map_err(|e| Error::Storage(format!("Failed to read V3 response: {e}")))? 
+ { + buf.extend_from_slice(&chunk); + if buf.len() > MAX_RESPONSE_BODY_BYTES { + return Err(Error::Storage(format!( + "InfluxDB V3 response body exceeded {MAX_RESPONSE_BODY_BYTES} byte cap \ + while streaming; reduce batch_size to avoid OOM" + ))); + } + } + return String::from_utf8(buf) + .map_err(|e| Error::Storage(format!("V3 response body is not valid UTF-8: {e}"))); + } + + let body_text = response + .text() + .await + .unwrap_or_else(|_| "failed to read response body".to_string()); + + // 404 "database not found" means the namespace has not been written to yet; + // treat it as empty rather than a failure so the circuit breaker stays healthy. + // Any other 404 (e.g. "table not found") is a permanent error — don't swallow it. + if status.as_u16() == 404 { + if body_text.to_lowercase().contains("database not found") { + return Ok(String::new()); + } + return Err(Error::PermanentHttpError(format!( + "InfluxDB V3 query failed with status {status}: {body_text}" + ))); + } + + if iggy_connector_sdk::retry::is_transient_status(status) { + Err(Error::Storage(format!( + "InfluxDB V3 query failed with status {status}: {body_text}" + ))) + } else { + Err(Error::PermanentHttpError(format!( + "InfluxDB V3 query failed with status {status}: {body_text}" + ))) + } +} + +// ── Message building ────────────────────────────────────────────────────────── + +fn build_payload( + row: &Row, + payload_column: Option<&str>, + payload_format: PayloadFormat, + include_metadata: bool, + cursor_field: &str, +) -> Result, Error> { + if let Some(col) = payload_column { + let raw = row + .get(col) + .cloned() + .ok_or_else(|| Error::InvalidRecordValue(format!("Missing payload column '{col}'")))?; + return match payload_format { + // raw is already a serde_json::Value — serialize directly, no re-parse. 
+ PayloadFormat::Json => serde_json::to_vec(&raw) + .map_err(|e| Error::Serialization(format!("JSON serialization failed: {e}"))), + PayloadFormat::Text => match raw { + serde_json::Value::String(s) => Ok(s.into_bytes()), + other => serde_json::to_vec(&other) + .map_err(|e| Error::Serialization(format!("JSON serialization failed: {e}"))), + }, + PayloadFormat::Raw => { + let s = raw.as_str().ok_or_else(|| { + Error::InvalidRecordValue(format!( + "Payload column '{col}' must be a string value for Raw format" + )) + })?; + general_purpose::STANDARD.decode(s.as_bytes()).map_err(|e| { + Error::InvalidRecordValue(format!("Failed to decode payload as base64: {e}")) + }) + } + }; + } + + // Serialize directly from borrowed references — avoids O(fields) String+Value + // clones that the collect-into-Map approach required at high batch sizes. + struct RowView<'a> { + row: &'a Row, + cursor_field: &'a str, + include_metadata: bool, + } + impl serde::Serialize for RowView<'_> { + fn serialize(&self, s: S) -> Result { + use serde::ser::SerializeMap; + let entries: Vec<_> = self + .row + .iter() + .filter(|(k, _)| self.include_metadata || k.as_str() != self.cursor_field) + .collect(); + let mut map = s.serialize_map(Some(entries.len()))?; + for (k, v) in &entries { + map.serialize_entry(k, v)?; + } + map.end() + } + } + serde_json::to_vec(&RowView { + row, + cursor_field, + include_metadata, + }) + .map_err(|e| Error::Serialization(format!("JSON serialization failed: {e}"))) +} + +/// Compute the next effective batch size when the batch is stuck. +/// Doubles until it reaches `cap`. Returns `None` if already at cap. 
+pub(crate) fn next_stuck_batch_size(current: u32, base: u32, cap_factor: u32) -> Option { + let cap = base.saturating_mul(cap_factor); + if current >= cap { + None + } else { + Some(current.saturating_mul(2).min(cap)) + } +} + +// ── Poll ────────────────────────────────────────────────────────────────────── + +pub(crate) struct PollResult { + pub messages: Vec, + pub new_state: V3State, + pub schema: Schema, + /// Set to true when the stuck-timestamp cap was reached and the circuit + /// breaker should be tripped by the caller. + pub trip_circuit_breaker: bool, +} + +// ── Row processing (pure, testable without HTTP) ────────────────────────────── + +/// Normalize a raw timestamp from InfluxDB V3 JSONL into a cursor-safe RFC 3339 string. +/// +/// InfluxDB 3 Core returns timestamps without a timezone suffix and with nanosecond +/// precision (e.g. `"2026-04-26T02:32:20.526360865"`). The only required fix is +/// appending `"Z"` when no timezone suffix is present (InfluxDB always stores UTC). +/// +/// Full nanosecond precision is intentionally preserved — truncating to milliseconds +/// would place the cursor BEFORE the actual row timestamps within the same millisecond, +/// causing `WHERE time > '$cursor'` to re-deliver already-seen rows on subsequent polls. +/// InfluxDB 3's DataFusion SQL engine handles RFC 3339 strings with any number of +/// fractional digits in WHERE clause timestamp comparisons. +fn normalize_v3_timestamp(ts: &str) -> std::borrow::Cow<'_, str> { + if chrono::DateTime::parse_from_rfc3339(ts).is_ok() { + std::borrow::Cow::Borrowed(ts) // Zero allocation + } else { + std::borrow::Cow::Owned(format!("{ts}Z")) + } +} + +/// Result of processing a batch of V3 rows into Iggy messages. +#[derive(Debug)] +pub(crate) struct RowProcessingResult { + pub messages: Vec, + pub max_cursor: Option, + /// Count of rows whose cursor == max_cursor. 
+ /// Used for stuck-batch detection: if this equals effective_batch, the cursor + /// cannot advance and the batch size must be inflated. + pub rows_at_max_cursor: u64, +} + +/// Converts a slice of V3 query rows into Iggy messages. +/// +/// ## Cursor semantics +/// +/// InfluxDB V3 queries use strict `WHERE time > '$cursor'` semantics, so every row +/// in the slice is strictly after the current cursor. No row-skipping or deduplication +/// is needed — unlike V2's inclusive `>= $cursor` query. +/// +/// ## Timestamp normalization +/// +/// InfluxDB 3 Core returns timestamps without a timezone suffix and with nanosecond +/// precision (e.g. `"2026-04-26T02:32:20.526360865"`). Each cursor value is passed +/// through [`normalize_v3_timestamp`] before parsing, which appends `"Z"` when no +/// timezone designator is present. Full nanosecond precision is preserved — truncating +/// to milliseconds would shift the cursor before rows that share a millisecond +/// boundary, causing re-delivery on the next poll. +/// +/// ## Cursor tracking and stuck-batch detection +/// +/// The highest timestamp seen across the batch becomes `max_cursor` in the result. +/// `rows_at_max_cursor` counts how many rows share that timestamp, computed in a +/// single pass with no extra allocations. The caller uses +/// `rows_at_max_cursor >= effective_batch_size` as the stuck-batch signal: when a +/// group of rows all carry the same timestamp and fill the entire batch, the cursor +/// cannot advance further, so the effective batch size is doubled (up to +/// `stuck_batch_cap_factor × base_batch_size`) to read past the tied rows. +/// +/// ## Error conditions +/// +/// - Any row whose cursor field value fails [`validate_cursor`] is an immediate error. +/// - If the batch is non-empty but *no* row contains the cursor field, an error is +/// returned — the cursor cannot advance and the connector would re-deliver the same +/// rows indefinitely. 
Individual rows that merely lack the cursor field (while at +/// least one other row has it) still produce messages without error. +/// +/// ## Message identity +/// +/// A single random UUID is generated per call; per-message IDs are derived by +/// adding the message's position to that base, keeping PRNG work O(1) per batch. +/// +/// ## Parameters +/// +/// - `rows`: Rows returned by the SQL query for this poll (already filtered by `> cursor`). +/// - `ctx`: Shared context (cursor field name, current cursor value, payload config, +/// wall-clock time in microseconds). +/// +/// ## Returns +/// +/// A [`RowProcessingResult`] containing: +/// - `messages`: One [`ProducedMessage`] per row. +/// - `max_cursor`: Highest cursor timestamp seen, if any row had a parsable cursor field. +/// - `rows_at_max_cursor`: Count of rows sharing `max_cursor`; used for stuck-batch detection. +pub(crate) fn process_rows( + rows: &[Row], + ctx: &RowContext<'_>, +) -> Result { + let mut messages = Vec::with_capacity(rows.len()); + let mut max_cursor: Option = None; + let mut max_cursor_parsed: Option> = None; // cache parsed form + // Counted inline to avoid a second pass over rows (which would also + // re-call normalize_v3_timestamp for each row — extra allocations). + let mut rows_at_max_cursor = 0u64; + // Generate the base UUID once per poll; derive per-message IDs by addition. + // This is O(1) PRNG calls per batch instead of O(n), measurable at batch ≥ 100. 
+ let id_base = Uuid::new_v4().as_u128(); + for row in rows.iter() { + if let Some(raw_cv) = row.get(ctx.cursor_field).and_then(|v| v.as_str()) { + let cv_owned = normalize_v3_timestamp(raw_cv); + let cv: &str = &cv_owned; + validate_cursor(cv)?; + let cv_parsed = cv.parse::>().ok(); + match (cv_parsed, max_cursor_parsed) { + (Some(new_dt), Some(cur_dt)) if new_dt > cur_dt => { + max_cursor = Some(cv.to_string()); + max_cursor_parsed = Some(new_dt); + rows_at_max_cursor = 1; + } + (Some(new_dt), Some(cur_dt)) if new_dt == cur_dt => { + rows_at_max_cursor += 1; + } + (Some(new_dt), None) => { + max_cursor = Some(cv.to_string()); + max_cursor_parsed = Some(new_dt); + rows_at_max_cursor = 1; + } + (None, _) if max_cursor_parsed.is_none() => { + // Unparsable cursor — still track it (string fallback) if no + // parsable cursor has been seen yet. + max_cursor = Some(cv.to_string()); + } + _ => {} + } + } + + let payload = build_payload( + row, + ctx.payload_col, + ctx.payload_format, + ctx.include_metadata, + ctx.cursor_field, + )?; + messages.push(ProducedMessage { + id: Some(id_base.wrapping_add(messages.len() as u128)), + checksum: None, + timestamp: Some(ctx.now_micros), + origin_timestamp: Some(ctx.now_micros), + headers: None, + payload, + }); + } + + if !rows.is_empty() && max_cursor.is_none() { + return Err(Error::InvalidRecordValue(format!( + "No '{}' field found in any returned row — cursor cannot advance; \ + the connector would re-deliver the same rows on every poll. 
\ + Ensure your query selects the cursor column.", + ctx.cursor_field + ))); + } + + Ok(RowProcessingResult { + messages, + max_cursor, + rows_at_max_cursor, + }) +} + +pub(crate) async fn poll( + client: &ClientWithMiddleware, + config: &V3SourceConfig, + auth: &str, + state: &V3State, + payload_format: PayloadFormat, + include_metadata: bool, +) -> Result { + // Access config.initial_offset directly (not via the enum accessor) because + // poll() receives &V3SourceConfig — the inner struct — already matched by the + // caller in lib.rs. The enum accessor InfluxDbSourceConfig::initial_offset() + // is not available here. + let cursor = state + .last_timestamp + .clone() + .or_else(|| config.initial_offset.clone()) + .unwrap_or_else(|| "1970-01-01T00:00:00Z".to_string()); + + let base_batch = config.batch_size.unwrap_or(500); + let effective_batch = if state.effective_batch_size == 0 { + base_batch + } else { + state.effective_batch_size + }; + + let response_data = run_query( + client, + config, + auth, + &cursor, + effective_batch, + state.last_timestamp_row_offset, + ) + .await?; + let rows = parse_jsonl_rows(&response_data)?; + + let cap_factor = config + .stuck_batch_cap_factor + .unwrap_or(DEFAULT_STUCK_CAP_FACTOR); + let ctx = RowContext { + cursor_field: config + .cursor_field + .as_deref() + .unwrap_or(DEFAULT_V3_CURSOR_FIELD), + current_cursor: &cursor, + include_metadata, + payload_col: config.payload_column.as_deref(), + payload_format, + now_micros: iggy_common::Utc::now().timestamp_micros() as u64, + }; + + let result = process_rows(&rows, &ctx)?; + + // Stuck-timestamp detection: if the number of rows sharing the max cursor + // equals the effective batch size, the cursor cannot advance — inflate. + // Using rows_at_max_cursor (not all_at_cursor) works correctly with the + // default strict `> '$cursor'` semantics: those rows are NEVER at the + // input cursor, so all_at_cursor is permanently false under strict >. 
+ let stuck = result.rows_at_max_cursor >= effective_batch as u64; + + if stuck { + return match next_stuck_batch_size(effective_batch, base_batch, cap_factor) { + Some(next_batch) => { + warn!( + "InfluxDB V3 source — all {} rows share timestamp {cursor:?}; \ + inflating batch size {} → {} (cap={}×{}={})", + rows.len(), + effective_batch, + next_batch, + cap_factor, + base_batch, + base_batch.saturating_mul(cap_factor) + ); + Ok(PollResult { + messages: vec![], + new_state: V3State { + last_timestamp: state.last_timestamp.clone(), + processed_rows: state.processed_rows, + effective_batch_size: next_batch, + last_timestamp_row_offset: result.rows_at_max_cursor, + }, + schema: Schema::Json, + trip_circuit_breaker: false, + }) + } + None => { + warn!( + "InfluxDB V3 source — stuck-timestamp cap reached at batch size {effective_batch}; \ + tripping circuit breaker to prevent an infinite loop" + ); + // Reset effective_batch_size to base so the next poll after the + // circuit-breaker cool-down restarts from the configured batch size + // rather than re-entering at cap and immediately re-tripping. + // Do NOT update last_timestamp_row_offset on the trip path — no + // messages were emitted, so the offset tiebreaker is unchanged. 
+ Ok(PollResult { + messages: vec![], + new_state: V3State { + last_timestamp: state.last_timestamp.clone(), + processed_rows: state.processed_rows, + effective_batch_size: base_batch, + last_timestamp_row_offset: state.last_timestamp_row_offset, + }, + schema: Schema::Json, + trip_circuit_breaker: true, + }) + } + }; + } + + let processed_rows = state.processed_rows + result.messages.len() as u64; + let old_dt = state + .last_timestamp + .as_deref() + .and_then(|s| s.parse::>().ok()); + let advanced_cursor = match ( + result.max_cursor.as_deref(), + state.last_timestamp.as_deref(), + ) { + (Some(new), Some(_)) if old_dt.is_some_and(|dt| is_timestamp_after(new, dt)) => { + result.max_cursor + } + (Some(_), Some(_)) => { + warn!("V3 source: max_cursor did not advance past saved cursor; keeping old value"); + state.last_timestamp.clone() + } + (Some(_), None) => result.max_cursor, // first poll + _ => state.last_timestamp.clone(), // empty batch + }; + + let new_state = V3State { + last_timestamp: advanced_cursor, + processed_rows, + effective_batch_size: base_batch, // reset on successful advance + last_timestamp_row_offset: result.rows_at_max_cursor, + }; + + let schema = if ctx.payload_col.is_some() { + ctx.payload_format.schema() + } else { + Schema::Json + }; + + Ok(PollResult { + messages: result.messages, + new_state, + schema, + trip_circuit_breaker: false, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::common::{Row, RowContext}; + + fn row(pairs: &[(&str, &str)]) -> Row { + pairs + .iter() + .map(|(k, v)| (k.to_string(), serde_json::Value::String(v.to_string()))) + .collect() + } + + const T1: &str = "2024-01-01T00:00:00Z"; + const T2: &str = "2024-01-01T00:00:01Z"; + const T3: &str = "2024-01-01T00:00:02Z"; + + fn ctx(current_cursor: &str, now_micros: u64) -> RowContext<'_> { + RowContext { + cursor_field: "time", + current_cursor, + include_metadata: true, + payload_col: None, + payload_format: PayloadFormat::Json, + now_micros, + } + } 
+ + // ── process_rows ───────────────────────────────────────────────────────── + + #[test] + fn process_rows_empty_returns_empty() { + let result = process_rows(&[], &ctx(T1, 1000)).unwrap(); + assert!(result.messages.is_empty()); + assert!(result.max_cursor.is_none()); + assert_eq!( + result.rows_at_max_cursor, 0, + "empty slice has no rows at max cursor" + ); + } + + #[test] + fn process_rows_single_row_advances_cursor() { + let rows = vec![row(&[("time", T1), ("val", "1")])]; + let result = process_rows(&rows, &ctx(T1, 1000)).unwrap(); + assert_eq!(result.messages.len(), 1); + assert_eq!(result.max_cursor.as_deref(), Some(T1)); + } + + #[test] + fn process_rows_advances_to_latest_timestamp() { + let rows = vec![ + row(&[("time", T1)]), + row(&[("time", T3)]), + row(&[("time", T2)]), + ]; + let result = process_rows(&rows, &ctx(T1, 1000)).unwrap(); + assert_eq!(result.max_cursor.as_deref(), Some(T3)); + assert_eq!(result.messages.len(), 3); + } + + #[test] + fn process_rows_tied_timestamps_do_not_regress_cursor() { + let rows = vec![ + row(&[("time", T2)]), + row(&[("time", T1)]), // earlier — must not overwrite max + row(&[("time", T2)]), + ]; + let result = process_rows(&rows, &ctx(T1, 1000)).unwrap(); + assert_eq!(result.max_cursor.as_deref(), Some(T2)); + } + + #[test] + fn process_rows_row_without_cursor_field_returns_error() { + // A batch where no row has the cursor column must return Err rather than + // silently re-delivering the same rows on every poll. + let rows = vec![row(&[("val", "1")])]; // no "time" field + let err = process_rows(&rows, &ctx(T1, 1000)).unwrap_err(); + assert!( + matches!(err, Error::InvalidRecordValue(_)), + "expected InvalidRecordValue when cursor column is absent, got {err:?}" + ); + } + + #[test] + fn process_rows_all_rows_missing_cursor_field_returns_error() { + // When no row in the batch contains the cursor column, process_rows + // returns Err. 
This trips the circuit breaker via poll()'s `?`, giving + // the operator a visible failure rather than a silent infinite re-delivery. + let rows = vec![ + row(&[("val", "1")]), + row(&[("val", "2")]), + row(&[("val", "3")]), + ]; + let err = process_rows(&rows, &ctx(T1, 1000)).unwrap_err(); + assert!( + matches!(err, Error::InvalidRecordValue(_)), + "expected InvalidRecordValue when cursor column is absent, got {err:?}" + ); + } + + #[test] + fn process_rows_message_ids_are_some_and_unique() { + let rows = vec![row(&[("time", T1)]), row(&[("time", T2)])]; + let result = process_rows(&rows, &ctx(T1, 1000)).unwrap(); + assert!(result.messages[0].id.is_some()); + assert!(result.messages[1].id.is_some()); + assert_ne!(result.messages[0].id, result.messages[1].id); + } + + #[test] + fn process_rows_message_timestamps_use_now_micros() { + let rows = vec![row(&[("time", T1)])]; + let result = process_rows(&rows, &ctx(T1, 888_888)).unwrap(); + assert_eq!(result.messages[0].timestamp, Some(888_888)); + assert_eq!(result.messages[0].origin_timestamp, Some(888_888)); + } + + #[test] + fn process_rows_text_payload_format() { + use base64::{Engine as _, engine::general_purpose}; + let encoded = general_purpose::STANDARD.encode(b"hello"); + let rows = vec![row(&[("time", T1), ("payload", &encoded)])]; + let result = process_rows( + &rows, + &RowContext { + cursor_field: "time", + current_cursor: T1, + include_metadata: true, + payload_col: Some("payload"), + payload_format: PayloadFormat::Text, + now_micros: 1000, + }, + ) + .unwrap(); + assert_eq!(result.messages.len(), 1); + } + + // ── rows_at_max_cursor / stuck-batch ───────────────────────────────────── + + #[test] + fn process_rows_rows_at_max_cursor_counts_rows_sharing_max_timestamp() { + // Two rows at T1: both equal max_cursor → rows_at_max_cursor = 2. 
+ let rows = vec![row(&[("time", T1)]), row(&[("time", T1)])]; + let result = process_rows(&rows, &ctx(T1, 1000)).unwrap(); + assert_eq!(result.rows_at_max_cursor, 2); + } + + #[test] + fn process_rows_rows_at_max_cursor_resets_when_cursor_advances() { + // T1 then T2: max_cursor = T2, only 1 row at T2 → rows_at_max_cursor = 1. + let rows = vec![row(&[("time", T1)]), row(&[("time", T2)])]; + let result = process_rows(&rows, &ctx(T1, 1000)).unwrap(); + assert_eq!(result.rows_at_max_cursor, 1); + assert_eq!(result.max_cursor.as_deref(), Some(T2)); + } + + #[test] + fn process_rows_rows_at_max_cursor_zero_for_empty_slice() { + let result = process_rows(&[], &ctx(T1, 1000)).unwrap(); + assert_eq!(result.rows_at_max_cursor, 0); + } + + // ── next_stuck_batch_size ──────────────────────────────────────────────── + + #[test] + fn next_stuck_batch_size_doubles_until_cap() { + assert_eq!(next_stuck_batch_size(500, 500, 10), Some(1000)); + assert_eq!(next_stuck_batch_size(1000, 500, 10), Some(2000)); + assert_eq!(next_stuck_batch_size(4000, 500, 10), Some(5000)); + assert_eq!(next_stuck_batch_size(5000, 500, 10), None); + } + + // ── normalize_v3_timestamp ──────────────────────────────────────────────── + + #[test] + fn normalize_already_valid_rfc3339_unchanged() { + // Already valid RFC 3339 with Z and ms precision — must be returned as-is. + assert_eq!( + normalize_v3_timestamp("2024-01-01T00:00:00.123Z"), + "2024-01-01T00:00:00.123Z" + ); + // Second-precision with Z is also ≤ms, returned unchanged. + assert_eq!( + normalize_v3_timestamp("2024-01-01T00:00:00Z"), + "2024-01-01T00:00:00Z" + ); + } + + #[test] + fn normalize_no_tz_nanoseconds_appends_z_only() { + // InfluxDB 3 Core returns timestamps like this — 9 fractional digits, no Z. + // Full nanosecond precision must be preserved (not truncated to ms). 
+ let result = normalize_v3_timestamp("2026-04-26T02:32:20.526360865"); + assert_eq!(result, "2026-04-26T02:32:20.526360865Z"); + } + + #[test] + fn normalize_no_tz_milliseconds_appends_z() { + // No timezone suffix, ms precision — just append Z. + let result = normalize_v3_timestamp("2026-04-26T02:32:20.526"); + assert_eq!(result, "2026-04-26T02:32:20.526Z"); + } + + #[test] + fn normalize_rfc3339_sub_ms_precision_returned_unchanged() { + // Already valid RFC 3339 with Z and nanoseconds — returned as-is. + let result = normalize_v3_timestamp("2026-04-26T02:32:20.526360865Z"); + assert_eq!(result, "2026-04-26T02:32:20.526360865Z"); + } + + #[test] + fn normalize_invalid_returns_with_z_appended() { + // Unparsable string — append Z and return (validate_cursor will reject it later). + let result = normalize_v3_timestamp("not-a-timestamp"); + assert_eq!(result, "not-a-timestampZ"); + } + + #[test] + fn process_rows_accepts_influxdb3_no_tz_timestamps() { + // Regression test: process_rows must not return Err when timestamps lack Z suffix. + // Full nanosecond precision must be preserved so the cursor is exact. + let rows = vec![ + row(&[("time", "2026-04-26T02:32:20.526360865"), ("val", "1")]), + row(&[("time", "2026-04-26T02:32:21.000000000"), ("val", "2")]), + ]; + let c = ctx("2026-04-26T02:32:19.000Z", 0); + let result = process_rows(&rows, &c).expect("should not fail on bare timestamps"); + assert_eq!(result.messages.len(), 2); + assert_eq!( + result.max_cursor.as_deref(), + Some("2026-04-26T02:32:21.000000000Z") + ); + } + + #[test] + fn process_rows_sub_ms_timestamps_have_distinct_cursors() { + // Regression: rows within the same millisecond must NOT get the same ms cursor, + // which would cause re-delivery. Each row's nanosecond cursor must be preserved. 
+ let rows = vec![ + row(&[("time", "2026-04-26T02:32:20.526360000"), ("val", "a")]), + row(&[("time", "2026-04-26T02:32:20.526361000"), ("val", "b")]), + row(&[("time", "2026-04-26T02:32:20.526362000"), ("val", "c")]), + ]; + let c = ctx("2026-04-26T02:32:19.000Z", 0); + let result = process_rows(&rows, &c).expect("should succeed"); + // max_cursor must be the latest nanosecond timestamp (row 3), not a truncated ms. + assert_eq!( + result.max_cursor.as_deref(), + Some("2026-04-26T02:32:20.526362000Z") + ); + // Only row 3 is at max_cursor. + assert_eq!(result.rows_at_max_cursor, 1); + } +} + +#[cfg(test)] +mod http_tests { + use super::*; + use axum::Router; + use axum::extract::Request; + use axum::http::{HeaderMap, StatusCode}; + use axum::routing::post; + use secrecy::SecretString; + use std::sync::Arc; + use std::time::Duration; + use tokio::sync::Mutex; + + // ── helpers ─────────────────────────────────────────────────────────────── + + async fn start_server(router: Router) -> String { + let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap(); + let port = listener.local_addr().unwrap().port(); + tokio::spawn(async move { + axum::serve(listener, router).await.unwrap(); + }); + format!("http://127.0.0.1:{port}") + } + + fn make_client() -> ClientWithMiddleware { + let raw = reqwest::Client::builder() + .timeout(Duration::from_secs(5)) + .build() + .unwrap(); + iggy_connector_sdk::retry::build_retry_client( + raw, + 1, + Duration::from_millis(1), + Duration::from_millis(10), + "test", + ) + } + + fn make_config(url: &str) -> V3SourceConfig { + V3SourceConfig { + url: url.to_string(), + db: "test_db".to_string(), + token: SecretString::from("test_token"), + query: "SELECT * FROM t WHERE time > '$cursor' LIMIT $limit OFFSET $offset".to_string(), + poll_interval: None, + batch_size: Some(10), + cursor_field: None, + initial_offset: None, + payload_column: None, + payload_format: None, + include_metadata: None, + verbose_logging: None, + 
max_retries: Some(1), + retry_delay: Some("1ms".to_string()), + timeout: Some("5s".to_string()), + max_open_retries: Some(1), + open_retry_max_delay: Some("10ms".to_string()), + retry_max_delay: Some("10ms".to_string()), + circuit_breaker_threshold: None, + circuit_breaker_cool_down: None, + stuck_batch_cap_factor: None, + } + } + + const CURSOR: &str = "1970-01-01T00:00:00Z"; + + // ── run_query ───────────────────────────────────────────────────────────── + + #[tokio::test] + async fn run_query_returns_jsonl_body_on_200() { + let jsonl = r#"{"time":"2024-01-01T00:00:00Z","val":1}"#; + let app = Router::new().route( + "/api/v3/query_sql", + post(move || async move { (StatusCode::OK, jsonl) }), + ); + let base = start_server(app).await; + let result = run_query( + &make_client(), + &make_config(&base), + "Bearer tok", + CURSOR, + 10, + 0, + ) + .await + .unwrap(); + assert!(result.contains("val")); + assert!(result.contains("2024-01-01")); + } + + #[tokio::test] + async fn run_query_empty_body_on_200() { + let app = Router::new().route("/api/v3/query_sql", post(|| async { (StatusCode::OK, "") })); + let base = start_server(app).await; + let result = run_query( + &make_client(), + &make_config(&base), + "Bearer tok", + CURSOR, + 10, + 0, + ) + .await + .unwrap(); + assert!(result.is_empty()); + } + + /// V3-specific: 404 with body containing "database not found" must return + /// an empty string rather than an error (namespace not yet written to). + #[tokio::test] + async fn run_query_404_database_not_found_returns_empty_string() { + let app = Router::new().route( + "/api/v3/query_sql", + post(|| async { (StatusCode::NOT_FOUND, "database not found") }), + ); + let base = start_server(app).await; + let result = run_query( + &make_client(), + &make_config(&base), + "Bearer tok", + CURSOR, + 10, + 0, + ) + .await + .unwrap(); + assert!(result.is_empty()); + } + + /// Any other 404 body must NOT be swallowed — it is a permanent error. 
+ #[tokio::test] + async fn run_query_404_other_body_returns_permanent_error() { + let app = Router::new().route( + "/api/v3/query_sql", + post(|| async { (StatusCode::NOT_FOUND, "table not found") }), + ); + let base = start_server(app).await; + let result = run_query( + &make_client(), + &make_config(&base), + "Bearer tok", + CURSOR, + 10, + 0, + ) + .await; + assert!(matches!(result, Err(Error::PermanentHttpError(_)))); + } + + #[tokio::test] + async fn run_query_500_returns_transient_error() { + let app = Router::new().route( + "/api/v3/query_sql", + post(|| async { StatusCode::INTERNAL_SERVER_ERROR }), + ); + let base = start_server(app).await; + let result = run_query( + &make_client(), + &make_config(&base), + "Bearer tok", + CURSOR, + 10, + 0, + ) + .await; + assert!(matches!(result, Err(Error::Storage(_)))); + } + + #[tokio::test] + async fn run_query_400_returns_permanent_error() { + let app = Router::new().route( + "/api/v3/query_sql", + post(|| async { StatusCode::BAD_REQUEST }), + ); + let base = start_server(app).await; + let result = run_query( + &make_client(), + &make_config(&base), + "Bearer tok", + CURSOR, + 10, + 0, + ) + .await; + assert!(matches!(result, Err(Error::PermanentHttpError(_)))); + } + + #[tokio::test] + async fn run_query_sends_bearer_authorization_header() { + let captured: Arc> = Arc::new(Mutex::new(String::new())); + let cap2 = captured.clone(); + let app = Router::new().route( + "/api/v3/query_sql", + post(move |headers: HeaderMap| { + let cap = cap2.clone(); + async move { + *cap.lock().await = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .unwrap_or("") + .to_string(); + StatusCode::OK + } + }), + ); + let base = start_server(app).await; + let _ = run_query( + &make_client(), + &make_config(&base), + "Bearer my_token", + CURSOR, + 10, + 0, + ) + .await; + assert_eq!(*captured.lock().await, "Bearer my_token"); + } + + #[tokio::test] + async fn run_query_request_body_contains_db_and_substituted_cursor() { + 
let captured_body: Arc> = Arc::new(Mutex::new(String::new())); + let cap2 = captured_body.clone(); + let app = Router::new().route( + "/api/v3/query_sql", + post(move |request: Request| { + let cap = cap2.clone(); + async move { + let bytes = axum::body::to_bytes(request.into_body(), usize::MAX) + .await + .unwrap(); + *cap.lock().await = String::from_utf8_lossy(&bytes).to_string(); + StatusCode::OK + } + }), + ); + let base = start_server(app).await; + let cursor = "2024-06-01T00:00:00Z"; + let _ = run_query( + &make_client(), + &make_config(&base), + "Bearer tok", + cursor, + 10, + 0, + ) + .await; + let body = captured_body.lock().await; + assert!(body.contains("test_db"), "body should include db: {body}"); + assert!(body.contains(cursor), "body should include cursor: {body}"); + assert!( + !body.contains("$cursor"), + "raw placeholder must not appear: {body}" + ); + } + + // ── poll() end-to-end ───────────────────────────────────────────────────── + + #[tokio::test] + async fn poll_returns_messages_for_jsonl_response() { + let jsonl = "{\"time\":\"2024-01-01T00:00:01Z\",\"val\":1}\n\ + {\"time\":\"2024-01-01T00:00:02Z\",\"val\":2}\n"; + let app = Router::new().route( + "/api/v3/query_sql", + post(move || async move { (StatusCode::OK, jsonl) }), + ); + let base = start_server(app).await; + let state = V3State::default(); + let result = poll( + &make_client(), + &make_config(&base), + "Bearer tok", + &state, + PayloadFormat::Json, + true, + ) + .await + .unwrap(); + assert_eq!(result.messages.len(), 2); + assert_eq!( + result.new_state.last_timestamp.as_deref(), + Some("2024-01-01T00:00:02Z") + ); + assert!(!result.trip_circuit_breaker); + assert_eq!(result.schema, Schema::Json); + } + + #[tokio::test] + async fn poll_advances_cursor_to_latest_out_of_order_timestamp() { + let jsonl = "{\"time\":\"2024-01-01T00:00:01Z\",\"v\":1}\n\ + {\"time\":\"2024-01-01T00:00:03Z\",\"v\":3}\n\ + {\"time\":\"2024-01-01T00:00:02Z\",\"v\":2}\n"; + let app = Router::new().route( + 
"/api/v3/query_sql", + post(move || async move { (StatusCode::OK, jsonl) }), + ); + let base = start_server(app).await; + let state = V3State::default(); + let result = poll( + &make_client(), + &make_config(&base), + "Bearer tok", + &state, + PayloadFormat::Json, + true, + ) + .await + .unwrap(); + assert_eq!(result.messages.len(), 3); + assert_eq!( + result.new_state.last_timestamp.as_deref(), + Some("2024-01-01T00:00:03Z") + ); + } + + #[tokio::test] + async fn poll_empty_jsonl_returns_no_messages() { + let app = Router::new().route("/api/v3/query_sql", post(|| async { (StatusCode::OK, "") })); + let base = start_server(app).await; + let state = V3State { + last_timestamp: Some("2024-01-01T00:00:00Z".to_string()), + ..V3State::default() + }; + let result = poll( + &make_client(), + &make_config(&base), + "Bearer tok", + &state, + PayloadFormat::Json, + true, + ) + .await + .unwrap(); + assert!(result.messages.is_empty()); + assert!(!result.trip_circuit_breaker); + // Cursor must not regress + assert_eq!( + result.new_state.last_timestamp.as_deref(), + Some("2024-01-01T00:00:00Z") + ); + } + + #[tokio::test] + async fn poll_detects_stuck_batch_and_doubles_batch_size() { + // All batch_size rows share the same timestamp as the cursor → stuck. + // Expected: no messages produced, effective_batch_size doubled. 
+ let t = "2024-01-01T00:00:00Z"; + let jsonl: String = (0..10) + .map(|i| format!("{{\"time\":\"{t}\",\"val\":{i}}}\n")) + .collect(); + let app = Router::new().route( + "/api/v3/query_sql", + post(move || async move { (StatusCode::OK, jsonl) }), + ); + let base = start_server(app).await; + // cursor = t so every row matches it + let state = V3State { + last_timestamp: Some(t.to_string()), + effective_batch_size: 10, + processed_rows: 0, + last_timestamp_row_offset: 0, + }; + let result = poll( + &make_client(), + &make_config(&base), + "Bearer tok", + &state, + PayloadFormat::Json, + true, + ) + .await + .unwrap(); + assert!( + result.messages.is_empty(), + "stuck batch must produce no messages" + ); + assert_eq!(result.new_state.effective_batch_size, 20, "should double"); + assert!(!result.trip_circuit_breaker); + // Cursor must not change + assert_eq!(result.new_state.last_timestamp.as_deref(), Some(t)); + } + + #[tokio::test] + async fn poll_trips_circuit_breaker_when_stuck_cap_reached() { + // cap_factor=1 → cap = batch_size × 1 = 10. + // effective_batch_size is already 10 (= cap) → next_stuck_batch_size returns None. 
+ let t = "2024-01-01T00:00:00Z"; + let jsonl: String = (0..10) + .map(|i| format!("{{\"time\":\"{t}\",\"val\":{i}}}\n")) + .collect(); + let app = Router::new().route( + "/api/v3/query_sql", + post(move || async move { (StatusCode::OK, jsonl) }), + ); + let base = start_server(app).await; + let config = V3SourceConfig { + stuck_batch_cap_factor: Some(1), + ..make_config(&base) + }; + let state = V3State { + last_timestamp: Some(t.to_string()), + effective_batch_size: 10, + processed_rows: 0, + last_timestamp_row_offset: 0, + }; + let result = poll( + &make_client(), + &config, + "Bearer tok", + &state, + PayloadFormat::Json, + true, + ) + .await + .unwrap(); + assert!(result.trip_circuit_breaker, "must trip when at cap"); + assert!(result.messages.is_empty()); + } + + #[tokio::test] + async fn poll_resets_effective_batch_size_on_cursor_advance() { + // State has an inflated batch size from a previous stuck run. + // When the cursor advances the batch size must reset to the base value. + let jsonl = "{\"time\":\"2024-01-01T00:00:01Z\",\"v\":1}\n\ + {\"time\":\"2024-01-01T00:00:02Z\",\"v\":2}\n"; + let app = Router::new().route( + "/api/v3/query_sql", + post(move || async move { (StatusCode::OK, jsonl) }), + ); + let base = start_server(app).await; + let state = V3State { + effective_batch_size: 5000, + ..V3State::default() + }; + let result = poll( + &make_client(), + &make_config(&base), + "Bearer tok", + &state, + PayloadFormat::Json, + true, + ) + .await + .unwrap(); + // make_config has batch_size=10 → base_batch=10 + assert_eq!( + result.new_state.effective_batch_size, 10, + "should reset to base" + ); + assert_eq!(result.messages.len(), 2); + } + + #[tokio::test] + async fn poll_accumulates_processed_rows_in_state() { + let jsonl = "{\"time\":\"2024-01-01T00:00:01Z\",\"v\":1}\n\ + {\"time\":\"2024-01-01T00:00:02Z\",\"v\":2}\n"; + let app = Router::new().route( + "/api/v3/query_sql", + post(move || async move { (StatusCode::OK, jsonl) }), + ); + let base = 
start_server(app).await; + let state = V3State { + processed_rows: 7, + ..V3State::default() + }; + let result = poll( + &make_client(), + &make_config(&base), + "Bearer tok", + &state, + PayloadFormat::Json, + true, + ) + .await + .unwrap(); + assert_eq!(result.new_state.processed_rows, 9); // 7 prior + 2 new + } + + #[tokio::test] + async fn poll_propagates_transient_http_error() { + let app = Router::new().route( + "/api/v3/query_sql", + post(|| async { StatusCode::INTERNAL_SERVER_ERROR }), + ); + let base = start_server(app).await; + let state = V3State::default(); + let result = poll( + &make_client(), + &make_config(&base), + "Bearer tok", + &state, + PayloadFormat::Json, + true, + ) + .await; + assert!(matches!(result, Err(Error::Storage(_)))); + } + + #[tokio::test] + async fn poll_permanent_http_error_propagates() { + let app = Router::new().route( + "/api/v3/query_sql", + post(|| async { StatusCode::BAD_REQUEST }), + ); + let base = start_server(app).await; + let state = V3State::default(); + let result = poll( + &make_client(), + &make_config(&base), + "Bearer tok", + &state, + PayloadFormat::Json, + true, + ) + .await; + assert!(matches!(result, Err(Error::PermanentHttpError(_)))); + } + + // ── build_query ────────────────────────────────────────────────────────── + + const BASE: &str = "http://localhost:8181"; + + #[test] + fn build_query_url_path_and_body_fields() { + let (url, body) = build_query(BASE, "SELECT * FROM cpu LIMIT 10", "sensors").unwrap(); + assert!( + url.path().ends_with("/api/v3/query_sql"), + "wrong path: {}", + url.path() + ); + assert!( + url.query().is_none_or(|q| !q.contains("org=")), + "org must not appear in URL" + ); + assert_eq!(body["db"].as_str(), Some("sensors")); + assert_eq!(body["format"].as_str(), Some("jsonl")); + assert!(body["q"].as_str().unwrap().contains("SELECT")); + } + + #[test] + fn build_query_format_is_always_jsonl() { + let (_, body) = build_query(BASE, "SELECT 1", "db").unwrap(); + 
assert_eq!(body["format"].as_str(), Some("jsonl")); + } + + #[test] + fn build_query_invalid_base_returns_error() { + assert!(build_query("not-a-url", "SELECT 1", "db").is_err()); + } +} diff --git a/core/integration/tests/connectors/fixtures/influxdb/container.rs b/core/integration/tests/connectors/fixtures/influxdb/container.rs index e77ae7f115..dcf707c52c 100644 --- a/core/integration/tests/connectors/fixtures/influxdb/container.rs +++ b/core/integration/tests/connectors/fixtures/influxdb/container.rs @@ -47,6 +47,9 @@ pub const DEFAULT_TEST_TOPIC: &str = "test_topic"; // ── env-var keys injected into the connectors runtime ──────────────────────── +pub const ENV_SOURCE_VERSION: &str = "IGGY_CONNECTORS_SOURCE_INFLUXDB_PLUGIN_CONFIG_VERSION"; +pub const ENV_SINK_VERSION: &str = "IGGY_CONNECTORS_SINK_INFLUXDB_PLUGIN_CONFIG_VERSION"; + pub const ENV_SOURCE_URL: &str = "IGGY_CONNECTORS_SOURCE_INFLUXDB_PLUGIN_CONFIG_URL"; pub const ENV_SOURCE_ORG: &str = "IGGY_CONNECTORS_SOURCE_INFLUXDB_PLUGIN_CONFIG_ORG"; pub const ENV_SOURCE_TOKEN: &str = "IGGY_CONNECTORS_SOURCE_INFLUXDB_PLUGIN_CONFIG_TOKEN"; @@ -121,14 +124,16 @@ impl InfluxDbContainer { message: format!("Failed to start container: {e}"), })?; - let mapped_port = container + let ports = container .ports() .await .map_err(|e| TestBinaryError::FixtureSetup { fixture_type: "InfluxDbContainer".to_string(), message: format!("Failed to get ports: {e}"), - })? 
+ })?; + let mapped_port = ports .map_to_host_port_ipv4(INFLUXDB_PORT) + .or_else(|| ports.map_to_host_port_ipv6(INFLUXDB_PORT)) .ok_or_else(|| TestBinaryError::FixtureSetup { fixture_type: "InfluxDbContainer".to_string(), message: "No mapping for InfluxDB port".to_string(), diff --git a/core/integration/tests/connectors/fixtures/influxdb/container_v3.rs b/core/integration/tests/connectors/fixtures/influxdb/container_v3.rs new file mode 100644 index 0000000000..0752dbd734 --- /dev/null +++ b/core/integration/tests/connectors/fixtures/influxdb/container_v3.rs @@ -0,0 +1,247 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +//! InfluxDB 3.x test container and shared HTTP operations. +//! +//! InfluxDB 3.x uses a simplified data model: +//! - No `org` required in write/query URLs. +//! - `db` replaces `bucket`. +//! - Write: `POST /api/v3/write_lp?db=X&precision=P` +//! - Query: `POST /api/v3/query_sql` with `{"db":…,"q":…,"format":"jsonl"}` +//! 
- Auth: `Authorization: Bearer {token}` + +use integration::harness::TestBinaryError; +use reqwest_middleware::ClientWithMiddleware as HttpClient; +use testcontainers_modules::testcontainers::core::{IntoContainerPort, WaitFor}; +use testcontainers_modules::testcontainers::runners::AsyncRunner; +use testcontainers_modules::testcontainers::{ContainerAsync, GenericImage, ImageExt}; +use tracing::info; + +// InfluxDB 3.x Core Docker image. +// `influxdb:3-core` is the official OSS image for InfluxDB 3 Core on Docker Hub. +const INFLUXDB3_IMAGE: &str = "influxdb"; +const INFLUXDB3_TAG: &str = "3-core"; +const INFLUXDB3_PORT: u16 = 8181; + +pub const INFLUXDB3_DB: &str = "iggy-test-db"; +pub const INFLUXDB3_TOKEN: &str = "iggy-v3-test-token"; + +/// Number of `/ping` attempts before giving up. +pub const HEALTH_CHECK_ATTEMPTS_V3: usize = 60; +/// Milliseconds between each `/ping` attempt. +pub const HEALTH_CHECK_INTERVAL_MS_V3: u64 = 1_000; + +pub const DEFAULT_TEST_STREAM_V3: &str = "test_stream"; +pub const DEFAULT_TEST_TOPIC_V3: &str = "test_topic"; + +// ── env-var keys injected into the connectors runtime ──────────────────────── + +pub const ENV_V3_SINK_URL: &str = "IGGY_CONNECTORS_SINK_INFLUXDB_PLUGIN_CONFIG_URL"; +pub const ENV_V3_SINK_TOKEN: &str = "IGGY_CONNECTORS_SINK_INFLUXDB_PLUGIN_CONFIG_TOKEN"; +pub const ENV_V3_SINK_DB: &str = "IGGY_CONNECTORS_SINK_INFLUXDB_PLUGIN_CONFIG_DB"; +pub const ENV_V3_SINK_VERSION: &str = "IGGY_CONNECTORS_SINK_INFLUXDB_PLUGIN_CONFIG_VERSION"; +pub const ENV_V3_SINK_ORG: &str = "IGGY_CONNECTORS_SINK_INFLUXDB_PLUGIN_CONFIG_ORG"; +pub const ENV_V3_SINK_BUCKET: &str = "IGGY_CONNECTORS_SINK_INFLUXDB_PLUGIN_CONFIG_BUCKET"; +pub const ENV_V3_SINK_STREAMS_0_STREAM: &str = "IGGY_CONNECTORS_SINK_INFLUXDB_STREAMS_0_STREAM"; +pub const ENV_V3_SINK_STREAMS_0_TOPICS: &str = "IGGY_CONNECTORS_SINK_INFLUXDB_STREAMS_0_TOPICS"; +pub const ENV_V3_SINK_STREAMS_0_SCHEMA: &str = "IGGY_CONNECTORS_SINK_INFLUXDB_STREAMS_0_SCHEMA"; +pub const 
ENV_V3_SINK_STREAMS_0_CONSUMER_GROUP: &str = + "IGGY_CONNECTORS_SINK_INFLUXDB_STREAMS_0_CONSUMER_GROUP"; +pub const ENV_V3_SINK_PATH: &str = "IGGY_CONNECTORS_SINK_INFLUXDB_PATH"; +pub const ENV_V3_SINK_PAYLOAD_FORMAT: &str = + "IGGY_CONNECTORS_SINK_INFLUXDB_PLUGIN_CONFIG_PAYLOAD_FORMAT"; + +pub const ENV_V3_SOURCE_URL: &str = "IGGY_CONNECTORS_SOURCE_INFLUXDB_PLUGIN_CONFIG_URL"; +pub const ENV_V3_SOURCE_TOKEN: &str = "IGGY_CONNECTORS_SOURCE_INFLUXDB_PLUGIN_CONFIG_TOKEN"; +pub const ENV_V3_SOURCE_DB: &str = "IGGY_CONNECTORS_SOURCE_INFLUXDB_PLUGIN_CONFIG_DB"; +pub const ENV_V3_SOURCE_VERSION: &str = "IGGY_CONNECTORS_SOURCE_INFLUXDB_PLUGIN_CONFIG_VERSION"; +pub const ENV_V3_SOURCE_ORG: &str = "IGGY_CONNECTORS_SOURCE_INFLUXDB_PLUGIN_CONFIG_ORG"; +pub const ENV_V3_SOURCE_QUERY: &str = "IGGY_CONNECTORS_SOURCE_INFLUXDB_PLUGIN_CONFIG_QUERY"; +pub const ENV_V3_SOURCE_POLL_INTERVAL: &str = + "IGGY_CONNECTORS_SOURCE_INFLUXDB_PLUGIN_CONFIG_POLL_INTERVAL"; +pub const ENV_V3_SOURCE_BATCH_SIZE: &str = + "IGGY_CONNECTORS_SOURCE_INFLUXDB_PLUGIN_CONFIG_BATCH_SIZE"; +pub const ENV_V3_SOURCE_STREAMS_0_STREAM: &str = "IGGY_CONNECTORS_SOURCE_INFLUXDB_STREAMS_0_STREAM"; +pub const ENV_V3_SOURCE_STREAMS_0_TOPIC: &str = "IGGY_CONNECTORS_SOURCE_INFLUXDB_STREAMS_0_TOPIC"; +pub const ENV_V3_SOURCE_STREAMS_0_SCHEMA: &str = "IGGY_CONNECTORS_SOURCE_INFLUXDB_STREAMS_0_SCHEMA"; +pub const ENV_V3_SOURCE_CURSOR_FIELD: &str = + "IGGY_CONNECTORS_SOURCE_INFLUXDB_PLUGIN_CONFIG_CURSOR_FIELD"; +pub const ENV_V3_SOURCE_PATH: &str = "IGGY_CONNECTORS_SOURCE_INFLUXDB_PATH"; +pub const ENV_V3_SOURCE_PAYLOAD_FORMAT: &str = + "IGGY_CONNECTORS_SOURCE_INFLUXDB_PLUGIN_CONFIG_PAYLOAD_FORMAT"; + +// ── Container ──────────────────────────────────────────────────────────────── + +pub struct InfluxDb3Container { + #[allow(dead_code)] + container: ContainerAsync, + pub base_url: String, +} + +impl InfluxDb3Container { + pub async fn start() -> Result { + let container: ContainerAsync = + 
GenericImage::new(INFLUXDB3_IMAGE, INFLUXDB3_TAG) + .with_exposed_port(INFLUXDB3_PORT.tcp()) + // InfluxDB 3 Core logs "startup time:" on stdout when the HTTP + // listener is accepting connections. + .with_wait_for(WaitFor::message_on_stdout("startup time:")) + .with_startup_timeout(std::time::Duration::from_secs(60)) + .with_mapped_port(0, INFLUXDB3_PORT.tcp()) + // The influxdb:3-core image has no ENTRYPOINT, so the full + // binary invocation must be the CMD. `--object-store memory` + // uses an ephemeral in-memory store (no disk I/O, perfect for + // tests). `--without-auth` disables token auth so test + // fixtures can write/query without managing real tokens. + .with_cmd(vec![ + "influxdb3", + "serve", + "--node-id", + "node0", + "--object-store", + "memory", + "--without-auth", + ]) + .start() + .await + .map_err(|e| TestBinaryError::FixtureSetup { + fixture_type: "InfluxDb3Container".to_string(), + message: format!("Failed to start container: {e}"), + })?; + + let ports = container + .ports() + .await + .map_err(|e| TestBinaryError::FixtureSetup { + fixture_type: "InfluxDb3Container".to_string(), + message: format!("Failed to get ports: {e}"), + })?; + let mapped_port = ports + .map_to_host_port_ipv4(INFLUXDB3_PORT) + .or_else(|| ports.map_to_host_port_ipv6(INFLUXDB3_PORT)) + .ok_or_else(|| TestBinaryError::FixtureSetup { + fixture_type: "InfluxDb3Container".to_string(), + message: "No mapping for InfluxDB 3 port".to_string(), + })?; + + let base_url = format!("http://localhost:{mapped_port}"); + info!("InfluxDB 3 container available at {base_url}"); + + Ok(Self { + container, + base_url, + }) + } +} + +// ── HTTP client ─────────────────────────────────────────────────────────────── + +pub fn create_http_client_v3() -> HttpClient { + let client = reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(30)) + .build() + .expect("Failed to build HTTP client"); + reqwest_middleware::ClientBuilder::new(client).build() +} + +// ── Shared InfluxDB 
3 operations ────────────────────────────────────────────── + +pub trait InfluxDb3Ops: Sync { + fn container(&self) -> &InfluxDb3Container; + fn http_client(&self) -> &HttpClient; + + /// Write line-protocol lines into the test database. + fn write_lines( + &self, + lines: &[&str], + ) -> impl std::future::Future> + Send { + async move { + let url = format!( + "{}/api/v3/write_lp?db={}&precision=ns", + self.container().base_url, + INFLUXDB3_DB, + ); + let body = lines.join("\n"); + + let response = self + .http_client() + .post(&url) + .header("Authorization", format!("Bearer {INFLUXDB3_TOKEN}")) + .header("Content-Type", "text/plain; charset=utf-8") + .body(body) + .send() + .await + .map_err(|e| TestBinaryError::FixtureSetup { + fixture_type: "InfluxDb3Ops".to_string(), + message: format!("Failed to write lines: {e}"), + })?; + + let status = response.status(); + if !status.is_success() { + let body = response.text().await.unwrap_or_default(); + return Err(TestBinaryError::FixtureSetup { + fixture_type: "InfluxDb3Ops".to_string(), + message: format!("Write error: status={status}, body={body}"), + }); + } + Ok(()) + } + } + + /// Count rows matching a SQL query in the test database. + fn query_count( + &self, + sql: &str, + ) -> impl std::future::Future> + Send { + async move { + let url = format!("{}/api/v3/query_sql", self.container().base_url); + let body = serde_json::json!({ + "db": INFLUXDB3_DB, + "q": sql, + "format": "jsonl" + }); + + let response = match self + .http_client() + .post(&url) + .header("Authorization", format!("Bearer {INFLUXDB3_TOKEN}")) + .header("Content-Type", "application/json") + .json(&body) + .send() + .await + { + Ok(r) => r, + // Network / timeout errors — table likely not yet created. + Err(_) => return Ok(0), + }; + + // InfluxDB 3 returns 5xx when the table/namespace does not yet + // exist. Treat any non-2xx as "0 rows" so the polling loop keeps + // going without triggering the retry middleware. 
+ if !response.status().is_success() { + return Ok(0); + } + + let text = response.text().await.unwrap_or_default(); + // JSONL: count non-empty lines — each is one result row. + let count = text.lines().filter(|l| !l.trim().is_empty()).count(); + Ok(count) + } + } +} diff --git a/core/integration/tests/connectors/fixtures/influxdb/mod.rs b/core/integration/tests/connectors/fixtures/influxdb/mod.rs index 0928b5d270..7b6f1597c1 100644 --- a/core/integration/tests/connectors/fixtures/influxdb/mod.rs +++ b/core/integration/tests/connectors/fixtures/influxdb/mod.rs @@ -18,11 +18,16 @@ */ pub mod container; +pub mod container_v3; pub mod sink; +pub mod sink_v3; pub mod source; +pub mod source_v3; pub use sink::{ InfluxDbSinkBase64Fixture, InfluxDbSinkFixture, InfluxDbSinkNoMetadataFixture, InfluxDbSinkNsPrecisionFixture, InfluxDbSinkTextFixture, }; +pub use sink_v3::InfluxDb3SinkFixture; pub use source::{InfluxDbSourceFixture, InfluxDbSourceRawFixture, InfluxDbSourceTextFixture}; +pub use source_v3::InfluxDb3SourceFixture; diff --git a/core/integration/tests/connectors/fixtures/influxdb/sink.rs b/core/integration/tests/connectors/fixtures/influxdb/sink.rs index 00a009cfa4..aa3fa59a7d 100644 --- a/core/integration/tests/connectors/fixtures/influxdb/sink.rs +++ b/core/integration/tests/connectors/fixtures/influxdb/sink.rs @@ -23,8 +23,9 @@ use super::container::{ ENV_SINK_INCLUDE_STREAM_TAG, ENV_SINK_INCLUDE_TOPIC_TAG, ENV_SINK_ORG, ENV_SINK_PATH, ENV_SINK_PAYLOAD_FORMAT, ENV_SINK_PRECISION, ENV_SINK_STREAMS_0_CONSUMER_GROUP, ENV_SINK_STREAMS_0_SCHEMA, ENV_SINK_STREAMS_0_STREAM, ENV_SINK_STREAMS_0_TOPICS, - ENV_SINK_TOKEN, ENV_SINK_URL, HEALTH_CHECK_ATTEMPTS, HEALTH_CHECK_INTERVAL_MS, INFLUXDB_BUCKET, - INFLUXDB_ORG, INFLUXDB_TOKEN, InfluxDbContainer, InfluxDbOps, create_http_client, + ENV_SINK_TOKEN, ENV_SINK_URL, ENV_SINK_VERSION, HEALTH_CHECK_ATTEMPTS, + HEALTH_CHECK_INTERVAL_MS, INFLUXDB_BUCKET, INFLUXDB_ORG, INFLUXDB_TOKEN, InfluxDbContainer, + InfluxDbOps, 
create_http_client, }; use async_trait::async_trait; use integration::harness::{TestBinaryError, TestFixture}; @@ -143,6 +144,7 @@ impl TestFixture for InfluxDbSinkFixture { fn connectors_runtime_envs(&self) -> HashMap { let mut envs = HashMap::new(); + envs.insert(ENV_SINK_VERSION.to_string(), "v2".to_string()); envs.insert(ENV_SINK_URL.to_string(), self.container.base_url.clone()); envs.insert(ENV_SINK_ORG.to_string(), INFLUXDB_ORG.to_string()); envs.insert(ENV_SINK_TOKEN.to_string(), INFLUXDB_TOKEN.to_string()); diff --git a/core/integration/tests/connectors/fixtures/influxdb/sink_v3.rs b/core/integration/tests/connectors/fixtures/influxdb/sink_v3.rs new file mode 100644 index 0000000000..94e3eda7f9 --- /dev/null +++ b/core/integration/tests/connectors/fixtures/influxdb/sink_v3.rs @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +use super::container_v3::{ + DEFAULT_TEST_STREAM_V3, DEFAULT_TEST_TOPIC_V3, ENV_V3_SINK_BUCKET, ENV_V3_SINK_DB, + ENV_V3_SINK_ORG, ENV_V3_SINK_PATH, ENV_V3_SINK_PAYLOAD_FORMAT, + ENV_V3_SINK_STREAMS_0_CONSUMER_GROUP, ENV_V3_SINK_STREAMS_0_SCHEMA, + ENV_V3_SINK_STREAMS_0_STREAM, ENV_V3_SINK_STREAMS_0_TOPICS, ENV_V3_SINK_TOKEN, ENV_V3_SINK_URL, + ENV_V3_SINK_VERSION, HEALTH_CHECK_ATTEMPTS_V3, HEALTH_CHECK_INTERVAL_MS_V3, INFLUXDB3_DB, + INFLUXDB3_TOKEN, InfluxDb3Container, InfluxDb3Ops, create_http_client_v3, +}; +use async_trait::async_trait; +use integration::harness::{TestBinaryError, TestFixture}; +use reqwest_middleware::ClientWithMiddleware as HttpClient; +use std::collections::HashMap; +use std::time::Duration; +use tokio::time::sleep; +use tracing::info; + +const POLL_ATTEMPTS_V3: usize = 100; +const POLL_INTERVAL_MS_V3: u64 = 50; + +pub struct InfluxDb3SinkFixture { + container: InfluxDb3Container, + http_client: HttpClient, +} + +impl InfluxDb3Ops for InfluxDb3SinkFixture { + fn container(&self) -> &InfluxDb3Container { + &self.container + } + fn http_client(&self) -> &HttpClient { + &self.http_client + } +} + +impl InfluxDb3SinkFixture { + /// Poll until at least `expected` rows exist in the test database under + /// `measurement` (SQL table name). 
+ pub async fn wait_for_points( + &self, + measurement: &str, + expected: usize, + ) -> Result { + let sql = format!("SELECT * FROM \"{measurement}\""); + info!("V3 wait_for_points SQL: {sql}"); + for _ in 0..POLL_ATTEMPTS_V3 { + match self.query_count(&sql).await { + Ok(n) if n >= expected => { + info!("Found {n} rows in InfluxDB 3 (expected {expected})"); + return Ok(n); + } + Ok(_) | Err(_) => {} + } + sleep(Duration::from_millis(POLL_INTERVAL_MS_V3)).await; + } + Err(TestBinaryError::InvalidState { + message: format!("Expected at least {expected} rows after {POLL_ATTEMPTS_V3} attempts"), + }) + } + + pub async fn setup() -> Result { + let container = InfluxDb3Container::start().await?; + let http_client = create_http_client_v3(); + + let fixture = Self { + container, + http_client, + }; + + for attempt in 0..HEALTH_CHECK_ATTEMPTS_V3 { + let url = format!("{}/ping", fixture.container.base_url); + match fixture.http_client.get(&url).send().await { + Ok(resp) if resp.status().as_u16() == 200 || resp.status().as_u16() == 204 => { + info!("InfluxDB 3 /ping OK after {} attempts", attempt + 1); + return Ok(fixture); + } + Ok(resp) => { + info!( + "InfluxDB 3 /ping status {} (attempt {})", + resp.status(), + attempt + 1 + ); + } + Err(e) => { + info!("InfluxDB 3 /ping error on attempt {}: {e}", attempt + 1); + } + } + sleep(Duration::from_millis(HEALTH_CHECK_INTERVAL_MS_V3)).await; + } + + Err(TestBinaryError::FixtureSetup { + fixture_type: "InfluxDb3Sink".to_string(), + message: format!( + "InfluxDB 3 /ping did not respond after {HEALTH_CHECK_ATTEMPTS_V3} attempts" + ), + }) + } +} + +#[async_trait] +impl TestFixture for InfluxDb3SinkFixture { + async fn setup() -> Result { + Self::setup().await + } + + fn connectors_runtime_envs(&self) -> HashMap { + let mut envs = HashMap::new(); + envs.insert(ENV_V3_SINK_URL.to_string(), self.container.base_url.clone()); + envs.insert(ENV_V3_SINK_TOKEN.to_string(), INFLUXDB3_TOKEN.to_string()); + 
envs.insert(ENV_V3_SINK_DB.to_string(), INFLUXDB3_DB.to_string()); + envs.insert(ENV_V3_SINK_VERSION.to_string(), "v3".to_string()); + // org and bucket set to empty placeholders for V3 (adapter ignores them) + envs.insert(ENV_V3_SINK_ORG.to_string(), "".to_string()); + envs.insert(ENV_V3_SINK_BUCKET.to_string(), INFLUXDB3_DB.to_string()); + envs.insert( + ENV_V3_SINK_STREAMS_0_STREAM.to_string(), + DEFAULT_TEST_STREAM_V3.to_string(), + ); + envs.insert( + ENV_V3_SINK_STREAMS_0_TOPICS.to_string(), + format!("[{}]", DEFAULT_TEST_TOPIC_V3), + ); + envs.insert(ENV_V3_SINK_STREAMS_0_SCHEMA.to_string(), "json".to_string()); + envs.insert(ENV_V3_SINK_PAYLOAD_FORMAT.to_string(), "json".to_string()); + envs.insert( + ENV_V3_SINK_STREAMS_0_CONSUMER_GROUP.to_string(), + "influxdb3_sink_cg".to_string(), + ); + envs.insert( + ENV_V3_SINK_PATH.to_string(), + "../../target/debug/libiggy_connector_influxdb_sink".to_string(), + ); + envs + } +} diff --git a/core/integration/tests/connectors/fixtures/influxdb/source.rs b/core/integration/tests/connectors/fixtures/influxdb/source.rs index 0eeda68f7d..1d964b6ade 100644 --- a/core/integration/tests/connectors/fixtures/influxdb/source.rs +++ b/core/integration/tests/connectors/fixtures/influxdb/source.rs @@ -22,8 +22,8 @@ use super::container::{ ENV_SOURCE_PATH, ENV_SOURCE_PAYLOAD_COLUMN, ENV_SOURCE_PAYLOAD_FORMAT, ENV_SOURCE_POLL_INTERVAL, ENV_SOURCE_QUERY, ENV_SOURCE_STREAMS_0_SCHEMA, ENV_SOURCE_STREAMS_0_STREAM, ENV_SOURCE_STREAMS_0_TOPIC, ENV_SOURCE_TOKEN, ENV_SOURCE_URL, - HEALTH_CHECK_ATTEMPTS, HEALTH_CHECK_INTERVAL_MS, INFLUXDB_BUCKET, INFLUXDB_ORG, INFLUXDB_TOKEN, - InfluxDbContainer, InfluxDbOps, create_http_client, + ENV_SOURCE_VERSION, HEALTH_CHECK_ATTEMPTS, HEALTH_CHECK_INTERVAL_MS, INFLUXDB_BUCKET, + INFLUXDB_ORG, INFLUXDB_TOKEN, InfluxDbContainer, InfluxDbOps, create_http_client, }; use async_trait::async_trait; use integration::harness::{TestBinaryError, TestFixture}; @@ -129,6 +129,7 @@ impl TestFixture for 
InfluxDbSourceFixture { }; let mut envs = HashMap::new(); + envs.insert(ENV_SOURCE_VERSION.to_string(), "v2".to_string()); envs.insert(ENV_SOURCE_URL.to_string(), self.container.base_url.clone()); envs.insert(ENV_SOURCE_ORG.to_string(), INFLUXDB_ORG.to_string()); envs.insert(ENV_SOURCE_TOKEN.to_string(), INFLUXDB_TOKEN.to_string()); diff --git a/core/integration/tests/connectors/fixtures/influxdb/source_v3.rs b/core/integration/tests/connectors/fixtures/influxdb/source_v3.rs new file mode 100644 index 0000000000..08c1ff031b --- /dev/null +++ b/core/integration/tests/connectors/fixtures/influxdb/source_v3.rs @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +use super::container_v3::{ + DEFAULT_TEST_STREAM_V3, DEFAULT_TEST_TOPIC_V3, ENV_V3_SOURCE_BATCH_SIZE, + ENV_V3_SOURCE_CURSOR_FIELD, ENV_V3_SOURCE_DB, ENV_V3_SOURCE_ORG, ENV_V3_SOURCE_PATH, + ENV_V3_SOURCE_PAYLOAD_FORMAT, ENV_V3_SOURCE_POLL_INTERVAL, ENV_V3_SOURCE_QUERY, + ENV_V3_SOURCE_STREAMS_0_SCHEMA, ENV_V3_SOURCE_STREAMS_0_STREAM, ENV_V3_SOURCE_STREAMS_0_TOPIC, + ENV_V3_SOURCE_TOKEN, ENV_V3_SOURCE_URL, ENV_V3_SOURCE_VERSION, HEALTH_CHECK_ATTEMPTS_V3, + HEALTH_CHECK_INTERVAL_MS_V3, INFLUXDB3_DB, INFLUXDB3_TOKEN, InfluxDb3Container, InfluxDb3Ops, + create_http_client_v3, +}; +use async_trait::async_trait; +use integration::harness::{TestBinaryError, TestFixture}; +use reqwest_middleware::ClientWithMiddleware as HttpClient; +use std::collections::HashMap; +use std::time::Duration; +use tokio::time::sleep; +use tracing::info; + +pub struct InfluxDb3SourceFixture { + pub(super) container: InfluxDb3Container, + pub(super) http_client: HttpClient, +} + +impl InfluxDb3Ops for InfluxDb3SourceFixture { + fn container(&self) -> &InfluxDb3Container { + &self.container + } + fn http_client(&self) -> &HttpClient { + &self.http_client + } +} + +impl InfluxDb3SourceFixture { + /// Write line-protocol lines into the test database. 
+ pub async fn write_lines(&self, lines: &[&str]) -> Result<(), TestBinaryError> { + InfluxDb3Ops::write_lines(self, lines).await + } + + pub async fn setup() -> Result { + let container = InfluxDb3Container::start().await?; + let http_client = create_http_client_v3(); + + let fixture = Self { + container, + http_client, + }; + + for attempt in 0..HEALTH_CHECK_ATTEMPTS_V3 { + let url = format!("{}/ping", fixture.container.base_url); + match fixture.http_client.get(&url).send().await { + Ok(resp) if resp.status().as_u16() == 200 || resp.status().as_u16() == 204 => { + info!("InfluxDB 3 /ping OK after {} attempts", attempt + 1); + return Ok(fixture); + } + Ok(resp) => { + info!( + "InfluxDB 3 /ping status {} (attempt {})", + resp.status(), + attempt + 1 + ); + } + Err(e) => { + info!("InfluxDB 3 /ping error on attempt {}: {e}", attempt + 1); + } + } + sleep(Duration::from_millis(HEALTH_CHECK_INTERVAL_MS_V3)).await; + } + + Err(TestBinaryError::FixtureSetup { + fixture_type: "InfluxDb3Source".to_string(), + message: format!( + "InfluxDB 3 /ping did not respond after {HEALTH_CHECK_ATTEMPTS_V3} attempts" + ), + }) + } +} + +#[async_trait] +impl TestFixture for InfluxDb3SourceFixture { + async fn setup() -> Result { + Self::setup().await + } + + fn connectors_runtime_envs(&self) -> HashMap { + // SQL query template with $cursor and $limit placeholders. + // InfluxDB 3 stores time as `time` column (not `_time`). + // The connector runtime substitutes $cursor and $limit before sending. 
+ let sql_query = "SELECT * FROM sensor_readings \ + WHERE time > '$cursor' \ + ORDER BY time \ + LIMIT $limit OFFSET $offset" + .to_string(); + + let mut envs = HashMap::new(); + envs.insert( + ENV_V3_SOURCE_URL.to_string(), + self.container.base_url.clone(), + ); + envs.insert(ENV_V3_SOURCE_TOKEN.to_string(), INFLUXDB3_TOKEN.to_string()); + envs.insert(ENV_V3_SOURCE_DB.to_string(), INFLUXDB3_DB.to_string()); + envs.insert(ENV_V3_SOURCE_VERSION.to_string(), "v3".to_string()); + envs.insert(ENV_V3_SOURCE_ORG.to_string(), "".to_string()); + envs.insert(ENV_V3_SOURCE_QUERY.to_string(), sql_query); + envs.insert(ENV_V3_SOURCE_POLL_INTERVAL.to_string(), "100ms".to_string()); + envs.insert(ENV_V3_SOURCE_BATCH_SIZE.to_string(), "100".to_string()); + // InfluxDB 3 names the time column "time", not "_time" (V2 default). + envs.insert(ENV_V3_SOURCE_CURSOR_FIELD.to_string(), "time".to_string()); + envs.insert( + ENV_V3_SOURCE_STREAMS_0_STREAM.to_string(), + DEFAULT_TEST_STREAM_V3.to_string(), + ); + envs.insert( + ENV_V3_SOURCE_STREAMS_0_TOPIC.to_string(), + DEFAULT_TEST_TOPIC_V3.to_string(), + ); + envs.insert( + ENV_V3_SOURCE_STREAMS_0_SCHEMA.to_string(), + "json".to_string(), + ); + envs.insert(ENV_V3_SOURCE_PAYLOAD_FORMAT.to_string(), "json".to_string()); + envs.insert( + ENV_V3_SOURCE_PATH.to_string(), + "../../target/debug/libiggy_connector_influxdb_source".to_string(), + ); + envs + } +} diff --git a/core/integration/tests/connectors/fixtures/mod.rs b/core/integration/tests/connectors/fixtures/mod.rs index a020fa42c7..43a939b099 100644 --- a/core/integration/tests/connectors/fixtures/mod.rs +++ b/core/integration/tests/connectors/fixtures/mod.rs @@ -37,9 +37,9 @@ pub use iceberg::{ DEFAULT_NAMESPACE, DEFAULT_TABLE, IcebergEnvAuthFixture, IcebergOps, IcebergPreCreatedFixture, }; pub use influxdb::{ - InfluxDbSinkBase64Fixture, InfluxDbSinkFixture, InfluxDbSinkNoMetadataFixture, - InfluxDbSinkNsPrecisionFixture, InfluxDbSinkTextFixture, InfluxDbSourceFixture, - 
InfluxDbSourceRawFixture, InfluxDbSourceTextFixture, + InfluxDb3SinkFixture, InfluxDb3SourceFixture, InfluxDbSinkBase64Fixture, InfluxDbSinkFixture, + InfluxDbSinkNoMetadataFixture, InfluxDbSinkNsPrecisionFixture, InfluxDbSinkTextFixture, + InfluxDbSourceFixture, InfluxDbSourceRawFixture, InfluxDbSourceTextFixture, }; pub use mongodb::{ MongoDbOps, MongoDbSinkAutoCreateFixture, MongoDbSinkBatchFixture, MongoDbSinkFailpointFixture, diff --git a/core/integration/tests/connectors/influxdb/docker-compose.yml b/core/integration/tests/connectors/influxdb/docker-compose.yml index 73ae7e0df1..6b530dd403 100644 --- a/core/integration/tests/connectors/influxdb/docker-compose.yml +++ b/core/integration/tests/connectors/influxdb/docker-compose.yml @@ -72,6 +72,34 @@ services: volumes: - iggy-influxdb-source-data:/var/lib/influxdb2 + # InfluxDB 3.x instance for sink V3 tests (native /api/v3/* endpoints) + influxdb3-sink: + image: influxdb:3-core + container_name: iggy-test-influxdb3-sink + ports: + - "8188:8181" + command: ["influxdb3", "serve", "--node-id", "node-sink", "--object-store", "memory", "--without-auth"] + healthcheck: + test: ["CMD", "curl", "-sf", "http://localhost:8181/ping"] + interval: 5s + timeout: 5s + retries: 10 + start_period: 10s + + # InfluxDB 3.x instance for source V3 tests + influxdb3-source: + image: influxdb:3-core + container_name: iggy-test-influxdb3-source + ports: + - "8189:8181" + command: ["influxdb3", "serve", "--node-id", "node-source", "--object-store", "memory", "--without-auth"] + healthcheck: + test: ["CMD", "curl", "-sf", "http://localhost:8181/ping"] + interval: 5s + timeout: 5s + retries: 10 + start_period: 10s + volumes: iggy-influxdb-data: iggy-influxdb-source-data: diff --git a/core/integration/tests/connectors/influxdb/influxdb_sink_v3.rs b/core/integration/tests/connectors/influxdb/influxdb_sink_v3.rs new file mode 100644 index 0000000000..af3b68bb13 --- /dev/null +++ 
b/core/integration/tests/connectors/influxdb/influxdb_sink_v3.rs @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use super::TEST_MESSAGE_COUNT; +use crate::connectors::fixtures::InfluxDb3SinkFixture; +use bytes::Bytes; +use iggy::prelude::IggyMessage; +use iggy::prelude::Partitioning; +use iggy_common::Identifier; +use iggy_common::MessageClient; +use integration::harness::seeds; +use integration::iggy_harness; +use serde_json::json; + +#[iggy_harness( + server(connectors_runtime(config_path = "tests/connectors/influxdb/sink_v3.toml")), + seed = seeds::connector_stream +)] +async fn influxdb3_sink_writes_messages_to_db( + harness: &TestHarness, + fixture: InfluxDb3SinkFixture, +) { + let client = harness.root_client().await.unwrap(); + + let stream_id: Identifier = seeds::names::STREAM.try_into().unwrap(); + let topic_id: Identifier = seeds::names::TOPIC.try_into().unwrap(); + + let mut messages: Vec = (1u32..=TEST_MESSAGE_COUNT as u32) + .map(|i| { + let payload = serde_json::to_vec(&json!({"sensor_id": i, "temp": 20.0 + i as f64})) + .expect("Failed to serialize"); + IggyMessage::builder() + .id(i as u128) + .payload(Bytes::from(payload)) + .build() + .expect("Failed to build message") 
+ }) + .collect(); + + client + .send_messages( + &stream_id, + &topic_id, + &Partitioning::balanced(), + &mut messages, + ) + .await + .expect("Failed to send messages"); + + fixture + .wait_for_points("iggy_messages", TEST_MESSAGE_COUNT) + .await + .expect("Failed to wait for InfluxDB 3 points"); +} + +#[iggy_harness( + server(connectors_runtime(config_path = "tests/connectors/influxdb/sink_v3.toml")), + seed = seeds::connector_stream +)] +async fn influxdb3_sink_handles_bulk_messages( + harness: &TestHarness, + fixture: InfluxDb3SinkFixture, +) { + let client = harness.root_client().await.unwrap(); + let bulk_count = 50; + + let stream_id: Identifier = seeds::names::STREAM.try_into().unwrap(); + let topic_id: Identifier = seeds::names::TOPIC.try_into().unwrap(); + + let mut messages: Vec = (0..bulk_count) + .map(|i| { + let payload = serde_json::to_vec(&json!({"seq": i})).expect("Failed to serialize"); + IggyMessage::builder() + .id((i + 1) as u128) + .payload(Bytes::from(payload)) + .build() + .expect("Failed to build message") + }) + .collect(); + + client + .send_messages( + &stream_id, + &topic_id, + &Partitioning::balanced(), + &mut messages, + ) + .await + .expect("Failed to send messages"); + + fixture + .wait_for_points("iggy_messages", bulk_count) + .await + .expect("Failed to wait for InfluxDB 3 points"); +} + +#[iggy_harness( + server(connectors_runtime(config_path = "tests/connectors/influxdb/sink_v3.toml")), + seed = seeds::connector_stream +)] +async fn influxdb3_sink_payload_fields_stored_correctly( + harness: &TestHarness, + fixture: InfluxDb3SinkFixture, +) { + let client = harness.root_client().await.unwrap(); + + let stream_id: Identifier = seeds::names::STREAM.try_into().unwrap(); + let topic_id: Identifier = seeds::names::TOPIC.try_into().unwrap(); + + let payload = serde_json::to_vec(&json!({"device": "sensor-42", "reading": 99.5})).unwrap(); + let mut messages = vec![ + IggyMessage::builder() + .id(1u128) + .payload(Bytes::from(payload)) + 
.build() + .unwrap(), + ]; + + client + .send_messages( + &stream_id, + &topic_id, + &Partitioning::balanced(), + &mut messages, + ) + .await + .expect("Failed to send messages"); + + fixture + .wait_for_points("iggy_messages", 1) + .await + .expect("Failed to wait for InfluxDB 3 points"); +} + +#[iggy_harness( + server(connectors_runtime(config_path = "tests/connectors/influxdb/sink_v3.toml")), + seed = seeds::connector_stream +)] +async fn influxdb3_sink_large_batch(harness: &TestHarness, fixture: InfluxDb3SinkFixture) { + let client = harness.root_client().await.unwrap(); + + let stream_id: Identifier = seeds::names::STREAM.try_into().unwrap(); + let topic_id: Identifier = seeds::names::TOPIC.try_into().unwrap(); + + for chunk_start in (0..500usize).step_by(100) { + let mut chunk: Vec = (chunk_start..chunk_start + 100) + .map(|i| { + let payload = serde_json::to_vec(&json!({"seq": i})).expect("Failed to serialize"); + IggyMessage::builder() + .id((i + 1) as u128) + .payload(Bytes::from(payload)) + .build() + .expect("Failed to build message") + }) + .collect(); + + client + .send_messages(&stream_id, &topic_id, &Partitioning::balanced(), &mut chunk) + .await + .expect("Failed to send messages"); + } + + fixture + .wait_for_points("iggy_messages", 500) + .await + .expect("Failed to wait for 500 InfluxDB 3 points"); +} + +#[iggy_harness( + server(connectors_runtime(config_path = "tests/connectors/influxdb/sink_v3.toml")), + seed = seeds::connector_stream +)] +async fn influxdb3_sink_recovers_backlogged_messages( + harness: &TestHarness, + fixture: InfluxDb3SinkFixture, +) { + let client = harness.root_client().await.unwrap(); + + let stream_id: Identifier = seeds::names::STREAM.try_into().unwrap(); + let topic_id: Identifier = seeds::names::TOPIC.try_into().unwrap(); + + let mut messages: Vec = (0..10) + .map(|i| { + let payload = serde_json::to_vec(&json!({"i": i})).expect("Failed to serialize"); + IggyMessage::builder() + .id((i + 1) as u128) + 
.payload(Bytes::from(payload)) + .build() + .expect("Failed to build message") + }) + .collect(); + + client + .send_messages( + &stream_id, + &topic_id, + &Partitioning::balanced(), + &mut messages, + ) + .await + .expect("Failed to send messages"); + + fixture + .wait_for_points("iggy_messages", 10) + .await + .expect("Failed to wait for 10 backlogged InfluxDB 3 points"); +} diff --git a/core/integration/tests/connectors/influxdb/influxdb_source_v3.rs b/core/integration/tests/connectors/influxdb/influxdb_source_v3.rs new file mode 100644 index 0000000000..ae57006e74 --- /dev/null +++ b/core/integration/tests/connectors/influxdb/influxdb_source_v3.rs @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +use super::TEST_MESSAGE_COUNT; +use crate::connectors::fixtures::InfluxDb3SourceFixture; +use iggy_common::MessageClient; +use iggy_common::Utc; +use iggy_common::{Consumer, Identifier, PollingStrategy}; +use integration::harness::seeds; +use integration::iggy_harness; +use serde_json::Value; +use tracing::info; + +#[iggy_harness( + server(connectors_runtime(config_path = "tests/connectors/influxdb/source_v3.toml")), + seed = seeds::connector_stream +)] +async fn influxdb3_source_polls_and_produces_messages( + harness: &TestHarness, + fixture: InfluxDb3SourceFixture, +) { + let base_ts: u64 = Utc::now().timestamp_nanos_opt().unwrap_or(0) as u64; + let lines: Vec = (0..TEST_MESSAGE_COUNT) + .map(|i| { + format!( + "sensor_readings,loc=lab v={v} {ts}", + v = 20.0 + i as f64, + ts = base_ts + i as u64 * 1000, + ) + }) + .collect(); + let line_refs: Vec<&str> = lines.iter().map(String::as_str).collect(); + + fixture + .write_lines(&line_refs) + .await + .expect("Failed to write lines to InfluxDB 3"); + + let client = harness.root_client().await.unwrap(); + let stream_id: Identifier = seeds::names::STREAM.try_into().unwrap(); + let topic_id: Identifier = seeds::names::TOPIC.try_into().unwrap(); + let consumer = Consumer::default(); + + let mut total = 0usize; + for _ in 0..100 { + let polled = client + .poll_messages( + &stream_id, + &topic_id, + None, + &consumer, + &PollingStrategy::next(), + 100, + true, + ) + .await + .expect("poll_messages failed"); + + total += polled.messages.len(); + if total >= TEST_MESSAGE_COUNT { + break; + } + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + } + + assert!( + total >= TEST_MESSAGE_COUNT, + "Expected {TEST_MESSAGE_COUNT} messages, got {total}" + ); +} + +#[iggy_harness( + server(connectors_runtime(config_path = "tests/connectors/influxdb/source_v3.toml")), + seed = seeds::connector_stream +)] +async fn influxdb3_source_message_payload_structure( + harness: &TestHarness, + fixture: 
InfluxDb3SourceFixture, +) { + let base_ts: u64 = Utc::now().timestamp_nanos_opt().unwrap_or(0) as u64; + fixture + .write_lines(&[&format!("sensor_readings,loc=roof humidity=78.5 {base_ts}")]) + .await + .expect("Failed to write line to InfluxDB 3"); + + let client = harness.root_client().await.unwrap(); + let stream_id: Identifier = seeds::names::STREAM.try_into().unwrap(); + let topic_id: Identifier = seeds::names::TOPIC.try_into().unwrap(); + let consumer = Consumer::default(); + + let mut msgs: Vec = Vec::new(); + for _ in 0..100 { + let polled = client + .poll_messages( + &stream_id, + &topic_id, + None, + &consumer, + &PollingStrategy::next(), + 10, + true, + ) + .await + .expect("poll_messages failed"); + + for m in polled.messages { + if let Ok(v) = serde_json::from_slice::(&m.payload) { + msgs.push(v); + } + } + if !msgs.is_empty() { + break; + } + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + } + + info!( + "influxdb3_source_message_payload_structure received: {:?}", + msgs + ); + assert_eq!(msgs.len(), 1, "Expected 1 message, got {}", msgs.len()); + let m = &msgs[0]; + // InfluxDB 3 JSONL rows include all column names as keys; the `time` column is always present. + assert!(m.get("time").is_some(), "missing 'time' field: {m:?}"); +} + +#[iggy_harness( + server(connectors_runtime(config_path = "tests/connectors/influxdb/source_v3.toml")), + seed = seeds::connector_stream +)] +async fn influxdb3_source_empty_db_produces_no_messages( + harness: &TestHarness, + fixture: InfluxDb3SourceFixture, +) { + // Write nothing — DB intentionally empty for this test. 
+ let _ = &fixture; + + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + + let client = harness.root_client().await.unwrap(); + let stream_id: Identifier = seeds::names::STREAM.try_into().unwrap(); + let topic_id: Identifier = seeds::names::TOPIC.try_into().unwrap(); + let consumer = Consumer::default(); + + let polled = client + .poll_messages( + &stream_id, + &topic_id, + None, + &consumer, + &PollingStrategy::next(), + 100, + false, + ) + .await + .expect("poll_messages failed"); + + assert_eq!( + polled.messages.len(), + 0, + "Expected 0 messages for empty DB, got {}", + polled.messages.len() + ); +} + +#[iggy_harness( + server(connectors_runtime(config_path = "tests/connectors/influxdb/source_v3.toml")), + seed = seeds::connector_stream +)] +async fn influxdb3_source_multiple_rows(harness: &TestHarness, fixture: InfluxDb3SourceFixture) { + let base_ts: u64 = Utc::now().timestamp_nanos_opt().unwrap_or(0) as u64; + fixture + .write_lines(&[ + &format!("sensor_readings,room=living temp=21.5 {base_ts}"), + &format!("sensor_readings,room=bedroom temp=19.0 {}", base_ts + 1000), + &format!("sensor_readings,room=kitchen temp=23.1 {}", base_ts + 2000), + ]) + .await + .expect("Failed to write lines to InfluxDB 3"); + + let client = harness.root_client().await.unwrap(); + let stream_id: Identifier = seeds::names::STREAM.try_into().unwrap(); + let topic_id: Identifier = seeds::names::TOPIC.try_into().unwrap(); + let consumer = Consumer::default(); + + let mut msgs: Vec = Vec::new(); + for _ in 0..100 { + let polled = client + .poll_messages( + &stream_id, + &topic_id, + None, + &consumer, + &PollingStrategy::next(), + 100, + true, + ) + .await + .expect("poll_messages failed"); + + for m in polled.messages { + if let Ok(v) = serde_json::from_slice::(&m.payload) { + msgs.push(v); + } + } + if msgs.len() >= 3 { + break; + } + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + } + + info!( + "influxdb3_source_multiple_rows received {} 
messages", + msgs.len() + ); + assert_eq!(msgs.len(), 3, "Expected 3 messages, got {}", msgs.len()); + + // All rows come from the same measurement — verify the `time` column is present on each. + for (i, m) in msgs.iter().enumerate() { + assert!( + m.get("time").is_some(), + "message {i} missing 'time' field: {m:?}" + ); + } +} diff --git a/core/integration/tests/connectors/influxdb/mod.rs b/core/integration/tests/connectors/influxdb/mod.rs index c41497953e..304ec500e4 100644 --- a/core/integration/tests/connectors/influxdb/mod.rs +++ b/core/integration/tests/connectors/influxdb/mod.rs @@ -19,7 +19,9 @@ mod influxdb_sink; mod influxdb_sink_formats; +mod influxdb_sink_v3; mod influxdb_source; mod influxdb_source_formats; +mod influxdb_source_v3; const TEST_MESSAGE_COUNT: usize = 3; diff --git a/core/integration/tests/connectors/influxdb/sink_v3.toml b/core/integration/tests/connectors/influxdb/sink_v3.toml new file mode 100644 index 0000000000..5df023c54f --- /dev/null +++ b/core/integration/tests/connectors/influxdb/sink_v3.toml @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Harness config for InfluxDB V3 sink connector integration tests. +# Uses the native /api/v3/write_lp endpoint with Bearer auth. 
+ +[connectors] +config_type = "local" +config_dir = "../connectors/sinks/influxdb_sink" +[plugin_config] +version = "v3" +precision = "ns" +measurement = "iggy_messages" +batch_size = 100 +payload_format = "json" +max_retries = 3 +retry_delay = "200ms" +timeout = "10s" +max_open_retries = 5 +open_retry_max_delay = "10s" +circuit_breaker_threshold = 10 +circuit_breaker_cool_down = "5s" diff --git a/core/integration/tests/connectors/influxdb/source_v3.toml b/core/integration/tests/connectors/influxdb/source_v3.toml new file mode 100644 index 0000000000..68da94ac69 --- /dev/null +++ b/core/integration/tests/connectors/influxdb/source_v3.toml @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Harness config for InfluxDB V3 source connector integration tests. +# Uses the native /api/v3/query_sql endpoint with Bearer auth and JSONL response. + +[connectors] +config_type = "local" +config_dir = "../connectors/sources/influxdb_source" +[plugin_config] +version = "v3" +precision = "ns"