diff --git a/.github/workflows/elixir.yml b/.github/workflows/elixir.yml index 9a5c1edb4598..73a8c6848baa 100644 --- a/.github/workflows/elixir.yml +++ b/.github/workflows/elixir.yml @@ -47,7 +47,7 @@ jobs: --health-timeout 5s --health-retries 5 clickhouse: - image: clickhouse/clickhouse-server:24.3.3.102-alpine + image: clickhouse/clickhouse-server:25.4-alpine ports: - 8123:8123 env: @@ -56,6 +56,8 @@ jobs: --health-interval 10s --health-timeout 5s --health-retries 5 + volumes: + - ./clickhouse/clickhouse-user-config/users.xml:/etc/clickhouse-server/users.d/custom.xml steps: - uses: actions/checkout@v4 with: diff --git a/ch/.formatter.exs b/ch/.formatter.exs new file mode 100644 index 000000000000..8eecf5002838 --- /dev/null +++ b/ch/.formatter.exs @@ -0,0 +1,4 @@ +# Used by "mix format" +[ + inputs: ["{mix,.formatter}.exs", "{config,lib,test,bench}/**/*.{ex,exs}"] +] diff --git a/ch/.hex b/ch/.hex new file mode 100644 index 000000000000..96d07abf5fa3 Binary files /dev/null and b/ch/.hex differ diff --git a/ch/CHANGELOG.md b/ch/CHANGELOG.md new file mode 100644 index 000000000000..28fcbf755aaa --- /dev/null +++ b/ch/CHANGELOG.md @@ -0,0 +1,91 @@ +# Changelog + +## 0.2.6 (2024-05-30) + +- fix query encoding for datetimes where the microseconds value starts with zeroes `~U[****-**-** **:**:**.0*****]` https://github.com/plausible/ch/pull/175 + +## 0.2.5 (2024-03-05) + +- add `:data` in `%Ch.Result{}` https://github.com/plausible/ch/pull/159 +- duplicate `Ch.Result.data` in `Ch.Result.rows` for backwards compatibility https://github.com/plausible/ch/pull/160 +- make `Ch.stream` emit `Ch.Result.t` instead of `Mint.Types.response` https://github.com/plausible/ch/pull/161 +- make `Ch.stream` collectable https://github.com/plausible/ch/pull/162 + +## 0.2.4 (2024-01-29) + +- use `ch-#{version}` as user-agent https://github.com/plausible/ch/pull/154 +- fix query string escaping for `\t`, `\\`, and `\n` https://github.com/plausible/ch/pull/155 + +## 0.2.3 (2024-01-29) + +- 
fix socket leak on failed handshake https://github.com/plausible/ch/pull/153 + +## 0.2.2 (2023-12-23) + +- fix query encoding for datetimes with zeroed microseconds `~U[****-**-** **:**:**.000000]` https://github.com/plausible/ch/pull/138 + +## 0.2.1 (2023-08-22) + +- fix array casts with `Ch` subtype https://github.com/plausible/ch/pull/118 + +## 0.2.0 (2023-07-28) + +- move loading and dumping from `Ch` type to the adapter https://github.com/plausible/ch/pull/112 + +## 0.1.14 (2023-05-24) + +- simplify types, again... + +## 0.1.13 (2023-05-24) + +- refactor types in `Ch.RowBinary` https://github.com/plausible/ch/pull/88 + +## 0.1.12 (2023-05-24) + +- replace `{:raw, data}` with `encode: false` option, add `:decode` option https://github.com/plausible/ch/pull/42 + +## 0.1.11 (2023-05-19) + +- improve Enum error message invalid values during encoding: https://github.com/plausible/ch/pull/85 +- fix `\t` and `\n` in query params https://github.com/plausible/ch/pull/86 + +## 0.1.10 (2023-05-05) + +- support `:raw` option in `Ch` type https://github.com/plausible/ch/pull/84 + +## 0.1.9 (2023-05-02) + +- relax deps versions + +## 0.1.8 (2023-05-01) + +- fix varint encoding + +## 0.1.7 (2023-04-24) + +- support RowBinaryWithNamesAndTypes + +## 0.1.6 (2023-04-24) + +- add Map(K,V) support in Ch Ecto type + +## 0.1.5 (2023-04-23) + +- fix query param encoding like Array(Date) +- add more types support in Ch Ecto type: tuples, ipv4, ipv6, geo + +## 0.1.4 (2023-04-23) + +- actually support negative `Enum` values + +## 0.1.3 (2023-04-23) + +- support negative `Enum` values, fix `Enum16` encoding + +## 0.1.2 (2023-04-23) + +- support `Enum8` and `Enum16` encoding + +## 0.1.1 (2023-04-23) + +- cleanup published docs diff --git a/ch/LICENSE b/ch/LICENSE new file mode 100644 index 000000000000..f89dc1d19738 --- /dev/null +++ b/ch/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Plausible Insights OÜ + +Permission is hereby granted, free of charge, to any person 
obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/ch/README.md b/ch/README.md new file mode 100644 index 000000000000..4f5520381b86 --- /dev/null +++ b/ch/README.md @@ -0,0 +1,391 @@ +# Ch + +[![Hex Package](https://img.shields.io/hexpm/v/ch.svg)](https://hex.pm/packages/ch) +[![Hex Docs](https://img.shields.io/badge/hex-docs-blue.svg)](https://hexdocs.pm/ch) + +Minimal HTTP ClickHouse client for Elixir. 
+ +Used in [Ecto ClickHouse adapter.](https://github.com/plausible/chto) + +### Key features + +- RowBinary +- Native query parameters +- Per query settings +- Minimal API + +Your ideas are welcome [here.](https://github.com/plausible/ch/issues/82) + +## Installation + +```elixir +defp deps do + [ + {:ch, "~> 0.2.0"} + ] +end +``` + +## Usage + +#### Start [DBConnection](https://github.com/elixir-ecto/db_connection) pool + +```elixir +defaults = [ + scheme: "http", + hostname: "localhost", + port: 8123, + database: "default", + settings: [], + pool_size: 1, + timeout: :timer.seconds(15) +] + +{:ok, pid} = Ch.start_link(defaults) +``` + +#### Select rows + +```elixir +{:ok, pid} = Ch.start_link() + +{:ok, %Ch.Result{rows: [[0], [1], [2]]}} = + Ch.query(pid, "SELECT * FROM system.numbers LIMIT 3") + +{:ok, %Ch.Result{rows: [[0], [1], [2]]}} = + Ch.query(pid, "SELECT * FROM system.numbers LIMIT {$0:UInt8}", [3]) + +{:ok, %Ch.Result{rows: [[0], [1], [2]]}} = + Ch.query(pid, "SELECT * FROM system.numbers LIMIT {limit:UInt8}", %{"limit" => 3}) +``` + +Note on datetime encoding in query parameters: + +- `%NaiveDateTime{}` is encoded as text to make it assume the column's or ClickHouse server's timezone +- `%DateTime{time_zone: "Etc/UTC"}` is encoded as unix timestamp and is treated as UTC timestamp by ClickHouse +- encoding non UTC `%DateTime{}` raises `ArgumentError` + +#### Insert rows + +```elixir +{:ok, pid} = Ch.start_link() + +Ch.query!(pid, "CREATE TABLE IF NOT EXISTS ch_demo(id UInt64) ENGINE Null") + +%Ch.Result{num_rows: 2} = + Ch.query!(pid, "INSERT INTO ch_demo(id) VALUES (0), (1)") + +%Ch.Result{num_rows: 2} = + Ch.query!(pid, "INSERT INTO ch_demo(id) VALUES ({$0:UInt8}), ({$1:UInt32})", [0, 1]) + +%Ch.Result{num_rows: 2} = + Ch.query!(pid, "INSERT INTO ch_demo(id) VALUES ({a:UInt16}), ({b:UInt64})", %{"a" => 0, "b" => 1}) + +%Ch.Result{num_rows: 2} = + Ch.query!(pid, "INSERT INTO ch_demo(id) SELECT number FROM system.numbers LIMIT {limit:UInt8}", %{"limit" 
=> 2}) +``` + +#### Insert rows as [RowBinary](https://clickhouse.com/docs/en/interfaces/formats#rowbinary) (efficient) + +```elixir +{:ok, pid} = Ch.start_link() + +Ch.query!(pid, "CREATE TABLE IF NOT EXISTS ch_demo(id UInt64) ENGINE Null") + +types = ["UInt64"] +# or +types = [Ch.Types.u64()] +# or +types = [:u64] + +%Ch.Result{num_rows: 2} = + Ch.query!(pid, "INSERT INTO ch_demo(id) FORMAT RowBinary", [[0], [1]], types: types) +``` + +Note that RowBinary format encoding requires `:types` option to be provided. + +Similarly, you can use [`RowBinaryWithNamesAndTypes`](https://clickhouse.com/docs/en/interfaces/formats#rowbinarywithnamesandtypes) which would additionally do something like a type check. + +```elixir +sql = "INSERT INTO ch_demo FORMAT RowBinaryWithNamesAndTypes" +opts = [names: ["id"], types: ["UInt64"]] +rows = [[0], [1]] + +%Ch.Result{num_rows: 2} = Ch.query!(pid, sql, rows, opts) +``` + +#### Insert rows in custom [format](https://clickhouse.com/docs/en/interfaces/formats) + +```elixir +{:ok, pid} = Ch.start_link() + +Ch.query!(pid, "CREATE TABLE IF NOT EXISTS ch_demo(id UInt64) ENGINE Null") + +csv = [0, 1] |> Enum.map(&to_string/1) |> Enum.intersperse(?\n) + +%Ch.Result{num_rows: 2} = + Ch.query!(pid, "INSERT INTO ch_demo(id) FORMAT CSV", csv, encode: false) +``` + +#### Insert rows as chunked RowBinary stream + +```elixir +{:ok, pid} = Ch.start_link() + +Ch.query!(pid, "CREATE TABLE IF NOT EXISTS ch_demo(id UInt64) ENGINE Null") + +stream = Stream.repeatedly(fn -> [:rand.uniform(100)] end) +chunked = Stream.chunk_every(stream, 100) +encoded = Stream.map(chunked, fn chunk -> Ch.RowBinary.encode_rows(chunk, _types = ["UInt64"]) end) +ten_encoded_chunks = Stream.take(encoded, 10) + +%Ch.Result{num_rows: 1000} = + Ch.query(pid, "INSERT INTO ch_demo(id) FORMAT RowBinary", ten_encoded_chunks, encode: false) +``` + +This query makes a [`transfer-encoding: chunked`](https://en.wikipedia.org/wiki/Chunked_transfer_encoding) HTTP request while unfolding 
the stream resulting in lower memory usage. + +#### Query with custom [settings](https://clickhouse.com/docs/en/operations/settings/settings) + +```elixir +{:ok, pid} = Ch.start_link() + +settings = [async_insert: 1] + +%Ch.Result{rows: [["async_insert", "Bool", "0"]]} = + Ch.query!(pid, "SHOW SETTINGS LIKE 'async_insert'") + +%Ch.Result{rows: [["async_insert", "Bool", "1"]]} = + Ch.query!(pid, "SHOW SETTINGS LIKE 'async_insert'", [], settings: settings) +``` + +## Caveats + +#### NULL in RowBinary + +It's the same as in [`ch-go`](https://clickhouse.com/docs/en/integrations/go#nullable) + +> At insert time, Nil can be passed for both the normal and Nullable version of a column. For the former, the default value for the type will be persisted, e.g., an empty string for string. For the nullable version, a NULL value will be stored in ClickHouse. + +```elixir +{:ok, pid} = Ch.start_link() + +Ch.query!(pid, """ +CREATE TABLE ch_nulls ( + a UInt8 NULL, + b UInt8 DEFAULT 10, + c UInt8 NOT NULL +) ENGINE Memory +""") + +types = ["Nullable(UInt8)", "UInt8", "UInt8"] +inserted_rows = [[nil, nil, nil]] +selected_rows = [[nil, 0, 0]] + +%Ch.Result{num_rows: 1} = + Ch.query!(pid, "INSERT INTO ch_nulls(a, b, c) FORMAT RowBinary", inserted_rows, types: types) + +%Ch.Result{rows: ^selected_rows} = + Ch.query!(pid, "SELECT * FROM ch_nulls") +``` + +Note that in this example `DEFAULT 10` is ignored and `0` (the default value for `UInt8`) is persisted instead. 
+ +However, [`input()`](https://clickhouse.com/docs/en/sql-reference/table-functions/input) can be used as a workaround: + +```elixir +sql = """ +INSERT INTO ch_nulls + SELECT * FROM input('a Nullable(UInt8), b Nullable(UInt8), c UInt8') + FORMAT RowBinary\ +""" + +Ch.query!(pid, sql, inserted_rows, types: ["Nullable(UInt8)", "Nullable(UInt8)", "UInt8"]) + +%Ch.Result{rows: [[0], [10]]} = + Ch.query!(pid, "SELECT b FROM ch_nulls ORDER BY b") +``` + +#### UTF-8 in RowBinary + +When decoding [`String`](https://clickhouse.com/docs/en/sql-reference/data-types/string) columns non UTF-8 characters are replaced with `�` (U+FFFD). This behaviour is similar to [`toValidUTF8`](https://clickhouse.com/docs/en/sql-reference/functions/string-functions#tovalidutf8) and [JSON format.](https://clickhouse.com/docs/en/interfaces/formats#json) + +```elixir +{:ok, pid} = Ch.start_link() + +Ch.query!(pid, "CREATE TABLE ch_utf8(str String) ENGINE Memory") + +bin = "\x61\xF0\x80\x80\x80b" +utf8 = "a�b" + +%Ch.Result{num_rows: 1} = + Ch.query!(pid, "INSERT INTO ch_utf8(str) FORMAT RowBinary", [[bin]], types: ["String"]) + +%Ch.Result{rows: [[^utf8]]} = + Ch.query!(pid, "SELECT * FROM ch_utf8") + +%Ch.Result{rows: %{"data" => [[^utf8]]}} = + pid |> Ch.query!("SELECT * FROM ch_utf8 FORMAT JSONCompact") |> Map.update!(:rows, &Jason.decode!/1) +``` + +To get raw binary from `String` columns use `:binary` type that skips UTF-8 checks. 
+ +```elixir +%Ch.Result{rows: [[^bin]]} = + Ch.query!(pid, "SELECT * FROM ch_utf8", [], types: [:binary]) +``` + +#### Timezones in RowBinary + +Decoding non-UTC datetimes like `DateTime('Asia/Taipei')` requires a [timezone database.](https://hexdocs.pm/elixir/DateTime.html#module-time-zone-database) + +```elixir +Mix.install([:ch, :tz]) + +:ok = Calendar.put_time_zone_database(Tz.TimeZoneDatabase) + +{:ok, pid} = Ch.start_link() + +%Ch.Result{rows: [[~N[2023-04-25 17:45:09]]]} = + Ch.query!(pid, "SELECT CAST(now() as DateTime)") + +%Ch.Result{rows: [[~U[2023-04-25 17:45:11Z]]]} = + Ch.query!(pid, "SELECT CAST(now() as DateTime('UTC'))") + +%Ch.Result{rows: [[%DateTime{time_zone: "Asia/Taipei"} = taipei]]} = + Ch.query!(pid, "SELECT CAST(now() as DateTime('Asia/Taipei'))") + +"2023-04-26 01:45:12+08:00 CST Asia/Taipei" = to_string(taipei) +``` + +Encoding non-UTC datetimes raises an `ArgumentError` + +```elixir +Ch.query!(pid, "CREATE TABLE ch_datetimes(datetime DateTime) ENGINE Null") + +naive = NaiveDateTime.utc_now() +utc = DateTime.utc_now() +taipei = DateTime.shift_zone!(utc, "Asia/Taipei") + +# ** (ArgumentError) non-UTC timezones are not supported for encoding: 2023-04-26 01:49:43.044569+08:00 CST Asia/Taipei +Ch.query!(pid, "INSERT INTO ch_datetimes(datetime) FORMAT RowBinary", [[naive], [utc], [taipei]], types: ["DateTime"]) +``` + +## Benchmarks + +
+INSERT 1 million rows (original) + +

+$ MIX_ENV=bench mix run bench/insert.exs
+
+This benchmark is based on https://github.com/ClickHouse/clickhouse-go#benchmark
+
+Operating System: macOS
+CPU Information: Apple M1
+Number of Available Cores: 8
+Available memory: 8 GB
+Elixir 1.14.4
+Erlang 25.3
+
+Benchmark suite executing with the following configuration:
+warmup: 2 s
+time: 5 s
+memory time: 0 ns
+reduction time: 0 ns
+parallel: 1
+inputs: 1_000_000 rows
+Estimated total run time: 28 s
+
+Benchmarking encode with input 1_000_000 rows ...
+Benchmarking encode stream with input 1_000_000 rows ...
+Benchmarking insert with input 1_000_000 rows ...
+Benchmarking insert stream with input 1_000_000 rows ...
+
+##### With input 1_000_000 rows #####
+Name                    ips        average  deviation         median         99th %
+encode stream          1.63      612.96 ms    ±11.30%      583.03 ms      773.01 ms
+insert stream          1.22      819.82 ms     ±9.41%      798.94 ms      973.45 ms
+encode                 1.09      915.75 ms    ±44.13%      750.98 ms     1637.02 ms
+insert                 0.73     1373.84 ms    ±31.01%     1331.86 ms     1915.76 ms
+
+Comparison: 
+encode stream          1.63
+insert stream          1.22 - 1.34x slower +206.87 ms
+encode                 1.09 - 1.49x slower +302.79 ms
+insert                 0.73 - 2.24x slower +760.88 ms
+
+ +
+ +
+SELECT 500, 500 thousand, and 500 million rows (original) + +

+$ MIX_ENV=bench mix run bench/stream.exs
+
+This benchmark is based on https://github.com/ClickHouse/ch-bench
+
+Operating System: macOS
+CPU Information: Apple M1
+Number of Available Cores: 8
+Available memory: 8 GB
+Elixir 1.14.4
+Erlang 25.3
+
+Benchmark suite executing with the following configuration:
+warmup: 2 s
+time: 5 s
+memory time: 0 ns
+reduction time: 0 ns
+parallel: 1
+inputs: 500 rows, 500_000 rows, 500_000_000 rows
+Estimated total run time: 1.05 min
+
+Benchmarking stream with decode with input 500 rows ...
+Benchmarking stream with decode with input 500_000 rows ...
+Benchmarking stream with decode with input 500_000_000 rows ...
+Benchmarking stream with manual decode with input 500 rows ...
+Benchmarking stream with manual decode with input 500_000 rows ...
+Benchmarking stream with manual decode with input 500_000_000 rows ...
+Benchmarking stream without decode with input 500 rows ...
+Benchmarking stream without decode with input 500_000 rows ...
+Benchmarking stream without decode with input 500_000_000 rows ...
+
+##### With input 500 rows #####
+Name                                ips        average  deviation         median         99th %
+stream with decode               4.69 K      213.34 μs    ±12.49%      211.38 μs      290.94 μs
+stream with manual decode        4.69 K      213.43 μs    ±17.40%      210.96 μs      298.75 μs
+stream without decode            4.65 K      215.08 μs    ±10.79%      213.79 μs      284.66 μs
+
+Comparison:
+stream with decode               4.69 K
+stream with manual decode        4.69 K - 1.00x slower +0.0838 μs
+stream without decode            4.65 K - 1.01x slower +1.74 μs
+
+##### With input 500_000 rows #####
+Name                                ips        average  deviation         median         99th %
+stream without decode            234.58        4.26 ms    ±13.99%        4.04 ms        5.95 ms
+stream with manual decode         64.26       15.56 ms     ±8.36%       15.86 ms       17.97 ms
+stream with decode                41.03       24.37 ms     ±6.27%       24.39 ms       26.60 ms
+
+Comparison:
+stream without decode            234.58
+stream with manual decode         64.26 - 3.65x slower +11.30 ms
+stream with decode                41.03 - 5.72x slower +20.11 ms
+
+##### With input 500_000_000 rows #####
+Name                                ips        average  deviation         median         99th %
+stream without decode              0.32         3.17 s     ±0.20%         3.17 s         3.17 s
+stream with manual decode        0.0891        11.23 s     ±0.00%        11.23 s        11.23 s
+stream with decode               0.0462        21.66 s     ±0.00%        21.66 s        21.66 s
+
+Comparison:
+stream without decode              0.32
+stream with manual decode        0.0891 - 3.55x slower +8.06 s
+stream with decode               0.0462 - 6.84x slower +18.50 s
+
+ +
+ +[CI Results](https://github.com/plausible/ch/actions/workflows/bench.yml) (click the latest workflow run and scroll down to "Artifacts") diff --git a/ch/hex_metadata.config b/ch/hex_metadata.config new file mode 100644 index 000000000000..b545328fc1d7 --- /dev/null +++ b/ch/hex_metadata.config @@ -0,0 +1,40 @@ +{<<"links">>,[{<<"GitHub">>,<<"https://github.com/plausible/ch">>}]}. +{<<"name">>,<<"ch">>}. +{<<"version">>,<<"0.2.6">>}. +{<<"description">>,<<"HTTP ClickHouse driver for Elixir">>}. +{<<"elixir">>,<<"~> 1.14">>}. +{<<"app">>,<<"ch">>}. +{<<"licenses">>,[<<"MIT">>]}. +{<<"requirements">>, + [[{<<"name">>,<<"mint">>}, + {<<"app">>,<<"mint">>}, + {<<"optional">>,false}, + {<<"requirement">>,<<"~> 1.0">>}, + {<<"repository">>,<<"hexpm">>}], + [{<<"name">>,<<"db_connection">>}, + {<<"app">>,<<"db_connection">>}, + {<<"optional">>,false}, + {<<"requirement">>,<<"~> 2.0">>}, + {<<"repository">>,<<"hexpm">>}], + [{<<"name">>,<<"jason">>}, + {<<"app">>,<<"jason">>}, + {<<"optional">>,false}, + {<<"requirement">>,<<"~> 1.0">>}, + {<<"repository">>,<<"hexpm">>}], + [{<<"name">>,<<"decimal">>}, + {<<"app">>,<<"decimal">>}, + {<<"optional">>,false}, + {<<"requirement">>,<<"~> 2.0">>}, + {<<"repository">>,<<"hexpm">>}], + [{<<"name">>,<<"ecto">>}, + {<<"app">>,<<"ecto">>}, + {<<"optional">>,true}, + {<<"requirement">>,<<"~> 3.5">>}, + {<<"repository">>,<<"hexpm">>}]]}. +{<<"files">>, + [<<"lib">>,<<"lib/ch.ex">>,<<"lib/ch">>,<<"lib/ch/stream.ex">>, + <<"lib/ch/types.ex">>,<<"lib/ch/error.ex">>,<<"lib/ch/row_binary.ex">>, + <<"lib/ch/query.ex">>,<<"lib/ch/result.ex">>,<<"lib/ch/connection.ex">>, + <<".formatter.exs">>,<<"mix.exs">>,<<"README.md">>,<<"LICENSE">>, + <<"CHANGELOG.md">>]}. +{<<"build_tools">>,[<<"mix">>]}. diff --git a/ch/lib/ch.ex b/ch/lib/ch.ex new file mode 100644 index 000000000000..2e71c59ba0b2 --- /dev/null +++ b/ch/lib/ch.ex @@ -0,0 +1,287 @@ +defmodule Ch do + @moduledoc "Minimal HTTP ClickHouse client." 
+ alias Ch.{Connection, Query, Result} + + @type common_option :: + {:database, String.t()} + | {:username, String.t()} + | {:password, String.t()} + | {:settings, Keyword.t()} + | {:timeout, timeout} + + @type start_option :: + common_option + | {:scheme, String.t()} + | {:hostname, String.t()} + | {:port, :inet.port_number()} + | {:transport_opts, :gen_tcp.connect_option()} + | DBConnection.start_option() + + @doc """ + Start the connection process and connect to ClickHouse. + + ## Options + + * `:scheme` - HTTP scheme, defaults to `"http"` + * `:hostname` - server hostname, defaults to `"localhost"` + * `:port` - HTTP port, defaults to `8123` + * `:transport_opts` - options to be given to the transport being used. See `Mint.HTTP1.connect/4` for more info + * `:database` - Database, defaults to `"default"` + * `:username` - Username + * `:password` - User password + * `:settings` - Keyword list of ClickHouse settings + * `:timeout` - HTTP receive timeout in milliseconds + * `:transport_opts` - options to be given to the transport being used. See `Mint.HTTP1.connect/4` for more info + * [`DBConnection.start_option()`](https://hexdocs.pm/db_connection/DBConnection.html#t:start_option/0) + + """ + @spec start_link([start_option]) :: GenServer.on_start() + def start_link(opts \\ []) do + DBConnection.start_link(Connection, opts) + end + + @doc """ + Returns a supervisor child specification for a DBConnection pool. + + See `start_link/1` for supported options. 
+ """ + @spec child_spec([start_option]) :: :supervisor.child_spec() + def child_spec(opts) do + DBConnection.child_spec(Connection, opts) + end + + @type query_option :: + common_option + | {:command, Ch.Query.command()} + | {:headers, [{String.t(), String.t()}]} + | {:format, String.t()} + # TODO remove + | {:encode, boolean} + | {:decode, boolean} + | DBConnection.connection_option() + + @doc """ + Runs a query and returns the result as `{:ok, %Ch.Result{}}` or + `{:error, Exception.t()}` if there was a database error. + + ## Options + + * `:database` - Database + * `:username` - Username + * `:password` - User password + * `:settings` - Keyword list of settings + * `:timeout` - Query request timeout + * `:command` - Command tag for the query + * `:headers` - Custom HTTP headers for the request + * `:format` - Custom response format for the request + * `:decode` - Whether to automatically decode the response + * [`DBConnection.connection_option()`](https://hexdocs.pm/db_connection/DBConnection.html#t:connection_option/0) + + """ + @spec query(DBConnection.conn(), iodata, params, [query_option]) :: + {:ok, Result.t()} | {:error, Exception.t()} + when params: map | [term] | [row :: [term]] | iodata | Enumerable.t() + def query(conn, statement, params \\ [], opts \\ []) do + query = Query.build(statement, opts) + + with {:ok, _query, result} <- DBConnection.execute(conn, query, params, opts) do + {:ok, result} + end + end + + @doc """ + Runs a query and returns the result or raises `Ch.Error` if + there was an error. See `query/4`. 
+ """ + @spec query!(DBConnection.conn(), iodata, params, [query_option]) :: Result.t() + when params: map | [term] | [row :: [term]] | iodata | Enumerable.t() + def query!(conn, statement, params \\ [], opts \\ []) do + query = Query.build(statement, opts) + DBConnection.execute!(conn, query, params, opts) + end + + @doc false + @spec stream(DBConnection.t(), iodata, map | [term], [query_option]) :: Ch.Stream.t() + def stream(conn, statement, params \\ [], opts \\ []) do + query = Query.build(statement, opts) + %Ch.Stream{conn: conn, query: query, params: params, opts: opts} + end + + # TODO drop + @doc false + @spec run(DBConnection.conn(), (DBConnection.t() -> any), Keyword.t()) :: any + def run(conn, f, opts \\ []) when is_function(f, 1) do + DBConnection.run(conn, f, opts) + end + + if Code.ensure_loaded?(Ecto.ParameterizedType) do + @behaviour Ecto.ParameterizedType + + @impl true + def type(params), do: {:parameterized, Ch, params} + + @impl true + def init(opts) do + clickhouse_type = + opts[:raw] || opts[:type] || + raise ArgumentError, "keys :raw or :type not found in: #{inspect(opts)}" + + Ch.Types.decode(clickhouse_type) + end + + @impl true + def load(value, _loader, _params), do: {:ok, value} + + @impl true + def dump(value, _dumper, _params), do: {:ok, value} + + @impl true + def cast(value, :string = type), do: Ecto.Type.cast(type, value) + def cast(value, :boolean = type), do: Ecto.Type.cast(type, value) + def cast(value, :uuid), do: Ecto.Type.cast(Ecto.UUID, value) + def cast(value, :date = type), do: Ecto.Type.cast(type, value) + def cast(value, :date32), do: Ecto.Type.cast(:date, value) + def cast(value, :datetime), do: Ecto.Type.cast(:naive_datetime, value) + def cast(value, {:datetime, "UTC"}), do: Ecto.Type.cast(:utc_datetime, value) + def cast(value, {:datetime64, _p}), do: Ecto.Type.cast(:naive_datetime_usec, value) + def cast(value, {:datetime64, _p, "UTC"}), do: Ecto.Type.cast(:utc_datetime_usec, value) + def cast(value, {:fixed_string, 
_s}), do: Ecto.Type.cast(:string, value) + + for size <- [8, 16, 32, 64, 128, 256] do + def cast(value, unquote(:"i#{size}")), do: Ecto.Type.cast(:integer, value) + def cast(value, unquote(:"u#{size}")), do: Ecto.Type.cast(:integer, value) + end + + for size <- [32, 64] do + def cast(value, unquote(:"f#{size}")), do: Ecto.Type.cast(:float, value) + end + + def cast(value, {:decimal = type, _p, _s}), do: Ecto.Type.cast(type, value) + + for size <- [32, 64, 128, 256] do + def cast(value, {unquote(:"decimal#{size}"), _s}) do + Ecto.Type.cast(:decimal, value) + end + end + + def cast(value, {:array, type}), do: Ecto.Type.cast({:array, type(type)}, value) + def cast(value, {:nullable, type}), do: cast(value, type) + def cast(value, {:low_cardinality, type}), do: cast(value, type) + def cast(value, {:simple_aggregate_function, _name, type}), do: cast(value, type) + + def cast(value, :ring), do: Ecto.Type.cast({:array, type(:point)}, value) + def cast(value, :polygon), do: Ecto.Type.cast({:array, type(:ring)}, value) + def cast(value, :multipolygon), do: Ecto.Type.cast({:array, type(:polygon)}, value) + + def cast(nil, _params), do: {:ok, nil} + + def cast(value, {enum, mappings}) when enum in [:enum8, :enum16] do + result = + case value do + _ when is_integer(value) -> List.keyfind(mappings, value, 1, :error) + _ when is_binary(value) -> List.keyfind(mappings, value, 0, :error) + _ -> :error + end + + case result do + {_, _} -> {:ok, value} + :error = e -> e + end + end + + def cast(value, :ipv4) do + case value do + {a, b, c, d} when is_number(a) and is_number(b) and is_number(c) and is_number(d) -> + {:ok, value} + + _ when is_binary(value) -> + with {:error = e, _reason} <- :inet.parse_ipv4_address(to_charlist(value)), do: e + + _ when is_list(value) -> + with {:error = e, _reason} <- :inet.parse_ipv4_address(value), do: e + + _ -> + :error + end + end + + def cast(value, :ipv6) do + case value do + {a, s, d, f, g, h, j, k} + when is_number(a) and is_number(s) and 
is_number(d) and is_number(f) and + is_number(g) and is_number(h) and is_number(j) and is_number(k) -> + {:ok, value} + + _ when is_binary(value) -> + with {:error = e, _reason} <- :inet.parse_ipv6_address(to_charlist(value)), do: e + + _ when is_list(value) -> + with {:error = e, _reason} <- :inet.parse_ipv6_address(value), do: e + + _ -> + :error + end + end + + def cast(value, :point) do + case value do + {x, y} when is_number(x) and is_number(y) -> {:ok, value} + _ -> :error + end + end + + def cast(value, {:tuple, types}), do: cast_tuple(types, value) + def cast(value, {:map, key_type, value_type}), do: cast_map(value, key_type, value_type) + + defp cast_tuple(types, values) when is_tuple(values) do + cast_tuple(types, Tuple.to_list(values), []) + end + + defp cast_tuple(types, values) when is_list(values) do + cast_tuple(types, values, []) + end + + defp cast_tuple(_types, _values), do: :error + + defp cast_tuple([type | types], [value | values], acc) do + case cast(value, type) do + {:ok, value} -> cast_tuple(types, values, [value | acc]) + :error = e -> e + end + end + + defp cast_tuple([], [], acc), do: {:ok, List.to_tuple(:lists.reverse(acc))} + defp cast_tuple(_types, _values, _acc), do: :error + + defp cast_map(value, key_type, value_type) when is_map(value) do + cast_map(Map.to_list(value), key_type, value_type) + end + + defp cast_map(value, key_type, value_type) when is_list(value) do + cast_map(value, key_type, value_type, []) + end + + defp cast_map(_value, _key_type, _value_type), do: :error + + defp cast_map([{key, value} | kvs], key_type, value_type, acc) do + with {:ok, key} <- cast(key, key_type), + {:ok, value} <- cast(value, value_type) do + cast_map(kvs, key_type, value_type, [{key, value} | acc]) + end + end + + defp cast_map([], _key_type, _value_type, acc), do: {:ok, Map.new(acc)} + defp cast_map(_kvs, _key_type, _value_type, _acc), do: :error + + @impl true + def embed_as(_, _), do: :self + + @impl true + def equal?(a, b, _), do: a == b 
+ + @impl true + def format(params) do + "#Ch<#{Ch.Types.encode(params)}>" + end + end +end diff --git a/ch/lib/ch/connection.ex b/ch/lib/ch/connection.ex new file mode 100644 index 000000000000..3742f4747862 --- /dev/null +++ b/ch/lib/ch/connection.ex @@ -0,0 +1,427 @@ +defmodule Ch.Connection do + @moduledoc false + use DBConnection + require Logger + alias Ch.{Error, Query, Result} + alias Mint.HTTP1, as: HTTP + + @user_agent "ch/" <> Mix.Project.config()[:version] + + @typep conn :: HTTP.t() + + @impl true + @spec connect([Ch.start_option()]) :: {:ok, conn} | {:error, Error.t() | Mint.Types.error()} + def connect(opts) do + scheme = String.to_existing_atom(opts[:scheme] || "http") + address = opts[:hostname] || "localhost" + port = opts[:port] || 8123 + mint_opts = [mode: :passive] ++ Keyword.take(opts, [:hostname, :transport_opts]) + + with {:ok, conn} <- HTTP.connect(scheme, address, port, mint_opts) do + conn = + conn + |> HTTP.put_private(:timeout, opts[:timeout] || :timer.seconds(15)) + |> maybe_put_private(:database, opts[:database]) + |> maybe_put_private(:username, opts[:username]) + |> maybe_put_private(:password, opts[:password]) + |> maybe_put_private(:settings, opts[:settings]) + + handshake = Query.build("select 1") + params = DBConnection.Query.encode(handshake, _params = [], _opts = []) + + case handle_execute(handshake, params, _opts = [], conn) do + {:ok, handshake, responses, conn} -> + case DBConnection.Query.decode(handshake, responses, _opts = []) do + %Result{rows: [[1]]} -> + {:ok, conn} + + result -> + {:ok, _conn} = HTTP.close(conn) + reason = Error.exception("unexpected result for '#{handshake}': #{inspect(result)}") + {:error, reason} + end + + {:error, reason, conn} -> + {:ok, _conn} = HTTP.close(conn) + {:error, reason} + + {:disconnect, reason, conn} -> + {:ok, _conn} = HTTP.close(conn) + {:error, reason} + end + end + end + + @impl true + @spec ping(conn) :: {:ok, conn} | {:disconnect, Mint.Types.error() | Error.t(), conn} + def 
ping(conn) do + headers = [{"user-agent", @user_agent}] + + case request(conn, "GET", "/ping", headers, _body = "", _opts = []) do + {:ok, conn, _response} -> {:ok, conn} + {:error, error, conn} -> {:disconnect, error, conn} + {:disconnect, _error, _conn} = disconnect -> disconnect + end + end + + @impl true + @spec checkout(conn) :: {:ok, conn} + def checkout(conn), do: {:ok, conn} + + # we "support" these four tx callbacks for Repo.checkout + # even though ClickHouse doesn't support txs + + @impl true + def handle_begin(_opts, conn), do: {:ok, %{}, conn} + @impl true + def handle_commit(_opts, conn), do: {:ok, %{}, conn} + @impl true + def handle_rollback(_opts, conn), do: {:ok, %{}, conn} + @impl true + def handle_status(_opts, conn), do: {:idle, conn} + + @impl true + def handle_prepare(_query, _opts, conn) do + {:error, Error.exception("prepared statements are not supported"), conn} + end + + @impl true + def handle_close(_query, _opts, conn) do + {:error, Error.exception("prepared statements are not supported"), conn} + end + + @impl true + def handle_declare(query, params, opts, conn) do + %Query{command: command} = query + {query_params, extra_headers, body} = params + + path = path(conn, query_params, opts) + headers = headers(conn, extra_headers, opts) + + with {:ok, conn, _ref} <- send_request(conn, "POST", path, headers, body), + {:ok, conn} <- eat_ok_status_and_headers(conn, timeout(conn, opts)) do + {:ok, query, %Result{command: command}, conn} + end + end + + @spec eat_ok_status_and_headers(conn, timeout) :: + {:ok, %{conn: conn, buffer: [Mint.Types.response()]}} + | {:error, Ch.Error.t(), conn} + | {:disconnect, Mint.Types.error(), conn} + defp eat_ok_status_and_headers(conn, timeout) do + case HTTP.recv(conn, 0, timeout) do + {:ok, conn, responses} -> + case eat_ok_status_and_headers(responses) do + {:ok, data} -> + {:ok, %{conn: conn, buffer: data}} + + :more -> + eat_ok_status_and_headers(conn, timeout) + + :error -> + all_responses_result = + 
# Single-pass scan over a batch of Mint responses while waiting for the
# response head: skip a 200 status, surface the first headers, reject any
# non-200 status, ask for more socket input when the batch is exhausted.
defp eat_ok_status_and_headers([{:status, _ref, 200} | rest]), do: eat_ok_status_and_headers(rest)
defp eat_ok_status_and_headers([{:status, _ref, _not_200} | _rest]), do: :error
defp eat_ok_status_and_headers([{:headers, _ref, _headers} | data]), do: {:ok, data}
defp eat_ok_status_and_headers([]), do: :more

# Fetches the next chunk of a streamed response. The first fetch drains the
# buffer left over from handle_declare/4; later fetches read from the socket.
@impl true
def handle_fetch(query, result, opts, %{conn: conn, buffer: buffered}) do
  if buffered == [] do
    handle_fetch(query, result, opts, conn)
  else
    {halt_or_cont(buffered), %Result{result | data: extract_data(buffered)}, conn}
  end
end

def handle_fetch(_query, result, opts, conn) do
  case HTTP.recv(conn, 0, timeout(conn, opts)) do
    {:ok, conn, responses} ->
      {halt_or_cont(responses), %Result{result | data: extract_data(responses)}, conn}

    {:error, conn, reason, _responses} ->
      {:disconnect, reason, conn}
  end
end

# :halt once the trailing {:done, ref} message is present, :cont otherwise.
defp halt_or_cont(responses) do
  case responses do
    [{:done, _ref}] -> :halt
    [_head | rest] -> halt_or_cont(rest)
    [] -> :cont
  end
end

# Keeps only the payload of {:data, ref, binary} messages, in order.
defp extract_data([{:data, _ref, chunk} | rest]), do: [chunk | extract_data(rest)]
defp extract_data([{:done, _ref}]), do: []
defp extract_data([]), do: []
@impl true
def handle_execute(%Query{} = query, {:stream, params}, opts, conn) do
  # First step of a streamed INSERT: open the request with a streaming body.
  {query_params, extra_headers, body} = params
  request_path = path(conn, query_params, opts)
  request_headers = headers(conn, extra_headers, opts)

  with {:ok, conn, ref} <- send_request(conn, "POST", request_path, request_headers, :stream) do
    case HTTP.stream_request_body(conn, ref, body) do
      {:ok, conn} -> {:ok, query, ref, conn}
      {:error, conn, reason} -> {:disconnect, reason, conn}
    end
  end
end

def handle_execute(%Query{} = query, {:stream, ref, body}, opts, conn) do
  # Follow-up step of a streamed INSERT: push one more chunk, or :eof to
  # finish, in which case the full server response is read back.
  case HTTP.stream_request_body(conn, ref, body) do
    {:ok, conn} when body == :eof ->
      with {:ok, conn, responses} <- receive_full_response(conn, timeout(conn, opts)) do
        {:ok, query, responses, conn}
      end

    {:ok, conn} ->
      {:ok, query, ref, conn}

    {:error, conn, reason} ->
      {:disconnect, reason, conn}
  end
end

def handle_execute(%Query{command: :insert} = query, params, opts, conn) do
  {query_params, extra_headers, body} = params
  request_path = path(conn, query_params, opts)
  request_headers = headers(conn, extra_headers, opts)

  # An arity-2 fun is treated as a lazy enumerable body and sent chunked;
  # anything else (iodata) is sent in a single request.
  send_result =
    if is_function(body, 2) do
      request_chunked(conn, "POST", request_path, request_headers, body, opts)
    else
      request(conn, "POST", request_path, request_headers, body, opts)
    end

  with {:ok, conn, responses} <- send_result do
    {:ok, query, responses, conn}
  end
end

def handle_execute(query, params, opts, conn) do
  # Plain (non-insert) statement: POST it and read the whole response.
  {query_params, extra_headers, body} = params
  request_path = path(conn, query_params, opts)
  request_headers = headers(conn, extra_headers, opts)

  with {:ok, conn, responses} <-
         request(conn, "POST", request_path, request_headers, body, opts) do
    {:ok, query, responses, conn}
  end
end

@impl true
def disconnect(_error, conn) do
  # HTTP.close/1 always returns {:ok, conn}; surface the bare :ok to DBConnection.
  {:ok = ok, _conn} = HTTP.close(conn)
  ok
end
@spec request(conn, binary, binary, Mint.Types.headers(), iodata, [Ch.query_option()]) ::
        {:ok, conn, [response]}
        | {:error, Error.t(), conn}
        | {:disconnect, Mint.Types.error(), conn}
defp request(conn, method, path, headers, body, opts) do
  # One-shot request: send everything, then read the complete response.
  with {:ok, conn, _ref} <- send_request(conn, method, path, headers, body) do
    receive_full_response(conn, timeout(conn, opts))
  end
end

@spec request_chunked(conn, binary, binary, Mint.Types.headers(), Enumerable.t(), Keyword.t()) ::
        {:ok, conn, [response]}
        | {:error, Error.t(), conn}
        | {:disconnect, Mint.Types.error(), conn}
def request_chunked(conn, method, path, headers, stream, opts) do
  # Chunked request: open a streaming body, feed the stream, read the response.
  with {:ok, conn, ref} <- send_request(conn, method, path, headers, :stream),
       {:ok, conn} <- stream_body(conn, ref, stream) do
    receive_full_response(conn, timeout(conn, opts))
  end
end

@spec stream_body(conn, Mint.Types.request_ref(), Enumerable.t()) ::
        {:ok, conn} | {:disconnect, Mint.Types.error(), conn}
defp stream_body(conn, ref, stream) do
  # Push every chunk followed by :eof, stopping at the first transport error.
  reduced =
    stream
    |> Stream.concat([:eof])
    |> Enum.reduce_while({:ok, conn}, fn
      chunk, {:ok, conn} -> {:cont, HTTP.stream_request_body(conn, ref, chunk)}
      _chunk, {:error, _conn, _reason} = error -> {:halt, error}
    end)

  case reduced do
    {:ok, _conn} = ok -> ok
    {:error, conn, reason} -> {:disconnect, reason, conn}
  end
end

# Inlined so that failures point at the caller in stacktraces.
@compile inline: [send_request: 5]
defp send_request(conn, method, path, headers, body) do
  case HTTP.request(conn, method, path, headers, body) do
    {:ok, _conn, _ref} = ok -> ok
    {:error, conn, reason} -> {:disconnect, reason, conn}
  end
end
@spec recv_all(conn, [response], timeout()) ::
        {:ok, conn, [response]} | {:disconnect, Mint.Types.error(), conn}
defp recv_all(conn, acc, timeout) do
  # Keep reading from the socket until the {:done, ref} trailer arrives.
  case HTTP.recv(conn, 0, timeout) do
    {:ok, conn, responses} ->
      case handle_all_responses(responses, acc) do
        {:ok, complete} -> {:ok, conn, complete}
        {:more, acc} -> recv_all(conn, acc, timeout)
      end

    {:error, conn, reason, _responses} ->
      {:disconnect, reason, conn}
  end
end

# Strips the Mint tuple wrappers, accumulating payloads in reverse order;
# {:done, ref} terminates, an exhausted batch asks the caller for more.
defp handle_all_responses([{:data, _ref, data} | rest], acc),
  do: handle_all_responses(rest, [data | acc])

defp handle_all_responses([{:status, _ref, status} | rest], acc),
  do: handle_all_responses(rest, [status | acc])

defp handle_all_responses([{:headers, _ref, headers} | rest], acc),
  do: handle_all_responses(rest, [headers | acc])

defp handle_all_responses([{:done, _ref}], acc), do: {:ok, Enum.reverse(acc)}
defp handle_all_responses([], acc), do: {:more, acc}

defp maybe_put_private(conn, _key, nil), do: conn
defp maybe_put_private(conn, key, value), do: HTTP.put_private(conn, key, value)

# Per-call timeout falls back to the one stored at connect time.
defp timeout(conn), do: HTTP.get_private(conn, :timeout)
defp timeout(conn, opts), do: Keyword.get(opts, :timeout) || timeout(conn)

# Per-call settings override connection-level defaults key by key.
defp settings(conn, opts) do
  conn
  |> HTTP.get_private(:settings, [])
  |> Keyword.merge(Keyword.get(opts, :settings, []))
end

# Credentials/database can arrive per call (opts) or per connection (privates);
# explicit headers supplied by the caller always win.
defp headers(conn, extra_headers, opts) do
  extra_headers
  |> maybe_put_new_header("x-clickhouse-user", get_opts_or_private(conn, opts, :username))
  |> maybe_put_new_header("x-clickhouse-key", get_opts_or_private(conn, opts, :password))
  |> maybe_put_new_header("x-clickhouse-database", get_opts_or_private(conn, opts, :database))
  |> maybe_put_new_header("user-agent", @user_agent)
end
defp maybe_put_new_header(headers, _name, nil), do: headers

defp maybe_put_new_header(headers, name, value) do
  # Respect a caller-supplied header of the same name.
  case List.keymember?(headers, name, 0) do
    true -> headers
    false -> [{name, value} | headers]
  end
end

# nil when the header is absent, its value otherwise.
defp get_header(headers, key) do
  case List.keyfind(headers, key, 0) do
    {^key, value} -> value
    nil -> nil
  end
end

defp path(conn, query_params, opts) do
  "/?" <> URI.encode_query(settings(conn, opts) ++ query_params)
end

@server_display_name_key :server_display_name

# Warns when a pooled connection suddenly answers from a different ClickHouse
# node (e.g. behind a load balancer), which can destabilize pooling.
@spec ensure_same_server(conn, Mint.Types.headers()) :: conn
defp ensure_same_server(conn, headers) do
  known = HTTP.get_private(conn, @server_display_name_key)
  seen = get_header(headers, "x-clickhouse-server-display-name")

  cond do
    is_nil(seen) ->
      conn

    is_nil(known) ->
      # First response: remember which server we are talking to.
      HTTP.put_private(conn, @server_display_name_key, seen)

    seen == known ->
      conn

    true ->
      Logger.warning(
        "Server mismatch detected. Expected #{inspect(known)} but got #{inspect(seen)}!" <>
          " Connection pooling might be unstable."
      )

      conn
  end
end
defstruct [:statement, :command, :encode, :decode]

@type t :: %__MODULE__{statement: iodata, command: command, encode: boolean, decode: boolean}

@doc false
@spec build(iodata, [Ch.query_option()]) :: t
def build(statement, opts \\ []) do
  # :command can be forced via opts; otherwise it is sniffed from the SQL text.
  command = Keyword.get(opts, :command) || extract_command(statement)
  encode = Keyword.get(opts, :encode, true)
  decode = Keyword.get(opts, :decode, true)
  %__MODULE__{statement: statement, command: command, encode: encode, decode: decode}
end

statements = [
  {"SELECT", :select},
  {"INSERT", :insert},
  {"CREATE", :create},
  {"ALTER", :alter},
  {"DELETE", :delete},
  {"SYSTEM", :system},
  {"SHOW", :show},
  # as of clickhouse 22.8, WITH is only allowed in SELECT
  # https://clickhouse.com/docs/en/sql-reference/statements/select/with/
  {"WITH", :select},
  {"GRANT", :grant},
  {"EXPLAIN", :explain},
  {"REVOKE", :revoke},
  {"ATTACH", :attach},
  {"CHECK", :check},
  {"DESCRIBE", :describe},
  {"DETACH", :detach},
  {"DROP", :drop},
  {"EXISTS", :exists},
  {"KILL", :kill},
  {"OPTIMIZE", :optimize},
  {"RENAME", :rename},
  {"EXCHANGE", :exchange},
  {"SET", :set},
  {"TRUNCATE", :truncate},
  {"USE", :use},
  {"WATCH", :watch}
]

command_union =
  statements
  |> Enum.map(fn {_, command} -> command end)
  |> Enum.reduce(&{:|, [], [&1, &2]})

@type command :: unquote(command_union)

defp extract_command(statement)

for {statement, command} <- statements do
  defp extract_command(unquote(statement) <> _), do: unquote(command)
  defp extract_command(unquote(String.downcase(statement)) <> _), do: unquote(command)
end

# Skip leading whitespace before sniffing the verb. Includes ?\r so that
# CRLF-formatted SQL ("\r\nSELECT ...") is classified the same as "\nSELECT".
defp extract_command(<<whitespace, rest::binary>>) when whitespace in [?\s, ?\t, ?\n, ?\r] do
  extract_command(rest)
end

# iodata: try the first binary segment, fall back to flattening everything.
defp extract_command([first_segment | _] = statement) do
  extract_command(first_segment) || extract_command(IO.iodata_to_binary(statement))
end

defp extract_command(_other), do: nil
@spec parse(Query.t(), [Ch.query_option()]) :: Query.t()
def parse(query, _opts), do: query

@spec describe(Query.t(), [Ch.query_option()]) :: Query.t()
def describe(query, _opts), do: query

# Streamed insert, step 1: wrap the regular encoding in a :stream marker.
@spec encode(Query.t(), {:stream, term}, [Ch.query_option()]) ::
        {:stream, {[{String.t(), String.t()}], Mint.Types.headers(), iodata}}
def encode(query, {:stream, params}, opts), do: {:stream, encode(query, params, opts)}

# Streamed insert, later steps: body chunks pass through untouched.
@spec encode(Query.t(), {:stream, Mint.Types.request_ref(), iodata | :eof}, [Ch.query_option()]) ::
        {:stream, Mint.Types.request_ref(), iodata | :eof}
def encode(_query, {:stream, ref, data}, _opts), do: {:stream, ref, data}

@spec encode(Query.t(), params, [Ch.query_option()]) ::
        {query_params, Mint.Types.headers(), body}
      when params: map | [term] | [row :: [term]] | iodata | Enumerable.t(),
           query_params: [{String.t(), String.t()}],
           body: iodata | Enumerable.t()

# :encode false — the caller already supplies the wire-format payload as-is.
def encode(%Query{command: :insert, encode: false, statement: statement}, data, opts) do
  body =
    if is_list(data) or is_binary(data) do
      [statement, ?\n | data]
    else
      Stream.concat([[statement, ?\n]], data)
    end

  {[], headers(opts), body}
end

def encode(%Query{command: :insert, statement: statement}, params, opts) do
  cond do
    names = Keyword.get(opts, :names) ->
      # Explicit column names: emit a RowBinaryWithNamesAndTypes header first.
      types = Keyword.fetch!(opts, :types)
      header = RowBinary.encode_names_and_types(names, types)
      rows = RowBinary.encode_rows(params, types)
      {[], headers(opts), [statement, ?\n, header | rows]}

    format_row_binary?(statement) ->
      types = Keyword.fetch!(opts, :types)
      rows = RowBinary.encode_rows(params, types)
      {[], headers(opts), [statement, ?\n | rows]}

    true ->
      # Not RowBinary: pass params as ClickHouse HTTP query parameters instead.
      {query_params(params), headers(opts), statement}
  end
end
# True when the statement ends in "... FORMAT RowBinary" (modulo trailing ws).
defp format_row_binary?(statement) when is_binary(statement) do
  statement |> String.trim_trailing() |> String.ends_with?("RowBinary")
end

defp format_row_binary?(statement) when is_list(statement) do
  format_row_binary?(IO.iodata_to_binary(statement))
end

# Streaming passthroughs: results and request refs are already final.
@spec decode(Query.t(), result, [Ch.query_option()]) :: result when result: Result.t()
def decode(_query, %Result{} = result, _opts), do: result

@spec decode(Query.t(), ref, [Ch.query_option()]) :: ref when ref: Mint.Types.request_ref()
def decode(_query, ref, _opts) when is_reference(ref), do: ref

@spec decode(Query.t(), [response], [Ch.query_option()]) :: Result.t()
      when response: Mint.Types.status() | Mint.Types.headers() | binary
def decode(%Query{command: :insert}, responses, _opts) do
  [_status, headers | _data] = responses

  # The written row count comes from the JSON x-clickhouse-summary header,
  # when the server sends one; otherwise num_rows stays nil.
  num_rows =
    case get_header(headers, "x-clickhouse-summary") do
      nil ->
        nil

      summary ->
        case Jason.decode!(summary) do
          %{"written_rows" => written_rows} when written_rows != nil ->
            String.to_integer(written_rows)

          _ ->
            nil
        end
    end

  %Result{num_rows: num_rows, rows: nil, command: :insert, headers: headers}
end

# :decode false — expose the raw data chunks without RowBinary decoding.
def decode(%Query{decode: false, command: command}, responses, _opts) when is_list(responses) do
  # TODO potentially fails on x-progress-headers
  [_status, headers | data] = responses
  %Result{rows: data, data: data, command: command, headers: headers}
end
# nil when the header is absent, its value otherwise.
defp get_header(headers, key) do
  case List.keyfind(headers, key, 0) do
    {^key, value} -> value
    nil -> nil
  end
end

# Named params become param_name=..., positional ones param_$0, param_$1, ...
defp query_params(params) when is_map(params) do
  for {name, value} <- params, do: {"param_#{name}", encode_param(value)}
end

defp query_params(params) when is_list(params) do
  for {value, index} <- Enum.with_index(params), do: {"param_$#{index}", encode_param(value)}
end

defp encode_param(i) when is_integer(i), do: Integer.to_string(i)
defp encode_param(f) when is_float(f), do: Float.to_string(f)

# Strings use ClickHouse's "escaped" text format, which is the TabSeparated
# escaping; see https://clickhouse.com/docs/en/interfaces/http#tabs-in-url-parameters
# and https://clickhouse.com/docs/en/interfaces/formats#tabseparated-data-formatting
defp encode_param(s) when is_binary(s) do
  escape_param([{"\\", "\\\\"}, {"\t", "\\\t"}, {"\n", "\\\n"}], s)
end

defp encode_param(b) when is_boolean(b), do: Atom.to_string(b)
defp encode_param(%Decimal{} = dec), do: Decimal.to_string(dec, :normal)
defp encode_param(%Date{} = date), do: Date.to_iso8601(date)
defp encode_param(%NaiveDateTime{} = naive), do: NaiveDateTime.to_iso8601(naive)
defp encode_param(%DateTime{} = dt) do
  raise ArgumentError, "non-UTC timezones are not supported for encoding: #{dt}"
end

# Composite params are rendered in ClickHouse literal syntax:
# tuples as (a,b), arrays as [a,b], maps as {k:v}.
defp encode_param(tuple) when is_tuple(tuple) do
  IO.iodata_to_binary([?(, encode_array_params(Tuple.to_list(tuple)), ?)])
end

defp encode_param(list) when is_list(list) do
  IO.iodata_to_binary([?[, encode_array_params(list), ?]])
end

defp encode_param(map) when is_map(map) do
  IO.iodata_to_binary([?{, encode_map_params(Map.to_list(map)), ?}])
end

# Comma-joins without a trailing separator.
defp encode_array_params([only]), do: encode_array_param(only)

defp encode_array_params([head | rest]),
  do: [encode_array_param(head), ?, | encode_array_params(rest)]

defp encode_array_params([]), do: []

defp encode_map_params([only]), do: encode_map_param(only)

defp encode_map_params([head | rest]),
  do: [encode_map_param(head), ?, | encode_map_params(rest)]

defp encode_map_params([]), do: []

# Inside composites, strings and date-like values are single-quoted.
defp encode_array_param(s) when is_binary(s) do
  [?', escape_param([{"'", "''"}, {"\\", "\\\\"}], s), ?']
end

defp encode_array_param(%struct{} = param) when struct in [Date, NaiveDateTime] do
  [?', encode_param(param), ?']
end

defp encode_array_param(other), do: encode_param(other)

defp encode_map_param({key, value}) do
  [encode_array_param(key), ?:, encode_array_param(value)]
end

# Applies each {pattern, replacement} pair in order.
defp escape_param(escapes, param) do
  Enum.reduce(escapes, param, fn {pattern, replacement}, acc ->
    String.replace(acc, pattern, replacement)
  end)
end

@spec headers(Keyword.t()) :: Mint.Types.headers()
defp headers(opts), do: Keyword.get(opts, :headers, [])
end

defimpl String.Chars, for: Ch.Query do
  # Renders the query as its SQL statement text.
  def to_string(%{statement: statement}) do
    IO.iodata_to_binary(statement)
  end
end
defmodule Ch.Result do
  @moduledoc """
  The struct produced by every successful query.

  Fields:

    * `command` - atom naming the executed command, e.g. `:select`, `:insert`
    * `num_rows` - number of fetched or affected rows (when known)
    * `rows` - decoded rows as a list of lists (one inner list per row,
      one element per column), raw iodata, or `nil`
    * `headers` - the HTTP response headers returned by ClickHouse
    * `data` - the raw iodata from the response
  """

  defstruct [:command, :num_rows, :rows, :headers, :data]

  @type t :: %__MODULE__{
          command: Ch.Query.command(),
          num_rows: non_neg_integer | nil,
          rows: [[term]] | iodata | nil,
          headers: Mint.Types.headers(),
          data: iodata
        }
end
@doc """
Encodes a single row to [`RowBinary`](https://clickhouse.com/docs/en/sql-reference/formats#rowbinary) as iodata.

Examples:

    iex> encode_row([], [])
    []

    iex> encode_row([1], ["UInt8"])
    [1]

    iex> encode_row([3, "hello"], ["UInt8", "String"])
    [3, [5 | "hello"]]

"""
def encode_row(row, types) do
  _encode_row(row, encoding_types(types))
end

defp _encode_row([value | values], [type | types]),
  do: [encode(type, value) | _encode_row(values, types)]

defp _encode_row([], []), do: []

@doc """
Encodes multiple rows to [`RowBinary`](https://clickhouse.com/docs/en/sql-reference/formats#rowbinary) as iodata.

Examples:

    iex> encode_rows([], [])
    []

    iex> encode_rows([[1]], ["UInt8"])
    [1]

    iex> encode_rows([[3, "hello"], [4, "hi"]], ["UInt8", "String"])
    [3, [5 | "hello"], 4, [2 | "hi"]]

"""
def encode_rows(rows, types) do
  _encode_rows(rows, encoding_types(types))
end

@doc false
def _encode_rows([row | rows], types), do: _encode_rows(row, types, rows, types)
def _encode_rows([], _types), do: []

# Walks one row with the per-column types, then restarts on the next row
# with the full type list.
defp _encode_rows([value | values], [type | types], rows, all_types) do
  [encode(type, value) | _encode_rows(values, types, rows, all_types)]
end

defp _encode_rows([], [], rows, all_types), do: _encode_rows(rows, all_types)

# Normalizes user-supplied types (strings or decoded tuples) into the
# internal representation consumed by encode/2.
@doc false
def encoding_types([type | types]), do: [encoding_type(type) | encoding_types(types)]
def encoding_types([]), do: []

defp encoding_type(type) when is_binary(type), do: encoding_type(Ch.Types.decode(type))

defp encoding_type(t)
     when t in [
            :string,
            :binary,
            :boolean,
            :uuid,
            :date,
            :datetime,
            :date32,
            :ipv4,
            :ipv6,
            :point,
            :nothing
          ],
     do: t

# Only UTC DateTime columns can be encoded.
defp encoding_type({:datetime = d, "UTC"}), do: d

defp encoding_type({:datetime, tz}) do
  raise ArgumentError, "can't encode DateTime with non-UTC timezone: #{inspect(tz)}"
end

defp encoding_type({:fixed_string, _len} = t), do: t

for size <- [8, 16, 32, 64, 128, 256] do
  defp encoding_type(unquote(:"u#{size}") = u), do: u
  defp encoding_type(unquote(:"i#{size}") = i), do: i
end
defp encoding_type({:array = a, t}), do: {a, encoding_type(t)}

defp encoding_type({:tuple = t, ts}), do: {t, Enum.map(ts, &encoding_type/1)}

defp encoding_type({:map = m, kt, vt}), do: {m, encoding_type(kt), encoding_type(vt)}

defp encoding_type({:nullable = n, t}), do: {n, encoding_type(t)}

# LowCardinality is a storage-level detail; encode the inner type directly.
defp encoding_type({:low_cardinality, t}), do: encoding_type(t)

# Decimal(p, s) maps onto the narrowest DecimalN wide enough for the precision.
defp encoding_type({:decimal, precision, scale}) do
  case decimal_size(precision) do
    32 -> {:decimal32, scale}
    64 -> {:decimal64, scale}
    128 -> {:decimal128, scale}
    256 -> {:decimal256, scale}
  end
end

defp encoding_type({d, _scale} = t)
     when d in [:decimal32, :decimal64, :decimal128, :decimal256],
     do: t

defp encoding_type({:datetime64 = t, p}), do: {t, time_unit(p)}

# Only UTC DateTime64 columns can be encoded.
defp encoding_type({:datetime64 = t, p, "UTC"}), do: {t, time_unit(p)}

defp encoding_type({:datetime64, _p, tz}) do
  raise ArgumentError, "can't encode DateTime64 with non-UTC timezone: #{inspect(tz)}"
end

defp encoding_type({e, mappings}) when e in [:enum8, :enum16], do: {e, Map.new(mappings)}

defp encoding_type({:simple_aggregate_function, _f, t}), do: encoding_type(t)

# Geo types are sugar over nested point arrays.
defp encoding_type(:ring), do: {:array, :point}
defp encoding_type(:polygon), do: {:array, {:array, :point}}
defp encoding_type(:multipolygon), do: {:array, {:array, {:array, :point}}}

defp encoding_type(type) do
  raise ArgumentError, "unsupported type for encoding: #{inspect(type)}"
end

@doc false
def encode(type, value)

# LEB128-style varint: values under 128 fit in a single byte.
def encode(:varint, i) when is_integer(i) and i < 128, do: i
def encode(:varint, i) when is_integer(i), do: encode_varint_cont(i)

# Strings are varint-length-prefixed bytes; nil encodes as the empty string.
def encode(type, str) when type in [:string, :binary] do
  case str do
    _ when is_binary(str) -> [encode(:varint, byte_size(str)) | str]
    _ when is_list(str) -> [encode(:varint, IO.iodata_length(str)) | str]
    nil -> 0
  end
end
size do + str + end + + def encode({:fixed_string, size}, str) when byte_size(str) < size do + to_pad = size - byte_size(str) + [str | <<0::size(to_pad * 8)>>] + end + + def encode({:fixed_string, size}, nil), do: <<0::size(size * 8)>> + + def encode(:u8, u) when is_integer(u), do: u + def encode(:u8, nil), do: 0 + + def encode(:i8, i) when is_integer(i) and i >= 0, do: i + def encode(:i8, i) when is_integer(i), do: <> + def encode(:i8, nil), do: 0 + + for size <- [16, 32, 64, 128, 256] do + def encode(unquote(:"u#{size}"), u) when is_integer(u) do + <> + end + + def encode(unquote(:"i#{size}"), i) when is_integer(i) do + <> + end + + def encode(unquote(:"u#{size}"), nil), do: <<0::unquote(size)>> + def encode(unquote(:"i#{size}"), nil), do: <<0::unquote(size)>> + end + + for size <- [32, 64] do + type = :"f#{size}" + + def encode(unquote(type), f) when is_number(f) do + <> + end + + def encode(unquote(type), nil), do: <<0::unquote(size)>> + end + + def encode({:decimal, precision, scale}, decimal) do + type = + case decimal_size(precision) do + 32 -> :decimal32 + 64 -> :decimal64 + 128 -> :decimal128 + 256 -> :decimal256 + end + + encode({type, scale}, decimal) + end + + for size <- [32, 64, 128, 256] do + type = :"decimal#{size}" + + def encode({unquote(type), scale} = t, %Decimal{sign: sign, coef: coef, exp: exp} = d) do + cond do + scale == -exp -> + i = sign * coef + <> + + exp >= 0 -> + i = sign * coef * round(:math.pow(10, exp + scale)) + <> + + true -> + encode(t, Decimal.round(d, scale)) + end + end + + def encode({unquote(type), _scale}, nil), do: <<0::unquote(size)>> + end + + def encode(:boolean, true), do: 1 + def encode(:boolean, false), do: 0 + def encode(:boolean, nil), do: 0 + + def encode({:array, type}, [_ | _] = l) do + [encode(:varint, length(l)) | encode_many(l, type)] + end + + def encode({:array, _type}, []), do: 0 + def encode({:array, _type}, nil), do: 0 + + def encode({:map, k, v}, [_ | _] = m) do + [encode(:varint, length(m)) | 
encode_many_kv(m, k, v)] + end + + def encode({:map, _k, _v} = t, m) when is_map(m), do: encode(t, Map.to_list(m)) + def encode({:map, _k, _v}, []), do: 0 + def encode({:map, _k, _v}, nil), do: 0 + + def encode({:tuple, _types} = t, v) when is_tuple(v) do + encode(t, Tuple.to_list(v)) + end + + def encode({:tuple, types}, values) when is_list(types) and is_list(values) do + encode_row(values, types) + end + + def encode({:tuple, types}, nil) when is_list(types) do + Enum.map(types, fn type -> encode(type, nil) end) + end + + def encode(:datetime, %NaiveDateTime{} = datetime) do + <> + end + + def encode(:datetime, %DateTime{time_zone: "Etc/UTC"} = datetime) do + <> + end + + def encode(:datetime, %DateTime{} = datetime) do + raise ArgumentError, "non-UTC timezones are not supported for encoding: #{datetime}" + end + + def encode(:datetime, nil), do: <<0::32>> + + def encode({:datetime64, time_unit}, %NaiveDateTime{} = datetime) do + <> + end + + def encode({:datetime64, time_unit}, %DateTime{time_zone: "Etc/UTC"} = datetime) do + <> + end + + def encode({:datetime64, _precision}, %DateTime{} = datetime) do + raise ArgumentError, "non-UTC timezones are not supported for encoding: #{datetime}" + end + + def encode({:datetime64, _precision}, nil), do: <<0::64>> + + def encode(:date, %Date{} = date) do + <> + end + + def encode(:date, nil), do: <<0::16>> + + def encode(:date32, %Date{} = date) do + <> + end + + def encode(:date32, nil), do: <<0::32>> + + def encode(:uuid, <>), do: <> + + def encode( + :uuid, + <> + ) do + raw = + <> + + encode(:uuid, raw) + end + + def encode(:uuid, nil), do: <<0::128>> + + def encode(:ipv4, {a, b, c, d}), do: [d, c, b, a] + def encode(:ipv4, nil), do: <<0::32>> + + def encode(:ipv6, {b1, b2, b3, b4, b5, b6, b7, b8}) do + <> + end + + def encode(:ipv6, <<_::128>> = encoded), do: encoded + def encode(:ipv6, nil), do: <<0::128>> + + def encode(:point, {x, y}), do: [encode(:f64, x) | encode(:f64, y)] + def encode(:point, nil), do: 
<<0::128>> + def encode(:ring, points), do: encode({:array, :point}, points) + def encode(:polygon, rings), do: encode({:array, :ring}, rings) + def encode(:multipolygon, polygons), do: encode({:array, :polygon}, polygons) + + # TODO enum8 and enum16 nil + for size <- [8, 16] do + enum_t = :"enum#{size}" + int_t = :"i#{size}" + + def encode({unquote(enum_t), mapping}, e) do + i = + case e do + _ when is_integer(e) -> + e + + _ when is_binary(e) -> + case Map.fetch(mapping, e) do + {:ok, res} -> + res + + :error -> + raise ArgumentError, + "enum value #{inspect(e)} not found in mapping: #{inspect(mapping)}" + end + end + + encode(unquote(int_t), i) + end + end + + def encode({:nullable, _type}, nil), do: 1 + + def encode({:nullable, type}, value) do + case encode(type, value) do + e when is_list(e) or is_binary(e) -> [0 | e] + e -> [0, e] + end + end + + defp encode_varint_cont(i) when i < 128, do: <> + + defp encode_varint_cont(i) do + [(i &&& 0b0111_1111) ||| 0b1000_0000 | encode_varint_cont(i >>> 7)] + end + + defp encode_many([el | rest], type), do: [encode(type, el) | encode_many(rest, type)] + defp encode_many([] = done, _type), do: done + + defp encode_many_kv([{key, value} | rest], key_type, value_type) do + [ + encode(key_type, key), + encode(value_type, value) + | encode_many_kv(rest, key_type, value_type) + ] + end + + defp encode_many_kv([] = done, _key_type, _value_type), do: done + + @compile {:inline, d: 1} + + defp d(?0), do: 0 + defp d(?1), do: 1 + defp d(?2), do: 2 + defp d(?3), do: 3 + defp d(?4), do: 4 + defp d(?5), do: 5 + defp d(?6), do: 6 + defp d(?7), do: 7 + defp d(?8), do: 8 + defp d(?9), do: 9 + defp d(?A), do: 10 + defp d(?B), do: 11 + defp d(?C), do: 12 + defp d(?D), do: 13 + defp d(?E), do: 14 + defp d(?F), do: 15 + defp d(?a), do: 10 + defp d(?b), do: 11 + defp d(?c), do: 12 + defp d(?d), do: 13 + defp d(?e), do: 14 + defp d(?f), do: 15 + + @doc """ + Decodes 
[`RowBinaryWithNamesAndTypes`](https://clickhouse.com/docs/en/sql-reference/formats#rowbinarywithnamesandtypes) into rows. + + Example: + + iex> decode_rows(<<1, 3, "1+1"::bytes, 5, "UInt8"::bytes, 2>>) + [[2]] + + """ + def decode_rows(row_binary_with_names_and_types) + def decode_rows(<>), do: skip_names(rest, cols, cols) + def decode_rows(<<>>), do: [] + + @doc """ + Decodes [`RowBinary`](https://clickhouse.com/docs/en/sql-reference/formats#rowbinary) into rows. + + Example: + + iex> decode_rows(<<1>>, ["UInt8"]) + [[1]] + + """ + def decode_rows(row_binary, types) + def decode_rows(<<>>, _types), do: [] + + def decode_rows(<>, types) do + types = decoding_types(types) + decode_rows(types, data, [], [], types) + end + + @doc false + def decoding_types([type | types]) do + [decoding_type(type) | types] + end + + def decoding_types([] = done), do: done + + defp decoding_type(t) when is_binary(t) do + decoding_type(Ch.Types.decode(t)) + end + + defp decoding_type(t) + when t in [ + :string, + :binary, + :boolean, + :uuid, + :date, + :date32, + :ipv4, + :ipv6, + :point, + :nothing + ], + do: t + + defp decoding_type({:datetime, _tz} = t), do: t + defp decoding_type({:fixed_string, _len} = t), do: t + + for size <- [8, 16, 32, 64, 128, 256] do + defp decoding_type(unquote(:"u#{size}") = u), do: u + defp decoding_type(unquote(:"i#{size}") = i), do: i + end + + for size <- [32, 64] do + defp decoding_type(unquote(:"f#{size}") = f), do: f + end + + defp decoding_type(:datetime = t), do: {t, _tz = nil} + + defp decoding_type({:array = a, t}), do: {a, decoding_type(t)} + + defp decoding_type({:tuple = t, ts}) do + {t, Enum.map(ts, &decoding_type/1)} + end + + defp decoding_type({:map = m, kt, vt}) do + {m, decoding_type(kt), decoding_type(vt)} + end + + defp decoding_type({:nullable = n, t}), do: {n, decoding_type(t)} + defp decoding_type({:low_cardinality, t}), do: decoding_type(t) + + defp decoding_type({:decimal = t, p, s}), do: {t, decimal_size(p), s} + defp 
decoding_type({:decimal32, s}), do: {:decimal, 32, s} + defp decoding_type({:decimal64, s}), do: {:decimal, 64, s} + defp decoding_type({:decimal128, s}), do: {:decimal, 128, s} + defp decoding_type({:decimal256, s}), do: {:decimal, 256, s} + + defp decoding_type({:datetime64 = t, p}), do: {t, time_unit(p), _tz = nil} + defp decoding_type({:datetime64 = t, p, tz}), do: {t, time_unit(p), tz} + + defp decoding_type({e, mappings}) when e in [:enum8, :enum16] do + {e, Map.new(mappings, fn {k, v} -> {v, k} end)} + end + + defp decoding_type({:simple_aggregate_function, _f, t}), do: decoding_type(t) + + defp decoding_type(:ring), do: {:array, :point} + defp decoding_type(:polygon), do: {:array, {:array, :point}} + defp decoding_type(:multipolygon), do: {:array, {:array, {:array, :point}}} + + defp decoding_type(type) do + raise ArgumentError, "unsupported type for decoding: #{inspect(type)}" + end + + defp skip_names(<>, 0, count), do: decode_types(rest, count, _acc = []) + + varints = [ + {_pattern = quote(do: <<0::1, v1::7>>), _value = quote(do: v1)}, + {quote(do: <<1::1, v1::7, 0::1, v2::7>>), quote(do: (v2 <<< 7) + v1)}, + {quote(do: <<1::1, v1::7, 1::1, v2::7, 0::1, v3::7>>), + quote(do: (v3 <<< 14) + (v2 <<< 7) + v1)}, + {quote(do: <<1::1, v1::7, 1::1, v2::7, 1::1, v3::7, 0::1, v4::7>>), + quote(do: (v4 <<< 21) + (v3 <<< 14) + (v2 <<< 7) + v1)}, + {quote(do: <<1::1, v1::7, 1::1, v2::7, 1::1, v3::7, 1::1, v4::7, 0::1, v5::7>>), + quote(do: (v5 <<< 28) + (v4 <<< 21) + (v3 <<< 14) + (v2 <<< 7) + v1)}, + {quote(do: <<1::1, v1::7, 1::1, v2::7, 1::1, v3::7, 1::1, v4::7, 1::1, v5::7, 0::1, v6::7>>), + quote(do: (v6 <<< 35) + (v5 <<< 28) + (v4 <<< 21) + (v3 <<< 14) + (v2 <<< 7) + v1)}, + {quote do + <<1::1, v1::7, 1::1, v2::7, 1::1, v3::7, 1::1, v4::7, 1::1, v5::7, 1::1, v6::7, 0::1, + v7::7>> + end, + quote do + (v7 <<< 42) + (v6 <<< 35) + (v5 <<< 28) + (v4 <<< 21) + (v3 <<< 14) + (v2 <<< 7) + v1 + end}, + {quote do + <<1::1, v1::7, 1::1, v2::7, 1::1, v3::7, 1::1, v4::7, 
1::1, v5::7, 1::1, v6::7, 1::1, + v7::7, 0::1, v8::7>> + end, + quote do + (v8 <<< 49) + (v7 <<< 42) + (v6 <<< 35) + (v5 <<< 28) + (v4 <<< 21) + (v3 <<< 14) + + (v2 <<< 7) + v1 + end} + ] + + for {pattern, value} <- varints do + defp skip_names(<>, left, count) do + skip_names(rest, left - 1, count) + end + end + + defp decode_types(<<>>, 0, _types), do: [] + + defp decode_types(<>, 0, types) do + types = types |> decode_types() |> :lists.reverse() + decode_rows(types, rest, _row = [], _rows = [], types) + end + + defp decode_types(<>, count, acc) do + decode_types(rest, count - 1, [type | acc]) + end + + @doc false + def decode_types([type | types]) do + [decoding_type(Ch.Types.decode(type)) | decode_types(types)] + end + + def decode_types([] = done), do: done + + @compile inline: [decode_string_decode_rows: 5] + + for {pattern, size} <- varints do + defp decode_string_decode_rows( + <>, + types_rest, + row, + rows, + types + ) do + decode_rows(types_rest, bin, [to_utf8(s) | row], rows, types) + end + end + + @doc false + def to_utf8(str) do + utf8 = to_utf8(str, 0, 0, str, []) + IO.iodata_to_binary(utf8) + end + + @dialyzer {:no_improper_lists, to_utf8: 5, to_utf8_escape: 5} + + defp to_utf8(<>, from, len, original, acc) do + to_utf8(rest, from, len + utf8_size(valid), original, acc) + end + + defp to_utf8(<<_invalid, rest::bytes>>, from, len, original, acc) do + acc = [acc | binary_part(original, from, len)] + to_utf8_escape(rest, from + len, 1, original, acc) + end + + defp to_utf8(<<>>, from, len, original, acc) do + [acc | binary_part(original, from, len)] + end + + defp to_utf8_escape(<>, from, len, original, acc) do + acc = [acc | "�"] + to_utf8(rest, from + len, utf8_size(valid), original, acc) + end + + defp to_utf8_escape(<<_invalid, rest::bytes>>, from, len, original, acc) do + to_utf8_escape(rest, from, len + 1, original, acc) + end + + defp to_utf8_escape(<<>>, _from, _len, _original, acc) do + [acc | "�"] + end + + # UTF-8 encodes code points in one 
to four bytes + @compile inline: [utf8_size: 1] + defp utf8_size(codepoint) when codepoint <= 0x7F, do: 1 + defp utf8_size(codepoint) when codepoint <= 0x7FF, do: 2 + defp utf8_size(codepoint) when codepoint <= 0xFFFF, do: 3 + defp utf8_size(codepoint) when codepoint <= 0x10FFFF, do: 4 + + @compile inline: [decode_binary_decode_rows: 5] + + for {pattern, size} <- varints do + defp decode_binary_decode_rows( + <>, + types_rest, + row, + rows, + types + ) do + decode_rows(types_rest, bin, [s | row], rows, types) + end + end + + @compile inline: [decode_array_decode_rows: 6] + defp decode_array_decode_rows(<<0, bin::bytes>>, _type, types_rest, row, rows, types) do + decode_rows(types_rest, bin, [[] | row], rows, types) + end + + for {pattern, size} <- varints do + defp decode_array_decode_rows( + <>, + type, + types_rest, + row, + rows, + types + ) do + array_types = List.duplicate(type, unquote(size)) + types_rest = array_types ++ [{:array_over, row} | types_rest] + decode_rows(types_rest, bin, [], rows, types) + end + end + + @compile inline: [decode_map_decode_rows: 7] + defp decode_map_decode_rows( + <<0, bin::bytes>>, + _key_type, + _value_type, + types_rest, + row, + rows, + types + ) do + decode_rows(types_rest, bin, [%{} | row], rows, types) + end + + for {pattern, size} <- varints do + defp decode_map_decode_rows( + <>, + key_type, + value_type, + types_rest, + row, + rows, + types + ) do + types_rest = + map_types(unquote(size), key_type, value_type) ++ [{:map_over, row} | types_rest] + + decode_rows(types_rest, bin, [], rows, types) + end + end + + defp map_types(count, key_type, value_type) when count > 0 do + [key_type, value_type | map_types(count - 1, key_type, value_type)] + end + + defp map_types(0, _key_type, _value_types), do: [] + + defp decode_rows([type | types_rest], <>, row, rows, types) do + case type do + :u8 -> + <> = bin + decode_rows(types_rest, bin, [u | row], rows, types) + + :u16 -> + <> = bin + decode_rows(types_rest, bin, [u | row], 
rows, types) + + :u32 -> + <> = bin + decode_rows(types_rest, bin, [u | row], rows, types) + + :u64 -> + <> = bin + decode_rows(types_rest, bin, [u | row], rows, types) + + :u128 -> + <> = bin + decode_rows(types_rest, bin, [u | row], rows, types) + + :u256 -> + <> = bin + decode_rows(types_rest, bin, [u | row], rows, types) + + :i8 -> + <> = bin + decode_rows(types_rest, bin, [i | row], rows, types) + + :i16 -> + <> = bin + decode_rows(types_rest, bin, [i | row], rows, types) + + :i32 -> + <> = bin + decode_rows(types_rest, bin, [i | row], rows, types) + + :i64 -> + <> = bin + decode_rows(types_rest, bin, [i | row], rows, types) + + :i128 -> + <> = bin + decode_rows(types_rest, bin, [i | row], rows, types) + + :i256 -> + <> = bin + decode_rows(types_rest, bin, [i | row], rows, types) + + :f32 -> + case bin do + <> -> + decode_rows(types_rest, bin, [f | row], rows, types) + + <<_nan_or_inf::32, bin::bytes>> -> + decode_rows(types_rest, bin, [nil | row], rows, types) + end + + :f64 -> + case bin do + <> -> + decode_rows(types_rest, bin, [f | row], rows, types) + + <<_nan_or_inf::64, bin::bytes>> -> + decode_rows(types_rest, bin, [nil | row], rows, types) + end + + :string -> + decode_string_decode_rows(bin, types_rest, row, rows, types) + + :binary -> + decode_binary_decode_rows(bin, types_rest, row, rows, types) + + # TODO utf8? 
+ {:fixed_string, size} -> + <> = bin + decode_rows(types_rest, bin, [s | row], rows, types) + + :boolean -> + case bin do + <<0, bin::bytes>> -> decode_rows(types_rest, bin, [false | row], rows, types) + <<1, bin::bytes>> -> decode_rows(types_rest, bin, [true | row], rows, types) + end + + :uuid -> + <> = bin + uuid = <> + decode_rows(types_rest, bin, [uuid | row], rows, types) + + :date -> + <> = bin + decode_rows(types_rest, bin, [Date.add(@epoch_date, d) | row], rows, types) + + :date32 -> + <> = bin + decode_rows(types_rest, bin, [Date.add(@epoch_date, d) | row], rows, types) + + {:datetime, timezone} -> + <> = bin + + dt = + case timezone do + nil -> NaiveDateTime.add(@epoch_naive_datetime, s) + "UTC" -> DateTime.from_unix!(s) + _ -> s |> DateTime.from_unix!() |> DateTime.shift_zone!(timezone) + end + + decode_rows(types_rest, bin, [dt | row], rows, types) + + {:decimal, size, scale} -> + <> = bin + sign = if val < 0, do: -1, else: 1 + d = Decimal.new(sign, abs(val), -scale) + decode_rows(types_rest, bin, [d | row], rows, types) + + {:nullable, type} -> + case bin do + <<1, bin::bytes>> -> decode_rows(types_rest, bin, [nil | row], rows, types) + <<0, bin::bytes>> -> decode_rows([type | types_rest], bin, row, rows, types) + end + + {:array, type} -> + decode_array_decode_rows(bin, type, types_rest, row, rows, types) + + {:array_over, original_row} -> + decode_rows(types_rest, bin, [:lists.reverse(row) | original_row], rows, types) + + {:map, key_type, value_type} -> + decode_map_decode_rows(bin, key_type, value_type, types_rest, row, rows, types) + + {:map_over, original_row} -> + map = row |> Enum.chunk_every(2) |> Enum.map(fn [v, k] -> {k, v} end) |> Map.new() + decode_rows(types_rest, bin, [map | original_row], rows, types) + + {:tuple, tuple_types} -> + decode_rows(tuple_types ++ [{:tuple_over, row} | types_rest], bin, [], rows, types) + + {:tuple_over, original_row} -> + tuple = row |> :lists.reverse() |> List.to_tuple() + decode_rows(types_rest, bin, 
[tuple | original_row], rows, types) + + {:datetime64, time_unit, timezone} -> + <> = bin + + dt = + case timezone do + nil -> + NaiveDateTime.add(@epoch_naive_datetime, s, time_unit) + + "UTC" -> + DateTime.from_unix!(s, time_unit) + + _ -> + s + |> DateTime.from_unix!(time_unit) + |> DateTime.shift_zone!(timezone) + end + + decode_rows(types_rest, bin, [dt | row], rows, types) + + {:enum8, mapping} -> + <> = bin + decode_rows(types_rest, bin, [Map.fetch!(mapping, v) | row], rows, types) + + {:enum16, mapping} -> + <> = bin + decode_rows(types_rest, bin, [Map.fetch!(mapping, v) | row], rows, types) + + :ipv4 -> + <> = bin + decode_rows(types_rest, bin, [{b1, b2, b3, b4} | row], rows, types) + + :ipv6 -> + <> = bin + decode_rows(types_rest, bin, [{b1, b2, b3, b4, b5, b6, b7, b8} | row], rows, types) + + :point -> + <> = bin + decode_rows(types_rest, bin, [{x, y} | row], rows, types) + end + end + + defp decode_rows([], <<>>, row, rows, _types) do + :lists.reverse([:lists.reverse(row) | rows]) + end + + defp decode_rows([], <>, row, rows, types) do + row = :lists.reverse(row) + decode_rows(types, bin, [], [row | rows], types) + end + + @compile inline: [decimal_size: 1] + # https://clickhouse.com/docs/en/sql-reference/data-types/decimal/ + defp decimal_size(precision) when is_integer(precision) do + cond do + precision >= 39 -> 256 + precision >= 19 -> 128 + precision >= 10 -> 64 + true -> 32 + end + end + + @compile inline: [time_unit: 1] + for precision <- 0..9 do + time_unit = round(:math.pow(10, precision)) + defp time_unit(unquote(precision)), do: unquote(time_unit) + end +end diff --git a/ch/lib/ch/stream.ex b/ch/lib/ch/stream.ex new file mode 100644 index 000000000000..9ec8b5fd8428 --- /dev/null +++ b/ch/lib/ch/stream.ex @@ -0,0 +1,43 @@ +defmodule Ch.Stream do + @moduledoc false + + @derive {Inspect, only: []} + defstruct [:conn, :ref, :query, :params, :opts] + + @type t :: %__MODULE__{ + conn: DBConnection.conn(), + ref: Mint.Types.request_ref() | nil, + 
query: Ch.Query.t(), + params: term, + opts: [Ch.query_option()] + } + + defimpl Enumerable do + def reduce(stream, acc, fun) do + %Ch.Stream{conn: conn, query: query, params: params, opts: opts} = stream + stream = %DBConnection.Stream{conn: conn, query: query, params: params, opts: opts} + DBConnection.reduce(stream, acc, fun) + end + + def member?(_, _), do: {:error, __MODULE__} + def count(_), do: {:error, __MODULE__} + def slice(_), do: {:error, __MODULE__} + end + + defimpl Collectable do + def into(stream) do + %Ch.Stream{conn: conn, query: query, params: params, opts: opts} = stream + ref = DBConnection.execute!(conn, query, {:stream, params}, opts) + {%{stream | ref: ref}, &collect/2} + end + + defp collect(%{conn: conn, query: query, ref: ref} = stream, {:cont, data}) do + ^ref = DBConnection.execute!(conn, query, {:stream, ref, data}) + stream + end + + defp collect(%{conn: conn, query: query, ref: ref}, eof) when eof in [:halt, :done] do + DBConnection.execute!(conn, query, {:stream, ref, :eof}) + end + end +end diff --git a/ch/lib/ch/types.ex b/ch/lib/ch/types.ex new file mode 100644 index 000000000000..b91ac5c68767 --- /dev/null +++ b/ch/lib/ch/types.ex @@ -0,0 +1,568 @@ +defmodule Ch.Types do + @moduledoc """ + Helpers to turn ClickHouse types into Elixir terms for easier processing. 
+ """ + + types = + [ + {_encoded = "String", _decoded = :string, _args = []}, + {"Bool", :boolean, []}, + for size <- [8, 16, 32, 64, 128, 256] do + [ + {"UInt#{size}", :"u#{size}", []}, + {"Int#{size}", :"i#{size}", []} + ] + end, + for size <- [32, 64] do + {"Float#{size}", :"f#{size}", []} + end, + {"Array", :array, [:type]}, + {"Tuple", :tuple, [:type]}, + {"Map", :map, [:type]}, + {"FixedString", :fixed_string, [:int]}, + {"Nullable", :nullable, [:type]}, + {"DateTime64", :datetime64, [:int, :string]}, + {"DateTime", :datetime, [:string]}, + # {"DateTime", :datetime, []}, + {"Date32", :date32, []}, + {"Date", :date, []}, + {"LowCardinality", :low_cardinality, [:type]}, + for size <- [32, 64, 128, 256] do + {"Decimal#{size}", :"decimal#{size}", [:int]} + end, + {"Decimal", :decimal, [:int, :int]}, + {"SimpleAggregateFunction", :simple_aggregate_function, [:identifier, :type]}, + {"Enum8", :enum8, [:string, :eq, :int]}, + {"Enum16", :enum16, [:string, :eq, :int]}, + {"UUID", :uuid, []}, + {"IPv4", :ipv4, []}, + {"IPv6", :ipv6, []}, + {"Point", :point, []}, + {"Ring", :ring, []}, + {"Polygon", :polygon, []}, + {"MultiPolygon", :multipolygon, []}, + {"Nothing", :nothing, []} + ] + |> List.flatten() + + for {encoded, name, []} <- types do + @doc """ + Helper for `#{encoded}` ClickHouse type: + + iex> #{name}() + :#{name} + + iex> encode(#{name}()) + "#{encoded}" + + iex> decode("#{encoded}") + #{name}() + + """ + def unquote(name)(), do: unquote(name) + end + + @doc """ + Helper for `DateTime` ClickHouse type: + + iex> datetime() + :datetime + + iex> to_string(encode(datetime())) + "DateTime" + + iex> decode("DateTime") + datetime() + + """ + def datetime, do: :datetime + + @doc """ + Helper for `DateTime(timezone)` ClickHouse type: + + iex> datetime("Europe/Vienna") + {:datetime, "Europe/Vienna"} + + iex> to_string(encode(datetime("UTC"))) + "DateTime('UTC')" + + iex> decode("DateTime('UTC')") + datetime("UTC") + + """ + def datetime(timezone) when 
is_binary(timezone), do: {:datetime, timezone} + + @doc """ + Helper for `DateTime64(precision)` ClickHouse type: + + iex> datetime64(3) + {:datetime64, 3} + + iex> to_string(encode(datetime64(3))) + "DateTime64(3)" + + iex> decode("DateTime64(3)") + datetime64(3) + + """ + def datetime64(precision) when is_integer(precision), do: {:datetime64, precision} + + @doc """ + Helper for `DateTime64(precision, timezone)` ClickHouse type: + + iex> datetime64(3, "UTC") + {:datetime64, 3, "UTC"} + + iex> to_string(encode(datetime64(3, "UTC"))) + "DateTime64(3, 'UTC')" + + iex> decode("DateTime64(3, 'UTC')") + datetime64(3, "UTC") + + """ + def datetime64(precision, timezone) when is_integer(precision) and is_binary(timezone) do + {:datetime64, precision, timezone} + end + + @doc """ + Helper for `FixedString(n)` ClickHouse type: + + iex> fixed_string(3) + {:fixed_string, 3} + + iex> to_string(encode(fixed_string(16))) + "FixedString(16)" + + iex> decode("FixedString(16)") + fixed_string(16) + + """ + def fixed_string(n) when is_integer(n), do: {:fixed_string, n} + + @doc """ + Helper for `Decimal(P, S)` ClickHouse type: + + iex> decimal(18, 4) + {:decimal, 18, 4} + + iex> to_string(encode(decimal(18, 4))) + "Decimal(18, 4)" + + iex> decode("Decimal(18, 4)") + decimal(18, 4) + + """ + def decimal(precision, scale) when is_integer(precision) and is_integer(scale) do + {:decimal, precision, scale} + end + + for size <- [32, 64, 128, 256] do + name = :"decimal#{size}" + + # `select toTypeName(cast(1 as Decimal32(2)))` etc. 
+ precision = + case size do + 32 -> 9 + 64 -> 18 + 128 -> 38 + 256 -> 76 + end + + @doc """ + Helper for `Decimal#{size}(S)` ClickHouse type: + + iex> #{name}(4) + {:#{name}, 4} + + iex> to_string(encode(#{name}(4))) + "Decimal(#{precision}, 4)" + + iex> decode("Decimal#{size}(4)") + {:#{name}, 4} + + """ + def unquote(name)(scale) when is_integer(scale), do: {unquote(name), scale} + end + + defguardp is_type(type) when is_atom(type) or is_tuple(type) + + @doc """ + Helper for `Array(T)` ClickHouse type: + + iex> array(u64()) + {:array, :u64} + + iex> to_string(encode(array(u64()))) + "Array(UInt64)" + + iex> decode("Array(UInt64)") + array(u64()) + + """ + def array(type) when is_type(type), do: {:array, type} + + @doc """ + Helper for `Tuple(T1, T2, ...)` ClickHouse type: + + iex> tuple([u64(), array(string())]) + {:tuple, [:u64, {:array, :string}]} + + iex> to_string(encode(tuple([u64(), array(string())]))) + "Tuple(UInt64, Array(String))" + + iex> decode("Tuple(UInt64, Array(String))") + tuple([u64(), array(string())]) + + """ + def tuple(types) when is_list(types), do: {:tuple, types} + + @doc """ + Helper for `Map(K, V)` ClickHouse type: + + iex> map(string(), array(string())) + {:map, :string, {:array, :string}} + + iex> to_string(encode(map(string(), array(string())))) + "Map(String, Array(String))" + + iex> decode("Map(String, Array(String))") + map(string(), array(string())) + + """ + def map(key_type, value_type) when is_type(key_type) and is_type(value_type) do + {:map, key_type, value_type} + end + + @doc """ + Helper for `Nullable(T)` ClickHouse type: + + iex> nullable(array(boolean())) + {:nullable, {:array, :boolean}} + + iex> to_string(encode(nullable(array(boolean())))) + "Nullable(Array(Bool))" + + iex> decode("Nullable(Array(Bool))") + nullable(array(boolean())) + + """ + def nullable(type) when is_type(type), do: {:nullable, type} + + @doc """ + Helper for `LowCardinality(T)` ClickHouse type: + + iex> low_cardinality(string()) + 
{:low_cardinality, :string} + + iex> to_string(encode(low_cardinality(string()))) + "LowCardinality(String)" + + iex> decode("LowCardinality(String)") + low_cardinality(string()) + + """ + def low_cardinality(type) when is_type(type), do: {:low_cardinality, type} + + @doc """ + Helper for `SimpleAggregateFunction(name, type)` ClickHouse type: + + iex> simple_aggregate_function("any", u8()) + {:simple_aggregate_function, "any", :u8} + + iex> to_string(encode(simple_aggregate_function("any", u8()))) + "SimpleAggregateFunction(any, UInt8)" + + iex> decode("SimpleAggregateFunction(any, UInt8)") + simple_aggregate_function("any", u8()) + + """ + def simple_aggregate_function(name, type) when is_binary(name) and is_type(type) do + {:simple_aggregate_function, name, type} + end + + for size <- [8, 16] do + name = :"enum#{size}" + + @doc """ + Helper for `Enum#{size}` ClickHouse type: + + iex> #{name}([{"hello", 1}, {"world", 2}]) + {:#{name}, [{"hello", 1}, {"world", 2}]} + + iex> to_string(encode(#{name}([{"hello", 1}, {"world", 2}]))) + "Enum#{size}('hello' = 1, 'world' = 2)" + + iex> decode("Enum#{size}('hello' = 1, 'world' = 2)") + #{name}([{"hello", 1}, {"world", 2}]) + + """ + def unquote(name)(mapping) when is_list(mapping), do: {unquote(name), mapping} + end + + @doc """ + Decodes a ClickHouse type into an intermediary Elixir term. 
+ + iex> decode("String") + :string + + iex> decode("Array(String)") + {:array, :string} + + iex> decode("Enum8('hello' = 1, 'world' = 2)") + {:enum8, [{"hello", 1}, {"world", 2}]} + + iex> decode("Nullable(Decimal(18, 4))") + {:nullable, {:decimal, 18, 4}} + + """ + def decode(type) + + for {encoded, decoded, []} <- types do + def decode(unquote(encoded)), do: unquote(decoded) + end + + def decode("DateTime"), do: :datetime + + def decode(type) do + try do + decode([:type], type, []) + rescue + e -> + message = "failed to decode #{inspect(type)} as ClickHouse type (#{Exception.message(e)})" + reraise(ArgumentError, message, __STACKTRACE__) + end + end + + defguardp is_whitespace(char) when char == ?\s or char == ?\t + + defp decode(stack, <>, acc) when is_whitespace(whitespace) do + decode(stack, rest, acc) + end + + for {encoded, decoded, [_ | _] = args} <- types do + defp decode([:type | stack], unquote(encoded) <> rest, acc) do + decode( + [:open | unquote(args)] ++ [:close, {unquote(decoded), unquote(args)}, acc | stack], + rest, + [] + ) + end + end + + for {encoded, decoded, []} <- types do + defp decode([:type | stack], unquote(encoded) <> rest, acc) do + decode(stack, rest, [unquote(decoded) | acc]) + end + end + + defp decode([:open | stack], <>, acc) do + case rest do + <> -> + decode(stack, rest, acc) + + _ -> + # handles DateTime and Type() + [{type, _args}, prev_acc | stack] = close(stack) + decode(stack, rest, [type | prev_acc]) + end + end + + defp decode(stack, <>, acc) do + [{type, _args}, prev_acc | stack] = close(stack) + decode(stack, rest, [build_type(type, acc) | prev_acc]) + end + + defp decode([:close, {_type, args} | _] = stack, <>, acc) do + decode(args ++ stack, rest, acc) + end + + defp decode(stack, <>, acc) do + decode(stack, rest, acc) + end + + defp decode([:string | stack], <>, acc) do + decode_string(rest, 0, rest, stack, acc) + end + + defp decode([:int | stack], <>, acc) do + decode_int(rest, stack, acc) + end + + defp 
decode([:identifier | stack], <>, acc) do + decode_identifier(rest, 0, rest, stack, acc) + end + + defp decode([:eq | stack], <>, acc) do + decode(stack, rest, acc) + end + + defp decode([], <<>>, [type]), do: type + + defp close([:close | stack]), do: stack + defp close([_ | stack]), do: close(stack) + + defp build_type(:array = a, [t]), do: {a, t} + defp build_type(:tuple = t, ts), do: {t, :lists.reverse(ts)} + defp build_type(:fixed_string = fs, [n]), do: {fs, n} + defp build_type(:datetime = d, [tz]), do: {d, tz} + defp build_type(:datetime64 = d, [precision]), do: {d, precision} + defp build_type(:datetime64 = d, [tz, p]), do: {d, p, tz} + defp build_type(:map = m, [v, k]), do: {m, k, v} + defp build_type(:nullable = n, [t]), do: {n, t} + defp build_type(:low_cardinality = l, [t]), do: {l, t} + defp build_type(:enum8 = e, mapping), do: {e, build_enum_mapping(mapping)} + defp build_type(:enum16 = e, mapping), do: {e, build_enum_mapping(mapping)} + defp build_type(:simple_aggregate_function = saf, [t, f]), do: {saf, f, t} + defp build_type(:decimal32 = d, [s]), do: {d, s} + defp build_type(:decimal64 = d, [s]), do: {d, s} + defp build_type(:decimal128 = d, [s]), do: {d, s} + defp build_type(:decimal256 = d, [s]), do: {d, s} + defp build_type(:decimal = d, [s, p]), do: {d, p, s} + + defp build_enum_mapping(mapping) do + mapping |> :lists.reverse() |> Enum.chunk_every(2) |> Enum.map(fn [k, v] -> {k, v} end) + end + + # TODO '', \' + + defp decode_string(<>, len, original, stack, acc) do + part = :binary.part(original, 0, len) + decode(stack, rest, [:binary.copy(part) | acc]) + end + + defp decode_string(<>, len, original, stack, acc) do + decode_string(rest, len + utf8_size(u), original, stack, acc) + end + + @compile inline: [utf8_size: 1] + defp utf8_size(codepoint) when codepoint <= 0x7F, do: 1 + defp utf8_size(codepoint) when codepoint <= 0x7FF, do: 2 + defp utf8_size(codepoint) when codepoint <= 0xFFFF, do: 3 + defp utf8_size(codepoint) when codepoint <= 
0x10FFFF, do: 4 + + defguardp is_alpha(a) when (a >= ?a and a <= ?z) or (a >= ?A and a <= ?Z) + + defp decode_identifier(<>, len, original, stack, acc) when is_alpha(a) do + decode_identifier(rest, len + 1, original, stack, acc) + end + + defp decode_identifier(<>, len, original, stack, acc) do + part = :binary.part(original, 0, len) + decode(stack, rest, [:binary.copy(part) | acc]) + end + + defguardp is_numeric(char) when char >= ?0 and char <= ?9 + + defp decode_int(<>, stack, outer_acc) when is_numeric(i) do + decode_int_cont(rest, -(i - ?0), stack, outer_acc) + end + + defp decode_int(<>, stack, outer_acc) when is_numeric(i) do + decode_int_cont(rest, i - ?0, stack, outer_acc) + end + + defp decode_int_cont(<>, acc, stack, outer_acc) when is_numeric(i) do + decode_int_cont(rest, acc * 10 + i - ?0, stack, outer_acc) + end + + defp decode_int_cont(<>, int, stack, acc) do + decode(stack, rest, [int | acc]) + end + + @doc """ + Encodes a type from Elixir atom / tuple to proper ClickHouse name. + + iex> encode(:string) + "String" + + iex> IO.iodata_to_binary(encode({:nullable, :i8})) + "Nullable(Int8)" + + """ + def encode(type) + + for {encoded, decoded, []} <- types do + def encode(unquote(decoded)), do: unquote(encoded) + end + + def encode(:datetime), do: "DateTime" + def encode({:nullable, type}), do: ["Nullable(", encode(type), ?)] + def encode({:fixed_string, n}), do: ["FixedString(", String.Chars.Integer.to_string(n), ?)] + def encode({:array, type}), do: ["Array(", encode(type), ?)] + def encode({:tuple, types}), do: ["Tuple(", encode_intersperse(types, ", "), ?)] + + def encode({:map, key_type, value_type}) do + ["Map(", encode(key_type), ", ", encode(value_type), ?)] + end + + def encode({:low_cardinality, type}), do: ["LowCardinality(", encode(type), ?)] + + for size <- [32, 64, 128, 256] do + # `select toTypeName(cast(1 as Decimal32(2)))` etc. 
+ precision = + case size do + 32 -> 9 + 64 -> 18 + 128 -> 38 + 256 -> 76 + end + + def encode({unquote(:"decimal#{size}"), scale}) do + encode({:decimal, unquote(precision), scale}) + end + end + + def encode({:decimal, precision, scale}) do + [ + "Decimal(", + String.Chars.Integer.to_string(precision), + ", ", + String.Chars.Integer.to_string(scale), + ?) + ] + end + + def encode({:datetime, timezone}) when is_binary(timezone) do + ["DateTime('", timezone, "')"] + end + + def encode({:datetime64, precision}) do + ["DateTime64(", String.Chars.Integer.to_string(precision), ?)] + end + + def encode({:datetime64, precision, timezone}) when is_binary(timezone) do + ["DateTime64(", String.Chars.Integer.to_string(precision), ", '", timezone, "')"] + end + + def encode({:enum8, mapping}) do + ["Enum8('", encode_mapping(mapping), ?)] + end + + def encode({:enum16, mapping}) do + ["Enum16('", encode_mapping(mapping), ?)] + end + + def encode({:simple_aggregate_function, name, type}) when is_binary(name) do + ["SimpleAggregateFunction(", name, ", ", encode(type), ?)] + end + + defp encode_intersperse([last_type], _separator) do + [encode(last_type)] + end + + defp encode_intersperse([type | types], separator) do + [encode(type), separator | encode_intersperse(types, separator)] + end + + defp encode_intersperse([] = empty, _separator), do: empty + + defp encode_mapping([{k, v}]) when is_binary(k) do + [k, "' = ", String.Chars.Integer.to_string(v)] + end + + defp encode_mapping([{k, v} | mapping]) when is_binary(k) do + [k, "' = ", String.Chars.Integer.to_string(v), ", '" | encode_mapping(mapping)] + end + + defp encode_mapping([] = empty), do: empty +end diff --git a/ch/mix.exs b/ch/mix.exs new file mode 100644 index 000000000000..3ee583bc75b1 --- /dev/null +++ b/ch/mix.exs @@ -0,0 +1,64 @@ +defmodule Ch.MixProject do + use Mix.Project + + @source_url "https://github.com/plausible/ch" + @version "0.2.6" + + def project do + [ + app: :ch, + version: @version, + elixir: "~> 
1.14", + elixirc_paths: elixirc_paths(Mix.env()), + deps: deps(), + name: "Ch", + description: "HTTP ClickHouse driver for Elixir", + docs: docs(), + package: package(), + source_url: @source_url + ] + end + + # Run "mix help compile.app" to learn about applications. + def application do + [ + extra_applications: [:logger] + ] + end + + # Specifies which paths to compile per environment. + defp elixirc_paths(:test), do: ["lib", "test/support"] + defp elixirc_paths(_env), do: ["lib"] + + # Run "mix help deps" to learn about dependencies. + defp deps do + [ + {:mint, "~> 1.0"}, + {:db_connection, "~> 2.0"}, + {:jason, "~> 1.0"}, + {:decimal, "~> 2.0"}, + {:ecto, "~> 3.5", optional: true}, + {:benchee, "~> 1.0", only: [:bench]}, + {:dialyxir, "~> 1.0", only: [:dev], runtime: false}, + {:ex_doc, ">= 0.0.0", only: :docs}, + {:tz, "~> 0.26.0", only: [:test]} + ] + end + + defp docs do + [ + source_url: @source_url, + source_ref: "v#{@version}", + main: "readme", + extras: ["README.md", "CHANGELOG.md"], + skip_undefined_reference_warnings_on: ["CHANGELOG.md"] + ] + end + + defp package do + [ + licenses: ["MIT"], + links: %{"GitHub" => @source_url} + ] + end +end diff --git a/clickhouse/clickhouse-config.xml/config.xml b/clickhouse/clickhouse-config.xml/config.xml new file mode 100644 index 000000000000..723ded7321a0 --- /dev/null +++ b/clickhouse/clickhouse-config.xml/config.xml @@ -0,0 +1,36 @@ + + + + trace + /var/log/clickhouse-server/clickhouse-server.log + /var/log/clickhouse-server/clickhouse-server.err.log + 1000M + 10 + + + 8123 + 9000 + + 4096 + 3 + 100 + 8589934592 + 5368709120 + + /var/lib/clickhouse/ + /var/lib/clickhouse/tmp/ + + + + /etc/clickhouse-server/users.xml + + + + default + default + + UTC + + + + \ No newline at end of file diff --git a/clickhouse/clickhouse-user-config.xml/users.xml b/clickhouse/clickhouse-user-config.xml/users.xml new file mode 100644 index 000000000000..7a186edb3dfe --- /dev/null +++ 
b/clickhouse/clickhouse-user-config.xml/users.xml @@ -0,0 +1,45 @@ + + + + + 10000000000 + 0 + random + + + + + + + + ::/0 + + default + default + 1 + + + + plausible_password + + ::/0 + + default + default + 1 + + + + + + + 3600 + 0 + 0 + 0 + 0 + 0 + + + + \ No newline at end of file diff --git a/config/.env.dev b/config/.env.dev index 93dd0d087ec5..5c32cf23ee9c 100644 --- a/config/.env.dev +++ b/config/.env.dev @@ -1,7 +1,7 @@ BASE_URL=http://localhost:8000 SECURE_COOKIE=false DATABASE_URL=postgres://postgres:postgres@127.0.0.1:5432/plausible_dev -CLICKHOUSE_DATABASE_URL=http://127.0.0.1:8123/plausible_events_db +CLICKHOUSE_DATABASE_URL=http://plausible:plausible_password@127.0.0.1:8123/plausible_events_db CLICKHOUSE_MAX_BUFFER_SIZE_BYTES=1000000 SECRET_KEY_BASE=/njrhntbycvastyvtk1zycwfm981vpo/0xrvwjjvemdakc/vsvbrevlwsc6u8rcg TOTP_VAULT_KEY=Q3BD4nddbkVJIPXgHuo5NthGKSIH0yesRfG05J88HIo= diff --git a/config/.env.test b/config/.env.test index 5f8d41cafed5..8ff261701223 100644 --- a/config/.env.test +++ b/config/.env.test @@ -1,5 +1,5 @@ DATABASE_URL=postgres://postgres:postgres@127.0.0.1:5432/plausible_test?pool_size=40 -CLICKHOUSE_DATABASE_URL=http://127.0.0.1:8123/plausible_test +CLICKHOUSE_DATABASE_URL=http://plausible:plausible_password@127.0.0.1:8123/plausible_test SECRET_KEY_BASE=/njrhntbycvastyvtk1zycwfm981vpo/0xrvwjjvemdakc/vsvbrevlwsc6u8rcg TOTP_VAULT_KEY=1Jah1HEOnCEnmBE+4/OgbJRraJIppPmYCNbZoFJboZs= BASE_URL=http://localhost:8000 @@ -24,3 +24,4 @@ S3_ENDPOINT=http://localhost:10000 S3_EXPORTS_BUCKET=test-exports S3_IMPORTS_BUCKET=test-imports ALLOWED_DOMAINS="plausible.com,example.com" +GEOLITE2_COUNTRY_DB=priv/geodb/GeoLite2-Country_20250620.tar.gz diff --git a/mix.exs b/mix.exs index 1ffe9bbc1a82..c83d2d851870 100644 --- a/mix.exs +++ b/mix.exs @@ -70,6 +70,7 @@ defmodule Plausible.MixProject do {:bcrypt_elixir, "~> 3.0"}, {:bypass, "~> 2.1", only: [:dev, :test, :ce_test]}, {:ecto_ch, "~> 0.3.5"}, + {:ch, path: "ch", override: true}, {:cloak, "~> 1.1"}, 
{:cloak_ecto, "~> 1.2"}, {:combination, "~> 0.0.3"}, diff --git a/mix.lock b/mix.lock index bf98ea72ecf3..a9dc9f521c90 100644 --- a/mix.lock +++ b/mix.lock @@ -10,7 +10,6 @@ "bypass": {:hex, :bypass, "2.1.0", "909782781bf8e20ee86a9cabde36b259d44af8b9f38756173e8f5e2e1fabb9b1", [:mix], [{:plug, "~> 1.7", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 2.0", [hex: :plug_cowboy, repo: "hexpm", optional: false]}, {:ranch, "~> 1.3", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "d9b5df8fa5b7a6efa08384e9bbecfe4ce61c77d28a4282f79e02f1ef78d96b80"}, "castore": {:hex, :castore, "1.0.7", "b651241514e5f6956028147fe6637f7ac13802537e895a724f90bf3e36ddd1dd", [:mix], [], "hexpm", "da7785a4b0d2a021cd1292a60875a784b6caef71e76bf4917bdee1f390455cf5"}, "certifi": {:hex, :certifi, "2.12.0", "2d1cca2ec95f59643862af91f001478c9863c2ac9cb6e2f89780bfd8de987329", [:rebar3], [], "hexpm", "ee68d85df22e554040cdb4be100f33873ac6051387baf6a8f6ce82272340ff1c"}, - "ch": {:hex, :ch, "0.2.5", "b8d70689951bd14c8c8791dc72cdc957ba489ceae723e79cf1a91d95b6b855ae", [:mix], [{:db_connection, "~> 2.0", [hex: :db_connection, repo: "hexpm", optional: false]}, {:decimal, "~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:ecto, "~> 3.5", [hex: :ecto, repo: "hexpm", optional: true]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mint, "~> 1.0", [hex: :mint, repo: "hexpm", optional: false]}], "hexpm", "97de104c8f513a23c6d673da37741f68ae743f6cdb654b96a728d382e2fba4de"}, "chatterbox": {:hex, :ts_chatterbox, "0.15.1", "5cac4d15dd7ad61fc3c4415ce4826fc563d4643dee897a558ec4ea0b1c835c9c", [:rebar3], [{:hpack, "~> 0.3.0", [hex: :hpack_erl, repo: "hexpm", optional: false]}], "hexpm", "4f75b91451338bc0da5f52f3480fa6ef6e3a2aeecfc33686d6b3d0a0948f31aa"}, "cldr_utils": {:hex, :cldr_utils, "2.27.0", "a75d5cdaaf6b7432eb10f547e6abe635c94746985c5b78e35bbbd08b16473b6c", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:certifi, "~> 
2.5", [hex: :certifi, repo: "hexpm", optional: true]}, {:decimal, "~> 1.9 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}], "hexpm", "516f601e28da10b8f1f3af565321c4e3da3b898a0b50a5e5be425eff76d587e1"}, "cloak": {:hex, :cloak, "1.1.2", "7e0006c2b0b98d976d4f559080fabefd81f0e0a50a3c4b621f85ceeb563e80bb", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}], "hexpm", "940d5ac4fcd51b252930fd112e319ea5ae6ab540b722f3ca60a85666759b9585"}, diff --git a/priv/geodb/GeoLite2-Country_20250620.tar.gz b/priv/geodb/GeoLite2-Country_20250620.tar.gz new file mode 100644 index 000000000000..d26b70c7ad6f Binary files /dev/null and b/priv/geodb/GeoLite2-Country_20250620.tar.gz differ