From 04ee24ed06e9cfed2703bf96f087d0bd0c869256 Mon Sep 17 00:00:00 2001 From: Philipp Fent Date: Fri, 22 May 2026 12:38:08 +0200 Subject: [PATCH 1/4] add CI for pull requests --- .github/workflows/ci.yml | 18 ++++++++++++++++++ .github/workflows/deploy.yml | 3 --- 2 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..3a025495 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,18 @@ +name: CI + +on: + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup Hugo + uses: peaceiris/actions-hugo@v3 + with: + hugo-version: '0.131.0' + + - name: Hugo Build + run: hugo --minify --panicOnWarning diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index d32668dc..36df9daf 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -10,9 +10,6 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - with: - submodules: true - fetch-depth: 0 - name: Setup Hugo uses: peaceiris/actions-hugo@v3 From 5a4a4f476091817a8b333ef6ff8dbb9de1e4a1b3 Mon Sep 17 00:00:00 2001 From: Philipp Fent Date: Fri, 22 May 2026 12:32:21 +0200 Subject: [PATCH 2/4] fix warnings --- .github/workflows/ci.yml | 2 +- content/community_edition.md | 8 ++++---- content/compatibility/sql_features.md | 14 +++++++------- content/get_started/operate_in_cloud.md | 6 +++--- content/licensing.md | 6 +++--- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3a025495..99c34c97 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,4 +15,4 @@ jobs: hugo-version: '0.131.0' - name: Hugo Build - run: hugo --minify --panicOnWarning + run: hugo --minify #--panicOnWarning # TODO: enable with hextra update diff --git a/content/community_edition.md b/content/community_edition.md index 
9a2d1fe0..adf7317c 100644 --- a/content/community_edition.md +++ b/content/community_edition.md @@ -3,20 +3,20 @@ title: "CedarDB Community Edition" weight: 5 --- -{{< tabs items="Docker,Local Install,Cloud Setup" >}} -{{< tab >}} +{{< tabs >}} +{{< tab name="Docker" >}} ```shell docker pull cedardb/cedardb ``` ...and visit our [**Run With Docker**](../get_started/install_with_docker) guide for more details. {{< /tab >}} -{{< tab >}} +{{< tab name="Local Install" >}} ``` curl https://get.cedardb.com | bash ``` ...and visit our [**Local Install**](../get_started/install_locally) guide for more details. {{< /tab >}} -{{< tab >}} +{{< tab name="Cloud Setup" >}} Visit [**Operate in the Cloud**](../get_started/operate_in_cloud) for more details. {{< /tab >}} {{< /tabs >}} diff --git a/content/compatibility/sql_features.md b/content/compatibility/sql_features.md index 0a66f2e6..8b779f83 100644 --- a/content/compatibility/sql_features.md +++ b/content/compatibility/sql_features.md @@ -111,20 +111,20 @@ the [system table compatibility](../system_table) page. 
| **Feature** | **Support State** | **Details** | |---------------------------|-------------------|----------------------------------------------------------------------------------| | Table & View References | Yes | | -| Inner Joins | Yes | [Documentation](/docs/references/queries/#joins) | -| Outer Joins | Yes | [Documentation](/docs/references/queries/#joins) | -| Semijoins | Yes | [Documentation](/docs/references/queries/#joins) | +| Inner Joins | Yes | [Documentation](/docs/references/queries/from/) | +| Outer Joins | Yes | [Documentation](/docs/references/queries/from/) | +| Semijoins | Yes | [Documentation](/docs/references/queries/from/) | | Antijoins | Yes | | | Table Functions | Yes | | | Lateral Subqueries | Yes | | | User-Specified Aliases | Yes | | -| GROUP BY | Yes | [Documentation](/docs/references/queries/#group-by) | -| HAVING | Yes | [Documentation](/docs/references/queries/#group-by) | +| GROUP BY | Yes | [Documentation](/docs/references/queries/groupby/) | +| HAVING | Yes | [Documentation](/docs/references/queries/groupby/) | | GROUPING SETS | Yes | | | CUBE | Yes | | | ROLLUP | Yes | | -| WINDOW Functions | Yes | [Documentation](/docs/references/queries/#window-functions) | -| WITH | Yes | [Documentation](/docs/references/queries/#common-table-expressions) | +| WINDOW Functions | Yes | [Documentation](/docs/references/queries/window/) | +| WITH | Yes | [Documentation](/docs/references/queries/with/) | | WITH RECURSIVE | Yes | | | UNION | Yes | | | UNION ALL | Yes | | diff --git a/content/get_started/operate_in_cloud.md b/content/get_started/operate_in_cloud.md index 0129264c..87290afa 100644 --- a/content/get_started/operate_in_cloud.md +++ b/content/get_started/operate_in_cloud.md @@ -14,8 +14,8 @@ Here's a quick setup example for running CedarDB in the cloud. We recommend using the latest **Ubuntu LTS** release (i.e., Ubuntu 24.04 as of writing). 
-{{< tabs items="Native,Docker" >}} - {{< tab >}} +{{< tabs >}} + {{< tab name="Native" >}} Install the psql shell: @@ -34,7 +34,7 @@ We recommend using the latest **Ubuntu LTS** release (i.e., Ubuntu 24.04 as of w {{< /tab >}} - {{< tab >}} + {{< tab name="Docker" >}} Install docker and the psql shell: ```shell diff --git a/content/licensing.md b/content/licensing.md index 515f693e..75717026 100644 --- a/content/licensing.md +++ b/content/licensing.md @@ -23,14 +23,14 @@ After receiving your license key, activate it by setting the license.key option In the following, we show the concrete steps needed to configure the license setting. For more information on setting configuration options, see our [configuration reference](/docs/references/configuration). -{{< tabs items="Configuration File (preferred), Environment Variable" >}} -{{< tab >}} +{{< tabs >}} +{{< tab name="Configuration File (preferred)" >}} Add a line with your license key to the CedarDB configuration file. The server will automatically load it at startup. In this example, we will use the default configuration path, which is automatically loaded at CedarDB startup when no other config file is specified. ```shell echo "\"license.key\" = \"\"" >> ~/.cedardb/config ``` {{< /tab >}} -{{< tab >}} +{{< tab name="Environment Variable" >}} An alternative is to set the license in an environment variable. 
If you are running CedarDB directly on your host machine, then export your license key before starting CedarDB like this: ```shell From 3421a42e555e40d4114943535d3f4f2a53c5a162 Mon Sep 17 00:00:00 2001 From: Philipp Fent Date: Fri, 22 May 2026 15:02:07 +0200 Subject: [PATCH 3/4] use markdownlint for consistent markdown --- .github/workflows/ci.yml | 8 + .markdownlint-cli2.yaml | 8 + README.md | 10 +- Structure.md | 7 +- Styleguide.md | 75 ++++---- content/clients/_index.md | 1 - content/clients/cpp/_index.md | 10 +- content/clients/csharp/_index.md | 20 ++- content/clients/java/_index.md | 10 +- content/clients/javascript/drizzle.md | 2 +- content/clients/python/_index.md | 18 +- content/clients/r/_index.md | 2 +- content/clients/rust/_index.md | 2 +- content/clients/tools/grafana.md | 28 ++- content/clients/tools/psql.md | 8 +- content/community_edition.md | 21 ++- .../compatibility/ecosystem_and_clients.md | 28 +-- content/compatibility/sql_features.md | 160 +++++++++--------- content/compatibility/system_table.md | 3 +- content/cookbook/aurora_debezium.md | 35 ++-- content/cookbook/aws_dms.md | 77 ++++++--- content/cookbook/clickbench.md | 6 +- content/cookbook/importing_from_json.md | 8 +- content/cookbook/importing_from_postgresql.md | 26 ++- content/cookbook/pgbench.md | 18 +- content/cookbook/read_replica_tutorial.md | 22 ++- content/cookbook/working_with_csv.md | 17 +- content/database_upgrade.md | 7 +- content/example_datasets/_index.md | 10 +- content/example_datasets/chbenchmark.md | 4 +- content/example_datasets/glove.md | 26 +-- content/example_datasets/handelsregister.md | 8 +- content/example_datasets/job.md | 5 + content/example_datasets/nasdaq.md | 46 ++--- content/get_started/_index.md | 9 +- content/get_started/install_locally.md | 24 ++- content/get_started/operate_in_cloud.md | 24 ++- content/get_started/quickstart.md | 19 ++- content/licensing.md | 26 +-- content/references/advanced/_index.md | 2 - content/references/advanced/asof_join.md | 2 
+- content/references/advanced/benchmarking.md | 17 +- content/references/advanced/createserver.md | 3 +- content/references/advanced/gs.md | 5 +- content/references/advanced/parquet.md | 143 +++++++++------- content/references/advanced/pgvector.md | 5 +- content/references/advanced/prepare.md | 20 ++- content/references/advanced/s3.md | 4 +- content/references/configuration.md | 34 ++-- content/references/datatypes/array.md | 11 +- content/references/datatypes/bit.md | 3 +- content/references/datatypes/blob.md | 3 +- content/references/datatypes/boolean.md | 10 +- content/references/datatypes/date.md | 12 +- content/references/datatypes/enums.md | 30 +++- content/references/datatypes/float.md | 10 +- content/references/datatypes/integer.md | 26 ++- content/references/datatypes/interval.md | 7 +- content/references/datatypes/json.md | 5 +- content/references/datatypes/numeric.md | 22 ++- content/references/datatypes/range.md | 11 +- content/references/datatypes/text.md | 28 +-- content/references/datatypes/time.md | 4 +- content/references/datatypes/timestamp.md | 3 +- content/references/datatypes/uuid.md | 20 ++- content/references/datatypes/vector.md | 2 +- content/references/dml/copy.md | 1 + content/references/dml/delete.md | 1 + content/references/dml/update.md | 2 +- content/references/dml/upsert.md | 4 +- content/references/expressions/try.md | 6 +- .../functions/advanced_functions/_index.md | 13 +- content/references/functions/bitstring.md | 12 +- content/references/functions/json.md | 40 +++-- content/references/functions/system.md | 65 ++++--- content/references/functions/text.md | 24 +-- content/references/functions/timestamp.md | 63 ++++--- content/references/objects/functions.md | 4 +- content/references/objects/indexes.md | 2 + content/references/objects/tables.md | 20 ++- content/references/queries/select.md | 1 + content/references/sessions/settings.md | 4 +- content/references/utility/explain.md | 13 +- content/references/writecache.md | 6 +- 
content/releases.md | 4 +- content/roadmap.md | 54 +++--- content/technology/parallelism.md | 4 +- 87 files changed, 934 insertions(+), 689 deletions(-) create mode 100644 .markdownlint-cli2.yaml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 99c34c97..e82c133f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,3 +16,11 @@ jobs: - name: Hugo Build run: hugo --minify #--panicOnWarning # TODO: enable with hextra update + + markdownlint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Run markdownlint + uses: DavidAnson/markdownlint-cli2-action@v23.2.0 diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml new file mode 100644 index 00000000..633e47a8 --- /dev/null +++ b/.markdownlint-cli2.yaml @@ -0,0 +1,8 @@ +globs: + - "content/**/*.md" + +config: + MD013: false # line-length: no hard limit for documentation prose + MD033: false # no-inline-html: HTML used intentionally in Hugo templates + MD024: + siblings_only: true # duplicate headings OK across different parent sections diff --git a/README.md b/README.md index 321855e1..f75548c7 100644 --- a/README.md +++ b/README.md @@ -4,22 +4,26 @@ This repository contains the code for the [CedarDB documentation](https://cedard The documentation is built by the static-site-generator [Hugo](https://gohugo.io/) and uses the [Hextra](https://imfing.github.io/hextra/docs/guide/) theme. Clone repository: + ```shell git clone --recurse-submodules git@github.com:cedardb/docs.git ``` Run a web server: -``` + +```shell hugo server ``` If you want to exclude the page from being built please add the following to the front matters. -``` + +```text draft: true ``` If you want to exclude the page until a certain publication date please add the following to the front matters. 
-``` + +```text publishDate: 2023-10-19T00:40:04-07:00 ``` diff --git a/Structure.md b/Structure.md index 3f83ace5..cf76dbcc 100644 --- a/Structure.md +++ b/Structure.md @@ -7,7 +7,7 @@ where new pages go. ## Top-Level Layout -``` +```text content/ ├── _index.md # Landing page ├── community_edition.md # Community edition info @@ -97,7 +97,7 @@ loading strategies, and when to use CedarDB-specific features. ### clients/ layout -``` +```text clients/ ├── python/ │ └── _index.md # psycopg driver @@ -140,7 +140,7 @@ DDL documentation is organized by catalog object, not by SQL command. All operations on an object (CREATE, ALTER, DROP) live on the same page or in tightly linked pages within the same directory. -``` +```text references/ ├── datatypes/ # One page per type family │ ├── _index.md @@ -367,6 +367,7 @@ different purposes: and examples. When a feature's status changes: + 1. Update the compatibility matrix entry. 2. Update (or create) the reference page. 3. Both changes go in the same PR. diff --git a/Styleguide.md b/Styleguide.md index 59cc7255..4001ad5b 100644 --- a/Styleguide.md +++ b/Styleguide.md @@ -27,12 +27,12 @@ Strengths may be highlighted in best-practices content. returned by CedarDB." - Never use "we" to describe product behavior. Use "CedarDB" or "the database." -| Avoid | Prefer | -|-------|--------| +| Avoid | Prefer | +|-------------------------------------------------|---------------------------------------------------------| | We recommend creating an index before querying. | Create an index before querying for better performance. | -| The user should call SELECT after INSERT. | Call SELECT after INSERT to verify the result. | -| We fixed the bug in version 2.3. | CedarDB 2.3 fixes the issue where ... | -| The result is returned by CedarDB. | CedarDB returns the result. | +| The user should call SELECT after INSERT. | Call SELECT after INSERT to verify the result. | +| We fixed the bug in version 2.3. | CedarDB 2.3 fixes the issue where ... 
| +| The result is returned by CedarDB. | CedarDB returns the result. | --- @@ -56,15 +56,15 @@ Strengths may be highlighted in best-practices content. Every SQL reference page follows this structure. Omit a section only if it genuinely does not apply. Do not leave sections empty or include placeholders. -| Section | Required? | Purpose | -|---------|-----------|---------| -| One-paragraph summary | Yes | What the feature does. No examples yet. | -| Quick example | Yes | Short, self-contained SQL showing the core concept. | -| Syntax | Yes | Railroad diagram of the supported syntax, followed by a plain-text fallback block with `` and `[optional]` parts. Options listed as a table. | -| Parameters and options | If applicable | Table of parameters, types, defaults, descriptions. | -| Permissions | If applicable | Which roles can execute this and under what conditions. | -| PostgreSQL differences | If any exist | Bulleted list of behavioral differences. See Section 5. | -| Further examples | Recommended | Realistic scenarios covering edge cases. | +| Section | Required? | Purpose | +|------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------| +| One-paragraph summary | Yes | What the feature does. No examples yet. | +| Quick example | Yes | Short, self-contained SQL showing the core concept. | +| Syntax | Yes | Railroad diagram of the supported syntax, followed by a plain-text fallback block with `` and `[optional]` parts. Options listed as a table. | +| Parameters and options | If applicable | Table of parameters, types, defaults, descriptions. | +| Permissions | If applicable | Which roles can execute this and under what conditions. | +| PostgreSQL differences | If any exist | Bulleted list of behavioral differences. See Section 5. | +| Further examples | Recommended | Realistic scenarios covering edge cases. 
| Railroad diagrams are the primary way to communicate syntax. Use the `{{}}` shortcode. Only document syntax that CedarDB actually @@ -86,11 +86,11 @@ and edge cases later, clearly labeled. Use tables with a traffic-light status column using inline emoji: -| Status | Emoji | Meaning | -|--------|-------|---------| -| Full | 🟢 | Core PostgreSQL functionality works. | -| Partial | 🟡 | Meaningful, commonly-used functionality is missing. | -| No | 🔴 | The feature is not supported at all. | +| Status | Emoji | Meaning | +|---------|-------|-----------------------------------------------------| +| Full | 🟢 | Core PostgreSQL functionality works. | +| Partial | 🟡 | Meaningful, commonly-used functionality is missing. | +| No | 🔴 | The feature is not supported at all. | - Every entry with a reference page must link to it. - **Full** means core PostgreSQL functionality works. Minor missing sub-features @@ -110,10 +110,10 @@ a reader sees when arriving at a section from a search result or a cross-link. Structure: -| Part | Required? | Notes | -|------|-----------|-------| -| One-paragraph overview | Yes | What this section covers and why it exists as a group. No syntax, no examples. | -| Member list | Yes | A bulleted or linked list of all pages in the section, each with a one-line description. | +| Part | Required? | Notes | +|------------------------|-----------|------------------------------------------------------------------------------------------| +| One-paragraph overview | Yes | What this section covers and why it exists as a group. No syntax, no examples. | +| Member list | Yes | A bulleted or linked list of all pages in the section, each with a one-line description. | Keep the overview to two or three sentences. It should answer "what will I find here?" and orient the reader before they click into a specific page. 
Do not @@ -182,7 +182,8 @@ INSERT INTO trees VALUES (1, 'Oak', 12.4, '2015-03-10'); -- Verify SELECT * FROM trees; ``` -``` + +```text id | species | height_m | planted ----+---------+----------+------------ 1 | Oak | 12.4 | 2015-03-10 @@ -225,10 +226,10 @@ differences, omit the section entirely. Lead with what CedarDB can do. Limitations and PostgreSQL differences belong in a clearly labeled section near the bottom of the page, not in a warning at the top. -| Avoid | Prefer | -|-------|--------| +| Avoid | Prefer | +|--------------------------------------|--------------------------------------------------------------------------| | Warning: CedarDB does not support X. | CedarDB supports Y and Z. Note: X is not yet available; see the roadmap. | -| Partial support only -- see caveats. | Fully supported. The following options are not yet available: [list] | +| Partial support only -- see caveats. | Fully supported. The following options are not yet available: [list] | --- @@ -311,13 +312,13 @@ fix it immediately in the same PR. Do not file "fix docs later" tickets. ## Quick Reference -| Avoid | Prefer | -|-------|--------| -| Link to postgresql.org for more detail. | Document the detail here. | -| Put limitations in a callout at the top. | Lead with what the feature does; limitations at the bottom. | -| Write "we" for product behavior. | Write "CedarDB" or "the database." | -| Assume readers have context from another page. | Make every example self-contained. | -| Skip documenting simple features. | Document everything. Completeness signals maturity. | -| Leave "No" entries in the compatibility matrix untested. | Test it. Update the entry. Update the reference page. | -| Use passive voice by default. | Use active voice. | -| Commit audit results to main. | Always commit on a dated branch. 
| +| Avoid | Prefer | +|----------------------------------------------------------|-------------------------------------------------------------| +| Link to postgresql.org for more detail. | Document the detail here. | +| Put limitations in a callout at the top. | Lead with what the feature does; limitations at the bottom. | +| Write "we" for product behavior. | Write "CedarDB" or "the database." | +| Assume readers have context from another page. | Make every example self-contained. | +| Skip documenting simple features. | Document everything. Completeness signals maturity. | +| Leave "No" entries in the compatibility matrix untested. | Test it. Update the entry. Update the reference page. | +| Use passive voice by default. | Use active voice. | +| Commit audit results to main. | Always commit on a dated branch. | diff --git a/content/clients/_index.md b/content/clients/_index.md index ebf926c5..67878470 100644 --- a/content/clients/_index.md +++ b/content/clients/_index.md @@ -6,7 +6,6 @@ weight: 40 CedarDB is compatible with most features of PostgreSQL-compatible clients and drivers out of the box. If you want to use CedarDB in combination with a specific framework or language, take a look at the following sub pages. - Programming languages: * [C#](csharp) diff --git a/content/clients/cpp/_index.md b/content/clients/cpp/_index.md index dd8178df..dec1f385 100644 --- a/content/clients/cpp/_index.md +++ b/content/clients/cpp/_index.md @@ -11,18 +11,22 @@ CedarDB is compatible with the [PostgreSQL libpqxx](https://pqxx.org/development Before demonstrating the connection to CedarDB, we need to get the correct dependencies . libpqxx uses the libpq library internally. On Debian or Ubuntu you can simply get the dev files from an apt repository. + ```bash sudo apt install libpqxx-dev ``` After finishing the client (see at the full program at the bottom of the program), we need to first compile our program with `g++` and then execute it. 
+ ```bash g++ -std=c++17 main.cpp -lpqxx -lpq -o CedarDBClient ./CedarDBClient ``` ## Connecting + Connect to CedarDB like this: + ```cpp // The connection string auto connectionString = "dbname= user= password= host=localhost port=5432"; @@ -40,6 +44,7 @@ try { return 0; ``` + You now have an open connection to CedarDB that allows you to insert data or query the database. ## Inserting Data @@ -55,6 +60,7 @@ auto createTable = "CREATE TABLE IF NOT EXISTS chatlog(userid integer, message t transaction.exec(createTable); transaction.commit(); ``` + In the following, we first prepare a new insert statement, before we insert a new tuple using the `connection` instance: ```cpp @@ -89,6 +95,7 @@ transaction.commit(); ``` ## Bulk Loading + If you need to load a lot of data at once (e.g., for an initial import of your existing data set), inserting tuples one by one is too slow: jdbc has to do a full roundtrip to CedarDB and back for each single insert, making the whole loading process severely network latency bound, even on a local connection. @@ -109,11 +116,10 @@ transaction.commit(); This feature makes use of CedarDB's Postgres-compatible `COPY` mode to bulk transmit all data, leading to significantly higher throughput: -``` +```text LOG: 100000 rows (0.000012 s parsing, 0.000374 s compilation, 0.025395 s transmission, 0.016492 s execution) ``` - ## Source Code {{% details title="Open to show the complete sample code" closed="true" %}} diff --git a/content/clients/csharp/_index.md b/content/clients/csharp/_index.md index 1f9548ea..51478f58 100644 --- a/content/clients/csharp/_index.md +++ b/content/clients/csharp/_index.md @@ -7,7 +7,9 @@ weight: 10 CedarDB is compatible with [Npgsql](https://www.npgsql.org/), the open source .NET driver for PostgreSQL. 
## Connecting + Connect to CedarDB like this: + ```C# String connString = "Server=127.0.0.1;User Id=;Password=;Database="; var dataSourceBuilder = new NpgsqlDataSourceBuilder(connString); @@ -26,7 +28,8 @@ await using var createCommand = dataSource.CreateCommand( "CREATE TABLE IF NOT EXISTS chatlog(userid integer, message text, ts timestamptz)"); await createCommand.ExecuteNonQueryAsync(); // you can also run this command synchronously, if required ``` -Alternatively, you can also talk to CedarDB using the `Connection` object instead. + +Alternatively, you can also talk to CedarDB using the `Connection` object instead. In the following, we insert a new tuple using the `conn` instance and a prepared statement: ```C# @@ -62,13 +65,14 @@ while (await reader.ReadAsync()) ``` ## Bulk Loading + If you need to load a lot of data at once (e.g., for an initial import of your existing data set), inserting tuples one by one is too slow: npgsql has to do a full roundtrip to CedarDB and back for each single insert, making the whole loading process severely network latency bound, even on a local connection. Use npgsql's bulk loading feature instead: - ### Binary Mode + ```C# using (var binaryWriter = conn.BeginBinaryImport("COPY chatlog(userid, message, ts) FROM STDIN (FORMAT BINARY)")) { @@ -86,7 +90,7 @@ using (var binaryWriter = conn.BeginBinaryImport("COPY chatlog(userid, message, This feature makes use of CedarDB's Postgres-compatible `COPY` mode to bulk transmit all data, leading to significantly higher throughput: -``` +```text LOG: 1000000 rows (0.000016 s parsing, 0.000286 s compilation, 0.882862 s transmission, 0.073085 s execution) ``` @@ -94,8 +98,8 @@ LOG: 1000000 rows (0.000016 s parsing, 0.000286 s compilation, 0.882862 s transm Take note of the warning in the [npgsql docs](https://www.npgsql.org/doc/copy.html): It is your responsibility to ensure that npgsql uses the correct type for each row. It is therefore encouraged to specify the exact type of each row. 
{{< /callout >}} - ### Text Mode + Alternatively, you can also use *text mode* for transferring the files. This allows you to send one string per tuple and let CedarDB to the parsing. ```C# @@ -108,7 +112,7 @@ using (var textWriter = conn.BeginTextImport("COPY chatlog (userid, message, ts) } ``` -``` +```text LOG: 1000000 rows (0.000016 s parsing, 0.000273 s compilation, 1.250094 s transmission, 0.034226 s execution) ``` @@ -116,8 +120,8 @@ LOG: 1000000 rows (0.000016 s parsing, 0.000273 s compilation, 1.250094 s transm We recommend using binary copy mode as it significantly faster than text mode due to its terser encoding. {{< /callout >}} - ## Batching + If bulk loading is not an option, but data comes in at such a high rate that network latency becomes an issue, consider *batching*: ```C# @@ -135,15 +139,15 @@ await using var batch = new NpgsqlBatch(conn) await batch.ExecuteNonQueryAsync(); await transaction.CommitAsync(); ``` + Here, npgsql groups multiple statements into a single packet to CedarDB, saving expensive round trips. {{< callout type="info" >}} -We recommend executing each batch within an explicit transaction (as shown above). +We recommend executing each batch within an explicit transaction (as shown above). Otherwise, each insert statement is applied in its own transaction, increasing latency. Furthermore, by using one transaction per batch, you can ensure that either the whole batch is applied or nothing. {{< /callout >}} - ## Source Code {{% details title="Open to show the complete sample code" closed="true" %}} diff --git a/content/clients/java/_index.md b/content/clients/java/_index.md index 71bc9965..ff369e3f 100644 --- a/content/clients/java/_index.md +++ b/content/clients/java/_index.md @@ -10,12 +10,14 @@ CedarDB is compatible with the [PostgreSQL JDBC](https://jdbc.postgresql.org/) d Before demonstrating the connection to CedarDB, we need to get the correct dependencies and set the classpath. 
Note that you can simply [download](https://jdbc.postgresql.org/download/) the latest version of the JDBC. + ```bash wget https://jdbc.postgresql.org/download/postgresql-42.7.3.jar ``` After finishing the client (see at the full program at the bottom of the program), we need to first compile our java program with `javac` and then execute the class with the right classpath. This examples assumes that the java program has the name `CedarDBClient`. + ```bash export CLASSPATH=".:postgresql-42.7.3.jar" javac CedarDBClient.java @@ -23,7 +25,9 @@ java CedarDBClient ``` ## Connecting + Connect to CedarDB like this: + ```Java // The connection string String jdbc = "jdbc:postgresql://localhost:5432/"; @@ -51,6 +55,7 @@ try (Connection connection = DriverManager.getConnection(jdbc, props)) { System.err.println("Connection error: " + e.getMessage()); } ``` + You now have an open connection to CedarDB that allows you to insert data or query the database. ## Inserting Data @@ -62,6 +67,7 @@ Statement st = conn.createStatement(); st.execute("CREATE TABLE IF NOT EXISTS chatlog(userid integer, message text, ts timestamptz)"); st.close(); ``` + In the following, we insert a new tuple using the `conn` instance: ```Java @@ -94,6 +100,7 @@ statement.close(); ``` ## Bulk Loading + If you need to load a lot of data at once (e.g., for an initial import of your existing data set), inserting tuples one by one is too slow: jdbc has to do a full roundtrip to CedarDB and back for each single insert, making the whole loading process severely network latency bound, even on a local connection. 
@@ -113,11 +120,10 @@ System.out.println(rowsCopied + " rows inserted"); This feature makes use of CedarDB's Postgres-compatible `COPY` mode to bulk transmit all data, leading to significantly higher throughput: -``` +```text LOG: 100000 rows (0.000033 s parsing, 0.001294 s compilation, 0.263479 s transmission, 0.049921 s execution) ``` - ## Source Code {{% details title="Open to show the complete sample code" closed="true" %}} diff --git a/content/clients/javascript/drizzle.md b/content/clients/javascript/drizzle.md index dbcf04d8..cca8c1c4 100644 --- a/content/clients/javascript/drizzle.md +++ b/content/clients/javascript/drizzle.md @@ -7,7 +7,7 @@ weight: 10 [Drizzle ORM](https://orm.drizzle.team/) is a TypeScript-first ORM with a SQL-like query API and schema migration tooling. {{< callout type="info" >}} -Drizzle support is currently **under active development**, already offering **full query API support**. +Drizzle support is currently **under active development**, already offering **full query API support**. However, certain advanced features are not yet available. Schema migrations (e.g. via `drizzle-kit push`) might not work as expected. {{< /callout >}} diff --git a/content/clients/python/_index.md b/content/clients/python/_index.md index cf8baa09..6533bf9a 100644 --- a/content/clients/python/_index.md +++ b/content/clients/python/_index.md @@ -11,8 +11,8 @@ CedarDB supports the older, but still very common `psycopg2`, as well as the new This article is about the newer `psycopg`(3). {{< /callout >}} - ## Connecting + Connect to CedarDB like this: ```python @@ -20,9 +20,11 @@ connstr = "host=localhost port=5432 dbname= user= password=}} Be careful: To make sure that data is persisted, you + - have to explicitly call the commit method of your connection object (like we did above) **or** - let the connection object go out of scope without encountering an exception **or** - explictly enable autocommit for your connection (`autocommit=True`). 
@@ -105,6 +108,7 @@ Chatline(ts=datetime.datetime(2024, 4, 8, 11, 47, 46, 135798, tzinfo=zoneinfo.Zo ``` ## Bulk Loading + If you need to load a lot of data at once (e.g., for an initial import of your existing data set), inserting tuples one by one is too slow: psycopg has to do a full roundtrip to CedarDB and back for each single insert, making the whole loading process severely network latency bound, even on a local connection. @@ -119,15 +123,16 @@ with psycopg.connect(connstr) as conn: copy.write_row((ts + datetime.timedelta(seconds=i), i, "Hello!")) conn.commit() ``` + This feature makes use of CedarDB's Postgres-compatible `COPY` mode to bulk transmit all data, leading to significantly higher throughput: -``` +```text LOG: 1000000 rows (0.000013 s parsing, 0.000310 s compilation, 3.967416 s transmission, 0.023089 s execution) ``` - For a moderate performance gain, you can also copy data formatted as raw binary stream. Just append `(FORMAT BINARY)` to the `COPY` statement and specify the data types: + ```python with cur.copy("COPY chatlog (ts, userid, message) FROM STDIN (FORMAT BINARY)") as copy: copy.set_types(["timestamp","int4","text"]) @@ -136,17 +141,16 @@ with cur.copy("COPY chatlog (ts, userid, message) FROM STDIN (FORMAT BINARY)") a conn.commit() ``` -``` +```text LOG: 1000000 rows (0.000016 s parsing, 0.000344 s compilation, 3.677765 s transmission, 0.063557 s execution) ``` Please familiarize yourself with the limits of psycopg's binary copy support in the [official docs](https://www.psycopg.org/psycopg3/docs/basic/copy.html). - {{< callout type="info" >}} `execute()` and `executeMany()` automatically *prepare* statements that are executed multiple times in sequence. -You can also override this setting by passing `prepare=True|False` to both methods. -Take a look [here](../../references/advanced/prepare), why preparing your statements is a *very good thing*. +You can also override this setting by passing `prepare=True|False` to both methods. 
+Take a look at the [prepared statements reference](../../references/advanced/prepare) to understand why preparing your statements is a *very good thing*. {{< /callout >}} ## Pipelining diff --git a/content/clients/r/_index.md b/content/clients/r/_index.md index debfab44..19248664 100644 --- a/content/clients/r/_index.md +++ b/content/clients/r/_index.md @@ -40,7 +40,7 @@ dbExecute(con, "create table chatlog(userid integer, message text, ts timestamp) dbListTables(con) ``` -``` +```text [1] "chatlog" ``` diff --git a/content/clients/rust/_index.md b/content/clients/rust/_index.md index fe268720..dd6eadbb 100644 --- a/content/clients/rust/_index.md +++ b/content/clients/rust/_index.md @@ -60,7 +60,7 @@ for row in result { } ``` -``` +```text [2024-06-06 11:00:41.536369 +02:00]: User 7 wrote message "(☞゚∀゚)☞" ``` diff --git a/content/clients/tools/grafana.md b/content/clients/tools/grafana.md index 1adbdd47..47d680ea 100644 --- a/content/clients/tools/grafana.md +++ b/content/clients/tools/grafana.md @@ -6,7 +6,6 @@ weight: 100 CedarDB is compatible with [Grafana](https://grafana.com/), the popular dashboard building solution. You can use Grafana's PostgreSQL connector to visualize the data stored in CedarDB. - ## Set up Grafana {{% steps %}} @@ -20,16 +19,16 @@ For example, if you are using Ubuntu, use [this guide](https://grafana.com/docs/ Again, use the [corresponding page of the official documentation](https://grafana.com/docs/grafana/latest/setup-grafana/start-restart-grafana/). - ### Make sure everything works Use [this page](https://grafana.com/docs/grafana/latest/getting-started/build-first-dashboard/) of Grafana's documentation to test your installation and ensure it set it up correctly. ### Optional: reduce refresh interval -By default, Grafana refreshes its dashboard at most once every 5 seconds. +By default, Grafana refreshes its dashboard at most once every 5 seconds. 
If you want to update your dashboards more often, open `/etc/grafana/grafana.ini` in your favorite text editor: + ```ini min_refresh_interval = 100ms ``` @@ -40,12 +39,13 @@ Afterwards, restart Grafana to load the setting: sudo systemctl restart grafana-server ``` -You can then choose your own refresh intervals within your dashboard: Within your dashboard's setting menu, under the general tab, set the Auto refresh setting to e.g. `100ms,200ms,500ms,1s,5s,10s,30s,1m,5m`. +You can then choose your own refresh intervals within your dashboard: Within your dashboard's setting menu, under the general tab, set the Auto refresh setting to e.g. `100ms,200ms,500ms,1s,5s,10s,30s,1m,5m`. {{% /steps %}} ## Set up CedarDB -Now that Grafana is up and running, we have to start a CedarDB instance it can connect to. + +Now that Grafana is up and running, we have to start a CedarDB instance it can connect to. {{< callout type="info" >}} @@ -90,19 +90,17 @@ insert into test values(current_timestamp, (random() * 100)::int); {{% /steps %}} - ## Build a dashboard in Grafana {{% steps %}} ### Add a new data source -Within Grafana (e.g. on https://localhost:3000/), navigate to: Main Menu > Connections > Data sources > Add new data source - -Choose "PostgreSQL", and fill in the following settings: +Within Grafana (e.g. on <https://localhost:3000/>), navigate to: Main Menu > Connections > Data sources > Add new data source +Choose "PostgreSQL", and fill in the following settings: -``` +```text Name: cedardb Host URL: localhost:5432 Database name: grafana @@ -114,13 +112,14 @@ PostgreSQL Version: 15 Min time interval: 100ms ``` -Then click on "Save & test". You will get an error message "Internal Server Error" with the message `ERROR: schema "information_schema" does not exist` in the CedarDB logs. This is expected behaviour and fine for now. +Then click on "Save & test". You will get an error message "Internal Server Error" with the message `ERROR: schema "information_schema" does not exist` in the CedarDB logs.
This is expected behaviour and fine for now. ### Build a dashboard On the top right of the data source window, click on "Build a dashbaord", and then "Add visualization". Choose your new "CedarDB" data source. In the Query builder, toggle the "Code" view on the far right and enter the statement + ```sql select * from test; ``` @@ -128,23 +127,22 @@ select * from test; ![Query Builder](/images/grafana/querybuilder.png) {{< callout type="warning" >}} -The interactive query builder is currently not supported, as it relies on database instrospection features that CedarDB does not support yet. +The interactive query builder is currently not supported, as it relies on database introspection features that CedarDB does not support yet. {{< /callout >}} - When choosing the correct interval (e.g., "Last 10 minutes") you should see an automatically updating live view of your data. Apply the changes on the top right to get a view of your new dashboard. ![Chart](/images/grafana/chart.png) - - ### Fiddle with the refresh intervals + If you have changed the minimum refresh interval of Grafana earlier, you can set the auto refresh interval in the top right to a lower value (e.g., 100ms). Rerun your `watch` command in your psql shell: ```sql insert into test values(current_timestamp, (random() * 100)::int); \watch 0.05 ``` + to get a more responsive Grafana. {{% /steps %}} diff --git a/content/clients/tools/psql.md b/content/clients/tools/psql.md index c674feb9..8a38c09e 100644 --- a/content/clients/tools/psql.md +++ b/content/clients/tools/psql.md @@ -17,7 +17,7 @@ To start, connect psql with CedarDB with your connection parameters: psql -h localhost -d $DBNAME -U $USERNAME ``` -``` +```text psql (16.3 (Ubuntu 16.3-0ubuntu0.24.04.1)) Type "help" for help.
@@ -33,7 +33,7 @@ Afterward, you have an open connection to your database, where you can enter any ## Creating a database psql is a handy too for one-time setup tasks like to create your database, set up user accounts, and to create schemas. -See [here](/docs/get_started/quickstart). +See the [quickstart guide](/docs/get_started/quickstart). Nevertheless, we recommend to keep such configuration in a separate `.sql` file in your version control system. In interactive mode, you can execute all SQL commands from such a file with the `\i` command: @@ -66,7 +66,7 @@ You can show a list of all databases within the current system: \l ``` -``` +```text List of databases Name | Owner | Encoding | Locale Provider | Collate | Ctype | ICU Locale | ICU Rules | Access privileges ----------+----------+----------+-----------------+---------+-------+------------+-----------+------------------- @@ -81,7 +81,7 @@ You can also show a list of all tables in the current database: \d ``` -``` +```text List of relations Schema | Name | Type | Owner --------+----------+-------+---------- diff --git a/content/community_edition.md b/content/community_edition.md index adf7317c..f7f00bf3 100644 --- a/content/community_edition.md +++ b/content/community_edition.md @@ -5,15 +5,19 @@ weight: 5 {{< tabs >}} {{< tab name="Docker" >}} + ```shell docker pull cedardb/cedardb ``` + ...and visit our [**Run With Docker**](../get_started/install_with_docker) guide for more details. {{< /tab >}} {{< tab name="Local Install" >}} -``` + +```shell curl https://get.cedardb.com | bash ``` + ...and visit our [**Local Install**](../get_started/install_locally) guide for more details. {{< /tab >}} {{< tab name="Cloud Setup" >}} @@ -26,31 +30,32 @@ Visit [**Operate in the Cloud**](../get_started/operate_in_cloud) for more detai --- The **CedarDB Community Edition** + - is free forever - requires no signup - is ready to run on any machine. 
It is **the right choice** for everything from solo projects to small-scale production workloads. No strings attached! - {{% callout type="info" %}} Looking for advanced features or enterprise support? Check out the **Enterprise Edition** or visit our [pricing page](https://cedardb.com/pricing). {{% /callout %}} -| | Community Edition | Enterprise Edition | -|------------------------------|---------------------------------------------------------------------------------------|---------------------------| -| Maximum Data Set Size | 64 GiB | unlimited | -| Commercial Support | [Contact Us](mailto:sales@cedardb.com) (consider joining our [Community Slack](https://bonsai.cedardb.com/slack)) | Per terms of your license | -| License | [Community Edition license](https://cedardb.com/legal/agreements/community_tcs.pdf) | Paid enterprise license | - +| | Community Edition | Enterprise Edition | +|------------------------------|-------------------------------------------------------------------------------------------------------------------|---------------------------| +| Maximum Data Set Size | 64 GiB | unlimited | +| Commercial Support | [Contact Us](mailto:sales@cedardb.com) (consider joining our [Community Slack](https://bonsai.cedardb.com/slack)) | Per terms of your license | +| License | [Community Edition license](https://cedardb.com/legal/agreements/community_tcs.pdf) | Paid enterprise license | ## Requirements + To run CedarDB, you need a Linux-based OS and an x86 or ARM CPU. The CedarDB docker image also works on macOS. 
## Installation CedarDB can be installed in multiple environments: + - [Run locally](../get_started/install_locally) on your own hardware - [Use Docker](../get_started/install_with_docker) for a quick setup - [Deploy to the cloud](../get_started/operate_in_cloud) on your preferred infrastructure diff --git a/content/compatibility/ecosystem_and_clients.md b/content/compatibility/ecosystem_and_clients.md index f00ca876..74b900ba 100644 --- a/content/compatibility/ecosystem_and_clients.md +++ b/content/compatibility/ecosystem_and_clients.md @@ -23,26 +23,26 @@ range of PostgreSQL connectors for end-user tools like Grafana and programming l ### Applications -| **Application** | **Version** | **Support State** | **Details** | -|-----------------|-------------|-------------------|------------------------------------------| +| **Application** | **Version** | **Support State** | **Details** | +|-----------------|-------------|-------------------|------------------------------------------------| | DataGrip | 2024.2.2 | Partial | [Documentation](/docs/clients/tools/datagrip/) | | DBeaver | 24.2.2 | Partial | [Documentation](/docs/clients/tools/dbeaver/) | | Grafana | 10.4.2 | Partial | [Documentation](/docs/clients/tools/grafana/) | ### Programming Language Libraries -| **Language** | **Framework** | **Version** | **Support State** | **Details** | -|--------------|----------------|-------------|-------------------|-----------------------------------------| -| C# | Npgsql | 8.0.4 | Full | [Documentation](/docs/clients/csharp/) | -| C++ | libpqxx | 7.9.1 | Full | [Documentation](/docs/clients/cpp/) | -| Java | JDBC | 42.7.4 | Full | [Documentation](/docs/clients/java/) | -| JavaScript | node-postgres | 8.13.0 | Full | [Documentation](/docs/clients/javascript/) | -| | drizzle-orm | 0.45.1 | Partial | [Documentation](/docs/clients/javascript/drizzle/) | -| | prisma-orm | 7.7.0 | Partial | [Documentation](/docs/clients/javascript/prisma/) | -| Python | psycopg2 | 2.9.10 | Full | 
| -| | psycopg | 3.2.3 | Full | [Documentation](/docs/clients/python/) | -| R | RPostgres | 1.4.7 | Full | [Documentation](/docs/clients/r/) | -| Rust | tokio-postgres | 0.7.12 | Full | [Documentation](/docs/clients/rust/) | +| **Language** | **Framework** | **Version** | **Support State** | **Details** | +|--------------|----------------|-------------|-------------------|----------------------------------------------------| +| C# | Npgsql | 8.0.4 | Full | [Documentation](/docs/clients/csharp/) | +| C++ | libpqxx | 7.9.1 | Full | [Documentation](/docs/clients/cpp/) | +| Java | JDBC | 42.7.4 | Full | [Documentation](/docs/clients/java/) | +| JavaScript | node-postgres | 8.13.0 | Full | [Documentation](/docs/clients/javascript/) | +| | drizzle-orm | 0.45.1 | Partial | [Documentation](/docs/clients/javascript/drizzle/) | +| | prisma-orm | 7.7.0 | Partial | [Documentation](/docs/clients/javascript/prisma/) | +| Python | psycopg2 | 2.9.10 | Full | | +| | psycopg | 3.2.3 | Full | [Documentation](/docs/clients/python/) | +| R | RPostgres | 1.4.7 | Full | [Documentation](/docs/clients/r/) | +| Rust | tokio-postgres | 0.7.12 | Full | [Documentation](/docs/clients/rust/) | ## Extensions diff --git a/content/compatibility/sql_features.md b/content/compatibility/sql_features.md index 8b779f83..6e6f2eca 100644 --- a/content/compatibility/sql_features.md +++ b/content/compatibility/sql_features.md @@ -17,20 +17,20 @@ the [system table compatibility](../system_table) page. 
### Table Creation & Deletion -| **Feature** | **Support State** | **Details** | -|-----------------------|-------------------|---------------------------------------------------------------------------------------------| +| **Feature** | **Support State** | **Details** | +|-----------------------|-------------------|------------------------------------------------------------------------| | CREATE TABLE | Yes | [Documentation](/docs/references/objects/tables/) | -| DROP TABLE | Yes | | -| Default Values | Yes | | -| GENERATED | Yes | only AS IDENTITY | -| Check Constraints | No | | +| DROP TABLE | Yes | | +| Default Values | Yes | | +| GENERATED | Yes | only AS IDENTITY | +| Check Constraints | No | | | Not-Null Constraints | Yes | [Documentation](/docs/references/objects/tables/) | | Unique Constraints | Yes | [Documentation](/docs/references/objects/tables/) | | Primary Keys | Yes | [Documentation](/docs/references/objects/tables/) | | Foreign Keys | Yes | Without ON DELETE
[Documentation](/docs/references/objects/tables/) | -| Named Constraints | No | | -| Exclusion Constraints | No | | -| System Columns | Yes | Only meaningful for tableoid and ctid | +| Named Constraints | No | | +| Exclusion Constraints | No | | +| System Columns | Yes | Only meaningful for tableoid and ctid | ### Table Modification (ALTER TABLE) @@ -51,52 +51,52 @@ the [system table compatibility](../system_table) page. ### Privileges -| **Feature** | **Support State** | **Details** | -|-----------------------|-------------------|-----------------------------------------------------------------------| +| **Feature** | **Support State** | **Details** | +|-----------------------|-------------------|--------------------------------------------------| | CREATE ROLE | Yes | [Documentation](/docs/references/objects/roles) | -| OWNER TO | Yes | | -| ALTER ROLE | Yes | [Documentation](/docs/references/objects/roles) | -| GRANT | Yes | Only GRANT role to other_role | -| REVOKE | No | | -| SET ROLE | No | | +| OWNER TO | Yes | | +| ALTER ROLE | Yes | [Documentation](/docs/references/objects/roles) | +| GRANT | Yes | Only GRANT role to other_role | +| REVOKE | No | | +| SET ROLE | No | | | INHERIT | Yes | [Documentation](/docs/references/objects/roles/) | -| Row Security Policies | No | | +| Row Security Policies | No | | ### Indexes -| **Feature** | **Support State** | **Details** | -|------------------------|-------------------|--------------------------------------------------------------------------------------------| +| **Feature** | **Support State** | **Details** | +|------------------------|-------------------|------------------------------------------------------------------------| | CREATE INDEX | Yes | Only B-Tree Indexes [Documentation](/docs/references/objects/indexes/) | -| GIN | No | | -| BRIN | No | | +| GIN | No | | +| BRIN | No | | | Multicolumn Indexes | Yes | [Documentation](/docs/references/objects/indexes/) | | Ordered Indexes | Yes | 
[Documentation](/docs/references/objects/indexes/#column-order) | -| Unique Indexes | Yes | | -| Indexes on Expressions | No | | -| Partial Indexes | No | | +| Unique Indexes | Yes | | +| Indexes on Expressions | No | | +| Partial Indexes | No | | ### Misc -| **Feature** | **Support State** | **Details** | -|------------------------|-------------------|--------------------------------------------------------------------------------------------------------------| +| **Feature** | **Support State** | **Details** | +|------------------------|-------------------|-----------------------------------------------------------------------------------------| | CREATE SCHEMA | Yes | [Documentation](/docs/references/objects/schemas/) | -| DROP SCHEMA | Yes | Only if the schema is empty | +| DROP SCHEMA | Yes | Only if the schema is empty | | search_path | Yes | [Documentation](/docs/references/objects/schemas/#using-schemas) | -| Table Inheritance | No | | -| Table Partitioning | Yes | Only at creation, only by hash | -| Foreign Data Wrappers | No | | +| Table Inheritance | No | | +| Table Partitioning | Yes | Only at creation, only by hash | +| Foreign Data Wrappers | No | | | Views | Yes | [Documentation](/docs/references/objects/views/) | -| Databases | Yes | [Documentation](/docs/references/objects/databases/) | +| Databases | Yes | [Documentation](/docs/references/objects/databases/) | | Functions & Procedures | Yes | [Documentation](/docs/references/objects/functions/)
Also in cedar_script language | -| Custom Types | No | | -| Triggers | No | | -| Prepared Statements | Yes | | -| Advisory Locks | Yes | [Documentation](/docs/references/functions/system/#advisory-locks) | +| Custom Types | No | | +| Triggers | No | | +| Prepared Statements | Yes | | +| Advisory Locks | Yes | [Documentation](/docs/references/functions/system/#advisory-locks) | ## Data Manipulation -| **Feature** | **Support State** | **Details** | -|-------------|-------------------|-----------------------------------------------------------------------------| +| **Feature** | **Support State** | **Details** | +|-------------|-------------------|---------------------------------------------------------| | INSERT | Yes | [Documentation](/docs/references/dml/insert/) | | UPDATE | Yes | [Documentation](/docs/references/dml/update/) | | DELETE | Yes | [Documentation](/docs/references/dml/delete/) | @@ -108,32 +108,32 @@ the [system table compatibility](../system_table) page. ## Queries -| **Feature** | **Support State** | **Details** | -|---------------------------|-------------------|----------------------------------------------------------------------------------| -| Table & View References | Yes | | -| Inner Joins | Yes | [Documentation](/docs/references/queries/from/) | -| Outer Joins | Yes | [Documentation](/docs/references/queries/from/) | -| Semijoins | Yes | [Documentation](/docs/references/queries/from/) | -| Antijoins | Yes | | -| Table Functions | Yes | | -| Lateral Subqueries | Yes | | -| User-Specified Aliases | Yes | | -| GROUP BY | Yes | [Documentation](/docs/references/queries/groupby/) | -| HAVING | Yes | [Documentation](/docs/references/queries/groupby/) | -| GROUPING SETS | Yes | | -| CUBE | Yes | | -| ROLLUP | Yes | | -| WINDOW Functions | Yes | [Documentation](/docs/references/queries/window/) | -| WITH | Yes | [Documentation](/docs/references/queries/with/) | -| WITH RECURSIVE | Yes | | -| UNION | Yes | | -| UNION ALL | Yes | | -| INTERSECT | Yes 
| | -| EXCEPT | Yes | | -| ORDER BY | Yes | | -| LIMIT | Yes | | -| OFFSET | Yes | | -| Table Generating Function | Yes | | +| **Feature** | **Support State** | **Details** | +|---------------------------|-------------------|----------------------------------------------------| +| Table & View References | Yes | | +| Inner Joins | Yes | [Documentation](/docs/references/queries/from/) | +| Outer Joins | Yes | [Documentation](/docs/references/queries/from/) | +| Semijoins | Yes | [Documentation](/docs/references/queries/from/) | +| Antijoins | Yes | | +| Table Functions | Yes | | +| Lateral Subqueries | Yes | | +| User-Specified Aliases | Yes | | +| GROUP BY | Yes | [Documentation](/docs/references/queries/groupby/) | +| HAVING | Yes | [Documentation](/docs/references/queries/groupby/) | +| GROUPING SETS | Yes | | +| CUBE | Yes | | +| ROLLUP | Yes | | +| WINDOW Functions | Yes | [Documentation](/docs/references/queries/window/) | +| WITH | Yes | [Documentation](/docs/references/queries/with/) | +| WITH RECURSIVE | Yes | | +| UNION | Yes | | +| UNION ALL | Yes | | +| INTERSECT | Yes | | +| EXCEPT | Yes | | +| ORDER BY | Yes | | +| LIMIT | Yes | | +| OFFSET | Yes | | +| Table Generating Function | Yes | | ## Data Types @@ -747,8 +747,8 @@ the [system table compatibility](../system_table) page. 
##### Generic -| **Feature** | **Support State** | **Details** | -|-----------------------|-------------------|-------------------------------------------------------------------------------------------------------------------:| +| **Feature** | **Support State** | **Details** | +|-----------------------|-------------------|------------------------------------------------------------------------------------------------------:| | any_value | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#general-purpose-functions) | | array_agg | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#general-purpose-functions) | | avg | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#general-purpose-functions) | @@ -759,22 +759,22 @@ the [system table compatibility](../system_table) page. | bool_or | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#general-purpose-functions) | | count(*) | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#general-purpose-functions) | | count("any") | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#general-purpose-functions) | -| json(b)_agg | No | | -| json(b)_objectagg | No | | -| json(b)_object_agg | No | | -| json_arrayagg | No | | +| json(b)_agg | No | | +| json(b)_objectagg | No | | +| json(b)_object_agg | No | | +| json_arrayagg | No | | | max | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#general-purpose-functions) | | min | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#general-purpose-functions) | -| range(_intersect)_agg | No | | +| range(_intersect)_agg | No | | | string_agg | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#general-purpose-functions) | | sum | Yes | [Aggregate Function 
Documentation](/docs/references/functions/aggregation/#general-purpose-functions) | -| xmlagg | No | | +| xmlagg | No | | ##### Statistical -| **Feature** | **Support State** | **Details** | -|----------------|-------------------|----------------------------------------------------------------------------------------------------------------:| -| corr | No | | +| **Feature** | **Support State** | **Details** | +|----------------|-------------------|---------------------------------------------------------------------------------------------------:| +| corr | No | | | covar_pop | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#statistical-aggregates) | | covar_samp | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#statistical-aggregates) | | regr_avgx | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#statistical-aggregates) | @@ -785,7 +785,7 @@ the [system table compatibility](../system_table) page. | regr_slope | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#statistical-aggregates) | | regr_sxx | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#statistical-aggregates) | | regr_sxy | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#statistical-aggregates) | -| regr_syy | No | | +| regr_syy | No | | | stddev | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#statistical-aggregates) | | stddev_pop | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#statistical-aggregates) | | stddev_samp | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation/#statistical-aggregates) | @@ -795,11 +795,11 @@ the [system table compatibility](../system_table) page. 
##### Ordered-Set -| **Feature** | **Support State** | **Details** | -|-----------------|-------------------|--------------------------------------------------------------------------------------------------------------------------:| -| mode | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation//#ordered-set-aggregate-functions) | -| percentile_cont | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation//#ordered-set-aggregate-functions) | -| percentile_disc | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation//#ordered-set-aggregate-functions) | +| **Feature** | **Support State** | **Details** | +|-----------------|-------------------|--------------------------------------------------------------------------------------------------------------:| +| mode | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation//#ordered-set-aggregate-functions) | +| percentile_cont | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation//#ordered-set-aggregate-functions) | +| percentile_disc | Yes | [Aggregate Function Documentation](/docs/references/functions/aggregation//#ordered-set-aggregate-functions) | #### Window diff --git a/content/compatibility/system_table.md b/content/compatibility/system_table.md index 59944da6..3ebe9c46 100644 --- a/content/compatibility/system_table.md +++ b/content/compatibility/system_table.md @@ -119,7 +119,7 @@ For more information on cold and hot data, see [this blog post](https://cedardb. 
This is an excerpt of the output for [TPCH](https://www.tpc.org/tpch/) with scale factor 1: -``` +```text oid tablename attributename datatype encoding compressedvaluesize compressedsize uncompressedsize tuplecount ----- 268435460 partsupp ps_comment text simple dictionary fourbyte 104230496 107284124 768830 @@ -266,4 +266,3 @@ select * from information_schema.tables; | view_routine_usage | 🟡 | | | view_table_usage | 🟡 | | | views | 🟢 | | - diff --git a/content/cookbook/aurora_debezium.md b/content/cookbook/aurora_debezium.md index 5621817a..9b566ddb 100644 --- a/content/cookbook/aurora_debezium.md +++ b/content/cookbook/aurora_debezium.md @@ -6,14 +6,12 @@ weight: 102 [Debezium](https://debezium.io/) is a popular platform for [Change Data Capture (CDC)](https://en.wikipedia.org/wiki/Change_data_capture). -This guide shows you how you can replicate tables from your transactional Amazon Aurora PostgreSQL to CedarDB, allowing you to do fast analytics on data is it comes in without impacting your existing data infrastructure. - +This guide shows you how you can replicate tables from your transactional Amazon Aurora PostgreSQL to CedarDB, allowing you to do fast analytics on data as it comes in without impacting your existing data infrastructure. ## Setting up Replication {{% steps %}} - ### Starting an EC2 Instance CedarDB and Debezium will live inside this instance. @@ -24,7 +22,6 @@ If you do not already know your requirements, we recommend using the `m6id.2xlar The rest of this instruction manual assumes you use Ubuntu 24.04 as your operating system. Since CedarDB runs inside its own docker image, you can choose any other OS as well but you might have to adapt the installation instructions accordingly. - {{< callout type="info" >}} Configure the EBS volume where your root partition is mounted to be large enough to hold all of the data Debezium needs to store its CDC events.
By default, it retains all events for one week and there will be one message per insert/update/delete of all replicated tables. @@ -34,6 +31,7 @@ For playing around, the default of 8 GiB is fine. ### Setting up your EC2 Instance CedarDB loves fast SSDs. If your instance comes with an ephemeral SSD, mount it like this: + ```shell sudo mkfs.ext4 -E nodiscard /dev/nvme1n1 mkdir /home/ubuntu/db @@ -42,6 +40,7 @@ sudo chown ubuntu:ubuntu db ``` Next, we install docker: + ```shell sudo apt update sudo apt install ca-certificates curl @@ -58,6 +57,7 @@ sudo apt install docker-ce docker-ce-cli containerd.io docker-buildx-plugin dock ``` Before you can docker commands, you need to add your user to the docker group and re-login: + ```shell sudo adduser ubuntu docker ``` @@ -68,7 +68,6 @@ Finally, build the docker image using the CedarDB Dockerfile. docker build --tag cedardb . ``` - ### Starting an Amazon Aurora PostgreSQL Cluster If you already have a cluster, you can skip this step. @@ -77,7 +76,6 @@ If you already have a cluster, you can skip this step. If you do not already know your requirements, we recommend using the `db.r6gd.xlarge` instance type which comes with 32 GiB of memory and 4 vCPUs. You can definitely go cheaper here, if you just want to play around a little bit. {{< /callout >}} - Make sure to connect it to your EC2 instance. In this example, we will assume you have created an admin user `postgresuser` with password `postgrespw`. @@ -89,15 +87,14 @@ If you use such easily guessable credentials, make sure this cluster is only rea If you intend to do more than just play around, you should not give the Postgres user used for replication root access! Instead, it's best practice to [create a separate user with replication privileges](https://debezium.io/documentation/reference/stable/connectors/postgresql.html#postgresql-permissions). 
{{< /callout >}} - - ### Configuring your Amazon Aurora PostgreSQL Cluster + Amazon Aurora PostgreSQL needs to be configured for logical replication to Debezium. You can take a look at the [Debezium documentation](https://debezium.io/documentation/reference/stable/connectors/postgresql.html#postgresql-in-the-cloud) for details and instructions to check if your cluster is already set up correctly. If not, here are the steps to enable it: 1. Create a new parameter group for your cluster - + Call it, e.g., `logicalreplication`, set the engine type to `Aurora PostgreSQL`, the family to your PostgreSQL version, e.g. `aurora-postgresql15` and the type to `DB Cluster Parameter Group`. Then, within that parameter group, change the parameter `rds.logical_replication` to `1`. @@ -105,10 +102,10 @@ If not, here are the steps to enable it: 2. Apply this group to your cluster 3. Restart your cluster (or wait for the next maintenance window) - ### Starting CedarDB and Debezium Create a file `docker-compose.yml` with the following content: + ```yml services: zookeeper: @@ -158,16 +155,16 @@ Make sure that your docker container database is created somewhere on a fast SSD If you followed this guide, `/home/ubuntu/db` should point to your fast ephemeral ssd (if your instance has one). 
Then, start all services with the following command: + ```shell docker compose up ``` -### Install psql to talk to CedarDB and Postgres: +### Install psql to talk to CedarDB and Postgres ```shell sudo apt install posgresql-common postgresql-client-16 ``` - ### Creating a Source and Sink Configuration for Debezium @@ -215,7 +212,7 @@ Create a file `sink.json` with the following contents: "schema.evolution": "basic", "delete.enabled": "true", "primary.key.mode": "record_key", - "primary.key.fields": "lineitem_id", + "primary.key.fields": "lineitem_id", "table.name.format": "${source.table}" } } @@ -223,7 +220,6 @@ Create a file `sink.json` with the following contents: This configuration assumes we want to replicate a table called `lineitem` with a primary key called `lineitem_id`. Modify both files to work with your Amazon Aurora PostgreSQL and CedarDB credentials. - ### Starting Source and Sink Execute the following commands to register the source and sink with Debezium and start them: @@ -238,13 +234,14 @@ It's possible that you need to restart all containers (`docker compose down` the {{< callout type="info" >}} If you want to delete source and sink, you can use the following commands: + ```shell curl -i -X DELETE -H "Accept:application/json" -H "Content-Type:application/json" http://localhost:8083/connectors/postgres-source curl -i -X DELETE -H "Accept:application/json" -H "Content-Type:application/json" http://localhost:8083/connectors/cedar-sink ``` -{{< /callout >}} +{{< /callout >}} {{% /steps %}} @@ -282,7 +279,6 @@ INSERT INTO lineitem (lineitem_id, transaction_id, product_id, quantity, unit_p (4, 1003, 2004, 7.00, 75.00, 3.00, 8.00, '2024-10-16', 'void', 'Voided line item for test purposes'); ``` - ### Checking Replication in CedarDB Now connect to CedarDB (e.g., via `PGPASSWORD=postgres psql -h localhost -U postgres`) and check the replicated table: @@ -292,6 +288,7 @@ select * from lineitem; ``` ## Automate + Let's create some more rows! 
Create a file `inserter.py` with the following content: ```python @@ -354,10 +351,10 @@ finally: cur.close() conn.close() ``` -It requires psycopg2 which you can install via `sudo apt install python3-psycopg2`. -Then run `python3 inserter.py`. +It requires psycopg2 which you can install via `sudo apt install python3-psycopg2`. +Then run `python3 inserter.py`. ## Running Analytical Queries @@ -376,4 +373,4 @@ ORDER BY avg_tax_rate DESC; ``` -You can now run all your expensive analytical queries against CedarDB while keeping your PostgreSQL database system as system of record. \ No newline at end of file +You can now run all your expensive analytical queries against CedarDB while keeping your PostgreSQL database system as system of record. diff --git a/content/cookbook/aws_dms.md b/content/cookbook/aws_dms.md index dbd61255..44281119 100644 --- a/content/cookbook/aws_dms.md +++ b/content/cookbook/aws_dms.md @@ -111,10 +111,12 @@ psql (17.4, server 16.3) SSL connection (protocol: TLSv1.3, cipher: TLS_AES_256_GCM_SHA384, compression: off, ALPN: none) Type "help" for help. 
``` + ```sql postgres=> \dn ``` -``` + +```text List of schemas Name | Owner -----------+------------------- @@ -122,10 +124,12 @@ postgres=> \dn public | pg_database_owner (2 rows) ``` + ```sql postgres=> \dt employees.* ``` -``` + +```text List of relations Schema | Name | Type | Owner -----------+---------------------+-------+---------- @@ -137,10 +141,12 @@ postgres=> \dt employees.* employees | title | table | postgres (6 rows) ``` + ```sql postgres=> select count(*) from employees.department; ``` -``` + +```text count ------- 9 @@ -148,10 +154,12 @@ postgres=> select count(*) from employees.department; Time: 15.528 ms ``` + ```sql postgres=> select count(*) from employees.department_employee; ``` -``` + +```text count -------- 331603 @@ -159,10 +167,12 @@ postgres=> select count(*) from employees.department_employee; Time: 26.052 ms ``` + ```sql postgres=> select count(*) from employees.department_manager; ``` -``` + +```text count ------- 24 @@ -170,20 +180,25 @@ postgres=> select count(*) from employees.department_manager; Time: 18.947 ms ``` + ```sql postgres=> select count(*) from employees.employee; ``` -``` + +```text count -------- 300024 (1 row) Time: 23.509 ms +``` + ```sql postgres=> select count(*) from employees.salary; ``` -``` + +```text count --------- 2844047 @@ -191,10 +206,12 @@ postgres=> select count(*) from employees.salary; Time: 82.885 ms ``` + ```sql postgres=> select count(*) from employees.title; ``` -``` + +```text count -------- 443308 @@ -250,10 +267,12 @@ psql (17.4, server 16.3 cedar v2025-06-20) SSL connection (protocol: TLSv1.3, cipher: TLS_AES_256_GCM_SHA384, compression: off, ALPN: none) Type "help" for help. 
``` + ```sql postgres=# \dn ``` -``` + +```text List of schemas Name | Owner -----------+---------- @@ -265,7 +284,8 @@ postgres=# \dn ```sql postgres=# \dt employees.* ``` -``` + +```text List of relations Schema | Name | Type | Owner -----------+---------------------+-------+---------- @@ -281,7 +301,8 @@ postgres=# \dt employees.* ```sql postgres=# select count(*) from employees.department; ``` -``` + +```text count ------- 9 @@ -293,7 +314,8 @@ Time: 17.725 ms ```sql postgres=# select count(*) from employees.department_employee; ``` -``` + +```text count -------- 331603 @@ -305,7 +327,8 @@ Time: 15.559 ms ```sql postgres=# select count(*) from employees.department_manager; ``` -``` + +```text count ------- 24 @@ -317,7 +340,8 @@ Time: 14.472 ms ```sql postgres=# select count(*) from employees.employee; ``` -``` + +```text count -------- 300024 @@ -329,7 +353,8 @@ Time: 21.151 ms ```sql postgres=# select count(*) from employees.salary; ``` -``` + +```text count --------- 2844047 @@ -341,7 +366,8 @@ Time: 15.671 ms ```sql postgres=# select count(*) from employees.title; ``` -``` + +```text count -------- 443308 @@ -355,7 +381,7 @@ otherwise, you can close it. ## Verify that an UPDATE on the RDS Postgres instance is replicated to the CedarDB target -### Log into the RDS Postgres instance. +### Log into the RDS Postgres instance ```bash $ psql "postgresql://postgres:g3n0A3aRApaimA@rds-pg.ctkkwgwc2jnl.us-east-2.rds.amazonaws.com:5432/postgres?sslmode=require" @@ -367,14 +393,15 @@ Type "help" for help. postgres=> ``` -### Using the employees.salary table, validate the initial state agrees both here on the source and on the CedarDB target. 
+### Using the employees.salary table, validate the initial state agrees both here on the source and on the CedarDB target On the RDS Postgres instance: ```sql postgres=> select * from employees.salary where employee_id = 10004 order by from_date, to_date; ``` -``` + +```text employee_id | amount | from_date | to_date -------------+--------+------------+------------ 10004 | 40054 | 1986-12-01 | 1987-12-01 @@ -403,7 +430,8 @@ On the CedarDB instance: ```sql postgres=# select * from employees.salary where employee_id = 10004 order by from_date, to_date; ``` -``` + +```text employee_id | amount | from_date | to_date -------------+--------+------------+------------ 10004 | 40054 | 1986-12-01 | 1987-12-01 @@ -427,7 +455,7 @@ postgres=# select * from employees.salary where employee_id = 10004 order by fro Time: 31.985 ms ``` -### On the RDS Postgres instance, give the employee with ID 10004 a raise. +### On the RDS Postgres instance, give the employee with ID 10004 a raise ```sql postgres=> begin; @@ -449,7 +477,8 @@ Verify that raise: ```sql postgres=> select * from employees.salary where employee_id = 10004 and amount >= 74057 order by from_date; ``` -``` + +```text employee_id | amount | from_date | to_date -------------+--------+------------+------------ 10004 | 74057 | 2001-11-27 | 2026-03-23 @@ -464,7 +493,8 @@ Time: 34.304 ms ```sql postgres=# select * from employees.salary where employee_id = 10004 and amount >= 74057 order by from_date; ``` -``` + +```text employee_id | amount | from_date | to_date -------------+--------+------------+------------ 10004 | 74057 | 2001-11-27 | 2026-03-23 @@ -473,4 +503,3 @@ postgres=# select * from employees.salary where employee_id = 10004 and amount > Time: 26.653 ms ``` - diff --git a/content/cookbook/clickbench.md b/content/cookbook/clickbench.md index cf82c26c..5e3aab3e 100644 --- a/content/cookbook/clickbench.md +++ b/content/cookbook/clickbench.md @@ -234,7 +234,7 @@ Now let's also do a full run with all queries and report 
the total time: cat <(echo '\\timing') queries.sql | psql -h localhost -U postgres | grep 'Time' | awk '{print "Q" NR-1 " " $0; sum += $2;} END {print "Total: " sum " ms = " NR*60000/sum " qpm";}' ``` -``` +```text Q0 Time: 11.032 ms Q1 Time: 13.756 ms Q2 Time: 65.314 ms @@ -286,7 +286,7 @@ the `c6a.4xlarge` results from the dashboard. If you want to compare performance to a wider variety of hardware, Phoronix has a number of results for ClickHouse on OpenBenchmarking.org: -https://openbenchmarking.org/test/pts/clickhouse +<https://openbenchmarking.org/test/pts/clickhouse> Their results use the [geometric mean](https://en.wikipedia.org/wiki/Geometric_mean) for queries per minute (qpm). You can use the following command to get comparable qpm numbers. @@ -294,7 +294,7 @@ You can use the following command to get comparable qpm numbers. cat <(echo '\\timing') queries.sql | psql -h localhost -U postgres | grep 'Time' | awk '{sum += log($2);} END {print "Geometric mean: " exp(1)^(sum/NR) " ms = " 60000/exp(1)^(sum/NR) " qpm ";}' ``` -``` +```text Geometric mean: 197.884 ms = 303.208 qpm ``` diff --git a/content/cookbook/importing_from_json.md b/content/cookbook/importing_from_json.md index 7be935ad..39ff2dfd 100644 --- a/content/cookbook/importing_from_json.md +++ b/content/cookbook/importing_from_json.md @@ -5,7 +5,7 @@ weight: 12 --- CedarDB natively supports storing JSON documents in tables and working with JSON in SQL. -Two data types: `json` that stores the documents as text, and `jsonb` which stores a binary representation to allow +Two data types: `json` that stores the documents as text, and `jsonb` which stores a binary representation to allow efficient access to fields of the document.
## Importing JSON @@ -20,15 +20,18 @@ You can import data from a [JSON Lines](https://jsonlines.org/) file: ``` Load the data into a table: + ```sql create table stars_json (star json); copy stars_json from 'stars.json'; ``` Now you can use the json documents in SQL queries: + ```sql select star->>'name' as name from stars_json where star->>'gender' = 'F'; ``` + ```sql name --------------- @@ -49,7 +52,7 @@ is planned for a future CedarDB release. ## Relationalize JSON To efficiently execute queries on data, we recommend to transform JSON documents to relational tables. -When storing data in CedarDB's native storage format, it uses advanced statistics and columnar data storage for +When storing data in CedarDB's native storage format, it uses advanced statistics and columnar data storage for efficient execution. For the previous example, you can relationalize by creating a table with explicit [data types](../../references/datatypes). @@ -59,7 +62,6 @@ JSON field access returns `null` when a key is not present. Depending on your JSON schema, you can also mark columns as `not null`. {{< /callout >}} - ```sql create table stars ( id integer primary key generated always as identity, diff --git a/content/cookbook/importing_from_postgresql.md b/content/cookbook/importing_from_postgresql.md index 9b474cbf..67a39e1e 100644 --- a/content/cookbook/importing_from_postgresql.md +++ b/content/cookbook/importing_from_postgresql.md @@ -5,9 +5,12 @@ weight: 10 --- In this section, you will learn how to seamlessly transfer data from PostgreSQL to CedarDB. +## Migration steps + {{% steps %}} ### Dump the schema from Postgres + Make sure your Postgres instance is running. 
Then use the Tool `pg_dump` (probably supplied together with PostgreSQL by your package manager): @@ -15,13 +18,14 @@ Then use the Tool `pg_dump` (probably supplied together with PostgreSQL by your pg_dump --schema-only postgres > schema.sql ``` - {{< callout type="info" >}} `postgres` is the default database into which PostgreSQL inserts new tables. Replace with other database in above command if you want to export the tables of a different database. {{< /callout >}} ### Adapt the dumped schema + CedarDB does not yet support some settings PostgreSQL tries to set. Remove them from the schema dump for now by running the following three `sed` commands: + ```shell sed -i.bak 's/^SET.*$//g' schema.sql sed -i.bak 's/.*set_config.*//g' schema.sql @@ -29,6 +33,7 @@ sed -i.bak 's/^ALTER TABLE .* OWNER TO .*//g' schema.sql ``` ### Remove unsupported data types + CedarDB doesn't support some data types yet, especially no auto generated series. For now remove them, by manually editing schema.sql. For example, the statement @@ -38,6 +43,7 @@ create table x(id integer generated always as identity); ``` generates the following lines in the dump file: + ```sql CREATE TABLE public.x ( id integer NOT NULL @@ -56,23 +62,27 @@ ALTER TABLE public.x ALTER COLUMN id ADD GENERATED ALWAYS AS IDENTITY ( CACHE 1 ); ``` + In this case, remove the alter table statement. ### Dump the data out of PostgreSQL into CSV files + Connect with PostgreSQL via `psql`. Execute the following statement against Postgres for each table you want to export to CedarDB + ```sql \copy {tablename} TO 'your/path/{tablename}.csv' DELIMITER '|' CSV NULL ''; ``` + {{< callout type="info" >}} The `NULL` parameter specifies into which string null values should be serialized. If your data set contains empty strings, choose a different value. {{< /callout >}} ### Import data from your freshly dumped files into CedarDB -Now make sure that CedarDB is running. 
Either let it run in parallel to Postgres and use a different port, or shut down Postgres and then start CedarDB. -For the following commands, we assume CedarDB listens at port 5432. If you're using another port, please change the commands accordingly (via the argument ` -p {Portnumber}`). +Now make sure that CedarDB is running. Either let it run in parallel to Postgres and use a different port, or shut down Postgres and then start CedarDB. +For the following commands, we assume CedarDB listens at port 5432. If you're using another port, please change the commands accordingly (via the argument `-p {Portnumber}`). Import the schema you've exported from PostgreSQL and modified earlier @@ -82,12 +92,14 @@ psql -h localhost -U postgres < schema.sql If you get some error messages in the server log, that's okay for now, as long as your tables are created (`psql` answers with `CREATE TABLE`). -Connect with CedarDB via `psql`, e.g. via +Connect with CedarDB via `psql`, e.g. via + ```shell psql -h localhost -U postgres ``` Execute the following statement against CedarDB for each table you exported in the previous step + ```sql copy {tablename} from 'your/path/{tablename}.csv' DELIMITER '|' CSV NULL ''; ``` @@ -95,6 +107,7 @@ copy {tablename} from 'your/path/{tablename}.csv' DELIMITER '|' CSV NULL ''; Note that the path is relative to the **server**, i.e., if you run CedarDB inside the docker container, the location where the csv files resist must be mapped as docker volume. If you want the path to be relative to the **client**, precede the command with a backslash: + ```sql \copy {tablename} from 'your/path/{tablename}.csv' DELIMITER '|' CSV NULL ''; ``` @@ -102,6 +115,7 @@ If you want the path to be relative to the **client**, precede the command with Note that this incurs some network overhead as the data is sent via the PostgreSQL wire protocol over the psql connection. The csv import is currently single-threaded, as CedarDB has to correctly handle newlines and escapes. 
If you are sure that your strings don't contain newlines **and** don't contain the delimiter, you can instead import in text mode which is multi-threaded and thus **much** faster: + ```sql copy {tablename} from 'your/path/{tablename}.csv' with(format text, delimiter '|', null ''); ``` @@ -110,10 +124,6 @@ copy {tablename} from 'your/path/{tablename}.csv' with(format text, delimiter '| Multithreaded import does not yet work when using a backslash in front of copy (i.e. when importing relative to the client). {{< /callout >}} - {{% /steps %}} - - - Your data is now successfully imported into CedarDB! diff --git a/content/cookbook/pgbench.md b/content/cookbook/pgbench.md index 978cd538..5b66c651 100644 --- a/content/cookbook/pgbench.md +++ b/content/cookbook/pgbench.md @@ -6,6 +6,7 @@ weight: 100 `pgbench` is a benchmarking utility [included with PostgreSQL](https://www.postgresql.org/docs/current/pgbench.html), and widely available in package repositories: + ```shell sudo apt install postgresql-contrib ``` @@ -24,6 +25,7 @@ transaction. ## Running pgbench pgbench runs its workloads on a simple four table schema: + ```sql create table pgbench_history (tid int, bid int, aid int, delta int, mtime timestamp); create table pgbench_tellers (tid int primary key, bid int, tbalance int, filler char(84)); @@ -41,10 +43,12 @@ corresponds to 100k rows. A scale factor of 100 thus inserts 10M rows with about 200MB of data. In addition to the scale, you also need to specify the connection parameters, username and database name: + ```shell pgbench --initialize -h localhost -U postgres postgres --scale=100 ``` -``` + +```text dropping old tables... creating tables... generating data (client-side)... @@ -70,7 +74,8 @@ little load, but is mostly bound by the connection latency. ```shell pgbench -h localhost -U postgres postgres -T 10 --protocol=prepared --builtin=select ``` -``` + +```text pgbench (16.3 (Ubuntu 16.3-0ubuntu0.24.04.1)) starting vacuum...end. 
transaction type: @@ -93,7 +98,8 @@ picture that allows you to judge how the system scales in a read-heavy scenario: ```shell pgbench -h localhost -U postgres postgres -T 10 --protocol=prepared --builtin=select --jobs=20 --client=200 ``` -``` + +```text tps = 1183279.095676 (without initial connection time) ``` @@ -107,7 +113,8 @@ For typical consumer SSDs, this is >1ms, but enterprise SSDs can have lower writ ```shell pgbench -h localhost -U postgres postgres -T 10 --protocol=prepared --builtin=simple-update ``` -``` + +```text pgbench (16.3 (Ubuntu 16.3-0ubuntu0.24.04.1)) starting vacuum...end. transaction type: @@ -130,6 +137,7 @@ bound. ```shell pgbench -h localhost -U postgres postgres -T 10 --protocol=prepared --builtin=simple-update --jobs=20 --client=200 ``` -``` + +```text tps = 45882.003693 (without initial connection time) ``` diff --git a/content/cookbook/read_replica_tutorial.md b/content/cookbook/read_replica_tutorial.md index f02c9730..10abe4bf 100644 --- a/content/cookbook/read_replica_tutorial.md +++ b/content/cookbook/read_replica_tutorial.md @@ -4,31 +4,37 @@ linkTitle: "Setting Up Read Replication" weight: 30 draft: true --- -# WIP!! +> **Note:** This page is a work in progress. ## Setup + Before replicating to CedarDB, you have to make a few configurations to your PostgreSQL replication source. ### Set the publisher's log level + The replication source must use *logical* write ahead logging. Enable this by setting `wal_level = logical` in the `WRITE-AHEAD LOG` section of your `postgresql.conf`. This change requires a restart. - ### Enable md5 password encryption + Ensure that your replication source uses the legacy **md5** authentification mode. This can be easily checked by running the query + ```sql SHOW password_encryption; ``` + as the user you intend to use for replication. If the result is `md5` you do not need to do anything. 
Otherwise, change the password of your use to use md5 encryption like this: + ```sql SET password_encryption = 'md5'; ALTER USER with password ; SET password_encryption = 'scram-sha-256'; -- or whatever else it was before ``` + Finally, allow your user to authentificate from outside via `md5` by adding a line to your `pg_hba.conf`, e.g. like this: -``` +```text # TYPE DATABASE USER ADDRESS METHOD host replication md5 ``` @@ -37,14 +43,16 @@ host replication md5 Support for SASL authentification will come in a future CedarDB release. {{< /callout >}} - ### Ensure correct privileges are set + The replication user needs some privileges. You can find an overview in the official [PostgreSQL documentation](https://www.postgresql.org/docs/current/logical-replication-security.html). ## Set up publication at the replication source + Next, we ensure the source PostgreSQL is correctly configured to publish changes to CedarDB. ### Create the tables you want to replicate + If not already existing, create the table(s) you want to replicate *from* in your PostgreSQL system like this: ```sql @@ -52,7 +60,9 @@ create table foo(a integer, b integer); ``` ### Create a publication + For each table you want to replicate, you need to create a *Publication*: + ```sql create publication foo; ``` @@ -62,6 +72,7 @@ The publication interface allows for a rich set of customization on what to repl {{< /callout >}} ### Create a replication slot + PostgreSQL needs to create a *replication slot* to store the state of the replication to CedarDB. We need to create this slot. ```sql @@ -72,9 +83,8 @@ select * from pg_create_logical_replication_slot ('foo_slot', 'pgoutput'); PostgreSQL's `subscription` abstraction normally deals with replication slots. This abstraction is not yet supported by CedarDB. A future version of CedarDB will make manually dealing with replication slots obsolete. 
{{< /callout >}} - - ## Create your replication target + Within CedarDB create mirroring tables: ```sql diff --git a/content/cookbook/working_with_csv.md b/content/cookbook/working_with_csv.md index 85fa5038..143b6188 100644 --- a/content/cookbook/working_with_csv.md +++ b/content/cookbook/working_with_csv.md @@ -5,17 +5,18 @@ weight: 11 --- CedarDB allows you to import CSVs into database relations for permanent storage, harness them as temporary external sources for dynamic data manipulation, and effortlessly export query results to CSV files. - ## Importing Data from CSV sources + Importing CSV files into your database relations before querying allows you to make the most of CedarDB's query engine, as it allows CedarDB to scan data more efficiently and make use of collected statistics for query optimization. {{% steps %}} ### Create a table for each CSV file + Before copying your data into CedarDB, you first need to create a database relation with the schema of your data. To start with your own movie database, you might want to start with some movie information. +Connect with CedarDB via `psql`, e.g. via -Connect with CedarDB via `psql`, e.g. via ```shell psql -h localhost -U postgres ``` @@ -35,6 +36,7 @@ create table movies ( ### Copy data into CedarDB Execute the following statement against CedarDB to import your external movie database from csv. Feel free to get started with our example [movies.csv](https://cedardb.com/data/movies/movies.csv) to play around with. + ```sql copy movies from 'your/path/movies.csv' DELIMITER ',' CSV NULL '' HEADER; ``` @@ -44,6 +46,7 @@ The `header` option tells CedarDB to treat the first line as the column names an Note that the path is relative to the **server**, which might differ from the path, or even the system, from which you connect to CedarDB. 
If you want the path to be relative to the **client**, precede the command with a backslash: + ```sql \copy movies from 'your/path/movies.csv' DELIMITER ',' CSV NULL '' HEADER; ``` @@ -51,6 +54,7 @@ If you want the path to be relative to the **client**, precede the command with Note that this incurs some network overhead as the data is sent via the PostgreSQL wire protocol over the psql connection. The csv import is currently single-threaded, as CedarDB has to correctly handle newlines and escapes. If you are sure that your strings don't contain newlines **and** don't contain the delimiter, as is the case for our example dataset, you can instead import in text mode which is multi-threaded and thus **much** faster: + ```sql copy movies from 'your/path/movies.csv' with(format text, delimiter ',', null '', header); ``` @@ -60,6 +64,7 @@ Multithreaded import does not yet work when using a backslash in front of copy ( {{< /callout >}} ### Start working with your data + Once you have successfully copied your data into CedarDB, you can get to work. Modify or query your data however you like. For example, find a good and long fantasy movie for a rainy day: ```sql @@ -101,6 +106,7 @@ movieId,starId ``` ### Querying a CSV view + You can query external CSV files efficiently using the `csvview` function. Similar to the data import, you need to specify both the delimiter and the schema, this time as arguments of the function. You can read all data in the `starsIn.csv` like this: ```sql @@ -109,7 +115,6 @@ select * from csvview('your/path/starsIn.csv', 'delimiter ",", header', 'movieId The `header` option again tells CedarDB to treat the first line as the column names and ignore it as a data point. - ### Start working with your external CSV view To include these csv views in a query, it is best to include them as a common table expression. 
Finding any movie starring an actor born after 1970 can then be achieved like this: @@ -119,10 +124,11 @@ with starsIn as (select * from csvview('your/path/starsIn.csv', 'delimiter ",", stars as (select * from csvview('your/path/stars.csv', 'delimiter ",", header', 'id integer, name text, wikiLink text, gender char, birthdate date')) select movies.title, movies.year from movies, stars, starsIn where starsIn.starId = stars.id and starsIn.movieId = movies.id and extract(year from stars.birthdate) > 1970; ``` -{{% /steps %}} +{{% /steps %}} ## Writing query results to CSV + CedarDB not only allows you to read from CSV files, but write them as well. This allows you to export results of individual queries, or whole tables, with ease. You can create a separate CSV file for you movie collection containing only Thrillers with a simple `COPY` statement: @@ -131,6 +137,3 @@ copy (select * from movies where genre = 'Thriller') TO 'your/path/thrillers.csv ``` The `header` option tells CedarDB to include the column names in the CSV file as the first line. - - - diff --git a/content/database_upgrade.md b/content/database_upgrade.md index 4fc9cb66..2800483b 100644 --- a/content/database_upgrade.md +++ b/content/database_upgrade.md @@ -6,8 +6,8 @@ weight: 110 This page contains instructions for upgrading to a newer version of CedarDB that is not backward compatible with previous database formats. Please check this page whenever a new release is listed in the release notes as incompatible with previous versions. - ## Export your data + For now, upgrading the database format requires exporting all user data held in tables. {{% steps %}} @@ -33,10 +33,8 @@ Following, copy all your tables to CSV files by running the following command fo \copy {tablename} TO 'your/backup/path/{tablename}.csv' DELIMITER '|' CSV NULL ''; ``` - {{% /steps %}} - ## Upgrade CedarDB After you have exported all your data, you can upgrade CedarDB to the latest release. 
@@ -71,7 +69,7 @@ Please follow the [installation guide](../get_started) to install and start the Once the latest version of CedarDB is up and running, you can import your data back into it. -{{% steps %}} +{{% steps %}} ### Re-create your schema @@ -82,6 +80,7 @@ psql -h localhost -U {{username}} < schema.sql ``` ### Copy data from CSV files + Once your schema is created, copy the contents of all tables from the CSV backups you created earlier. Run for each table: diff --git a/content/example_datasets/_index.md b/content/example_datasets/_index.md index e5d53494..7724b45c 100644 --- a/content/example_datasets/_index.md +++ b/content/example_datasets/_index.md @@ -9,9 +9,9 @@ These example datasets cover various domains and come in different formats, allo Whether you're a developer looking to test CedarDB's performance with large datasets or a researcher exploring data analysis techniques, these example datasets provide valuable resources for getting started. You can use them to run queries, perform analytics, and evaluate CedarDB's suitability for your specific use case. Example datasets: - * [CH-benCHmark](./chbenchmark) - * [GloVe: Global Vectors for Word Representation](./glove) - * [Handelsregister](./handelsregister) - * [Join Order Benchmark](./job) - * [NASDAQ Level 3 Order Data](./nasdaq) +* [CH-benCHmark](./chbenchmark) +* [GloVe: Global Vectors for Word Representation](./glove) +* [Handelsregister](./handelsregister) +* [Join Order Benchmark](./job) +* [NASDAQ Level 3 Order Data](./nasdaq) diff --git a/content/example_datasets/chbenchmark.md b/content/example_datasets/chbenchmark.md index e422c44c..73af2bb2 100644 --- a/content/example_datasets/chbenchmark.md +++ b/content/example_datasets/chbenchmark.md @@ -6,15 +6,13 @@ weight: 20 The [CH-benCHmark](https://db.in.tum.de/research/projects/CHbenCHmark/?lang=en) bridges the gap between TPC-C, an OLTP (i.e., transactional), and TPC-H, an OLAP (i.e., analytical benchmark). 
In contrast to many other benchmarks for hybrid workloads, CH-benCHmark runs its analytical queries on the same tables that are updated by the transactional queries. -This especially stresses the database's transaction subsystem as it has to ensure that all queries see a consistent state of the heavily write-contested tables. - +This especially stresses the database's transaction subsystem as it has to ensure that all queries see a consistent state of the heavily write-contested tables. ## The Dataset The data set consists of all nine [TPC-C](https://www.tpc.org/tpcc/) tables, and three additional [TPC-H](https://www.tpc.org/tpch/) tables. It runs all 22 TPC-H queries in a slightly adapted form which uses the TPC-C base tables, ensuring analytical queries depend on the transactional updates of the TPC-C tables. - ## Executing the benchmark CMU's benchmarking tool [benchbase](https://github.com/cmu-db/benchbase/) comes with a CH-benCHmark configuration and is compatible to Postgres. diff --git a/content/example_datasets/glove.md b/content/example_datasets/glove.md index 06185484..108bdf48 100644 --- a/content/example_datasets/glove.md +++ b/content/example_datasets/glove.md @@ -11,9 +11,8 @@ This example uses the syntax of the `pgvector` PostgreSQL extension. CedarDB implements compatible vector support, so this example can run in both CedarDB as well as PostgreSQL. {{< /callout >}} - - ## The Dataset + The dataset comprises one table with two columns. Each row specifies a word and a vector representing that word in a 300 dimensional vector space. We can use this vector for word similarity search or finding interesting correlations. 
@@ -27,15 +26,16 @@ CREATE TABLE words ( Here is the row in the dataset for the word `cedar`: -``` +```text cedar -0.035741 0.30627 -0.89386 -0.42192 0.4423 -0.0031244 0.1343 -0.1627 -0.56503 0.55582 0.04976 -0.38961 -0.70721 -0.22251 -0.63599 0.010212 0.44991 1.4495 0.14731 0.49291 -0.43543 0.43853 0.89911 0.7473 -0.32095 -0.37141 0.011313 0.40663 -0.054914 0.052961 -0.089976 0.61442 -0.098188 -0.55887 0.17341 0.2009 0.36195 -0.028696 -0.61912 0.25283 -0.43368 0.52983 0.29688 -0.45046 0.86235 -0.033074 -0.40946 -0.88257 0.5405 0.31433 -0.66538 -0.40765 0.59338 0.15275 -0.03373 -0.58895 0.082962 0.19579 -0.33768 0.17269 -0.1885 0.28613 -0.0763 0.47297 -0.25998 0.43033 0.2628 -0.41632 -0.44285 0.34218 -0.23407 0.32939 -0.18196 0.36787 0.50732 0.62926 0.35413 0.07248 -0.7088 -0.48028 0.069412 0.1061 0.4844 0.41549 0.075002 0.087866 1.093 1.4178 -0.18223 0.19481 0.3665 -0.38657 0.3705 -0.067371 -0.15721 0.67263 0.60278 0.18825 -0.47069 0.23095 0.53747 0.15372 0.28769 -0.23418 -0.065959 -1.3184 0.3386 0.49832 0.23596 -0.84735 0.034094 0.89097 -0.039864 -0.18604 -0.44887 0.65578 0.49864 0.056556 -0.14284 0.21705 -0.31605 -0.080527 -0.26386 0.068591 -0.24204 0.0085045 0.12535 0.25822 -0.45192 -0.19591 -0.28525 -0.21406 -0.23933 -0.079567 0.077772 -0.044807 0.13742 -0.38121 0.51215 -0.15845 -1.1855 0.48977 -0.11569 0.071149 -0.21234 0.63803 -0.074817 0.12214 0.22618 -0.30874 0.3661 -0.45319 -0.46136 0.25993 0.20315 -0.14687 0.30222 -0.10477 0.24161 0.7081 0.158 0.22283 -0.57998 0.51195 0.095581 0.37133 0.038913 -0.10041 0.18371 0.12732 -0.078713 0.015901 -0.27671 0.82712 -0.55948 0.65985 0.27161 -0.056506 0.81918 -0.176 -0.10151 0.3601 0.43843 -0.019308 0.09502 0.21175 -0.66881 -0.42617 -0.033088 0.13867 -0.29438 -0.17065 -0.050052 0.046184 -0.46501 -0.28081 -0.055363 0.12984 0.24892 0.15695 -0.051954 -0.067292 -0.15835 -0.023483 0.34172 0.53221 -0.26182 0.28651 0.40593 -0.029766 -0.19969 0.80703 0.37087 -0.14587 0.26325 0.23282 0.33873 0.10298 0.29892 -0.27437 -0.75017 
0.51737 0.20513 0.52544 -0.13263 0.5456 -0.53637 0.68089 0.062844 0.63056 0.76018 -0.23215 -0.60312 0.045453 0.23291 -0.71336 0.31855 -0.98226 0.31373 0.49496 0.29236 -0.052623 -0.22314 -0.17556 -0.24841 -0.27599 0.28926 0.006082 0.47148 -0.55711 0.40644 -0.19782 0.10583 0.080815 0.074759 0.42763 0.66005 -0.5212 0.091959 0.090721 0.63784 -0.5445 0.3681 -0.18135 0.095805 -0.2006 -0.52705 -0.29647 -0.97121 0.57904 -0.26934 0.3796 0.22758 -0.32191 -0.43989 -0.6026 0.34945 0.42713 -0.15097 -0.020774 -0.43159 0.19217 -0.12373 -0.1276 0.086802 0.21242 0.54875 0.045418 0.016401 -0.17856 -0.098253 0.092168 -0.52934 -0.51203 -0.2586 -0.028755 0.40569 -0.54969 -0.20679 -0.28477 ``` {{% steps %}} ### Obtain the data + You can download the dataset from the [GloVe project website](https://nlp.stanford.edu/projects/glove/). -There are multiple versions with differing training sets and vectors of different dimensionalities. +There are multiple versions with differing training sets and vectors of different dimensionalities. Let's choose the biggest dataset "Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors". To obtain the data simply run @@ -94,6 +94,7 @@ if __name__ == '__main__': ### Nearest Neighbors Let's first find the 10 nearest neighbors for a given word: + ```sql select b.word, a.embedding <=> b.embedding as distance @@ -108,7 +109,7 @@ limit 10; This query calculates the distance of the vector of the word `cedar` with every other word's vector, sorts them by distance and then returns the 10 nearest words. The result seems reasonable: -``` +```text word | distance ----------+------------------- oak | 0.230610596571098 @@ -125,9 +126,8 @@ The result seems reasonable: ``` - {{< callout type="info" >}} -We're using the cosine distance (`<=>`) as metric. CedarDB also supports other distance metrics. For a full list, take a look at the [vector reference](). +We're using the cosine distance (`<=>`) as metric. CedarDB also supports other distance metrics. 
For a full list, take a look at the [vector reference](/docs/references/datatypes/vector). {{< /callout >}} {{< callout type="info" >}} @@ -135,9 +135,10 @@ Interestingly, the word most dissimilar to `cedar` is `Counterinsurgency` which {{< /callout >}} ### Linear Substructures + Since each word's vector representation has so many dimensions, we can look at similarities among multiple dimensions using vector arithmetic. -Take for example the words `cat`, `feline`, and `dog`. +Take for example the words `cat`, `feline`, and `dog`. A cat is a feline, but what would be the corresponding word for a dog? Let's ask CedarDB! @@ -153,10 +154,11 @@ where cat.word = 'cat' and dog.word = 'dog' order by (feline.embedding - cat.embedding) + dog.embedding <=> target.embedding limit 1; ``` + Take a look at the `order by` clause: It subtracts the "catness" from the word "feline", adds the "dogness" and finally searches for the word closest to the resulting vector. Let's look at the result: -``` +```text word -------- canine @@ -165,7 +167,6 @@ Let's look at the result: Bingo! 
- You can try to find more such relations with the following prepared statement: ```sql @@ -185,8 +186,9 @@ prepare deduce as ``` We recommend the following word triplets as a starting point: -```sql + +```sql execute deduce('Germany', 'Berlin', 'France'); execute deduce('dark', 'darker', 'soft'); execute deduce('sister', 'brother', 'niece'); -``` \ No newline at end of file +``` diff --git a/content/example_datasets/handelsregister.md b/content/example_datasets/handelsregister.md index cd6fadb3..4e9d7a24 100644 --- a/content/example_datasets/handelsregister.md +++ b/content/example_datasets/handelsregister.md @@ -18,20 +18,24 @@ Below, you can see an example of the data: ## Data Loading To load the data into CedarDB, you first need to download it locally: + ```sh curl -O https://daten.offeneregister.de/de_companies_ocdata.jsonl.bz2 bzip2 --decompress de_companies_ocdata.jsonl.bz2 ``` + {{< callout type="info" >}} The bzip2 compressed download is about 250 MB, which decompresses to about 4 GB. {{< /callout >}} You can query the JSON file directly: + ```sql select data from csvview('de_companies_ocdata.jsonl') d(data) limit 3; ``` Or load it into CedarDB: + ```sql create table register_json (data jsonb not null); copy register_json from 'de_companies_ocdata.jsonl'; @@ -40,7 +44,7 @@ copy register_json from 'de_companies_ocdata.jsonl'; ## Relational Schema A relational schema allows efficient queries on the data. 
-A simplified schema for the JSON data looks as follows: +A simplified schema for the JSON data looks as follows: ```sql create table companies ( @@ -62,6 +66,7 @@ create table officers ( ``` With a relational transformation, we can load the data into CedarDB: + ```sql insert into companies(company_number, current_status, jurisdiction_code, name, registered_address, retrieved_at) select distinct data->>'company_number', @@ -103,4 +108,3 @@ with marsalek as (select * from officers where name = 'Jan Marsalek' and city (select company_number from marsalek_l2)) select distinct name from marsalek_l3 order by name; ``` - diff --git a/content/example_datasets/job.md b/content/example_datasets/job.md index 06817758..8af9c9e9 100644 --- a/content/example_datasets/job.md +++ b/content/example_datasets/job.md @@ -13,6 +13,7 @@ This makes ordering joins in queries over this dataset challenging, hence the na assume uniform data distribution and instead has to rely on collected samples and statistics for join ordering. ## The Dataset + The dataset comprises a total of 21 tables extracted from IMDB, containing information about the movie industry, such as movies, studios, actors, and their connections, such as roles of actors in movies. The full schema with information on all tables is available as an SQL file [schema.sql](https://www.cedardb.com/data/job/schema.sql). @@ -33,6 +34,7 @@ CREATE TABLE cast_info ( {{% steps %}} ### Obtain the data + An excerpt of the IMDB dataset is available for non-commercial purposes through the [JOB paper](https://www.vldb.org/pvldb/vol9/p204-leis.pdf). To obtain the relevant data simply run @@ -88,6 +90,7 @@ For more information and alternative options to server-relatives paths for CSV i {{% /steps %}} ## The Query Workload + The queries of the Join Order Benchmarks were created, as the name already reveals, to contain challenging join order decisions for the optimizer. 
Queries in the join order benchmark, therefore, join at least 4 and up to 17 tables, with an average of 8 joins in a query. @@ -97,6 +100,7 @@ want to know about movies. {{% steps %}} ### Run the benchmark queries + All 113 JOB queries are available for [download](https://bonsai.cedardb.com/job/job.tgz). You can either run these queries manually one by one using the usual query interface. E.g., the first query, `1a`, tries to find movies in the top 250 that were not produced by Metro-Goldwyn-Mayer Pictures. @@ -122,6 +126,7 @@ Alternatively, you can also include the query directly from the SQL file from wi ``` ### Get started with your own queries + In addition, you can of course play around with the dataset on your own however you like. Collect information on your favorite movies, update potentially outdated information, or enrich the data with external sources. diff --git a/content/example_datasets/nasdaq.md b/content/example_datasets/nasdaq.md index 581f479f..a1521075 100644 --- a/content/example_datasets/nasdaq.md +++ b/content/example_datasets/nasdaq.md @@ -8,8 +8,6 @@ weight: 50 NASDAQ provides [dumps of real-time orders for some trading days](https://emi.nasdaq.com/ITCH/Nasdaq%20ITCH/) free of charge. This guide will show you how to parse it and import it into CedarDB for analysis. - - ## What is order data? High-frequency traders, quantitative analysts, and institutional investors depend on knowing the exact state of the market at any time to base their investment decisions on. 
@@ -44,18 +42,16 @@ The *market price* of AAPL is now $225.49 since this was the last price at which Issuer of order number 2 now notices nobody wants to buy at their price and reduces their ask: -| ID | Ticker | Timestamp | Side | Quantity | Price | PrevOrderId | -| -- | ------ | ----------- | ---- | -------- | ------ | ----------- | -| 1 | AAPL | 9:30:00:000 | BUY | 3 | 225.49 | null | -| 2 | AAPL | 9:30:00:010 | SELL | 2 | 225.52 | null | -| 3 | AAPL | 9:31:00:000 | SELL | 2 | **225.50** | **2** | +| ID | Ticker | Timestamp | Side | Quantity | Price | PrevOrderId | +|----------|-------------|--------------------|-------------|----------|---------------|-------------| +| 1 | AAPL | 9:30:00:000 | BUY | 3 | 225.49 | null | +| 2 | AAPL | 9:30:00:010 | SELL | 2 | 225.52 | null | +| 3 | AAPL | 9:31:00:000 | SELL | 2 | **225.50** | **2** | -As you can see, the updated order is a *new event* which references the old event it supersedes. +As you can see, the updated order is a *new event* which references the old event it supersedes. As orders are immutable, the system has to keep track of which orders are still active and which are replaced. - -Buy processing the order and execution streams, we can reconstruct the complete state of the exchange. - +By processing the order and execution streams, we can reconstruct the complete state of the exchange. ## Obtaining the data @@ -71,10 +67,9 @@ It downloads the dataset directly from [NASDAQ](https://emi.nasdaq.com/ITCH/Nasd There is data for multiple full trading days (approx. one a year). We chose January 30, 2020 since it was the most busy and recent trading day available. 
- The data comes in the [NASDAQ ITCH v5.0 protocol](https://www.nasdaqtrader.com/content/technicalsupport/specifications/dataproducts/NQTVITCHSpecification.pdf) format: -``` +```text $ hexdump -C -n 100 01302020.NASDAQ_ITCH50 00000000 00 0c 53 00 00 00 00 09 f6 49 c8 0c d3 4f 00 27 |..S......I...O.'| 00000010 52 00 01 00 00 0a 37 d4 c8 05 0b 41 20 20 20 20 |R.....7....A | @@ -86,7 +81,7 @@ $ hexdump -C -n 100 01302020.NASDAQ_ITCH50 00000064 ``` -We have written a [Python parser](https://github.com/cedardb/examples/blob/main/nasdaq/parser.py) to transform this into human readable CSV files. +We have written a [Python parser](https://github.com/cedardb/examples/blob/main/nasdaq/parser.py) to transform this into human-readable CSV files. It is automatically invoked by the `prepare.sh` script. {{< callout type="info" >}} @@ -198,19 +193,16 @@ This cuts the import time from ~3 minutes to 1 minute! {{< /callout >}} - - - ## Queries + Let's run some queries to gain some insight! ### What was the price of one Apple share at end of day? The canonical stock price is by definition the price of the last executed order, i.e. the price where the timestamp is the largest. -Since executions usually don't come with a price attached (except for some special cases), +Since executions usually don't come with a price attached (except for some special cases), we have to look up the price in the matching order if it is null. - ```sql select arg_max(coalesce(e.price, o.price), e.timestamp) as price from executions e, stocks s, orders o @@ -219,7 +211,7 @@ where e.stockid = s.stockid and s.name = 'AAPL'; ``` -``` +```text price ---------- 323.5800 @@ -228,7 +220,6 @@ where e.stockid = s.stockid Time: 11.413 ms ``` - ### How many new orders in a trading day? 
All orders that don't supersede another order are new: @@ -236,7 +227,8 @@ All orders that don't supersede another order are new: ```sql select count(*) as new from orders where prevOrder is null; ``` -``` + +```text new ----------- 181194793 @@ -257,8 +249,7 @@ with executions_per_order as ( select num as executions, count(*) from executions_per_order group by num order by num asc; ``` - -``` +```text executions | count ------------+--------- 1 | 5575247 @@ -304,7 +295,7 @@ where pe.stockid = s.stockid order by quantity * real_price desc limit 10; ``` -``` +```text ticker | quantity | price | total --------+----------+-----------+-------------- TSLA | 14549 | 647.0000 | 9413203.0000 @@ -322,7 +313,6 @@ order by quantity * real_price desc limit 10; Time: 716.949 ms ``` - ### Orders that took the longest to be executed Which market participants where the most patient? I.e., which order took the longest from being created to being executed without it being changed in between. @@ -343,8 +333,7 @@ from exec_distance order by minutes desc limit 10; ``` - -``` +```text ticker | side | price | orderquant | executedquant | minutes --------+------+---------+------------+---------------+--------- INO | SELL | 5.0400 | 1000 | 950 | 611 @@ -367,7 +356,6 @@ Time: 896.267 ms Let's take a look at how the trading activity changes over the trading day. We can use `R` with `ggplot` to generate a nice graph of all activity: - ```R {fileName=nasdaq.R} #!/usr/bin/Rscript #install.packages(c("RPostgres", "ggplot2")) diff --git a/content/get_started/_index.md b/content/get_started/_index.md index bdbab71e..3d863204 100644 --- a/content/get_started/_index.md +++ b/content/get_started/_index.md @@ -3,27 +3,22 @@ title: Get Started weight: 10 --- -This guide is designed to help you quickly get started with CedarDB, a SQL database system designed for modern data management needs. 
- +This guide is designed to help you quickly get started with CedarDB, a SQL database system designed for modern data management needs. ## Quickstart Get CedarDB up and running locally within seconds. The [Quick Start](./quickstart) page will guide you through the setup process and running your first queries. - ## Install CedarDB - [**Docker:**](./install_with_docker) This method offers several convenience features, including automated database upgrades and parameterized user initialization. If you're using macOS, Docker is required. - [**Standalone binary:**](./install_locally) Use this if you're in an environment without Docker or just want to experiment with the interactive SQL shell. - [**Operate in the cloud:**](./operate_in_cloud) Instance sizing guidelines showing you how to best run CedarDB in the cloud. - - {{< callout type="info" >}} After you've set up CedarDB, take a look at the [Data Cookbook](../cookbook) to find out how to accomplish typical data engineering tasks with CedarDB. {{< /callout >}} - ## CedarDB Enterprise -If you have obtained an enterprise license, refer to the [licensing page](../licensing) for a step-by-step guide on how to activate it. +If you have obtained an enterprise license, refer to the [licensing page](../licensing) for a step-by-step guide on how to activate it. diff --git a/content/get_started/install_locally.md b/content/get_started/install_locally.md index 7ec00e20..371ae4eb 100644 --- a/content/get_started/install_locally.md +++ b/content/get_started/install_locally.md @@ -11,8 +11,8 @@ This tutorial explains how to install and run the native CedarDB binary on your CedarDB is distributed as a standalone binary. It runs out of the box on any Linux distribution with **glibc >= 2.27** (released in 2018). 
- To automatically download and decompress the appropriate version, run: + ```shell curl https://get.cedardb.com | bash ``` @@ -22,6 +22,7 @@ By using CedarDB, you agree to our [Terms and Conditions]({{< relref "/licensing {{< /callout >}} CedarDB supports two modes of operation: + - **Server mode:** A PostgreSQL-compatible server for external clients. - **Interactive mode:** A SQL shell for direct, manual interaction with the database. @@ -39,10 +40,10 @@ By default, CedarDB runs in server mode: This starts the server and creates the `mydb` database directory if it doesn't already exist. - ### Connect via domain socket Initially, the server only accepts connections via local domain socket from the same OS user: + ```shell psql -h /tmp -U postgres ``` @@ -57,18 +58,22 @@ create database test; ### Enable remote connections To accept external connections (e.g., over TCP), run: + ```shell ./cedardb mydb --address=:: ``` Then connect using a PostgreSQL compatible client, e.g.: + ```shell psql -h localhost -U test psql -h cedardb.example.com -U test ``` ### Explore command line options + To see all available flags and options, run: + ```shell ./cedardb --help ``` @@ -77,16 +82,15 @@ To see all available flags and options, run: If you have obtained an enterprise license, refer to the [licensing page](../../licensing) for a step-by-step guide on how to activate it. {{< /callout >}} - - ## Run interactively Interactive mode launches a REPL-style SQL shell. You can use it to create, explore, and manipulate a database locally. - ### Create a new persistent database + To create and open a persistent database: + ```shell ./cedardb --interactive --createdb mydb ``` @@ -99,7 +103,6 @@ You can also specify an absolute or relative path: ./cedardb --interactive --createdb /opt/dbs/movies ``` - {{< callout type="info" >}} Ensure the database is stored on a reasonably fast SSD for optimal performance. 
{{< /callout >}} @@ -107,6 +110,7 @@ Ensure the database is stored on a reasonably fast SSD for optimal performance. ### Open an existing database If the database already exists, you can open it like this: + ```shell ./cedardb --interactive mydb ``` @@ -121,30 +125,34 @@ Without the flag, CedarDB will only open an existing database and fail if none i ### Create a temporary in-memory database To launch an ephemeral, in-memory database: + ```shell ./cedardb --interactive --inmemory ``` + This database exists only for the duration of the session and will be discarded upon exit. {{< callout type="info" >}} Since this database is held completely in-memory, working with large data sets can quickly exhaust system memory and cause OOM. {{< /callout >}} - ### Running SQL in the shell Once in the SQL shell, you can run standard SQL: + ```sql create table example(i int); insert into example values (42); ``` + The REPL supports: + - Common readline keyboard shortcuts (e.g., CTRL + R to search the history) - Backslash commands: + ```sql \i schema_definition.sql -- Run commands from a file \? -- View available commands ``` For more details, see the [SQL Reference](/docs/references/). - diff --git a/content/get_started/operate_in_cloud.md b/content/get_started/operate_in_cloud.md index 87290afa..6f008835 100644 --- a/content/get_started/operate_in_cloud.md +++ b/content/get_started/operate_in_cloud.md @@ -8,12 +8,10 @@ You can easily deploy CedarDB on your own AWS EC2 or GCP instances. ## Installation - Here's a quick setup example for running CedarDB in the cloud. We recommend using the latest **Ubuntu LTS** release (i.e., Ubuntu 24.04 as of writing). - {{< tabs >}} {{< tab name="Native" >}} @@ -32,7 +30,6 @@ We recommend using the latest **Ubuntu LTS** release (i.e., Ubuntu 24.04 as of w For more details, see the [local installation guide](../install_locally). 
- {{< /tab >}} {{< tab name="Docker" >}} @@ -64,24 +61,25 @@ When deploying CedarDB in the cloud, performance depends on three key resource d - **Main Memory:** CedarDB caches hot data and intermediate query results in RAM. For best performance, choose an instance with enough memory to fit your working set. - **CPU:** CedarDB scales seamlessly from a single core to hundreds. Analytical workloads benefit significantly from more CPU cores. - **Storage:** - * For **analytical workloads**, throughput is critical, especially for cold data not yet in memory. - * For **transactional workloads**, durability and write latency are key. + - For **analytical workloads**, throughput is critical, especially for cold data not yet in memory. + - For **transactional workloads**, durability and write latency are key. ### Recommended EC2 instance types As a starting point: -* Use the [`m7a`](https://instances.vantage.sh/aws/ec2/m7a.4xlarge) range of instances with the `m7a.4xlarge` as a good baseline for bigger workloads. -* Choose the compute-optimized [`c7a`](https://instances.vantage.sh/aws/ec2/c7a.4xlarge) family for compute-heavy workloads where RAM demand is lower. -* Use the memory-optimized [`r7a`](https://instances.vantage.sh/aws/ec2/r7a.4xlarge) family if you have a large working set but latency is not as big of a concern. -* Use a network-optimized [`c6in`](https://instances.vantage.sh/aws/ec2/c6in.8xlarge) or [`m6in`](https://instances.vantage.sh/aws/ec2/m6in.8xlarge) family if you store your data on S3 and process large amounts of data. + +- Use the [`m7a`](https://instances.vantage.sh/aws/ec2/m7a.4xlarge) range of instances with the `m7a.4xlarge` as a good baseline for bigger workloads. +- Choose the compute-optimized [`c7a`](https://instances.vantage.sh/aws/ec2/c7a.4xlarge) family for compute-heavy workloads where RAM demand is lower. 
+- Use the memory-optimized [`r7a`](https://instances.vantage.sh/aws/ec2/r7a.4xlarge) family if you have a large working set but latency is not as big of a concern. +- Use a network-optimized [`c6in`](https://instances.vantage.sh/aws/ec2/c6in.8xlarge) or [`m6in`](https://instances.vantage.sh/aws/ec2/m6in.8xlarge) family if you store your data on S3 and process large amounts of data. ### Recommended GCP instance types As a starting point: -* Use the [`c4-standard`](https://cloud.google.com/compute/docs/general-purpose-machines#c4-standard) range of instances with the `c4-standard-48` as a good baseline for medium workloads. -* Choose the compute-optimized [`c4-highcpu`](https://cloud.google.com/compute/docs/general-purpose-machines#c4-highcpu) family for compute-heavy workloads where RAM demand is lower. -* Use the memory-optimized [`c4-highmem`](https://cloud.google.com/compute/docs/general-purpose-machines#c4-highmem) family if you have a large working set. +- Use the [`c4-standard`](https://cloud.google.com/compute/docs/general-purpose-machines#c4-standard) range of instances with the `c4-standard-48` as a good baseline for medium workloads. +- Choose the compute-optimized [`c4-highcpu`](https://cloud.google.com/compute/docs/general-purpose-machines#c4-highcpu) family for compute-heavy workloads where RAM demand is lower. +- Use the memory-optimized [`c4-highmem`](https://cloud.google.com/compute/docs/general-purpose-machines#c4-highmem) family if you have a large working set. ## Storage guidelines @@ -89,11 +87,11 @@ For an overview of AWS storage types, see [the EBS volume types](https://aws.ama For an overview of GCP Compute Engine storage types, read the [durable block storage docs](https://cloud.google.com/compute/docs/disks). AWS recommendations by use case: + - **Analytical, read-heavy workloads:** Use `gp3` volumes. They are cost-efficient and sufficient when the working set fits into memory. 
- **High durability and transactional throughput:** Use `io2` volumes with enough provisioned IOPS to ensure consistent latency and reliability. - **Ephemeral storage for temporary workloads:** If you don't need persistence across instance shutdowns, instances with attached ephemeral NVMe SSDs offer fast, low-latency storage at a lower price. This is a good fit for: Batch workloads, temporary database instances, or situations where data is already backed up elsewhere. - {{% callout type="info" %}} Want to store your data on [AWS S3](../../references/advanced/s3/) or [Google Cloud Storage](../../references/advanced/gs/) instead for increased performance and much lower cost? [Sign up](https://console.cedardb.com) for our Enterprise trial license or [contact us](mailto:sales@cedardb.com)! diff --git a/content/get_started/quickstart.md b/content/get_started/quickstart.md index 2d6aa496..d3457373 100644 --- a/content/get_started/quickstart.md +++ b/content/get_started/quickstart.md @@ -9,7 +9,6 @@ It covers the essential setup steps and walks you through simple examples to beg ## Installation - To automatically download and decompress the appropriate CedarDB version, run: ```shell @@ -54,9 +53,11 @@ create table starsIn ( ``` ## Load data + Once the schema is in place, we can populate it using one of the following methods: ### Plain Inserts + Use standard [`INSERT`](../../references/dml/insert) statements: ```sql @@ -67,6 +68,7 @@ insert into movies values ``` ### Import from CSV + If your data is stored in a CSV file, like this: ```text {filename="stars.csv"} @@ -75,11 +77,15 @@ If your data is stored in a CSV file, like this: 3,Michelle Yeoh,https://en.wikipedia.org/wiki/Michelle_Yeoh,F,1962-08-06 4,Jürgen Prochnow,https://en.wikipedia.org/wiki/Jürgen_Prochnow,M,1941-06-10 ``` + You bulk import it: + ```sql copy stars from 'stars.csv' delimiter ','; ``` + ### Importing an SQL dump + To import a SQL dump file: ```sql {filename="dump.sql"} @@ -96,9 +102,11 @@ Run it 
directly from the shell: ``` ## Query your dataset + CedarDB supports standard SQL queries. Example: The following query returns the average movie length: + ```sql select avg(length) from movies; ``` @@ -127,24 +135,27 @@ Drama 83 years 10 mons 27 days ``` {{< callout type="info" >}} -**No tuning needed:** +**No tuning needed:** If you've worked with database systems before, you're probably familiar with techniques such as query decorrelation or schema denormalization to enhance query performance. -However, with CedarDB, these practices are unnecessary. CedarDB automatically handles query decorrelation, even with complex queries containing hundreds of joins. +However, with CedarDB, these practices are unnecessary. CedarDB automatically handles query decorrelation, even with complex queries containing hundreds of joins. This means you can focus on the essential aspect: the business logic driving your queries. {{< /callout >}} ## Modify data - ### Updates + Update existing rows using the [`UPDATE`](../../references/dml/update) statement. For example: + ```sql update stars set name = '杨紫琼' where name = 'Michelle Yeoh'; ``` ### Deletes + Delete rows using [`DELETE`](../../references/dml/delete). For instance, to remove movies not linked to any stars: + ```sql delete from movies m where not exists diff --git a/content/licensing.md b/content/licensing.md index 75717026..45ae0bd1 100644 --- a/content/licensing.md +++ b/content/licensing.md @@ -5,12 +5,11 @@ weight: 1000 CedarDB is available in three editions to suit different needs, from free usage to full enterprise: -| Type | Description | -|-------------------|---------------------------------------------------------------------------------------------------------------------------------------------| -| Community Edition | Free version of CedarDB with a 64 GiB database size limit and no enterprise features. Includes access to documentation and Slack for support. 
Find the license terms [here](https://cedardb.com/legal/agreements/community_tcs.pdf). | -| Enterprise | Full-featured CedarDB with all enterprise capabilities. License must be renewed as negotiated. Includes dedicated Enterprise support. | -| Enterprise Trial | Time-limited trial of CedarDB Enterprise with all features. Includes Community Edition support (documentation and Slack). | - +| Type | Description | +|-------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Community Edition | Free version of CedarDB with a 64 GiB database size limit and no enterprise features. Includes access to documentation and Slack for support. Find the [Community Edition license terms](https://cedardb.com/legal/agreements/community_tcs.pdf). | +| Enterprise | Full-featured CedarDB with all enterprise capabilities. License must be renewed as negotiated. Includes dedicated Enterprise support. | +| Enterprise Trial | Time-limited trial of CedarDB Enterprise with all features. Includes Community Edition support (documentation and Slack). | ## Obtain a license @@ -19,6 +18,7 @@ CedarDB is available in three editions to suit different needs, from free usage - To start an **Enterprise Trial**, visit the [self-service console](https://console.cedardb.com) for a 90-day trial license. ## Activate your license + After receiving your license key, activate it by setting the license.key option to the token provided. In the following, we show the concrete steps needed to configure the license setting. For more information on setting configuration options, see our [configuration reference](/docs/references/configuration). 
@@ -26,22 +26,27 @@ For more information on setting configuration options, see our [configuration re {{< tabs >}} {{< tab name="Configuration File (preferred)" >}} Add a line with your license key to the CedarDB configuration file. The server will automatically load it at startup. In this example, we will use the default configuration path, which is automatically loaded at CedarDB startup when no other config file is specified. + ```shell echo "\"license.key\" = \"\"" >> ~/.cedardb/config ``` + {{< /tab >}} {{< tab name="Environment Variable" >}} An alternative is to set the license in an environment variable. If you are running CedarDB directly on your host machine, then export your license key before starting CedarDB like this: + ```shell export LICENSE_KEY='' ./cedardb ``` If you use docker, then pass the license key as environment variable when starting CedarDB (e.g., via `docker run`): + ```Shell docker run --rm -p 5432:5432 -e CEDAR_PASSWORD=test -e LICENSE_KEY='' cedardb/cedardb ``` + {{< /tab >}} {{< /tabs >}} @@ -49,7 +54,7 @@ docker run --rm -p 5432:5432 -e CEDAR_PASSWORD=test -e LICENSE_KEY='' At startup, CedarDB logs the license activation: -``` +```text LOG: initializing license.key= from config file INFO: License registered to Customer Name, valid until 2025-08-20. INFO: You're running CEDARDB ENTERPRISE EDITION. @@ -61,7 +66,6 @@ You can see your license expiration date at the [CedarDB console](https://consol ![console license page](/images/license.png) - ## Renew your expired license - To renew your **Enterprise** license, [contact sales](mailto:sales@cedardb.com). @@ -71,20 +75,20 @@ You can see your license expiration date at the [CedarDB console](https://consol ## FAQs ### What happens if I exceed the Community Edition data size limits? + Your database will enter *read-only* mode. You won’t be able to insert, update, or delete data, but: - You can still query your data. 
-- You can export it using SQL statements like `COPY OUT` (to CSV) or tools like `pg_dump` (to SQL). - +- You can export it using SQL statements like `COPY OUT` (to CSV) or tools like `pg_dump` (to SQL). ### What happens when my Enterprise trial license expires? + Your database automatically reverts to the **Community Edition**: - Enterprise features are disabled. - If your database exceeds the 64 GiB limit, it will become read-only. - You retain all your data, even if it exceeds the Community Edition limit. - ### Can I chain multiple trials? No, you can only obtain and activate a single trial license per database. diff --git a/content/references/advanced/_index.md b/content/references/advanced/_index.md index cc7cc9f4..c3d09869 100644 --- a/content/references/advanced/_index.md +++ b/content/references/advanced/_index.md @@ -14,5 +14,3 @@ The following sub pages give an overview over the most important ones and explai * [pgvector compatibility](pgvector) * [Prepared statements](prepare) * [Tables stored on S3](s3) - - diff --git a/content/references/advanced/asof_join.md b/content/references/advanced/asof_join.md index 9ffa339e..ffcde938 100644 --- a/content/references/advanced/asof_join.md +++ b/content/references/advanced/asof_join.md @@ -69,7 +69,7 @@ where s.name = 'GME' limit 10; ``` -``` +```text stockid | orderid | side | market_price | orderprice | delta ---------+---------+------+--------------+------------+--------- 3405 | 3655250 | BUY | 5.8300 | 5.8400 | -0.0100 diff --git a/content/references/advanced/benchmarking.md b/content/references/advanced/benchmarking.md index cda7ff44..aada3cdc 100644 --- a/content/references/advanced/benchmarking.md +++ b/content/references/advanced/benchmarking.md @@ -12,45 +12,44 @@ To validate performance, it is important to run queries multiple times. To repeat a query, you can either repeat the execution (`e`), the compilation (`c`), or both (`a`). To change the repetition mode, simply query this command. 
-``` +```text \set repeatmode 'a' ``` The number of repetitions can be set with the following command. -``` +```text \set repeat 3 ``` -#### Timeout +### Timeout In the unlikely event of a long-running query, you may want to set a query time after which the query is terminated automatically. This can be accomplished with our timeout setting. This setting specifies the timeout in milliseconds, with 0 milliseconds disabling the timeout. -``` +```text \set timeout 1000 ``` - ## Performance statistics commands In interactive mode, you can enable timing of commands using: -``` + +```text \timing on ``` To record our performance statistics, you can create a CSV with our performance results. Just specify the output CSV with the following setting. -``` +```text \record path/to/perf.csv ``` The output of the queries can be redirected to files (or `/dev/null`). -``` +```text \o path/to/output ``` - diff --git a/content/references/advanced/createserver.md b/content/references/advanced/createserver.md index 75dc6a9f..f7112bd0 100644 --- a/content/references/advanced/createserver.md +++ b/content/references/advanced/createserver.md @@ -29,7 +29,8 @@ For setting up a S3 server CedarDB needs AWS IAM credentials. Please create an AWS IAM user that is allowed to access the S3 buckets. For example if you want to create a user for us-east-1 region, you can do the following steps to grant it full access to all S3 buckets. - 1. Goto https://us-east-1.console.aws.amazon.com/iam/home?region=us-east-1#/users + + 1. Goto <https://us-east-1.console.aws.amazon.com/iam/home?region=us-east-1#/users> 2. Create a new user and add it to the AmazonS3FullAccess group 3. Click on the user and go to the security credentials tab 4.
Create a new access key, which looks similar to AKIA5CBDR…, and note down the secret that is shown diff --git a/content/references/advanced/gs.md b/content/references/advanced/gs.md index c2be4a4d..ce67136e 100644 --- a/content/references/advanced/gs.md +++ b/content/references/advanced/gs.md @@ -10,7 +10,6 @@ CedarDB automatically compresses data to columnar format on the fly when appendi Behind the scenes, our unified storage system [Colibri](https://cedardb.com/blog/colibri/) differentiates between hot and cold data. For more technical details, you can also read up on our [blog](https://cedardb.com/blog/colibri/). - {{}} CedarDB still uses a regular (local) storage device for metadata and recently updated (hot) data. {{}} @@ -46,7 +45,8 @@ Please also provision a fast local disk (Persistent Disk SSD or Hyperdisk) with CedarDB's hot set and metadata live there; size PD to hit the IOPS/MB/s you need, or consider Hyperdisk for higher ceilings. To create such an instance, you can use the following gcloud cli command as a starting point. -``` + +```text gcloud compute instances create cedardb_cloud_storage \ --project=project \ --zone=europe-west1-b \ @@ -73,7 +73,6 @@ See GCS pricing for storage classes, operations, and network egress. Note that it is important to co-locate the storage bucket and the instance (same region) to avoid any network cost. Otherwise, expensive egress cost will be charged which may dominate the overall cost. - ## CREATE SERVER Script You can create the create server statement with the help of the following python script. diff --git a/content/references/advanced/parquet.md b/content/references/advanced/parquet.md index 1d3b32f0..90974122 100644 --- a/content/references/advanced/parquet.md +++ b/content/references/advanced/parquet.md @@ -12,8 +12,8 @@ While interactive querying of Parquet files is also possible, CedarDB is optimiz You should use parquet mainly to import data into CedarDB. 
{{}} - ## Read a Parquet file + ```sql -- Autodetect based on file suffix SELECT * FROM 'test.parquet'; @@ -35,17 +35,20 @@ SELECT * FROM parquet_view('test.parquet'); ## Import Parquet into CedarDB You can either load parquet data directly into a table: + ```sql CREATE TABLE test AS (SELECT * FROM 'test.parquet'); ``` Use `parquet_view` if the file name does not end in `.parquet` + ```sql CREATE TABLE test AS (SELECT * FROM parquet_view('test')); ``` Or you first create the table and insert afterward Copy data into an existing table via psql + ```sql CREATE TABLE test (a integer, b integer); @@ -57,6 +60,7 @@ COPY test from 'test.parquet' (format parquet); ``` You can also specify only some columns + ```sql -- Create a table with the columns you need CREATE TABLE onecol (c integer); @@ -68,6 +72,7 @@ INSERT INTO onecol (SELECT a FROM 'test.parquet') ## Inspect Parquet Metadata Print the parquet file layout + ```sql SELECT * FROM parquet_schema('test.parquet'); ``` @@ -82,6 +87,7 @@ SELECT * FROM parquet_schema('test.parquet'); ``` Print the parquet file meta data footer: + ```sql SELECT * FROM parquet_file('test.parquet'); ``` @@ -94,6 +100,7 @@ SELECT * FROM parquet_file('test.parquet'); ``` Print all contained row groups: + ```sql SELECT * FROM parquet_rowgroups('test.parquet'); ``` @@ -105,6 +112,7 @@ SELECT * FROM parquet_rowgroups('test.parquet'); ``` Print all contained column chunks: + ```sql SELECT * FROM parquet_colchunks('test.parquet'); ``` @@ -128,82 +136,87 @@ Thus, you should always prefer importing the columns you need into CedarDB over This page summarizes the available features supported by the CedarDB Parser. 
### Legend + - 🟢 **Supported** - 🟡 **Partially suported**: Details for partial support - 🔴 **Not yet supported** ### Physical Types -| Data Type | Support | -|:--------------------:|:-------:| -| BOOLEAN | 🟢 | -| INT32 | 🟢 | -| INT64 | 🟢 | -| INT961 | 🟢 | -| FLOAT | 🟢 | -| DOUBLE | 🟢 | -| BYTE_ARRAY | 🟢 | -| FIXED_LEN_BYTE_ARRAY | 🟡 (not for legacy string columns) | + +| Data Type | Support | +|:--------------------:|:----------------------------------:| +| BOOLEAN | 🟢 | +| INT32 | 🟢 | +| INT64 | 🟢 | +| INT961 | 🟢 | +| FLOAT | 🟢 | +| DOUBLE | 🟢 | +| BYTE_ARRAY | 🟢 | +| FIXED_LEN_BYTE_ARRAY | 🟡 (not for legacy string columns) | ### Logical Types -| Data Type | Support | -|:------------------------------:|:--------------------:| -| STRING | 🟢 | -| ENUM | 🟡 (parsed as text) | -| UUID | 🟢 | -| Int8,16,32,64 | 🟢 | -| UInt8,16,32,64 | 🟢 | -| DECIMAL (INT32) | 🟢 | -| DECIMAL (INT64) | 🟢 | -| DECIMAL (BYTE_ARRAY) | 🟢 | -| DECIMAL (FIXED_LEN_BYTE_ARRAY) | 🟢 | -| FLOAT16 | 🔴 | -| DATE | 🟢 | -| TIME (INT32) | 🟢 | -| TIME (INT64) | 🟢 | -| TIMESTAMP (INT64) | 🟢 | -| INTERVAL | 🔴 | + +| Data Type | Support | +|:------------------------------:|:---------------------:| +| STRING | 🟢 | +| ENUM | 🟡 (parsed as text) | +| UUID | 🟢 | +| Int8,16,32,64 | 🟢 | +| UInt8,16,32,64 | 🟢 | +| DECIMAL (INT32) | 🟢 | +| DECIMAL (INT64) | 🟢 | +| DECIMAL (BYTE_ARRAY) | 🟢 | +| DECIMAL (FIXED_LEN_BYTE_ARRAY) | 🟢 | +| FLOAT16 | 🔴 | +| DATE | 🟢 | +| TIME (INT32) | 🟢 | +| TIME (INT64) | 🟢 | +| TIMESTAMP (INT64) | 🟢 | +| INTERVAL | 🔴 | | JSON | 🟡 (use text instead) | -| BSON | 🔴 | -| VARIANT | 🔴 | -| GEOMETRY | 🔴 | -| GEOGRAPHY | 🔴 | -| LIST | 🔴 | -| MAP | 🔴 | -| UNKNOWN (always null) | 🟢 | +| BSON | 🔴 | +| VARIANT | 🔴 | +| GEOMETRY | 🔴 | +| GEOGRAPHY | 🔴 | +| LIST | 🔴 | +| MAP | 🔴 | +| UNKNOWN (always null) | 🟢 | ### Encodings -| Encoding | Support | -|:-----------------------:|:--------------------:| -| PLAIN | 🟢 | -| PLAIN_DICTIONARY | 🟢 | -| RLE_DICTIONARY | 🟢 | -| RLE | 🟢 | -| BIT_PACKED (deprecated) 
| 🔴 | -| DELTA_BINARY_PACKED | 🔴 | -| DELTA_LENGTH_BYTE_ARRAY | 🔴 | -| DELTA_BYTE_ARRAY | 🔴 | -| BYTE_STREAM_SPLIT | 🔴 | + +| Encoding | Support | +|:-----------------------:|:--------:| +| PLAIN | 🟢 | +| PLAIN_DICTIONARY | 🟢 | +| RLE_DICTIONARY | 🟢 | +| RLE | 🟢 | +| BIT_PACKED (deprecated) | 🔴 | +| DELTA_BINARY_PACKED | 🔴 | +| DELTA_LENGTH_BYTE_ARRAY | 🔴 | +| DELTA_BYTE_ARRAY | 🔴 | +| BYTE_STREAM_SPLIT | 🔴 | ### Compression Codecs -| Compression | Support | -|:---------------------:|:--------------------:| -| UNCOMPRESSED | 🟢 | -| BROTLI | 🔴 | -| GZIP | 🟢 | -| LZ4 (deprecated) | 🔴 | -| LZ4_RAW | 🟢 | -| LZO | 🔴 | -| SNAPPY | 🟢 | -| ZSTD | 🟢 | + +| Compression | Support | +|:---------------------:|:-------:| +| UNCOMPRESSED | 🟢 | +| BROTLI | 🔴 | +| GZIP | 🟢 | +| LZ4 (deprecated) | 🔴 | +| LZ4_RAW | 🟢 | +| LZO | 🔴 | +| SNAPPY | 🟢 | +| ZSTD | 🟢 | ### Enhanced Features -| Feature | Support | -|:---------------------:|:--------------------:| -| Selective Column Read | 🟢 | -| Row-Group Skip | 🔴 | -| DataPageHeaderV2 | 🟢 | -| Size Statistics | 🔴 | -| Page Index | 🔴 | -| Bloom Filter | 🔴 | -| Nested Encodings | 🔴 | +| Feature | Support | +|:---------------------:|:--------:| +| Selective Column Read | 🟢 | +| Row-Group Skip | 🔴 | +| DataPageHeaderV2 | 🟢 | +| Size Statistics | 🔴 | +| Page Index | 🔴 | +| Bloom Filter | 🔴 | +| Nested Encodings | 🔴 | diff --git a/content/references/advanced/pgvector.md b/content/references/advanced/pgvector.md index 2739c972..3f94ca78 100644 --- a/content/references/advanced/pgvector.md +++ b/content/references/advanced/pgvector.md @@ -19,6 +19,7 @@ accuracy. ## Creating Vectors You can write vectors as string literals like this: + ```sql -- Create a vector with three elements select '[1,1.5,2]'::vector; @@ -28,6 +29,7 @@ select '[1,1.5,2]'::vector(3); Vectors must have at least one element and must not contain infinity or NaN values. 
So, all of these examples are invalid: + ```sql -- Vector must contain at least one element select '[]'::vector; @@ -43,6 +45,7 @@ You can also create vectors by casting them from arrays. Casting to a vector with a fixed number of dimensions will fail if the array does not have the same number of elements. But you can cast between different vector types which will truncate the vector or append zeroes, if necessary. + ```sql -- Cast an array of doubles to a vector select cast('{1,2,3}'::double[] as vector); @@ -119,6 +122,7 @@ select vector_cmp('[1,2,3]'::vector, '[0,1,2]'::vector); Since vectors support basic arithmetic and comparison, you can also use them with aggregation functions such as `sum`, `avg`, `min`, and `max`: + ```sql create table my_vectors ( v vector not null ); select sum(v), avg(v), min(v), max(v) from my_vectors; @@ -170,7 +174,6 @@ Currently, CedarDB has no vector similarity index. This may reduce vector simila search performance when scanning multiple gigabytes of data. {{}} - ### Miscellaneous Functions CedarDB also supports some utility functions on vectors shown in the following diff --git a/content/references/advanced/prepare.md b/content/references/advanced/prepare.md index dea33e4b..50a4dbd9 100644 --- a/content/references/advanced/prepare.md +++ b/content/references/advanced/prepare.md @@ -5,12 +5,14 @@ weight: 10 Prepared statements allow you to declare an SQL statement *template* ahead of time once and execute it many times over later on. 
You can think of it like a function call in your favorite programming language: + ```sql prepare add as select $1::int + $2::int as sum; ``` You can then run your freshly prepared statement with any argument values: -``` + +```text execute add(4,5); sum @@ -20,7 +22,8 @@ execute add(4,5); ``` If you want to re-define a prepared statement, you first have to make CedarDB forget about your previous definition: -``` + +```text deallocate add; ``` @@ -28,17 +31,18 @@ deallocate add; Names of prepared statements are case-*insensitive*: `ADD`, `adD` and `add` refer to the same statement. {{< /callout >}} - - ## Why you might want prepare your statements + Prepared statements especially shine in two use cases: ### Increase security + When working with user-facing applications, queries to CedarDB usually involve user-submitted data, such as a user's name or their data of birth. If such data is not properly sanitized, the application is vulnerable to [SQL injections](https://en.wikipedia.org/wiki/SQL_injection). In a prepared statement, that user-supplied data stays confined to the arguments within a query and is not treated as a query itself. **Unsafe**: + ```sql username = 'alonso;drop table users;'; query = "select * from users where name =" + username; @@ -46,6 +50,7 @@ query = "select * from users where name =" + username; ``` **Safe, with prepared statements**: + ```sql prepare lookupuser as select * from users where name = $1; execute lookupuser('alonso;drop table users'); @@ -54,6 +59,7 @@ execute lookupuser('alonso;drop table users'); ``` ### Increase performance + SQL statements are usually executed not just once, but multiple times. When preparing a statement, CedarDB can do a lot of upfront work **once**, instead of each time when the query is executed. 
Assume, for example, we want to build a stock market monitoring app and have to keep track of thousands of incoming trades per second: @@ -78,6 +84,7 @@ EXECUTE newTrade(1715958691, 'BUY', 'AAPL', 3, 189.8700); ``` While the difference does not seem like much, it quickly adds up over millions of inserts. + ```sql > insert into trades values (1715958691, 'BUY', 'AAPL', 3, 189.8700); INFO: [s] execution: (0.000178s) compilation: (0.000313s) @@ -87,6 +94,7 @@ INFO: [s] execution: (0.000179s) compilation: (0.000032s) <--- just a tenth! ``` ## Prepared statements in your favorite client + You can of course use prepared statements via raw SQL instructions as shown above. {{< callout type="info" >}} @@ -101,9 +109,10 @@ For example, in [Python's `psycopg`](/docs/clients/python) you can use the `exec Have a look at the ["Clients" section](/docs/clients) to see how your favorite client can be used efficiently with prepared statements. - ## How CedarDB treats prepared statements + Whenever instructed to prepare a statement, CedarDB + 1. parses the query string 2. generates and optimizes an execution plan 3. generates highly optimized executable code for the plan @@ -114,6 +123,7 @@ This ensures that all subsequent queries are consistently executed with minimal To be able to completely compile the prepared statement, CedarDB enforces static types for all parameters. In many cases, CedarDB infers parameter types from the surrounding query, and an explicit type can be omitted. 
However, in some cases it is not possible to infer the data type, which results in an error: + ```sql prepare echo as select $1; ERROR: unable to infer the data type for parameter $1 diff --git a/content/references/advanced/s3.md b/content/references/advanced/s3.md index 0c8c13f1..6348c52a 100644 --- a/content/references/advanced/s3.md +++ b/content/references/advanced/s3.md @@ -10,7 +10,6 @@ CedarDB uploads the compressed data to S3 and downloads it on demand during quer Behind the scenes, our unified storage system [Colibri](https://cedardb.com/blog/colibri/) differentiates between hot and cold data. For more technical details, you can also read up on our [blog](https://cedardb.com/blog/colibri/). - {{}} CedarDB still uses a regular (local) storage device for metadata and recently updated (hot) data. {{}} @@ -39,7 +38,8 @@ For fast transactional throughput the EBS device should have enough IOPS and ban We recommend a `gp3` volume with at least `500 MB/s` bandwidth and `10k IOPS`, but additional cost occur for the higher IOPS and bandwidth (compared to the standard `gp3` volumes). To create such an instance, you can use the following aws cli command as a starting point. -``` + +```text aws ec2 run-instances \ --image-id ami-xxxxxxxxxxxxxxxxx \ # Replace with desired AMI --instance-type c6in.16xlarge \ # Network-Optimized c6in.16xlarge diff --git a/content/references/configuration.md b/content/references/configuration.md index 892db6f7..4b172355 100644 --- a/content/references/configuration.md +++ b/content/references/configuration.md @@ -17,10 +17,13 @@ All setting names are _case-insensitive_ and we support the following two ways t The recommended way to set option values is via a config file. By default, CedarDB looks for it at `~/.cedardb/config`. 
Each line in the file can either be a comment (starting with `#`) or a key-value pair of the form: + ```text "settingName" = "value" # optional inline comment ``` + Example: + ```shell cat ~/.cedardb/config # Comment only line @@ -31,19 +34,21 @@ cat ~/.cedardb/config ``` You can pass a custom path to the configuration file as a CLI option to `cedardb`, as shown in the following example: + ```shell cedardb --configFile=/your/path/cedardb_config ``` {{% callout type="info" %}} -Note that both the setting name and value must be *double-quoted*, even if the value is an integer. +Note that both the setting name and value must be _double-quoted_, even if the value is an integer. {{% /callout %}} ### Environment Variable You can also use environment variables for settings. Because dot (.) is not allowed in a variable name, you have to replace it with an underscore (_). The following example has the same effect as the previous one with the config file. -``` + +```text export VERBOSITY=debug5 export BUFFERSIZE=1G export WORKMEMSIZE=3G @@ -81,14 +86,15 @@ journalctl -u cedardb By default, CedarDB logs only a few messages -- primarily errors. You can adjust the verbosity using the setting: -| **Setting Name** | **Description** | **Possible Values** | **Default** | -|------------------|----------------------------------------|-------------------------------------------------------------------------------|-------------| -| `verbosity` | Sets the minimum level of log messages | debug5,...,debug1,info,notice,warning,error,log,fatal,panic | log | +| **Setting Name** | **Description** | **Possible Values** | **Default** | +|------------------|----------------------------------------|--------------------------------------------------------------|-------------| +| `verbosity` | Sets the minimum level of log messages | debug5,...,debug1,info,notice,warning,error,log,fatal,panic | log | For troubleshooting, it can be helpful to increase the verbosity to a higher level. 
This can have a noticeable impact on performance, and can generate lots of log messages. Especially in high-traffic instances, verbosity should be kept low. You can also change the verbosity at runtime through SQL: + ```SQL SET verbosity = 'debug1'; ``` @@ -101,15 +107,14 @@ while doing heavy query processing on big datasets, you may want to reduce this These settings must be set before starting CedarDB. -| **Setting Name** | **Description** | **Unit** | **Default** | -|---------------------|--------------------------------------------------|-----------------------------------------|-------------------------| -| `buffersize` | Buffer manager pool size | Size with unit suffix (5G, 256M, 1024K) | 45% of available memory | -| `workmemsize` | Amount of memory to be used before spooling disk | Same as above | 45% of available memory | - +| **Setting Name** | **Description** | **Unit** | **Default** | +|------------------|--------------------------------------------------|-----------------------------------------|-------------------------| +| `buffersize` | Buffer manager pool size | Size with unit suffix (5G, 256M, 1024K) | 45% of available memory | +| `workmemsize` | Amount of memory to be used before spooling disk | Same as above | 45% of available memory | ## Degree of parallelism -CedarDB also uses *all* threads of the system for best performance. +CedarDB also uses _all_ threads of the system for best performance. This is intended behaviour, but might generate high load on your machine. If you want to keep other applications responsive, consider starting CedarDB with `nice`. Alternatively, you can limit the number of threads CedarDB uses. @@ -118,11 +123,12 @@ parallelism of the system. This stems from our superior [morsel-driven](https://db.in.tum.de/~leis/papers/morsels.pdf) parallelization strategy. 
To change the parallelism, you can change the following setting: -| **Setting Name** | **Description** | **Unit** | **Default** | -|---------------------|--------------------------------|----------|-----------------------------------------| -| `parallel` | Number of threads CedarDB uses | Integer | #hardware threads (logical cores/vCPUs) | +| **Setting Name** | **Description** | **Unit** | **Default** | +|------------------|--------------------------------|----------|-----------------------------------------| +| `parallel` | Number of threads CedarDB uses | Integer | #hardware threads (logical cores/vCPUs) | ## License + Enterprise license are passed as a setting named `license.key` to CedarDB. We have detailed how to obtain one in the dedicated [licensing page](/docs/licensing). diff --git a/content/references/datatypes/array.md b/content/references/datatypes/array.md index 21e2cda2..76be9ea6 100644 --- a/content/references/datatypes/array.md +++ b/content/references/datatypes/array.md @@ -9,6 +9,7 @@ Arrays can have arbitrary underlying types, e.g, `int[]` or `text[]`, and an arb Similar to PostgreSQL, the length and dimensions of an array column do not need to be uniform. ## Usage Example + ```sql create table example ( numbers int[], @@ -20,7 +21,7 @@ insert into example select * from example; ``` -``` +```text numbers | strings ---------------+--------------- {1,2,3} | {a,b,c} @@ -36,7 +37,8 @@ Contrary to most programming languages, access to arrays is **1-indexed**: with data(a) as (values (array[1, 2, 3])) select a[1] as first from data; ``` -``` + +```text first ------- 1 @@ -44,11 +46,12 @@ select a[1] as first from data; ``` CedarDB supports arrays with a maximum of 4 GB of underlying data. -For most data types, this limits array to about one billion elements. +For most data types, this limits array to about one billion elements. 
```sql select array_fill('x', array[1000000000]); ``` -``` + +```text ERROR: string length overflow ``` diff --git a/content/references/datatypes/bit.md b/content/references/datatypes/bit.md index 2177b810..478e4cd6 100644 --- a/content/references/datatypes/bit.md +++ b/content/references/datatypes/bit.md @@ -9,6 +9,7 @@ CedarDB supports the standard SQL bit-string data types `bit(n)` and `bit varyin Bit strings can be specified with binary or hex digits: `b'1010' == x'a'`. ## Usage Example + ```sql create table example ( bits bit varying @@ -18,7 +19,7 @@ insert into example select * from example; ``` -``` +```text bits ---------------------------------- 00000000 diff --git a/content/references/datatypes/blob.md b/content/references/datatypes/blob.md index 7967995c..973ecca8 100644 --- a/content/references/datatypes/blob.md +++ b/content/references/datatypes/blob.md @@ -9,6 +9,7 @@ Binary blobs can store arbitrary data in opaque *binary* strings, which differen Input data can be specified as hex or PostgreSQL compatible [`bytea` escape format](https://www.postgresql.org/docs/current/datatype-binary.html). ## Usage Example + ```sql create table example ( data blob @@ -18,7 +19,7 @@ insert into example select * from example; ``` -``` +```text data ------------------ \xdeadbeef diff --git a/content/references/datatypes/boolean.md b/content/references/datatypes/boolean.md index eb15791f..20cff0ca 100644 --- a/content/references/datatypes/boolean.md +++ b/content/references/datatypes/boolean.md @@ -7,6 +7,7 @@ weight: 23 Booleans store logical truth values, and can be used to, e.g., store flags. ## Usage Example + ```sql create table example ( flag bool @@ -17,7 +18,7 @@ insert into example select * from example; ``` -``` +```text flag ------ t @@ -38,11 +39,12 @@ select * from example; Due to `null` values, boolean expressions in SQL have *ternary* logic, with sometimes unexpected results compared to other programming languages. 
However, this ternary logic is context-sensitive: -* In predicates (e.g., for `where` conditions), `null`s are considered `false` and filter tuples. + +* In predicates (e.g., for `where` conditions), `null`s are considered `false` and filter tuples. This is the intuitive behavior, that is used most often. * For value returning expressions (e.g., in a `select` result), boolean logic can result in `null`. -The underlying reason for this ternary logic is that `null` values are considered an *unknown* value, and e.g., the +The underlying reason for this ternary logic is that `null` values are considered an *unknown* value, and e.g., the expression `42 < null` results in a `null` value, since we don't know how an arbitrary value compares to 42. Equality comparisons between values follow the same rules, i.e., `x = null` will always result in another `null`, even when the value of `x` is `null` as well. @@ -59,7 +61,7 @@ select a.v as a, b.v as b, a and b as and, a or b as or from bools a, bools b; ``` -``` +```text a | b | and | or ---+---+-----+---- t | t | t | t diff --git a/content/references/datatypes/date.md b/content/references/datatypes/date.md index 0ade3f01..a64ac0b2 100644 --- a/content/references/datatypes/date.md +++ b/content/references/datatypes/date.md @@ -8,6 +8,7 @@ Date is a day-accurate type without time of day references in ISO 8601 `YYY CedarDB also accepts [PostgreSQL notation](https://www.postgresql.org/docs/current/datatype-datetime.html#DATATYPE-DATETIME-DATE-TABLE). ## Usage Example + ```sql create table example ( due_date date @@ -18,7 +19,7 @@ insert into example select due_date from example; ``` -``` +```text due_date ------------ 2000-01-01 @@ -44,7 +45,8 @@ In a session, you can change the `DateStyle` setting, which determines the parsi set DateStyle = 'DMY'; select '01/02/03'; ``` -``` + +```text ?column? ------------ 2003-02-01 @@ -56,12 +58,10 @@ select '01/02/03'; set DateStyle = 'MDY'; select '01/02/03'; ``` -``` + +```text ?column? 
------------ 2003-01-02 (1 row) ``` - - - diff --git a/content/references/datatypes/enums.md b/content/references/datatypes/enums.md index 86e84a00..d75cf7c9 100644 --- a/content/references/datatypes/enums.md +++ b/content/references/datatypes/enums.md @@ -3,12 +3,12 @@ title: "Reference: Enum Types" linkTitle: "Enums" --- -Just like in many programming languages, an enum types consist of a static, ordered set of labels defined by the user. -They combine the clarity of text with the compactness of numerics, when the inherent meaning of a value is more than should be encoded by a single number, but the number of possible values is relatively small. +Just like in many programming languages, enum types consist of a static, ordered set of labels defined by the user. +They combine the clarity of text with the compactness of numerics, when the inherent meaning of a value is more than should be encoded by a single number, but the number of possible values is relatively small. ## Creation of Enum types -Enum types can be created via the Create Type command. +Enum types can be created via the Create Type command. ```sql create type enum_name as enum (['label' [,...]]); @@ -32,18 +32,22 @@ insert into tasks values (1, 'major'), (2, 'minor'), (3, 'critical'), (4, 'major The enum labels are case sensitive, whereas the enum names are not. This does not work: + ```sql select 'mInOr'::importance; ``` -``` + +```text ERROR: invalid input value for enum importance: "mInOr" ``` This works: -```sql + +```sql select 'minor'::iMpOrTaNcE; ``` -``` + +```text enum importance --------------- minor @@ -53,13 +57,13 @@ minor Values of the same enum type are comparable. Their ordering corresponds the order in which they were listed at creation time. Values of different enum types are incomparable. Similarly, an enum cannot be compared with a builtin type. - In this example ID2 gets filtered out as its corresponding priority is too low in the enum ordering.
+ ```sql select id, priority from tasks where priority >= 'major'; ``` -``` +```text id | priority ------------- 1 | major @@ -70,9 +74,11 @@ id | priority ```sql select id from tasks where priority > 1; ``` -``` + +```text ERROR: cannot compare enum importance and integer ``` + ## Deletion of Enum types Enum types can be removed via the Drop Type command. @@ -80,20 +86,26 @@ Enum types can be removed via the Drop Type command. ```sql drop type [if exists] name; ``` + The deletion of an enum type is not possible, when any other object still depends on it. Trying to do so regardless results in an error. ## Alter Enum types ### Add new label + A new label can be added to an existing enum via + ```sql alter type enum_name add value [if not exists] added_enum_label; ``` + The newly inserted label is the new maximum in this enum type. Inserting a new label at another location is currently not supported. If the label is already present, the insertion fails with an error. Specifying "if not exists" suppresses this error. ### Change ownership + The owner of an enum can be changed via + ```sql alter type enum_name owner to new_owner; ``` diff --git a/content/references/datatypes/float.md b/content/references/datatypes/float.md index a17c9901..405c9d12 100644 --- a/content/references/datatypes/float.md +++ b/content/references/datatypes/float.md @@ -27,7 +27,7 @@ insert into constants select * from constants; ``` -``` +```text name | value ----------+-------------- pi | 3.14159 @@ -44,7 +44,7 @@ If you need IEEE 754 *special values*, you need to enter them with explicit select real 'nan', real 'inf', real '-0'; ``` -``` +```text ?column? | ?column? | ?column? 
----------+----------+---------- NaN | Infinity | -0 @@ -74,7 +74,7 @@ with x(i) as ( select sum(i::double precision) from x; ``` -``` +```text sum ---------------------- 5.55111512312578e-17 @@ -88,7 +88,7 @@ The result also is not stable, i.e., can change indeterministically when repeate CedarDB executes queries in parallel and thus cannot guarantee the order in which the numbers are added. For the above query, an equally valid result would be: -``` +```text sum ---------------------- 2.77555756156289e-17 @@ -105,7 +105,7 @@ The following example shows the behavior of CedarDB where PostgreSQL would give select 1/0::float, 0/0::float, pow(-10, 999); ``` -``` +```text ?column? | ?column? | pow ----------+----------+----------- Infinity | NaN | -Infinity diff --git a/content/references/datatypes/integer.md b/content/references/datatypes/integer.md index 05c47262..c3b312c6 100644 --- a/content/references/datatypes/integer.md +++ b/content/references/datatypes/integer.md @@ -7,11 +7,13 @@ weight: 10 Integers are whole numbers that are typically used to represent counters or identifiers. CedarDB supports three different widths of integers: -* A two-byte `smallint`, -* a four-byte `integer`, -* and an eight-byte `bigint`. + +* A two-byte `smallint`, +* a four-byte `integer`, +* and an eight-byte `bigint`. ## Usage Example + ```sql create table example ( id integer primary key @@ -20,7 +22,7 @@ insert into example select i from generate_series(1, 3) g(i); select id from example; ``` -``` +```text id ---- 1 @@ -42,6 +44,7 @@ Operations on integers are range checked, so that e.g., numeric overflows will n To avoid overflows, it might be necessary to cast to a type that can represent a larger range. ### Handling Overflows + ```sql create table integers(i) as values (power(2, 29)::int), @@ -49,20 +52,26 @@ values (power(2, 29)::int), ``` The following will produce an overflow, since $2^{30} + 2^{30} > 2^{31}-1$. 
+ ```sql select i + i from integers; ``` -``` + +```text ERROR: numeric overflow ``` + You can handle such overflows in multiple ways: #### Try + Wrapping the operation in a [`try()`](/docs/references/expressions/try/) produces a `null` value for overflows: + ```sql select try(i + i) from integers; ``` -``` + +```text try ------------ 1073741824 @@ -71,16 +80,17 @@ select try(i + i) from integers; ``` #### Casting + Casting to `bigint` increases the value range and produces the correct result without an exception: ```sql select i::bigint + i from integers; ``` -``` + +```text ?column? ------------ 1073741824 2147483648 (2 rows) ``` - diff --git a/content/references/datatypes/interval.md b/content/references/datatypes/interval.md index 2c476ca7..27972c2b 100644 --- a/content/references/datatypes/interval.md +++ b/content/references/datatypes/interval.md @@ -21,7 +21,7 @@ insert into example select * from example; ``` -``` +```text duration ---------------- 90 00:00:00 @@ -45,7 +45,7 @@ using CedarDBs calendar for calculations. select date '2024-05-31' + interval '1' month, date '2024-05-31' + interval '2' month; ``` -``` +```text ?column? | ?column? ---------------------+--------------------- 2024-06-30 00:00:00 | 2024-07-31 00:00:00 @@ -56,10 +56,9 @@ select date '2024-05-31' + interval '1' month, date '2024-05-31' + interval '2' select date '2024-02-28' + interval '2' day; ``` -``` +```text ?column? --------------------- 2024-03-01 00:00:00 (1 row) ``` - diff --git a/content/references/datatypes/json.md b/content/references/datatypes/json.md index f2dd572c..f1a2f459 100644 --- a/content/references/datatypes/json.md +++ b/content/references/datatypes/json.md @@ -18,7 +18,7 @@ insert into example select doc, doc->'x' from example; ``` -``` +```text doc | ?column? 
---------+---------- {"x":1} | 1 @@ -35,7 +35,7 @@ It guarantees round-trip safety of an arbitrary JSON input (i.e., storing and th This means, that CedarDB returns a semantically equivalent representation, that may syntactically differ from the input. For example, JSONB is allowed to produce the following two semantically equivalent output objects. -``` +```text doc --------- {"x": 1, "y": "10.00"} @@ -46,6 +46,7 @@ For example, JSONB is allowed to produce the following two semantically equivale {"y": "10.00", "x": 1} (1 row) ``` + Because JSONB can be processed much faster, applications that do not require an explicit order of elements within documents, should use JSONB. ### Improving access performance diff --git a/content/references/datatypes/numeric.md b/content/references/datatypes/numeric.md index 8f9c4492..d37084d1 100644 --- a/content/references/datatypes/numeric.md +++ b/content/references/datatypes/numeric.md @@ -9,11 +9,12 @@ Numerics are numbers that are typically used to represent counters or identifier They are useful when exact precision is needed and rounding errors need to be exact, e.g., when storing monetary amounts. Numeric types offer a fixed amount of decimal *precision*, and a fixed *scale* of fractional digits. Decimal *precision* is the total count of significant digits in the number to both sides of the decimal point. The *scale* is the count of -decimal digits in the fractional part of the number. CedarDB supports two different storage widths, an eight-byte -`numeric`, and a sixteen-byte `bignumeric`. Type specifications can use both names, as well as `decimal(precision, scale)`, +decimal digits in the fractional part of the number. CedarDB supports two different storage widths, an eight-byte +`numeric`, and a sixteen-byte `bignumeric`. Type specifications can use both names, as well as `decimal(precision, scale)`, interchangeably. CedarDB will choose the underlying representation automatically based on the specified precision. 
## Usage Example + ```sql create table example ( price numeric(38, 3), @@ -25,7 +26,7 @@ insert into example values select * from example; ``` -``` +```text price | tax_rate ----------+---------- 123.450 | 0.19 @@ -47,7 +48,6 @@ Operations on 16 Byte types are expensive to compute. We recommend using a precision of 18 or less when possible for your application. {{< /callout >}} - Storing values outside of the supported ranges will result in an overflow exception. Operations on numerics are range checked, so that e.g., numeric overflows will never cause wrong results. @@ -67,7 +67,6 @@ The table below illustrates how CedarDB selects the precision and scale of the r | n1 % n2 | min(p1 - s1, p2 - s2) + max(s1, s2) | max(s1, s2) | | Other operations (such as UNION, CASE, etc.) | max(s1, s2) + max(p1 - s1, p2 - s2) | max(s1, s2) | - As numerics in CedarDB have a maximum precision of 38, the resulting precisions and scales can sometimes exceed the system limits. In these cases, CedarDB adapts the scale and precision through a set of rules. ### Rules for All Operations Except Multiplication and Division @@ -84,24 +83,29 @@ For example, if the resulting precision were 42 and the scale were 6, the precis ## Handling Overflows Example: + ```sql create table numerics(i) as values (power(2, 126)::numeric(38,0)); ``` The following will produce an overflow, since $2^{126} + 2^{126} > 2^{127}-1$. 
+ ```sql select i + i from integers; ``` -``` + +```text ERROR: numeric overflow ``` Wrapping the operation in a [`try()`](/docs/references/expressions/try/) produces a `null` value for overflows: + ```sql select try(i + i) from numerics; ``` -``` + +```text try ------------ <---- null @@ -110,7 +114,7 @@ select try(i + i) from numerics; ## PostgreSQL Compatibility -PostgreSQL offers a maximum precision of 131072 and scale of 16383, where CedarDB restricts precision and scale to a +PostgreSQL offers a maximum precision of 131072 and scale of 16383, where CedarDB restricts precision and scale to a maximum of 38, for performance reasons. Additionally, PostgreSQL allows `NaN`, `+Infinity`, and `-Infinity` as special numeric values. @@ -118,4 +122,4 @@ Since all operations on numerics are bounds-checked, these values cannot occur d However, PostgreSQL still allows entering them directly. CedarDB forbids entering these values as numeric data types. -See [Float](/docs/references/datatypes/float) for data types supporting those special values. +See [Float](/docs/references/datatypes/float) for data types supporting those special values. diff --git a/content/references/datatypes/range.md b/content/references/datatypes/range.md index a4f982d6..bee7ce6e 100644 --- a/content/references/datatypes/range.md +++ b/content/references/datatypes/range.md @@ -6,6 +6,7 @@ linkTitle: "Ranges" The range types are a convenient way to define a range of values between two bounds. 
CedarDB supports ranges with the following bound types: + - `int4range` (range of [`int`](../integer.md)) - `int8range` (range of [`bigint`](../integer.md)) - `numrange` (range of [`bignumeric(38,6)`](../numeric.md)) @@ -35,7 +36,7 @@ FROM trees WHERE lower(height_range) >= 20; ``` -``` +```text species | height_range ---------+-------------- Oak | [30,40) @@ -49,7 +50,7 @@ FROM trees WHERE height_range @> 20; ``` -``` +```text species | height_range ---------+-------------- Cedar | [15,40) @@ -79,7 +80,7 @@ INSERT INTO trees VALUES SELECT * FROM trees; ``` -``` +```text species | height_range ----------+-------------- Cedar | [15,40) @@ -96,6 +97,6 @@ Appletree | [2,13) -- The upper bound has been canonicalized In PostgreSQL, canonicalization is only applied to `int4range`, `int8range` and `daterange`. In CedarDB, this is possible for all range types. -CedarDB restricts the precision and scale of [`numerics`](../numeric.md) for performance reason. +CedarDB restricts the precision and scale of [`numerics`](../numeric.md) for performance reasons. As the `numeric` datatype is used for the bound values of `numranges`, the restrictions apply here as well. -In this case, CedarDB stores the bounds as `bignumeric(38,6)`. \ No newline at end of file +In this case, CedarDB stores the bounds as `bignumeric(38,6)`. diff --git a/content/references/datatypes/text.md b/content/references/datatypes/text.md index ce550e9f..af624055 100644 --- a/content/references/datatypes/text.md +++ b/content/references/datatypes/text.md @@ -6,10 +6,11 @@ weight: 12 CedarDB's `text` data type stores string data. It considers all strings as Unicode in UTF-8 encoding. -In addition to the unconstrained `text` data type, CedarDB support standard SQL blank-padded +In addition to the unconstrained `text` data type, CedarDB supports standard SQL blank-padded `char(length)`, and length constrained `varchar(length)` types. 
## Usage Example + ```sql create table example ( gender char(1), @@ -21,7 +22,7 @@ insert into example select * from example; ``` -``` +```text gender | description --------+------------- ⚧ | UwUUwU... @@ -37,7 +38,7 @@ CedarDB specifies text length in *Unicode code points*: select length('🍍'), char_length('🍍'), octet_length('🍍'); ``` -``` +```text length | char_length | octet_length --------+-------------+-------------- 1 | 1 | 4 @@ -53,6 +54,7 @@ arbitrary binary data in string columns without additional encoding. For such data, consider using `bytea`. ## Performance Considerations + Text and length-constrained string data types are handled equivalently. Strings with explicit length do not provide performance or storage benefits. Thus, we generally recommend against length-constraining string columns. @@ -72,6 +74,7 @@ Collates can be specified as [Unicode CLDR locale identifiers](https://unicode.o with the additional tags `_ci` for case-insensitivity, and `_ai` for accent-insensitivity. For example, a case-insensitive collate can be useful for text comparison: + ```sql with strings(a, b) as ( values ('foo', 'FOO') @@ -80,7 +83,7 @@ select a, b, a = b, a collate "en_US_ci" = b from strings; ``` -``` +```text a | b | ?column? | ?column? -----+-----+----------+---------- foo | FOO | f | t @@ -88,22 +91,25 @@ from strings; ``` ### Non-deterministic Results -Be aware that queries using collates can lead to unexpected results, when values *look* different, but + +Be aware that queries using collates can lead to unexpected results, when values *look* different, but are considered equivalent according to the specified collate! For example, for the following query, both, the lowercase and the uppercase result are equally valid: + ```sql with strings(s) as (values ('foo'), ('FOO')) select distinct s collate "en_US_ci" from strings; ``` -``` +```text ?column? ---------- foo (1 row) ``` -``` + +```text ?column? 
---------- FOO @@ -111,6 +117,7 @@ from strings; ``` You can achieve a deterministic result by rewriting the query to output a `min()` aggregate in `binary` collate. + ```sql select min(s collate "binary") from strings @@ -118,6 +125,7 @@ group by s collate "en_US_ci"; ``` ### Choose the Right Locale + The expected ordering of diacritics can depend on the specified collate. French Candians, for example, seem to have a specific preference about the lexicographical order of diacritics: ```sql @@ -130,7 +138,7 @@ with strings(s) as ( select s from strings order by s; ``` -``` +```text s ------ cote @@ -140,12 +148,11 @@ select s from strings order by s; (4 rows) ``` - ```sql select s from strings order by s collate "fr_CA"; ``` -``` +```text s ------ cote @@ -154,4 +161,3 @@ select s from strings order by s collate "fr_CA"; côté (4 rows) ``` - diff --git a/content/references/datatypes/time.md b/content/references/datatypes/time.md index 2adc04f1..ebb4edbe 100644 --- a/content/references/datatypes/time.md +++ b/content/references/datatypes/time.md @@ -19,7 +19,7 @@ insert into example select meeting_time from example; ``` -``` +```text meeting_time -------------- 08:00:00 @@ -36,7 +36,7 @@ Calculations with times automatically wrap around following the 24-hour clock. select time '11:00 pm' + interval '8' hour; ``` -``` +```text ?column? 
---------- 07:00:00 diff --git a/content/references/datatypes/timestamp.md b/content/references/datatypes/timestamp.md index 785b84e3..9f236934 100644 --- a/content/references/datatypes/timestamp.md +++ b/content/references/datatypes/timestamp.md @@ -20,7 +20,7 @@ insert into example select raw, local from example; ``` -``` +```text raw | local ----------------------------+------------------------------- 2024-05-24 11:42:36.470585 | 2024-05-24 11:42:36.470585+02 @@ -28,7 +28,6 @@ select raw, local from example; (2 rows) ``` - ## Working with Time Zones Timestamps with time zones are printed in *local time*, not the time zone they were initially entered in, as demonstrated in the usage example above: diff --git a/content/references/datatypes/uuid.md b/content/references/datatypes/uuid.md index 5cea73b4..b70faf8d 100644 --- a/content/references/datatypes/uuid.md +++ b/content/references/datatypes/uuid.md @@ -5,19 +5,23 @@ weight: 24 --- UUIDs are universally unique identifiers that can be used as synthetic keys in tables. -Each UUID is a 128 bit value, that can be generated via the commands ```gen_random_uuid``` or ```uuidv7```. The former will utilize a generation algorithm as specified in [RFC 4122](https://datatracker.ietf.org/doc/html/rfc4122), +Each UUID is a 128 bit value, that can be generated via the commands ```gen_random_uuid``` or ```uuidv7```. The former will utilize a generation algorithm as specified in [RFC 4122](https://datatracker.ietf.org/doc/html/rfc4122), whereas the latter also utilizes timestamps in the generation process (see [RFC 9562](https://datatracker.ietf.org/doc/html/rfc9562#name-example-of-a-uuidv7-value) for more details). Both generation algorithms aim at having a high probability of generating no collisions. {{< callout type="info" >}} As an alternative to UUIDs, consider using auto-incrementing integer IDs smaller than 16 Bytes. 
E.g.: + ```sql create table foo(id integer generated always as identity) ``` + {{< /callout >}} ## Usage Example -### UUIDv4: + +### UUIDv4 + ```sql create table example ( id uuid default gen_random_uuid() @@ -26,7 +30,7 @@ insert into example select from generate_series(1, 3); select id from example; ``` -``` +```text id -------------------------------------- 32cee028-940a-42d8-a2ed-1a6ab8d7b5cc @@ -35,8 +39,7 @@ select id from example; (3 rows) ``` - -### UUIDv7: +### UUIDv7 ```sql create table example2 ( @@ -50,7 +53,7 @@ insert into example2 select uuidv7('-1 hour'); select id from example2; ``` -``` +```text id -------------------------------------- bcd9f4e7-efab-753a-af7c-f7e0c1efb458 @@ -62,6 +65,7 @@ bcd9f4b2-4ac9-71bf-b20a-0f438c5702eb UUIDs are stored as 16 Bytes, but always displayed as 32 standard hexadecimal characters. ## Input + UUIDs are case and hyphen insensitive and can be surrounded by braces: ```sql @@ -76,6 +80,7 @@ select uuid '{a0eebc99-9c0b4ef8-bb6d6bb9-bd380a11}'; ## UUID Extraction Functions ### uuid_extract_version + Provided with a valid UUID, `uuid_extract_version` extracts the version in a `smallint`. Otherwise the function returns `NULL`. ```sql @@ -93,6 +98,7 @@ NULL ``` ### uuid_extract_timestamp + `uuid_extract_timestamp` extracts the timestamp with time zone of a uuid of version 1 or 7. Otherwise, the function returns `NULL`. 
```sql @@ -106,5 +112,3 @@ SELECT uuid_extract_timestamp('017F22E2-79B0-7CC3-98C4-DC0C0C07398F'::uuid); --v ---- 2022-02-22 20:22:22+01 ``` - - diff --git a/content/references/datatypes/vector.md b/content/references/datatypes/vector.md index 64fb60f7..ddee5155 100644 --- a/content/references/datatypes/vector.md +++ b/content/references/datatypes/vector.md @@ -22,7 +22,7 @@ insert into example select * from example; ``` -``` +```text word | embedding ------+----------- cat | [1,2,3] diff --git a/content/references/dml/copy.md b/content/references/dml/copy.md index 24e852e5..8e467408 100644 --- a/content/references/dml/copy.md +++ b/content/references/dml/copy.md @@ -48,6 +48,7 @@ CedarDB also supports a best-effort import mode: With this option, rows containing mismatching elements will be skipped on a best-effort basis. A query that uses all available copy options could look like this: + ```sql COPY target_table (column1, column2, column3) FROM '/absolute/path/to/your_file.csv' diff --git a/content/references/dml/delete.md b/content/references/dml/delete.md index 59a3d2e2..4ced6197 100644 --- a/content/references/dml/delete.md +++ b/content/references/dml/delete.md @@ -36,6 +36,7 @@ returning *; Scans in the returning clause read the database after the delete is completed. That is why the exists statement in the returning clause does not match the deleted row with itself in the following example. 
+ ```sql delete from users where users.user_id = 42 diff --git a/content/references/dml/update.md b/content/references/dml/update.md index 79862e64..8a6fc708 100644 --- a/content/references/dml/update.md +++ b/content/references/dml/update.md @@ -46,7 +46,7 @@ returning m.name, m.gross_opening_week; Concurrent updates to rows might cause serialization failures, which show up in the form of: -``` +```text ERROR: conflict with concurrent transaction ``` diff --git a/content/references/dml/upsert.md b/content/references/dml/upsert.md index f34ffdf8..b0976d88 100644 --- a/content/references/dml/upsert.md +++ b/content/references/dml/upsert.md @@ -7,7 +7,7 @@ Upserts allow [inserting](../insert) values into tables with a custom [update](. conflicting existing values. This allows, e.g., idempotent inserts, or to keep track of stateful data without knowledge of the previous state. -## Usage examples: +## Usage examples A sparse table with a per-user credit balance: @@ -67,7 +67,7 @@ For the `credit_balance` example, this would be the current credit amount of a u Upserts can and will still conflict with concurrent transactions: -``` +```text ERROR: conflict with concurrent transaction ``` diff --git a/content/references/expressions/try.md b/content/references/expressions/try.md index 0af32b31..91797c90 100644 --- a/content/references/expressions/try.md +++ b/content/references/expressions/try.md @@ -20,7 +20,7 @@ with input(str) as (values ('42'), ('oops')) select str::int from input; ``` -``` +```text ERROR: invalid number format for integer: no digits found in "oops" ``` @@ -31,7 +31,7 @@ with input(str) as (values ('42'), ('oops')) select try(str::int) from input; ``` -``` +```text try -------- 42 @@ -51,7 +51,7 @@ with input(str) as (values ('1'), ('0'), ('oops')) select try(1::numeric / str::int) from input; ``` -``` +```text try ---------- 1.000000 diff --git a/content/references/functions/advanced_functions/_index.md 
b/content/references/functions/advanced_functions/_index.md index 60f870eb..363327c3 100644 --- a/content/references/functions/advanced_functions/_index.md +++ b/content/references/functions/advanced_functions/_index.md @@ -9,14 +9,17 @@ In addition to type specific and aggregate functions, CedarDB offers additional ## IO Functionality ### csvview + CSV Views allow you to work with temporary data in external CSV files without first copying them into the database. Example: + ```sql select * from csvview('movies.csv', 'delimiter \",\",header', 'id integer not null, title text'); ``` The `csvview` function takes 3 arguments, the last being optional: + 1. Filename 2. CSV Options 3. Schema (optional) @@ -28,13 +31,14 @@ CSV options need to be encoded into a single string, with special characters, e. The optional third parameter allows to specify the schema of the CSV file, which is autodetected by default. This can be useful if the user has additional information on the properties of the csv file, such as non-nullable colunms. - ## Advanced Analytical Functions + ### kmeans CedarDB provides an optimized implementation to cluster points with any number of dimensions using the k-Means clustering algorithm with an euclidean distance. Example: + ```sql select * from kmeans(table (select id, title, year, length from movies), 5 order by char_length(title), year, length); ``` @@ -51,21 +55,24 @@ The `oder by` clause specifies which attributes the k-Means algorithm should use The `kmeans` function returns an entire table, so you can only use it anywhere you could also write a subquery. The returned table contains all attributes and tuples of the input and one additional column with the name `cluster_id` which contains an integer between 0 and the specified number of clusters (exclusive). For each tuple the `cluster_id` attribute specifies the cluster the tuple was assigned to. 
- ## Utility Functions ### hash + The `hash` function allows to calculate efficient 64-bit hash values over an arbitrary number of input arguments. Example: + ```sql select title, hash(id, title, genre) from movies; ``` ### normalize_datetime + `normalize_datetime` is a helper function to cast all date types to the timestamp type. Example: + ```sql select normalize_datetime(birthdate) from stars; ``` @@ -73,9 +80,11 @@ select normalize_datetime(birthdate) from stars; For times without date, such as `TIME` and `INTERVAL`, the date is set to `01.01.1970`. ### split_part + `split_part` allows to split text types by a separator string and access the result of the split by index. Example: + ```sql select split_part('abbcdbbef', 'bb', 2); ``` diff --git a/content/references/functions/bitstring.md b/content/references/functions/bitstring.md index 1f9c23b7..b0dd6159 100644 --- a/content/references/functions/bitstring.md +++ b/content/references/functions/bitstring.md @@ -13,23 +13,29 @@ See [PostgreSQL: Bit string functions and operators](https://www.postgresql.org/ ## General-purpose functions -#### `bit & bit` +### `bit & bit` + Bitwise AND (inputs must be of equal length). Example: + ```sql SELECT B'10011' & B'10101'; -> B'10001' ``` -#### `bit | bit` +### `bit | bit` + Bitwise OR (inputs must be of equal length). Example: + ```sql SELECT B'10011' | B'10101'; -> B'10111' ``` -#### `bit # bit` +### `bit # bit` + Bitwise XOR (inputs must be of equal length). Example: + ```sql SELECT B'10011' # B'10101'; -> B'00110' ``` diff --git a/content/references/functions/json.md b/content/references/functions/json.md index cbf233b8..9d72bbe0 100644 --- a/content/references/functions/json.md +++ b/content/references/functions/json.md @@ -23,7 +23,7 @@ When the key is not found, it returns `null`. select data->'name' from json_data; ``` -``` +```text name ------------- "philipp" @@ -45,7 +45,7 @@ It returns `null` for out-of-bounds access. 
select data->'friends'->0 from json_data; ``` -``` +```text 0 --- 2 @@ -64,7 +64,7 @@ This converts any value, especially JSON strings, but also integers and nested o select data->>'name' from json_data; ``` -``` +```text name --------- philipp @@ -82,7 +82,7 @@ select data->>'name' from json_data; select data::text from json_data limit 1; ``` -``` +```text text ------------------------------------------------- {"id": 1, "name": "philipp", "friends": [2, 3]} @@ -102,7 +102,7 @@ The `json_array_length()` function allows calculating the number of elements in select json_array_length(data->'friends') from json_data; ``` -``` +```text json_array_length ------------------- 2 @@ -124,7 +124,7 @@ select data->'id', json_array_elements(data->'friends') from json_data; ``` -``` +```text id | json_array_elements ----+--------------------- 3 | 1 @@ -137,39 +137,45 @@ from json_data; ``` ## Containment and Existence + The `jsonb_contains` function answers whether a given `jsonb` document is structurally contained within another `jsonb` document. For example, the following query finds the name of the people that consider Max as a friend. + ```sql select data->'name' from json_data where jsonb_contains(data, '{"friends": [2]}'); ``` -``` + +```text name ----------- "philipp" (1 row) ``` -The `@>` operator performs the same operation when applied to json data. +The `@>` operator performs the same operation when applied to json data. The `jsonb_exists` function and the equivalent `?` operator can determine if a given jsonb document has a given text as an object key or as an array value. ```sql select data->'name', data->'nick' from json_data where data ? 'nick'; ``` -``` + +```text name | nick -------------+--------- "christian" | "chris" (1 row) ``` -Additionally, CedarDB supports the `jsonb_exists_all` (`?&` operator) and `jsonb_exists_any` (`?|`) variants, which check for the existence of all (or any) of a given set of keys. 
+Additionally, CedarDB supports the `jsonb_exists_all` (`?&` operator) and `jsonb_exists_any` (`?|`) variants, which check for the existence of all (or any) of a given set of keys. + ```sql select data->'name', data->'nick' from json_data where jsonb_exists_any(data, ARRAY['nick', 'name']); ``` -``` + +```text name | nick -------------+--------- "philipp" | @@ -183,20 +189,25 @@ where jsonb_exists_any(data, ARRAY['nick', 'name']); select data->'name', data->'nick' from json_data where jsonb_exists_all(data, ARRAY['nick', 'name']); ``` -``` + +```text name | nick -------------+--------- "christian" | "chris" (1 rows) ``` + For the full semantics, refer to the PostgreSQL documentation: [PostgreSQL JSONB containment and existence](https://www.postgresql.org/docs/17/datatype-json.html#JSON-CONTAINMENT) ## Concatenation + The `jsonb_concat` operation concatenates two jsonb documents. To use it, call the `jsonb_concat` function or by providing `jsonb` as input to the `||` operator. + ```sql select data || '{"country": "Germany"}' from jsonb_data. ``` -``` + +```text ?column? --------------------------------------------------------------------------------------- {"id": 1, "name": "philipp", "country": "Germany", "friends": [2, 3]} @@ -209,7 +220,8 @@ select data || '{"country": "Germany"}' from jsonb_data. ```sql select (data->'friends') || (data->>'id')::jsonb as me_and_my_friends from json_data; ``` -``` + +```text me_and_my_friends ------------------- [2, 3, 1] diff --git a/content/references/functions/system.md b/content/references/functions/system.md index b1ad81f6..f5082bca 100644 --- a/content/references/functions/system.md +++ b/content/references/functions/system.md @@ -9,13 +9,13 @@ CedarDB supports a variety of PostgreSQL functions. This page currently only des Similar to [PostgreSQL](https://www.postgresql.org/docs/current/explicit-locking.html#ADVISORY-LOCKS), CedarDB advisory locks provide a means for creating _application-defined locks_. 
-A common example is their use in database migration tools such as *Flyway* or *Liquibase*. +A common example is their use in database migration tools such as _Flyway_ or _Liquibase_. When multiple instances of an application start simultaneously, they might all attempt to apply schema migrations at once. To prevent race conditions or conflicting DDL changes, these tools use advisory locks to ensure that only one process runs migrations at a time. There are two ways to acquire an advisory lock in CedarDB: _at the session level_ or _at the transaction level_. -- _Session_-level advisory locks are held until explicitly released or the session ends. When a client disconnects, the session is closed and all locks held by it are automatically released. +- _Session_-level advisory locks are held until explicitly released or the session ends. When a client disconnects, the session is closed and all locks held by it are automatically released. They are not subject to transaction semantics—if a transaction that acquired a session-level lock is rolled back, the lock remains held. Likewise, an unlock operation remains effective even if the transaction later fails. A lock can be acquired multiple times by the same session; each acquisition must be matched by a corresponding unlock before the lock is fully released. @@ -63,7 +63,7 @@ Each locking function comes in two variants: If waiting would lead to a deadlock, CedarDB automatically aborts the request with the runtime error: -``` +```text ERROR: deadlock_detected (40P01) ``` @@ -77,85 +77,100 @@ Note that advisory locks can participate in deadlock cycles together with other Below is an exhaustive list of all supported advisory lock functions in CedarDB: -#### pg_advisory_lock +### pg_advisory_lock Obtains an exclusive session-level advisory lock, waiting if necessary. 
-``` +```text pg_advisory_lock (key Bigint) → Void pg_advisory_lock (key1 Integer, key2 Integer) → Void ``` -#### pg_advisory_lock_shared +### pg_advisory_lock_shared + Obtains a shared session-level advisory lock, waiting if necessary. -``` + +```text pg_advisory_lock_shared (key Bigint) → Void pg_advisory_lock_shared (key1 Integer, key2 Integer) → Void ``` +### pg_advisory_unlock -#### pg_advisory_unlock Releases a previously acquired exclusive session-level advisory lock. Returns true if the lock is successfully released. If the lock was not held, false is returned, and in addition, an SQL warning will be reported by the server. -``` + +```text pg_advisory_unlock(key Bigint) → Boolean pg_advisory_unlock(key1 Integer, key2 Integer) → Boolean ``` -#### pg_advisory_unlock_all +### pg_advisory_unlock_all + Releases all session-level advisory locks held by the current session. (This function is implicitly invoked at session end, even if the client disconnects ungracefully.) -``` + +```text pg_advisory_unlock_all() → Void ``` -#### pg_advisory_unlock_shared +### pg_advisory_unlock_shared + Releases a previously acquired shared session-level advisory lock. Returns true if the lock is successfully released. If the lock was not held, false is returned, and in addition, an SQL warning will be reported by the server. -``` + +```text pg_advisory_unlock_shared(key Bigint) → Boolean pg_advisory_unlock_shared(key1 Integer, key2 Integer) → Boolean ``` -#### pg_advisory_xact_lock +### pg_advisory_xact_lock + Obtains an exclusive transaction-level advisory lock, waiting if necessary. -``` + +```text pg_advisory_xact_lock(key Bigint) → Void pg_advisory_xact_lock(key1 Integer, key2 Integer) → Void ``` +### pg_advisory_xact_lock_shared -#### pg_advisory_xact_lock_shared Obtains a shared transaction-level advisory lock, waiting if necessary. 
-``` + +```text pg_advisory_xact_lock_shared(key Bigint) → Void pg_advisory_xact_lock_shared(key1 Integer, key2 Integer) → Void ``` -#### pg_try_advisory_lock +### pg_try_advisory_lock + Obtains an exclusive session-level advisory lock if available. This will either obtain the lock immediately and return true, or return false without waiting if the lock cannot be acquired immediately. -``` + +```text pg_try_advisory_lock(key Bigint) → Boolean pg_try_advisory_lock(key1 Integer, key2 Integer) → Boolean ``` +### pg_try_advisory_lock_shared -#### pg_try_advisory_lock_shared Obtains a shared session-level advisory lock if available. This will either obtain the lock immediately and return true, or return false without waiting if the lock cannot be acquired immediately. -``` + +```text pg_try_advisory_lock_shared(key Bigint) → Boolean pg_try_advisory_lock_shared(key1 Integer, key2 Integer) → Boolean ``` +### pg_try_advisory_xact_lock -#### pg_try_advisory_lock_shared Obtains an exclusive transaction-level advisory lock if available. This will either obtain the lock immediately and return true, or return false without waiting if the lock cannot be acquired immediately. -``` +```text pg_try_advisory_xact_lock(key Bigint) → Boolean pg_try_advisory_xact_lock(key1 Integer, key2 Integer) → Boolean ``` -#### pg_try_advisory_xact_lock_shared -``` +### pg_try_advisory_xact_lock_shared + +```text pg_try_advisory_xact_lock_shared(key Bigint) → Boolean pg_try_advisory_xact_lock_shared(key1 Integer, key2 Integer) → Boolean ``` + Obtains a shared transaction-level advisory lock if available. This will either obtain the lock immediately and return true, or return false without waiting if the lock cannot be acquired immediately. diff --git a/content/references/functions/text.md b/content/references/functions/text.md index caeead11..93b00a9a 100644 --- a/content/references/functions/text.md +++ b/content/references/functions/text.md @@ -9,26 +9,26 @@ functions. 
## Functions and Operators -#### string_to_table +### string_to_table The `string_to_table` function splits a _string_ at the _delimiter_ and replaces output words that match the _null_string_ by null. If the _delimiter_ is NULL, the _string_ is split into all its characters. If the _delimiter_ is empty, the _string_ is not split at all. The final result is returned as a column of type **text** where each output word is a row. -##### **Syntax** +#### **Syntax** -``` +```text string_to_table(string Text, delimiter Text [, null_string Text])) -> setof Text ``` -##### **Examples** +#### **Examples** ```sql select string_to_table('The General Sherman tree is the largest tree in the world.', ' ', 'the'); ``` -``` +```text string_to_table ----------------- The @@ -61,7 +61,7 @@ matching case-insensitive. ##### **Syntax** -``` +```text regexp_count(string Text, pattern Text [, start Integer [, flags Text]]) -> Integer ``` @@ -71,7 +71,7 @@ regexp_count(string Text, pattern Text [, start Integer [, flags Text]]) -> Inte select regexp_count('The General Sherman tree is the largest tree in the world.', 'Tree', 23, 'i'); ``` -``` +```text regexp_count -------------- 1 @@ -91,7 +91,7 @@ defaults to 0, which leads to identifying the position of the whole match regard ##### **Syntax** -``` +```text regexp_instr(string Text, pattern Text[, start Integer [, N Integer [, endpoint Integer [, flags Text [, subexpr Integer ]]]]]) -> Integer ``` @@ -100,7 +100,8 @@ regexp_instr(string Text, pattern Text[, start Integer [, N Integer [, endpoint ```sql select regexp_instr('The General Sherman tree is the largest tree in the world.', 'Tree', 23, 1, 1, 'i'); ``` -``` + +```text regexp_instr -------------- 45 @@ -115,7 +116,7 @@ for example, the `i` flag makes the pattern matching case-insensitive. 
##### **Syntax** -``` +```text regexp_like(string Text, pattern Text [, flags Text])) -> Boolean ``` @@ -124,7 +125,8 @@ regexp_like(string Text, pattern Text [, flags Text])) -> Boolean ```sql select regexp_like('The General Sherman tree is the largest tree in the world.', 'Tree.*Largest', 'i'); ``` -``` + +```text regexp_like ------------- t diff --git a/content/references/functions/timestamp.md b/content/references/functions/timestamp.md index 5415092c..61e9be69 100644 --- a/content/references/functions/timestamp.md +++ b/content/references/functions/timestamp.md @@ -13,55 +13,62 @@ functions. The `to_char` function formats a timestamp (with or without time zone) into a string based on a user-specified format pattern. -##### **Syntax** +#### **Syntax** -``` +```text to_char(timestamp Timestamp, format Text) -> Text to_char(timestamp TimestampTZ, format Text) -> Text ``` -##### **Examples** +#### **Examples** + Formatting the release date of CedarDB with `to_char` to a common U.S. datetime style: + ```sql select to_char(timestamptz 'Sep 10 2025, 5:57PM+02', 'MM/DD/YYYY HH12:MI:SS PM'); ``` -``` + +```text to_char ------------------------ 09/10/2025 05:57:00 PM ``` + Format to a common German datetime style: + ```sql select to_char(timestamptz 'Sep 10 2025, 5:57PM+02', 'DD.MM.YYYY HH24:MI:SS'); ``` -``` + +```text to_char --------------------- 10.09.2025 17:57:00 ``` -##### **Supported Format Patterns** +#### **Supported Format Patterns** + Currently, CedarDB supports a limited but commonly used subset of PostgreSQL format patterns: -| Pattern | Description | -|-----------------------------------|------------------------------------------------| -| `AD`, `BC` | Era indicator without periods (uppercase) | -| `ad`, `bc` | Era indicator without periods (lowercase) | -| `A.D.`, `B.C.` | Era indicator with periods (uppercase) | -| `a.d.`, `b.c.` | Era indicator with periods (lowercase) | -| `AM` / `PM` | Meridiem indicator (before/after noon) | -| `DAY` /`Day`/ `day` | 
Day name (MONDAY / Monday / monday) | -| `DY` / `Dy` / `dy` | Abbreviated day name (MON / Mon / mon) | -| `ID` | ISO day of the week (1–7, Monday=1) | -| `DD` | Day of month (01–31) | -| `MM` | Month number (01–12) | -| `MONTH` / `Month` / `month` | Month name (MARCH / March / march) | -| `MON` / `Mon` / `mon` | Abbreviated month name (MAR / Mar / mar) | -| `YYYY` | Year (4 digits) | -| `IYYY` | ISO 8601 week-numbering year (4 digits) | -| `MI` | Minute (00–59) | -| `HH`, `HH12` | Hour (01–12) | -| `HH24` | Hour (00–23) | -| `SS` | Seconds (00–59) | -| `SSSS` | Seconds past midnight (0–86399) | -| `:`, `;`, `,`, ` `, `.`, `-`, `/` | Char separators | +| Pattern | Description | +|-----------------------------------|-------------------------------------------| +| `AD`, `BC` | Era indicator without periods (uppercase) | +| `ad`, `bc` | Era indicator without periods (lowercase) | +| `A.D.`, `B.C.` | Era indicator with periods (uppercase) | +| `a.d.`, `b.c.` | Era indicator with periods (lowercase) | +| `AM` / `PM` | Meridiem indicator (before/after noon) | +| `DAY` /`Day`/ `day` | Day name (MONDAY / Monday / monday) | +| `DY` / `Dy` / `dy` | Abbreviated day name (MON / Mon / mon) | +| `ID` | ISO day of the week (1–7, Monday=1) | +| `DD` | Day of month (01–31) | +| `MM` | Month number (01–12) | +| `MONTH` / `Month` / `month` | Month name (MARCH / March / march) | +| `MON` / `Mon` / `mon` | Abbreviated month name (MAR / Mar / mar) | +| `YYYY` | Year (4 digits) | +| `IYYY` | ISO 8601 week-numbering year (4 digits) | +| `MI` | Minute (00–59) | +| `HH`, `HH12` | Hour (01–12) | +| `HH24` | Hour (00–23) | +| `SS` | Seconds (00–59) | +| `SSSS` | Seconds past midnight (0–86399) | +| `:`, `;`, `,`, ` `, `.`, `-`, `/` | Char separators | diff --git a/content/references/objects/functions.md b/content/references/objects/functions.md index 12bc243f..ebdc6e0a 100644 --- a/content/references/objects/functions.md +++ b/content/references/objects/functions.md @@ -29,7 +29,7 @@ Now you 
can use these functions in regular SQL queries: select times_two(x::int) from generate_series(1, 3) s(x); ``` -``` +```text times_two ----------- 2 @@ -42,7 +42,7 @@ select times_two(x::int) from generate_series(1, 3) s(x); select cowsay('Hello CedarDB!'); ``` -``` +```text cowsay ------------------------------ ________________ + diff --git a/content/references/objects/indexes.md b/content/references/objects/indexes.md index ea9b8181..8bb68c72 100644 --- a/content/references/objects/indexes.md +++ b/content/references/objects/indexes.md @@ -37,6 +37,7 @@ create index complaints_index on sales(...); CedarDB currently exclusively supports B-tree indexes. All indexes, thus, support range and prefix lookup. E.g., the example index on sales can be used for queries with predicates like: + ```sql ... where customer_id = 42; ... where customer_id between 5 and 10; @@ -48,6 +49,7 @@ Additionally, you can declare an index to be `unique`, which will create a corre ## Column Order You can also specify the ordering of columns within the index: + * `asc`, `desc` * `nulls first`, `nulls last` diff --git a/content/references/objects/tables.md b/content/references/objects/tables.md index d3b62b3b..491f8b34 100644 --- a/content/references/objects/tables.md +++ b/content/references/objects/tables.md @@ -52,10 +52,11 @@ create table orders ( ``` The `constraint ` part can be omitted entirely, in which case CedarDB automatically assigns a default name using the same conventions as PostgreSQL: -- Primary key: `tablename_pkey` -- Unique: `tablename_colname_key` -- Foreign key: `tablename_colname_fkey` -- Check: `tablename_colname_check` + +* Primary key: `tablename_pkey` +* Unique: `tablename_colname_key` +* Foreign key: `tablename_colname_fkey` +* Check: `tablename_colname_check` These default names can be used just like explicit names, e.g., to drop a constraint with `alter table ... drop constraint `. Naming is not supported for `not null`. 
@@ -80,8 +81,6 @@ Create a table that stores all compressed data on remote server, which was creat create table table_name (...) with (server = remote_storage); ``` - - ### Permissions To create a role, you need to have superuser or `createrole` permissions. @@ -129,9 +128,11 @@ See [PostgreSQL: ALTER TABLE](https://www.postgresql.org/docs/current/sql-altert ### Column statements #### `RENAME COLUMN` + ```sql ALTER TABLE IF EXISTS movies RENAME COLUMN runlength TO duration; ``` + Rename column of table with `table_name` and `current_column_name` to the `new_column_name`. If exists checks if the table exists and only tries to rename in the case of existence. @@ -149,9 +150,10 @@ ALTER TABLE orders ADD CONSTRAINT orders_unique unique (customer, item); ``` The `CONSTRAINT ` part can be omitted entirely, in which case CedarDB automatically assigns a default name using the same conventions as PostgreSQL: -- Primary key: `tablename_pkey` -- Unique: `tablename_colname_key` -- Foreign key: `tablename_colname_fkey` + +* Primary key: `tablename_pkey` +* Unique: `tablename_colname_key` +* Foreign key: `tablename_colname_fkey` #### `DROP CONSTRAINT` diff --git a/content/references/queries/select.md b/content/references/queries/select.md index 47f64d10..b4b909b2 100644 --- a/content/references/queries/select.md +++ b/content/references/queries/select.md @@ -15,6 +15,7 @@ For fast single-element access with `where`, consider specifying `id` as primary {{< /callout >}} You can also use [expressions](../../expressions) or [functions](/docs/references/functions) to transform your data: + ```sql select date_trunc('month', release_date) from movies; ``` diff --git a/content/references/sessions/settings.md b/content/references/sessions/settings.md index a591f99e..1b11fc9a 100644 --- a/content/references/sessions/settings.md +++ b/content/references/sessions/settings.md @@ -11,7 +11,7 @@ Usage example: show TimeZone; ``` -``` +```text timezone --------------- Europe/Berlin @@ -22,7 
+22,7 @@ show TimeZone; set timezone='US/Pacific'; ``` -``` +```text SET 0 ``` diff --git a/content/references/utility/explain.md b/content/references/utility/explain.md index 633ea417..7742d4f5 100644 --- a/content/references/utility/explain.md +++ b/content/references/utility/explain.md @@ -21,7 +21,7 @@ group by o_orderpriority order by o_orderpriority ``` -``` +```text plan --------------------------------------------------------- 🖩 OUTPUT (Estimate: 5) + @@ -37,7 +37,7 @@ This plan shows an overview over how CedarDB plans to execute the query. Annotated in the plan are the estimated output sizes of the operators, which CedarDB uses to determine the best algorithms and execution order. -How to read a plan: +How to read a plan: As a default, CedarDB's query plans are trees that are shown as text where child nodes are indented (only if a node has at least one child). The uppermost operator is the result output and the input tables are the most indented children. CedarDB generally executes plans starting at the input nodes going up in the plan. @@ -54,7 +54,7 @@ In addition, CedarDB annotates timing information to identify costly operations. explain analyze select ... ``` -``` +```text plan -------------------------------------------------------------------------------- 🖩 OUTPUT () + @@ -80,7 +80,8 @@ enable `verbose` output: ```sql explain verbose select ... ``` -``` + +```text plan ---------------------------------------------------------------------------------------------------------------------------------------------- 🖩 OUTPUT (Estimate: 5) + @@ -103,6 +104,7 @@ Here, you can now see the used index (`lineitem_pkey`), aggregated expressions ( ## Format CedarDB can reconstruct the query plan in the following output formats: + - flat (default) - tree - sql @@ -116,7 +118,7 @@ For a graphical presentation of the query plan, you can use the tree format: explain (format tree) select ... 
``` -``` +```text plan ---------------------------------------------- ┌────────────┐ + @@ -209,6 +211,7 @@ explain (step ) select ... For `step_value`, the number of optimization steps to be performed can be passed as an integer. Alternatively, the name of the last step to be applied can be used. The possible values are: + - NoOptimizations - ExpressionSimplification - Unnesting diff --git a/content/references/writecache.md b/content/references/writecache.md index 287de898..37f2e910 100644 --- a/content/references/writecache.md +++ b/content/references/writecache.md @@ -44,7 +44,7 @@ the [Linux block sysfs](https://www.kernel.org/doc/Documentation/ABI/stable/sysf cat /sys/block/nvme0n1/queue/write_cache ``` -``` +```text write through ``` @@ -58,8 +58,8 @@ You can also verify if your NVMe SSD uses a volatile cache using the `nvme-cli` sudo nvme id-ctrl /dev/nvme0n1 -H | grep 'Write Cache' ``` -``` - [0:0] : 0 Volatile Write Cache Not Present +```text + [0:0] : 0 Volatile Write Cache Not Present ``` ## CedarDB Durability Guarantees diff --git a/content/releases.md b/content/releases.md index fc878576..f2f71c64 100644 --- a/content/releases.md +++ b/content/releases.md @@ -3,7 +3,7 @@ title: "CedarDB Releases" linkTitle: "Releases" --- -### Current Release +## Current Release You can automatically [install the latest release](/docs/get_started/install_locally) using our helper script: @@ -30,6 +30,6 @@ For manual download, please choose the correct version for your system: {{< releasenotes current="true" >}} -### Older Versions +## Older Versions {{< releasenotes current="false" >}} diff --git a/content/roadmap.md b/content/roadmap.md index f69c8a60..9d95b864 100644 --- a/content/roadmap.md +++ b/content/roadmap.md @@ -10,11 +10,11 @@ Information about intermediate status of features and minor features will be ava We categorise the status of the features on this page as planned, in progress, and fully available: -| **State** | **Icon** | 
-|--------------------|:---------------------:| -| Planned | {{< iconplanned >}} | +| **State** | **Icon** | +|--------------------|:----------------------:| +| Planned | {{< iconplanned >}} | | Under Construction | {{< iconinprogress >}} | -| Available | {{< icondone >}} | +| Available | {{< icondone >}} | A more detailed overview of our PostgreSQL compatibility level can be found on the separate [Compatibility](../compatibility/) page. @@ -37,30 +37,30 @@ CedarDB. ### Data Model & Domain-Specific Features -| **Feature** | **State** | **Details** | -|------------------------------|:--------------------:|------------------------------------------------------------| -| AsOf joins | {{< icondone >}} | [Documentation](/docs/references/advanced/asof_join/) | -| Fulltext search | {{< iconplanned >}} | | -| Enhanced graph query support | {{< iconplanned >}} | | -| Range types | {{< iconinprogress >}} | | -| Schema evolution | {{< iconinprogress >}} | [Documentation](/docs/references/objects/) | -| Vector support | {{< iconinprogress >}} | [Documentation](/docs/references/advanced/pgvector/) | +| **Feature** | **State** | **Details** | +|------------------------------|:----------------------:|-------------------------------------------------------| +| AsOf joins | {{< icondone >}} | [Documentation](/docs/references/advanced/asof_join/) | +| Fulltext search | {{< iconplanned >}} | | +| Enhanced graph query support | {{< iconplanned >}} | | +| Range types | {{< iconinprogress >}} | | +| Schema evolution | {{< iconinprogress >}} | [Documentation](/docs/references/objects/) | +| Vector support | {{< iconinprogress >}} | [Documentation](/docs/references/advanced/pgvector/) | ### Data Formats -| **Feature** | **State** | **Details** | -|-----------------------|:--------------------:|------------------------------------------------------------| -| Parquet reader | {{< iconplanned >}} | | -| Parquet writer | {{< iconplanned >}} | | -| Iceberg support | {{< iconplanned >}} | | +| 
**Feature** | **State** | **Details** | +|-----------------------|:----------------------:|------------------------------------------------------------| +| Parquet reader | {{< iconplanned >}} | | +| Parquet writer | {{< iconplanned >}} | | +| Iceberg support | {{< iconplanned >}} | | | pg_dump compatibility | {{< iconinprogress >}} | [Documentation](/docs/cookbook/importing_from_postgresql/) | ### Connectivity -| **Feature** | **State** | **Details** | -|--------------------------------|:--------------------:|----------------------------------------------------| -| PostgreSQL system tables | {{< icondone >}} | [Documentation](/docs/compatibility/system_table/) | -| information_schema support | {{< icondone >}} | [Documentation](/docs/compatibility/system_table/) | +| **Feature** | **State** | **Details** | +|--------------------------------|:----------------------:|----------------------------------------------------| +| PostgreSQL system tables | {{< icondone >}} | [Documentation](/docs/compatibility/system_table/) | +| information_schema support | {{< icondone >}} | [Documentation](/docs/compatibility/system_table/) | | PostgreSQL Logical replication | {{< iconinprogress >}} | | | Support for more CDC tools | {{< iconinprogress >}} | [Documentation](/docs/cookbook/aurora_debezium/) | @@ -73,8 +73,8 @@ production use of CedarDB. These features may not be available to all CedarDB us ### Operations -| **Feature** | **State** | **Details** | -|-----------------------------|:------------------:|-------------| +| **Feature** | **State** | **Details** | +|-----------------------------|:-------------------:|-------------| | Read replication to CedarDB | {{< iconplanned >}} | | | Automatic failover | {{< iconplanned >}} | | | Automatic backups | {{< iconplanned >}} | | @@ -84,8 +84,8 @@ production use of CedarDB. 
These features may not be available to all CedarDB us ### Multi Tenancy -| **Feature** | **State** | **Details** | -|----------------------------------------|:--------------------:|-------------| -| Resource limits for individual tenants | {{< iconplanned >}} | | +| **Feature** | **State** | **Details** | +|----------------------------------------|:----------------------:|-------------| +| Resource limits for individual tenants | {{< iconplanned >}} | | | Extended role & grant management | {{< iconinprogress >}} | | -| Fair scheduling over multiple tenants | {{< iconplanned >}} | | +| Fair scheduling over multiple tenants | {{< iconplanned >}} | | diff --git a/content/technology/parallelism.md b/content/technology/parallelism.md index 477bda0e..e463aa0b 100644 --- a/content/technology/parallelism.md +++ b/content/technology/parallelism.md @@ -21,13 +21,11 @@ workers a query should be processed. To avoid that the whole system becomes unresponsive when under high load, static parallelism cannot dedicate all available hardware resources to a single query, resulting in low overall utilization. -
{{< asciinema key="htop_cedardb" scale="width" autoPlay=true >}}
{{< asciinema key="htop_postgresql" autoPlay=true >}}
- In the image above, PostgreSQL uses only 10 cores, while CedarDB uses all 48 cores. This uses the hardware much more efficiently, and ensures you always get the best query latency from your hardware. @@ -52,7 +50,7 @@ Compared to other database systems, the total system load will be higher, usuall cat /proc/loadavg ``` -``` +```text 19.54 19.14 18.89 25/5342 368920 ``` From 7a2d59e98a8324f0914bcff53d38c3a509394a1b Mon Sep 17 00:00:00 2001 From: Philipp Fent Date: Fri, 22 May 2026 16:41:09 +0200 Subject: [PATCH 4/4] add vale style checker --- .github/workflows/ci.yml | 11 ++ .vale-styles/.vale-config/1-Hugo.ini | 10 + .vale-styles/Google/Contractions.yml | 30 +++ .vale-styles/Google/FirstPerson.yml | 13 ++ .vale-styles/Google/Gender.yml | 9 + .vale-styles/Google/GenderBias.yml | 43 ++++ .vale-styles/Google/Headings.yml | 29 +++ .vale-styles/Google/Latin.yml | 11 ++ .vale-styles/Google/LyHyphens.yml | 14 ++ .vale-styles/Google/OptionalPlurals.yml | 12 ++ .vale-styles/Google/Ordinal.yml | 7 + .vale-styles/Google/OxfordComma.yml | 7 + .vale-styles/Google/Passive.yml | 184 ++++++++++++++++++ .vale-styles/Google/Ranges.yml | 7 + .vale-styles/Google/Spacing.yml | 10 + .vale-styles/Google/Units.yml | 8 + .vale-styles/Google/We.yml | 11 ++ .vale-styles/Google/Will.yml | 7 + .../config/vocabularies/CedarDB/accept.txt | 112 +++++++++++ .../config/vocabularies/CedarDB/reject.txt | 0 .vale.ini | 10 + Structure.md | 10 +- Styleguide.md | 4 +- content/clients/_index.md | 2 +- content/clients/csharp/_index.md | 2 +- content/clients/java/_index.md | 4 +- content/clients/javascript/prisma.md | 2 +- content/clients/python/_index.md | 2 +- content/clients/rust/_index.md | 2 +- content/clients/tools/grafana.md | 8 +- content/cookbook/aurora_debezium.md | 4 +- content/cookbook/aws_dms.md | 10 +- content/cookbook/importing_from_json.md | 2 +- content/cookbook/importing_from_postgresql.md | 2 +- content/cookbook/read_replica_tutorial.md | 4 +- content/cookbook/working_with_csv.md | 2 +- 
content/example_datasets/glove.md | 4 +- content/example_datasets/nasdaq.md | 2 +- content/get_started/install_with_docker.md | 4 +- content/licensing.md | 2 +- content/references/advanced/_index.md | 2 +- content/references/advanced/gs.md | 4 +- content/references/advanced/parquet.md | 8 +- content/references/advanced/pgvector.md | 2 +- content/references/advanced/prepare.md | 2 +- content/references/configuration.md | 2 +- content/references/datatypes/blob.md | 4 +- content/references/datatypes/boolean.md | 2 +- content/references/datatypes/date.md | 2 +- content/references/datatypes/enums.md | 4 +- content/references/datatypes/float.md | 2 +- content/references/datatypes/integer.md | 2 +- content/references/datatypes/numeric.md | 4 +- content/references/datatypes/uuid.md | 4 +- content/references/dml/upsert.md | 2 +- content/references/expressions/_index.md | 2 +- .../functions/advanced_functions/_index.md | 4 +- content/references/functions/json.md | 6 +- content/references/functions/timestamp.md | 2 +- content/references/objects/indexes.md | 2 +- content/references/objects/policies.md | 4 +- content/references/objects/roles.md | 22 +-- content/roadmap.md | 4 +- 63 files changed, 627 insertions(+), 82 deletions(-) create mode 100644 .vale-styles/.vale-config/1-Hugo.ini create mode 100644 .vale-styles/Google/Contractions.yml create mode 100644 .vale-styles/Google/FirstPerson.yml create mode 100644 .vale-styles/Google/Gender.yml create mode 100644 .vale-styles/Google/GenderBias.yml create mode 100644 .vale-styles/Google/Headings.yml create mode 100644 .vale-styles/Google/Latin.yml create mode 100644 .vale-styles/Google/LyHyphens.yml create mode 100644 .vale-styles/Google/OptionalPlurals.yml create mode 100644 .vale-styles/Google/Ordinal.yml create mode 100644 .vale-styles/Google/OxfordComma.yml create mode 100644 .vale-styles/Google/Passive.yml create mode 100644 .vale-styles/Google/Ranges.yml create mode 100644 .vale-styles/Google/Spacing.yml create mode 100644 
.vale-styles/Google/Units.yml create mode 100644 .vale-styles/Google/We.yml create mode 100644 .vale-styles/Google/Will.yml create mode 100644 .vale-styles/config/vocabularies/CedarDB/accept.txt create mode 100644 .vale-styles/config/vocabularies/CedarDB/reject.txt create mode 100644 .vale.ini diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e82c133f..356e0669 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,3 +24,14 @@ jobs: - name: Run markdownlint uses: DavidAnson/markdownlint-cli2-action@v23.2.0 + + vale: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Vale Spell and Style Check + uses: vale-cli/vale-action@2.1.2 + continue-on-error: true + with: + files: content/ diff --git a/.vale-styles/.vale-config/1-Hugo.ini b/.vale-styles/.vale-config/1-Hugo.ini new file mode 100644 index 00000000..4347ca9e --- /dev/null +++ b/.vale-styles/.vale-config/1-Hugo.ini @@ -0,0 +1,10 @@ +[*.md] +# Exclude `{{< ... >}}`, `{{% ... %}}`, [Who]({{< ... >}}) +TokenIgnores = ({{[%<] .* [%>]}}.*?{{[%<] ?/.* [%>]}}), \ +(\[.+\]\({{< .+ >}}\)), \ +[^\S\r\n]({{[%<] \w+ .+ [%>]}})\s, \ +[^\S\r\n]({{[%<](?:/\*) .* (?:\*/)[%>]}})\s + +# Exclude `{{< myshortcode `This is some HTML, ... >}}` +BlockIgnores = (?sm)^({{[%<] \w+ [^{]*?\s[%>]}})\n$, \ +(?s) *({{< highlight [^>]* ?>}}.*?{{< ?/ ?highlight >}}) diff --git a/.vale-styles/Google/Contractions.yml b/.vale-styles/Google/Contractions.yml new file mode 100644 index 00000000..4f6fd5d4 --- /dev/null +++ b/.vale-styles/Google/Contractions.yml @@ -0,0 +1,30 @@ +extends: substitution +message: "Use '%s' instead of '%s'." 
+link: 'https://developers.google.com/style/contractions' +level: suggestion +ignorecase: true +action: + name: replace +swap: + are not: aren't + cannot: can't + could not: couldn't + did not: didn't + do not: don't + does not: doesn't + has not: hasn't + have not: haven't + how is: how's + is not: isn't + it is: it's + should not: shouldn't + that is: that's + they are: they're + was not: wasn't + we are: we're + we have: we've + were not: weren't + what is: what's + when is: when's + where is: where's + will not: won't diff --git a/.vale-styles/Google/FirstPerson.yml b/.vale-styles/Google/FirstPerson.yml new file mode 100644 index 00000000..0b7b8828 --- /dev/null +++ b/.vale-styles/Google/FirstPerson.yml @@ -0,0 +1,13 @@ +extends: existence +message: "Avoid first-person pronouns such as '%s'." +link: 'https://developers.google.com/style/pronouns#personal-pronouns' +ignorecase: true +level: warning +nonword: true +tokens: + - (?:^|\s)I\s + - (?:^|\s)I,\s + - \bI'm\b + - \bme\b + - \bmy\b + - \bmine\b diff --git a/.vale-styles/Google/Gender.yml b/.vale-styles/Google/Gender.yml new file mode 100644 index 00000000..c8486181 --- /dev/null +++ b/.vale-styles/Google/Gender.yml @@ -0,0 +1,9 @@ +extends: existence +message: "Don't use '%s' as a gender-neutral pronoun." +link: 'https://developers.google.com/style/pronouns#gender-neutral-pronouns' +level: error +ignorecase: true +tokens: + - he/she + - s/he + - \(s\)he diff --git a/.vale-styles/Google/GenderBias.yml b/.vale-styles/Google/GenderBias.yml new file mode 100644 index 00000000..36f5a3f8 --- /dev/null +++ b/.vale-styles/Google/GenderBias.yml @@ -0,0 +1,43 @@ +extends: substitution +message: "Consider using '%s' instead of '%s'." 
+ignorecase: true +link: "https://developers.google.com/style/inclusive-documentation" +level: error +action: + name: replace +swap: + (?:alumna|alumnus): graduate + (?:alumnae|alumni): graduates + air(?:m[ae]n|wom[ae]n): pilot(s) + anchor(?:m[ae]n|wom[ae]n): anchor(s) + authoress: author + camera(?:m[ae]n|wom[ae]n): camera operator(s) + door(?:m[ae]n|wom[ae]n): concierge(s) + draft(?:m[ae]n|wom[ae]n): drafter(s) + fire(?:m[ae]n|wom[ae]n): firefighter(s) + fisher(?:m[ae]n|wom[ae]n): fisher(s) + fresh(?:m[ae]n|wom[ae]n): first-year student(s) + garbage(?:m[ae]n|wom[ae]n): waste collector(s) + lady lawyer: lawyer + ladylike: courteous + mail(?:m[ae]n|wom[ae]n): mail carriers + man and wife: husband and wife + man enough: strong enough + mankind: human kind|humanity + manmade: manufactured + manpower: personnel + middle(?:m[ae]n|wom[ae]n): intermediary + news(?:m[ae]n|wom[ae]n): journalist(s) + ombuds(?:man|woman): ombuds + oneupmanship: upstaging + poetess: poet + police(?:m[ae]n|wom[ae]n): police officer(s) + repair(?:m[ae]n|wom[ae]n): technician(s) + sales(?:m[ae]n|wom[ae]n): salesperson or sales people + service(?:m[ae]n|wom[ae]n): soldier(s) + steward(?:ess)?: flight attendant + tribes(?:m[ae]n|wom[ae]n): tribe member(s) + waitress: waiter + woman doctor: doctor + woman scientist[s]?: scientist(s) + work(?:m[ae]n|wom[ae]n): worker(s) diff --git a/.vale-styles/Google/Headings.yml b/.vale-styles/Google/Headings.yml new file mode 100644 index 00000000..c8d5be26 --- /dev/null +++ b/.vale-styles/Google/Headings.yml @@ -0,0 +1,29 @@ +extends: capitalization +message: "'%s' should use sentence-style capitalization."
+link: "https://developers.google.com/style/capitalization#capitalization-in-titles-and-headings" +level: warning +scope: heading +match: $sentence +indicators: + - ":" +exceptions: + - Azure + - CLI + - Cosmos + - Docker + - Emmet + - gRPC + - I + - Kubernetes + - Linux + - macOS + - Marketplace + - MongoDB + - REPL + - Studio + - TypeScript + - URLs + - Visual + - VS + - Windows + - JSON diff --git a/.vale-styles/Google/Latin.yml b/.vale-styles/Google/Latin.yml new file mode 100644 index 00000000..ca03b915 --- /dev/null +++ b/.vale-styles/Google/Latin.yml @@ -0,0 +1,11 @@ +extends: substitution +message: "Use '%s' instead of '%s'." +link: 'https://developers.google.com/style/abbreviations' +ignorecase: true +level: error +nonword: true +action: + name: replace +swap: + '\b(?:eg|e\.g\.)(?=[\s,;])': for example + '\b(?:ie|i\.e\.)(?=[\s,;])': that is diff --git a/.vale-styles/Google/LyHyphens.yml b/.vale-styles/Google/LyHyphens.yml new file mode 100644 index 00000000..50dacb40 --- /dev/null +++ b/.vale-styles/Google/LyHyphens.yml @@ -0,0 +1,14 @@ +extends: existence +message: "'%s' doesn't need a hyphen." +link: "https://developers.google.com/style/hyphens" +level: error +ignorecase: false +nonword: true +action: + name: edit + params: + - regex + - "-" + - " " +tokens: + - '\b[^\s-]+ly-\w+\b' diff --git a/.vale-styles/Google/OptionalPlurals.yml b/.vale-styles/Google/OptionalPlurals.yml new file mode 100644 index 00000000..4a8767d6 --- /dev/null +++ b/.vale-styles/Google/OptionalPlurals.yml @@ -0,0 +1,12 @@ +extends: existence +message: "Don't use plurals in parentheses such as in '%s'." 
+link: "https://developers.google.com/style/plurals-parentheses" +level: error +nonword: true +action: + name: edit + params: + - trim_right + - "(s)" +tokens: + - '\b\w+\(s\)' diff --git a/.vale-styles/Google/Ordinal.yml b/.vale-styles/Google/Ordinal.yml new file mode 100644 index 00000000..d1ac7d27 --- /dev/null +++ b/.vale-styles/Google/Ordinal.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Spell out all ordinal numbers ('%s') in text." +link: 'https://developers.google.com/style/numbers' +level: error +nonword: true +tokens: + - \d+(?:st|nd|rd|th) diff --git a/.vale-styles/Google/OxfordComma.yml b/.vale-styles/Google/OxfordComma.yml new file mode 100644 index 00000000..b9ba21eb --- /dev/null +++ b/.vale-styles/Google/OxfordComma.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Use the Oxford comma in '%s'." +link: 'https://developers.google.com/style/commas' +scope: sentence +level: warning +tokens: + - '(?:[^,]+,){1,}\s\w+\s(?:and|or)' diff --git a/.vale-styles/Google/Passive.yml b/.vale-styles/Google/Passive.yml new file mode 100644 index 00000000..3265890e --- /dev/null +++ b/.vale-styles/Google/Passive.yml @@ -0,0 +1,184 @@ +extends: existence +link: 'https://developers.google.com/style/voice' +message: "In general, use active voice instead of passive voice ('%s')." 
+ignorecase: true +level: suggestion +raw: + - \b(am|are|were|being|is|been|was|be)\b\s* +tokens: + - '[\w]+ed' + - awoken + - beat + - become + - been + - begun + - bent + - beset + - bet + - bid + - bidden + - bitten + - bled + - blown + - born + - bought + - bound + - bred + - broadcast + - broken + - brought + - built + - burnt + - burst + - cast + - caught + - chosen + - clung + - come + - cost + - crept + - cut + - dealt + - dived + - done + - drawn + - dreamt + - driven + - drunk + - dug + - eaten + - fallen + - fed + - felt + - fit + - fled + - flown + - flung + - forbidden + - foregone + - forgiven + - forgotten + - forsaken + - fought + - found + - frozen + - given + - gone + - gotten + - ground + - grown + - heard + - held + - hidden + - hit + - hung + - hurt + - kept + - knelt + - knit + - known + - laid + - lain + - leapt + - learnt + - led + - left + - lent + - let + - lighted + - lost + - made + - meant + - met + - misspelt + - mistaken + - mown + - overcome + - overdone + - overtaken + - overthrown + - paid + - pled + - proven + - put + - quit + - read + - rid + - ridden + - risen + - run + - rung + - said + - sat + - sawn + - seen + - sent + - set + - sewn + - shaken + - shaven + - shed + - shod + - shone + - shorn + - shot + - shown + - shrunk + - shut + - slain + - slept + - slid + - slit + - slung + - smitten + - sold + - sought + - sown + - sped + - spent + - spilt + - spit + - split + - spoken + - spread + - sprung + - spun + - stolen + - stood + - stridden + - striven + - struck + - strung + - stuck + - stung + - stunk + - sung + - sunk + - swept + - swollen + - sworn + - swum + - swung + - taken + - taught + - thought + - thrived + - thrown + - thrust + - told + - torn + - trodden + - understood + - upheld + - upset + - wed + - wept + - withheld + - withstood + - woken + - won + - worn + - wound + - woven + - written + - wrung diff --git a/.vale-styles/Google/Ranges.yml b/.vale-styles/Google/Ranges.yml new file mode 100644 index 
00000000..3ec045e7 --- /dev/null +++ b/.vale-styles/Google/Ranges.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Don't add words such as 'from' or 'between' to describe a range of numbers." +link: 'https://developers.google.com/style/hyphens' +nonword: true +level: warning +tokens: + - '(?:from|between)\s\d+\s?-\s?\d+' diff --git a/.vale-styles/Google/Spacing.yml b/.vale-styles/Google/Spacing.yml new file mode 100644 index 00000000..66e45a6b --- /dev/null +++ b/.vale-styles/Google/Spacing.yml @@ -0,0 +1,10 @@ +extends: existence +message: "'%s' should have one space." +link: 'https://developers.google.com/style/sentence-spacing' +level: error +nonword: true +action: + name: remove +tokens: + - '[a-z][.?!] {2,}[A-Z]' + - '[a-z][.?!][A-Z]' diff --git a/.vale-styles/Google/Units.yml b/.vale-styles/Google/Units.yml new file mode 100644 index 00000000..53522ab2 --- /dev/null +++ b/.vale-styles/Google/Units.yml @@ -0,0 +1,8 @@ +extends: existence +message: "Put a nonbreaking space between the number and the unit in '%s'." +link: "https://developers.google.com/style/units-of-measure" +nonword: true +level: error +tokens: + - \b\d+(?:B|kB|MB|GB|TB) + - \b\d+(?:ns|ms|s|min|h|d) diff --git a/.vale-styles/Google/We.yml b/.vale-styles/Google/We.yml new file mode 100644 index 00000000..c7ac7d36 --- /dev/null +++ b/.vale-styles/Google/We.yml @@ -0,0 +1,11 @@ +extends: existence +message: "Try to avoid using first-person plural like '%s'." +link: 'https://developers.google.com/style/pronouns#personal-pronouns' +level: warning +ignorecase: true +tokens: + - we + - we'(?:ve|re) + - ours? + - us + - let's diff --git a/.vale-styles/Google/Will.yml b/.vale-styles/Google/Will.yml new file mode 100644 index 00000000..128a9183 --- /dev/null +++ b/.vale-styles/Google/Will.yml @@ -0,0 +1,7 @@ +extends: existence +message: "Avoid using '%s'." 
+link: 'https://developers.google.com/style/tense' +ignorecase: true +level: warning +tokens: + - will diff --git a/.vale-styles/config/vocabularies/CedarDB/accept.txt b/.vale-styles/config/vocabularies/CedarDB/accept.txt new file mode 100644 index 00000000..1d739002 --- /dev/null +++ b/.vale-styles/config/vocabularies/CedarDB/accept.txt @@ -0,0 +1,112 @@ +# PostgreSQL data types +(?i)bigint +(?i)bitwise +(?i)boolean +(?i)bytea +(?i)cidr +(?i)inet +(?i)jsonb +(?i)lseg +(?i)macaddr +(?i)oid +(?i)ctid +(?i)uuid +(?i)xml + +# SQL terms +(?i)async +(?i)csv +(?i)csvs +(?i)enum +(?i)enums +(?i)json +(?i)sql +(?i)subquery +(?i)subqueries +(?i)subselect +(?i)upsert +(?i)upserts + +# Programming and tech abbreviations +(?i)cli +(?i)jdbc +(?i)gzip +(?i)kmeans +(?i)libpq +(?i)libpqxx +(?i)npgsql +(?i)psycopg +(?i)psql +(?i)ssd +(?i)ssds +(?i)stderr +(?i)sysfs +(?i)systemd + +# AWS and cloud tools +(?i)aws +(?i)gcloud +(?i)Hyperdisk + +# Tools, ORMs, and frameworks +(?i)DBeaver +(?i)Debezium +(?i)Grafana +(?i)Liquibase +(?i)Prisma +(?i)Tokio +(?i)glibc +(?i)pgvector + +# Abbreviations +(?i)CMU +(?i)CPUs +(?i)CTEs +(?i)NVMe +(?i)ORMs +(?i)SSDs +(?i)UUIDs +(?i)vCPUs + +# CedarDB-specific terms +(?i)Codecs +(?i)Codegen +(?i)Colibri +(?i)autocommit +(?i)csvview +(?i)qpm + +# Person names (researchers and contributors) +Alfons +Amdahl +Boncz +Kahan +Kemper +Kohn +Marsalek +Neumann +Viktor + +# Hugo shortcode names +callout +relref + +# Example dataset words +catness +dogness + +# Technical notation and abbreviations used in prose +(?i)signup +(?i)infos +(?i)inlines +(?i)Goto +(?i)Mul +(?i)config +(?i)monday +exa +fd +th +N +n +dev +env diff --git a/.vale-styles/config/vocabularies/CedarDB/reject.txt b/.vale-styles/config/vocabularies/CedarDB/reject.txt new file mode 100644 index 00000000..e69de29b diff --git a/.vale.ini b/.vale.ini new file mode 100644 index 00000000..850a23b9 --- /dev/null +++ b/.vale.ini @@ -0,0 +1,10 @@ +StylesPath = .vale-styles + +MinAlertLevel = error + 
+Packages = Google, Hugo + +Vocab = CedarDB + +[*.md] +BasedOnStyles = Vale, Google diff --git a/Structure.md b/Structure.md index cf76dbcc..8a55acd4 100644 --- a/Structure.md +++ b/Structure.md @@ -62,7 +62,7 @@ to reference pages for feature details. ### clients/ -Organized into three sub-sections: +Organized into three subsections: - **Language folders** (`python/`, `javascript/`, `java/`, etc.) — one folder per programming language. The folder's `_index.md` covers the primary driver or @@ -280,7 +280,7 @@ jump directly to it. 8. **Is it a configuration parameter?** Go to `references/configuration.md` -9. **Is it a multi-step procedure or integration task?** +9. **Is it a multistep procedure or integration task?** Go to `cookbook/.md` 10. **Is it about connecting with a programming language driver or ORM?** @@ -327,13 +327,13 @@ jump directly to it. Within every directory, sub-pages are ordered alphabetically by their `linkTitle` (falling back to `title`). This applies to both the sidebar and -to any hand-written member list in an `_index.md`. +to any handwritten member list in an `_index.md`. -- The sidebar uses `.ByLinkTitle` for sub-levels via a site override of the +- The sidebar uses `.ByLinkTitle` for sublevels via a site override of the Hextra `sidebar.html` partial (`layouts/partials/sidebar.html`). The top-level nav continues to use `weight` so editorial ordering (Get Started first, Licensing last) is preserved. -- Hand-written bullet lists in `_index.md` pages must mirror the sidebar +- Handwritten bullet lists in `_index.md` pages must mirror the sidebar order — list items alphabetically by the visible link text. Keep them in sync when adding or renaming pages. - Do not set per-page `weight:` on sub-pages to force a different order. If diff --git a/Styleguide.md b/Styleguide.md index 4001ad5b..5d304a9a 100644 --- a/Styleguide.md +++ b/Styleguide.md @@ -163,7 +163,7 @@ strengths. Include benchmarks or explanations where relevant. demonstrated. 
Never use meaningless names like `t`, `a`, `b`, `foo`. 6. **One concept per example.** Do not combine unrelated features. 7. **Keep it short.** Under 10 lines preferred. 15 lines maximum for complex - features (transactions with savepoints, multi-step migrations). + features (transactions with savepoints, multistep migrations). ### Pattern @@ -305,7 +305,7 @@ fix it immediately in the same PR. Do not file "fix docs later" tickets. warning (behavior that might surprise the reader or cause data loss). - Limit to one admonition per page section. If you need more, revise the prose to incorporate the information directly. -- Use markdown tables for structured information. Keep cells simple. If a cell +- Use Markdown tables for structured information. Keep cells simple. If a cell needs more than one sentence, use prose instead. --- diff --git a/content/clients/_index.md b/content/clients/_index.md index 67878470..55803dec 100644 --- a/content/clients/_index.md +++ b/content/clients/_index.md @@ -4,7 +4,7 @@ weight: 40 --- CedarDB is compatible with most features of PostgreSQL-compatible clients and drivers out of the box. -If you want to use CedarDB in combination with a specific framework or language, take a look at the following sub pages. +If you want to use CedarDB in combination with a specific framework or language, take a look at the following sub-pages. Programming languages: diff --git a/content/clients/csharp/_index.md b/content/clients/csharp/_index.md index 51478f58..8cc5bea6 100644 --- a/content/clients/csharp/_index.md +++ b/content/clients/csharp/_index.md @@ -117,7 +117,7 @@ LOG: 1000000 rows (0.000016 s parsing, 0.000273 s compilation, 1.250094 s transm ``` {{< callout type="info" >}} -We recommend using binary copy mode as it significantly faster than text mode due to its terser encoding. +We recommend using binary copy mode as it is significantly faster than text mode due to its terser encoding. 
{{< /callout >}} ## Batching diff --git a/content/clients/java/_index.md b/content/clients/java/_index.md index ff369e3f..c74f8b0a 100644 --- a/content/clients/java/_index.md +++ b/content/clients/java/_index.md @@ -15,8 +15,8 @@ Note that you can simply [download](https://jdbc.postgresql.org/download/) the l wget https://jdbc.postgresql.org/download/postgresql-42.7.3.jar ``` -After finishing the client (see at the full program at the bottom of the program), we need to first compile our java program with `javac` and then execute the class with the right classpath. -This examples assumes that the java program has the name `CedarDBClient`. +After finishing the client (see the full program at the bottom of the page), we need to first compile our Java program with `javac` and then execute the class with the right classpath. +This example assumes that the Java program has the name `CedarDBClient`. ```bash export CLASSPATH=".:postgresql-42.7.3.jar" diff --git a/content/clients/javascript/prisma.md b/content/clients/javascript/prisma.md index 41fdadc3..ebf4481c 100644 --- a/content/clients/javascript/prisma.md +++ b/content/clients/javascript/prisma.md @@ -32,7 +32,7 @@ Then enable TCP connections: ./cedardb mydb --address=:: ``` -For more details, see the [install guide]({{< relref "/get_started/install_locally.md" >}}). +For more details, see the [installation guide]({{< relref "/get_started/install_locally.md" >}}). ## Installing diff --git a/content/clients/python/_index.md b/content/clients/python/_index.md index 6533bf9a..3b237881 100644 --- a/content/clients/python/_index.md +++ b/content/clients/python/_index.md @@ -61,7 +61,7 @@ Be careful: To make sure that data is persisted, you - have to explicitly call the commit method of your connection object (like we did above) **or** - let the connection object go out of scope without encountering an exception **or** -- explictly enable autocommit for your connection (`autocommit=True`).
+- explicitly enable autocommit for your connection (`autocommit=True`). If you don't do anything of the above, your transaction will be rolled back and all data you thought you did insert will be discarded. {{< /callout >}} diff --git a/content/clients/rust/_index.md b/content/clients/rust/_index.md index dd6eadbb..c7e4ad9e 100644 --- a/content/clients/rust/_index.md +++ b/content/clients/rust/_index.md @@ -42,7 +42,7 @@ client.execute("insert into chatlog values ($1, $2, $3)", &[&7, &"(☞゚∀゚) ``` {{< callout type="info" >}} -Under the hood, the Rust values are converted from/to Postgres types via the +Under the hood, the Rust values are converted from/to PostgreSQL types via the [postgres_types](https://docs.rs/postgres-types/latest/postgres_types/trait.ToSql.html#types) crate. This example converts the parameters `i32` to an `integer`, a `&str` to a `text` and a `chrono::DateTime` to a `timestamptz`. diff --git a/content/clients/tools/grafana.md b/content/clients/tools/grafana.md index 47d680ea..6004888f 100644 --- a/content/clients/tools/grafana.md +++ b/content/clients/tools/grafana.md @@ -33,7 +33,7 @@ If you want to update your dashboards more often, open `/etc/grafana/grafana.ini min_refresh_interval = 100ms ``` -Afterwards, restart Grafana to load the setting: +Afterward, restart Grafana to load the setting: ```shell sudo systemctl restart grafana-server @@ -112,11 +112,11 @@ PostgreSQL Version: 15 Min time interval: 100ms ``` -Then click on "Save & test". You will get an error message "Internal Server Error" with the message `ERROR: schema "information_schema" does not exist` in the CedarDB logs. This is expected behaviour and fine for now. +Then click on "Save & test". You will get an error message "Internal Server Error" with the message `ERROR: schema "information_schema" does not exist` in the CedarDB logs. This is expected behavior and fine for now. 
### Build a dashboard -On the top right of the data source window, click on "Build a dashbaord", and then "Add visualization". Choose your new "CedarDB" data source. +On the top right of the data source window, click on "Build a dashboard", and then "Add visualization". Choose your new "CedarDB" data source. In the Query builder, toggle the "Code" view on the far right and enter the statement @@ -136,7 +136,7 @@ When choosing the correct interval (e.g., "Last 10 minutes") you should see an a ### Fiddle with the refresh intervals -If you have changed the minimum refresh interval of Grafana earlier, you can set the auto refresh interval in the top right to a lower value (e.g., 100ms). Rerun your `watch` command in your psql shell: +If you have changed the minimum refresh interval of Grafana earlier, you can set the auto refresh interval in the top right to a lower value (e.g., `100ms`). Rerun your `watch` command in your psql shell: ```sql insert into test values(current_timestamp, (random() * 100)::int); diff --git a/content/cookbook/aurora_debezium.md b/content/cookbook/aurora_debezium.md index 9b566ddb..0fbafb49 100644 --- a/content/cookbook/aurora_debezium.md +++ b/content/cookbook/aurora_debezium.md @@ -23,7 +23,7 @@ If you do not already know your requirements, we recommend using the `m6id.2xlar The rest of this instruction manual assumes you use Ubuntu 24.04 as your operating system. Since CedarDB runs inside its own docker image, you can choose any other OS as well but you might have to adapt the installation instructions accordingly. {{< callout type="info" >}} -Configure the EBS volume where your root partition is mounted to be large enough to hold all of the data Debezium needs to store its CDC events. +Configure the EBS volume where your root partition is mounted to be large enough to hold all the data Debezium needs to store its CDC events. 
By default, it retains all events for one week and there will be one message per insert/update/delete of all replicated tables. For playing around, the default of 8 GiB is fine. {{< /callout >}} @@ -56,7 +56,7 @@ sudo apt update sudo apt install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin ``` -Before you can docker commands, you need to add your user to the docker group and re-login: +Before you can run Docker commands, you need to add your user to the docker group and re-login: ```shell sudo adduser ubuntu docker diff --git a/content/cookbook/aws_dms.md b/content/cookbook/aws_dms.md index 44281119..4d3d7731 100644 --- a/content/cookbook/aws_dms.md +++ b/content/cookbook/aws_dms.md @@ -52,10 +52,10 @@ This **RDS _parameter group_** which will enable logical replication to DMS. ![Enable logical replication](./img/rds_parameter_group_5.jpg) -## Deploy a new RDS Postgres instance +## Deploy a new RDS PostgreSQL instance -**If you want to use your existing RDS Postgres instance**, skip this section and head down to -[Configure your existing RDS Postgres instance](#configure-your-existing-rds-postgres-instance). +**If you want to use your existing RDS PostgreSQL instance**, skip this section and head down to +[Configure your existing RDS PostgreSQL instance](#configure-your-existing-rds-postgresql-instance). ![Navigate to RDS](./img/rds_create_postgres_0.jpg) @@ -79,9 +79,9 @@ This **RDS _parameter group_** which will enable logical replication to DMS. ![Create database](./img/rds_create_postgres_9.jpg) -Now, wait a while for the RDS Postgres instance to come up. +Now, wait a while for the RDS PostgreSQL instance to come up.
-## Configure your existing RDS Postgres instance +## Configure your existing RDS PostgreSQL instance ![RDS Modify Step 1](./img/rds_modify_1.jpg) diff --git a/content/cookbook/importing_from_json.md b/content/cookbook/importing_from_json.md index 39ff2dfd..e142beeb 100644 --- a/content/cookbook/importing_from_json.md +++ b/content/cookbook/importing_from_json.md @@ -26,7 +26,7 @@ create table stars_json (star json); copy stars_json from 'stars.json'; ``` -Now you can use the json documents in SQL queries: +Now you can use the JSON documents in SQL queries: ```sql select star->>'name' as name from stars_json where star->>'gender' = 'F'; diff --git a/content/cookbook/importing_from_postgresql.md b/content/cookbook/importing_from_postgresql.md index 67a39e1e..f8b48650 100644 --- a/content/cookbook/importing_from_postgresql.md +++ b/content/cookbook/importing_from_postgresql.md @@ -114,7 +114,7 @@ If you want the path to be relative to the **client**, precede the command with Note that this incurs some network overhead as the data is sent via the PostgreSQL wire protocol over the psql connection. -The csv import is currently single-threaded, as CedarDB has to correctly handle newlines and escapes. If you are sure that your strings don't contain newlines **and** don't contain the delimiter, you can instead import in text mode which is multi-threaded and thus **much** faster: +The csv import is currently single-threaded, as CedarDB has to correctly handle newlines and escapes. 
If you are sure that your strings don't contain newlines **and** don't contain the delimiter, you can instead import in text mode which is multithreaded and thus **much** faster: ```sql copy {tablename} from 'your/path/{tablename}.csv' with(format text, delimiter '|', null ''); diff --git a/content/cookbook/read_replica_tutorial.md b/content/cookbook/read_replica_tutorial.md index 10abe4bf..0f9c0cdd 100644 --- a/content/cookbook/read_replica_tutorial.md +++ b/content/cookbook/read_replica_tutorial.md @@ -1,5 +1,5 @@ --- -title: "Tutorial: CedarDB as Postgres Read-Replica" +title: "Tutorial: CedarDB as PostgreSQL Read-Replica" linkTitle: "Setting Up Read Replication" weight: 30 draft: true @@ -53,7 +53,7 @@ Next, we ensure the source PostgreSQL is correctly configured to publish changes ### Create the tables you want to replicate -If not already existing, create the table(s) you want to replicate *from* in your PostgreSQL system like this: +If not already existing, create the tables you want to replicate *from* in your PostgreSQL system like this: ```sql create table foo(a integer, b integer); diff --git a/content/cookbook/working_with_csv.md b/content/cookbook/working_with_csv.md index 143b6188..ebc7310e 100644 --- a/content/cookbook/working_with_csv.md +++ b/content/cookbook/working_with_csv.md @@ -53,7 +53,7 @@ If you want the path to be relative to the **client**, precede the command with Note that this incurs some network overhead as the data is sent via the PostgreSQL wire protocol over the psql connection. -The csv import is currently single-threaded, as CedarDB has to correctly handle newlines and escapes. If you are sure that your strings don't contain newlines **and** don't contain the delimiter, as is the case for our example dataset, you can instead import in text mode which is multi-threaded and thus **much** faster: +The csv import is currently single-threaded, as CedarDB has to correctly handle newlines and escapes. 
If you are sure that your strings don't contain newlines **and** don't contain the delimiter, as is the case for our example dataset, you can instead import in text mode which is multithreaded and thus **much** faster: ```sql copy movies from 'your/path/movies.csv' with(format text, delimiter ',', null '', header); diff --git a/content/example_datasets/glove.md b/content/example_datasets/glove.md index 108bdf48..8db96ddb 100644 --- a/content/example_datasets/glove.md +++ b/content/example_datasets/glove.md @@ -8,7 +8,7 @@ You can use it to check out CedarDB's vector capabilities. {{< callout type="info" >}} This example uses the syntax of the `pgvector` PostgreSQL extension. -CedarDB implements compatible vector support, so this example can run in both CedarDB as well as PostgreSQL. +CedarDB implements compatible vector support, so this example can run in both CedarDB and PostgreSQL. {{< /callout >}} ## The Dataset @@ -36,7 +36,7 @@ cedar -0.035741 0.30627 -0.89386 -0.42192 0.4423 -0.0031244 0.1343 -0.1627 -0.56 You can download the dataset from the [GloVe project website](https://nlp.stanford.edu/projects/glove/). There are multiple versions with differing training sets and vectors of different dimensionalities. -Let's choose the biggest dataset "Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors". +Let's choose the biggest dataset "Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors)". To obtain the data simply run diff --git a/content/example_datasets/nasdaq.md b/content/example_datasets/nasdaq.md index a1521075..8fc9d7ce 100644 --- a/content/example_datasets/nasdaq.md +++ b/content/example_datasets/nasdaq.md @@ -14,7 +14,7 @@ High-frequency traders, quantitative analysts, and institutional investors depen To this end, exchanges like the NASDAQ or NYSE offer live market data which interested parties can then consume with specialized tools to recreate the exchange state, the so-called *order book* in real time. 
The holy grail of this live market data is the Level III order book data. It not only shows the current market price of any given stock but all orders that are currently active. -The exact format of *orders* differ between exchanges, but it generally has the following format: +The exact format of *orders* differs between exchanges, but it generally has the following format: | ID | Ticker | Timestamp | Side | Quantity | Price | PrevOrderId | | -- | ------ | ----------- | ---- | -------- | ------ | ----------- | diff --git a/content/get_started/install_with_docker.md b/content/get_started/install_with_docker.md index ab9025f6..008fe50d 100644 --- a/content/get_started/install_with_docker.md +++ b/content/get_started/install_with_docker.md @@ -38,7 +38,7 @@ postgres= SELECT 1 as foo; ## Configuration -The above setup is perfrect for quick testing, but you'll likely want more control. +The above setup is perfect for quick testing, but you'll likely want more control. Here are a few ways to configure CedarDB in Docker. ### Make the database persistent @@ -80,7 +80,7 @@ docker run --rm -p 5432:5432 \ cedardb/cedardb ``` -This command creates a superuser `test` with password `test` and a a database named `db`. +This command creates a superuser `test` with password `test` and a database named `db`. Connect like this: diff --git a/content/licensing.md b/content/licensing.md index 45ae0bd1..8688a1e5 100644 --- a/content/licensing.md +++ b/content/licensing.md @@ -25,7 +25,7 @@ For more information on setting configuration options, see our [configuration re {{< tabs >}} {{< tab name="Configuration File (preferred)" >}} -Add a line with your license key to the CedarDB configuration file. The server will automatically load it at startup. In this example, we will use the default configuration path, which is automatically loaded at CedarDB startup when no other config file is specified. +Add a line with your license key to the CedarDB configuration file. 
The server will automatically load it at startup. In this example, we will use the default configuration path, which is automatically loaded at CedarDB startup when no other config file is specified. ```shell echo "\"license.key\" = \"\"" >> ~/.cedardb/config diff --git a/content/references/advanced/_index.md b/content/references/advanced/_index.md index c3d09869..426ae517 100644 --- a/content/references/advanced/_index.md +++ b/content/references/advanced/_index.md @@ -4,7 +4,7 @@ weight: 60 --- CedarDB supports many advanced features. -The following sub pages give an overview over the most important ones and explain how to use them most effectively. +The following sub-pages give an overview over the most important ones and explain how to use them most effectively. * [AsOf joins](asof_join) * [Benchmarking](benchmarking) diff --git a/content/references/advanced/gs.md b/content/references/advanced/gs.md index ce67136e..a72a4299 100644 --- a/content/references/advanced/gs.md +++ b/content/references/advanced/gs.md @@ -67,7 +67,7 @@ gcloud compute instances create cedardb_cloud_storage \ ## Cost considerations -Google cloud object storage is typically cheaper per GiB than local SSD, but requests and network egress are billed separately. +Google Cloud object storage is typically cheaper per GiB than local SSD, but requests and network egress are billed separately. CedarDB writes large objects to keep per-request overhead low; still, budget for PUT, GET, and listing operations as your workload scales. See GCS pricing for storage classes, operations, and network egress. Note that it is important to co-locate the storage bucket and the instance (same region) to avoid any network cost. @@ -75,7 +75,7 @@ Otherwise, expensive egress cost will be charged which may dominate the overall ## CREATE SERVER Script -You can create the create server statement with the help of the following python script. 
+You can create the create server statement with the help of the following Python script. Just download the JSON key of your service account and specify the bucket and region when running the script. ```shell diff --git a/content/references/advanced/parquet.md b/content/references/advanced/parquet.md index 90974122..70d5b6d4 100644 --- a/content/references/advanced/parquet.md +++ b/content/references/advanced/parquet.md @@ -86,7 +86,7 @@ SELECT * FROM parquet_schema('test.parquet'); (3 rows) ``` -Print the parquet file meta data footer: +Print the parquet file metadata footer: ```sql SELECT * FROM parquet_file('test.parquet'); @@ -127,8 +127,8 @@ SELECT * FROM parquet_colchunks('test.parquet'); ## Performance Considerations CedarDB's parquet scan is optimized for full parquet file imports. -The scan is fully multi-threaded and only reads the columns that are queried by the user. -We do not yet push-down filters into the parquet rowgroups to prune based on parquet statistics and metadata. +The scan is fully multithreaded and only reads the columns that are queried by the user. +We do not yet push down filters into the parquet rowgroups to prune based on parquet statistics and metadata. Thus, you should always prefer importing the columns you need into CedarDB over working on the parquet file directly. ## Implementation Status @@ -138,7 +138,7 @@ This page summarizes the available features supported by the CedarDB Parser. 
### Legend - 🟢 **Supported** -- 🟡 **Partially suported**: Details for partial support +- 🟡 **Partially supported**: Details for partial support - 🔴 **Not yet supported** ### Physical Types diff --git a/content/references/advanced/pgvector.md b/content/references/advanced/pgvector.md index 3f94ca78..055f496b 100644 --- a/content/references/advanced/pgvector.md +++ b/content/references/advanced/pgvector.md @@ -4,7 +4,7 @@ weight: 20 --- CedarDB supports working with vectors using the syntax from the [pgvector -Postgres extension](https://github.com/pgvector/pgvector). +PostgreSQL extension](https://github.com/pgvector/pgvector). All vectors are represented as a bracket-enclosed, comma-separated list of float values with the SQL type `vector`. You can optionally specify the number diff --git a/content/references/advanced/prepare.md b/content/references/advanced/prepare.md index 50a4dbd9..54c3ebf1 100644 --- a/content/references/advanced/prepare.md +++ b/content/references/advanced/prepare.md @@ -31,7 +31,7 @@ deallocate add; Names of prepared statements are case-*insensitive*: `ADD`, `adD` and `add` refer to the same statement. {{< /callout >}} -## Why you might want prepare your statements +## Why you might want to prepare your statements Prepared statements especially shine in two use cases: diff --git a/content/references/configuration.md b/content/references/configuration.md index 4b172355..7135bedf 100644 --- a/content/references/configuration.md +++ b/content/references/configuration.md @@ -115,7 +115,7 @@ These settings must be set before starting CedarDB. ## Degree of parallelism CedarDB also uses _all_ threads of the system for best performance. -This is intended behaviour, but might generate high load on your machine. +This is intended behavior, but might generate high load on your machine. If you want to keep other applications responsive, consider starting CedarDB with `nice`. Alternatively, you can limit the number of threads CedarDB uses. 
Note, however, that this will limit the performance of CedarDB, since all queries will take advantage of the full diff --git a/content/references/datatypes/blob.md b/content/references/datatypes/blob.md index 973ecca8..b706a4c4 100644 --- a/content/references/datatypes/blob.md +++ b/content/references/datatypes/blob.md @@ -28,11 +28,11 @@ select * from example; (3 rows) ``` -While binary blobs can hold arbitrary data of up to 4GB, we recommend to avoid storing overly large data within your database. +While binary blobs can hold arbitrary data of up to 4 GB, we recommend to avoid storing overly large data within your database. For reading queries, CedarDB optimizes columns that are not accessed and blob columns that are not part of the query have no performance impact. However, such columns still come with downsides when modifying data due to the transactional consistency guarantees that CedarDB provides. -As an alternative, consider storing large files (>1MB) on the file system or a cloud object store like S3, and only +As an alternative, consider storing large files (>1 MB) on the file system or a cloud object store like S3, and only store a file path or URL in a `text` field. diff --git a/content/references/datatypes/boolean.md b/content/references/datatypes/boolean.md index 20cff0ca..0e047f6e 100644 --- a/content/references/datatypes/boolean.md +++ b/content/references/datatypes/boolean.md @@ -48,7 +48,7 @@ The underlying reason for this ternary logic is that `null` values are considere expression `42 < null` results in a `null` value, since we don't know how an arbitrary value compares to 42. Equality comparisons between values follow the same rules, i.e., `x = null` will always result in another `null`, even when the value of `x` is `null` as well. -To opt-out of this behaviour you can use `x is null`, which never returns `null`. +To opt-out of this behavior you can use `x is null`, which never returns `null`. 
The syntax to compare two values and consider `nulls` equal is somewhat verbose: `x is not distinct from y`. However, while most functions return `null` for any `null` input, for boolean logic, e.g., in `and`, a single `false` diff --git a/content/references/datatypes/date.md b/content/references/datatypes/date.md index a64ac0b2..5aeb4314 100644 --- a/content/references/datatypes/date.md +++ b/content/references/datatypes/date.md @@ -33,7 +33,7 @@ select due_date from example; |------------:|------------:| | -4712-01-01 | 99999-12-31 | -Storing values outside of the supported range will result in an overflow exception. +Storing values outside the supported range will result in an overflow exception. Operations on dates are range checked, so that e.g., overflows will never cause wrong results. ## Input diff --git a/content/references/datatypes/enums.md b/content/references/datatypes/enums.md index d75cf7c9..2014dfcb 100644 --- a/content/references/datatypes/enums.md +++ b/content/references/datatypes/enums.md @@ -22,14 +22,14 @@ create type importance as enum ('minor', 'major', 'critical'); ## Usage of Enum types -Enums can be used just like any other type inside of tables, views, queries, etc. . +Enums can be used just like any other type inside of tables, views, queries, etc. ```sql create table tasks (id int, priority importance); insert into tasks values (1, 'major'), (2, 'minor'), (3, 'critical'), (4, 'major'); ``` -The enum labels are case sensitive, whereas the enum names are not. +The enum labels are case-sensitive, whereas the enum names are not. This does not work: diff --git a/content/references/datatypes/float.md b/content/references/datatypes/float.md index 405c9d12..ca419bc3 100644 --- a/content/references/datatypes/float.md +++ b/content/references/datatypes/float.md @@ -5,7 +5,7 @@ weight: 13 --- The types `double precision`, `float`, and `real` are floating-point numbers. 
Depending on the precision, they are -stored in an four or eight-byte +stored in a four or eight-byte [IEEE 754](https://en.wikipedia.org/wiki/IEEE_754) format. {{< callout type="warning" >}} diff --git a/content/references/datatypes/integer.md b/content/references/datatypes/integer.md index c3b312c6..7a44962e 100644 --- a/content/references/datatypes/integer.md +++ b/content/references/datatypes/integer.md @@ -39,7 +39,7 @@ select id from example; | `integer` | $-2^{31}$ | $2^{31}-1$ | | `bigint` | $-2^{63}$ | $2^{63}-1$ | -Storing values outside of the supported ranges will result in an overflow exception. +Storing values outside the supported ranges will result in an overflow exception. Operations on integers are range checked, so that e.g., numeric overflows will never cause wrong results. To avoid overflows, it might be necessary to cast to a type that can represent a larger range. diff --git a/content/references/datatypes/numeric.md b/content/references/datatypes/numeric.md index d37084d1..55aacb2f 100644 --- a/content/references/datatypes/numeric.md +++ b/content/references/datatypes/numeric.md @@ -48,7 +48,7 @@ Operations on 16 Byte types are expensive to compute. We recommend using a precision of 18 or less when possible for your application. {{< /callout >}} -Storing values outside of the supported ranges will result in an overflow exception. +Storing values outside the supported ranges will result in an overflow exception. Operations on numerics are range checked, so that e.g., numeric overflows will never cause wrong results. ## Precision and Scale Changes in Operations @@ -71,7 +71,7 @@ As numerics in CedarDB have a maximum precision of 38, the resulting precisions ### Rules for All Operations Except Multiplication and Division -If the resulting precision exceeds 38, it is clipped to 38 and the scale is reduced by this amount. If the scale would become negative, it is instead set to 0. 
+If the resulting precision exceeds 38, it is clipped to 38 and the scale is reduced by this amount. If the scale becomes negative, it is instead set to 0. For example, if the resulting precision were 42 and the scale were 6, the precision would be reduced by 4 to 38, and the scale would also be reduced by 4 to 2. ### Rules for Multiplication and Divisions diff --git a/content/references/datatypes/uuid.md b/content/references/datatypes/uuid.md index b70faf8d..a1460a6e 100644 --- a/content/references/datatypes/uuid.md +++ b/content/references/datatypes/uuid.md @@ -81,7 +81,7 @@ select uuid '{a0eebc99-9c0b4ef8-bb6d6bb9-bd380a11}'; ### uuid_extract_version -Provided with a valid UUID, `uuid_extract_version` extracts the version in a `smallint`. Otherwise the function returns `NULL`. +Provided with a valid UUID, `uuid_extract_version` extracts the version in a `smallint`. Otherwise, the function returns `NULL`. ```sql select uuid_extract_version(gen_random_uuid()); @@ -99,7 +99,7 @@ NULL ### uuid_extract_timestamp -`uuid_extract_timestamp` extracts the timestamp with time zone of a uuid of version 1 or 7. Otherwise, the function returns `NULL`. +`uuid_extract_timestamp` extracts the timestamp with time zone of a UUID of version 1 or 7. Otherwise, the function returns `NULL`. ```sql SET timezone = 'Europe/Berlin'; --The timestamp displayed depends on the timezone diff --git a/content/references/dml/upsert.md b/content/references/dml/upsert.md index b0976d88..9eaeb2dd 100644 --- a/content/references/dml/upsert.md +++ b/content/references/dml/upsert.md @@ -74,7 +74,7 @@ ERROR: conflict with concurrent transaction This might sound counterintuitive at first, since we have an execution path even for conflicts. However, the problem for upserts are *uncommitted* changes of other concurrent transactions. Since we don't know if these changes will be committed or rolled back, the upsert can neither proceed with inserting -or updating the row. +nor updating the row. 
In contrast to PostgreSQL, CedarDB does not lock the values for writes, since this might indefinitely block the transaction. diff --git a/content/references/expressions/_index.md b/content/references/expressions/_index.md index 50145a87..314f3000 100644 --- a/content/references/expressions/_index.md +++ b/content/references/expressions/_index.md @@ -11,7 +11,7 @@ CedarDB supports many expressions that manipulate data: * And (`&&`) * At time zone * Between, between symmetric -* Bit and (`&`) on [bitsrings](/docs/references/functions/bitstring#bit--bit) +* Bit and (`&`) on [bitstrings](/docs/references/functions/bitstring#bit--bit) * Bit or (`|`) on [bitstrings](/docs/references/functions/bitstring#bit--bit-1) * Bit xor (`#`) on [bitstrings](/docs/references/functions/bitstring#bit--bit-2) * Case-insensitive like (`ilike`, `~~*`), negated (`not ilike`, `!~~*`) diff --git a/content/references/functions/advanced_functions/_index.md b/content/references/functions/advanced_functions/_index.md index 363327c3..20e33511 100644 --- a/content/references/functions/advanced_functions/_index.md +++ b/content/references/functions/advanced_functions/_index.md @@ -29,13 +29,13 @@ The filename is relative to the execution path of the CedarDB server, not to the CSV options need to be encoded into a single string, with special characters, e.g. the `"` surrounding delimiters, encoded. The optional third parameter allows to specify the schema of the CSV file, which is autodetected by default. -This can be useful if the user has additional information on the properties of the csv file, such as non-nullable colunms. +This can be useful if the user has additional information on the properties of the CSV file, such as non-nullable columns. ## Advanced Analytical Functions ### kmeans -CedarDB provides an optimized implementation to cluster points with any number of dimensions using the k-Means clustering algorithm with an euclidean distance. 
+CedarDB provides an optimized implementation to cluster points with any number of dimensions using the k-Means clustering algorithm with a Euclidean distance. Example: diff --git a/content/references/functions/json.md b/content/references/functions/json.md index 9d72bbe0..c4b3af2a 100644 --- a/content/references/functions/json.md +++ b/content/references/functions/json.md @@ -92,7 +92,7 @@ select data::text from json_data limit 1; For `jsonb` columns, CedarDB stores *semantically* equivalent documents, so you might get a *syntactically* different text representation in a `text::jsonb::text` conversion. In contrast, `json` columns are stored in a plain text representation, where such a conversion is character-by-character -equivalent, but the access operations are slower, since they need to re-parse the JSON string. +equivalent, but the access operations are slower, since they need to reparse the JSON string. ## Arrays @@ -117,7 +117,7 @@ To relationalize arrays, you can use the `json_array_elements()` function, which multiple rows with the elements of the array. This is similar to the `unnest()` function for SQL arrays. -For the example, you can get a `friends_with` relation from the json array: +For the example, you can get a `friends_with` relation from the JSON array: ```sql select data->'id', json_array_elements(data->'friends') @@ -153,7 +153,7 @@ select data->'name' from json_data where jsonb_contains(data, '{"friends": [2]}' (1 row) ``` -The `@>` operator performs the same operation when applied to json data. +The `@>` operator performs the same operation when applied to JSON data. The `jsonb_exists` function and the equivalent `?` operator can determine if a given jsonb document has a given text as an object key or as an array value. 
diff --git a/content/references/functions/timestamp.md b/content/references/functions/timestamp.md index 61e9be69..53ddf9ff 100644 --- a/content/references/functions/timestamp.md +++ b/content/references/functions/timestamp.md @@ -9,7 +9,7 @@ functions. ## Functions and Operators -### to_char +### `to_char` The `to_char` function formats a timestamp (with or without time zone) into a string based on a user-specified format pattern. diff --git a/content/references/objects/indexes.md b/content/references/objects/indexes.md index 8bb68c72..64c0d5a3 100644 --- a/content/references/objects/indexes.md +++ b/content/references/objects/indexes.md @@ -58,7 +58,7 @@ Specifying the sort order is useful to support top-k queries. When the order of the top-k query matches an index, CedarDB will use a matching index: ```sql --- this query will be eligable to use the index +-- this query will be eligible to use the index ... order by customer_id, article_id limit 10; ``` diff --git a/content/references/objects/policies.md b/content/references/objects/policies.md index 723fffdc..99a0afea 100644 --- a/content/references/objects/policies.md +++ b/content/references/objects/policies.md @@ -78,7 +78,7 @@ DROP POLICY [ IF EXISTS ] name ON table_name Dropping the last policy on a table does not disable RLS — the default-deny behavior remains active until `ALTER TABLE ... DISABLE ROW LEVEL SECURITY` is called. -## row_security_active +## `row_security_active` `row_security_active(table_name)` returns whether RLS policies will be applied for the given table and the current user. Policies are active when: @@ -95,7 +95,7 @@ SELECT row_security_active('secrets'); -- true ``` -## row_security session setting +## `row_security` session setting Setting `row_security = off` causes CedarDB to throw an error if any table in the query has active RLS, rather than silently filtering rows. This is useful for full-database dumps where filtered output would produce an incomplete backup. 
diff --git a/content/references/objects/roles.md b/content/references/objects/roles.md index 1f4c58ea..cf7733ea 100644 --- a/content/references/objects/roles.md +++ b/content/references/objects/roles.md @@ -23,20 +23,20 @@ create user admin with createdb createrole password 'admin'; The create role statement can be used with multiple of the following options. Direct options can be specified for both create and alter role: -* superuser, nosuperuser -* createdb, nocreatedb -* createrole, nocreaterole -* inherit, noinherit -* login, nologin -* replication, noreplication -* connection limit connlimit (currently not enforced) -* password 'password', password null +* `superuser`, `nosuperuser` +* `createdb`, `nocreatedb` +* `createrole`, `nocreaterole` +* `inherit`, `noinherit` +* `login`, `nologin` +* `replication`, `noreplication` +* `connection limit connlimit` (currently not enforced) +* `password 'password'`, `password null` When creating a new role, you can additionally specify the hierarchy of its group memberships: -* in role role_name, ... -* role role_name, ... -* admin role_name, ... +* `in role role_name, ...` +* `role role_name, ...` +* `admin role_name, ...` ### Permissions diff --git a/content/roadmap.md b/content/roadmap.md index 9d95b864..24e93493 100644 --- a/content/roadmap.md +++ b/content/roadmap.md @@ -8,7 +8,7 @@ This page provides a brief overview of the most important CedarDB [database](#da and [enterprise](#enterprise-features) features that are next on our roadmap. Information about intermediate status of features and minor features will be available in our release notes. -We categorise the status of the features on this page as planned, in progress, and fully available: +We categorize the status of the features on this page as planned, in progress, and fully available: | **State** | **Icon** | |--------------------|:----------------------:| @@ -40,7 +40,7 @@ CedarDB. 
| **Feature** | **State** | **Details** | |------------------------------|:----------------------:|-------------------------------------------------------| | AsOf joins | {{< icondone >}} | [Documentation](/docs/references/advanced/asof_join/) | -| Fulltext search | {{< iconplanned >}} | | +| Full-text search | {{< iconplanned >}} | | | Enhanced graph query support | {{< iconplanned >}} | | | Range types | {{< iconinprogress >}} | | | Schema evolution | {{< iconinprogress >}} | [Documentation](/docs/references/objects/) |