diff --git a/mddocs/Makefile b/mddocs/Makefile
new file mode 100644
index 00000000..d4bb2cbb
--- /dev/null
+++ b/mddocs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = .
+BUILDDIR = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/mddocs/_static/custom.css b/mddocs/_static/custom.css
new file mode 100644
index 00000000..a48da9ce
--- /dev/null
+++ b/mddocs/_static/custom.css
@@ -0,0 +1,6 @@
+.logo {
+ width: 200px !important;
+}
+.sidebar-brand-text {
+ text-align: center;
+}
diff --git a/mddocs/_static/icon.svg b/mddocs/_static/icon.svg
new file mode 100644
index 00000000..a2301814
--- /dev/null
+++ b/mddocs/_static/icon.svg
@@ -0,0 +1,11 @@
+
diff --git a/mddocs/_static/icon_original.svg b/mddocs/_static/icon_original.svg
new file mode 100644
index 00000000..b7e195bb
--- /dev/null
+++ b/mddocs/_static/icon_original.svg
@@ -0,0 +1,98 @@
+
+
+
+
diff --git a/mddocs/_static/logo.svg b/mddocs/_static/logo.svg
new file mode 100644
index 00000000..9fd9fa93
--- /dev/null
+++ b/mddocs/_static/logo.svg
@@ -0,0 +1,26 @@
+
diff --git a/mddocs/_static/logo_original.svg b/mddocs/_static/logo_original.svg
new file mode 100644
index 00000000..b1e9da39
--- /dev/null
+++ b/mddocs/_static/logo_original.svg
@@ -0,0 +1,182 @@
+
+
+
+
diff --git a/mddocs/_static/logo_wide.svg b/mddocs/_static/logo_wide.svg
new file mode 100644
index 00000000..b356ecfc
--- /dev/null
+++ b/mddocs/_static/logo_wide.svg
@@ -0,0 +1,28 @@
+
diff --git a/mddocs/_static/logo_wide_original.svg b/mddocs/_static/logo_wide_original.svg
new file mode 100644
index 00000000..42779fbe
--- /dev/null
+++ b/mddocs/_static/logo_wide_original.svg
@@ -0,0 +1,198 @@
+
+
+
+
diff --git a/mddocs/_static/logo_wide_white_text.svg b/mddocs/_static/logo_wide_white_text.svg
new file mode 100644
index 00000000..a2f97022
--- /dev/null
+++ b/mddocs/_static/logo_wide_white_text.svg
@@ -0,0 +1,28 @@
+
diff --git a/mddocs/_static/logo_with_text.svg b/mddocs/_static/logo_with_text.svg
new file mode 100644
index 00000000..68806607
--- /dev/null
+++ b/mddocs/_static/logo_with_text.svg
@@ -0,0 +1,27 @@
+
diff --git a/mddocs/_static/logo_with_text_original.svg b/mddocs/_static/logo_with_text_original.svg
new file mode 100644
index 00000000..88ac4b14
--- /dev/null
+++ b/mddocs/_static/logo_with_text_original.svg
@@ -0,0 +1,197 @@
+
+
+
+
diff --git a/mddocs/_static/logo_with_white_text.svg b/mddocs/_static/logo_with_white_text.svg
new file mode 100644
index 00000000..a538714e
--- /dev/null
+++ b/mddocs/_static/logo_with_white_text.svg
@@ -0,0 +1,27 @@
+
diff --git a/mddocs/_static/metrics.prom b/mddocs/_static/metrics.prom
new file mode 100644
index 00000000..0a831744
--- /dev/null
+++ b/mddocs/_static/metrics.prom
@@ -0,0 +1 @@
+# Generated in CI
diff --git a/mddocs/_static/swagger_http2kafka.html b/mddocs/_static/swagger_http2kafka.html
new file mode 100644
index 00000000..99d5cf25
--- /dev/null
+++ b/mddocs/_static/swagger_http2kafka.html
@@ -0,0 +1,26 @@
+
+
+
+
+
+
+ DataRentgen HTTP2Kafka - SwaggerUI
+
+
+
+
+
+
+
+
+
diff --git a/mddocs/_static/swagger_server.html b/mddocs/_static/swagger_server.html
new file mode 100644
index 00000000..6bfe9c51
--- /dev/null
+++ b/mddocs/_static/swagger_server.html
@@ -0,0 +1,26 @@
+
+
+
+
+
+
+ DataRentgen REST API - SwaggerUI
+
+
+
+
+
+
+
+
+
diff --git a/mddocs/changelog.md b/mddocs/changelog.md
new file mode 100644
index 00000000..21342cc5
--- /dev/null
+++ b/mddocs/changelog.md
@@ -0,0 +1,3 @@
+# Changelog
+
+!include(../../changelog/index.md)
diff --git a/mddocs/changelog/0.1.0.md b/mddocs/changelog/0.1.0.md
new file mode 100644
index 00000000..c57769da
--- /dev/null
+++ b/mddocs/changelog/0.1.0.md
@@ -0,0 +1,3 @@
+# 0.1.0 (2024-12-25) { #0.1.0 }
+
+🎉 Data.Rentgen first release! 🎉
diff --git a/mddocs/changelog/0.2.0.md b/mddocs/changelog/0.2.0.md
new file mode 100644
index 00000000..435a3b4c
--- /dev/null
+++ b/mddocs/changelog/0.2.0.md
@@ -0,0 +1,830 @@
+# 0.2.0 (2025-03-25) { #0.2.0 }
+
+## TL;DR
+
+* Implemented column lineage support.
+* HDFS/S3 partitions are now truncated from table path.
+* Added total run/operation statistics (input/output bytes, rows, files).
+* Lineage graph UX improvements.
+* Kafka -> consumer integrations improvements.
+
+## Breaking Changes
+
+* Change response schema of `GET /operations`. ([#158](https://github.com/MobileTeleSystems/data-rentgen/issues/158))
+
+ Operation properties are moved to `data` key, added new `statistics` key.
+ This allows to show operation statistics in UI without building up lineage graph.
+
+### Operations response examples
+
+=== Before
+
+ ```json
+ {
+ "meta": {
+ // ...
+ },
+ "items": [
+ {
+ "kind": "OPERATION",
+ "id": "00000000-0000-0000-0000-000000000000",
+ "name": "abc",
+ "description": "some",
+ // ...
+ }
+ ],
+ }
+ ```
+
+=== to
+
+ ```json
+ {
+ "meta": {
+ // ...
+ },
+ "items": [
+ {
+ "id": "00000000-0000-0000-0000-000000000000",
+ "data": {
+ "id": "00000000-0000-0000-0000-000000000000",
+ "name": "abc",
+ "description": "some",
+ // ...
+ },
+ "statistics": {
+ "inputs": {
+ "total_datasets": 2,
+ "total_bytes": 123456,
+ "total_rows": 100,
+ "total_files": 0,
+ },
+ "outputs": {
+ "total_datasets": 2,
+ "total_bytes": 123456,
+ "total_rows": 100,
+ "total_files": 0,
+ },
+ },
+ }
+ ],
+ }
+ ```
+
+* Change response schema of `GET /runs`. ([#159](https://github.com/MobileTeleSystems/data-rentgen/issues/159))
+
+ Run properties are moved to `data` key, added new `statistics` key.
+ This allows to show run statistics in UI without building up lineage graph.
+
+### Runs response examples
+
+=== Before:
+
+ ```json
+ {
+ "meta": {
+ // ...
+ },
+ "items": [
+ {
+ "kind": "RUN",
+ "id": "00000000-0000-0000-0000-000000000000",
+ "external_id": "abc",
+ "description": "some",
+ // ...
+ }
+ ],
+ }
+ ```
+
+=== to
+
+ ```json
+ {
+ "meta": {
+ // ...
+ },
+ "items": [
+ {
+ "id": "00000000-0000-0000-0000-000000000000",
+ "data": {
+ "id": "00000000-0000-0000-0000-000000000000",
+ "external_id": "abc",
+ "description": "some",
+ // ...
+ },
+ "statistics": {
+ "inputs": {
+ "total_datasets": 2,
+ "total_bytes": 123456,
+ "total_rows": 100,
+ "total_files": 0,
+ },
+ "outputs": {
+ "total_datasets": 2,
+ "total_bytes": 123456,
+ "total_rows": 100,
+ "total_files": 0,
+ },
+ "operations": {
+ "total_operations": 10,
+ },
+ },
+ }
+ ],
+ }
+ ```
+
+* Change response schema of `GET /locations`. ([#160](https://github.com/MobileTeleSystems/data-rentgen/issues/160))
+
+ Location properties are moved to `data` key, added new `statistics` key.
+ This allows to show location statistics in UI.
+
+### Locations response examples
+
+=== Before
+
+ ```json
+ {
+ "meta": {
+ // ...
+ },
+ "items": [
+ {
+ "kind": "LOCATION",
+ "id": 123,
+ "name": "rnd_dwh",
+ "type": "hdfs",
+ // ...
+ }
+ ],
+ }
+ ```
+
+=== to
+
+ ```json
+ {
+ "meta": {
+ // ...
+ },
+ "items": [
+ {
+ "id": "123",
+ "data": {
+ "id": "123",
+ "name": "rnd_dwh",
+ "type": "hdfs",
+ // ...
+ },
+ "statistics": {
+ "datasets": {"total_datasets": 2},
+ "jobs": {"total_jobs": 0},
+ },
+ }
+ ],
+ }
+ ```
+
+ Same for `PATCH /locations/:id`:
+
+### Location response examples
+
+=== Before
+
+ ```json
+ {
+ "kind": "LOCATION",
+ "id": 123,
+ "name": "abc",
+ // ...
+ }
+ ```
+
+=== after
+
+ ```json
+ {
+ "id": "123",
+ "data": {
+ "id": "123",
+ "name": "abc",
+ // ...
+ },
+ "statistics": {
+ "datasets": {"total_datasets": 2},
+ "jobs": {"total_jobs": 0},
+ },
+ }
+ ```
+
+* Change response schema of `GET /datasets`. ([#161](https://github.com/MobileTeleSystems/data-rentgen/issues/161))
+
+ Dataset properties are moved to `data` key.
+ This makes API response more consistent with others (e.g. `GET /runs`, `GET /operations`).
+
+ ### Response examples
+
+=== Before
+
+ ```json
+ {
+ "meta": {
+ // ...
+ },
+ "items": [
+ {
+ "kind": "DATASET",
+ "id": 123,
+ "name": "abc",
+ // ...
+ }
+ ],
+ }
+ ```
+
+=== to
+
+ ```json
+ {
+ "meta": {
+ // ...
+ },
+ "items": [
+ {
+ "id": "123",
+ "data": {
+ "id": "123",
+ "name": "abc",
+ // ...
+ },
+ }
+ ],
+ }
+ ```
+
+* Change response schema of `GET /jobs`. ([#162](https://github.com/MobileTeleSystems/data-rentgen/issues/162))
+
+ Job properties are moved to `data` key.
+ This makes API response more consistent with others (e.g. `GET /runs`, `GET /operations`).
+
+### Jobs response examples
+
+=== Before
+
+ ```json
+ {
+ "meta": {
+ // ...
+ },
+ "items": [
+ {
+ "kind": "JOB",
+ "id": 123,
+ "name": "abc",
+ // ...
+ }
+ ],
+ }
+ ```
+
+=== after
+
+ ```json
+ {
+ "meta": {
+ // ...
+ },
+ "items": [
+ {
+ "id": "123",
+ "data": {
+ "id": "123",
+ "name": "abc",
+ // ...
+ },
+ }
+ ],
+ }
+ ```
+
+* Change response schema of `GET /:entity/lineage`. ([#164](https://github.com/MobileTeleSystems/data-rentgen/issues/164))
+
+ List of all nodes (e.g. `list[Node]`) is split by node type, and converted to map (e.g. `dict[str, Dataset]`, `dict[str, Job]`).
+
+ List of all relations (e.g. `list[Relation]`) is split by relation type (e.g. `list[DatasetSymlink]`, `list[Input]`).
+
+### Lineage response examples
+
+=== Before
+
+ ```json
+ {
+ "relations": [
+ {
+ "kind": "PARENT",
+ "from": {"kind": "JOB", "id": 123},
+ "to": {"kind": "RUN", "id": "00000000-0000-0000-0000-000000000000"},
+ },
+ {
+ "kind": "SYMLINK",
+ "from": {"kind": "DATASET", "id": 234},
+ "to": {"kind": "DATASET", "id": 999},
+ },
+ {
+ "kind": "INPUT",
+ "from": {"kind": "DATASET", "id": 234},
+ "to": {"kind": "OPERATION", "id": "11111111-1111-1111-1111-111111111111"},
+ },
+ {
+ "kind": "OUTPUT",
+ "from": {"kind": "OPERATION", "id": "11111111-1111-1111-1111-111111111111"},
+ "to": {"kind": "DATASET", "id": 234},
+ },
+ ],
+ "nodes": [
+ {"kind": "DATASET", "id": 123, "name": "abc"},
+ {"kind": "JOB", "id": 234, "name": "cde"},
+ {
+ "kind": "RUN",
+ "id": "00000000-0000-0000-0000-000000000000",
+ "external_id": "def",
+ },
+ {
+ "kind": "OPERATION",
+ "id": "11111111-1111-1111-1111-111111111111",
+ "name": "efg",
+ },
+ ],
+ }
+ ```
+
+=== after
+
+ ```json
+ {
+ "relations": {
+ "parents": [
+ {
+ "from": {"kind": "JOB", "id": "123"},
+ "to": {"kind": "RUN", "id": "00000000-0000-0000-0000-000000000000"},
+ },
+ ],
+ "symlinks": [
+ {
+ "from": {"kind": "DATASET", "id": "234"},
+ "to": {"kind": "DATASET", "id": "999"},
+ },
+ ],
+ "inputs": [
+ {
+ "from": {"kind": "DATASET", "id": "234"},
+ "to": {
+ "kind": "OPERATION",
+ "id": "11111111-1111-1111-1111-111111111111",
+ },
+ },
+ ],
+ "outputs": [
+ {
+ "from": {
+ "kind": "OPERATION",
+ "id": "11111111-1111-1111-1111-111111111111",
+ },
+ "to": {"kind": "DATASET", "id": "234"},
+ },
+ ],
+ },
+ "nodes": {
+ "datasets": {
+ "123": {"id": "123", "name": "abc"},
+ },
+ "jobs": {
+ "234": {"id": "234", "name": "cde"},
+ },
+ "runs": {
+ "00000000-0000-0000-0000-000000000000": {
+ "id": "00000000-0000-0000-0000-000000000000",
+ "external_id": "def",
+ },
+ },
+ "operations": {
+ "11111111-1111-1111-1111-111111111111": {
+ "id": "11111111-1111-1111-1111-111111111111",
+ "name": "efg",
+ },
+ },
+ },
+ }
+ ```
+
+ This allows to replace filters on UI side with O(n) complexity like:
+
+ ```javascript
+ // O(n)
+ relations.filter((relation) => relation.kind == "INPUT" && relation.from.kind == "DATASET" && relation.from.id == dataset_id)
+ // again O(n)
+ nodes.filter((node) => node.kind == "DATASET" && node.id == dataset_id)
+ ```
+
+ with much more effective ones:
+
+ ```javascript
+ // O(n) with much smaller n
+ relations.inputs.filter((relation) => relation.from.kind == "DATASET" && relation.from.id == dataset_id)
+ // O(1)
+ nodes.datasets[dataset_id]
+ ```
+
+ The size of output JSON is not much different.
+
+ Note that dataset, job and location ids in all responses were converted from ints to strings, because in JSON object keys have to be strings.
+
+ Also nodes and relations don’t have `kind` field anymore.
+
+* Change `DATA_RENTGEN__KAFKA__BOOTSTRAP_SERVERS` value type from string (one `host:port` item) to list (`["host1:port1", "host2:port2"]`). ([#183](https://github.com/MobileTeleSystems/data-rentgen/issues/183))
+* Sync `DATA_RENTGEN__KAFKA__SECURITY__TYPE` values with Kafka client values, e.g. `scram-sha256` -> `SCRAM-SHA-256`. ([#183](https://github.com/MobileTeleSystems/data-rentgen/issues/183))
+
+## Features
+
+* Consumer now can capture and save OpenLineage column lineage. ([#155](https://github.com/MobileTeleSystems/data-rentgen/issues/155))
+
+ It is highly recommended to update to OpenLineage 1.23 and use `columnLineage.datasetLineageEnabled=true`,
+ to reduce both event JSON size and CPU load on the DataRentgen consumer.
+
+* Include sum inputs & outputs to lineage responses. ([#171](https://github.com/MobileTeleSystems/data-rentgen/issues/171))
+
+ For example, if user asked for lineage with `granularity=OPERATION`, include inputs & outputs with detalization to `RUN` (sum of all included operations by `run_id`) and `JOB` (sum of all included operations by `job_id`).
+ This allows to show that specific operation is some specific percent of all operations within this run or job.
+
+* Add column lineage to lineage responses `GET /:entity/lineage`. ([#172](https://github.com/MobileTeleSystems/data-rentgen/issues/172))
+
+ ### Another lineage response examples
+
+=== Before
+
+ ```json
+ {
+ "relations": {
+ "parents": [
+ {
+ "from": {"kind": "JOB", "id": "123"},
+ "to": {"kind": "RUN", "id": "00000000-0000-0000-0000-000000000000"},
+ },
+ ],
+ "symlinks": [
+ {
+ "from": {"kind": "DATASET", "id": "234"},
+ "to": {"kind": "DATASET", "id": "999"},
+ },
+ ],
+ "inputs": [
+ {
+ "from": {"kind": "DATASET", "id": "234"},
+ "to": {
+ "kind": "OPERATION",
+ "id": "11111111-1111-1111-1111-111111111111",
+ },
+ },
+ ],
+ "outputs": [
+ {
+ "from": {
+ "kind": "OPERATION",
+ "id": "11111111-1111-1111-1111-111111111111",
+ },
+ "to": {"kind": "DATASET", "id": "234"},
+ },
+ ],
+ },
+ "nodes": {
+ "datasets": {
+ "123": {"id": "123", "name": "abc"},
+ },
+ "jobs": {
+ "234": {"id": "234", "name": "cde"},
+ },
+ "runs": {
+ "00000000-0000-0000-0000-000000000000": {
+ "id": "00000000-0000-0000-0000-000000000000",
+ "external_id": "def",
+ },
+ },
+ "operations": {
+ "11111111-1111-1111-1111-111111111111": {
+ "id": "11111111-1111-1111-1111-111111111111",
+ "name": "efg",
+ },
+ },
+ },
+ }
+ ```
+
+=== after
+
+ ```json
+ {
+ "relations": {
+ "parents": [
+ {
+ "from": {"kind": "JOB", "id": "123"},
+ "to": {"kind": "RUN", "id": "00000000-0000-0000-0000-000000000000"},
+ },
+ ],
+ "symlinks": [
+ {
+ "from": {"kind": "DATASET", "id": "234"},
+ "to": {"kind": "DATASET", "id": "999"},
+ },
+ ],
+ "inputs": [
+ {
+ "from": {"kind": "DATASET", "id": "234"},
+ "to": {
+ "kind": "OPERATION",
+ "id": "11111111-1111-1111-1111-111111111111",
+ },
+ },
+ ],
+ "outputs": [
+ {
+ "from": {
+ "kind": "OPERATION",
+ "id": "11111111-1111-1111-1111-111111111111",
+ },
+ "to": {"kind": "DATASET", "id": "234"},
+ },
+ ],
+ // Here |
+ // v
+ "direct_column_lineage": [
+ {
+ "from": {"kind": "DATASET", "id": "234"},
+ "to": {"kind": "DATASET", "id": "235"},
+ "fields": {
+ "target_column_1": [
+ {
+ "field": "direct_source_column_1",
+ "last_used_at": "2008-09-15T15:53:00+05:00",
+ "types": [
+ "TRANSFORMATION_MASKING",
+ "AGGREGATION",
+ ],
+ },
+ {
+ "field": "direct_source_column_2",
+ "last_used_at": "2008-09-15T15:53:00+05:00",
+ "types": [
+ "AGGREGATION",
+ ],
+ },
+ ],
+ "target_column_2": [
+ {
+ "field": "direct_source_column_1",
+ "last_used_at": "2008-09-15T15:53:00+05:00",
+ "types": [
+ "TRANSFORMATION_MASKING",
+ "AGGREGATION",
+ ],
+ },
+ ]
+ },
+ },
+ ],
+ "indirect_column_lineage": [
+ {
+
+ "from": {"kind": "DATASET", "id": "234"},
+ "to": {"kind": "DATASET", "id": "235"},
+ "fields": [
+ {
+ "field": "indirect_source_column_1",
+ "last_used_at": "2008-09-15T15:53:00+05:00",
+ "types": ["JOIN"],
+ },
+ ]
+ },
+ ],
+ },
+ "nodes": {
+ "datasets": {
+ "123": {"id": "123", "name": "abc"},
+ },
+ "jobs": {
+ "234": {"id": "234", "name": "cde"},
+ },
+ "runs": {
+ "00000000-0000-0000-0000-000000000000": {
+ "id": "00000000-0000-0000-0000-000000000000",
+ "external_id": "def",
+ },
+ },
+ "operations": {
+ "11111111-1111-1111-1111-111111111111": {
+ "id": "11111111-1111-1111-1111-111111111111",
+ "name": "efg",
+ },
+ },
+ },
+ }
+ ```
+
+* Add support for Kafka GSSAPI auth. ([#183](https://github.com/MobileTeleSystems/data-rentgen/issues/183))
+* Allow fetching `GET /v1/runs?since=...` without `search_query` query param. ([#184](https://github.com/MobileTeleSystems/data-rentgen/issues/184))
+
+## Improvements
+
+* Fix multiple `proxyUrl` for spark facet with `master="yarn"`. ([#154](https://github.com/MobileTeleSystems/data-rentgen/issues/154))
+
+ When a Spark application sends lineage, the `proxyUrl` field may come in this form:
+
+ `http://node-mn-0001.msk.mts.ru:8088/proxy/application_1733,http://node-mn-0002.msk.mts.ru:8088/proxy/application_7400`
+
+ We use only the first one (before `,`).
+
+* Add dataset name parsing for removing partition-like part from name. ([#175](https://github.com/MobileTeleSystems/data-rentgen/issues/175))
+
+=== Before
+
+ Two different datasets:
+
+ ```python
+ Dataset(name="/app/warehouse/somedb.db/sometable/business_dt=2025-01-01/reg_id=99")
+ Dataset(name="/app/warehouse/somedb.db/sometable/business_dt=2025-02-01/reg_id=99")
+ ```
+
+=== After
+
+ Two partitions are merged into one dataset:
+
+ ```python
+ Dataset(name="/app/warehouse/somedb.db/sometable")
+ ```
+
+* Change logic for output/input dataset schema in lineage response. ([#185](https://github.com/MobileTeleSystems/data-rentgen/issues/185))
+
+ Added schema `relevance_type` field to response: `EXACT_MATCH` and `LATEST_KNOWN`.
+
+ `EXACT_MATCH` - when the last and first (ordered by created_at ascending) schema_ids are the same.
+ `LATEST_KNOWN` - when the last and first are not the same; in this case the last schema_id is returned.
+
+### Dataset in lineage response examples
+
+=== Before
+
+ ```json
+ {
+ "relations": {
+ "direct_column_lineage": [],
+ "indirect_column_lineage": [],
+ "inputs": [
+ {
+ "from": {
+ "id": "2697",
+ "kind": "DATASET"
+ },
+ "last_interaction_at": "2025-03-14T15:22:30.572000Z",
+ "num_bytes": 13166146,
+ "num_files": 240,
+ "num_rows": 22793,
+ "schema": {
+ "fields": [
+ {
+ "description": null,
+ "fields": [],
+ "name": "dt",
+ "type": "timestamp"
+ },
+ {
+ "description": null,
+ "fields": [],
+ "name": "customer_id",
+ "type": "decimal(20,0)"
+ },
+ {
+ "description": null,
+ "fields": [],
+ "name": "total_spent",
+ "type": "float"
+ }
+ ],
+ "id": "1418"
+ },
+ "to": {
+ "id": "1260",
+ "kind": "JOB"
+ }
+ },
+ {
+ "from": {
+ "id": "3300",
+ "kind": "DATASET"
+ },
+ "last_interaction_at": "2025-03-17T08:45:58.439000Z",
+ "num_bytes": 13060345,
+ "num_files": 112,
+ "num_rows": 13723,
+ "schema": null,
+ "to": {
+ "id": "0195a347-fa5f-7a72-aa14-bc510fadfd3a",
+ "kind": "RUN"
+ }
+ }
+ ]
+ }
+ }
+ ```
+
+=== to
+
+ ```json
+ {
+ "relations": {
+ "direct_column_lineage": [],
+ "indirect_column_lineage": [],
+ "inputs": [
+ {
+ "from": {
+ "id": "2697",
+ "kind": "DATASET"
+ },
+ "last_interaction_at": "2025-03-14T15:22:30.572000Z",
+ "num_bytes": 13166146,
+ "num_files": 240,
+ "num_rows": 22793,
+ "schema": {
+ "fields": [
+ {
+ "description": null,
+ "fields": [],
+ "name": "dt",
+ "type": "timestamp"
+ },
+ {
+ "description": null,
+ "fields": [],
+ "name": "customer_id",
+ "type": "decimal(20,0)"
+ },
+ {
+ "description": null,
+ "fields": [],
+ "name": "total_spent",
+ "type": "float"
+ }
+ ],
+ "id": "1418",
+ "relevance_type": "EXACT_MATCH" // <--
+ },
+ "to": {
+ "id": "1260",
+ "kind": "JOB"
+ }
+ },
+ {
+ "from": {
+ "id": "3300",
+ "kind": "DATASET"
+ },
+ "last_interaction_at": "2025-03-17T08:45:58.439000Z",
+ "num_bytes": 13060345,
+ "num_files": 112,
+ "num_rows": 13723,
+ "schema": {
+ "fields": [
+ {
+ "description": null,
+ "fields": [],
+ "name": "dt",
+ "type": "timestamp"
+ },
+ {
+ "description": null,
+ "fields": [],
+ "name": "customer_id",
+ "type": "decimal(20,0)"
+ },
+ {
+ "description": null,
+ "fields": [],
+ "name": "total_spent",
+ "type": "float"
+ }
+ ],
+ "id": "1657",
+ "relevance_type": "LATEST_KNOWN" // <--
+ },
+ "to": {
+ "id": "0195a347-fa5f-7a72-aa14-bc510fadfd3a",
+ "kind": "RUN"
+ }
+ }
+ ]
+ }
+ }
+ ```
+
+* Remove partition part of the name from dataset names. ([#188](https://github.com/MobileTeleSystems/data-rentgen/issues/188))
+* Remove datasets and symlinks from lineage response which have no inputs or outputs. ([#189](https://github.com/MobileTeleSystems/data-rentgen/issues/189))
+* Add `/v1/auth/logout` endpoint to KeycloakAuthProvider. ([#192](https://github.com/MobileTeleSystems/data-rentgen/issues/192))
diff --git a/mddocs/changelog/0.2.1.md b/mddocs/changelog/0.2.1.md
new file mode 100644
index 00000000..a9f2f073
--- /dev/null
+++ b/mddocs/changelog/0.2.1.md
@@ -0,0 +1,7 @@
+# 0.2.1 (2025-04-07) { #0.2.1 }
+
+## Improvements
+
+- Reduce image size x2
+- Change docker image user from `root` to `data-rentgen`, to improve security.
+- SBOM file is generated on release.
diff --git a/mddocs/changelog/0.3.0.md b/mddocs/changelog/0.3.0.md
new file mode 100644
index 00000000..f532aed4
--- /dev/null
+++ b/mddocs/changelog/0.3.0.md
@@ -0,0 +1,150 @@
+# 0.3.0 (2025-07-04) { #0.3.0 }
+
+## Features
+
+- Improved support for `openlineage-airflow` ([#210](https://github.com/MobileTeleSystems/data-rentgen/issues/210)).
+
+ Before we tracked only DAG and Task start/stop events, but not lineage.
+ Now we store lineage produced by Airflow Operators like `SQLExecuteQueryOperator`.
+- Added support for `openlineage-flink` ([#214](https://github.com/MobileTeleSystems/data-rentgen/issues/214)).
+- Added support for `openlineage-hive` ([#245](https://github.com/MobileTeleSystems/data-rentgen/issues/245)).
+- Added support for `openlineage-dbt` ([#223](https://github.com/MobileTeleSystems/data-rentgen/issues/223)).
+- Add `DATASET` granularity for `GET /api/datasets/lineage` ([#235](https://github.com/MobileTeleSystems/data-rentgen/issues/235)).
+- Store SQL queries received from OpenLineage integrations. ([#213](https://github.com/MobileTeleSystems/data-rentgen/issues/213), [#218](https://github.com/MobileTeleSystems/data-rentgen/issues/218)).
+
+## Breaking changes
+
+- Change `Output.type` in `GET /api/*/lineage` response from `Enum` to `List[Enum]` ([#222](https://github.com/MobileTeleSystems/data-rentgen/issues/222)).
+
+### ENUM in lineage response examples
+
+=== Before
+
+ ```python
+ {
+ "nodes": {...},
+ "relations": {
+ "outputs": [
+ {
+ "from": {"kind": "JOB", "id": 3981},
+ "to": {"kind": "DATASET", "id": 8400},
+ "types": "OVERWRITE", # <---
+ ...
+ ]
+ },
+ }
+ ```
+
+=== After
+
+ ```python
+ {
+ "nodes": {...},
+ "relations": {
+ "outputs": [
+ {
+ "from": {"kind": "JOB", "id": 3981},
+ "to": {"kind": "DATASET", "id": 8400},
+ "types": ["OVERWRITE", "DROP", "TRUNCATE"], # <---
+ ...
+ ]
+ },
+ }
+ ```
+
+ We use the output schema, if any, then fall back to the input schema.
+
+- Moved `Input.schema` and `Output.schema` to `Dataset.schema` in `GET /api/*/lineage` response ([#249](https://github.com/MobileTeleSystems/data-rentgen/issues/249)).
+
+### Schema in lineage response examples
+
+=== Before
+
+ ```python
+ {
+ "nodes": {
+ "datasets": {
+ "8400": {
+ "id": "8400",
+ "location": {...},
+ "name": "dataset_name",
+ ...
+ }
+
+ },
+ "relations": {
+ "outputs": [
+ {
+ "from": {"kind": "JOB", "id": 3981},
+ "to": {"kind": "DATASET", "id": 8400},
+ "types": "OVERWRITE",
+ "schema": { # <---
+ "id": "10062",
+ "fields": [ ... ],
+ "relevance_type": "EXACT_MATCH"
+ ]
+ ]
+ },
+ }
+ ```
+
+=== After
+
+ ```python
+ {
+ "nodes": {
+ "datasets": {
+ "8400": {
+ "id": "25896",
+ "location": {...},
+ "name": "dataset_name",
+ "schema": { # <---
+ "id": "10062",
+ "fields": [...],
+ "relevance_type": "EXACT_MATCH"
+ },
+ ...
+ }
+ }
+ ...
+ },
+ "relations": {
+ "outputs": [
+ {
+ "from": {"kind": "JOB", "id": 3981},
+ "to": {"kind": "DATASET", "id": 8400},
+ "types": ["OVERWRITE", "DROP", "TRUNCATE"],
+ ...
+ ]
+ },
+ }
+ ```
+
+## Improvements
+
+- Added `cleanup_partitions.py` script to automate the cleanup of old table partitions ([#254](https://github.com/MobileTeleSystems/data-rentgen/issues/254)).
+- Added `data_rentgen.db.seed` script which creates example data in database ([#257](https://github.com/MobileTeleSystems/data-rentgen/issues/257)).
+- Speedup fetching `Run` and `Operation` from database by id ([#247](https://github.com/MobileTeleSystems/data-rentgen/issues/247)).
+- Speedup consuming OpenLineage events from Kafka ([#236](https://github.com/MobileTeleSystems/data-rentgen/issues/236)).
+- Make consumer message parsing more robust ([#204](https://github.com/MobileTeleSystems/data-rentgen/issues/204)).
+
+ Previously malformed OpenLineage events (JSON) lead to skipping the entire message batch read from Kafka.
+ Now messages are parsed separately, and malformed ones are sent back to the `input.runs__malformed` Kafka topic.
+- Improve storing lineage data for long running operations ([#253](https://github.com/MobileTeleSystems/data-rentgen/issues/253)).
+
+### Description
+
+ Previously if operation was running for a long time (more than a day, Flink streaming jobs can easily run for months or years),
+ and the lineage graph was built for the last day, there were no Flink job/run/operation in the graph.
+
+ This is because we created input/output/column lineage at operation start,
+ and `RUNNING` events of the same operation (checkpoints) were just updating the same row statistics.
+
+ Now we create new input/output/column lineage row for checkpoints events as well.
+ But only one row for each hour since operation was started, as increasing number of rows slows down lineage graph resolution.
+
+ For short-lived operations (most of batch operations take less than hour) behavior remains unchanged.
+
+## Bug Fixes
+
+- Fix Airflow 3.x DAG and Task url template ([#227](https://github.com/MobileTeleSystems/data-rentgen/issues/227)).
diff --git a/mddocs/changelog/0.3.1.md b/mddocs/changelog/0.3.1.md
new file mode 100644
index 00000000..c3cc6c28
--- /dev/null
+++ b/mddocs/changelog/0.3.1.md
@@ -0,0 +1,9 @@
+# 0.3.1 (2025-07-04) { #0.3.1 }
+
+## Breaking changes
+
+- Drop `Dataset.format` field.
+
+## Improvements
+
+- Added syntax highlighting for SQL queries.
diff --git a/mddocs/changelog/0.4.0.md b/mddocs/changelog/0.4.0.md
new file mode 100644
index 00000000..7b24f45e
--- /dev/null
+++ b/mddocs/changelog/0.4.0.md
@@ -0,0 +1,169 @@
+# 0.4.0 (2025-10-03) { #0.4.0 }
+
+## Features
+
+- Introduce new [http2kafka][http2kafka] component. ([#281](https://github.com/MobileTeleSystems/data-rentgen/issues/281))
+
+ It allows using DataRentgen with OpenLineage HttpTransport.
+ Authentication is done using personal tokens.
+
+- Add REST API endpoints for managing personal tokens. ([#276](https://github.com/MobileTeleSystems/data-rentgen/issues/276))
+
+List of endpoints:
+
+- `GET /personal-tokens` - get personal tokens for current user.
+- `POST /personal-tokens` - create new personal token for current user.
+- `PATCH /personal-tokens/:id` - refresh personal token (revoke token and create new one).
+- `DELETE /personal-tokens/:id` - revoke personal token.
+
+- Add new entities `Tag` and `TagValue`. ([#268](https://github.com/MobileTeleSystems/data-rentgen/issues/268))
+
+ Tags can be used as additional properties for another entities.
+ This feature is still under construction.
+
+- Added endpoint `GET /v1/tags`. ([#289](https://github.com/MobileTeleSystems/data-rentgen/issues/289))
+
+ Tag names and values can be paginated, searched by, or fetched by ids.
+
+### Response example
+
+```json
+[
+ {
+ "id": 1,
+ "name": "env",
+ "values": [
+ {
+ "id": 1,
+ "value": "dev"
+ },
+ {
+ "id": 2,
+ "value": "prod"
+ }
+ ]
+ }
+]
+
+```
+
+- Updated `GET /v1/datasets` to include `tags: [...]` in response. ([#289](https://github.com/MobileTeleSystems/data-rentgen/issues/289))
+
+### Dataset response examples
+
+Before:
+
+```python
+{
+ "id": "8400",
+ "location": {...},
+ "name": "dataset_name",
+ "schema": {},
+}
+```
+
+After:
+
+```python
+{
+ "id": "25896",
+ "location": {...},
+ "name": "dataset_name",
+ "schema": {...},
+ "tags": [ # <---
+ {
+ "id": "1",
+ "name": "environment",
+ "values": [
+ {
+ "id": "2",
+ "value": "production"
+ }
+ ]
+ },
+ {
+ "id": "2",
+ "name": "team",
+ "values": [
+ {
+ "id": "4",
+ "value": "my_awesome_team"
+ }
+ ]
+ }
+ ]
+}
+```
+
+- Added new filters to `GET /v1/datasets` endpoint. ([#294](https://github.com/MobileTeleSystems/data-rentgen/issues/294), [#289](https://github.com/MobileTeleSystems/data-rentgen/issues/289))
+
+Query params:
+
+- location_id: `int`
+- tag_value_id: `list[int]` - if multiple values are passed, dataset should have all of them.
+
+- Added new filters for `GET /v1/jobs` endpoint. ([#319](https://github.com/MobileTeleSystems/data-rentgen/issues/319))
+
+Query params:
+
+- location_id: `int`
+- job_type: `list[str]`
+
+- Added new filters to `GET /v1/runs` endpoint. ([#322](https://github.com/MobileTeleSystems/data-rentgen/issues/322), [#323](https://github.com/MobileTeleSystems/data-rentgen/issues/323))
+
+Query params
+
+- job_type: `list[str]`
+- status: `list[RunStatus]`
+- started_since: `datetime | None`
+- started_until: `datetime | None`
+- ended_since: `datetime | None`
+- ended_until: `datetime | None`
+- job_location_id: `int | None`
+- started_by_user: `list[str] | None`
+
+- Added new endpoint `GET /v1/jobs/types`. ([#319](https://github.com/MobileTeleSystems/data-rentgen/issues/319))
+
+- Add custom `dataRentgen_run` and `dataRentgen_operation` facets. ([#265](https://github.com/MobileTeleSystems/data-rentgen/issues/265))
+
+These facets allow to
+
+- Passing custom `external_id`, `persistent_log_url` and other fields of Run.
+- Passing custom `name`, `description`, `group`, `position` fields of Operation.
+- mark event as containing only Operation or both Run + Operation data.
+
+- Set `output.type` based on executed SQL query, e.g. `INSERT`, `UPDATE`, `DELETE`, and so on. ([#310](https://github.com/MobileTeleSystems/data-rentgen/issues/310))
+
+## Improvements
+
+- Improve consumer performance by reducing DB load on reading operations. ([#314](https://github.com/MobileTeleSystems/data-rentgen/issues/314))
+
+- Add workaround if OpenLineage emitted Spark application event with `job.name=unknown`. ([#263](https://github.com/MobileTeleSystems/data-rentgen/issues/263))
+
+ This requires installing OpenLineage with this fix merged: .
+
+- Dataset symlinks with no inputs/outputs are no longer removed from lineage graph. ([#269](https://github.com/MobileTeleSystems/data-rentgen/issues/269))
+
+- Make matching for addresses and locations more deterministic by converting them to lowercase. ([#313](https://github.com/MobileTeleSystems/data-rentgen/issues/313))
+
+ Items `oracle://host:1521` and `ORACLE://HOST:1521` are the same item `oracle://host:1521` now.
+
+- Make matching for datasets, jobs, tags and user names case-insensitive by using unique indexes on `lower(name)` expression. ([#313](https://github.com/MobileTeleSystems/data-rentgen/issues/313))
+
+ Item `database.schema.table` and `DATABASE.SCHEMA.TABLE` are the same item now.
+
+ As dataset canonical name depends on database naming convention (`UPPERCASE` for Oracle, `lowercase` for Postgres),
+ we can't convert them into one specific case (upper or lower). Instead we use first received value as canonical one.
+
+## Bug Fixes
+
+- For lineage with `granularity=DATASET` return real lineage graph. ([#264](https://github.com/MobileTeleSystems/data-rentgen/issues/264))
+
+ v0.3.x resolved lineage by `run_id`, but this may produce wrong lineage. v0.4.x now resolves lineage by `operation_id`.
+
+- Exclude self-referencing lineage edges in case `granularity=DATASET`. ([#261](https://github.com/MobileTeleSystems/data-rentgen/issues/261))
+
+ If some run uses the same table as both input and output (e.g. merging duplicates or performing some checks before writing),
+ DataRentgen excludes `dataset1 -> dataset1` relations from lineage.
+
+ This doesn't affect chains like `dataset1 -> job1 -> dataset1` or `dataset1 -> dataset2 -> dataset1`.
diff --git a/mddocs/changelog/0.4.1.md b/mddocs/changelog/0.4.1.md
new file mode 100644
index 00000000..ca1e25fe
--- /dev/null
+++ b/mddocs/changelog/0.4.1.md
@@ -0,0 +1,11 @@
+# 0.4.1 (2025-10-08) { #0.4.1 }
+
+## Features
+
+- Add new `GET /v1/locations/types` endpoint returning list of all known location types. ([#328](https://github.com/MobileTeleSystems/data-rentgen/issues/328))
+- Add new filter to `GET /v1/jobs` ([#328](https://github.com/MobileTeleSystems/data-rentgen/issues/328)):
+ - location_type: `list[str]`
+- Add new filter to `GET /v1/datasets` ([#328](https://github.com/MobileTeleSystems/data-rentgen/issues/328)):
+ - location_type: `list[str]`
+- Allow passing multiple `location_type` filters to `GET /v1/locations`. ([#328](https://github.com/MobileTeleSystems/data-rentgen/issues/328))
+- Allow passing multiple values to `GET` endpoints with filters like `job_id`, `parent_run_id`, and so on. ([#329](https://github.com/MobileTeleSystems/data-rentgen/issues/329))
diff --git a/mddocs/changelog/0.4.2.md b/mddocs/changelog/0.4.2.md
new file mode 100644
index 00000000..55a4f9d9
--- /dev/null
+++ b/mddocs/changelog/0.4.2.md
@@ -0,0 +1,10 @@
+# 0.4.2 (2025-10-29) { #0.4.2 }
+
+## Bug fixes
+
+- Fix search query filter on UI Run list page.
+- Fix passing multiple filters to `GET /v1/runs`.
+
+## Doc only Changes
+
+- Document `DATA_RENTGEN__UI__AUTH_PROVIDER` config variable.
diff --git a/mddocs/changelog/0.4.3.md b/mddocs/changelog/0.4.3.md
new file mode 100644
index 00000000..c75737dc
--- /dev/null
+++ b/mddocs/changelog/0.4.3.md
@@ -0,0 +1,11 @@
+# 0.4.3 (2025-11-21) { #0.4.3 }
+
+## Features
+
+- Disable `server.session.enabled` by default. It is required only by KeycloakAuthProvider which is not used by default.
+
+## Bug Fixes
+
+- Escape unprintable ASCII symbols in SQL queries before storing them in Postgres. Previously saving queries containing `\x00` symbol lead to exceptions.
+- Kafka topic with malformed messages doesn't have to use the same number of partitions as input topics.
+- Prevent OpenLineage from reporting events which [claim to read 8 Exabytes of data](https://github.com/OpenLineage/OpenLineage/pull/4165), this is actually a Spark quirk.
diff --git a/mddocs/changelog/0.4.4.md b/mddocs/changelog/0.4.4.md
new file mode 100644
index 00000000..8b7662e1
--- /dev/null
+++ b/mddocs/changelog/0.4.4.md
@@ -0,0 +1,5 @@
+# 0.4.4 (2025-11-21) { #0.4.4 }
+
+## Bug Fixes
+
+- Fixed an issue introduced in the 0.4.3 release which broke inputs with 0 bytes statistics.
diff --git a/mddocs/changelog/0.4.5.md b/mddocs/changelog/0.4.5.md
new file mode 100644
index 00000000..4d0510cb
--- /dev/null
+++ b/mddocs/changelog/0.4.5.md
@@ -0,0 +1,5 @@
+# 0.4.5 (2025-12-24) { #0.4.5 }
+
+## Improvements
+
+Allow disabling `SessionMiddleware`, as it is only required by `KeycloakAuthProvider`.
diff --git a/mddocs/changelog/0.4.6.md b/mddocs/changelog/0.4.6.md
new file mode 100644
index 00000000..6aefa62d
--- /dev/null
+++ b/mddocs/changelog/0.4.6.md
@@ -0,0 +1,3 @@
+# 0.4.6 (2026-01-12) { #0.4.6 }
+
+Dependency-only updates.
diff --git a/mddocs/changelog/0.4.7.md b/mddocs/changelog/0.4.7.md
new file mode 100644
index 00000000..d434ddfd
--- /dev/null
+++ b/mddocs/changelog/0.4.7.md
@@ -0,0 +1,3 @@
+# 0.4.7 (2026-01-20) { #0.4.7 }
+
+Dependency-only updates.
diff --git a/mddocs/changelog/0.4.8.md b/mddocs/changelog/0.4.8.md
new file mode 100644
index 00000000..c3cfd782
--- /dev/null
+++ b/mddocs/changelog/0.4.8.md
@@ -0,0 +1,3 @@
+# 0.4.8 (2026-01-26) { #0.4.8 }
+
+Fixed issue with updating Location's `external_id` field - server returned response code 200 but ignored the input value.
diff --git a/mddocs/changelog/0.5.0.md b/mddocs/changelog/0.5.0.md
new file mode 100644
index 00000000..0feee88f
--- /dev/null
+++ b/mddocs/changelog/0.5.0.md
@@ -0,0 +1,323 @@
+# 0.5.0 (2026-03-19) { #0.5.0 }
+
+## OpenLineage-related features
+
+### Extracting dataset & job tags
+
+[#367](https://github.com/MobileTeleSystems/data-rentgen/issues/367), [#368](https://github.com/MobileTeleSystems/data-rentgen/issues/368), [#369](https://github.com/MobileTeleSystems/data-rentgen/issues/369), [#372](https://github.com/MobileTeleSystems/data-rentgen/issues/372)
+
+Now DataRentgen extracts tags from OpenLineage events:
+
+- dataset tags (currently not reported by any integration)
+- job & run tags
+
+Some of tags are created based on engine versions:
+
+- `airflow.version`
+- `dbt.version`
+- `flink.version`
+- `hive.version`
+- `spark.version`
+- `openlineage_adapter.version`
+- `openlineage_client.version` (only for Python client v1.38.0 or higher)
+
+Note that passing job & run tags depends on integration. For example, tags can be set up for Spark, Airflow and dbt, but not for Flink or Hive.
+Also tags are configured in a different way in each integration.
+
+### Extracting `nominalTime`
+
+[#378](https://github.com/MobileTeleSystems/data-rentgen/issues/378)
+
+Now DataRentgen extracts `nominalTime` run facet, and stores values in `run.expected_start_at`, `run.expected_end_at` fields.
+
+### Extracting `jobDependencies`
+
+[#402](https://github.com/MobileTeleSystems/data-rentgen/issues/402)
+
+Now DataRentgen extracts information from [jobDependencies](https://openlineage.io/docs/spec/facets/run-facets/job_dependencies/) facet, and stores it in `job_dependency` table.
+For now this is just a simple tuple `from_dataset_id, to_dataset_id, type` (arbitrary string provided by integration, not enum).
+This can be changed in future versions of Data.Rentgen.
+
+Currently the only integration providing this kind of information is Airflow. But it is implemented only in most recent version of OpenLineage provider for Airflow ([2.10 or higher](https://github.com/apache/airflow/pull/59521)).
+For now provider also doesn't send facet with information about direct task -> task dependencies - only indirect ones are included (declared via [Asset](https://airflow.apache.org/docs/apache-airflow/stable/authoring-and-scheduling/assets.html)).
+So there is a fallback for Airflow which extracts these dependencies from `downstream_task_ids` and `upstream_task_ids` task fields.
+
+## REST API features
+
+### Added `GET /v1/jobs/hierarchy` endpoint
+
+This endpoint can be used to retrieve job hierarchy graph (parents, dependencies) for a given job. ([#407](https://github.com/MobileTeleSystems/data-rentgen/issues/407), [#412](https://github.com/MobileTeleSystems/data-rentgen/issues/412))
+
+### Response example
+
+```python
+{
+ "relations": {
+ "parents": [
+ {
+ "from": {"kind": "JOB", "id": "1"},
+ "to": {"kind": "JOB", "id": "2"}
+ }
+ ],
+ "dependencies": [
+ {
+ "from": {"kind": "JOB", "id": "3"},
+ "to": {"kind": "JOB", "id": "1"},
+ "type": "DIRECT_DEPENDENCY"
+ },
+ {
+ "from": {"kind": "JOB", "id": "1"},
+ "to": {"kind": "JOB", "id": "4"},
+ "type": "DIRECT_DEPENDENCY"
+ }
+ ]
+ },
+ "nodes": {
+ "jobs": {
+ "1": {
+ "id": 1,
+ "parent_job_id": null,
+ "name": "my_job",
+ "type": "SPARK_APPLICATION",
+ "location": {
+ "name": "my_cluster",
+ "type": "YARN"
+ }
+ },
+ "2": {
+ "id": 2,
+ "parent_job_id": 1,
+ "name": "my_job.child_task",
+ "type": "SPARK_APPLICATION",
+ "location": {
+ "name": "my_cluster",
+ "type": "YARN"
+ }
+ },
+ "3": {
+ "id": 3,
+ "parent_job_id": null,
+ "name": "source_job",
+ "type": "SPARK_APPLICATION",
+ "location": {
+ "name": "my_cluster",
+ "type": "YARN"
+ }
+ },
+ "4": {
+ "id": 4,
+ "parent_job_id": null,
+ "name": "target_job",
+ "type": "SPARK_APPLICATION",
+ "location": {
+ "name": "my_cluster",
+ "type": "YARN"
+ }
+ }
+ }
+ }
+}
+```
+
+### Added parent relation between jobs
+
+Jobs can now reference a parent job via `parent_job_id` field. ([#394](https://github.com/MobileTeleSystems/data-rentgen/issues/394))
+
+Before:
+
+### Response example relation
+
+```python
+{
+ "meta": { ... },
+ "items": [
+ {
+ "id": "42",
+ "data": {
+ "id": "42",
+ "name": "my-spark-task",
+ "type": "SPARK_APPLICATION",
+ "location": { ... }
+ }
+ }
+ ]
+}
+```
+
+After:
+
+```python
+{
+ "meta": { ... },
+ "items": [
+ {
+ "id": "42",
+ "data": {
+ "id": "42",
+ "name": "my-spark-task",
+ "type": "SPARK_APPLICATION",
+ "location": { ... },
+ "parent_job_id": "10"
+ }
+ }
+ ]
+}
+```
+
+### Added JOB-JOB and RUN-RUN relations to lineage API
+
+For example, it is possible to get Airflow DAG → Airflow Task → Spark app chain from a single response. ([#392](https://github.com/MobileTeleSystems/data-rentgen/issues/392), [#399](https://github.com/MobileTeleSystems/data-rentgen/issues/399), [#401](https://github.com/MobileTeleSystems/data-rentgen/issues/401))
+
+Before:
+
+```python
+{
+ "relations": {
+ "parents": [
+ {"from": {"kind": "JOB", "id": "1"}, "to": {"kind": "RUN", "id": "parent-run-uuid"}},
+ {"from": {"kind": "JOB", "id": "2"}, "to": {"kind": "RUN", "id": "run-uuid"}}
+ ],
+ "symlinks": [],
+ "inputs": [...],
+ "outputs": [...]
+ },
+ "nodes": {...}
+}
+```
+
+After:
+
+```python
+{
+ "relations": {
+ "parents": [
+ {"from": {"kind": "JOB", "id": "1"}, "to": {"kind": "RUN", "id": "parent-run-uuid"}},
+ {"from": {"kind": "JOB", "id": "2"}, "to": {"kind": "RUN", "id": "run-uuid"}},
+ # NEW:
+ {"from": {"kind": "JOB", "id": "1"}, "to": {"kind": "JOB", "id": "2"}},
+ {"from": {"kind": "RUN", "id": "parent-run-uuid"}, "to": {"kind": "RUN", "id": "run-uuid"}}
+ ],
+ "symlinks": [],
+ "inputs": [...],
+ "outputs": [...]
+ },
+ "nodes": {...}
+}
+```
+
+### Include `job` to `GET /v1/runs` response
+
+This allows to show job type & name for specific run without sending additional requests. [#411](https://github.com/MobileTeleSystems/data-rentgen/issues/411)
+
+Before:
+
+```python
+{
+ "meta": {
+ "page": 1,
+ "page_size": 20,
+ "total_count": 1,
+ "pages_count": 1,
+ "has_next": False,
+ "has_previous": False,
+ "next_page": None,
+ "previous_page": None,
+ },
+ "items": [
+ {
+ "id": "01908224-8410-79a2-8de6-a769ad6944c9",
+ "data": {
+ "id": "01908224-8410-79a2-8de6-a769ad6944c9",
+ "created_at": "2024-07-05T09:05:49.584000",
+ "job_id": "123",
+ ...
+ },
+ "statistics": { ... }
+ }
+ ]
+}
+```
+
+After:
+
+```python
+{
+ "meta": {
+ "page": 1,
+ "page_size": 20,
+ "total_count": 1,
+ "pages_count": 1,
+ "has_next": False,
+ "has_previous": False,
+ "next_page": None,
+ "previous_page": None,
+ },
+ "items": [
+ {
+ "id": "01908224-8410-79a2-8de6-a769ad6944c9",
+ "data": {
+ "id": "01908224-8410-79a2-8de6-a769ad6944c9",
+ "created_at": "2024-07-05T09:05:49.584000",
+ "job_id": "123",
+ ...
+ },
+ "job": {
+ "id": "123",
+ "name": "myjob",
+ ...
+ },
+ "statistics": { ... }
+ }
+ ]
+}
+```
+
+### Include `last_run` field to `GET /v1/jobs` response
+
+This allows to show last start time, status and duration for each job in the list, without additional requests. [#387](https://github.com/MobileTeleSystems/data-rentgen/issues/387)
+
+Before:
+
+```python
+{
+ "meta": { ... },
+ "items": [
+ {
+ "id": "42",
+ "data": {
+ "id": "42",
+ "name": "my-spark-task",
+ "type": "SPARK_APPLICATION",
+ "location": { ... },
+ "parent_job_id": "10"
+ }
+ }
+ ]
+}
+```
+
+After:
+
+```python
+{
+ "meta": { ... },
+ "items": [
+ {
+ "id": "42",
+ "data": {
+ "id": "42",
+ "name": "my-spark-task",
+ "type": "SPARK_APPLICATION",
+ "location": { ... },
+ "parent_job_id": "10"
+ },
+ "last_run": {
+ "id": "01908224-8410-79a2-8de6-a769ad6944c9",
+ "created_at": "2024-07-05T09:05:49.584000",
+ "job_id": "123",
+ ...
+ }
+ }
+ ]
+}
+```
diff --git a/mddocs/changelog/DRAFT.md b/mddocs/changelog/DRAFT.md
new file mode 100644
index 00000000..e69de29b
diff --git a/mddocs/changelog/NEXT_RELEASE.md b/mddocs/changelog/NEXT_RELEASE.md
new file mode 100644
index 00000000..a9831f9d
--- /dev/null
+++ b/mddocs/changelog/NEXT_RELEASE.md
@@ -0,0 +1 @@
+% towncrier release notes start
diff --git a/mddocs/changelog/index.md b/mddocs/changelog/index.md
new file mode 100644
index 00000000..b97d6bad
--- /dev/null
+++ b/mddocs/changelog/index.md
@@ -0,0 +1,17 @@
+# Changelog
+
+[0.5.0][0.5.0]
+[0.4.8][0.4.8]
+[0.4.7][0.4.7]
+[0.4.6][0.4.6]
+[0.4.5][0.4.5]
+[0.4.4][0.4.4]
+[0.4.3][0.4.3]
+[0.4.2][0.4.2]
+[0.4.1][0.4.1]
+[0.4.0][0.4.0]
+[0.3.1][0.3.1]
+[0.3.0][0.3.0]
+[0.2.1][0.2.1]
+[0.2.0][0.2.0]
+[0.1.0][0.1.0]
diff --git a/mddocs/changelog/next_release/261.bugfix.md b/mddocs/changelog/next_release/261.bugfix.md
new file mode 100644
index 00000000..768b2664
--- /dev/null
+++ b/mddocs/changelog/next_release/261.bugfix.md
@@ -0,0 +1 @@
+If some run uses the same table as both input and output (e.g. merging duplicates or performing some checks before writing), DataRentgen excludes `dataset1 -> dataset1` relations from lineage with `granularity=DATASET`.
diff --git a/mddocs/changelog/next_release/263.improvement.md b/mddocs/changelog/next_release/263.improvement.md
new file mode 100644
index 00000000..2ef02728
--- /dev/null
+++ b/mddocs/changelog/next_release/263.improvement.md
@@ -0,0 +1,3 @@
+Add workaround if OpenLineage emitted Spark application event with `job.name=unknown`.
+
+This requires installing OpenLineage with this fix merged: [https://github.com/OpenLineage/OpenLineage/pull/3848](https://github.com/OpenLineage/OpenLineage/pull/3848).
diff --git a/mddocs/changelog/next_release/264.bugfix.md b/mddocs/changelog/next_release/264.bugfix.md
new file mode 100644
index 00000000..dd92e712
--- /dev/null
+++ b/mddocs/changelog/next_release/264.bugfix.md
@@ -0,0 +1 @@
+In 0.3.0 and 0.3.1 lineage with `granularity=DATASET` may return datasets which did not interact with each other, but were inputs/outputs of the same run. Now only direct interactions are used while resolving lineage graph.
diff --git a/mddocs/changelog/next_release/265.feature.md b/mddocs/changelog/next_release/265.feature.md
new file mode 100644
index 00000000..0de09cc2
--- /dev/null
+++ b/mddocs/changelog/next_release/265.feature.md
@@ -0,0 +1,5 @@
+Add custom `dataRentgen_run` and `dataRentgen_operation` facets allowing to:
+
+- Pass custom `external_id`, `persistent_log_url` and other fields of Run.
+- Pass custom `name`, `description`, `group`, `position` fields of Operation.
+- Mark event as containing only Operation or both Run + Operation data.
diff --git a/mddocs/changelog/next_release/268.feature.md b/mddocs/changelog/next_release/268.feature.md
new file mode 100644
index 00000000..efd6a6a9
--- /dev/null
+++ b/mddocs/changelog/next_release/268.feature.md
@@ -0,0 +1,47 @@
+Add new entities `Tag` and `TagValue`. Tags can be used as additional properties for other entities. Add tags for dataset.
+
+### Response examples
+
+=== "Before"
+
+ ```python
+ {
+ "nodes": {
+ "datasets": {
+ "8400": {
+ "id": "8400",
+ "location": {...},
+ "name": "dataset_name",
+ "schema": {},
+ }
+ },
+ "relations": {...}
+ ```
+
+=== "After"
+
+ ```python
+ {
+ "nodes": {
+ "datasets": {
+ "8400": {
+ "id": "25896",
+ "location": {...},
+ "name": "dataset_name",
+ "schema": {...},
+ "tags": { # <---
+ {
+ "name": "environment",
+ "value": "production"
+ },
+ {
+ "name": "team",
+ "value": "my_awesome_team"
+ },
+ },
+ }
+ }
+ ...
+ },
+ "relations": {...}
+ ```
diff --git a/mddocs/changelog/next_release/269.improvement.md b/mddocs/changelog/next_release/269.improvement.md
new file mode 100644
index 00000000..15759d9c
--- /dev/null
+++ b/mddocs/changelog/next_release/269.improvement.md
@@ -0,0 +1 @@
+Dataset symlinks with no inputs/outputs are no longer removed from lineage graph.
diff --git a/mddocs/changelog/next_release/276.feature.md b/mddocs/changelog/next_release/276.feature.md
new file mode 100644
index 00000000..08cac1ae
--- /dev/null
+++ b/mddocs/changelog/next_release/276.feature.md
@@ -0,0 +1,6 @@
+Add REST API endpoints for managing personal tokens:
+
+- `GET /personal-tokens` - get personal tokens for current user.
+- `POST /personal-tokens` - create new personal token for current user.
+- `PATCH /personal-tokens/:id` - refresh personal token (revoke token and create new one).
+- `DELETE /personal-tokens/:id` - revoke personal token.
diff --git a/mddocs/changelog/next_release/281.feature.md b/mddocs/changelog/next_release/281.feature.md
new file mode 100644
index 00000000..a7610dcd
--- /dev/null
+++ b/mddocs/changelog/next_release/281.feature.md
@@ -0,0 +1 @@
+Introduce new HTTP2Kafka component. It allows using DataRentgen with OpenLineage HttpTransport. Authentication is done using personal tokens.
diff --git a/mddocs/comparison.md b/mddocs/comparison.md
new file mode 100644
index 00000000..658836f7
--- /dev/null
+++ b/mddocs/comparison.md
@@ -0,0 +1,94 @@
+(comparison)=
+
+# Comparison with other tools
+
+## Why not [DataHub](https://datahubproject.io/)?
+
+### DataHub cons
+
+- As Data Catalog, DataHub relies on database ingestion mechanism.
+ To extract and draw lineage between tables, it is required to *both* connect ingestor to all databases, and to enable integration with ETL (Spark, Airflow, etc).
+
+ There is an option `spark.datahub.metadata.dataset.materialize=true`, but in this case DataHub creates datasets without schema,
+ so ingestors are still required for column lineage.
+
+- DataHub Spark agent doesn't properly work if *Platform Instances* are enabled in DataHub.
+ Platform Instance is an additional hierarchy level for databases,
+ and there is no way to map it to database address used by Spark, Airflow and other ETL tools.
+
+- OpenLineage → DataHub integration collects each Spark command as dedicated *Pipeline Task*, producing a huge lineage graph.
+
+ Data.Rentgen has configurable `granularity` option while rendering the lineage graph.
+
+- No support for Job → Job hierarchy like Airflow Task → Spark application, or Airflow Task → Airflow Task dependencies.
+
+- High CPU and memory consumption.
+
+### DataHub pros
+
+- DataHub has information about real dataset column names, types, description.
+ Data.Rentgen has only information provided by ETL engine, e.g. selected columns, ETL engine-specific column types.
+- DataHub has table → view lineage, Data.Rentgen doesn't.
+
+## Why not [OpenMetadata](https://open-metadata.org/)?
+
+### OpenMetadata cons
+
+- Database ingestors are required to build a lineage graph, just like DataHub.
+- OpenLineage → OpenMetadata integration produces no lineage, for some unknown reason.
+- No support for Job → Job hierarchy like Airflow Task → Spark application, or Airflow Task → Airflow Task dependencies.
+- High CPU and memory consumption.
+
+### OpenMetadata pros
+
+- OpenMetadata has information about real dataset column names, types, description.
+
+ Data.Rentgen has only information available in ETL engine, e.g. selected columns, ETL engine-specific column types.
+
+- OpenMetadata has table → view lineage, Data.Rentgen doesn't.
+
+## Why not [Marquez](https://marquezproject.ai/)?
+
+### Marquez cons
+
+- OpenLineage → Marquez integration collects each Spark command as dedicated Jobs, producing too detailed lineage graph.
+
+ Data.Rentgen has configurable `granularity` option while rendering the lineage graph.
+
+- Severe performance issues while consuming lineage events.
+- No support for dataset symlinks, e.g. HDFS location → Hive table.
+- No support for Job → Job hierarchy like Airflow Task → Spark application, or Airflow Task → Airflow Task dependencies.
+- No releases since 2024.
+
+### Marquez pros
+
+- Marquez stores and shows lineage for any OpenLineage integration.
+ Data.Rentgen may require some adjustments for that.
+
+- Marquez stores and shows any facet produced by OpenLineage integration, including custom ones.
+ Data.Rentgen stores only selected facets.
+
+## Why not [Apache Atlas](https://atlas.apache.org)?
+
+- No Apache Spark 3.x integration in open source.
+- Only Apache Airflow 1.x integration, but no 2.x and 3.x support.
+- High CPU and memory consumption in production environment, as it uses HBase as storage layer.
+
+## Why not [Open Data Discovery](https://opendatadiscovery.org/)?
+
+- No Apache Spark integration.
+- Only Apache Airflow 1.x integration, but no 2.x and 3.x support.
+
+## Why not [Amundsen](https://www.amundsen.io)?
+
+- No Apache Spark integration.
+- No releases since 2024.
+
+## Why not [Spline](https://absaoss.github.io/spline/)?
+
+- No Apache Airflow integration.
+- ArangoDB changed license from Apache-2.0 to BSL [since 2024.02.19](https://arangodb.com/2024/02/update-evolving-arangodbs-licensing-model-for-a-sustainable-future/).
+
+## Why not [Egeria](https://egeria-project.org/)?
+
+Insanely complicated.
diff --git a/mddocs/contributing.md b/mddocs/contributing.md
new file mode 100644
index 00000000..400e8f20
--- /dev/null
+++ b/mddocs/contributing.md
@@ -0,0 +1,339 @@
+# Contributing Guide
+
+Welcome! There are many ways to contribute, including submitting bug reports, improving documentation, submitting feature requests, reviewing new submissions, or contributing code that can be incorporated into the project.
+
+## Initial setup for local development
+
+### Install Git
+
+Please follow [instruction](https://docs.github.com/en/get-started/quickstart/set-up-git).
+
+### Create a fork
+
+If you are not a member of a development team building Data.Rentgen, you should create a fork before making any changes.
+
+Please follow [instruction](https://docs.github.com/en/get-started/quickstart/fork-a-repo).
+
+### Clone the repo
+
+Open terminal and run these commands:
+
+```bash
+git clone https://github.com/MobileTeleSystems/data-rentgen -b develop
+
+cd data-rentgen
+```
+
+### Setup environment
+
+Firstly, install [make](https://www.gnu.org/software/make/manual/make.html). It is used for running complex commands in local environment.
+
+Secondly, create virtualenv and install dependencies:
+
+```bash
+make venv
+```
+
+If you already have venv, but need to install dependencies required for development:
+
+```bash
+make venv-install
+```
+
+We are using [poetry](https://python-poetry.org/docs/managing-dependencies/) for managing dependencies and building the package.
+It allows to keep development environment the same for all developers due to using lock file with fixed dependency versions.
+
+There are *extra* dependencies (included into package as optional):
+
+* `backend`
+* `client-sync`
+* `postgres`
+* `seed`
+
+And *groups* (not included into package, used locally and in CI):
+
+* `test` - for running tests
+* `dev` - for development, like linters, formatters, mypy, pre-commit and so on
+* `docs` - for building documentation
+
+### Enable pre-commit hooks
+
+[pre-commit](https://pre-commit.com/) hooks allows to validate & fix repository content before making new commit.
+It allows to run linters, formatters, fix file permissions and so on. If something is wrong, changes cannot be committed.
+
+Firstly, install pre-commit hooks:
+
+```bash
+pre-commit install --install-hooks
+```
+
+And then test that the hooks run:
+
+```bash
+pre-commit run
+```
+
+## How to
+
+### Run development instance locally
+
+Start DB container & seed database with some examples:
+
+```bash
+make db db-seed
+```
+
+Then start development server:
+
+```bash
+make dev-server
+```
+
+And open [http://localhost:8000/docs](http://localhost:8000/docs)
+
+Settings are stored in `.env.local` file.
+
+To start development consumer, open a new terminal window/tab, and run:
+
+```bash
+make broker dev-consumer
+```
+
+### Working with migrations
+
+Start database:
+
+```bash
+make db-start
+```
+
+Generate revision:
+
+```bash
+make db-revision ARGS="-m 'Message'"
+```
+
+Upgrade db to `head` migration:
+
+```bash
+make db-upgrade
+```
+
+Downgrade db to `head-1` migration:
+
+```bash
+make db-downgrade
+```
+
+### Run tests locally
+
+This is as simple as:
+
+```bash
+make test
+```
+
+This command starts all necessary containers (Postgres, Kafka), runs all necessary migrations, and then runs Pytest.
+
+You can pass additional arguments to pytest like this:
+
+```bash
+make test PYTEST_ARGS="-m client-sync -lsx -vvvv --log-cli-level=INFO"
+```
+
+Stop all containers and remove created volumes:
+
+```bash
+make test-cleanup ARGS="-v"
+```
+
+Get fixtures not used by any test:
+
+```bash
+make test-check-fixtures
+```
+
+### Run production instance locally
+
+Firstly, build production image:
+
+```bash
+make prod-build
+```
+
+And then start it:
+
+```bash
+make prod
+```
+
+Then open [http://localhost:8000/docs](http://localhost:8000/docs)
+
+Settings are stored in `.env.docker` file.
+
+### Build documentation
+
+Build documentation using Sphinx & open it:
+
+```bash
+make docs
+```
+
+If documentation should be built cleanly instead of reusing existing build result:
+
+```bash
+make docs-fresh
+```
+
+## Review process
+
+Please create a new GitHub issue for any significant changes and enhancements that you wish to make. Provide the feature you would like to see, why you need it, and how it will work. Discuss your ideas transparently and get community feedback before proceeding.
+
+Significant Changes that you wish to contribute to the project should be discussed first in a GitHub issue that clearly outlines the changes and benefits of the feature.
+
+Small Changes can directly be crafted and submitted to the GitHub Repository as a Pull Request.
+
+### Create pull request
+
+Commit your changes:
+
+```bash
+git commit -m "Commit message"
+git push
+```
+
+Then open Github interface and [create pull request](https://docs.github.com/en/get-started/quickstart/contributing-to-projects#making-a-pull-request).
+Please follow guide from PR body template.
+
+After pull request is created, it gets a corresponding number, e.g. 123 (`pr_number`).
+
+### Write release notes
+
+Data.Rentgen uses [towncrier](https://pypi.org/project/towncrier/) for changelog management.
+
+To submit a change note about your PR, add a text file into the [docs/changelog/next_release](./next_release) folder. It should contain an explanation of what applying this PR will change in the way end-users interact with the project. One sentence is usually enough but feel free to add as many details as you feel necessary for the users to understand what it means.
+
+**Use the past tense** for the text in your fragment because, combined with others, it will be a part of the “news digest” telling the readers **what changed** in a specific version of the library *since the previous version*.
+
+Use reStructuredText syntax for highlighting code (inline or block) and linking parts of the docs or external sites. If you wish to sign your change, feel free to add ``-- by :github:user:`github-username` `` at the end (replace `github-username` with your own!).
+
+Finally, name your file following the convention that Towncrier understands: it should start with the number of an issue or a PR followed by a dot, then add a patch type, like `feature`, `doc`, `misc` etc., and add `.rst` as a suffix. If you need to add more than one fragment, you may add an optional sequence number (delimited with another period) between the type and the suffix.
+
+In general the name will follow `<pr_number>.<category>.rst` pattern, where the categories are:
+
+* `feature`: Any new feature
+* `bugfix`: A bug fix
+* `improvement`: An improvement
+* `doc`: A change to the documentation
+* `dependency`: Dependency-related changes
+* `misc`: Changes internal to the repo like CI, test and build changes
+
+A pull request may have more than one of these components, for example
+a code change may introduce a new feature that deprecates an old
+feature, in which case two fragments should be added. It is not
+necessary to make a separate documentation fragment for documentation
+changes accompanying the relevant code changes.
+
+#### Examples for adding changelog entries to your Pull Requests
+
+```rst
+Added a ``:github:user:`` role to Sphinx config -- by :github:user:`someuser`
+```
+
+```rst
+Fixed behavior of ``backend`` -- by :github:user:`someuser`
+```
+
+```rst
+Added support of ``timeout`` in ``LDAP``
+-- by :github:user:`someuser`, :github:user:`anotheruser` and :github:user:`otheruser`
+```
+
+#### How to skip change notes check?
+
+Just add `ci:skip-changelog` label to pull request.
+
+#### Release Process
+
+Before making a release from the `develop` branch, follow these steps:
+
+1. Checkout to `develop` branch and update it to the actual state
+
+```bash
+git checkout develop
+git pull -p
+```
+
+1. Backup `NEXT_RELEASE.rst`
+
+```bash
+cp "docs/changelog/NEXT_RELEASE.rst" "docs/changelog/temp_NEXT_RELEASE.rst"
+```
+
+1. Build the Release notes with Towncrier
+
+```bash
+VERSION=$(poetry version -s)
+towncrier build "--version=${VERSION}" --yes
+```
+
+1. Change file with changelog to release version number
+
+```bash
+mv docs/changelog/NEXT_RELEASE.rst "docs/changelog/${VERSION}.rst"
+```
+
+1. Remove content above the version number heading in the `${VERSION}.rst` file
+
+```bash
+awk '!/^.*towncrier release notes start/' "docs/changelog/${VERSION}.rst" > temp && mv temp "docs/changelog/${VERSION}.rst"
+```
+
+1. Update Changelog Index
+
+```bash
+awk -v version=${VERSION} '/DRAFT/{print;print " " version;next}1' docs/changelog/index.rst > temp && mv temp docs/changelog/index.rst
+```
+
+1. Restore `NEXT_RELEASE.rst` file from backup
+
+```bash
+mv "docs/changelog/temp_NEXT_RELEASE.rst" "docs/changelog/NEXT_RELEASE.rst"
+```
+
+1. Commit and push changes to `develop` branch
+
+```bash
+git add .
+git commit -m "Prepare for release ${VERSION}"
+git push
+```
+
+1. Merge `develop` branch to `master`, **WITHOUT** squashing
+
+```bash
+git checkout master
+git pull
+git merge develop
+git push
+```
+
+1. Add git tag to the latest commit in `master` branch
+
+```bash
+git tag "$VERSION"
+git push origin "$VERSION"
+```
+
+1. Update version in `develop` branch **after release**:
+
+```bash
+git checkout develop
+
+NEXT_VERSION=$(echo "$VERSION" | awk -F. '/[0-9]+\./{$NF++;print}' OFS=.)
+poetry version "$NEXT_VERSION"
+
+git add .
+git commit -m "Bump version"
+git push
+```
diff --git a/mddocs/entities/dataset_column_lineage.png b/mddocs/entities/dataset_column_lineage.png
new file mode 100644
index 00000000..01b8b98c
Binary files /dev/null and b/mddocs/entities/dataset_column_lineage.png differ
diff --git a/mddocs/entities/dataset_lineage.png b/mddocs/entities/dataset_lineage.png
new file mode 100644
index 00000000..51594bf5
Binary files /dev/null and b/mddocs/entities/dataset_lineage.png differ
diff --git a/mddocs/entities/dataset_list.png b/mddocs/entities/dataset_list.png
new file mode 100644
index 00000000..688bd78e
Binary files /dev/null and b/mddocs/entities/dataset_list.png differ
diff --git a/mddocs/entities/dataset_schema.png b/mddocs/entities/dataset_schema.png
new file mode 100644
index 00000000..622f813b
Binary files /dev/null and b/mddocs/entities/dataset_schema.png differ
diff --git a/mddocs/entities/dataset_symlinks.png b/mddocs/entities/dataset_symlinks.png
new file mode 100644
index 00000000..ef051a36
Binary files /dev/null and b/mddocs/entities/dataset_symlinks.png differ
diff --git a/mddocs/entities/dependency_relation.png b/mddocs/entities/dependency_relation.png
new file mode 100644
index 00000000..1e0a5360
Binary files /dev/null and b/mddocs/entities/dependency_relation.png differ
diff --git a/mddocs/entities/direct_column_lineage.png b/mddocs/entities/direct_column_lineage.png
new file mode 100644
index 00000000..ce250a3f
Binary files /dev/null and b/mddocs/entities/direct_column_lineage.png differ
diff --git a/mddocs/entities/index.md b/mddocs/entities/index.md
new file mode 100644
index 00000000..a98b6b88
--- /dev/null
+++ b/mddocs/entities/index.md
@@ -0,0 +1,412 @@
+# Entities { #entities }
+
+```mermaid
+---
+title: Entities diagram
+---
+
+flowchart LR
+ subgraph locations1 [locations 1]
+ addresses1@{shape: docs, label: "addresses"}
+ end
+ subgraph locations2 [locations 2]
+ addresses2@{shape: docs, label: "addresses"}
+ end
+ subgraph locations3 [locations 3]
+ addresses3@{shape: docs, label: "addresses"}
+ end
+ dataset1[(dataset 1)]
+ dataset2[(dataset 2)]
+ operations@{shape: procs}
+ runs@{shape: procs}
+
+ style runs fill:lightyellow
+ job
+ style job fill:lightblue
+ user@{shape: stadium}
+ style user fill:lightblue
+
+ dataset1 -- SYMLINK ---> dataset2
+ dataset2 -- SYMLINK --> dataset1
+
+ dataset2 -- located in --> locations2
+
+ dataset1 -. INPUT .-> operations
+ operations -. OUTPUT .-> dataset1
+ dataset1 -- located in --> locations1
+
+ operations -- PARENT --> runs
+
+ runs -- PARENT ----> job
+ runs -- started by ----> user
+
+ job -- located in ---> locations3
+
+ runs -- PARENT --> runs
+
+```
+
+## Nodes
+
+Nodes are independent entities which describe information about some real entity, like table, ETL job, ETL job run and so on.
+
+### Location
+
+Represents information like "where is the dataset located" and "where is the job started from".
+This is the analog of [OpenLineage namespace](https://openlineage.io/docs/spec/naming/) concept.
+
+Examples:
+
+- `hive://some-cluster`
+- `hdfs://some-cluster`
+- `oracle://some.host.name:1521`
+- `postgres://some.host.name:5432`
+- `yarn://some-cluster`
+- `local://some.host.name`
+- `http://airflow-web-ui.domain.com:8080`
+
+It contains following fields:
+
+- `id: int` - internal unique identifier.
+- `type: str` - location type, e.g. `hive`, `hdfs`, `oracle` and so on.
+- `name: str` - location name, e.g. `some-cluster`, `some.host.name`
+- `external_id: str | None` - external identifier of this location in some third-party system (e.g. PlatformInstance in [Datahub](https://datahubproject.io/)).
+- `addresses` - list of alternative location addresses (see below):
+
+ - `url: str` - alternative address, in URL form.
+
+
+
+#### Location addresses
+
+In the real world, the same physical host or cluster may have multiple addresses, for example:
+
+- PostgreSQL instance may be accessed by its host name `postgres://some.host.name:5432` or by IP `postgres://192.128.20.14:5432`
+- With or without port number - `postgres://some.host.name:5432` vs. `postgres://some.host.name`
+
+Also clusters like Hadoop, Kafka and so on, may be accessed by multiple host names:
+
+- `hdfs://some-cluster` → `[hdfs://some-cluster.name.node1:8082, hdfs://some-cluster.name.node2]`.
+- `kafka://bootstrap.server1,bootstrap.server2,bootstrap.server3` → `[kafka://bootstrap.server1,kafka://bootstrap.server2,kafka://bootstrap.server3]`.
+
+Each Spark application run may connect to any of these addresses, and access the same data.
+
+Having a list of alternative addresses for a specific location resolves this ambiguity, and always matches the same physical table on the same cluster
+to the same Data.Rentgen dataset. This prevents creating duplicate datasets or jobs.
+
+### Dataset
+
+Represents information about some table/topic/collection/folder, stored in specific location.
+
+Examples:
+
+- `hive://some-cluster` + `myschema.mytable` - table inside a Hive cluster.
+- `postgres://some.host.name:5432` + `mydb.myschema.mytable` - table inside a Postgres instance.
+- `hdfs://some-cluster` + `/app/warehouse/hive/managed/myschema.db/mytable` - folder inside a HDFS cluster.
+
+Note that all information Data.Rentgen has was actually reported by ETL jobs, and not by database. There are no database integrations.
+
+For example, a Spark command reads something from PostgreSQL object `public.dataset_name`. This can be a table, a view, a foreign table - *we don't know*.
+
+That's why the information about datasets is very limited:
+
+- `id: int` - internal unique identifier.
+- `location: Location` - Location where data is actually stored, like an RDBMS instance or cluster.
+- `name: str` - qualified name of Dataset, like `mydb.myschema.mytable` or `/app/warehouse/hive/managed/myschema.db/mytable`
+- `schema: Schema | None` - schema of dataset.
+
+
+
+#### Dataset schema
+
+Schema only exists as a part of some interaction, like Spark application written some dataframe to ORC file,
+or Flink fetched some data from PostgreSQL table.
+
+Also, there can be multiple schemas of dataset:
+
+- If dataset is an input, it may contain only *selected* columns. We call this schema projection.
+- If dataset is an output, the schema field usually represents actual table columns. Except `DEFAULT` or `COMPUTED` columns.
+- If dataset is both input and output, we prefer using the output schema, because it has more information (like column types).
+
+It contains following fields:
+
+- `id: int` - internal unique identifier.
+- `fields: list[SchemaField]`:
+
+ - `name: str` - column name
+ - `type: str | None` - column type, if any.
+ Note that this is types in ETL engine (Spark, Flink, etc), and not types of source (Postgres, Clickhouse).
+ - `description: str | None` - column description/comment, if any.
+ - `fields: list[SchemaField]` - if column contains nested fields (e.g. `struct`, `array`, `map`).
+
+- `relevance_type: Enum` - describes if this schema information is relevant:
+
+ - `EXACT_MATCH` - returned if all interactions with this dataset used only one schema.
+ - `LATEST_KNOWN` - if there are multiple interactions with this dataset, but with different schemas. In this case a schema of the most recent interaction is returned.
+
+
+
+### Job
+
+Represents information about ETL job in specific location.
+This is an abstraction to group different runs of the same Spark application, Airflow DAG, Airflow Task, etc.
+
+Examples:
+
+- `yarn://some-cluster` + `my-spark-session` - Spark applicationName, running inside a YARN cluster (`master=yarn`).
+- `local://some.host.name` + `my-spark-session` - Spark applicationName, running on a host (`master=local`).
+- `http://airflow-web-ui.domain.com:8080` + `my_dag` - Airflow DAG, created in Airflow instance.
+- `http://airflow-web-ui.domain.com:8080` + `my_dag.mytask` - Airflow Task within Airflow DAG, created in Airflow instance.
+- `http://flink.domain.com:18081` + `some_flink_application` - Flink job running in Flink instance.
+- `local://some.host.name` + `my_project` - dbt project running on a host.
+
+It contains following fields:
+
+- `id: int` - internal unique identifier.
+- `location: Location` - Location where Job is run, e.g. cluster or host name.
+- `name: str` - name of Job, like `my-session-name`, `mydag`, `mydag.mytask`
+- `type: str` - type of Job, like:
+
+ - `SPARK_APPLICATION`
+ - `AIRFLOW_DAG`
+ - `AIRFLOW_TASK`
+ - `FLINK_JOB`
+ - `DBT_JOB`
+ - `UNKNOWN`
+
+- `parent_job_id: int` - parent Job which started this specific Job, e.g. Spark applicationId was started by Airflow Task Instance, or Airflow Task Instance is started by Airflow DagRun.
+- `tags: list[Tag]` - tags of job.
+
+
+
+### Tags
+
+Datasets and jobs can have multiple tags which are arbitrary `key: value` pairs.
+
+- `id: int` - tag identifier
+- `name: str` - tag name, usually in format `source.name`, e.g. `airflow.version`, `company.team`
+- `values: list[TagValue]` - tag values bound to dataset/job:
+
+  - `id: int` - tag value identifier
+  - `value: str` - tag value, e.g. `1.3.4`, `production`, `Some team`
+
+
+
+### User
+
+Represents information about some user.
+
+It contains following fields:
+
+- `id: bigint` - internal unique identifier.
+- `name: str` - username.
+
+### Run
+
+Represents information about Job run:
+
+- for Spark applicationName it is a Spark applicationId
+- for Airflow DAG it is a DagRun
+- for Airflow Task it is a TaskInstance
+- for Apache Flink it is jobId
+- for dbt it is `dbt run` instance
+
+It contains following fields:
+
+- `id: uuidv7` - unique identifier, generated on client.
+- `created_at: timestamp` - extracted UUIDv7 timestamp, used for filtering purpose.
+- `job_id: int` - bound to specific Job.
+- `parent_run_id: uuidv7` - parent Run which triggered this specific Run, e.g. Spark applicationId was triggered by Airflow Task Instance, or Airflow Task Instance is a child of Airflow DagRun.
+- `started_at: timestamp | None` - timestamp when OpenLineage event with `eventType=START` was received.
+- `started_by user: User | None` - Spark session started as specific OS user/Kerberos principal.
+- `start_reason: Enum | None` - "why this Run was started?":
+
+ - `MANUAL`
+ - `AUTOMATIC` - e.g. by schedule or triggered by another run.
+
+- `status: Enum` - run status. Currently these statuses are supported:
+
+ - `UNKNOWN`
+ - `STARTED`
+ - `SUCCEEDED`
+ - `FAILED`
+ - `KILLED`
+
+- `ended_at: timestamp | None` - timestamp when OpenLineage event with `eventType=COMPLETE|FAIL|ABORT` was received.
+- `ended_reason: str | None` - reason of receiving this status, if it is `FAILED` or `KILLED`.
+- `external_id : str | None` - external identifier of this Run, e.g. Spark `applicationId` or Airflow `dag_run_id`.
+- `attempt: str | None` - external attempt number of this Run, e.g. Spark `attemptId` in YARN, or Airflow Task `try_number`.
+- `running_log_url: str | None` - external URL where specific Run information could be found (e.g. Spark UI).
+- `persistent_log_url: str | None` - external URL where specific Run logs could be found (e.g. Spark History server, Airflow Web UI).
+
+
+
+
+
+
+
+
+
+### Operation
+
+Represents specific Spark job or Spark execution information. For now, Airflow DAG and Airflow task do not have any operations.
+
+It contains following fields:
+
+- `id: uuidv7` - unique identifier, generated on client.
+- `created_at: timestamp` - extracted UUIDv7 timestamp, used for filtering purpose.
+- `run_id: uuidv7` - bound to specific Run.
+- `started_at: timestamp | None` - timestamp when OpenLineage event with `eventType=START` was received.
+- `status: Enum` - run status. Currently these statuses are supported:
+
+ - `UNKNOWN`
+ - `STARTED`
+ - `SUCCEEDED`
+ - `FAILED`
+ - `KILLED`
+
+- `ended_at: timestamp | None` - timestamp when OpenLineage event with `eventType=COMPLETE|FAIL|ABORT` was received.
+- `name: str` - name of operation, e.g. Spark command name, dbt command name.
+- `position: int | None` - positional number of operation, e.g. number of Spark execution in Spark UI or `map_index` of Airflow Task.
+- `group: str | None` - field to group operations by, e.g. Spark job `jobGroup` or DBT command type (`MODEL`, `SQL`, `TEST`, `SNAPSHOT`).
+- `description: str | None` - operation description, e.g. Spark job `jobDescription` field, Airflow Operator name.
+- `sql_query: str | None` - SQL query executed by this operation, if any.
+
+
+
+## Relations
+
+These entities describe relationship between different nodes.
+
+### Dataset Symlink
+
+Represents dataset relations like `Hive metastore table → HDFS/S3 location of table`, and vice versa.
+
+It contains following fields:
+
+- `from: Dataset` - symlink starting point.
+- `to: Dataset` - symlink end point.
+- `type: Enum` - type of symlink. These types are supported:
+
+ - `METASTORE` - from HDFS location to Hive table in metastore.
+ - `WAREHOUSE` - from Hive table to HDFS/S3 location.
+
+!!! note
+
+ Currently, OpenLineage sends only symlinks `HDFS location → Hive table` which [do not exist in the real world](https://github.com/OpenLineage/OpenLineage/issues/2718#issuecomment-2134746258).
+ Message consumer automatically adds a reverse symlink `Hive table → HDFS location` to simplify building lineage graph, but this is temporary solution.
+
+
+
+### Parent Relation
+
+Relation between child run/operation and its parent. For example:
+
+- Spark job (applicationName) is parent for all its runs (applicationId).
+- Airflow DAG is parent of Airflow task.
+- Airflow Task Instance can start a Spark run (applicationId), dbt run, and so on.
+
+It contains following fields:
+
+- `from: Job | Run` - parent entity.
+- `to: Run | Operation` - child entity.
+
+
+
+### Dependency relation
+
+Relation between job/job or run/run which shows the order of executing ETL jobs.
+For example, one Airflow Task can depend on another Airflow Task.
+
+It contains following fields:
+
+- `from: Job | Run` - entity which should be waited before current job/run will be started.
+- `to: Job | Run` - entity which waits.
+- `type: str` - type of dependency, any arbitrary string provided by integration, usually something like `DIRECT_DEPENDENCY`, `INDIRECT_DEPENDENCY`.
+
+
+
+### Input relation
+
+Relation Dataset → Operation, describing the process of reading some data from specific table/folder by specific operation.
+
+It is also possible to aggregate all inputs of specific Dataset → Run, Dataset → Job or Dataset → Dataset by adjusting the interaction `granularity` option of Lineage graph.
+
+It contains following fields:
+
+- `from: Dataset` - data source.
+- `to: Operation | Run | Job | Dataset` - data target.
+- `num_rows: int | None` - number of rows read from dataset. For `granularity=JOB|RUN` it is a sum of all read rows from this dataset. For `granularity=DATASET` always `None`.
+- `num_bytes: int | None` - number of bytes read from dataset. For `granularity=JOB|RUN` it is a sum of all read bytes from this dataset. For `granularity=DATASET` always `None`.
+- `num_files: int | None` - number of files read from dataset. For `granularity=JOB|RUN` it is a sum of all read files from this dataset. For `granularity=DATASET` always `None`.
+
+
+
+### Output relation
+
+Relation Operation → Dataset, describing the process of writing some data to specific table/folder by specific Spark command, or table/folder metadata changes.
+
+It is also possible to aggregate all outputs of specific Run → Dataset or Job → Dataset combination, by adjusting `granularity` option of Lineage graph.
+
+It contains following fields:
+
+- `from: Operation | Run | Job` - output source.
+- `to: Dataset` - output target.
+- `types: list[Enum]` - type of output. These types are supported:
+
+ - `CREATE`
+ - `ALTER`
+ - `RENAME`
+ - `APPEND`
+ - `OVERWRITE`
+ - `DROP`
+ - `TRUNCATE`
+
+ For `granularity=JOB|RUN` it is a combination of all output types for this dataset.
+
+- `num_rows: int | None` - number of rows written to dataset. For `granularity=JOB|RUN` it is a sum of all written rows to this dataset.
+- `num_bytes: int | None` - number of bytes written to dataset. For `granularity=JOB|RUN` it is a sum of all written bytes to this dataset.
+- `num_files: int | None` - number of files written to dataset. For `granularity=JOB|RUN` it is a sum of all written files to this dataset.
+
+
+
+### Direct Column Lineage relation
+
+Relation Dataset columns → Dataset columns, describing how each target dataset column is related to some source dataset columns.
+
+- `from: Dataset` - source dataset.
+- `to: Dataset` - target dataset.
+- `fields: dict[str, list[SourceColumn]]` - mapping between target column name and source columns, where `SourceColumn` is:
+
+ - `field: str` - source column name
+ - `types: list[Enum]` - types of transformation applied to source column. Supported types are:
+
+ - `IDENTITY` - column is used as-is, e.g. `SELECT source_column AS target_column`
+ - `TRANSFORMATION` - some non-masking function is applied to column value, e.g. `SELECT source_column || '_suffix' AS target_column`
+ - `TRANSFORMATION_MASKING` - some masking function is applied to column value, e.g. `SELECT hash(source_column) AS target_column`
+ - `AGGREGATION` - some non-masking aggregation function is applied to column value, e.g. `SELECT max(source_column) AS target_column`
+ - `AGGREGATION_MASKING` - some masking aggregation function is applied to column value, e.g. `SELECT count(DISTINCT source_column) AS target_column`
+ - `UNKNOWN` - some unknown transformation type.
+
+
+
+### Indirect Column Lineage relation
+
+Relation Dataset columns → Dataset, describing how the entire target dataset is related to some source dataset columns.
+
+- `from: Dataset` - source dataset.
+- `to: Dataset` - target dataset.
+- `fields: list[SourceColumn]` - list of source columns, where `SourceColumn` is:
+
+ - `field: str` - source column name
+ - `types: list[Enum]` - types of transformation applied to source column. Supported types are:
+
+ - `FILTER` - column is used in `WHERE` clause, e.g. `SELECT * WHERE source_column = 'abc'`
+ - `JOIN` - column is used in JOIN clause, e.g. `SELECT * FROM source_dataset1 JOIN source_dataset2 ON source_dataset1.id = source_dataset2.id`
+ - `GROUP_BY` - column is used in `GROUP BY` clause, e.g. `SELECT source_column, count(*) FROM source_dataset GROUP BY source_column`
+ - `SORT` - column is used in `ORDER BY` clause, e.g. `SELECT * FROM source_dataset ORDER BY source_column`
+ - `WINDOW` - column is used in `WINDOW` clause, e.g. `SELECT max(other_column) OVER (PARTITION BY source_column) AS target_column`
+ - `CONDITIONAL` - column is used in `CASE` or `IF` clause, e.g. `SELECT CASE source_column WHEN 'abc' THEN 1 ELSE 2 END AS target_column`
+ - `UNKNOWN` - some unknown transformation type.
+
+
diff --git a/mddocs/entities/indirect_column_lineage.png b/mddocs/entities/indirect_column_lineage.png
new file mode 100644
index 00000000..2593b4c2
Binary files /dev/null and b/mddocs/entities/indirect_column_lineage.png differ
diff --git a/mddocs/entities/input.png b/mddocs/entities/input.png
new file mode 100644
index 00000000..bff3b145
Binary files /dev/null and b/mddocs/entities/input.png differ
diff --git a/mddocs/entities/input_relation.png b/mddocs/entities/input_relation.png
new file mode 100644
index 00000000..bff3b145
Binary files /dev/null and b/mddocs/entities/input_relation.png differ
diff --git a/mddocs/entities/job_lineage.png b/mddocs/entities/job_lineage.png
new file mode 100644
index 00000000..4eaa086b
Binary files /dev/null and b/mddocs/entities/job_lineage.png differ
diff --git a/mddocs/entities/job_list.png b/mddocs/entities/job_list.png
new file mode 100644
index 00000000..0b4cf16f
Binary files /dev/null and b/mddocs/entities/job_list.png differ
diff --git a/mddocs/entities/location_list.png b/mddocs/entities/location_list.png
new file mode 100644
index 00000000..65690489
Binary files /dev/null and b/mddocs/entities/location_list.png differ
diff --git a/mddocs/entities/output.png b/mddocs/entities/output.png
new file mode 100644
index 00000000..213dc612
Binary files /dev/null and b/mddocs/entities/output.png differ
diff --git a/mddocs/entities/output_relation.png b/mddocs/entities/output_relation.png
new file mode 100644
index 00000000..213dc612
Binary files /dev/null and b/mddocs/entities/output_relation.png differ
diff --git a/mddocs/entities/parent.png b/mddocs/entities/parent.png
new file mode 100644
index 00000000..6c0b4493
Binary files /dev/null and b/mddocs/entities/parent.png differ
diff --git a/mddocs/entities/parent_relation.png b/mddocs/entities/parent_relation.png
new file mode 100644
index 00000000..753807f8
Binary files /dev/null and b/mddocs/entities/parent_relation.png differ
diff --git a/mddocs/entities/run_lineage.png b/mddocs/entities/run_lineage.png
new file mode 100644
index 00000000..6bbd2c31
Binary files /dev/null and b/mddocs/entities/run_lineage.png differ
diff --git a/mddocs/entities/run_list.png b/mddocs/entities/run_list.png
new file mode 100644
index 00000000..dd3c61e6
Binary files /dev/null and b/mddocs/entities/run_list.png differ
diff --git a/mddocs/entities/symlink_relation.png b/mddocs/entities/symlink_relation.png
new file mode 100644
index 00000000..ef051a36
Binary files /dev/null and b/mddocs/entities/symlink_relation.png differ
diff --git a/mddocs/entities/tags.png b/mddocs/entities/tags.png
new file mode 100644
index 00000000..adb9ddb0
Binary files /dev/null and b/mddocs/entities/tags.png differ
diff --git a/mddocs/index.md b/mddocs/index.md
new file mode 100644
index 00000000..7b277145
--- /dev/null
+++ b/mddocs/index.md
@@ -0,0 +1,95 @@
+{{ datarentgen_logo_wide }}
+
+[](https://www.repostatus.org/#wip) [](https://hub.docker.com/r/mtsrus/data-rentgen) [](https://pypi.org/project/data-rentgen/) [](https://github.com/MobileTeleSystems/data-rentgen/blob/develop/LICENSE.txt) [](https://badge.fury.io/py/data-rentgen) [](https://data-rentgen.readthedocs.io/)
+[](https://github.com/MobileTeleSystems/data-rentgen/actions) [](https://codecov.io/github/MobileTeleSystems/data-rentgen) [](https://results.pre-commit.ci/latest/github/MobileTeleSystems/data-rentgen/develop)
+
+# What is Data.Rentgen?
+
+Data.Rentgen is a Data Motion Lineage service, compatible with [OpenLineage](https://openlineage.io/) specification.
+
+Currently we support consuming lineage from:
+
+* Apache Spark
+* Apache Airflow
+* Apache Hive
+* Apache Flink
+* dbt
+
+**Note**: service is under active development, so it doesn’t have a stable API for now.
+
+# Goals
+
+* Collect lineage events produced by OpenLineage clients & integrations.
+* Store operation-grained events for better detalization (instead of job grained [Marquez](https://marquezproject.ai/)).
+* Provide API for fetching both job/run ↔ dataset lineage and dataset ↔ dataset lineage.
+
+# Features
+
+* Support consuming large amounts of lineage events, use Apache Kafka as event buffer.
+* Store data in tables partitioned by event timestamp, to speed up lineage graph resolution.
+* Lineage graph is built with user-specified time boundaries (unlike Marquez where lineage is built only for the last job run).
+* Lineage graph can be built with different granularity, e.g. merge all individual Spark commands into Spark applicationId or Spark applicationName.
+* Column-level lineage support.
+* Authentication support.
+
+# Non-goals
+
+* This is **not** a Data Catalog. DataRentgen doesn’t track dataset schema change, owner and so on. Use [Datahub](https://datahubproject.io/) or [OpenMetadata](https://open-metadata.org/) instead.
+* Static Data Lineage like view → table is not supported.
+
+# Limitations
+
+* OpenLineage has integrations with Trino, Debezium and some other lineage sources. DataRentgen support may be added later.
+* Unlike Marquez, DataRentgen parses only a limited set of facets sent by OpenLineage, and doesn’t store custom facets. This can be changed in the future.
+
+# Screenshots
+
+## Lineage graph
+
+Dataset-level lineage graph
+
+
+
+Dataset column-level lineage graph
+
+
+
+Job-level lineage graph
+
+
+
+Run-level lineage graph
+
+
+
+## Datasets
+
+
+
+## Runs
+
+
+
+## Spark application
+
+
+
+## Spark run
+
+
+
+## Spark command
+
+
+
+## Hive query
+
+
+
+## Airflow DagRun
+
+
+
+## Airflow TaskInstance
+
+
diff --git a/mddocs/install.md b/mddocs/install.md
new file mode 100644
index 00000000..d41416cd
--- /dev/null
+++ b/mddocs/install.md
@@ -0,0 +1,36 @@
+# Install Data.Rentgen { #overview-install }
+
+## Requirements
+
+- [Docker](https://docs.docker.com/engine/install/)
+- [docker-compose](https://github.com/docker/compose/releases/)
+
+## Install & run
+
+Copy `docker-compose.yml` and `.env.docker` from this repo:
+
+### "docker-compose.yml"
+
+!include(../../docker-compose.yml)
+
+### ".env.docker"
+
+!include(../../.env.docker)
+
+Then start containers using `docker-compose`:
+
+ ```console
+ $ VERSION=latest docker compose --profile all up -d --wait
+ ...
+ ```
+
+`docker-compose` will download required images, create containers and start them in a proper order. Options can be set via `.env.docker` file or `environment` section in `docker-compose.yml`.
+
+`VERSION` is a tag of docker image. You can find all available tags [here](https://hub.docker.com/r/mtsrus/data-rentgen/tags).
+
+### Access Data.Rentgen
+
+After all containers are started and ready, you can:
+
+- Browse frontend at
+- Open REST API Swagger doc at
diff --git a/mddocs/integrations/airflow/dag_job_details.png b/mddocs/integrations/airflow/dag_job_details.png
new file mode 100644
index 00000000..98dc082a
Binary files /dev/null and b/mddocs/integrations/airflow/dag_job_details.png differ
diff --git a/mddocs/integrations/airflow/dag_run_details.png b/mddocs/integrations/airflow/dag_run_details.png
new file mode 100644
index 00000000..27d11cd6
Binary files /dev/null and b/mddocs/integrations/airflow/dag_run_details.png differ
diff --git a/mddocs/integrations/airflow/index.md b/mddocs/integrations/airflow/index.md
new file mode 100644
index 00000000..4c7b8772
--- /dev/null
+++ b/mddocs/integrations/airflow/index.md
@@ -0,0 +1,197 @@
+# Apache Airflow integration { #overview-setup-airflow }
+
+Using [OpenLineage integration with Apache Airflow](https://openlineage.io/docs/integrations/airflow/).
+
+## Requirements
+
+- [Apache Airflow](https://airflow.apache.org/) 2.x or 3.x
+- OpenLineage 1.19.0 or higher, recommended 1.34.0+
+- OpenLineage integration for Airflow (see below)
+- Running [message-broker][message-broker]
+- (Optional) [http2kafka][http2kafka]
+
+## Entity mapping
+
+- Airflow DAG → Data.Rentgen Job
+- Airflow DAGRun → Data.Rentgen Run
+- Airflow Task → Data.Rentgen Job
+- Airflow TaskInstance → Data.Rentgen Run + Data.Rentgen Operation
+
+## Install
+
+- For Airflow 2.7 or higher, use [apache-airflow-providers-openlineage](https://airflow.apache.org/docs/apache-airflow-providers-openlineage/stable/index.html) 1.9.0 or higher:
+
+ ```console title="KafkaTransport"
+ $ pip install "apache-airflow-providers-openlineage>=2.6.1" "openlineage-python[kafka]>=1.40.1" zstd
+ ...
+ ```
+
+ ```console title="HttpTransport (requires HTTP2Kafka)"
+ $ pip install "apache-airflow-providers-openlineage>=2.6.1"
+ ...
+ ```
+
+- For Airflow 2.1.x-2.6.x, use [OpenLineage integration for Airflow](https://openlineage.io/docs/integrations/airflow/) 1.19.0 or higher
+
+ ```console title="KafkaTransport"
+ $ pip install "openlineage-airflow>=1.40.1" "openlineage-python[kafka]>=1.40.1" zstd
+ ...
+ ```
+
+ ```console title="HttpTransport (requires HTTP2Kafka)"
+ $ pip install "openlineage-airflow>=1.40.1"
+ ...
+ ```
+
+## Setup
+
+### Via OpenLineage config file
+
+- Create `openlineage.yml` file with content like:
+
+ ```yaml title="KafkaTransport"
+ transport:
+ type: kafka
+ topic: input.runs
+ config:
+ # should be accessible from Airflow scheduler
+ bootstrap.servers: localhost:9093
+ security.protocol: SASL_PLAINTEXT
+ sasl.mechanism: SCRAM-SHA-256
+ # Kafka auth credentials
+ sasl.username: data_rentgen
+ sasl.password: changeme
+ compression.type: zstd
+ acks: all
+ ```
+
+ ```yaml title="HttpTransport (requires HTTP2Kafka)"
+ transport:
+ type: http
+ # http2kafka URL, should be accessible from Airflow scheduler
+ url: http://localhost:8002
+ endpoint: /v1/openlineage
+ compression: gzip
+ auth:
+ type: api_key
+ # create a PersonalToken, and pass it here
+ apiKey: personal_token_AAAAAAAAAAAA.BBBBBBBBBBBBBBBBBBBBBBB.CCCCCCCCCCCCCCCCCCCCC
+ ```
+
+- Pass path to config file via `AIRFLOW__OPENLINEAGE__CONFIG_PATH` environment variable:
+
+ ```ini
+ AIRFLOW__OPENLINEAGE__NAMESPACE=http://airflow.hostname.fqdn:8080
+ AIRFLOW__OPENLINEAGE__CONFIG_PATH=/path/to/openlineage.yml
+ ```
+
+### Via Airflow config file
+
+Setup OpenLineage integration using `airflow.cfg` config file:
+
+```ini title="KafkaTransport"
+[openlineage]
+# set here address of Airflow Web UI
+namespace = http://airflow.hostname.fqdn:8080
+# set here Kafka connection address & credentials
+transport = {"type": "kafka", "config": {"bootstrap.servers": "localhost:9093", "security.protocol": "SASL_PLAINTEXT", "sasl.mechanism": "SCRAM-SHA-256", "sasl.username": "data_rentgen", "sasl.password": "changeme", "compression.type": "zstd", "acks": "all"}, "topic": "input.runs", "flush": true}
+```
+
+```ini title="HttpTransport (requires HTTP2Kafka)"
+[openlineage]
+ # set here address of Airflow Web UI
+ namespace = http://airflow.hostname.fqdn:8080
+ # set here HTTP2Kafka url & create PersonalToken
+ transport = {"type": "http", "url": "http://localhost:8002", "endpoint": "/v1/openlineage", "compression": "gzip", "auth": {"type": "api_key", "apiKey": "personal_token_AAAAAAAAAAAA.BBBBBBBBBBBBBBBBBBBBBBB.CCCCCCCCCCCCCCCCCCCCC"}}
+```
+
+### Via Airflow environment variables
+
+Set environment variables for all Airflow components (e.g. via `docker-compose.yml`). Depending on your shell, you may need to remove the single quotes.
+
+```bash title="KafkaTransport"
+# set here address of Airflow Web UI
+AIRFLOW__OPENLINEAGE__NAMESPACE='http://airflow.hostname.fqdn:8080'
+# set here Kafka broker address & auth credentials
+AIRFLOW__OPENLINEAGE__TRANSPORT='{"type": "kafka", "config": {"bootstrap.servers": "localhost:9093", "security.protocol": "SASL_PLAINTEXT", "sasl.mechanism": "SCRAM-SHA-256", "sasl.username": "data_rentgen", "sasl.password": "changeme", "compression.type": "zstd", "acks": "all"}, "topic": "input.runs", "flush": true}'
+```
+
+```bash title="HttpTransport (requires HTTP2Kafka)"
+# set here address of Airflow Web UI
+AIRFLOW__OPENLINEAGE__NAMESPACE='http://airflow.hostname.fqdn:8080'
+# set here HTTP2Kafka url & create PersonalToken
+AIRFLOW__OPENLINEAGE__TRANSPORT='{"type": "http", "url": "http://localhost:8002", "endpoint": "/v1/openlineage", "compression": "gzip", "auth": {"type": "api_key", "apiKey": "personal_token_AAAAAAAAAAAA.BBBBBBBBBBBBBBBBBBBBBBB.CCCCCCCCCCCCCCCCCCCCC"}}'
+```
+
+### Airflow 2.1.x and 2.2.x
+
+For Airflow 2.1-2.2, the OpenLineage integration should be enabled explicitly by adding an `airflow.cfg` config entry:
+
+```ini
+[lineage]
+backend=openlineage.lineage_backend.OpenLineageBackend
+```
+
+Or by setting up environment variable:
+
+```ini
+AIRFLOW__LINEAGE__BACKEND=openlineage.lineage_backend.OpenLineageBackend
+```
+
+## Collect and send lineage
+
+Run some Airflow DAG with tasks, and wait until it is finished.
+Lineage will be sent to Data.Rentgen automatically by the OpenLineage integration.
+
+## See results
+
+Browse frontend page [Jobs](http://localhost:3000/jobs) to see what information was extracted by OpenLineage & DataRentgen.
+
+### Job list page
+
+
+
+### DAG job details page
+
+
+
+### DAG run details page
+
+
+
+### Task job details page
+
+
+
+### Task run details page
+
+
+
+### Job level lineage
+
+
+
+### Job dependencies
+
+
+
+## Extra configuration
+
+### Collecting DAG tags
+
+By default, following job tags are created:
+
+- `airflow.version`
+- `openlineage_adapter.version`
+- `openlineage_client.version` (using OpenLineage client 1.38.0+)
+
+It is possible to provide custom DAG tags as well. But DataRentgen is able to extract only tags in the format `key:value`, e.g.:
+
+```python title="mydag.py"
+from airflow.models import DAG
+
+with DAG(
+ dag_id="mydag",
+ tags=["environment:production", "layer:bronze"],
+)
+```
diff --git a/mddocs/integrations/airflow/job_hierarchy.png b/mddocs/integrations/airflow/job_hierarchy.png
new file mode 100644
index 00000000..e740470c
Binary files /dev/null and b/mddocs/integrations/airflow/job_hierarchy.png differ
diff --git a/mddocs/integrations/airflow/job_lineage.png b/mddocs/integrations/airflow/job_lineage.png
new file mode 100644
index 00000000..51fd33db
Binary files /dev/null and b/mddocs/integrations/airflow/job_lineage.png differ
diff --git a/mddocs/integrations/airflow/job_list.png b/mddocs/integrations/airflow/job_list.png
new file mode 100644
index 00000000..ad80a11f
Binary files /dev/null and b/mddocs/integrations/airflow/job_list.png differ
diff --git a/mddocs/integrations/airflow/task_job_details.png b/mddocs/integrations/airflow/task_job_details.png
new file mode 100644
index 00000000..0a5ee793
Binary files /dev/null and b/mddocs/integrations/airflow/task_job_details.png differ
diff --git a/mddocs/integrations/airflow/task_run_details.png b/mddocs/integrations/airflow/task_run_details.png
new file mode 100644
index 00000000..57cb248e
Binary files /dev/null and b/mddocs/integrations/airflow/task_run_details.png differ
diff --git a/mddocs/integrations/dbt/index.md b/mddocs/integrations/dbt/index.md
new file mode 100644
index 00000000..058ec198
--- /dev/null
+++ b/mddocs/integrations/dbt/index.md
@@ -0,0 +1,210 @@
+# dbt integration { #overview-setup-dbt }
+
+Using [OpenLineage integration with dbt](https://openlineage.io/docs/integrations/dbt).
+
+## Requirements
+
+- [dbt](https://www.getdbt.com/) 1.3 or higher
+- OpenLineage 1.19.0 or higher, recommended 1.40.1+
+- Running [message-broker][message-broker]
+- (Optional) [http2kafka][http2kafka]
+
+## Limitations
+
+- Currently there is no way to pass dataset tags, [see issue](https://github.com/OpenLineage/OpenLineage/issues/3500)
+
+## Entity mapping
+
+- dbt project → Data.Rentgen Job
+- dbt run → Data.Rentgen Run
+- dbt model, snapshot, sql, test → Data.Rentgen Operation
+
+## Install
+
+```console title="KafkaTransport"
+$ pip install "openlineage-dbt>=1.40.1" "openlineage-python[kafka]>=1.40.1" zstd
+...
+```
+
+```console title="HttpTransport (requires HTTP2Kafka)"
+$ pip install "openlineage-dbt>=1.40.1"
+...
+```
+
+## Setup
+
+- Create `openlineage.yml` file with content like:
+
+```yaml title="KafkaTransport"
+ transport:
+ type: kafka
+ topic: input.runs
+ config:
+ # should be accessible from host
+ bootstrap.servers: localhost:9093
+ security.protocol: SASL_PLAINTEXT
+ sasl.mechanism: SCRAM-SHA-256
+ # Kafka auth credentials
+ sasl.username: data_rentgen
+ sasl.password: changeme
+ compression.type: zstd
+ acks: all
+```
+
+```yaml title="HttpTransport (requires HTTP2Kafka)"
+ transport:
+ # "type: http" for OpenLineage below 1.35.0
+ type: async_http
+ # http2kafka URL, should be accessible from host
+ url: http://localhost:8002
+ endpoint: /v1/openlineage
+ compression: gzip
+ auth:
+ type: api_key
+ # create a PersonalToken, and pass it here
+ apiKey: personal_token_AAAAAAAAAAAA.BBBBBBBBBBBBBBBBBBBBBBB.CCCCCCCCCCCCCCCCCCCCC
+```
+
+- Set environment variables:
+
+```ini
+OPENLINEAGE_NAMESPACE=local://dbt.host.name
+OPENLINEAGE_CONFIG=/path/to/openlineage.yml
+```
+
+## Collect and send lineage
+
+Replace `dbt` CLI commands:
+
+```shell
+$ dbt run myproject
+...
+$ dbt test myproject
+...
+```
+
+with `dbt-ol` CLI:
+
+```shell
+$ dbt-ol run myproject
+...
+$ dbt-ol test myproject
+...
+```
+
+Lineage will be sent to Data.Rentgen automatically by the OpenLineage integration.
+
+## See results
+
+Browse frontend page [Jobs](http://localhost:3000/jobs) to see what information was extracted by OpenLineage & DataRentgen
+
+### Job list page
+
+
+
+### Job details page
+
+
+
+### Job-level lineage
+
+
+
+### Run details
+
+
+
+### Run lineage
+
+
+
+### Operation details
+
+
+
+### Operation lineage
+
+
+
+## Extra configuration
+
+### Collecting model tags
+
+By default, the following job tags are created:
+
+- `dbt.version`
+- `openlineage_adapter.version`
+- `openlineage_client.version` (using OpenLineage client 1.38.0+)
+
+It is possible to provide custom tags via model config:
+
+```yaml title="dbt_project.yaml"
+models:
+ jaffle_shop:
+ materialized: table
+ staging:
+ materialized: view
+ +tags:
+ - environment:production
+ - layer:bronze
+```
+
+## Binding Airflow Task with Spark application
+
+If OpenLineage event contains [Parent Run facet](https://openlineage.io/docs/spec/facets/run-facets/parent_run/),
+DataRentgen can use this information to bind dbt run to the run it was triggered by, e.g. Airflow task:
+
+
+
+To fill up this facet, it is required to:
+
+- Setup OpenLineage integration for dbt
+- Setup [OpenLineage integration for Airflow][overview-setup-airflow]
+- Pass parent Run info from Airflow to dbt by using [Airflow macros](https://airflow.apache.org/docs/apache-airflow-providers-openlineage/stable/macros.html#lineage-job-run-macros):
+
+```py title="BashOperator"
+from airflow.providers.standard.operators.bash import BashOperator
+
+task = BashOperator(
+ task_id="dbt_run_task",
+ cwd="/path/to/project",
+ bash_command="dbt-ol run",
+ append_env=True,
+ env={
+ # Pass parent Run info from Airflow to dbt
+ "OPENLINEAGE_PARENT_ID": "{{ macros.OpenLineageProviderPlugin.lineage_parent_id(task_instance) }}",
+ # For apache-airflow-providers-openlineage 2.4.0 or above
+ "OPENLINEAGE_ROOT_PARENT_ID": "{{ macros.OpenLineageProviderPlugin.lineage_root_parent_id(task_instance) }}",
+ }
+)
+```
+
+``` py title="SSHOperator"
+from airflow.providers.ssh.operators.ssh import SSHOperator
+
+task = SSHOperator(
+ task_id="dbt_run_task",
+ ssh_conn_id="some_host",
+ command="cd /path/to/project && dbt-ol run",
+ environment={
+ "OPENLINEAGE_PARENT_ID": "{{ macros.OpenLineageProviderPlugin.lineage_parent_id(task_instance) }}",
+ # For apache-airflow-providers-openlineage 2.4.0 or above
+ "OPENLINEAGE_ROOT_PARENT_ID": "{{ macros.OpenLineageProviderPlugin.lineage_root_parent_id(task_instance) }}",
+ }
+)
+```
+
+```py title="KubernetesPodOperator"
+from airflow.providers.cncf.kubernetes.operators.pod import KubernetesPodOperator
+
+task = KubernetesPodOperator(
+ task_id="dbt_run_task",
+ cmds=["bash", "-cx"],
+ arguments=["cd /path/to/project && dbt-ol run"],
+ env_vars={
+ "OPENLINEAGE_PARENT_ID": "{{ macros.OpenLineageProviderPlugin.lineage_parent_id(task_instance) }}",
+ # For apache-airflow-providers-openlineage 2.4.0 or above
+ "OPENLINEAGE_ROOT_PARENT_ID": "{{ macros.OpenLineageProviderPlugin.lineage_root_parent_id(task_instance) }}",
+ }
+)
+```
diff --git a/mddocs/integrations/dbt/job_details.png b/mddocs/integrations/dbt/job_details.png
new file mode 100644
index 00000000..ad9b0125
Binary files /dev/null and b/mddocs/integrations/dbt/job_details.png differ
diff --git a/mddocs/integrations/dbt/job_lineage.png b/mddocs/integrations/dbt/job_lineage.png
new file mode 100644
index 00000000..344b604f
Binary files /dev/null and b/mddocs/integrations/dbt/job_lineage.png differ
diff --git a/mddocs/integrations/dbt/job_list.png b/mddocs/integrations/dbt/job_list.png
new file mode 100644
index 00000000..45c9e4c8
Binary files /dev/null and b/mddocs/integrations/dbt/job_list.png differ
diff --git a/mddocs/integrations/dbt/operation_details.png b/mddocs/integrations/dbt/operation_details.png
new file mode 100644
index 00000000..47e27305
Binary files /dev/null and b/mddocs/integrations/dbt/operation_details.png differ
diff --git a/mddocs/integrations/dbt/operation_lineage.png b/mddocs/integrations/dbt/operation_lineage.png
new file mode 100644
index 00000000..07e71633
Binary files /dev/null and b/mddocs/integrations/dbt/operation_lineage.png differ
diff --git a/mddocs/integrations/dbt/run_details.png b/mddocs/integrations/dbt/run_details.png
new file mode 100644
index 00000000..ac64e273
Binary files /dev/null and b/mddocs/integrations/dbt/run_details.png differ
diff --git a/mddocs/integrations/dbt/run_lineage.png b/mddocs/integrations/dbt/run_lineage.png
new file mode 100644
index 00000000..96dd93bb
Binary files /dev/null and b/mddocs/integrations/dbt/run_lineage.png differ
diff --git a/mddocs/integrations/flink1/dataset_lineage.png b/mddocs/integrations/flink1/dataset_lineage.png
new file mode 100644
index 00000000..1672f937
Binary files /dev/null and b/mddocs/integrations/flink1/dataset_lineage.png differ
diff --git a/mddocs/integrations/flink1/index.md b/mddocs/integrations/flink1/index.md
new file mode 100644
index 00000000..0f1fcc9b
--- /dev/null
+++ b/mddocs/integrations/flink1/index.md
@@ -0,0 +1,174 @@
+# Apache Flink 1.x integration { #overview-setup-flink1 }
+
+Using [OpenLineage integration with Apache Flink 1.x](https://openlineage.io/docs/integrations/flink/flink1).
+
+## Requirements
+
+- [Apache Flink](https://flink.apache.org/) 1.x
+- OpenLineage 1.31.0 or higher, recommended 1.40.1+
+- Running [message-broker][message-broker]
+- (Optional) [http2kafka][http2kafka]
+
+## Limitations
+
+- Only `standalone-job` (application mode) is supported, but not `jobmanager` (session mode), [see issue](https://github.com/OpenLineage/OpenLineage/issues/2150)
+- Currently there is no way to pass job tags, [see issue](https://github.com/OpenLineage/OpenLineage/issues/4280)
+
+## Entity mapping
+
+- Flink job → Data.Rentgen Job
+- Flink job run → Data.Rentgen Run + Data.Rentgen Operation
+
+## Installation
+
+- Add dependencies [openlineage-flink](https://mvnrepository.com/artifact/io.openlineage/openlineage-flink) and [kafka-clients](https://mvnrepository.com/artifact/org.apache.kafka/kafka-clients) to your Flink job:
+
+ ```groovy title="build.gradle"
+
+ implementation "io.openlineage:openlineage-flink:1.40.1"
+ // For KafkaTransport only
+ implementation "org.apache.kafka:kafka-clients:3.9.0"
+ ```
+
+- Register `OpenLineageFlinkJobListener` in the code of your Flink job:
+
+ ```java title="MyFlinkJob.java"
+
+ import io.openlineage.flink.OpenLineageFlinkJobListener;
+
+ StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+
+ JobListener listener = OpenLineageFlinkJobListener.builder()
+ .executionEnvironment(env)
+ .build();
+ env.registerJobListener(listener);
+ ```
+
+## Setup
+
+- Modify Flink `config.yaml` to include:
+
+ ```yaml title="config.yaml"
+
+ execution.attached: true # capture job stop events
+ ```
+
+- Create `openlineage.yml` file with content like:
+
+```yaml KafkaTransport title="openlineage.yml"
+
+ job:
+ namespace: http://some.host.name:18081 # set namespace to match Flink address
+ name: flink_examples_stateful # set job name
+
+ # Send RUNNING event every 1 hour.
+ # Using default interval (1 minute) just floods Kafka with useless RUNNING events.
+ trackingIntervalInSeconds: 3600
+
+ transport:
+ type: kafka
+ topicName: input.runs
+ properties:
+ bootstrap.servers: broker:9092 # not using localhost in docker
+ security.protocol: SASL_PLAINTEXT
+ sasl.mechanism: SCRAM-SHA-256
+ sasl.jaas.config: |
+ org.apache.kafka.common.security.scram.ScramLoginModule required
+ username="data_rentgen"
+ password="changeme";
+ key.serializer: org.apache.kafka.common.serialization.StringSerializer
+ value.serializer: org.apache.kafka.common.serialization.StringSerializer
+ compression.type: zstd
+ acks: all
+```
+
+```yaml HttpTransport (requires HTTP2Kafka) title="openlineage.yml"
+
+job:
+ # set namespace to match Flink address
+ namespace: http://some.host.name:18081
+ # set job name
+ name: flink_examples_stateful
+
+# Send RUNNING event every 1 hour.
+# Using default interval (1 minute) just floods Kafka with useless RUNNING events.
+trackingIntervalInSeconds: 3600
+
+transport:
+ type: http
+ # should be accessible inside jobmanager container
+ # not using localhost in docker!
+ url: http://http2kafka:8000
+ endpoint: /v1/openlineage
+ compression: gzip
+ auth:
+ type: api_key
+ # create a PersonalToken, and pass it here
+ apiKey: personal_token_AAAAAAAAAAAA.BBBBBBBBBBBBBBBBBBBBBBB.CCCCCCCCCCCCCCCCCCCCC
+```
+
+- Pass path to config file via `OPENLINEAGE_CONFIG` environment variable of `jobmanager`:
+
+ ```bash
+ OPENLINEAGE_CONFIG=/path/to/openlineage.yml
+ ```
+
+At the end, this should look like this (see [Official documentation](https://nightlies.apache.org/flink/flink-docs-release-1.20/docs/deployment/resource-providers/standalone/docker/)):
+
+```yaml title="docker-compose.yml"
+
+services:
+ jobmanager:
+ image: flink:1.20.1-scala_2.12-java11
+ ports:
+ - "18081:8081"
+ # only standalone-job is supported
+ command: standalone-job --job-classname my.awesome.FlinkStatefulApplication
+ volumes:
+ - ./artifacts/:/opt/flink/usrlib/ # path to your Flink Job .jar files
+ - ./config.yaml:/opt/flink/conf/config.yaml
+ - ./openlineage.yml:/opt/flink/conf/openlineage.yml
+ environment:
+ - OPENLINEAGE_CONFIG=/opt/flink/conf/openlineage.yml
+
+ taskmanager:
+ image: flink:1.20.1-scala_2.12-java11
+ depends_on:
+ - jobmanager
+ command: taskmanager
+ volumes:
+ - ./artifacts/:/opt/flink/usrlib/ # path to your Flink Job .jar files
+ - ./config.yaml:/opt/flink/conf/config.yaml
+```
+
+## Collect and send lineage
+
+Just start your Flink job. OpenLineage integration will automatically collect and send lineage to DataRentgen.
+
+## See results
+
+Browse frontend pages [Jobs](http://localhost:3000/jobs) to see what information was extracted by OpenLineage & DataRentgen.
+
+### Job list page
+
+
+
+### Job details page
+
+
+
+### Run details page
+
+
+
+### Dataset level lineage
+
+
+
+### Job level lineage
+
+
+
+### Run level lineage
+
+
diff --git a/mddocs/integrations/flink1/job_details.png b/mddocs/integrations/flink1/job_details.png
new file mode 100644
index 00000000..3d976f99
Binary files /dev/null and b/mddocs/integrations/flink1/job_details.png differ
diff --git a/mddocs/integrations/flink1/job_lineage.png b/mddocs/integrations/flink1/job_lineage.png
new file mode 100644
index 00000000..0bf0afc5
Binary files /dev/null and b/mddocs/integrations/flink1/job_lineage.png differ
diff --git a/mddocs/integrations/flink1/job_list.png b/mddocs/integrations/flink1/job_list.png
new file mode 100644
index 00000000..34e1c373
Binary files /dev/null and b/mddocs/integrations/flink1/job_list.png differ
diff --git a/mddocs/integrations/flink1/run_details.png b/mddocs/integrations/flink1/run_details.png
new file mode 100644
index 00000000..81b675b2
Binary files /dev/null and b/mddocs/integrations/flink1/run_details.png differ
diff --git a/mddocs/integrations/flink1/run_lineage.png b/mddocs/integrations/flink1/run_lineage.png
new file mode 100644
index 00000000..3dd6515c
Binary files /dev/null and b/mddocs/integrations/flink1/run_lineage.png differ
diff --git a/mddocs/integrations/flink2/index.md b/mddocs/integrations/flink2/index.md
new file mode 100644
index 00000000..8dbe8bdf
--- /dev/null
+++ b/mddocs/integrations/flink2/index.md
@@ -0,0 +1,192 @@
+# Apache Flink 2.x integration { #overview-setup-flink2 }
+
+Using [OpenLineage integration with Apache Flink 2.x](https://openlineage.io/docs/integrations/flink/flink2).
+
+## Requirements
+
+- [Apache Flink](https://flink.apache.org/) 2.x
+- OpenLineage 1.31.0 or higher, recommended 1.40.1+
+- Running [message-broker][message-broker]
+- (Optional) [http2kafka][http2kafka]
+
+## Limitations
+
+- Currently there is no way to pass job tags, [see issue](https://github.com/OpenLineage/OpenLineage/issues/4280)
+
+## Entity mapping
+
+- Flink job → Data.Rentgen Job
+- Flink job run → Data.Rentgen Run + Data.Rentgen Operation
+
+## Installation
+
+- Download these jars and place them in `openlineage/jars/` directory:
+
+ - KafkaTransport:
+
+ - [openlineage-java](https://mvnrepository.com/artifact/io.openlineage/openlineage-java)
+ - [openlineage-flink](https://mvnrepository.com/artifact/io.openlineage/openlineage-flink)
+ - [kafka-clients](https://mvnrepository.com/artifact/org.apache.kafka/kafka-clients)
+ - [zstd-jni](https://mvnrepository.com/artifact/com.github.luben/zstd-jni)
+
+ - HttpTransport (requires HTTP2Kafka):
+
+ - [openlineage-flink](https://mvnrepository.com/artifact/io.openlineage/openlineage-flink)
+
+- Set environment variable `CLASSPATH` of Flink's `JobManager` to point to this directory path:
+
+ ```ini
+ CLASSPATH=/path/to/openlineage/jars/
+ ```
+
+- Configure Flink `JobManager` to load these dependencies using its own ClassLoader:
+
+ ```yaml title="config.yaml"
+
+ # For KafkaTransport
+ classloader.parent-first-patterns.additional: ["io.openlineage.", "org.apache.kafka.","com.github.luben."]
+ # For HttpTransport
+ #classloader.parent-first-patterns.additional: ["io.openlineage."]
+ ```
+
+ Otherwise Flink will load all classes from job's classloader, and this could lead to errors like:
+
+ ```text
+ org.apache.kafka.common.KafkaException: class org.apache.kafka.common.serialization.StringSerializer is not an instance of org.apache.kafka.common.serialization.Serializer
+ java.util.ServiceConfigurationError: io.openlineage.client.transports.TransportBuilder: io.openlineage.client.transports.HttpTransportBuilder not a subtype
+ ```
+
+ See [Flink documentation](https://nightlies.apache.org/flink/flink-docs-release-2.0/docs/deployment/config/#class-loading) for more details.
+
+## Setup
+
+- Add `OpenLineageJobStatusChangedListenerFactory` to Flink `config.yaml`:
+
+ ```yaml title="config.yaml"
+
+ # For KafkaTransport
+ classloader.parent-first-patterns.additional: ["io.openlineage.", "org.apache.kafka.","com.github.luben."]
+ # For HttpTransport
+ #classloader.parent-first-patterns.additional: ["io.openlineage."]
+
+ # capture job events
+ execution.job-status-changed-listeners: io.openlineage.flink.listener.OpenLineageJobStatusChangedListenerFactory
+ # capture job stop events
+ execution.attached: true
+ # set namespace to match Flink address
+ execution.job-listener.openlineage.namespace: http://some.host.name:18081
+ # set job name
+ execution.job-listener.openlineage.job-name: flink_examples_stateful
+ ```
+
+- Create `openlineage.yml` file with content like:
+
+ ```yaml KafkaTransport title="openlineage.yml"
+
+ # Send RUNNING event every 1 hour.
+ # Using default interval (1 minute) just floods Kafka with useless RUNNING events.
+ trackingIntervalInSeconds: 3600
+
+ transport:
+ type: kafka
+ topicName: input.runs
+ properties:
+ # should be accessible inside jobmanager container
+ # not using localhost in docker!
+ bootstrap.servers: broker:9092
+ security.protocol: SASL_PLAINTEXT
+ sasl.mechanism: SCRAM-SHA-256
+ # Kafka auth credentials
+ sasl.jaas.config: |
+ org.apache.kafka.common.security.scram.ScramLoginModule required
+ username="data_rentgen"
+ password="changeme";
+ key.serializer: org.apache.kafka.common.serialization.StringSerializer
+ value.serializer: org.apache.kafka.common.serialization.StringSerializer
+ compression.type: zstd
+ acks: all
+ ```
+
+ ```yaml HttpTransport (requires HTTP2Kafka) title="openlineage.yml"
+
+ # Send RUNNING event every 1 hour.
+ # Using default interval (1 minute) just floods Kafka with useless RUNNING events.
+ trackingIntervalInSeconds: 3600
+
+ transport:
+ type: http
+ url: http://http2kafka:8000 # not using localhost in docker
+ endpoint: /v1/openlineage
+ compression: gzip
+ auth:
+ type: api_key
+ # create a PersonalToken, and pass it here
+ apiKey: personal_token_AAAAAAAAAAAA.BBBBBBBBBBBBBBBBBBBBBBB.CCCCCCCCCCCCCCCCCCCCC
+ ```
+
+- Pass path to config file via `OPENLINEAGE_CONFIG` environment variable of `jobmanager`:
+
+ ```bash
+ OPENLINEAGE_CONFIG=/path/to/openlineage.yml
+ ```
+
+At the end, this should look like this (see [Official documentation](https://nightlies.apache.org/flink/flink-docs-release-2.0/docs/deployment/resource-providers/standalone/docker/)):
+
+```yaml title="docker-compose.yml"
+
+services:
+ jobmanager:
+ image: flink:2.0.0-scala_2.12-java11
+ ports:
+ - "18081:8081"
+ # supported both standalone-job and jobmanager
+ command: standalone-job --job-classname my.awesome.FlinkStatefulApplication
+ volumes:
+ - ./artifacts/:/opt/flink/usrlib/ # path to your Flink Job .jar files, if using standalone-job
+ - ./config.yaml:/opt/flink/conf/config.yaml
+ - ./openlineage/jars/:/opt/flink/usrlib/openlineage/
+ - ./openlineage.yml:/opt/flink/conf/openlineage.yml
+ environment:
+ - CLASSPATH=/opt/flink/usrlib/openlineage/
+
+ taskmanager:
+ image: flink:2.0.0-scala_2.12-java11
+ depends_on:
+ - jobmanager
+ command: taskmanager
+ volumes:
+ - ./artifacts/:/opt/flink/usrlib/ # path to your Flink Job .jar files, if using standalone-job
+ - ./config.yaml:/opt/flink/conf/config.yaml
+```
+
+## Collect and send lineage
+
+Just start your Flink job. OpenLineage integration will automatically collect and send lineage to DataRentgen.
+
+## See results
+
+Browse frontend pages [Jobs](http://localhost:3000/jobs) to see what information was extracted by OpenLineage & DataRentgen.
+
+### Job list page
+
+
+
+### Job details page
+
+
+
+### Run details page
+
+
+
+### Dataset level lineage
+
+
+
+### Job level lineage
+
+
+
+### Run level lineage
+
+
diff --git a/mddocs/integrations/hive/dataset_lineage.png b/mddocs/integrations/hive/dataset_lineage.png
new file mode 100644
index 00000000..9ac0470c
Binary files /dev/null and b/mddocs/integrations/hive/dataset_lineage.png differ
diff --git a/mddocs/integrations/hive/index.md b/mddocs/integrations/hive/index.md
new file mode 100644
index 00000000..d549debc
--- /dev/null
+++ b/mddocs/integrations/hive/index.md
@@ -0,0 +1,247 @@
+# Apache Hive integration { #overview-setup-hive }
+
+Using [OpenLineage integration with Apache Hive](https://openlineage.io/docs/integrations/hive/).
+
+## Requirements
+
+- [Apache Hive](https://hive.apache.org/) 3.1.3 (4.0 is not yet supported)
+- OpenLineage 1.34.0 or higher, recommended 1.40.1+
+- Running [message-broker][message-broker]
+- (Optional) [http2kafka][http2kafka]
+
+## Limitations
+
+- **Hive CLI** is not supported. HiveServer2 is required.
+
+- As for OpenLineage 1.40.1 version only these queries are parsed as containing lineage:
+
+ - `CREATE TABLE .. AS SELECT ...`
+ - `INSERT INTO ... SELECT ...`
+
+ Other query types are ignored by OpenLineage integration, including:
+
+ - `CREATE TABLE ...`, `ALTER TABLE ...`, `TRUNCATE TABLE ...`, `DROP TABLE ...`.
+ - `INSERT INTO ... VALUES ...`, `UPDATE`, `DELETE`, `MERGE`.
+ - `LOAD DATA`, `EXPORT`, `IMPORT`.
+ - `SELECT` data directly to JDBC client.
+
+- Hive sends events when user session started, but not when stopped. So all Hive sessions in Data.Rentgen are in `STARTED` status.
+
+- Currently there is no way to pass job tags, [see issue](https://github.com/OpenLineage/OpenLineage/issues/4280)
+
+## Entity mapping
+
+- Hive user + user IP → Data.Rentgen Job
+- Hive session → Data.Rentgen Run
+- Hive query → Data.Rentgen Operation
+
+## Installation
+
+Download these jars and place them in `/path/to/jars/` directory on the HiveServer2 machine:
+
+- KafkaTransport:
+
+ - [openlineage-java](https://mvnrepository.com/artifact/io.openlineage/openlineage-java)
+ - [openlineage-hive](https://mvnrepository.com/artifact/io.openlineage/openlineage-hive)
+ - [kafka-clients](https://mvnrepository.com/artifact/org.apache.kafka/kafka-clients)
+ - [zstd-jni](https://mvnrepository.com/artifact/com.github.luben/zstd-jni)
+
+- HttpTransport (requires HTTP2Kafka):
+
+ - [openlineage-hive](https://mvnrepository.com/artifact/io.openlineage/openlineage-hive)
+
+## Setup
+
+Change `hive-site.xml` configuration file:
+
+```xml KafkaTransport title="hive-site.xml"
+
+
+
+
+ hive.conf.validation
+ false
+
+
+
+
+ hive.aux.jars.path
+ /path/to/jars/
+
+
+
+
+ hive.server2.session.hook
+ io.openlineage.hive.hooks.HiveOpenLineageHook
+
+
+ hive.exec.post.hooks
+ io.openlineage.hive.hooks.HiveOpenLineageHook
+
+
+ hive.exec.failure.hooks
+ io.openlineage.hive.hooks.HiveOpenLineageHook
+
+
+
+
+ hive.openlineage.transport.type
+ kafka
+
+
+ hive.openlineage.transport.topicName
+ input.runs
+
+
+ hive.openlineage.transport.properties.bootstrap.servers
+
+ localhost:9093
+
+
+ hive.openlineage.transport.properties.security.protocol
+ SASL_PLAINTEXT
+
+
+ hive.openlineage.transport.properties.sasl.mechanism
+ SCRAM-SHA-256
+
+
+ hive.openlineage.transport.properties.sasl.jaas.config
+ org.apache.kafka.common.security.scram.ScramLoginModule required username="data_rentgen" password="changeme";
+
+
+ hive.openlineage.transport.properties.key.serializer
+ org.apache.kafka.common.serialization.StringSerializer
+
+
+ hive.openlineage.transport.properties.value.serializer
+ org.apache.kafka.common.serialization.StringSerializer
+
+
+ hive.openlineage.transport.properties.compression.type
+ zstd
+
+
+ hive.openlineage.transport.properties.acks
+ all
+
+
+
+
+ hive.openlineage.namespace
+ hive://my.hive.host:10000
+
+
+```
+
+```xml HttpTransport (requires HTTP2Kafka) title="hive-site.xml"
+
+
+
+
+ hive.conf.validation
+ false
+
+
+
+
+ hive.aux.jars.path
+ /path/to/jars/
+
+
+
+
+ hive.server2.session.hook
+ io.openlineage.hive.hooks.HiveOpenLineageHook
+
+
+ hive.exec.post.hooks
+ io.openlineage.hive.hooks.HiveOpenLineageHook
+
+
+ hive.exec.failure.hooks
+ io.openlineage.hive.hooks.HiveOpenLineageHook
+
+
+
+
+ hive.openlineage.transport.type
+ http
+
+
+ hive.openlineage.transport.url
+
+ http://localhost:8002
+
+
+ hive.openlineage.transport.endpoint
+ /v1/openlineage
+
+
+ hive.openlineage.transport.compression
+ gzip
+
+
+ hive.openlineage.transport.auth.type
+ api_key
+
+
+ hive.openlineage.transport.auth.apiKey
+
+ personal_token_AAAAAAAAAAAA.BBBBBBBBBBBBBBBBBBBBBBB.CCCCCCCCCCCCCCCCCCCCC
+
+
+
+
+ hive.openlineage.namespace
+ hive://my.hive.host:10000
+
+
+```
+
+## Collect and send lineage
+
+Connect to your HiveServer2 instance's JDBC interface, e.g. using `beeline` or DBeaver.
+After a query is executed, the integration will send lineage events to DataRentgen.
+
+!!! note
+ By default, Job is created with name `{username}@{clientIp}`. You can override this name by executing this statement:
+ ```sql
+ SET hive.openlineage.job.name=my_session_name;
+ ```
+
+## See results
+
+Browse frontend pages [Jobs](http://localhost:3000/jobs) to see what information was extracted by OpenLineage & DataRentgen.
+
+### Job list page
+
+
+
+### Job details page
+
+
+
+### Run details page
+
+
+
+### Operation details page
+
+
+
+### Dataset level lineage
+
+
+
+### Job level lineage
+
+
+
+### Run level lineage
+
+
+
+### Operation level lineage
+
+
diff --git a/mddocs/integrations/hive/job_details.png b/mddocs/integrations/hive/job_details.png
new file mode 100644
index 00000000..8d39f523
Binary files /dev/null and b/mddocs/integrations/hive/job_details.png differ
diff --git a/mddocs/integrations/hive/job_lineage.png b/mddocs/integrations/hive/job_lineage.png
new file mode 100644
index 00000000..102b9537
Binary files /dev/null and b/mddocs/integrations/hive/job_lineage.png differ
diff --git a/mddocs/integrations/hive/job_list.png b/mddocs/integrations/hive/job_list.png
new file mode 100644
index 00000000..2512bff6
Binary files /dev/null and b/mddocs/integrations/hive/job_list.png differ
diff --git a/mddocs/integrations/hive/operation_details.png b/mddocs/integrations/hive/operation_details.png
new file mode 100644
index 00000000..71d41d0c
Binary files /dev/null and b/mddocs/integrations/hive/operation_details.png differ
diff --git a/mddocs/integrations/hive/operation_lineage.png b/mddocs/integrations/hive/operation_lineage.png
new file mode 100644
index 00000000..b1076147
Binary files /dev/null and b/mddocs/integrations/hive/operation_lineage.png differ
diff --git a/mddocs/integrations/hive/run_details.png b/mddocs/integrations/hive/run_details.png
new file mode 100644
index 00000000..6d39f98d
Binary files /dev/null and b/mddocs/integrations/hive/run_details.png differ
diff --git a/mddocs/integrations/hive/run_lineage.png b/mddocs/integrations/hive/run_lineage.png
new file mode 100644
index 00000000..c7492944
Binary files /dev/null and b/mddocs/integrations/hive/run_lineage.png differ
diff --git a/mddocs/integrations/spark/dataset_column_lineage.png b/mddocs/integrations/spark/dataset_column_lineage.png
new file mode 100644
index 00000000..3994f6f7
Binary files /dev/null and b/mddocs/integrations/spark/dataset_column_lineage.png differ
diff --git a/mddocs/integrations/spark/dataset_lineage.png b/mddocs/integrations/spark/dataset_lineage.png
new file mode 100644
index 00000000..d4ea7fba
Binary files /dev/null and b/mddocs/integrations/spark/dataset_lineage.png differ
diff --git a/mddocs/integrations/spark/index.md b/mddocs/integrations/spark/index.md
new file mode 100644
index 00000000..68b79eae
--- /dev/null
+++ b/mddocs/integrations/spark/index.md
@@ -0,0 +1,335 @@
+# Apache Spark integration { #overview-setup-spark }
+
+Using [OpenLineage integration with Apache Spark](https://openlineage.io/docs/integrations/spark/).
+
+## Requirements
+
+- [Apache Spark](https://spark.apache.org/) 3.x or higher
+- OpenLineage 1.23.0 or higher, recommended 1.40.1+
+- Running [message-broker][message-broker]
+- (Optional) [http2kafka][http2kafka]
+
+## Entity mapping
+
+- Spark applicationName → Data.Rentgen Job
+- Spark applicationId → Data.Rentgen Run
+- Spark job, execution, RDD → Data.Rentgen Operation
+
+## Setup
+
+### Via OpenLineage config file
+
+- Create `openlineage.yml` file with content like:
+
+ ```yaml KafkaTransport title="openlineage.yml"
+ transport:
+ type: kafka
+ topicName: input.runs
+ properties:
+ # should be accessible from Spark driver
+ bootstrap.servers: localhost:9093
+ security.protocol: SASL_PLAINTEXT
+ sasl.mechanism: SCRAM-SHA-256
+ sasl.jaas.config: |
+ org.apache.kafka.common.security.scram.ScramLoginModule required
+ username="data_rentgen"
+ password="changeme";
+ key.serializer: org.apache.kafka.common.serialization.StringSerializer
+ value.serializer: org.apache.kafka.common.serialization.StringSerializer
+ compression.type: zstd
+ acks: all
+ ```
+
+ ```yaml HttpTransport (requires HTTP2Kafka) title="openlineage.yml"
+ transport:
+ type: http
+ # http2kafka URL, should be accessible from Spark driver
+ url: http://localhost:8002
+ endpoint: /v1/openlineage
+ compression: gzip
+ auth:
+ type: api_key
+ # create a PersonalToken, and pass it here
+ apiKey: personal_token_AAAAAAAAAAAA.BBBBBBBBBBBBBBBBBBBBBBB.CCCCCCCCCCCCCCCCCCCCC
+ ```
+
+- Pass path to config file via `OPENLINEAGE_CONFIG` environment variable:
+
+ ```bash
+ OPENLINEAGE_NAMESPACE=local://hostname.as.fqdn
+ # set here location of Spark session, e.g. current host, YARN cluster or K8s cluster:
+ OPENLINEAGE_CONFIG=/path/to/openlineage.yml
+ #OPENLINEAGE_NAMESPACE=yarn://some-cluster
+ #OPENLINEAGE_NAMESPACE=k8s://some-cluster
+ ```
+
+- Setup `OpenLineageSparkListener` via SparkSession config:
+
+```python title="etl.py"
+from pyspark.sql import SparkSession
+
+spark = (
+ SparkSession.builder
+ # install OpenLineage integration and Kafka client
+ .config(
+ "spark.jars.packages",
+ # For KafkaTransport
+ "io.openlineage:openlineage-spark_2.12:1.34.0,org.apache.kafka:kafka-clients:3.9.0",
+ # For HttpTransport
+ #"io.openlineage:openlineage-spark_2.12:1.40.1",
+ )
+ .config(
+ "spark.extraListeners",
+ "io.openlineage.spark.agent.OpenLineageSparkListener"
+ )
+ # set Spark session master & applicationName
+ .master("local")
+ .appName("mysession")
+ # few other important options
+ .config("spark.openlineage.jobName.appendDatasetName", "false")
+ .config("spark.openlineage.columnLineage.datasetLineageEnabled", "true")
+ .getOrCreate()
+)
+```
+
+### Via `SparkSession` config
+
+Add OpenLineage integration package, setup `OpenLineageSparkListener` in SparkSession config:
+
+```python KafkaTransport title="etl.py"
+from pyspark.sql import SparkSession
+
+spark = (
+ SparkSession.builder
+ # install OpenLineage integration and Kafka client
+ .config(
+ "spark.jars.packages",
+ "io.openlineage:openlineage-spark_2.12:1.40.1,org.apache.kafka:kafka-clients:3.9.0",
+ )
+ .config(
+ "spark.extraListeners", "io.openlineage.spark.agent.OpenLineageSparkListener"
+ )
+ # set Spark session master & applicationName
+ .master("local")
+ .appName("mysession")
+ # set here location of Spark session, e.g. current host, YARN cluster or K8s cluster:
+ .config("spark.openlineage.namespace", "local://hostname.as.fqdn")
+ # .config("spark.openlineage.namespace", "yarn://some-cluster")
+ # .config("spark.openlineage.namespace", "k8s://some-cluster")
+ .config("spark.openlineage.transport.type", "kafka")
+ # set here Kafka connection address & credentials
+ .config("spark.openlineage.transport.topicName", "input.runs")
+ .config(
+ # should be accessible from Spark driver
+ "spark.openlineage.transport.properties.bootstrap.servers",
+ "localhost:9093",
+ )
+ .config(
+ "spark.openlineage.transport.properties.security.protocol",
+ "SASL_PLAINTEXT",
+ )
+ .config(
+ "spark.openlineage.transport.properties.sasl.mechanism",
+ "SCRAM-SHA-256",
+ )
+ .config(
+ "spark.openlineage.transport.properties.sasl.jaas.config",
+ 'org.apache.kafka.common.security.scram.ScramLoginModule required username="data_rentgen" password="changeme";',
+ )
+ .config("spark.openlineage.transport.properties.acks", "all")
+ .config(
+ "spark.openlineage.transport.properties.key.serializer",
+ "org.apache.kafka.common.serialization.StringSerializer",
+ )
+ .config(
+ "spark.openlineage.transport.properties.value.serializer",
+ "org.apache.kafka.common.serialization.StringSerializer",
+ )
+ .config("spark.openlineage.transport.properties.compression.type", "zstd")
+ # few other important options
+ .config("spark.openlineage.jobName.appendDatasetName", "false")
+ .config("spark.openlineage.columnLineage.datasetLineageEnabled", "true")
+ .getOrCreate()
+)
+```
+
+```python HttpTransport (requires HTTP2Kafka) title="etl.py"
+from pyspark.sql import SparkSession
+
+spark = (
+ SparkSession.builder
+ # install OpenLineage integration and Kafka client
+ .config(
+ "spark.jars.packages",
+ "io.openlineage:openlineage-spark_2.12:1.40.1",
+ )
+ .config(
+ "spark.extraListeners", "io.openlineage.spark.agent.OpenLineageSparkListener"
+ )
+ # set Spark session master & applicationName
+ .master("local")
+ .appName("mysession")
+ # set here location of Spark session, e.g. current host, YARN cluster or K8s cluster:
+ .config("spark.openlineage.namespace", "local://hostname.as.fqdn")
+ # .config("spark.openlineage.namespace", "yarn://some-cluster")
+ # .config("spark.openlineage.namespace", "k8s://some-cluster")
+ .config("spark.openlineage.transport.type", "http")
+ # http2kafka url, should be accessible from Spark driver
+ .config("spark.openlineage.transport.url", "http://localhost:8002")
+ .config("spark.openlineage.transport.endpoint", "/v1/openlineage")
+ .config("spark.openlineage.transport.compression", "gzip")
+ .config("spark.openlineage.transport.auth.type", "api_key")
+ .config(
+ #Create a PersonalToken, and pass it here
+ "spark.openlineage.transport.auth.apiKey",
+ "personal_token_AAAAAAAAAAAA.BBBBBBBBBBBBBBBBBBBBBBB.CCCCCCCCCCCCCCCCCCCCC",
+ )
+ # few other important options
+ .config("spark.openlineage.jobName.appendDatasetName", "false")
+ .config("spark.openlineage.columnLineage.datasetLineageEnabled", "true")
+ .getOrCreate()
+)
+```
+
+## Collect and send lineage
+
+- Use `SparkSession` as context manager, to properly catch session stop events:
+
+```python title="etl.py"
+with SparkSession.builder.getOrCreate() as spark:
+ # work with spark inside this context
+```
+
+- Perform some data operations using Spark, like:
+
+```python title="etl.py"
+df = spark.read.format("jdbc").options(...).load()
+df.write.format("csv").save("/output/path")
+```
+
+Lineage will be sent to Data.Rentgen automatically by `OpenLineageSparkListener`.
+
+## See results
+
+Browse frontend page [Jobs](http://localhost:3000/jobs)
+to see what information was extracted by OpenLineage & DataRentgen.
+
+### Job list page
+
+
+
+### Job details page
+
+
+
+### Run details page
+
+
+
+### Operation details page
+
+
+
+### Dataset level lineage
+
+
+
+
+
+### Job level lineage
+
+
+
+### Run level lineage
+
+
+
+### Operation level lineage
+
+
+
+## Extra configuration
+
+### Collecting job tags
+
+By default, following job tags are created:
+
+- `spark.version`
+- `openlineage_adapter.version`
+
+It is possible to provide custom job tags using OpenLineage configuration:
+
+```yaml title="openlineage.yaml"
+jobs:
+ tags:
+ - environment:production
+ - layer:bronze
+```
+
+```python title="etl.py"
+
+SparkSession.builder.config("spark.openlineage.job.tags", "environment:production;layer:bronze")
+```
+
+## Binding Airflow Task with Spark application
+
+If OpenLineage event contains [Parent Run facet](https://openlineage.io/docs/spec/facets/run-facets/parent_run/),
+DataRentgen can use this information to bind Spark application to the run it was triggered by, e.g. Airflow task:
+
+
+
+To fill up this facet, it is required to:
+
+- Setup OpenLineage integration for Spark
+- Setup OpenLineage integration for Airflow
+- [Pass parent Run info from Airflow to Spark](https://openlineage.io/docs/integrations/spark/configuration/airflow):
+
+``` python title="dag.py"
+def my_etl(
+ parent_job_namespace: str,
+ parent_job_name: str,
+ parent_run_id: str,
+ root_job_namespace: str,
+ root_job_name: str,
+ root_run_id: str,
+):
+ spark = (
+ SparkSession.builder
+ # install OpenLineage integration (see above)
+ # Pass parent Run info from Airflow to Spark
+ .config("spark.openlineage.parentJobNamespace", parent_job_namespace)
+ .config("spark.openlineage.parentJobName", parent_job_name)
+ .config("spark.openlineage.parentRunId", parent_run_id)
+ .config("spark.openlineage.rootJobNamespace", root_job_namespace)
+ .config("spark.openlineage.rootJobName", root_job_name)
+ .config("spark.openlineage.rootRunId", root_run_id)
+ .getOrCreate()
+ )
+
+ with spark:
+ # actual ETL code
+
+
+from airflow.providers.standard.operators.python import PythonOperator
+
+task = PythonOperator(
+ task_id="spark_etl",
+ python_callable=my_etl,
+ # Using Jinja templates to pass Airflow macros to Python function
+ op_kwargs={
+ "parent_job_namespace": "{{ macros.OpenLineageProviderPlugin.lineage_job_namespace() }}",
+ "parent_job_name": "{{ macros.OpenLineageProviderPlugin.lineage_job_name(task_instance) }}",
+ "parent_run_id": "{{ macros.OpenLineageProviderPlugin.lineage_run_id(task_instance) }}",
+ # For apache-airflow-providers-openlineage 2.4.0 or above
+ "root_job_namespace": "{{ macros.OpenLineageProviderPlugin.lineage_root_job_namespace(task_instance) }}",
+ "root_job_name": "{{ macros.OpenLineageProviderPlugin.lineage_root_job_name(task_instance) }}",
+ "root_run_id": "{{ macros.OpenLineageProviderPlugin.lineage_root_run_id(task_instance) }}",
+ },
+)
+```
+
+The exact way of substituting Airflow macros into the SparkSession config may differ depending on the Airflow operator used:
+
+- PythonOperator - via kwargs & [Airflow macros](https://airflow.apache.org/docs/apache-airflow-providers-openlineage/stable/macros.html#lineage-job-run-macros)
+- BashOperator, SSHOperator, KubernetesPodOperator - via environment variables & Airflow macros
+- SparkSubmitOperator - via [spark_inject_parent_job_info=true in airflow.conf](https://openlineage.io/docs/integrations/spark/configuration/airflow#automatic-injection)
diff --git a/mddocs/integrations/spark/job_details.png b/mddocs/integrations/spark/job_details.png
new file mode 100644
index 00000000..108b7974
Binary files /dev/null and b/mddocs/integrations/spark/job_details.png differ
diff --git a/mddocs/integrations/spark/job_lineage.png b/mddocs/integrations/spark/job_lineage.png
new file mode 100644
index 00000000..54d2935d
Binary files /dev/null and b/mddocs/integrations/spark/job_lineage.png differ
diff --git a/mddocs/integrations/spark/job_list.png b/mddocs/integrations/spark/job_list.png
new file mode 100644
index 00000000..82107fb9
Binary files /dev/null and b/mddocs/integrations/spark/job_list.png differ
diff --git a/mddocs/integrations/spark/operation_details.png b/mddocs/integrations/spark/operation_details.png
new file mode 100644
index 00000000..4d99b5b1
Binary files /dev/null and b/mddocs/integrations/spark/operation_details.png differ
diff --git a/mddocs/integrations/spark/operation_lineage.png b/mddocs/integrations/spark/operation_lineage.png
new file mode 100644
index 00000000..5351f1a8
Binary files /dev/null and b/mddocs/integrations/spark/operation_lineage.png differ
diff --git a/mddocs/integrations/spark/run_details.png b/mddocs/integrations/spark/run_details.png
new file mode 100644
index 00000000..9ffec708
Binary files /dev/null and b/mddocs/integrations/spark/run_details.png differ
diff --git a/mddocs/integrations/spark/run_lineage.png b/mddocs/integrations/spark/run_lineage.png
new file mode 100644
index 00000000..d355d945
Binary files /dev/null and b/mddocs/integrations/spark/run_lineage.png differ
diff --git a/mddocs/personal_tokens/create.png b/mddocs/personal_tokens/create.png
new file mode 100644
index 00000000..2254e1a5
Binary files /dev/null and b/mddocs/personal_tokens/create.png differ
diff --git a/mddocs/personal_tokens/created.png b/mddocs/personal_tokens/created.png
new file mode 100644
index 00000000..666eb662
Binary files /dev/null and b/mddocs/personal_tokens/created.png differ
diff --git a/mddocs/personal_tokens/empty_list.png b/mddocs/personal_tokens/empty_list.png
new file mode 100644
index 00000000..f7d65798
Binary files /dev/null and b/mddocs/personal_tokens/empty_list.png differ
diff --git a/mddocs/personal_tokens/index.md b/mddocs/personal_tokens/index.md
new file mode 100644
index 00000000..108a0f07
--- /dev/null
+++ b/mddocs/personal_tokens/index.md
@@ -0,0 +1,144 @@
+# Personal Tokens { #personal-tokens }
+
+Different [AuthProviders][auth-server] use different ways to authorize a user - via cookie, via short-lived access token, and so on.
+This is okay for navigating the UI using a browser, but not so convenient for accessing API endpoints from a script.
+
+For example, OpenLineage is usually integrated into long-running ETL processes, and it cannot make additional requests to the `/auth` endpoint,
+refresh access tokens, store cookies, follow redirects from Keycloak, and so on.
+The only way OpenLineage can authorize against the API is to use long-lived tokens (`Bearer` auth).
+
+This is why Data.Rentgen allows the use of Personal Tokens for authorization. These tokens have higher priority than AuthProvider, and can be used
+with any API endpoint.
+
+Token is valid if:
+
+- It is not expired. Max token duration is 1 year (configurable).
+- It is not revoked.
+
+Personal tokens can be disabled in [Server config][auth-server-personal-tokens].
+
+## Managing personal tokens
+
+### Create token
+
+Go to user menu -> `Personal tokens`:
+
+
+
+Click on `Create`:
+
+
+
+Enter token name (mandatory) and expiration date (optional, max 1 year):
+
+
+
+After token is created, its content will be copied to a clipboard:
+
+
+
+Token content looks like this:
+
+```text
+personal_token_AAAAAAAAAAA.BBBBBBBBBBBBBB.CCCCCCCCCCCC
+```
+
+**It is important to save this value into some secret place. If token content is lost, it cannot be generated again.**
+
+### Refresh token
+
+Token can be refreshed, to update its expiration date. Go to tokens list:
+
+
+
+
+
+And then click on `Refresh`:
+
+
+
+The only field which can be changed here is `Until`. It can be set manually, or be reset to use the max available until date (now + 1 year).
+
+After token is refreshed, its content will be copied to a clipboard. Please save it to a secret place, and replace old token value with a new one.
+
+### Revoke token
+
+Token can be revoked if it is not needed anymore or if it was leaked/compromised.
+
+To do this, navigate to tokens list:
+
+
+
+
+
+Click `Revoke` button, and then on `Confirm`:
+
+
+
+After token is revoked, it cannot be used anymore.
+
+## Using personal tokens
+
+### With OpenLineage
+
+!!! note
+    Due to OpenLineage limitations, [http2kafka][http2kafka] can be used only with Personal Tokens, and no other auth methods are supported.
+
+
+Use [OpenLineage HTTPTransport](https://openlineage.io/docs/client/python#http-transport):
+
+```yaml title="openlineage.yaml"
+transport:
+ type: http
+ url: http://localhost:8002 # http2kafka URL
+ endpoint: /v1/openlineage
+ compression: gzip
+ auth:
+ type: api_key
+  # replace with your token
+ apiKey: personal_token_AAAAAAAAAAAA.BBBBBBBBBBBBBBBBBBBBBBB.CCCCCCCCCCCCCCCCCCCCC
+```
+
+```ini title="spark.conf"
+spark.openlineage.transport.type=http
+spark.openlineage.transport.url=http://localhost:8002 # http2kafka URL
+spark.openlineage.transport.endpoint=/v1/openlineage
+spark.openlineage.transport.compression=gzip
+spark.openlineage.transport.auth.type=api_key
+# replace with your token
+spark.openlineage.transport.auth.apiKey=personal_token_AAAAAAAAAAAA.BBBBBBBBBBBBBBBBBBBBBBB.CCCCCCCCCCCCCCCCCCCCC
+```
+
+### With Swagger UI
+
+Open <http://localhost:8000/docs> (API) or <http://localhost:8002/docs> (HTTP2Kafka), and click on the `Authorize` button:
+
+
+
+Enter token content into the `HTTPBearer` auth method, and click `Authorize`:
+
+
+### With `curl`
+
+```bash
+curl -XGET http://localhost:8000/v1/datasets -H 'Authorization: Bearer personal_token_AAAAAAAAAAAA.BBBBBBBBBBBBBBBBBBBBBBB.CCCCCCCCCCCCCCCCCCCCC'
+```
+
+### With `requests`
+
+```python
+import requests
+
+personal_token = "personal_token_AAAAAAAAAAAA.BBBBBBBBBBBBBBBBBBBBBBB.CCCCCCCCCCCCCCCCCCCCC"
+
+with requests.Session() as session:
+    response = session.get(
+        url="http://localhost:8000/v1/datasets",
+        headers={
+            "Authorization": f"Bearer {personal_token}",
+        },
+    )
+    response.raise_for_status()
+```
diff --git a/mddocs/personal_tokens/list.png b/mddocs/personal_tokens/list.png
new file mode 100644
index 00000000..92905ccf
Binary files /dev/null and b/mddocs/personal_tokens/list.png differ
diff --git a/mddocs/personal_tokens/refresh.png b/mddocs/personal_tokens/refresh.png
new file mode 100644
index 00000000..13203b67
Binary files /dev/null and b/mddocs/personal_tokens/refresh.png differ
diff --git a/mddocs/personal_tokens/revoke.png b/mddocs/personal_tokens/revoke.png
new file mode 100644
index 00000000..542c6da9
Binary files /dev/null and b/mddocs/personal_tokens/revoke.png differ
diff --git a/mddocs/personal_tokens/swagger_header.png b/mddocs/personal_tokens/swagger_header.png
new file mode 100644
index 00000000..56537884
Binary files /dev/null and b/mddocs/personal_tokens/swagger_header.png differ
diff --git a/mddocs/personal_tokens/swagger_httpbearer.png b/mddocs/personal_tokens/swagger_httpbearer.png
new file mode 100644
index 00000000..dbd6fa3c
Binary files /dev/null and b/mddocs/personal_tokens/swagger_httpbearer.png differ
diff --git a/mddocs/personal_tokens/user_menu.png b/mddocs/personal_tokens/user_menu.png
new file mode 100644
index 00000000..5cd005c3
Binary files /dev/null and b/mddocs/personal_tokens/user_menu.png differ
diff --git a/mddocs/reference/architecture.md b/mddocs/reference/architecture.md
new file mode 100644
index 00000000..7cb2c1de
--- /dev/null
+++ b/mddocs/reference/architecture.md
@@ -0,0 +1,65 @@
+# Architecture { #Architecture }
+
+## Components
+
+Data.Rentgen is built using the following components:
+
+- [OpenLineage](https://openlineage.io/docs/) clients & integrations with third-party modules (e.g. Apache Spark, Apache Airflow).
+- [`message-broker`][message-broker], receiving events in JSON format.
+- [`message-consumer`][message-consumer], parsing JSON messages.
+- [`database`][database] for storing consumed & cleaned up data.
+- [`server`][server], serving database data.
+- [`frontend`][frontend], accessing REST API to navigate created entities & lineage graph.
+- [`http2kafka`][http2kafka] (optional), proxy for sending OpenLineage events to Kafka using HTTP API.
+
+## Architecture diagram
+
+```plantuml
+
+ @startuml
+    title Data.Rentgen architecture
+ skinparam componentStyle rectangle
+ left to right direction
+
+ frame "Data.Rentgen" {
+ queue "Kafka" as KAFKA
+ component "Message consumer" as CONSUMER
+ database "PostgreSQL" as DB
+ component "REST API server" as API
+ component "Frontend" as FRONTEND
+ component "HTTP2Kafka" as HTTP2KAFKA
+ }
+
+ frame "OpenLineage" {
+ agent "OpenLineage Spark" as SPARK
+ agent "OpenLineage Airflow" as AIRFLOW
+ agent "OpenLineage Hive" as HIVE
+ agent "OpenLineage Flink" as FLINK
+ agent "OpenLineage dbt" as DBT
+ agent "OpenLineage other" as OTHER
+ agent "OpenLineage KafkaTransport" as KAFKA_TRANSPORT
+ agent "OpenLineage HttpTransport" as HTTP_TRANSPORT
+ }
+
+ actor "User" as USER
+
+ [SPARK] --> [KAFKA_TRANSPORT]
+ [AIRFLOW] --> [KAFKA_TRANSPORT]
+ [HIVE] --> [KAFKA_TRANSPORT]
+ [FLINK] --> [KAFKA_TRANSPORT]
+ [DBT] --> [KAFKA_TRANSPORT]
+ [KAFKA_TRANSPORT] --> [KAFKA]
+
+ [OTHER] --> [HTTP_TRANSPORT]
+ [HTTP_TRANSPORT] --> [HTTP2KAFKA]
+ [HTTP2KAFKA] --> [KAFKA]
+
+ [KAFKA] --> [CONSUMER]
+ [CONSUMER] --> [DB]
+
+ [API] --> [DB]
+ [FRONTEND] --> [API]
+ [USER] --> [FRONTEND]
+
+ @enduml
+```
diff --git a/mddocs/reference/broker/index.md b/mddocs/reference/broker/index.md
new file mode 100644
index 00000000..16cc0b06
--- /dev/null
+++ b/mddocs/reference/broker/index.md
@@ -0,0 +1,61 @@
+# Message Broker { #message-broker }
+
+Message broker is a component used by OpenLineage to store all received events. Then these events are handled by [`message-consumer`][message-consumer], in batches.
+
+Currently, Data.Rentgen supports only [Apache Kafka](https://kafka.apache.org/) as message broker.
+
+## Why Kafka?
+
+Other popular OpenLineage server implementations use HTTP protocol for receiving events. In our experience, Kafka is much superior for this case:
+
+- Kafka is designed to be scalable. If performance level is not enough, just add another broker to the cluster. For HTTP servers it's not that simple,
+ as this requires load balancing on reverse proxy side or DNS side.
+- Kafka is designed to receive A LOT of events per second, like millions, and store them on disk as fast as possible. So no events are lost
+ even if [`message-consumer`][message-consumer] is overloaded - events are already on disk, and will be handled later.
+- ETL scripts are mostly run on a schedule. The usual pattern is almost zero events during the day, but huge spikes at every whole hour
+ (e.g. at 00:00, 01:00, 03:00, 12:00). Kafka is used as an intermediate buffer which smooths these spikes.
+- Events stored in Kafka can be read in batches, even if OpenLineage integration initially send them one-by-one.
+ Batching gives x10 more performance than handling individual events.
+- HTTP/HTTPS protocol have higher latency than Kafka TCP protocol. Some OpenLineage integrations are sensitive to latency - for example,
+ [Flink job listener documentation](https://nightlies.apache.org/flink/flink-docs-master/api/java/org/apache/flink/core/execution/JobListener.html)
+ explicitly says: *If you block the thread the invoker of environment execute methods is possibly blocked*. The less time required for sending response, the better.
+
+## Requirements
+
+- Apache Kafka 3.x. It is recommended to use latest Kafka version.
+
+### Setup
+
+#### With Docker
+
+- Install [Docker](https://docs.docker.com/engine/install/)
+
+- Install [docker-compose](https://github.com/docker/compose/releases/)
+
+- Run the following command:
+
+ ```console
+ $ docker compose --profile broker up -d --wait
+ ...
+ ```
+
+ `docker-compose` will download Apache Kafka image, create container and volume, and then start container.
+
+ Image entrypoint will create database if volume is empty.
+ Options can be set via `.env` file or `environment` section in `docker-compose.yml`
+
+### "docker-compose.yml"
+
+--8<--
+docker-compose.yml:101:117, 177
+--8<--
+
+### ".env.docker"
+
+--8<--
+.env.docker:7:20
+--8<--
+
+#### Without Docker
+
+Please follow [Apache Kafka installation instruction](https://kafka.apache.org/quickstart#quickstart_startserver).
diff --git a/mddocs/reference/consumer/configuration/consumer-specific.md b/mddocs/reference/consumer/configuration/consumer-specific.md
new file mode 100644
index 00000000..c913085a
--- /dev/null
+++ b/mddocs/reference/consumer/configuration/consumer-specific.md
@@ -0,0 +1,3 @@
+# Consumer-specific settings { #configuration-consumer-specific }
+
+::: data_rentgen.consumer.settings.consumer.ConsumerSettings
diff --git a/mddocs/reference/consumer/configuration/index.md b/mddocs/reference/consumer/configuration/index.md
new file mode 100644
index 00000000..f89950b1
--- /dev/null
+++ b/mddocs/reference/consumer/configuration/index.md
@@ -0,0 +1,11 @@
+# Consumer configuration { #configuration-consumer }
+
+::: data_rentgen.consumer.settings.ConsumerApplicationSettings
+ options:
+ docstring_style: sphinx
+ members:
+ - database
+ - logging
+ - kafka
+ - consumer
+ - producer
diff --git a/mddocs/reference/consumer/configuration/kafka.md b/mddocs/reference/consumer/configuration/kafka.md
new file mode 100644
index 00000000..273cee5c
--- /dev/null
+++ b/mddocs/reference/consumer/configuration/kafka.md
@@ -0,0 +1,13 @@
+# Kafka settings { #configuration-consumer-kafka }
+
+::: data_rentgen.consumer.settings.kafka.KafkaSettings
+
+::: data_rentgen.consumer.settings.security.scram.KafkaSecurityScram256Settings
+
+::: data_rentgen.consumer.settings.security.scram.KafkaSecurityScram512Settings
+
+::: data_rentgen.consumer.settings.security.plain.KafkaSecurityPlaintextSettings
+
+::: data_rentgen.consumer.settings.security.gssapi.KafkaSecurityGSSAPISettings
+
+::: data_rentgen.consumer.settings.security.anonymous.KafkaSecurityAnonymousSettings
diff --git a/mddocs/reference/consumer/configuration/logging.md b/mddocs/reference/consumer/configuration/logging.md
new file mode 100644
index 00000000..e91d6684
--- /dev/null
+++ b/mddocs/reference/consumer/configuration/logging.md
@@ -0,0 +1,7 @@
+# Logging settings { #configuration-consumer-logging }
+
+::: data_rentgen.logging.settings.LoggingSettings
+ options:
+ members:
+ - setup
+ - preset
diff --git a/mddocs/reference/consumer/configuration/producer-specific.md b/mddocs/reference/consumer/configuration/producer-specific.md
new file mode 100644
index 00000000..75a73bf5
--- /dev/null
+++ b/mddocs/reference/consumer/configuration/producer-specific.md
@@ -0,0 +1,3 @@
+# Producer-specific settings { #configuration-producer-specific }
+
+::: data_rentgen.consumer.settings.producer.ProducerSettings
diff --git a/mddocs/reference/consumer/index.md b/mddocs/reference/consumer/index.md
new file mode 100644
index 00000000..c233408e
--- /dev/null
+++ b/mddocs/reference/consumer/index.md
@@ -0,0 +1,80 @@
+# Message Consumer { #message-consumer }
+
+Data.Rentgen fetches messages from a [`message-broker`][message-broker] using a [FastStream](https://faststream.airt.ai) based consumer, parses incoming messages, and creates all parsed entities in the [`database`][database]. Malformed messages are sent back to the broker, to a different topic.
+
+## Install & run
+
+### With docker
+
+- Install [Docker](https://docs.docker.com/engine/install/)
+
+- Install [docker-compose](https://github.com/docker/compose/releases/)
+
+- Run the following command:
+
+ ```console
+ $ docker compose --profile consumer up -d --wait
+ ...
+ ```
+
+ `docker-compose` will download all necessary images, create containers, and then start consumer process.
+
+ Options can be set via `.env` file or `environment` section in `docker-compose.yml`
+
+### "docker-compose.yml"
+
+--8<--
+docker-compose.yml:120:138
+--8<--
+
+### ".env.docker"
+
+--8<--
+.env.docker:22:24,29:34
+--8<--
+
+### Without docker
+
+- Install Python 3.10 or above
+- Setup [`database`][database], run migrations and create partitions
+- Setup [`message-broker`][message-broker]
+- Create virtual environment
+
+ ```console
+ $ python -m venv /some/.venv
+ ...
+  $ source /some/.venv/bin/activate
+ ...
+ ```
+
+- Install `data-rentgen` package with following *extra* dependencies:
+
+ ```console
+ $ pip install data-rentgen[consumer,postgres]
+ ...
+ ```
+
+!!! note
+ For `SASL_GSSAPI` auth mechanism you also need to install system packages providing `kinit` and `kdestroy` binaries:
+ ```console
+ $ apt install libkrb5-dev krb5-user gcc make autoconf # Debian-based
+ ...
+ $ dnf install krb5-devel krb5-libs krb5-workstation gcc make autoconf # CentOS, OracleLinux
+ ...
+ ```
+ And then install `gssapi` extra:
+ ```console
+ $ pip install data-rentgen[consumer,postgres,gssapi]
+ ...
+ ```
+
+- Run consumer process
+
+ ```console
+ $ python -m data_rentgen.consumer
+ ...
+ ```
+
+## See also
+
+[Consumer configuration][configuration-consumer]
diff --git a/mddocs/reference/database/cleanup_partitions_cli.md b/mddocs/reference/database/cleanup_partitions_cli.md
new file mode 100644
index 00000000..e5ee34df
--- /dev/null
+++ b/mddocs/reference/database/cleanup_partitions_cli.md
@@ -0,0 +1,61 @@
+# CLI for cleaning old partitions { #cleanup-partitions-cli }
+
+This script is designed to manage PostgreSQL table partitions by providing functionalities to list, detach, remove, or truncate old partitions.
+
+```shell
+usage: python3 -m data_rentgen.db.scripts.cleanup_partitions truncate --keep-after 2025-01-01
+```
+
+The `cleanup_partitions.py` script helps automate the cleanup of old table partitions based on a specified keep-after date. It supports different commands for dry runs, detaching partitions, removing data, and truncating partitions.
+It automatically identifies partitioned tables and their granularity.
+
+## Arguments
+
+- `command`: (Optional) Specifies the operation mode.
+ - Choices: `dry_run`, `detach`, `truncate`, `drop`
+ - Default: `dry_run`
+ - Description:
+ - `dry_run`: Logs the names of partitions that would be affected by the cleanup without executing any SQL commands.
+ - `detach`: Generates and executes `ALTER TABLE ... DETACH PARTITION ...` commands for identified old partitions. This keeps partition data intact, but consumer & server will have no access to these partitions.
+    - `truncate`: Generates and executes `TRUNCATE TABLE ...` commands, removing all rows from the identified old partition tables but keeping the table structure. **This option is preferred if you have streaming operations, e.g. Flink or Spark Streaming jobs**.
+ - `drop`: First detaches partitions, then generates and executes `DROP TABLE ...` commands, permanently deleting the partition tables and their data.
+- `--keep-after`: (Optional) The cut-off date for partitions. Partitions with data before this date will be considered for cleanup.
+ - Type: Date (e.g., `YYYY-MM-DD`). The script uses isoparse for parsing, so various ISO formats are supported.
+ - Default: The current date - 1 year.
+ - Description: Only partitions whose date components are strictly before this specified date will be processed taking into account granularity of the table.
+
+## Examples
+
+1. Perform a Dry Run (default):
+
+ ```shell
+ python3 -m data_rentgen.db.scripts.cleanup_partitions dry_run --keep-after 2024-01-01
+ ```
+
+ This command will log which partitions would be affected if you were to clean up partitions older than January 1, 2024, without making any changes to your database.
+
+2. Detach Partitions Older Than a Specific Date:
+
+ ```shell
+    python3 -m data_rentgen.db.scripts.cleanup_partitions detach --keep-after 2024-01-01
+ ```
+
+ This will detach all partitions created before January 1, 2024, from their parent tables. The detached tables will still exist with their data.
+
+3. Remove Data and Drop Partitions Older Than a Specific Date:
+
+ ```shell
+    python3 -m data_rentgen.db.scripts.cleanup_partitions drop --keep-after 2024-01-01
+ ```
+
+ This will detach and then **drop all partitions** created before January 1, 2024, permanently deleting their data.
+
+4. Truncate Data in Partitions Older Than a Specific Date:
+
+ This option is preferred with streaming `Jobs`
+
+ ```shell
+ python3 -m data_rentgen.db.scripts.cleanup_partitions truncate --keep-after 2024-01-01
+ ```
+
+ This will delete all rows from partitions created before January 1, 2024, but will keep the empty partition tables.
diff --git a/mddocs/reference/database/configuration.md b/mddocs/reference/database/configuration.md
new file mode 100644
index 00000000..b9303909
--- /dev/null
+++ b/mddocs/reference/database/configuration.md
@@ -0,0 +1,7 @@
+# Database settings { #configuration-database }
+
+::: data_rentgen.db.settings.DatabaseSettings
+ options:
+ docstring_style: sphinx
+ members:
+ - url
diff --git a/mddocs/reference/database/create_partitions_cli.md b/mddocs/reference/database/create_partitions_cli.md
new file mode 100644
index 00000000..5915b3dc
--- /dev/null
+++ b/mddocs/reference/database/create_partitions_cli.md
@@ -0,0 +1,9 @@
+# CLI for creating partitions { #create-partitions-cli }
+
+
diff --git a/mddocs/reference/database/index.md b/mddocs/reference/database/index.md
new file mode 100644
index 00000000..f3ac7cfc
--- /dev/null
+++ b/mddocs/reference/database/index.md
@@ -0,0 +1,175 @@
+# Relation Database { #database }
+
+Data.Rentgen uses relational database as a storage for lineage entities and relations.
+
+Currently, Data.Rentgen supports only [PostgreSQL](https://www.postgresql.org/), as it relies on table partitioning, full-text search and specific aggregation functions.
+
+## Migrations
+
+After a database is started, it is required to run migration script. If database is empty, it creates all the required tables and indexes. If database is not empty, it will perform database structure upgrade.
+
+Migration script is a thin wrapper around [Alembic cli](https://alembic.sqlalchemy.org/en/latest/tutorial.html#running-our-first-migration), options and commands are just the same.
+
+!!! warning
+
+ Other containers (consumer, server) should be stopped while running migrations, to prevent interference.
+
+## Partitions
+
+After migrations are performed, it is required to run [`create-partitions-cli`][create-partitions-cli] which creates partitions for some tables in the database.
+By default, it creates monthly partitions, for current and next month. This can be changed by overriding command args.
+
+This script should run on schedule, depending on partitions granularity.
+Scheduling can be done by adding a dedicated entry to [crontab](https://help.ubuntu.com/community/CronHowto).
+
+It's strongly recommended also to add old partitions cleanup script to cron [`cleanup-partitions-cli`][cleanup-partitions-cli].
+Scheduling setup is the same as for creating partitions.
+
+## Analytic views
+
+Along with migrations, a few analytic views are created. These are managed by [`refresh-analytic-views-cli`][refresh-analytic-views-cli], and should be executed on a schedule.
+
+## Seeding
+
+By default, database is created with no data. To seed database with some examples, use [`db-seed-cli`][db-seed-cli].
+
+## Requirements
+
+- PostgreSQL 12 or higher. It is recommended to use latest Postgres version.
+
+## Install & run
+
+### With Docker
+
+- Install [Docker](https://docs.docker.com/engine/install/)
+- Install [docker-compose](https://github.com/docker/compose/releases/)
+
+- Run the following command:
+
+ ```console
+ $ docker compose --profile analytics,cleanup,seed up -d
+ ...
+ ```
+
+ `docker-compose` will download PostgreSQL image, create container and volume, and then start container.
+ Image entrypoint will create database if volume is empty.
+
+ After that, several one-off containers will start:
+
+ - `db-create-partitions` will create necessary partitions in db.
+ - `db-cleanup-partitions` will cleanup old partitions.
+ - `db-refresh-views` will refresh analytic views.
+ - `db-seed` will seed database with some examples (optional, can be omitted).
+
+ Options can be set via `.env` file or `environment` section in `docker-compose.yml`
+
+### "docker-compose.yml"
+
+--8<--
+docker-compose.yml:1:69,176
+--8<--
+
+### ".env.docker"
+
+--8<--
+.env.docker:1:5,23
+--8<--
+
+- Add scripts to crontab:
+
+ ```console
+ $ crontab -e
+ ...
+ ```
+
+ ```text
+ 0 0 * * * docker compose -f "/path/to/docker-compose.yml" start db-create-partitions db-refresh-views db-cleanup-partitions
+ ```
+
+### Without Docker
+
+- For installing PostgreSQL, please follow [installation instruction](https://www.postgresql.org/download/).
+- Install Python 3.10 or above
+- Create virtual environment
+
+ ```console
+ $ python -m venv /some/.venv
+ ...
+ $ source /some/.venv/activate
+ ```
+
+- Install `data-rentgen` package with following *extra* dependencies:
+
+ ```console
+ $ pip install data-rentgen[postgres]
+ ...
+ ```
+
+- Configure [`Database connection`][configuration-database] using environment variables, e.g. by creating `.env` file:
+
+ ```console title="/some/.env"
+
+ $ export DATA_RENTGEN__DATABASE__URL=postgresql+asyncpg://data_rentgen:changeme@localhost:5432/data_rentgen
+ ...
+ ```
+
+ And then read values from this file:
+
+ ```console
+ $ source /some/.env
+ ...
+ ```
+
+- Run migrations:
+
+ ```console
+ $ python -m data_rentgen.db.migrations upgrade head
+ ...
+ ```
+
+!!! note
+ This command should be executed after each upgrade to new Data.Rentgen version.
+
+- Create partitions:
+
+ ```console
+ $ python -m data_rentgen.db.scripts.create_partitions
+ ...
+ ```
+
+- Create analytic views:
+
+ ```console
+ $ python -m data_rentgen.db.scripts.refresh_analytic_views
+ ...
+ ```
+
+- Seed database with example data (optional, can be omitted):
+
+ ```console
+ $ python -m data_rentgen.db.scripts.seed
+ ...
+ ```
+
+- Add scripts to crontab:
+
+ ```console
+ $ crontab -e
+ ...
+ ```
+
+ ```text
+ # read settings from .env file, and run script using a specific venv with all required dependencies
+ 0 0 * * * /bin/bash -c "source /some/.env && /some/.venv/bin/python -m data_rentgen.db.scripts.create_partitions"
+ 0 0 * * * /bin/bash -c "source /some/.env && /some/.venv/bin/python -m data_rentgen.db.scripts.cleanup_partitions truncate --keep-after $(date --date='-1year' '+%Y-%m-%d')"
+ 0 0 * * * /bin/bash -c "source /some/.env && /some/.venv/bin/python -m data_rentgen.db.scripts.refresh_analytic_views"
+ ```
+
+## See also
+
+[Configuration][configuration]
+[Create partitions cli][create-partitions-cli]
+[Cleanup partitions cli][cleanup-partitions-cli]
+[Refresh analytic views cli][refresh-analytic-views-cli]
+[Seed cli][db-seed-cli]
+[Structure][database-structure]
diff --git a/mddocs/reference/database/refresh_analytic_views_cli.md b/mddocs/reference/database/refresh_analytic_views_cli.md
new file mode 100644
index 00000000..6a488e07
--- /dev/null
+++ b/mddocs/reference/database/refresh_analytic_views_cli.md
@@ -0,0 +1,31 @@
+# CLI for refreshing analytic views { #refresh-analytic-views-cli }
+
+Analytic views are:
+
+- `input_daily_stats`, `input_weekly_stats`, `input_monthly_stats`
+- `output_daily_stats`, `output_weekly_stats`, `output_monthly_stats`
+
+Views content is based on data in `output` and `input` tables and has such structure:
+
+- `dataset_name` - Name of dataset.
+- `dataset_location` - Name of dataset location (e.g. cluster name).
+- `dataset_location_type` - Type of dataset location (e.g. hive, hdfs, postgres).
+- `user_id` - Internal user id.
+- `user_name` - Internal user name (e.g. name of user which run spark job).
+- `last_interaction_dt` - Time when the user last interacted with the dataset. Whether this is a read or a write depends on the base table.
+- `num_of_interactions` - Number of interactions in given interval.
+- `sum_bytes` - Sum of bytes in given interval.
+- `sum_rows` - Sum of rows in given interval.
+- `sum_files` - Sum of files in given interval.
+
+We provide three types of views: `day`, `week` and `month`, based on the time period in which the aggregation occurs.
+
+By default these materialized views are empty (`WITH NO DATA`).
+In order to fill these tables with data you need to run refresh script (see below).
+
+
diff --git a/mddocs/reference/database/seed_cli.md b/mddocs/reference/database/seed_cli.md
new file mode 100644
index 00000000..105662d8
--- /dev/null
+++ b/mddocs/reference/database/seed_cli.md
@@ -0,0 +1,8 @@
+# CLI for seeding database { #db-seed-cli }
+
+
diff --git a/mddocs/reference/database/structure.md b/mddocs/reference/database/structure.md
new file mode 100644
index 00000000..531e710d
--- /dev/null
+++ b/mddocs/reference/database/structure.md
@@ -0,0 +1,186 @@
+# Database structure { #database-structure }
+
+```mermaid
+---
+title: Database structure
+---
+
+erDiagram
+
+ address {
+ bigint id UK, PK
+ bigint location_id UK, FK
+ varchar(256) url UK
+ }
+
+ location {
+ bigint id UK
+ varchar(32) type UK
+ varchar(256) name UK
+ varchar(256) external_id
+ tsvector search_vector
+ }
+
+ user {
+ bigint id UK, PK
+ varchar(256) name UK
+ }
+ dataset {
+ bigint id UK
+ bigint location_id UK, FK
+ varchar(256) name UK
+ tsvector search_vector
+ }
+
+ dataset_symlink {
+ bigint id UK
+ bigint from_dataset_id UK, FK
+ bigint to_dataset_id UK, FK
+ varchar(32) type
+ }
+
+ job {
+ bigint id UK
+ bigint location_id UK, FK
+ varchar(256) name UK
+ varchar(32) type
+ tsvector search_vector
+ }
+
+ run {
+ timestamptz created_at UK
+ uuid(v7) id UK
+ bigint job_id UK
+ smallint status
+ bigint parent_run_id
+ timestamptz started_at
+ bigint started_by_user_id
+ varchar(32) start_reason
+ timestamptz ended_at
+ text end_reason
+ text external_id
+ varchar(64) attempt
+ timestamptz persistent_log_url
+ timestamptz running_log_url
+ tsvector search_vector
+ }
+
+ sql_query {
+ bigint id UK
+ uuid(v5) fingerprint UK
+ text query
+ }
+
+ operation {
+ timestamptz created_at UK
+ uuid(v7) id UK
+ uuid(v7) run_id UK
+ smallint status
+ text name
+ varchar(32) type
+ int position
+ text group
+ text description
+ timestamptz started_at
+ timestamptz ended_at
+ bigint sql_query_id
+ }
+
+ schema {
+ bigint id UK
+ uuid(v5) digest UK
+ json fields
+ }
+
+ input {
+ timestamptz created_at UK
+ uuid(v7) id UK
+ uuid(v7) operation_id UK
+ uuid(v7) run_id UK
+ bigint job_id UK
+ bigint dataset_id UK
+ bigint schema_id
+ bigint num_bytes
+ bigint num_rows
+ bigint num_files
+ }
+
+ output {
+ timestamptz created_at UK
+ uuid(v7) id UK
+ uuid(v7) operation_id UK
+ uuid(v7) run_id UK
+ bigint job_id UK
+ bigint dataset_id UK
+ varchar(32) type UK
+ bigint schema_id
+ bigint num_bytes
+ bigint num_rows
+ bigint num_files
+ }
+
+ dataset_column_relation {
+ bigint id UK
+ uuid(v5) fingerprint UK
+ varchar(255) source_column UK
+ varchar(255) target_column UK
+ smallint type
+ }
+
+ column_lineage {
+ timestamptz created_at UK
+ uuid(v7) id UK
+ uuid(v7) operation_id UK
+ uuid(v7) run_id UK
+ bigint job_id UK
+ bigint source_dataset_id UK
+ bigint target_dataset_id UK
+ uuid(v5) fingerprint
+ }
+
+ personal_token {
+ uuid(v7) id UK
+ bigint user_id UK
+ varchar(64) name UK
+ jsonb scopes
+ date since
+ date until
+ timestamptz revoked_at
+ }
+
+
+ address ||--o{ location: "included in"
+ dataset ||--o{ location: has
+ job ||--o{ location: has
+
+ dataset_symlink ||--o{ dataset: "from_dataset_id"
+ dataset_symlink ||--o{ dataset: "to_dataset_id"
+
+ run ||--o{ job: relates
+ run ||--o{ user: "started_by_user_id"
+ run |o--o{ run: "parent_run_id"
+
+ operation ||--o{ run: "contained in"
+ operation |o--o{ sql_query: "execute"
+
+ input ||--o{ operation: relates
+ input ||--o{ run: relates
+ input ||--o{ job: relates
+ input ||--o{ dataset: relates
+ input |o--o{ schema: relates
+
+ output ||--o{ operation: relates
+ output ||--o{ run: relates
+ output ||--o{ job: relates
+ output ||--o{ dataset: relates
+ output |o--o{ schema: relates
+
+ column_lineage ||--o{ operation: relates
+ column_lineage ||--o{ run: relates
+ column_lineage ||--o{ job: relates
+ column_lineage ||--o{ dataset: "source_dataset_id"
+ column_lineage ||--o{ dataset: "target_dataset_id"
+ column_lineage ||--o{ dataset_column_relation: "fingerprint"
+
+ personal_token ||--o{ user: relates
+```
diff --git a/mddocs/reference/frontend/configuration.md b/mddocs/reference/frontend/configuration.md
new file mode 100644
index 00000000..1ef6c289
--- /dev/null
+++ b/mddocs/reference/frontend/configuration.md
@@ -0,0 +1,20 @@
+# Frontend configuration { #configuration-frontend }
+
+## API url
+
+Data.Rentgen UI requires REST API to be accessible from browser. API url is set up using environment variable:
+
+```bash
+DATA_RENTGEN__UI__API_BROWSER_URL=http://localhost:8000
+```
+
+If both REST API and frontend are served on the same domain (e.g. through Nginx reverse proxy), for example:
+
+- REST API → `/api`
+- Frontend → `/`
+
+Then you can use relative path:
+
+```bash
+DATA_RENTGEN__UI__API_BROWSER_URL=/api
+```
diff --git a/mddocs/reference/frontend/index.md b/mddocs/reference/frontend/index.md
new file mode 100644
index 00000000..3e708694
--- /dev/null
+++ b/mddocs/reference/frontend/index.md
@@ -0,0 +1,45 @@
+# Frontend { #frontend }
+
+Data.Rentgen provides a [Frontend (UI)](https://github.com/MobileTeleSystems/data-rentgen-ui) based on [ReactAdmin](https://marmelab.com/react-admin/) and [ReactFlow](https://reactflow.dev/),
+providing users the ability to navigate entities and build lineage graph.
+
+## Install & run
+
+### With Docker
+
+- Install [Docker](https://docs.docker.com/engine/install/)
+
+- Install [docker-compose](https://github.com/docker/compose/releases/)
+
+- Run the following command:
+
+ ```console
+ $ docker compose --profile frontend up -d --wait
+ ...
+ ```
+
+ `docker-compose` will download Data.Rentgen UI image, create containers, and then start them.
+
+ Options can be set via `.env` file or `environment` section in `docker-compose.yml`
+
+??? note "docker-compose.yml"
+
+ ```yaml hl_lines="140-151" linenums="1"
+ ----8<----
+ docker-compose.yml
+ ----8<----
+ ```
+
+??? note ".env.docker"
+
+ ```ini hl_lines="36-37" linenums="1"
+ ----8<----
+ .env.docker
+ ----8<----
+ ```
+
+- After frontend is started and ready, open .
+
+## See also
+
+[Configuration][configuration-frontend]
diff --git a/mddocs/reference/http2kafka/alternatives.md b/mddocs/reference/http2kafka/alternatives.md
new file mode 100644
index 00000000..b7f5501e
--- /dev/null
+++ b/mddocs/reference/http2kafka/alternatives.md
@@ -0,0 +1,73 @@
+# Alternatives { #http2kafka-alternatives }
+
+HTTP → Kafka proxy is built with mandatory authentication using personal tokens. This may be a drawback for some use cases.
+
+If your use case requires sending OpenLineage events to DataRentgen via HTTP, but without any authentication, there are some alternatives described below.
+
+## Fluentbit
+
+[Fluentbit](https://fluentbit.io/) is a lightweight yet powerful logging processor written on C.
+It can accept [HTTP requests](https://docs.fluentbit.io/manual/data-pipeline/inputs/http) and write body to [Kafka topic](https://docs.fluentbit.io/manual/data-pipeline/outputs/kafka).
+
+Config example:
+
+```yaml title="fluent-bit.yml"
+
+pipeline:
+ # receive HTTP requests on port 8002
+ inputs:
+ - name: http
+ port: 8002
+ mem_buf_limit: 50MB
+
+ # Route events to partition using message key
+ filters:
+ - name: lua
+ match: '*'
+ call: set_message_key
+ code: |
+ function set_message_key(tag, timestamp, record)
+ local new_record = record
+ if record.run.facets.parent then
+ if record.run.facets.parent.root then
+ new_record.messageKey = "run:" .. record.run.facets.parent.root.job.namespace .. "/" .. record.run.facets.parent.root.job.name
+ else
+ new_record.messageKey = "run:" .. record.run.facets.parent.job.namespace .. "/" .. record.run.facets.parent.job.name
+ end
+ else
+ new_record.messageKey = "run:" .. record.job.namespace .. "/" .. record.job.name
+ end
+ return 1, timestamp, new_record
+ end
+
+ # Write data to Kafka topic
+ outputs:
+ - name: kafka
+ match: '*'
+ format: json
+ timestamp_key: eventTime
+ timestamp_format: iso8601_ns
+ message_key_field: messageKey
+ brokers: localhost:9093
+ topics: input.runs
+ rdkafka.security.protocol: SASL_PLAINTEXT
+ rdkafka.sasl.mechanism: SCRAM-SHA-256
+ rdkafka.sasl.username: data_rentgen
+ rdkafka.sasl.password: changeme
+ rdkafka.client.id: fluent-bit
+ rdkafka.request.required.acks: 1
+ rdkafka.log.connection.close: false
+```
+
+```yaml title="docker-compose.yml"
+
+services:
+ fluent-bit:
+ image: central-mirror.services.mts.ru/fluent/fluent-bit
+ restart: unless-stopped
+ command: --config /fluent-bit/etc/fluent-bit.yml
+ volumes:
+ - ./fluent-bit.yml:/fluent-bit/etc/fluent-bit.yml
+ # Kafka hostnames should be resolvable from container network
+ network_mode: host
+```
diff --git a/mddocs/reference/http2kafka/configuration/debug.md b/mddocs/reference/http2kafka/configuration/debug.md
new file mode 100644
index 00000000..55563ba1
--- /dev/null
+++ b/mddocs/reference/http2kafka/configuration/debug.md
@@ -0,0 +1,95 @@
+# Enabling debug { #configuration-http2kafka-debug }
+
+## Return debug info in REST API responses
+
+By default, server does not add error details to response bodies,
+to avoid exposing instance-specific information to end users.
+
+You can change this by setting:
+
+```console
+$ export DATA_RENTGEN__SERVER__DEBUG=False
+$ # start REST API server
+$ curl -XPOST http://localhost:8002/failing/endpoint ...
+{
+ "error": {
+ "code": "unknown",
+ "message": "Got unhandled exception. Please contact support",
+ "details": null,
+ },
+}
+```
+
+```console
+$ export DATA_RENTGEN__SERVER__DEBUG=True
+$ # start REST API server
+$ curl -XPOST http://localhost:8002/failing/endpoint ...
+Traceback (most recent call last):
+File ".../uvicorn/protocols/http/h11_impl.py", line 408, in run_asgi
+ result = await app( # type: ignore[func-returns-value]
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+File ".../site-packages/uvicorn/middleware/proxy_headers.py", line 84, in __call__
+ return await self.app(scope, receive, send)
+```
+
+!!! warning
+
+ This is for development environments only. Do **NOT** use on production!
+
+## Print debug logs on backend
+
+See [`configuration-server-logging`][configuration-server-logging], but replace log level `INFO` with `DEBUG`.
+
+## Fill up `X-Request-ID` header on backend
+
+Server can add `X-Request-ID` header to responses, which allows to match request on client with backend response.
+
+This is done by the `request_id` middleware, which is enabled by default and can be configured as described below:
+
+::: data_rentgen.server.settings.request_id.RequestIDSettings
+
+## Print request ID to backend logs
+
+This is done by adding a specific filter to logging handler:
+
+??? note "logging.yml"
+
+ ```yaml hl_lines="6-12 23-24 35"
+ ----8<----
+ data_rentgen/logging/presets/plain.yml
+ ----8<----
+ ```
+
+Resulting logs look like:
+
+```text
+2023-12-18 17:14:11.711 uvicorn.access:498 [INFO] 018c15e97a068ae09484f8c25e2799dd 127.0.0.1:34884 - "GET /monitoring/ping HTTP/1.1" 200
+```
+
+## Use `X-Request-ID` header on client
+
+If client got `X-Request-ID` header from backend, it is printed to logs with `DEBUG` level:
+
+```pycon
+>>> import logging
+>>> logging.basicConfig(level=logging.DEBUG)
+>>> client.ping()
+DEBUG:urllib3.connectionpool:http://localhost:8002 "GET /monitoring/ping HTTP/1.1" 200 15
+DEBUG:data_rentgen.client.base:Request ID: '018c15e97a068ae09484f8c25e2799dd'
+```
+
+Also, if REST API response was not successful, `Request ID` is added to exception message:
+
+```pycon
+>>> client.get_namespace("unknown")
+requests.exceptions.HTTPError: 404 Client Error: Not Found for url: http://localhost:8002/v1/namespaces/unknown
+Request ID: '018c15eb80fa81a6b38c9eaa519cd322'
+```
+
+## Fill up `X-Application-Version` header on REST API side
+
+Server can add `X-Application-Version` header to responses, which allows to determine which version of backend is deployed.
+
+This is done by the `application_version` middleware, which is enabled by default and can be configured as described below:
+
+::: data_rentgen.server.settings.application_version.ApplicationVersionSettings
diff --git a/mddocs/reference/http2kafka/configuration/index.md b/mddocs/reference/http2kafka/configuration/index.md
new file mode 100644
index 00000000..a77485cd
--- /dev/null
+++ b/mddocs/reference/http2kafka/configuration/index.md
@@ -0,0 +1,16 @@
+# HTTP2Kafka configuration { #configuration-http2kafka }
+
+## Configuration
+
+[kafka][configuration-consumer-kafka]
+[producer-specific][configuration-producer-specific]
+[logging][configuration-http2kafka-logging]
+[monitoring][configuration-http2kafka-monitoring]
+[static_files][configuration-http2kafka-static-files]
+[openapi][configuration-http2kafka-openapi]
+[debug][configuration-http2kafka-debug]
+
+
+
+
+::: data_rentgen.server.settings.ServerSettings
diff --git a/mddocs/reference/http2kafka/configuration/kafka.md b/mddocs/reference/http2kafka/configuration/kafka.md
new file mode 100644
index 00000000..273cee5c
--- /dev/null
+++ b/mddocs/reference/http2kafka/configuration/kafka.md
@@ -0,0 +1,13 @@
+# Kafka settings { #configuration-consumer-kafka }
+
+::: data_rentgen.consumer.settings.kafka.KafkaSettings
+
+::: data_rentgen.consumer.settings.security.scram.KafkaSecurityScram256Settings
+
+::: data_rentgen.consumer.settings.security.scram.KafkaSecurityScram512Settings
+
+::: data_rentgen.consumer.settings.security.plain.KafkaSecurityPlaintextSettings
+
+::: data_rentgen.consumer.settings.security.gssapi.KafkaSecurityGSSAPISettings
+
+::: data_rentgen.consumer.settings.security.anonymous.KafkaSecurityAnonymousSettings
diff --git a/mddocs/reference/http2kafka/configuration/logging.md b/mddocs/reference/http2kafka/configuration/logging.md
new file mode 100644
index 00000000..ee4831b0
--- /dev/null
+++ b/mddocs/reference/http2kafka/configuration/logging.md
@@ -0,0 +1,3 @@
+# Logging settings { #configuration-http2kafka-logging }
+
+::: data_rentgen.logging.settings.LoggingSettings
diff --git a/mddocs/reference/http2kafka/configuration/monitoring.md b/mddocs/reference/http2kafka/configuration/monitoring.md
new file mode 100644
index 00000000..4a61aaf7
--- /dev/null
+++ b/mddocs/reference/http2kafka/configuration/monitoring.md
@@ -0,0 +1,9 @@
+# Setup monitoring { #configuration-http2kafka-monitoring }
+
+REST API server provides the following endpoints with Prometheus compatible metrics:
+
+- `GET /monitoring/metrics` - server metrics, like number of requests per path and response status, CPU and RAM usage, and so on.
+
+These endpoints are enabled and configured using settings below:
+
+::: data_rentgen.server.settings.monitoring.MonitoringSettings
diff --git a/mddocs/reference/http2kafka/configuration/openapi.md b/mddocs/reference/http2kafka/configuration/openapi.md
new file mode 100644
index 00000000..d01d1806
--- /dev/null
+++ b/mddocs/reference/http2kafka/configuration/openapi.md
@@ -0,0 +1,13 @@
+# OpenAPI settings { #configuration-http2kafka-openapi }
+
+These settings are used to control exposing OpenAPI.json and SwaggerUI/ReDoc endpoints.
+
+::: data_rentgen.server.settings.openapi.OpenAPISettings
+
+::: data_rentgen.server.settings.openapi.SwaggerSettings
+
+::: data_rentgen.server.settings.openapi.RedocSettings
+
+::: data_rentgen.server.settings.openapi.LogoSettings
+
+::: data_rentgen.server.settings.openapi.FaviconSettings
diff --git a/mddocs/reference/http2kafka/configuration/producer-specific.md b/mddocs/reference/http2kafka/configuration/producer-specific.md
new file mode 100644
index 00000000..75a73bf5
--- /dev/null
+++ b/mddocs/reference/http2kafka/configuration/producer-specific.md
@@ -0,0 +1,3 @@
+# Producer-specific settings { #configuration-producer-specific }
+
+::: data_rentgen.consumer.settings.producer.ProducerSettings
diff --git a/mddocs/reference/http2kafka/configuration/static_files.md b/mddocs/reference/http2kafka/configuration/static_files.md
new file mode 100644
index 00000000..3d4116b5
--- /dev/null
+++ b/mddocs/reference/http2kafka/configuration/static_files.md
@@ -0,0 +1,17 @@
+# Serving static files { #configuration-http2kafka-static-files }
+
+These settings are used to control serving static files by http2kafka.
+
+
+
+
+
+::: data_rentgen.server.settings.static_files.StaticFilesSettings
+ options:
+ members:
+ - directory
+ - enabled
diff --git a/mddocs/reference/http2kafka/index.md b/mddocs/reference/http2kafka/index.md
new file mode 100644
index 00000000..46640d6b
--- /dev/null
+++ b/mddocs/reference/http2kafka/index.md
@@ -0,0 +1,96 @@
+# HTTP2Kafka proxy { #http2kafka }
+
+Some of OpenLineage integrations support only HttpTransport, but not KafkaTransport, e.g. Trino.
+
+Data.Rentgen HTTP → Kafka proxy is an optional component which provides a simple HTTP API receiving
+[OpenLineage run events](https://openlineage.io/docs/spec/object-model) in JSON format and sending them to Kafka topic as is,
+so they can be handled by the [message consumer][message-consumer] in a proper way.
+
+## OpenLineage HttpTransport or KafkaTransport?
+
+Introducing http2kafka into the chain reduces performance a bit:
+
+- It parses all incoming events for validation and routing purposes. The larger the event, the slower the parsing.
+- HTTP/HTTPS protocol is far more complex than Kafka TCP protocol, and has much higher latency in the first place.
+
+If OpenLineage integration supports both HttpTransport and KafkaTransport, and Kafka doesn't use complex authentication not supported by OpenLineage (e.g. OAUTHBEARER), prefer KafkaTransport.
+
+If this is not possible, http2kafka is the way to go.
+
+## Install & run
+
+### With docker
+
+- Install [Docker](https://docs.docker.com/engine/install/)
+
+- Install [docker-compose](https://github.com/docker/compose/releases/)
+
+- Run the following command:
+
+ ```console
+ $ docker compose --profile http2kafka up -d --wait
+ ...
+ ```
+
+ `docker-compose` will download all necessary images, create containers, and then start the component.
+
+ Options can be set via `.env` file or `environment` section in `docker-compose.yml`
+
+>
+
+??? note "docker-compose.yml"
+
+ ```yaml hl_lines="155-173" linenums="1"
+ ----8<----
+ docker-compose.yml
+ ----8<----
+ ```
+
+??? note ".env.docker"
+
+ ```ini hl_lines="29-34" linenums="1"
+ ----8<----
+ .env.docker
+ ----8<----
+ ```
+
+- After component is started and ready, open .
+
+### Without docker
+
+- Install Python 3.10 or above
+
+- Setup the [message broker][message-broker]
+
+- Create virtual environment
+
+ ```console
+ $ python -m venv /some/.venv
+ ...
+ $ source /some/.venv/activate
+ ```
+
+- Install `data-rentgen` package with following *extra* dependencies:
+
+ ```console
+ $ pip install data-rentgen[http2kafka]
+ ...
+ ```
+
+- Run http2kafka process
+
+ ```console
+ $ python -m data_rentgen.http2kafka --host 0.0.0.0 --port 8002
+ ...
+ ```
+
+ This is a thin wrapper around [uvicorn](https://www.uvicorn.org/#command-line-options) cli,
+ options and commands are just the same.
+
+- After server is started and ready, open [http://localhost:8002/docs](http://localhost:8002/docs).
+
+## See also
+
+[Configuration][configuration-http2kafka]
+[OpenAPI][http2kafka-openapi]
+[Alternatives][http2kafka-alternatives]
diff --git a/mddocs/reference/http2kafka/openapi.md b/mddocs/reference/http2kafka/openapi.md
new file mode 100644
index 00000000..3f85c2ca
--- /dev/null
+++ b/mddocs/reference/http2kafka/openapi.md
@@ -0,0 +1,14 @@
+# OpenAPI specification { #http2kafka-openapi }
+
+<!-- This page cannot be properly rendered in a local environment; it should be built in CI first. -->
+
+
+
+----8<----
+mddocs/docs/en/_static/swagger_http2kafka.html
+----8<----
diff --git a/mddocs/reference/server/auth/custom.md b/mddocs/reference/server/auth/custom.md
new file mode 100644
index 00000000..5058f6f9
--- /dev/null
+++ b/mddocs/reference/server/auth/custom.md
@@ -0,0 +1,5 @@
+# Custom Auth provider { #auth-server-custom }
+
+You can implement custom auth provider by inheriting from class below and implementing necessary methods.
+
+::: data_rentgen.server.providers.auth.AuthProvider
diff --git a/mddocs/reference/server/auth/dummy.md b/mddocs/reference/server/auth/dummy.md
new file mode 100644
index 00000000..969fe0f9
--- /dev/null
+++ b/mddocs/reference/server/auth/dummy.md
@@ -0,0 +1,70 @@
+# Dummy Auth provider { #auth-server-dummy }
+
+## Description
+
+This auth provider allows signing in with any username and password, and then issues an access token.
+
+After successful auth, username is saved to backend database.
+
+## Interaction schema
+
+```plantuml title="Interaction schema"
+
+ @startuml
+ title DummyAuthProvider
+ participant "Client"
+ participant "Backend"
+
+ == POST v1/auth/token ==
+
+ activate "Client"
+ alt Successful case
+ "Client" -> "Backend" ++ : login + password
+ "Backend" --> "Backend" : Password is completely ignored
+ "Backend" --> "Backend" : Check user in internal backend database
+ "Backend" -> "Backend" : Create user if not exist
+ "Backend" -[#green]> "Client" -- : Generate and return access_token
+
+ else User is blocked
+ "Client" -> "Backend" ++ : login + password
+ "Backend" --> "Backend" : Password is completely ignored
+ "Backend" --> "Backend" : Check user in internal backend database
+ "Backend" x-[#red]> "Client" -- : 401 Unauthorized
+
+ else User is deleted
+ "Client" -> "Backend" ++ : login + password
+ "Backend" --> "Backend" : Password is completely ignored
+ "Backend" --> "Backend" : Check user in internal backend database
+ "Backend" x-[#red]> "Client" -- : 404 Not found
+ end
+
+ == GET v1/datasets ==
+
+ alt Successful case
+ "Client" -> "Backend" ++ : access_token
+ "Backend" --> "Backend" : Validate token
+ "Backend" --> "Backend" : Check user in internal backend database
+ "Backend" -> "Backend" : Get data
+ "Backend" -[#green]> "Client" -- : Return data
+
+ else Token is expired
+ "Client" -> "Backend" ++ : access_token
+ "Backend" --> "Backend" : Validate token
+ "Backend" x-[#red]> "Client" -- : 401 Unauthorized
+
+ else User is not found
+ "Client" -> "Backend" ++ : access_token
+ "Backend" --> "Backend" : Validate token
+ "Backend" --> "Backend" : Check user in internal backend database
+ "Backend" x-[#red]> "Client" -- : 404 Not found
+ end
+
+ deactivate "Client"
+ @enduml
+```
+
+## Configuration
+
+::: data_rentgen.server.settings.auth.dummy.DummyAuthProviderSettings
+
+::: data_rentgen.server.settings.auth.jwt.JWTSettings
diff --git a/mddocs/reference/server/auth/index.md b/mddocs/reference/server/auth/index.md
new file mode 100644
index 00000000..2c6deeb7
--- /dev/null
+++ b/mddocs/reference/server/auth/index.md
@@ -0,0 +1,27 @@
+# Authentication and Authorization { #auth-server }
+
+## Overview
+
+To access the service's endpoints, a client must authenticate. The service provides several options for authentication.
+
+Currently, the service does not implement a role-based model, and all users have the same level of permissions.
+
+Authentication is implemented via middleware as follows: before each endpoint call, the `get_user()` function is invoked. This function attempts to retrieve the username from the provided token.
+
+Data Rentgen supports different auth provider implementations. You can change implementation via settings:
+
+::: data_rentgen.server.settings.auth.AuthSettings
+
+Right now service has two scenarios for authentication:
+
+- [Dummy(JWT Tokens)](https://jwt.io/) a lightweight option for testing and development.
+- [Keycloak authentication](https://www.keycloak.org/) recommended option. Integrates with Keycloak for token-based authentication.
+
+### Authentication Providers
+
+[dummy][auth-server-dummy]
+[keycloak][auth-server-keycloak]
+
+### For developers
+
+[custom][auth-server-custom]
diff --git a/mddocs/reference/server/auth/keycloak.md b/mddocs/reference/server/auth/keycloak.md
new file mode 100644
index 00000000..df2a86a6
--- /dev/null
+++ b/mddocs/reference/server/auth/keycloak.md
@@ -0,0 +1,101 @@
+# Keycloak Provider { #auth-server-keycloak }
+
+## Description
+
+Keycloak auth provider uses [python-keycloak](https://pypi.org/project/python-keycloak/) library to interact with Keycloak server. During the authentication process,
+KeycloakAuthProvider redirects user to Keycloak authentication page.
+
+After successful authentication, Keycloak redirects user back to Data.Rentgen with authorization code.
+Then KeycloakAuthProvider exchanges authorization code for an access token and uses it to get user information from Keycloak server.
+If user is not found in Data.Rentgen database, KeycloakAuthProvider creates it. Finally, KeycloakAuthProvider returns user with access token.
+
+## Interaction schema
+
+```plantuml title="Interaction schema"
+
+ @startuml
+ title KeycloakAuthProvider
+ participant "Frontend"
+ participant "Backend"
+ participant "Keycloak"
+
+ == Frontend Authentication at Keycloak ==
+
+ Frontend -> Backend : Request endpoint with authentication (/v1/locations)
+
+ Backend x-[#red]> Frontend: 401 with redirect url in 'details' response field
+
+ Frontend -> Keycloak : Redirect user to Keycloak login page
+
+ alt Successful login
+ Frontend --> Keycloak : Log in with login and password
+ else Login failed
+ Keycloak x-[#red]> Frontend -- : Display error (401 Unauthorized)
+ end
+
+ Keycloak -> Frontend : Callback to Frontend /callback which is proxy between Keycloak and Backend
+
+ Frontend -> Backend : Send request to Backend '/v1/auth/callback'
+
+ Backend -> Keycloak : Check original 'state' and exchange code for tokens
+ Keycloak --> Backend : Return tokens
+ Backend --> Frontend : Set tokens in user's browser cookies
+
+ Frontend --> Backend : Request to /v1/locations with session cookies
+ Backend -> Backend : Get user info from token and check user in internal backend database
+ Backend -> Backend : Create user in internal backend database if not exist
+ Backend -[#green]> Frontend -- : Return requested data
+
+
+ == GET v1/datasets ==
+
+
+ alt Successful case
+ "Frontend" -> "Backend" ++ : access_token
+ "Backend" --> "Backend" : Validate token
+ "Backend" --> "Backend" : Check user in internal backend database
+ "Backend" -> "Backend" : Get data
+ "Backend" -[#green]> "Frontend" -- : Return data
+
+ else Token is expired (Successful case)
+ "Frontend" -> "Backend" ++ : access_token, refresh_token
+ "Backend" --> "Backend" : Validate token
+ "Backend" -[#yellow]> "Backend" : Token is expired
+ "Backend" --> "Keycloak" : Try to refresh token
+ "Backend" --> "Backend" : Validate new token
+ "Backend" --> "Backend" : Check user in internal backend database
+ "Backend" -> "Backend" : Get data
+ "Backend" -[#green]> "Frontend" -- : Return data
+
+ else Create new User
+ "Frontend" -> "Backend" ++ : access_token
+ "Backend" --> "Backend" : Validate token
+ "Backend" --> "Backend" : Check user in internal backend database
+ "Backend" --> "Backend" : Create new user
+ "Backend" -> "Backend" : Get data
+ "Backend" -[#green]> "Frontend" -- : Return data
+
+ else Token is expired and bad refresh token
+ "Frontend" -> "Backend" ++ : access_token, refresh_token
+ "Backend" --> "Backend" : Validate token
+ "Backend" -[#yellow]> "Backend" : Token is expired
+ "Backend" --> "Keycloak" : Try to refresh token
+ "Backend" x-[#red]> "Frontend" -- : RedirectResponse can't refresh
+
+ else Bad Token payload
+ "Frontend" -> "Backend" ++ : access_token, refresh_token
+ "Backend" --> "Backend" : Validate token
+ "Backend" x-[#red]> "Frontend" -- : 307 Authorization error
+
+ end
+
+ deactivate "Frontend"
+ @enduml
+
+```
+
+## Basic Configuration
+
+::: data_rentgen.server.settings.auth.keycloak.KeycloakAuthProviderSettings
+
+::: data_rentgen.server.settings.auth.keycloak.KeycloakSettings
diff --git a/mddocs/reference/server/auth/personal_tokens.md b/mddocs/reference/server/auth/personal_tokens.md
new file mode 100644
index 00000000..af746987
--- /dev/null
+++ b/mddocs/reference/server/auth/personal_tokens.md
@@ -0,0 +1,78 @@
+(auth-server-personal-tokens)=
+
+# Personal Tokens
+
+## Description
+
+This auth schema (not actually an AuthProvider) allows access to API endpoints using {ref}`personal-tokens`.
+If enabled, it has higher priority than AuthProvider.
+
+:::{note}
+Some endpoints, like creating/refreshing Personal Tokens, cannot be used with this auth type,
+as they require human interaction.
+:::
+
+## Interaction schema
+
+```{eval-rst}
+.. dropdown:: Interaction schema
+
+ .. plantuml::
+
+ @startuml
+ title PersonalTokensAuthProvider
+ participant "Client"
+ participant "Backend"
+ participant "Database"
+
+ == POST v1/personal-tokens ==
+
+ activate "Client"
+ "Client" -> "Backend" ++ : create new Personal token
+ "Backend" -[#green]> "Client" -- : Generate and return personal_token
+
+ == GET v1/datasets ==
+
+ alt Successful case (first request)
+ "Client" -> "Backend" ++ : Authorization Bearer personal_token
+ "Backend" -[#green]> "Backend" : Validate token
+ "Backend" -[#red]x "Backend" : Get token info from in-memory cache
+ "Backend" --> "Database" ++ : Fetch token info
+ "Database" --> "Backend" : Return token info
+ "Backend" -[#green]> "Backend" : Cache token
+ "Backend" --> "Database" : Fetch data
+ "Database" -[#green]> "Backend" -- : Return data
+ "Backend" -[#green]> "Client" -- : Return data
+
+ else Successful case (second request)
+ "Client" -> "Backend" ++ : Authorization Bearer personal_token
+ "Backend" -[#green]> "Backend" : Validate token
+ "Backend" -[#green]> "Backend" : Get token info from in-memory cache
+ "Backend" --> "Database" ++ : Fetch data
+ "Database" -[#green]> "Backend" -- : Return data
+ "Backend" -[#green]> "Client" -- : Return data
+
+ else Token is expired
+ "Client" -> "Backend" ++ : Authorization Bearer personal_token
+ "Backend" -[#red]x "Backend" : Validate token
+ "Backend" x-[#red]> "Client" -- : 401 Unauthorized
+
+ else Token was revoked
+ "Client" -> "Backend" ++ : Authorization Bearer personal_token
+ "Backend" -[#green]> "Backend" : Validate token
+ "Backend" -[#red]x "Backend" : Get token info from in-memory cache
+ "Backend" --> "Database" : Fetch token info
+ "Database" x-[#red]> "Backend" -- : No active token in database
+ "Backend" x-[#red]> "Client" -- : 401 Unauthorized
+ end
+
+ deactivate "Client"
+ @enduml
+```
+
+## Basic Configuration
+
+```{eval-rst}
+.. autopydantic_model:: data_rentgen.server.settings.auth.personal_token.PersonalTokenSettings
+
+```
diff --git a/mddocs/reference/server/configuration/cors.md b/mddocs/reference/server/configuration/cors.md
new file mode 100644
index 00000000..fcf955a4
--- /dev/null
+++ b/mddocs/reference/server/configuration/cors.md
@@ -0,0 +1,5 @@
+# CORS settings { #configuration-server-cors }
+
+These settings are used to control [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS) options.
+
+::: data_rentgen.server.settings.cors.CORSSettings
diff --git a/mddocs/reference/server/configuration/debug.md b/mddocs/reference/server/configuration/debug.md
new file mode 100644
index 00000000..b113d61c
--- /dev/null
+++ b/mddocs/reference/server/configuration/debug.md
@@ -0,0 +1,94 @@
+# Enabling debug { #configuration-server-debug }
+
+## Return debug info in REST API responses
+
+By default, server does not add error details to response bodies, to avoid exposing instance-specific information to end users.
+
+You can change this by setting:
+
+```console
+$ export DATA_RENTGEN__SERVER__DEBUG=False
+$ # start REST API server
+$ curl -XPOST http://localhost:8000/failing/endpoint ...
+{
+ "error": {
+ "code": "unknown",
+ "message": "Got unhandled exception. Please contact support",
+ "details": null,
+ },
+}
+```
+
+```console
+$ export DATA_RENTGEN__SERVER__DEBUG=True
+$ # start REST API server
+$ curl -XPOST http://localhost:8000/failing/endpoint ...
+Traceback (most recent call last):
+File ".../uvicorn/protocols/http/h11_impl.py", line 408, in run_asgi
+ result = await app( # type: ignore[func-returns-value]
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+File ".../site-packages/uvicorn/middleware/proxy_headers.py", line 84, in __call__
+ return await self.app(scope, receive, send)
+```
+
+!!! warning
+
+ This is for development environments only. Do **NOT** use it in production!
+
+## Print debug logs on backend
+
+See [Logging settings][configuration-server-logging], but replace log level `INFO` with `DEBUG`.
+
+## Fill up `X-Request-ID` header on backend
+
+Server can add an `X-Request-ID` header to responses, which allows matching a request on the client with the corresponding backend response.
+
+This is done by the `request_id` middleware, which is enabled by default and can be configured as described below:
+
+::: data_rentgen.server.settings.request_id.RequestIDSettings
+
+## Print request ID to backend logs
+
+This is done by adding a specific filter to logging handler:
+
+??? note "logging.yml"
+
+ ```yaml hl_lines="6-12 23-24 35"
+ ----8<----
+ data_rentgen/logging/presets/plain.yml
+ ----8<----
+ ```
+
+Resulting logs look like:
+
+```text
+2023-12-18 17:14:11.711 uvicorn.access:498 [INFO] 018c15e97a068ae09484f8c25e2799dd 127.0.0.1:34884 - "GET /monitoring/ping HTTP/1.1" 200
+```
+
+## Use `X-Request-ID` header on client
+
+If client got `X-Request-ID` header from backend, it is printed to logs with `DEBUG` level:
+
+```pycon
+>>> import logging
+>>> logging.basicConfig(level=logging.DEBUG)
+>>> client.ping()
+DEBUG:urllib3.connectionpool:http://localhost:8000 "GET /monitoring/ping HTTP/1.1" 200 15
+DEBUG:data_rentgen.client.base:Request ID: '018c15e97a068ae09484f8c25e2799dd'
+```
+
+Also, if REST API response was not successful, `Request ID` is added to exception message:
+
+```pycon
+>>> client.get_namespace("unknown")
+requests.exceptions.HTTPError: 404 Client Error: Not Found for url: http://localhost:8000/v1/namespaces/unknown
+Request ID: '018c15eb80fa81a6b38c9eaa519cd322'
+```
+
+## Fill up `X-Application-Version` header on REST API side
+
+Server can add an `X-Application-Version` header to responses, which allows determining which version of the backend is deployed.
+
+This is done by the `application_version` middleware, which is enabled by default and can be configured as described below:
+
+::: data_rentgen.server.settings.application_version.ApplicationVersionSettings
diff --git a/mddocs/reference/server/configuration/index.md b/mddocs/reference/server/configuration/index.md
new file mode 100644
index 00000000..9cc834a1
--- /dev/null
+++ b/mddocs/reference/server/configuration/index.md
@@ -0,0 +1,15 @@
+# REST API server configuration { #configuration-server }
+
+## Configuration
+
+- [logging][configuration-server-logging]
+- [monitoring][configuration-server-monitoring]
+- [cors][configuration-server-cors]
+- [session][configuration-server-session]
+- [static_files][configuration-server-static-files]
+- [openapi][configuration-server-openapi]
+- [debug][configuration-server-debug]
+
+::: data_rentgen.server.settings.ServerApplicationSettings
+
+::: data_rentgen.server.settings.ServerSettings
diff --git a/mddocs/reference/server/configuration/logging.md b/mddocs/reference/server/configuration/logging.md
new file mode 100644
index 00000000..717f0734
--- /dev/null
+++ b/mddocs/reference/server/configuration/logging.md
@@ -0,0 +1,3 @@
+# Logging settings { #configuration-server-logging }
+
+::: data_rentgen.logging.settings.LoggingSettings
diff --git a/mddocs/reference/server/configuration/monitoring.md b/mddocs/reference/server/configuration/monitoring.md
new file mode 100644
index 00000000..f03a5d53
--- /dev/null
+++ b/mddocs/reference/server/configuration/monitoring.md
@@ -0,0 +1,9 @@
+# Setup monitoring { #configuration-server-monitoring }
+
+REST API server provides the following endpoints with Prometheus compatible metrics:
+
+- `GET /monitoring/metrics` - server metrics, like number of requests per path and response status, CPU and RAM usage, and so on.
+
+These endpoints are enabled and configured using settings below:
+
+::: data_rentgen.server.settings.monitoring.MonitoringSettings
diff --git a/mddocs/reference/server/configuration/openapi.md b/mddocs/reference/server/configuration/openapi.md
new file mode 100644
index 00000000..bd408be8
--- /dev/null
+++ b/mddocs/reference/server/configuration/openapi.md
@@ -0,0 +1,13 @@
+# OpenAPI settings { #configuration-server-openapi }
+
+These settings are used to control exposing of the OpenAPI.json and SwaggerUI/ReDoc endpoints.
+
+::: data_rentgen.server.settings.openapi.OpenAPISettings
+
+::: data_rentgen.server.settings.openapi.SwaggerSettings
+
+::: data_rentgen.server.settings.openapi.RedocSettings
+
+::: data_rentgen.server.settings.openapi.LogoSettings
+
+::: data_rentgen.server.settings.openapi.FaviconSettings
diff --git a/mddocs/reference/server/configuration/session.md b/mddocs/reference/server/configuration/session.md
new file mode 100644
index 00000000..c09d7354
--- /dev/null
+++ b/mddocs/reference/server/configuration/session.md
@@ -0,0 +1,5 @@
+# Session cookie settings { #configuration-server-session }
+
+These settings are used to control the cookie used for storing user session data.
+
+::: data_rentgen.server.settings.session.SessionSettings
diff --git a/mddocs/reference/server/configuration/static_files.md b/mddocs/reference/server/configuration/static_files.md
new file mode 100644
index 00000000..cb4fb1ee
--- /dev/null
+++ b/mddocs/reference/server/configuration/static_files.md
@@ -0,0 +1,5 @@
+# Serving static files { #configuration-server-static-files }
+
+These settings are used to control serving of static files by the server.
+
+::: data_rentgen.server.settings.static_files.StaticFilesSettings
diff --git a/mddocs/reference/server/index.md b/mddocs/reference/server/index.md
new file mode 100644
index 00000000..4bf54be0
--- /dev/null
+++ b/mddocs/reference/server/index.md
@@ -0,0 +1,80 @@
+# REST API Server { #server }
+
+Data.Rentgen REST API server provides a simple HTTP API for accessing entities stored in the [database][database].
+Implemented using [FastAPI](https://fastapi.tiangolo.com/).
+
+## Install & run
+
+### With docker
+
+- Install [Docker](https://docs.docker.com/engine/install/)
+
+- Install [docker-compose](https://github.com/docker/compose/releases/)
+
+- Run the following command:
+
+ ```console
+ $ docker compose --profile server up -d --wait
+ ...
+ ```
+
+ `docker compose` will download all necessary images, create containers, and then start the server.
+
+ Options can be set via `.env` file or `environment` section in `docker-compose.yml`
+
+??? note "docker-compose.yml"
+
+ ```yaml hl_lines="71-99" linenums="1"
+ ----8<----
+ docker-compose.yml
+ ----8<----
+ ```
+
+??? note ".env.docker"
+
+ ```ini hl_lines="22-27" linenums="1"
+ ----8<----
+ .env.docker
+ ----8<----
+ ```
+
+- After server is started and ready, open [http://localhost:8000/docs](http://localhost:8000/docs).
+
+### Without docker
+
+- Install Python 3.10 or above
+
+- Setup [`database`][database], run migrations and create partitions
+
+- Create virtual environment
+
+ ```console
+ $ python -m venv /some/.venv
+ ...
+ $ source /some/.venv/bin/activate
+ ```
+
+- Install `data-rentgen` package with following *extra* dependencies:
+
+ ```console
+ $ pip install data-rentgen[server,postgres]
+ ...
+ ```
+
+- Run server process
+
+ ```console
+ $ python -m data_rentgen.server --host 0.0.0.0 --port 8000
+ ...
+ ```
+
+ This is a thin wrapper around [uvicorn](https://www.uvicorn.org/#command-line-options) cli,
+ options and commands are just the same.
+
+- After server is started and ready, open [http://localhost:8000/docs](http://localhost:8000/docs).
+
+## See also
+
+- [Authentication and Authorization][auth-server]
+- [REST API server configuration][configuration-server]
+- [OpenAPI specification][server-openapi]
diff --git a/mddocs/reference/server/openapi.md b/mddocs/reference/server/openapi.md
new file mode 100644
index 00000000..bff0881f
--- /dev/null
+++ b/mddocs/reference/server/openapi.md
@@ -0,0 +1,7 @@
+# OpenAPI specification { #server-openapi }
+
+% this page cannot be properly rendered in a local environment, it should be built in CI first
+
+----8<----
+mddocs/docs/en/_static/swagger_server.html
+----8<----
diff --git a/mddocs/robots.txt b/mddocs/robots.txt
new file mode 100644
index 00000000..30059ceb
--- /dev/null
+++ b/mddocs/robots.txt
@@ -0,0 +1,5 @@
+User-agent: *
+Allow: /*/stable/
+Allow: /en/stable/ # Fallback for bots that don't understand wildcards
+Disallow: /
+Sitemap: https://data-rentgen.readthedocs.io/sitemap.xml
\ No newline at end of file
diff --git a/mddocs/security.md b/mddocs/security.md
new file mode 100644
index 00000000..f680874a
--- /dev/null
+++ b/mddocs/security.md
@@ -0,0 +1,25 @@
+# Security
+
+## Supported Python versions
+
+* Server: 3.10 or above
+
+## Product development security recommendations
+
+1. Update dependencies to last stable version
+2. Build SBOM for the project
+3. Perform SAST (Static Application Security Testing) where possible
+
+## Product development security requirements
+
+1. No binaries in repository
+2. No passwords, keys, access tokens in source code
+3. No “Critical” and/or “High” vulnerabilities in contributed source code
+
+## Vulnerability reports
+
+Please use email [onetools@mts.ru](mailto:onetools@mts.ru) to report security issues or anything that could have security consequences.
+
+Please avoid any public disclosure (including registering issues) at least until it is fixed.
+
+Thank you in advance for understanding.