From d3250b7615c7bc1b897b30198dc66def585068c0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 25 Oct 2025 10:48:03 +0000 Subject: [PATCH 1/8] [INIT] Start translation to Simplified-Chinese --- .translation-init | 1 + 1 file changed, 1 insertion(+) create mode 100644 .translation-init diff --git a/.translation-init b/.translation-init new file mode 100644 index 0000000000..6064308ab7 --- /dev/null +++ b/.translation-init @@ -0,0 +1 @@ +Translation initialization: 2025-10-25T10:48:02.510790 From 2bf502844c06645bed78e70131b05621ae903125 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 25 Oct 2025 10:50:25 +0000 Subject: [PATCH 2/8] =?UTF-8?q?=F0=9F=8C=90=20Translate=2000-sql-analytics?= =?UTF-8?q?.md=20to=20Simplified-Chinese?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/cn/guides/54-query/00-sql-analytics.md | 313 +++++++++----------- 1 file changed, 143 insertions(+), 170 deletions(-) diff --git a/docs/cn/guides/54-query/00-sql-analytics.md b/docs/cn/guides/54-query/00-sql-analytics.md index 64afb2a6fa..dd69ec1754 100644 --- a/docs/cn/guides/54-query/00-sql-analytics.md +++ b/docs/cn/guides/54-query/00-sql-analytics.md @@ -2,276 +2,249 @@ title: SQL 分析(SQL Analytics) --- -> **场景(Scenario):** EverDrive Smart Vision 的分析师整理了一组共享的驾驶会话(drive sessions)和关键帧(key frames),使每个下游工作负载都能查询相同的 ID,而无需在系统之间复制数据。 +> **场景:** CityDrive 将每一次行车记录仪运行结果加载到共享关系表中,以便分析人员能够针对所有下游工作负载筛选、关联并聚合相同的 `video_id` / `frame_id` 对。 -本教程将构建一个微型的 **EverDrive Smart Vision** 数据集,并展示 Databend 的单一查询优化器(Query Optimizer)如何在其余指南中发挥作用。您在此处创建的每个 ID(`SES-20240801-SEA01`、`FRAME-0001` …)都会重新出现在 JSON、向量、地理和 ETL 演练中,形成一致的自动驾驶故事。 +本指南建模了该目录的关系部分,并突出展示实用的 SQL 构建块。这里的示例 ID 会在 JSON、向量、地理以及 ETL 指南中再次出现。 -## 1. 创建示例表 -两张表分别记录测试会话和从行车记录仪视频中提取的重要帧。 +## 1. 
创建基础表 +`citydrive_videos` 存储片段元数据,而 `frame_events` 记录从每个片段中提取的关键帧。 ```sql -CREATE OR REPLACE TABLE drive_sessions ( - session_id VARCHAR, - vehicle_id VARCHAR, - route_name VARCHAR, - start_time TIMESTAMP, - end_time TIMESTAMP, - weather VARCHAR, - camera_setup VARCHAR +CREATE OR REPLACE TABLE citydrive_videos ( + video_id STRING, + vehicle_id STRING, + capture_date DATE, + route_name STRING, + weather STRING, + camera_source STRING, + duration_sec INT ); CREATE OR REPLACE TABLE frame_events ( - frame_id VARCHAR, - session_id VARCHAR, - frame_index INT, - captured_at TIMESTAMP, - event_type VARCHAR, - risk_score DOUBLE + frame_id STRING, + video_id STRING, + frame_index INT, + collected_at TIMESTAMP, + event_tag STRING, + risk_score DOUBLE, + speed_kmh DOUBLE ); -INSERT INTO drive_sessions VALUES - ('SES-20240801-SEA01', 'VEH-01', 'Seattle → Bellevue → Seattle', '2024-08-01 09:00', '2024-08-01 10:10', 'Sunny', 'Dual 1080p'), - ('SES-20240802-SEA02', 'VEH-02', 'Downtown Night Loop', '2024-08-02 20:15', '2024-08-02 21:05', 'Light Rain','Night Vision'), - ('SES-20240803-SEA03', 'VEH-03', 'Harbor Industrial Route', '2024-08-03 14:05', '2024-08-03 15:30', 'Overcast', 'Thermal + RGB'); +INSERT INTO citydrive_videos VALUES + ('VID-20250101-001', 'VEH-21', '2025-01-01', 'Downtown Loop', 'Rain', 'roof_cam', 3580), + ('VID-20250101-002', 'VEH-05', '2025-01-01', 'Port Perimeter', 'Overcast', 'front_cam',4020), + ('VID-20250102-001', 'VEH-21', '2025-01-02', 'Airport Connector', 'Clear', 'front_cam',3655), + ('VID-20250103-001', 'VEH-11', '2025-01-03', 'CBD Night Sweep', 'LightFog', 'rear_cam', 3310); INSERT INTO frame_events VALUES - ('FRAME-0001', 'SES-20240801-SEA01', 120, '2024-08-01 09:32:15', 'SuddenBrake', 0.82), - ('FRAME-0002', 'SES-20240801-SEA01', 342, '2024-08-01 09:48:03', 'CrosswalkPedestrian', 0.67), - ('FRAME-0003', 'SES-20240802-SEA02', 88, '2024-08-02 20:29:41', 'NightLowVisibility', 0.59), - ('FRAME-0004', 'SES-20240802-SEA02', 214, '2024-08-02 20:48:12', 
'EmergencyVehicle', 0.73), - ('FRAME-0005', 'SES-20240803-SEA03', 305, '2024-08-03 15:02:44', 'CyclistOvertake', 0.64); + ('FRAME-0101', 'VID-20250101-001', 125, '2025-01-01 08:15:21', 'hard_brake', 0.81, 32.4), + ('FRAME-0102', 'VID-20250101-001', 416, '2025-01-01 08:33:54', 'pedestrian', 0.67, 24.8), + ('FRAME-0201', 'VID-20250101-002', 298, '2025-01-01 11:12:02', 'lane_merge', 0.74, 48.1), + ('FRAME-0301', 'VID-20250102-001', 188, '2025-01-02 09:44:18', 'hard_brake', 0.59, 52.6), + ('FRAME-0401', 'VID-20250103-001', 522, '2025-01-03 21:18:07', 'night_lowlight', 0.63, 38.9); ``` -> 需要回顾表 DDL?请参阅 [CREATE TABLE](/sql/sql-commands/ddl/table/ddl-create-table)。 +文档: [CREATE TABLE](/sql/sql-commands/ddl/table/ddl-create-table)、[INSERT](/sql/sql-commands/dml/dml-insert)。 --- -## 2. 过滤最近会话 -让分析聚焦在最新的驾驶记录上。 +## 2. 筛选工作集 +将调查重点放在最新的行驶记录上。 ```sql -WITH recent_sessions AS ( - SELECT * - FROM drive_sessions - WHERE start_time >= DATEADD('day', -7, CURRENT_TIMESTAMP) +WITH recent_videos AS ( + SELECT * + FROM citydrive_videos + WHERE capture_date >= DATEADD('day', -3, TODAY()) ) -SELECT * -FROM recent_sessions -ORDER BY start_time DESC; +SELECT v.video_id, + v.route_name, + v.weather, + COUNT(f.frame_id) AS flagged_frames +FROM recent_videos v +LEFT JOIN frame_events f USING (video_id) +GROUP BY v.video_id, v.route_name, v.weather +ORDER BY flagged_frames DESC; ``` -尽早过滤可加快后续连接(JOIN)与聚合(GROUP BY)。文档:[WHERE & CASE](/sql/sql-commands/query-syntax/query-select#where-clause)。 +文档: [DATEADD](/sql/sql-functions/datetime-functions/date-add)、[GROUP BY](/sql/sql-commands/query-syntax/query-select#group-by-clause)。 --- -## 3. 连接(JOIN) -### INNER JOIN ... USING -合并会话元数据与帧级事件。 - +## 3. 
JOIN 模式 +### 用于帧上下文的内连接(INNER JOIN) ```sql -WITH recent_events AS ( - SELECT * - FROM frame_events - WHERE captured_at >= DATEADD('day', -7, CURRENT_TIMESTAMP) -) -SELECT e.frame_id, - e.captured_at, - e.event_type, - e.risk_score, - s.vehicle_id, - s.route_name, - s.weather -FROM recent_events e -JOIN drive_sessions s USING (session_id) -ORDER BY e.captured_at; +SELECT f.frame_id, + f.event_tag, + f.risk_score, + v.route_name, + v.camera_source +FROM frame_events AS f +JOIN citydrive_videos AS v USING (video_id) +ORDER BY f.collected_at; ``` -### NOT EXISTS(反连接/Anti Join) -查找缺少会话元数据的事件。 - +### 反连接(Anti Join)质量检查 ```sql SELECT frame_id -FROM frame_events e +FROM frame_events f WHERE NOT EXISTS ( - SELECT 1 - FROM drive_sessions s - WHERE s.session_id = e.session_id + SELECT 1 + FROM citydrive_videos v + WHERE v.video_id = f.video_id ); ``` -### LATERAL FLATTEN(JSON 展开/Unnest) -将事件与 JSON 载荷中的检测对象合并。 - +### 使用 LATERAL FLATTEN 处理嵌套检测 ```sql -SELECT e.frame_id, - obj.value['type']::STRING AS object_type -FROM frame_events e -JOIN frame_payloads p USING (frame_id), - LATERAL FLATTEN(p.payload['objects']) AS obj; +SELECT f.frame_id, + obj.value['type']::STRING AS detected_type, + obj.value['confidence']::DOUBLE AS confidence +FROM frame_events AS f +JOIN frame_payloads AS p ON f.frame_id = p.frame_id, + LATERAL FLATTEN(input => p.payload['objects']) AS obj +WHERE f.event_tag = 'pedestrian' +ORDER BY confidence DESC; ``` -更多模式:[JOIN 参考](/sql/sql-commands/query-syntax/query-join)。 +文档: [JOIN](/sql/sql-commands/query-syntax/query-join)、[FLATTEN](/sql/sql-functions/table-functions/flatten)。 --- -## 4. 分组(GROUP BY) -### GROUP BY route_name, event_type -标准 `GROUP BY` 比较路线与事件类型。 - +## 4. 
车队 KPI 聚合 +### 按路线划分的行为 ```sql -WITH recent_events AS ( - SELECT * - FROM frame_events - WHERE captured_at >= DATEADD('week', -4, CURRENT_TIMESTAMP) -) -SELECT route_name, - event_type, - COUNT(*) AS event_count, - AVG(risk_score) AS avg_risk -FROM recent_events -JOIN drive_sessions USING (session_id) -GROUP BY route_name, event_type -ORDER BY avg_risk DESC, event_count DESC; +SELECT v.route_name, + f.event_tag, + COUNT(*) AS occurrences, + AVG(f.risk_score) AS avg_risk +FROM frame_events f +JOIN citydrive_videos v USING (video_id) +GROUP BY v.route_name, f.event_tag +ORDER BY avg_risk DESC, occurrences DESC; ``` -### GROUP BY ROLLUP -增加路线小计及总计。 - +### ROLLUP 汇总 ```sql -SELECT route_name, - event_type, - COUNT(*) AS event_count, - AVG(risk_score) AS avg_risk -FROM frame_events -JOIN drive_sessions USING (session_id) -GROUP BY ROLLUP(route_name, event_type) -ORDER BY route_name NULLS LAST, event_type; +SELECT v.route_name, + f.event_tag, + COUNT(*) AS occurrences +FROM frame_events f +JOIN citydrive_videos v USING (video_id) +GROUP BY ROLLUP(v.route_name, f.event_tag) +ORDER BY v.route_name NULLS LAST, f.event_tag; ``` -### GROUP BY CUBE -生成路线与事件类型的所有组合。 - +### 针对路线 × 天气覆盖的 CUBE ```sql -SELECT route_name, - event_type, - COUNT(*) AS event_count, - AVG(risk_score) AS avg_risk -FROM frame_events -JOIN drive_sessions USING (session_id) -GROUP BY CUBE(route_name, event_type) -ORDER BY route_name NULLS LAST, event_type; +SELECT v.route_name, + v.weather, + COUNT(DISTINCT v.video_id) AS videos +FROM citydrive_videos v +GROUP BY CUBE(v.route_name, v.weather) +ORDER BY v.route_name NULLS LAST, v.weather NULLS LAST; ``` --- -## 5. 窗口函数(WINDOW FUNCTION) -### SUM(...) OVER(运行总计/running total) -用运行 `SUM` 跟踪每次驾驶的累积风险。 - +## 5. 
窗口函数(Window Functions) +### 按视频计算的风险累计 ```sql -WITH session_event_scores AS ( - SELECT session_id, - captured_at, - risk_score - FROM frame_events +WITH ordered_events AS ( + SELECT video_id, collected_at, risk_score + FROM frame_events ) -SELECT session_id, - captured_at, +SELECT video_id, + collected_at, risk_score, SUM(risk_score) OVER ( - PARTITION BY session_id - ORDER BY captured_at + PARTITION BY video_id + ORDER BY collected_at ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) AS cumulative_risk -FROM session_event_scores -ORDER BY session_id, captured_at; +FROM ordered_events +ORDER BY video_id, collected_at; ``` -### AVG(...) OVER(移动平均/moving average) -显示最近三个事件的风险移动平均: - +### 最近帧的滚动平均 ```sql -WITH session_event_scores AS ( - SELECT session_id, - captured_at, - risk_score - FROM frame_events -) -SELECT session_id, - captured_at, +SELECT video_id, + frame_id, + frame_index, risk_score, AVG(risk_score) OVER ( - PARTITION BY session_id - ORDER BY captured_at + PARTITION BY video_id + ORDER BY frame_index ROWS BETWEEN 3 PRECEDING AND CURRENT ROW - ) AS moving_avg_risk -FROM session_event_scores -ORDER BY session_id, captured_at; + ) AS rolling_avg_risk +FROM frame_events +ORDER BY video_id, frame_index; ``` -窗口函数(Window Functions)让你以内联方式表达滚动总计或平均值。完整列表:[窗口函数(Window Functions)](/sql/sql-functions/window-functions)。 +文档: [窗口函数(Window functions)](/sql/sql-functions/window-functions)。 --- -## 6. 聚合索引加速(Aggregating Index Acceleration) -用[聚合索引(Aggregating Index)](/guides/performance/aggregating-index)缓存繁重汇总,让仪表盘保持秒级响应。 +## 6. 
聚合索引(Aggregating Index)加速 +为仪表盘(Dashboard)持久保存常用汇总结果。 ```sql -CREATE OR REPLACE AGGREGATING INDEX idx_route_event_summary ON frame_events +CREATE OR REPLACE AGGREGATING INDEX idx_video_event_summary AS -SELECT session_id, - event_type, +SELECT video_id, + event_tag, COUNT(*) AS event_count, AVG(risk_score) AS avg_risk FROM frame_events -GROUP BY session_id, event_type; +GROUP BY video_id, event_tag; ``` -再次运行相同的汇总查询——优化器将自动命中索引: +当分析人员重复执行熟悉的 KPI 时,查询优化器(Query Optimizer)会直接从该索引提供结果: ```sql -SELECT s.route_name, - e.event_type, +SELECT v.route_name, + e.event_tag, COUNT(*) AS event_count, AVG(e.risk_score) AS avg_risk FROM frame_events e -JOIN drive_sessions s USING (session_id) -WHERE s.start_time >= DATEADD('week', -8, CURRENT_TIMESTAMP) -GROUP BY s.route_name, e.event_type +JOIN citydrive_videos v USING (video_id) +WHERE v.capture_date >= DATEADD('day', -14, TODAY()) +GROUP BY v.route_name, e.event_tag ORDER BY avg_risk DESC; ``` -`EXPLAIN` 该语句可看到 `AggregatingIndex` 节点而非全表扫描。Databend 在新帧到达时自动刷新索引,无需额外 ETL 即可实现亚秒级仪表盘体验。 +文档: [聚合索引(Aggregating Index)](/guides/performance/aggregating-index) 与 [EXPLAIN](/sql/sql-commands/explain-cmds/explain)。 --- -## 7. 存储过程自动化(Stored Procedure Automation) -将报告逻辑封装到存储过程(Stored Procedure)中,确保在定时任务中按预期执行。 +## 7. 
存储过程(Stored Procedure)自动化 +封装逻辑,使计划任务始终生成一致的报告。 ```sql -CREATE OR REPLACE PROCEDURE generate_weekly_route_report(days_back INT) -RETURNS TABLE(route_name VARCHAR, event_count BIGINT, avg_risk DOUBLE) +CREATE OR REPLACE PROCEDURE citydrive_route_report(days_back UINT8) +RETURNS TABLE(route_name STRING, event_tag STRING, event_count BIGINT, avg_risk DOUBLE) LANGUAGE SQL AS $$ BEGIN RETURN TABLE ( - SELECT s.route_name, - COUNT(*) AS event_count, - AVG(e.risk_score) AS avg_risk + SELECT v.route_name, + e.event_tag, + COUNT(*) AS event_count, + AVG(e.risk_score) AS avg_risk FROM frame_events e - JOIN drive_sessions s USING (session_id) - WHERE e.captured_at >= DATEADD('day', -days_back, CURRENT_TIMESTAMP) - GROUP BY s.route_name + JOIN citydrive_videos v USING (video_id) + WHERE v.capture_date >= DATEADD('day', -:days_back, TODAY()) + GROUP BY v.route_name, e.event_tag ); END; $$; -CALL PROCEDURE generate_weekly_route_report(28); +CALL PROCEDURE citydrive_route_report(30); ``` -返回的结果集可直接用于笔记本、ETL 任务或自动告警。了解更多:[存储过程脚本(Stored Procedure Scripting)](/sql/stored-procedure-scripting)。 +存储过程(Stored Procedure)可以手动触发,通过 [TASKS](/guides/load-data/continuous-data-pipelines/task) 触发,或由编排工具触发。 --- -至此,您已拥有完整闭环:摄取会话数据、过滤、连接、聚合、加速重查询、趋势分析并发布。只需替换过滤条件或连接方式,即可将同一套方案应用于驾驶员评分、传感器退化或算法对比等其他智能驾驶 KPI。 \ No newline at end of file +借助这些表和模式,CityDrive 其余指南可以引用完全相同的 `video_id` 主键——`frame_metadata_catalog` 用于 JSON 搜索,帧嵌入用于相似度分析,GPS 位置用于地理查询,以及单一的 ETL 流程保持它们之间的同步。 \ No newline at end of file From e78fedb5705016f174fd9c2ec5e21e3139dc142c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 25 Oct 2025 10:51:04 +0000 Subject: [PATCH 3/8] =?UTF-8?q?=F0=9F=8C=90=20Translate=2000-sql-analytics?= =?UTF-8?q?.md=20to=20Simplified-Chinese?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/cn/guides/54-query/01-json-search.md | 157 +++++++--------------- 1 file changed, 47 insertions(+), 110 deletions(-) diff --git 
a/docs/cn/guides/54-query/01-json-search.md b/docs/cn/guides/54-query/01-json-search.md index 11d1202079..97f3bbb085 100644 --- a/docs/cn/guides/54-query/01-json-search.md +++ b/docs/cn/guides/54-query/01-json-search.md @@ -1,140 +1,77 @@ --- -title: JSON 与搜索(Search) +title: JSON 与搜索 --- -> **场景(Scenario):** EverDrive Smart Vision 的感知服务会为每个观察到的帧发出 JSON 有效载荷(payloads),安全分析师需要在不将数据移出 Databend 的情况下搜索检测结果。 +> **场景:** CityDrive 会为每个提取的帧附加一个元数据 JSON 负载,并需要在不将其从 Databend 中复制出去的情况下,对该 JSON 进行类似 Elasticsearch 的过滤。 -EverDrive 的感知 Pipeline(流水线)会发出 JSON 有效载荷,我们可以使用 Elasticsearch 风格的语法进行查询。通过将有效载荷存储为 VARIANT 类型并在创建表时声明倒排索引(inverted index),Databend 允许您直接在数据上运行 Lucene 的 `QUERY` 过滤器。 +Databend 将这些异构信号保存在同一个 warehouse 中。倒排索引为 VARIANT 列提供类似 Elasticsearch 的搜索能力,位图表汇总标签覆盖情况,向量索引回答相似性查找,原生 GEOMETRY 列支持空间过滤。 -## 1. 创建示例表 -每个帧都携带着来自感知模型(边界框、速度、分类)的结构化元数据。 +## 1. 创建元数据表 +为每个帧存储一个 JSON 负载,这样每次搜索都针对相同的结构。 ```sql -CREATE OR REPLACE TABLE frame_payloads ( - frame_id VARCHAR, - run_stage VARCHAR, - payload VARIANT, - logged_at TIMESTAMP, - INVERTED INDEX idx_frame_payloads(payload) -- 声明倒排索引(inverted index) -); - -INSERT INTO frame_payloads VALUES - ('FRAME-0001', 'detection', PARSE_JSON('{ - "objects": [ - {"type":"vehicle","bbox":[545,220,630,380],"confidence":0.94}, - {"type":"pedestrian","bbox":[710,200,765,350],"confidence":0.88} - ], - "ego": {"speed_kmh": 32.5, "accel": -2.1} - }'), '2024-08-01 09:32:16'), - ('FRAME-0002', 'detection', PARSE_JSON('{ - "objects": [ - {"type":"pedestrian","bbox":[620,210,670,360],"confidence":0.91} - ], - "scene": {"lighting":"daytime","weather":"sunny"} - }'), '2024-08-01 09:48:04'), - ('FRAME-0003', 'tracking', PARSE_JSON('{ - "objects": [ - {"type":"vehicle","speed_kmh": 18.0,"distance_m": 6.2}, - {"type":"emergency_vehicle","sirens":true} - ], - "scene": {"lighting":"night","visibility":"low"} - }'), '2024-08-02 20:29:42'); -``` - -## 2. 
提取 JSON 路径 -查看有效载荷以确认结构。 - -```sql -SELECT frame_id, - payload['objects'][0]['type']::STRING AS first_object, - payload['ego']['speed_kmh']::DOUBLE AS ego_speed, - payload['scene']['lighting']::STRING AS lighting -FROM frame_payloads -ORDER BY logged_at; +CREATE DATABASE IF NOT EXISTS video_unified_demo; +USE video_unified_demo; + +CREATE OR REPLACE TABLE frame_metadata_catalog ( + doc_id STRING, + meta_json VARIANT, + captured_at TIMESTAMP, + INVERTED INDEX idx_meta_json (meta_json) +) CLUSTER BY (captured_at); ``` -使用 `::STRING` / `::DOUBLE` 进行类型转换(Casting)可以将 JSON 值暴露给常规的 SQL 过滤器。Databend 还通过 `QUERY` 函数支持在此数据之上进行 Elasticsearch 风格的搜索——通过在变体字段前加上列名(例如 `payload.objects.type`)来引用它们。更多提示:[加载半结构化数据](/guides/load-data/load-semistructured/load-ndjson)。 - ---- - -## 3. Elasticsearch 风格的搜索(Search) -`QUERY` 使用 Elasticsearch/Lucene 语法,因此您可以组合布尔逻辑、范围、权重(boosts)和列表。以下是 EverDrive 有效载荷上的几种模式: - -### 数组匹配(Array Match) -查找检测到行人的帧: +> 需要多模态数据(向量嵌入、GPS 轨迹、标签位图)?可以从 [向量](./02-vector-db.md) 和 [地理](./03-geo-analytics.md) 指南中获取这些模式,以便与此处展示的搜索结果结合使用。 +## 2. 
使用 `QUERY()` 的搜索模式 +### 数组匹配 ```sql -SELECT frame_id -FROM frame_payloads -WHERE QUERY('payload.objects.type:pedestrian') -ORDER BY logged_at DESC -LIMIT 10; +SELECT doc_id, + captured_at, + meta_json['detections'] AS detections +FROM frame_metadata_catalog +WHERE QUERY('meta_json.detections.objects.type:pedestrian') +ORDER BY captured_at DESC +LIMIT 5; ``` ### 布尔 AND -车辆行驶速度大于 30 km/h **且** 检测到行人: - ```sql -SELECT frame_id, - payload['ego']['speed_kmh']::DOUBLE AS ego_speed -FROM frame_payloads -WHERE QUERY('payload.objects.type:pedestrian AND payload.ego.speed_kmh:[30 TO *]') -ORDER BY ego_speed DESC; +SELECT doc_id, captured_at +FROM frame_metadata_catalog +WHERE QUERY('meta_json.scene.weather_code:rain + AND meta_json.camera.sensor_view:roof') +ORDER BY captured_at; ``` ### 布尔 OR / 列表 -夜间驾驶遇到紧急车辆或骑自行车的人: - ```sql -SELECT frame_id -FROM frame_payloads -WHERE QUERY('payload.scene.lighting:night AND payload.objects.type:(emergency_vehicle OR cyclist)'); +SELECT doc_id, + meta_json['media_meta']['tagging']['labels'] AS labels +FROM frame_metadata_catalog +WHERE QUERY('meta_json.media_meta.tagging.labels:(hard_brake OR swerve OR lane_merge)') +ORDER BY captured_at DESC +LIMIT 10; ``` ### 数值范围 -速度在 10–25 km/h 之间(包含)或严格在 25–40 km/h 之间: - ```sql -SELECT frame_id, - payload['ego']['speed_kmh'] AS speed -FROM frame_payloads -WHERE QUERY('payload.ego.speed_kmh:[10 TO 25] OR payload.ego.speed_kmh:{25 TO 40}') -ORDER BY speed; +SELECT doc_id, + meta_json['vehicle']['speed_kmh']::DOUBLE AS speed +FROM frame_metadata_catalog +WHERE QUERY('meta_json.vehicle.speed_kmh:{30 TO 80}') +ORDER BY speed DESC +LIMIT 10; ``` -### 权重(Boosting) -优先考虑同时出现行人和车辆的帧,但强调行人项: - +### 提升 ```sql -SELECT frame_id, +SELECT doc_id, SCORE() AS relevance -FROM frame_payloads -WHERE QUERY('payload.objects.type:pedestrian^2 AND payload.objects.type:vehicle') +FROM frame_metadata_catalog +WHERE QUERY('meta_json.scene.weather_code:rain AND (meta_json.media_meta.tagging.labels:hard_brake^2 OR 
meta_json.media_meta.tagging.labels:swerve)') ORDER BY relevance DESC -LIMIT 10; -``` - -请参阅 [搜索函数](/sql/sql-functions/search-functions) 以了解 `QUERY`、`SCORE()` 和相关辅助函数支持的完整 Elasticsearch 语法。 - ---- - -## 4. 交叉引用帧事件 -将查询结果连接回在分析指南中创建的帧级风险评分。 - -```sql -WITH risky_frames AS ( - SELECT frame_id, - payload['ego']['speed_kmh']::DOUBLE AS ego_speed - FROM frame_payloads - WHERE QUERY('payload.objects.type:pedestrian AND payload.ego.speed_kmh:[30 TO *]') -) -SELECT r.frame_id, - e.event_type, - e.risk_score, - r.ego_speed -FROM risky_frames r -JOIN frame_events e USING (frame_id) -ORDER BY e.risk_score DESC; +LIMIT 8; ``` -由于 `frame_id` 在表之间共享,您可以立即从原始有效载荷跳转到精选分析结果。 \ No newline at end of file +`QUERY()` 遵循 Elasticsearch 语义(布尔逻辑、范围、提升、列表)。`SCORE()` 暴露 Elasticsearch 相关性,可在 SQL 中对结果重新排序。完整的运算符列表请参阅 [搜索函数](/sql/sql-functions/search-functions)。 \ No newline at end of file From 0abe3b3ae81965d7be74f908b1755b599a1981df Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 25 Oct 2025 10:51:32 +0000 Subject: [PATCH 4/8] =?UTF-8?q?=F0=9F=8C=90=20Translate=2000-sql-analytics?= =?UTF-8?q?.md=20to=20Simplified-Chinese?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/cn/guides/54-query/02-vector-db.md | 96 +++++++++++++------------ 1 file changed, 50 insertions(+), 46 deletions(-) diff --git a/docs/cn/guides/54-query/02-vector-db.md b/docs/cn/guides/54-query/02-vector-db.md index 34e38f4395..2543ae692f 100644 --- a/docs/cn/guides/54-query/02-vector-db.md +++ b/docs/cn/guides/54-query/02-vector-db.md @@ -2,94 +2,98 @@ title: 向量搜索(Vector Search) --- -> **场景:** EverDrive Smart Vision 将紧凑的视觉嵌入(vision embeddings)附加到高风险帧,以便调查团队直接在 Databend 内检索相似场景。 +> **场景:** CityDrive 将每一帧的嵌入向量(Embedding)存储在 Databend 中,让语义相似性搜索(Semantic Similarity Search,“找到看起来像这样的帧”)与传统 SQL 分析并行运行——无需额外的向量服务。 -每帧都附带视觉嵌入,感知工程师可借此发现相似情况。本指南演示如何插入这些向量,并在同一 EverDrive ID 上执行语义搜索。 +`frame_embeddings` 表与 `frame_events`、`frame_payloads` 和 `frame_geo_points` 共享相同的 
`frame_id` 键,从而将语义搜索与经典 SQL 紧密结合。 -## 1. 创建示例表 -为便于阅读,示例使用四维向量。生产环境中可保存 CLIP 或自监督模型输出的 512 维或 1536 维嵌入。 +## 1. 准备嵌入向量(Embedding)表 +生产模型通常会输出 512–1536 维。本示例使用 512 维,因此您可以直接将其复制到演示集群中而无需修改 DDL。 ```sql CREATE OR REPLACE TABLE frame_embeddings ( - frame_id VARCHAR, - session_id VARCHAR, - embedding VECTOR(4), - model_version VARCHAR, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - VECTOR INDEX idx_frame_embeddings(embedding) distance='cosine' + frame_id STRING, + video_id STRING, + sensor_view STRING, + embedding VECTOR(512), + encoder_build STRING, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + VECTOR INDEX idx_frame_embeddings(embedding) distance='cosine' ); INSERT INTO frame_embeddings VALUES - ('FRAME-0001', 'SES-20240801-SEA01', [0.18, 0.42, 0.07, 0.12]::VECTOR(4), 'clip-mini-v1', DEFAULT), - ('FRAME-0002', 'SES-20240801-SEA01', [0.20, 0.38, 0.12, 0.18]::VECTOR(4), 'clip-mini-v1', DEFAULT), - ('FRAME-0003', 'SES-20240802-SEA02', [0.62, 0.55, 0.58, 0.61]::VECTOR(4), 'night-fusion-v2', DEFAULT), - ('FRAME-0004', 'SES-20240802-SEA02', [0.57, 0.49, 0.52, 0.55]::VECTOR(4), 'night-fusion-v2', DEFAULT); + ('FRAME-0101', 'VID-20250101-001', 'roof_cam', RANDOM_VECTOR(512), 'clip-lite-v1', DEFAULT), + ('FRAME-0102', 'VID-20250101-001', 'roof_cam', RANDOM_VECTOR(512), 'clip-lite-v1', DEFAULT), + ('FRAME-0201', 'VID-20250101-002', 'front_cam',RANDOM_VECTOR(512), 'night-fusion-v2', DEFAULT), + ('FRAME-0401', 'VID-20250103-001', 'rear_cam', RANDOM_VECTOR(512), 'night-fusion-v2', DEFAULT); ``` -文档:[向量数据类型(Vector data type)](/sql/sql-reference/data-types/vector) 与 [向量索引(Vector index)](/sql/sql-reference/data-types/vector#vector-indexing)。 +文档: [向量数据类型(Vector Type)](/sql/sql-reference/data-types/vector) 和 [向量索引(Vector Index)](/sql/sql-reference/data-types/vector#vector-indexing)。 --- -## 2. COSINE_DISTANCE 搜索 -查找与 `FRAME-0001` 最相似的帧。 +## 2. 
运行余弦相似度搜索(Cosine Search) +获取某一帧的嵌入向量(Embedding),让 HNSW 索引(HNSW Index)返回最接近的邻居。 ```sql WITH query_embedding AS ( - SELECT embedding - FROM frame_embeddings - WHERE frame_id = 'FRAME-0001' - LIMIT 1 + SELECT embedding + FROM frame_embeddings + WHERE frame_id = 'FRAME-0101' ) SELECT e.frame_id, - e.session_id, - cosine_distance(e.embedding, q.embedding) AS distance -FROM frame_embeddings e -CROSS JOIN query_embedding q + e.video_id, + COSINE_DISTANCE(e.embedding, q.embedding) AS distance +FROM frame_embeddings AS e +CROSS JOIN query_embedding AS q ORDER BY distance LIMIT 3; ``` -余弦距离计算将利用先前创建的 HNSW 索引,优先返回最近邻帧。 +距离越小表示越相似。`VECTOR INDEX` 向量索引(Vector Index)即使面对数百万帧也能保持低延迟。 ---- - -## 3. WHERE 过滤 + 相似度 -结合相似度搜索与传统谓词,缩小结果范围。 +在向量比较之前或之后添加传统谓词(Predicate)(路线、视频、传感器视角)可以缩小候选集合。 ```sql WITH query_embedding AS ( - SELECT embedding - FROM frame_embeddings - WHERE frame_id = 'FRAME-0003' - LIMIT 1 + SELECT embedding + FROM frame_embeddings + WHERE frame_id = 'FRAME-0201' ) SELECT e.frame_id, - cosine_distance(e.embedding, q.embedding) AS distance -FROM frame_embeddings e -CROSS JOIN query_embedding q -WHERE e.session_id = 'SES-20240802-SEA02' -ORDER BY distance; + e.sensor_view, + COSINE_DISTANCE(e.embedding, q.embedding) AS distance +FROM frame_embeddings AS e +CROSS JOIN query_embedding AS q +WHERE e.sensor_view = 'rear_cam' +ORDER BY distance +LIMIT 5; ``` +查询优化器(Query Optimizer)仍会在满足 `sensor_view` 过滤条件的同时使用向量索引(Vector Index)。 + --- -## 4. JOIN 语义 + 风险元数据 -将语义结果与风险评分或检测载荷关联,丰富调查维度。 +## 3. 
丰富相似帧 +将匹配度最高的结果物化,然后与 `frame_events` 结合,为后续分析提供更多信息。 ```sql WITH query_embedding AS ( - SELECT embedding FROM frame_embeddings WHERE frame_id = 'FRAME-0001' LIMIT 1 + SELECT embedding + FROM frame_embeddings + WHERE frame_id = 'FRAME-0102' ), similar_frames AS ( - SELECT frame_id, - cosine_distance(e.embedding, q.embedding) AS distance + SELECT frame_id, + video_id, + COSINE_DISTANCE(e.embedding, q.embedding) AS distance FROM frame_embeddings e CROSS JOIN query_embedding q ORDER BY distance LIMIT 5 ) SELECT sf.frame_id, - fe.event_type, + sf.video_id, + fe.event_tag, fe.risk_score, sf.distance FROM similar_frames sf @@ -97,4 +101,4 @@ LEFT JOIN frame_events fe USING (frame_id) ORDER BY sf.distance; ``` -该混合视图呈现“外观类似 FRAME-0001 且触发高风险事件的帧”。 \ No newline at end of file +由于嵌入向量(Embedding)与关系型表并存,您可以从“看起来相似的帧”无缝转向“同时具有 `hard_brake` 标签、特定天气或 JSON 检测的帧”,而无需将数据导出到其他服务。 \ No newline at end of file From e3d328371a368f4bc0848e8d5b63560a4dca08c6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 25 Oct 2025 10:53:11 +0000 Subject: [PATCH 5/8] =?UTF-8?q?=F0=9F=8C=90=20Translate=2000-sql-analytics?= =?UTF-8?q?.md=20to=20Simplified-Chinese?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/cn/guides/54-query/03-geo-analytics.md | 193 +++++++++++--------- 1 file changed, 105 insertions(+), 88 deletions(-) diff --git a/docs/cn/guides/54-query/03-geo-analytics.md b/docs/cn/guides/54-query/03-geo-analytics.md index 239caded11..9c86840205 100644 --- a/docs/cn/guides/54-query/03-geo-analytics.md +++ b/docs/cn/guides/54-query/03-geo-analytics.md @@ -1,139 +1,156 @@ --- -title: 地理空间分析(Geo Analytics) +title: 地理分析(Geo Analytics) --- -> **场景(Scenario):** EverDrive Smart Vision 会记录每个关键帧的 GPS 坐标,以便运营团队在城市中绘制危险驾驶热点图。 +> **场景:** CityDrive 为每个被标记的帧记录精确的 GPS 定位与交通信号距离,让运营团队仅凭 SQL 即可回答“事发地点在哪”。 -每帧都带有 GPS 坐标,因此我们可以把危险情况映射到整个城市。本指南新增一张地理空间表,并使用相同的 EverDrive 会话 ID 演示空间过滤、多边形和 H3 分桶。 +`frame_geo_points` 与 `signal_contact_points` 
沿用本指南统一的 `video_id`/`frame_id` 键,无需复制数据即可从 SQL 指标跳转到地图。 -## 1. 创建示例表 -每条记录表示捕获关键帧时自车(ego vehicle)的位置。将坐标存储为 `GEOMETRY` 类型,即可复用本工作负载中的 `ST_X`、`ST_Y` 和 `HAVERSINE` 等函数。 +## 1. 创建位置表 +若已按 JSON 指南操作,这些表已存在。下方片段展示其结构及若干深圳示例。 ```sql -CREATE OR REPLACE TABLE drive_geo ( - frame_id VARCHAR, - session_id VARCHAR, - location GEOMETRY, - speed_kmh DOUBLE, - heading_deg DOUBLE +CREATE OR REPLACE TABLE frame_geo_points ( + video_id STRING, + frame_id STRING, + position_wgs84 GEOMETRY, + solution_grade INT, + source_system STRING, + created_at TIMESTAMP ); -INSERT INTO drive_geo VALUES - ('FRAME-0001', 'SES-20240801-SEA01', TO_GEOMETRY('SRID=4326;POINT(-122.3321 47.6062)'), 28.0, 90), - ('FRAME-0002', 'SES-20240801-SEA01', TO_GEOMETRY('SRID=4326;POINT(-122.3131 47.6105)'), 35.4, 120), - ('FRAME-0003', 'SES-20240802-SEA02', TO_GEOMETRY('SRID=4326;POINT(-122.3419 47.6205)'), 18.5, 45), - ('FRAME-0004', 'SES-20240802-SEA02', TO_GEOMETRY('SRID=4326;POINT(-122.3490 47.6138)'), 22.3, 60), - ('FRAME-0005', 'SES-20240803-SEA03', TO_GEOMETRY('SRID=4326;POINT(-122.3610 47.6010)'), 30.1, 210); +INSERT INTO frame_geo_points VALUES + ('VID-20250101-001','FRAME-0101',TO_GEOMETRY('SRID=4326;POINT(114.0579 22.5431)'),104,'fusion_gnss','2025-01-01 08:15:21'), + ('VID-20250101-001','FRAME-0102',TO_GEOMETRY('SRID=4326;POINT(114.0610 22.5460)'),104,'fusion_gnss','2025-01-01 08:33:54'), + ('VID-20250101-002','FRAME-0201',TO_GEOMETRY('SRID=4326;POINT(114.1040 22.5594)'),104,'fusion_gnss','2025-01-01 11:12:02'), + ('VID-20250102-001','FRAME-0301',TO_GEOMETRY('SRID=4326;POINT(114.0822 22.5368)'),104,'fusion_gnss','2025-01-02 09:44:18'), + ('VID-20250103-001','FRAME-0401',TO_GEOMETRY('SRID=4326;POINT(114.1195 22.5443)'),104,'fusion_gnss','2025-01-03 21:18:07'); + +CREATE OR REPLACE TABLE signal_contact_points ( + node_id STRING, + signal_position GEOMETRY, + video_id STRING, + frame_id STRING, + frame_position GEOMETRY, + distance_m DOUBLE, + created_at TIMESTAMP +); ``` 
-文档:[地理空间数据类型](/sql/sql-reference/data-types/geospatial)。 +文档:[地理空间类型](https://docs.databend.cn/sql/sql-reference/data-types/geospatial)。 --- -## 2. ST_DISTANCE 半径过滤 -`ST_DISTANCE` 函数用于测量几何体之间的距离。将帧位置和热点均转换到 Web Mercator(SRID 3857),结果以米为单位,再过滤 500 米以内。 +## 2. 空间过滤 +计算每帧与市中心坐标的距离,或判断其是否落在某多边形内。需要米级精度时,请转换至 SRID 3857。 ```sql -SELECT g.frame_id, - g.session_id, - e.event_type, - e.risk_score, +SELECT l.frame_id, + l.video_id, + f.event_tag, ST_DISTANCE( - ST_TRANSFORM(g.location, 3857), - ST_TRANSFORM(TO_GEOMETRY('SRID=4326;POINT(-122.3350 47.6080)'), 3857) - ) AS meters_from_hotspot -FROM drive_geo g -JOIN frame_events e USING (frame_id) + ST_TRANSFORM(l.position_wgs84, 3857), + ST_TRANSFORM(TO_GEOMETRY('SRID=4326;POINT(114.0600 22.5450)'), 3857) + ) AS meters_from_hq +FROM frame_geo_points AS l +JOIN frame_events AS f USING (frame_id) WHERE ST_DISTANCE( - ST_TRANSFORM(g.location, 3857), - ST_TRANSFORM(TO_GEOMETRY('SRID=4326;POINT(-122.3350 47.6080)'), 3857) - ) <= 500 -ORDER BY meters_from_hotspot; + ST_TRANSFORM(l.position_wgs84, 3857), + ST_TRANSFORM(TO_GEOMETRY('SRID=4326;POINT(114.0600 22.5450)'), 3857) + ) <= 400 +ORDER BY meters_from_hq; ``` -需要原始几何调试?在投影中加入 `ST_ASTEXT(g.location)`。偏好直接的大圆计算?改用 `HAVERSINE` 函数,它直接操作 `ST_X`/`ST_Y` 坐标。 - ---- - -## 3. 
ST_CONTAINS 多边形过滤 -检查事件是否发生在划定安全区内(如学校区域)。 +提示:调试时可加 `ST_ASTEXT(l.position_wgs84)`,或改用 [`HAVERSINE`](https://docs.databend.cn/sql/sql-functions/geospatial-functions#trigonometric-distance-functions) 做大圆计算。 ```sql WITH school_zone AS ( - SELECT TO_GEOMETRY('SRID=4326;POLYGON(( - -122.3415 47.6150, - -122.3300 47.6150, - -122.3300 47.6070, - -122.3415 47.6070, - -122.3415 47.6150 - ))') AS poly + SELECT TO_GEOMETRY('SRID=4326;POLYGON(( + 114.0505 22.5500, + 114.0630 22.5500, + 114.0630 22.5420, + 114.0505 22.5420, + 114.0505 22.5500 + ))') AS poly ) -SELECT g.frame_id, - g.session_id, - e.event_type -FROM drive_geo g -JOIN frame_events e USING (frame_id) +SELECT l.frame_id, + l.video_id, + f.event_tag +FROM frame_geo_points AS l +JOIN frame_events AS f USING (frame_id) CROSS JOIN school_zone -WHERE ST_CONTAINS(poly, g.location); +WHERE ST_CONTAINS(poly, l.position_wgs84); ``` --- -## 4. GEO_TO_H3 热力图 -按六边形单元聚合事件,构建路线热力图。 +## 3. 六边形聚合 +将风险帧聚合到六边形网格,为仪表盘提供数据。 ```sql -SELECT GEO_TO_H3(ST_X(location), ST_Y(location), 8) AS h3_cell, +SELECT GEO_TO_H3(ST_X(position_wgs84), ST_Y(position_wgs84), 8) AS h3_cell, COUNT(*) AS frame_count, - AVG(e.risk_score) AS avg_risk -FROM drive_geo -JOIN frame_events e USING (frame_id) + AVG(f.risk_score) AS avg_risk +FROM frame_geo_points AS l +JOIN frame_events AS f USING (frame_id) GROUP BY h3_cell ORDER BY avg_risk DESC; ``` -文档:[H3 函数](/sql/sql-functions/geospatial-functions#h3-indexing--conversion)。 +文档:[H3 函数](https://docs.databend.cn/sql/sql-functions/geospatial-functions#h3-indexing--conversion)。 --- -## 5. ST_DISTANCE + JSON 查询 -将空间距离检查与丰富的检测元数据(来自 JSON 指南)结合,生成精准告警。 +## 4. 
交通上下文 +关联 `signal_contact_points` 与 `frame_geo_points` 以验证存储指标,或将空间谓词与 JSON 搜索结合。 ```sql -WITH near_intersection AS ( - SELECT frame_id - FROM drive_geo - WHERE ST_DISTANCE( - ST_TRANSFORM(location, 3857), - ST_TRANSFORM(TO_GEOMETRY('SRID=4326;POINT(-122.3410 47.6130)'), 3857) - ) <= 200 +SELECT t.node_id, + t.video_id, + t.frame_id, + ST_DISTANCE(t.signal_position, t.frame_position) AS recomputed_distance, + t.distance_m AS stored_distance, + l.source_system +FROM signal_contact_points AS t +JOIN frame_geo_points AS l USING (frame_id) +WHERE t.distance_m < 0.03 -- 视 SRID 而定,约 30 米内 +ORDER BY t.distance_m; +``` + +```sql +WITH near_junction AS ( + SELECT frame_id + FROM frame_geo_points + WHERE ST_DISTANCE( + ST_TRANSFORM(position_wgs84, 3857), + ST_TRANSFORM(TO_GEOMETRY('SRID=4326;POINT(114.0700 22.5400)'), 3857) + ) <= 150 ) -SELECT n.frame_id, - p.payload['objects'][0]['type']::STRING AS first_object, - e.event_type, - e.risk_score -FROM near_intersection n -JOIN frame_payloads p USING (frame_id) -JOIN frame_events e USING (frame_id) -WHERE QUERY('payload.objects.type:pedestrian'); +SELECT f.frame_id, + f.event_tag, + meta.meta_json['media_meta']['tagging']['labels'] AS labels +FROM near_junction nj +JOIN frame_events AS f USING (frame_id) +JOIN frame_metadata_catalog AS meta + ON meta.doc_id = nj.frame_id +WHERE QUERY('meta_json.media_meta.tagging.labels:hard_brake'); ``` -空间过滤器、JSON 运算符与经典 SQL 均可在一句话内完成。 +该模式先按地理过滤,再对剩余帧执行 JSON 搜索。 --- -## 6. 创建视图热力图 -将六边形级摘要导出到可视化工具或地图图层。 +## 5. 
发布热力图视图 +将地理热力图暴露给 BI 或 GIS 工具,无需重跑繁重 SQL。 ```sql -CREATE OR REPLACE VIEW v_route_heatmap AS ( - SELECT GEO_TO_H3(ST_X(location), ST_Y(location), 7) AS h3_cell, - COUNT(*) AS frames, - AVG(e.risk_score) AS avg_risk - FROM drive_geo - JOIN frame_events e USING (frame_id) - GROUP BY h3_cell -); +CREATE OR REPLACE VIEW v_citydrive_geo_heatmap AS +SELECT GEO_TO_H3(ST_X(position_wgs84), ST_Y(position_wgs84), 7) AS h3_cell, + COUNT(*) AS frames, + AVG(f.risk_score) AS avg_risk +FROM frame_geo_points AS l +JOIN frame_events AS f USING (frame_id) +GROUP BY h3_cell; ``` -下游系统可直接查询 `v_route_heatmap`,在地图上渲染风险热点,无需重新处理原始遥测数据。 \ No newline at end of file +Databend 现可基于同一 `video_id` 同时提供向量、文本与空间查询,调查团队再也无需对接多条 Pipeline。 \ No newline at end of file From ba8d4e6045e5ea2ecd1640b654d3707bae1c18d1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 25 Oct 2025 10:55:35 +0000 Subject: [PATCH 6/8] =?UTF-8?q?=F0=9F=8C=90=20Translate=2000-sql-analytics?= =?UTF-8?q?.md=20to=20Simplified-Chinese?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/cn/guides/54-query/04-lakehouse-etl.md | 196 ++++++++++++-------- 1 file changed, 117 insertions(+), 79 deletions(-) diff --git a/docs/cn/guides/54-query/04-lakehouse-etl.md b/docs/cn/guides/54-query/04-lakehouse-etl.md index 3012fe00bd..e85bcaccc5 100644 --- a/docs/cn/guides/54-query/04-lakehouse-etl.md +++ b/docs/cn/guides/54-query/04-lakehouse-etl.md @@ -1,186 +1,224 @@ --- -title: 湖仓一体 ETL(Lakehouse ETL) +title: 湖仓 ETL --- -> **场景(Scenario):** EverDrive Smart Vision 的数据工程团队将每次路测批次导出为 Parquet 文件,以便统一工作负载在 Databend 内加载、查询并丰富同一份遥测数据。 +> **场景:** CityDrive 的数据工程团队将每批行车记录仪数据导出为 Parquet(视频、帧事件、元数据 JSON、嵌入向量、GPS 轨迹、交通信号距离),并希望使用一条 COPY 流水线刷新 Databend 中的共享表。 -EverDrive 的摄取循环非常简单: +加载循环非常直接: ``` -对象存储导出(例如 Parquet)→ Stage → COPY INTO →(可选)Stream & Task +对象存储 → Stage → COPY INTO 表 → (可选)STREAM/TASK ``` -调整桶路径/凭据(如格式不同,把 Parquet 换成实际格式),然后粘贴下方命令。所有语法均与官方[加载数据指南](/guides/load-data/)一致。 
+根据你的环境调整桶路径或格式,然后粘贴以下命令。语法与[加载数据指南](/guides/load-data/)一致。 --- -## 1. Stage -EverDrive 的数据工程团队每批次导出四个文件——sessions、frame events、detection payloads(含嵌套 JSON 字段)和 frame embeddings——到 S3 桶。本指南以 Parquet 为例,只需修改 `FILE_FORMAT` 即可接入 CSV、JSON 或其他支持的格式。一次性创建命名连接,后续所有 Stage 复用。 +## 1. 创建 Stage +将可复用的 Stage 指向保存 CityDrive 导出文件的桶。替换凭证/URL 为你自己的账户;这里使用 Parquet,但任何支持的格式只需更换 `FILE_FORMAT` 即可。 ```sql -CREATE OR REPLACE CONNECTION everdrive_s3 +CREATE OR REPLACE CONNECTION citydrive_s3 STORAGE_TYPE = 's3' ACCESS_KEY_ID = '' SECRET_ACCESS_KEY = ''; -CREATE OR REPLACE STAGE drive_stage - URL = 's3://everdrive-lakehouse/raw/' - CONNECTION = (CONNECTION_NAME = 'everdrive_s3') +CREATE OR REPLACE STAGE citydrive_stage + URL = 's3://citydrive-lakehouse/raw/' + CONNECTION = (CONNECTION_NAME = 'citydrive_s3') FILE_FORMAT = (TYPE = 'PARQUET'); ``` -更多选项见[创建 Stage](/sql/sql-commands/ddl/stage/ddl-create-stage)。 +> [!IMPORTANT] +> 将占位符 AWS 密钥和桶 URL 替换为你环境中的真实值。没有有效凭证时,`LIST`、`SELECT ... FROM @citydrive_stage` 和 `COPY INTO` 语句会因 S3 返回 `InvalidAccessKeyId`/403 错误而失败。 -列出导出文件夹(本示例为 Parquet)确认可见: +快速检查: ```sql -LIST @drive_stage/sessions/; -LIST @drive_stage/frame-events/; -LIST @drive_stage/payloads/; -LIST @drive_stage/embeddings/; +LIST @citydrive_stage/videos/; +LIST @citydrive_stage/frame-events/; +LIST @citydrive_stage/manifests/; +LIST @citydrive_stage/frame-embeddings/; +LIST @citydrive_stage/frame-locations/; +LIST @citydrive_stage/traffic-lights/; ``` --- -## 2. Preview -加载前先查看 Parquet 文件,验证 schema 并抽样。 +## 2. 预览文件 +在 Stage 上执行 `SELECT`,在加载前确认模式并抽样。 ```sql SELECT * -FROM @drive_stage/sessions/session_2024_08_16.parquet +FROM @citydrive_stage/videos/capture_date=2025-01-01/videos.parquet LIMIT 5; SELECT * -FROM @drive_stage/frame-events/frame_events_2024_08_16.parquet +FROM @citydrive_stage/frame-events/batch_2025_01_01.parquet LIMIT 5; ``` -按需对 payloads 与 embeddings 重复预览。Databend 会自动使用 Stage 上指定的文件格式。 +Databend 会根据 Stage 定义推断格式,因此无需额外选项。 --- -## 3. 
COPY INTO -将各文件加载到指南用到的表中。通过内联类型转换把输入列映射到表列;下方投影以 Parquet 为例,其他格式同理。 +## 3. COPY INTO 统一表 +每个导出对应指南中使用的共享表之一。内联类型转换保证即使上游列顺序变化,模式仍保持一致。 -### Sessions +### `citydrive_videos` ```sql -COPY INTO drive_sessions (session_id, vehicle_id, route_name, start_time, end_time, weather, camera_setup) +COPY INTO citydrive_videos (video_id, vehicle_id, capture_date, route_name, weather, camera_source, duration_sec) FROM ( - SELECT session_id::STRING, + SELECT video_id::STRING, vehicle_id::STRING, + capture_date::DATE, route_name::STRING, - start_time::TIMESTAMP, - end_time::TIMESTAMP, weather::STRING, - camera_setup::STRING - FROM @drive_stage/sessions/ + camera_source::STRING, + duration_sec::INT + FROM @citydrive_stage/videos/ ) FILE_FORMAT = (TYPE = 'PARQUET'); ``` -### Frame Events +### `frame_events` ```sql -COPY INTO frame_events (frame_id, session_id, frame_index, captured_at, event_type, risk_score) +COPY INTO frame_events (frame_id, video_id, frame_index, collected_at, event_tag, risk_score, speed_kmh) FROM ( SELECT frame_id::STRING, - session_id::STRING, + video_id::STRING, frame_index::INT, - captured_at::TIMESTAMP, - event_type::STRING, - risk_score::DOUBLE - FROM @drive_stage/frame-events/ + collected_at::TIMESTAMP, + event_tag::STRING, + risk_score::DOUBLE, + speed_kmh::DOUBLE + FROM @citydrive_stage/frame-events/ ) FILE_FORMAT = (TYPE = 'PARQUET'); ``` -### Detection Payloads -payload 文件含嵌套列(`payload` 列为 JSON 对象)。用相同投影复制到 `frame_payloads` 表。 +### `frame_metadata_catalog` +```sql +COPY INTO frame_metadata_catalog (doc_id, meta_json, captured_at) +FROM ( + SELECT doc_id::STRING, + meta_json::VARIANT, + captured_at::TIMESTAMP + FROM @citydrive_stage/manifests/ +) +FILE_FORMAT = (TYPE = 'PARQUET'); +``` +### `frame_embeddings` ```sql -COPY INTO frame_payloads (frame_id, run_stage, payload, logged_at) +COPY INTO frame_embeddings (frame_id, video_id, sensor_view, embedding, encoder_build, created_at) FROM ( SELECT frame_id::STRING, - run_stage::STRING, - payload, - 
logged_at::TIMESTAMP - FROM @drive_stage/payloads/ + video_id::STRING, + sensor_view::STRING, + embedding::VECTOR(768), -- 替换为你的实际维度 + encoder_build::STRING, + created_at::TIMESTAMP + FROM @citydrive_stage/frame-embeddings/ ) FILE_FORMAT = (TYPE = 'PARQUET'); ``` -### Frame Embeddings +### `frame_geo_points` ```sql -COPY INTO frame_embeddings (frame_id, session_id, embedding, model_version, created_at) +COPY INTO frame_geo_points (video_id, frame_id, position_wgs84, solution_grade, source_system, created_at) FROM ( - SELECT frame_id::STRING, - session_id::STRING, - embedding::VECTOR(4), -- 将 4 替换为实际嵌入维度 - model_version::STRING, + SELECT video_id::STRING, + frame_id::STRING, + position_wgs84::GEOMETRY, + solution_grade::INT, + source_system::STRING, created_at::TIMESTAMP - FROM @drive_stage/embeddings/ + FROM @citydrive_stage/frame-locations/ ) FILE_FORMAT = (TYPE = 'PARQUET'); ``` -下游所有指南(分析/搜索/向量/地理)均可看到本批次数据。 +### `signal_contact_points` +```sql +COPY INTO signal_contact_points (node_id, signal_position, video_id, frame_id, frame_position, distance_m, created_at) +FROM ( + SELECT node_id::STRING, + signal_position::GEOMETRY, + video_id::STRING, + frame_id::STRING, + frame_position::GEOMETRY, + distance_m::DOUBLE, + created_at::TIMESTAMP + FROM @citydrive_stage/traffic-lights/ +) +FILE_FORMAT = (TYPE = 'PARQUET'); +``` + +完成后,所有下游工作负载——SQL 分析、Elasticsearch `QUERY()`、向量相似度、地理空间过滤——都将读取同一份数据。 --- -## 4. Stream(可选) -若希望下游作业在每次 `COPY INTO` 后感知新行,可在关键表(如 `frame_events`)上创建 Stream。用法参考[持续 Pipeline → Stream](/guides/load-data/continuous-data-pipelines/stream)。 +## 4. 
增量响应 Stream(可选) +若希望下游作业仅消费上次批量后新增的行,可使用 Stream。 ```sql CREATE OR REPLACE STREAM frame_events_stream ON TABLE frame_events; -SELECT * FROM frame_events_stream; -- 显示上次消费后的新行 +SELECT * FROM frame_events_stream; -- 查看新复制的行 +-- …处理行… +SELECT * FROM frame_events_stream WITH CONSUME; -- 推进游标 ``` -处理完毕后执行 `CONSUME STREAM frame_events_stream;`(或将行插入另一表)以推进偏移。 +`WITH CONSUME` 确保处理完后 Stream 游标前移。参考:[Streams](/guides/load-data/continuous-data-pipelines/stream)。 --- -## 5. Task(可选) -Task 按调度执行**一条 SQL 语句**。可为每张表创建小 Task(或调用存储过程作为统一入口)。 +## 5. 定时加载 Task(可选) +Task 按调度执行**一条 SQL 语句**。可为每张表创建轻量级 Task,或将逻辑封装到存储过程统一入口。 ```sql -CREATE OR REPLACE TASK task_load_sessions +CREATE OR REPLACE TASK task_load_citydrive_videos WAREHOUSE = 'default' - SCHEDULE = 5 MINUTE + SCHEDULE = 10 MINUTE AS - COPY INTO drive_sessions (session_id, vehicle_id, route_name, start_time, end_time, weather, camera_setup) + COPY INTO citydrive_videos (video_id, vehicle_id, capture_date, route_name, weather, camera_source, duration_sec) FROM ( - SELECT session_id::STRING, + SELECT video_id::STRING, vehicle_id::STRING, + capture_date::DATE, route_name::STRING, - start_time::TIMESTAMP, - end_time::TIMESTAMP, weather::STRING, - camera_setup::STRING - FROM @drive_stage/sessions/ + camera_source::STRING, + duration_sec::INT + FROM @citydrive_stage/videos/ ) FILE_FORMAT = (TYPE = 'PARQUET'); -ALTER TASK task_load_sessions RESUME; +ALTER TASK task_load_citydrive_videos RESUME; CREATE OR REPLACE TASK task_load_frame_events WAREHOUSE = 'default' - SCHEDULE = 5 MINUTE -AS - COPY INTO frame_events (frame_id, session_id, frame_index, captured_at, event_type, risk_score) + SCHEDULE = 10 MINUTE + AS + COPY INTO frame_events (frame_id, video_id, frame_index, collected_at, event_tag, risk_score, speed_kmh) FROM ( SELECT frame_id::STRING, - session_id::STRING, + video_id::STRING, frame_index::INT, - captured_at::TIMESTAMP, - event_type::STRING, - risk_score::DOUBLE - FROM @drive_stage/frame-events/ + collected_at::TIMESTAMP, + 
event_tag::STRING, + risk_score::DOUBLE, + speed_kmh::DOUBLE + FROM @citydrive_stage/frame-events/ ) FILE_FORMAT = (TYPE = 'PARQUET'); ALTER TASK task_load_frame_events RESUME; - --- 对 frame_payloads 与 frame_embeddings 重复即可 ``` -cron 语法、依赖设置与错误处理见[持续 Pipeline → Task](/guides/load-data/continuous-data-pipelines/task)。 \ No newline at end of file +按相同模式为 `frame_metadata_catalog`、嵌入或 GPS 数据添加更多 Task。完整选项见:[Tasks](/guides/load-data/continuous-data-pipelines/task)。 + +--- + +作业运行后,Unified Workloads 系列的所有指南都将从同一组 CityDrive 表读取数据——无需额外 ETL 层,也无重复存储。 \ No newline at end of file From 3dfe0bd79749a092d30abec308db7f95c7830ae2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 25 Oct 2025 10:56:11 +0000 Subject: [PATCH 7/8] =?UTF-8?q?=F0=9F=8C=90=20Translate=2000-sql-analytics?= =?UTF-8?q?.md=20to=20Simplified-Chinese?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/cn/guides/54-query/index.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/cn/guides/54-query/index.md b/docs/cn/guides/54-query/index.md index ab8959fdc9..81f1dd725c 100644 --- a/docs/cn/guides/54-query/index.md +++ b/docs/cn/guides/54-query/index.md @@ -2,14 +2,14 @@ title: 统一工作负载 --- -Databend 现已作为统一引擎,支持 SQL 分析、多模态搜索、向量相似度、地理空间分析及持续 ETL。本迷你系列以 **EverDrive 智能视觉** 场景为例(会话 ID 如 `SES-20240801-SEA01`,帧 ID 如 `FRAME-0001`),演示同一数据集如何在不跨系统复制的情况下流经所有工作负载。 +CityDrive Intelligence 记录每一次行车记录仪行程,将其拆分为帧,并为每个 `video_id` 存储多种信号:关系型元数据、JSON 清单、行为标签、嵌入向量以及 GPS 轨迹。本系列指南展示 Databend 如何将所有这些工作负载集中在同一个 Warehouse 中——无需复制作业,也不需要额外的搜索集群。 | 指南 | 涵盖内容 | |-------|----------------| -| [SQL 分析](./00-sql-analytics.md) | 构建共享表、切分会话、添加窗口/聚合加速 | -| [JSON 与搜索](./01-json-search.md) | 存储检测负载并 `QUERY` 风险场景 | -| [向量搜索](./02-vector-db.md) | 保留帧嵌入并查找语义邻居 | -| [地理分析](./03-geo-analytics.md) | 使用 `HAVERSINE`、多边形、H3 映射事件 | -| [湖仓 ETL](./04-lakehouse-etl.md) | 暂存文件、`COPY INTO` 表、可选流/任务 | +| [SQL 分析(SQL Analytics)](./00-sql-analytics.md) | 
基础表、筛选条件、连接(Join)、窗口、聚合索引 | +| [JSON 与搜索(JSON & Search)](./01-json-search.md) | 加载 `frame_metadata_catalog`,运行 Elasticsearch `QUERY()`,关联位图标签 | +| [向量搜索(Vector Search)](./02-vector-db.md) | 持久化嵌入向量,运行余弦搜索,连接风险指标 | +| [地理分析(Geo Analytics)](./03-geo-analytics.md) | 使用 `GEOMETRY`,距离/多边形筛选,交通信号灯连接 | +| [湖仓 ETL(Lakehouse ETL)](./04-lakehouse-etl.md) | 一次性创建 Stage,向共享表执行 `COPY INTO`,添加 Stream/Task | -按顺序完成即可看到 Databend 的单个查询优化器(Query Optimizer)如何为同一车队数据上的分析、搜索、向量、地理及加载流水线提供支持。 \ No newline at end of file +按顺序阅读这些指南,可以看到相同的标识符如何从经典 SQL 延伸到文本搜索、向量、地理以及 ETL,全都基于同一个 CityDrive 场景。 \ No newline at end of file From 6b5779e6e52f62d8989996552a0c0650f256ecec Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 25 Oct 2025 10:56:16 +0000 Subject: [PATCH 8/8] chore: finalize translation for PR #2911 --- .translation-init | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .translation-init diff --git a/.translation-init b/.translation-init deleted file mode 100644 index 6064308ab7..0000000000 --- a/.translation-init +++ /dev/null @@ -1 +0,0 @@ -Translation initialization: 2025-10-25T10:48:02.510790