Skip to content

Commit a41549a

Browse files
authored
Merge pull request #32 from murphyatwork/murphy_opt_flat
Optimize Starrocks
2 parents 07b3024 + 0da5e48 commit a41549a

14 files changed

+113
-124
lines changed

starrocks/ddl_default.sql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
CREATE TABLE bluesky (
2+
`id` BIGINT AUTO_INCREMENT,
3+
`data` JSON NOT NULL COMMENT "Primary JSON object, optimized for field access using FlatJSON"
4+
);
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,5 @@ CREATE TABLE bluesky (
88
`collection` VARCHAR(255) AS get_json_string(data, '$.commit.collection'),
99
`did` VARCHAR(255) AS get_json_string(data, '$.did'),
1010
`time_us` BIGINT AS get_json_int(data, '$.time_us')
11-
) ENGINE=OLAP
11+
)
1212
ORDER BY(`kind`, `operation`, `collection`, `did`, `time_us`);

starrocks/ddl_zstd.sql

Lines changed: 0 additions & 15 deletions
This file was deleted.

starrocks/main.sh

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -53,30 +53,30 @@ benchmark() {
5353

5454
case $CHOICE in
5555
2)
56-
benchmark 10 lz4
57-
benchmark 10 zstd
56+
benchmark 10 default
57+
benchmark 10 materialized
5858
;;
5959
3)
60-
benchmark 100 lz4
61-
benchmark 100 zstd
60+
benchmark 100 default
61+
benchmark 100 materialized
6262
;;
6363
4)
64-
benchmark 1000 lz4
65-
benchmark 1000 zstd
64+
benchmark 1000 default
65+
benchmark 1000 materialized
6666
;;
6767
5)
68-
benchmark 1 lz4
69-
benchmark 1 zstd
70-
benchmark 10 lz4
71-
benchmark 10 zstd
72-
benchmark 100 lz4
73-
benchmark 100 zstd
74-
benchmark 1000 lz4
75-
benchmark 1000 zstd
68+
benchmark 1 materialized
69+
benchmark 1 default
70+
benchmark 10 materialized
71+
benchmark 10 default
72+
benchmark 100 materialized
73+
benchmark 100 default
74+
benchmark 1000 materialized
75+
benchmark 1000 default
7676
;;
7777
*)
78-
benchmark 1 lz4
79-
benchmark 1 zstd
78+
benchmark 1 materialized
79+
benchmark 1 default
8080
;;
8181
esac
8282

starrocks/queries.sql

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
SELECT cast(data->'commit.collection' AS VARCHAR) AS event,count() AS count FROM bluesky GROUP BY event ORDER BY count DESC;
2-
SELECT cast(data->'commit.collection' AS VARCHAR) AS event, count() AS count, count(DISTINCT cast(data->'did' AS VARCHAR)) AS users FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') GROUP BY event ORDER BY count DESC;
3-
SELECT cast(data->'commit.collection' AS VARCHAR) AS event, hour(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) as hour_of_day, count() AS count FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], cast(data->'commit.collection' AS VARCHAR))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
4-
SELECT cast(data->'$.did' as VARCHAR) as user_id, min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) AS first_post_date FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') AND (data->'commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3;
5-
SELECT cast(data->'$.did' as VARCHAR) as user_id, date_diff('millisecond', min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))),max(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000))))) AS activity_span FROM bluesky WHERE (data->'kind' = 'commit') AND (data->'commit.operation' = 'create') AND (data->'commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;
1+
SELECT get_json_string(data, 'commit.collection') AS event, count() AS count FROM bluesky GROUP BY event ORDER BY count DESC;
2+
SELECT get_json_string(data, 'commit.collection') AS event, count() AS count, count(DISTINCT get_json_string(data, 'did')) AS users FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') GROUP BY event ORDER BY count DESC;
3+
SELECT get_json_string(data, 'commit.collection') AS event, hour(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) as hour_of_day, count() AS count FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection'))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
4+
SELECT get_json_string(data, '$.did') as user_id, min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) AS first_post_date FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3;
5+
SELECT get_json_string(data, '$.did') as user_id, date_diff('millisecond', min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))), max(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000))))) AS activity_span FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;

starrocks/queries_formatted.sql

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
-- Q1 - Top event types
33
------------------------------------------------------------------------------------------------------------------------
44

5-
SELECT cast(data->'commit.collection' AS VARCHAR) AS event,
5+
SELECT get_json_string(data, 'commit.collection') AS event,
66
count() AS count
77
FROM bluesky
88
GROUP BY event
@@ -12,55 +12,55 @@ ORDER BY count DESC;
1212
-- Q2 - Top event types together with unique users per event type
1313
------------------------------------------------------------------------------------------------------------------------
1414
SELECT
15-
cast(data->'commit.collection' AS VARCHAR) AS event,
15+
get_json_string(data, 'commit.collection') AS event,
1616
count() AS count,
17-
count(DISTINCT cast(data->'did' AS VARCHAR)) AS users
17+
count(DISTINCT get_json_string(data, 'did')) AS users
1818
FROM bluesky
19-
WHERE (data->'kind' = 'commit')
20-
AND (data->'commit.operation' = 'create')
19+
WHERE (get_json_string(data, 'kind') = 'commit')
20+
AND (get_json_string(data, 'commit.operation') = 'create')
2121
GROUP BY event
2222
ORDER BY count DESC;
2323

2424
------------------------------------------------------------------------------------------------------------------------
2525
-- Q3 - When do people use BlueSky
2626
------------------------------------------------------------------------------------------------------------------------
2727
SELECT
28-
cast(data->'commit.collection' AS VARCHAR) AS event,
29-
hour(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) as hour_of_day,
28+
get_json_string(data, 'commit.collection') AS event,
29+
hour(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) as hour_of_day,
3030
count() AS count
3131
FROM bluesky
32-
WHERE (data->'kind' = 'commit')
33-
AND (data->'commit.operation' = 'create')
34-
AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], cast(data->'commit.collection' AS VARCHAR)))
32+
WHERE (get_json_string(data, 'kind') = 'commit')
33+
AND (get_json_string(data, 'commit.operation') = 'create')
34+
AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection')))
3535
GROUP BY event, hour_of_day
3636
ORDER BY hour_of_day, event;
3737

3838
------------------------------------------------------------------------------------------------------------------------
3939
-- Q4 - top 3 post veterans
4040
------------------------------------------------------------------------------------------------------------------------
4141
SELECT
42-
cast(data->'$.did' as VARCHAR) as user_id,
43-
min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))) AS first_post_date
42+
get_json_string(data, '$.did') as user_id,
43+
min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) AS first_post_date
4444
FROM bluesky
45-
WHERE (data->'kind' = 'commit')
46-
AND (data->'commit.operation' = 'create')
47-
AND (data->'commit.collection' = 'app.bsky.feed.post')
45+
WHERE (get_json_string(data, 'kind') = 'commit')
46+
AND (get_json_string(data, 'commit.operation') = 'create')
47+
AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post')
4848
GROUP BY user_id
49-
ORDER BY first_post_ts ASC
49+
ORDER BY first_post_date ASC
5050
LIMIT 3;
5151

5252
------------------------------------------------------------------------------------------------------------------------
5353
-- Q5 - top 3 users with longest activity
5454
------------------------------------------------------------------------------------------------------------------------
5555
SELECT
56-
cast(data->'$.did' as VARCHAR) as user_id,
56+
get_json_string(data, '$.did') as user_id,
5757
date_diff('millisecond',
58-
min(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000)))),
59-
max(from_unixtime(round(divide(cast(data->'time_us' AS BIGINT), 1000000))))) AS activity_span
58+
min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))),
59+
max(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000))))) AS activity_span
6060
FROM bluesky
61-
WHERE (data->'kind' = 'commit')
62-
AND (data->'commit.operation' = 'create')
63-
AND (data->'commit.collection' = 'app.bsky.feed.post')
61+
WHERE (get_json_string(data, 'kind') = 'commit')
62+
AND (get_json_string(data, 'commit.operation') = 'create')
63+
AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post')
6464
GROUP BY user_id
6565
ORDER BY activity_span DESC
6666
LIMIT 3;

starrocks/results/m6i.8xlarge_bluesky_1000m_lz4.json renamed to starrocks/results/m6i.8xlarge_bluesky_1000m_default.json

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,23 @@
11
{
2-
"system": "Starrocks (lz4)",
2+
"system": "Starrocks (default)",
33
"version": "3.4.0-e94580b",
44
"os": "Ubuntu 24.04",
5-
"date": "2025-01-13",
5+
"date": "2025-03-24",
66
"machine": "m6i.8xlarge, 10000gib gp3",
77
"cluster_size": 1,
88
"comment": "",
99
"retains_structure": "yes",
1010
"tags": [
1111
],
1212
"dataset_size": 1000000000,
13-
"num_loaded_documents": null,
13+
"num_loaded_documents": 804000000,
1414
"data_compression": "lz4",
1515
"total_size": null,
1616
"result": [
17-
[null, null, null],
18-
[null, null, null],
19-
[null, null, null],
20-
[null, null, null],
17+
[2.27,1.24,1.21],
18+
[17.81,10.67,10.20],
19+
[7.38,6.78,7.62],
20+
[7.24, null, null],
2121
[null, null, null]
2222
]
2323
}

starrocks/results/m6i.8xlarge_bluesky_1000m_zstd.json renamed to starrocks/results/m6i.8xlarge_bluesky_1000m_materialized.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,23 @@
11
{
2-
"system": "Starrocks (zstd)",
2+
"system": "Starrocks (materialized)",
33
"version": "3.4.0-e94580b",
44
"os": "Ubuntu 24.04",
5-
"date": "2025-01-13",
5+
"date": "2025-03-24",
66
"machine": "m6i.8xlarge, 10000gib gp3",
77
"cluster_size": 1,
88
"comment": "",
99
"retains_structure": "yes",
1010
"tags": [
1111
],
1212
"dataset_size": 1000000000,
13-
"num_loaded_documents": null,
13+
"num_loaded_documents": 997000000,
1414
"data_compression": "zstd",
15-
"total_size": null,
15+
"total_size": 191541000000,
1616
"result": [
17-
[null, null, null],
18-
[null, null, null],
19-
[null, null, null],
20-
[null, null, null],
17+
[1.75,1.56,1.54],
18+
[49.75,41.61,31.38],
19+
[12.90,12.58,5.76],
20+
[5.64,6.21,6.03],
2121
[null, null, null]
2222
]
2323
}
Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,23 @@
11
{
2-
"system": "Starrocks (lz4)",
2+
"system": "Starrocks (default)",
33
"version": "3.4.0-e94580b",
44
"os": "Ubuntu 24.04",
5-
"date": "2025-01-13",
5+
"date": "2025-03-24",
66
"machine": "m6i.8xlarge, 10000gib gp3",
77
"cluster_size": 1,
88
"comment": "",
99
"retains_structure": "yes",
1010
"tags": [
1111
],
1212
"dataset_size": 100000000,
13-
"num_loaded_documents": 100000000,
13+
"num_loaded_documents": 91000000,
1414
"data_compression": "lz4",
15-
"total_size": 19182000000,
15+
"total_size": 17109000000,
1616
"result": [
17-
[0.25,0.17,0.17],
18-
[8.13,4.33,3.82],
19-
[3.18,3.08,3.05],
20-
[4.06,4.07,4.12],
21-
[4.04,4.20,3.97]
17+
[0.61,0.16,0.16],
18+
[19.26,7.12,7.18],
19+
[1.12,1.08,1.08],
20+
[0.55,0.55,0.54],
21+
[0.60,0.60,0.60]
2222
]
2323
}

starrocks/results/m6i.8xlarge_bluesky_100m_zstd.json renamed to starrocks/results/m6i.8xlarge_bluesky_100m_materialized.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
{
2-
"system": "Starrocks (zstd)",
2+
"system": "Starrocks (materialized)",
33
"version": "3.4.0-e94580b",
44
"os": "Ubuntu 24.04",
5-
"date": "2025-01-13",
5+
"date": "2025-03-24",
66
"machine": "m6i.8xlarge, 10000gib gp3",
77
"cluster_size": 1,
88
"comment": "",
@@ -12,12 +12,12 @@
1212
"dataset_size": 100000000,
1313
"num_loaded_documents": 100000000,
1414
"data_compression": "zstd",
15-
"total_size": 31200000000,
15+
"total_size": 16190000000,
1616
"result": [
17-
[0.22,0.17,0.18],
18-
[28.09,3.94,3.89],
19-
[3.04,3.05,3.11],
20-
[3.99,4.04,3.94],
21-
[4.13,4.12,4.11]
17+
[0.21,0.17,0.18],
18+
[8.38,2.19,2.17],
19+
[2.16,1.10,1.06],
20+
[6.62,0.43,0.45],
21+
[0.48,0.48,0.49]
2222
]
2323
}

0 commit comments

Comments
 (0)