Skip to content

Commit 09e2467

Browse files
authored
Merge pull request #20 from lio-p/starrocks-candidate
Add Starrocks
2 parents b387882 + f085fb8 commit 09e2467

22 files changed

+609
-0
lines changed

starrocks/benchmark.sh

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/bin/bash
#
# Runs the benchmark queries against one database and extracts the runtimes.
#
# Usage: benchmark.sh <DB_NAME> <RESULT_FILE_RUNTIMES> <RESULT_FILE_MEMORY_USAGE>
#   DB_NAME                   database name handed to ./run_queries.sh
#   RESULT_FILE_RUNTIMES      file that receives the formatted runtimes
#   RESULT_FILE_MEMORY_USAGE  accepted so the call signature stays stable;
#                             this script does not write to it
#
# Output format: every three consecutive "Response time" values found in the
# query log are grouped on one line as "[t1,t2,t3],".

# Check if the required arguments are provided
if [[ $# -lt 3 ]]; then
  echo "Usage: $0 <DB_NAME> <RESULT_FILE_RUNTIMES> <RESULT_FILE_MEMORY_USAGE>" >&2
  exit 1
fi

# Arguments
DB_NAME="$1"
RESULT_FILE_RUNTIMES="$2"
RESULT_FILE_MEMORY_USAGE="$3" # not used by this script

# Log file that captures the complete output of the query run (fixed name)
QUERY_LOG_FILE="query_log.txt"

# Print the database name
echo "Running queries on database: $DB_NAME"

# Run queries and log the output (stdout and stderr) while still showing it
./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE"

# Extract the response times and group them three per line.
# awk prints each value through an explicit "%s" format so the value itself is
# never interpreted as a printf format string.
RESULT=$(grep -oP 'Response time: \d+\.\d+ s' "$QUERY_LOG_FILE" \
  | sed -r -e 's/Response time: ([0-9]+\.[0-9]+) s/\1/' \
  | awk '{ if (i % 3 == 0) { printf "[" }; printf "%s", $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }')

# Output the result: to the runtimes file when a name was given, else to stdout
if [[ -n "$RESULT_FILE_RUNTIMES" ]]; then
  echo "$RESULT" > "$RESULT_FILE_RUNTIMES"
  echo "Result written to $RESULT_FILE_RUNTIMES"
else
  echo "$RESULT"
fi

starrocks/count.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/bash
#
# Prints the number of rows stored in a table.
#
# Usage: count.sh <DB_NAME> <TABLE_NAME>

# Check if the required arguments are provided
if [[ $# -lt 2 ]]; then
  # stderr, so the usage text never ends up in a captured result
  echo "Usage: $0 <DB_NAME> <TABLE_NAME>" >&2
  exit 1
fi

# Arguments
DB_NAME="$1"
TABLE_NAME="$2"

# The table name is an identifier, so it is quoted with backticks; single
# quotes would turn it into a string literal, which is not valid after FROM.
mysql -P 9030 -h 127.0.0.1 -u root "$DB_NAME" -e "SELECT count() FROM \`$TABLE_NAME\`;"

starrocks/create_and_load.sh

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/bin/bash
#
# Creates the database, applies the table DDL and loads the data files.
#
# Usage: create_and_load.sh <DB_NAME> <TABLE_NAME> <DDL_FILE> <DATA_DIRECTORY>
#                           <NUM_FILES> <SUCCESS_LOG> <ERROR_LOG>
#
# Exits non-zero as soon as a step fails, so data is never loaded into a
# database or table that could not be created.

# Check if the required arguments are provided
if [[ $# -lt 7 ]]; then
  echo "Usage: $0 <DB_NAME> <TABLE_NAME> <DDL_FILE> <DATA_DIRECTORY> <NUM_FILES> <SUCCESS_LOG> <ERROR_LOG>" >&2
  exit 1
fi

# Arguments
DB_NAME="$1"
TABLE_NAME="$2"
DDL_FILE="$3"
DATA_DIRECTORY="$4"
NUM_FILES="$5"
SUCCESS_LOG="$6"
ERROR_LOG="$7"

# Validate arguments
[[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist." >&2; exit 1; }
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist." >&2; exit 1; }
[[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer." >&2; exit 1; }

echo "Create database"
if ! mysql -P 9030 -h 127.0.0.1 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME"; then
  echo "Error: Failed to create database '$DB_NAME'." >&2
  exit 1
fi

echo "Execute DDL"
if ! mysql -P 9030 -h 127.0.0.1 -u root "$DB_NAME" < "$DDL_FILE"; then
  echo "Error: Failed to execute DDL file '$DDL_FILE'." >&2
  exit 1
fi

echo "Load data"
./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG"

starrocks/ddl_lz4.sql

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
-- Table for the "lz4" benchmark variant. No compression property is set in
-- this statement, so the server default applies (presumably LZ4, going by the
-- file name -- confirm).
CREATE TABLE bluesky (
2+
`id` BIGINT AUTO_INCREMENT,
3+
-- Main JSON column; declared before the generated columns that are derived from it
4+
`data` JSON NULL COMMENT "Main JSON object",
5+
-- Generated columns extracted from `data`; declared in the same order as they appear in ORDER BY
6+
`kind` VARCHAR(255) AS get_json_string(data, '$.kind'),
7+
`operation` VARCHAR(255) AS get_json_string(data, '$.commit.operation'),
8+
`collection` VARCHAR(255) AS get_json_string(data, '$.commit.collection'),
9+
`did` VARCHAR(255) AS get_json_string(data, '$.did'),
10+
`time_us` BIGINT AS get_json_int(data, '$.time_us')
11+
) ENGINE=OLAP
12+
ORDER BY(`kind`, `operation`, `collection`, `did`, `time_us`);

starrocks/ddl_zstd.sql

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
-- Table for the "zstd" benchmark variant: same columns and sort order as the
-- lz4 variant, with the "compression" table property set to ZSTD.
CREATE TABLE bluesky (
2+
`id` BIGINT AUTO_INCREMENT,
3+
-- Main JSON column; declared before the generated columns that are derived from it
4+
`data` JSON NULL COMMENT "Main JSON object",
5+
-- Generated columns extracted from `data`; declared in the same order as they appear in ORDER BY
6+
`kind` VARCHAR(255) AS get_json_string(data, '$.kind'),
7+
`operation` VARCHAR(255) AS get_json_string(data, '$.commit.operation'),
8+
`collection` VARCHAR(255) AS get_json_string(data, '$.commit.collection'),
9+
`did` VARCHAR(255) AS get_json_string(data, '$.did'),
10+
`time_us` BIGINT AS get_json_int(data, '$.time_us')
11+
) ENGINE=OLAP
12+
ORDER BY(`kind`, `operation`, `collection`, `did`, `time_us`)
13+
PROPERTIES (
14+
"compression" = "ZSTD"
15+
);

starrocks/drop_table.sh

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#!/bin/bash
#
# Drops the table <DB_NAME>.<TABLE_NAME> when it exists.
#
# Usage: drop_table.sh <DB_NAME> <TABLE_NAME>

# Both the database name and the table name are mandatory
if (( $# < 2 )); then
  printf '%s\n' "Usage: $0 <DB_NAME> <TABLE_NAME>"
  exit 1
fi

database="$1"
table="$2"

# Fully qualified name, used for both the log line and the statement
qualified_name="$database.$table"

printf '%s\n' "Dropping table: $qualified_name"

mysql -P 9030 -h 127.0.0.1 -u root -e "DROP TABLE IF EXISTS $qualified_name"

starrocks/install.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#!/bin/bash
#
# Installs Docker and the MySQL client, starts the StarRocks all-in-one
# container and waits until the server answers on the query port.
#
# Exit status: 0 once StarRocks is reachable, 1 when it does not come up
# within the wait budget.

sudo snap install docker
sudo apt-get update
sudo apt-get install -y mysql-client

# "docker run --name quickstart" fails when a container with that name already
# exists (for example on a second run), so an existing container is restarted.
if sudo docker container inspect quickstart > /dev/null 2>&1; then
  sudo docker start quickstart
else
  sudo docker run -p 9030:9030 -p 8030:8030 -p 8040:8040 -itd --name quickstart starrocks/allin1-ubuntu
fi

# The container is started detached, so the server is not necessarily ready
# when "docker run" returns. Poll until a backend is reported alive.
# NOTE(review): assumes "SHOW BACKENDS\G" prints "Alive: true" for a live
# backend -- confirm against the StarRocks version in the image.
MAX_WAIT_ATTEMPTS=60
for ((i = 0; i < MAX_WAIT_ATTEMPTS; i++)); do
  if mysql -P 9030 -h 127.0.0.1 -u root -e 'SHOW BACKENDS\G' 2> /dev/null | grep -q 'Alive: true'; then
    echo "StarRocks is ready"
    exit 0
  fi
  sleep 5
done

echo "Error: StarRocks did not become ready on 127.0.0.1:9030" >&2
exit 1
7+

starrocks/load_data.sh

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/bin/bash
#
# Loads gzip-compressed JSON files into a StarRocks table via Stream Load.
#
# Usage: load_data.sh <DATA_DIRECTORY> <DB_NAME> <TABLE_NAME> <MAX_FILES>
#                     <SUCCESS_LOG> <ERROR_LOG>
#   DATA_DIRECTORY  directory holding the *.json.gz input files
#   DB_NAME         target database
#   TABLE_NAME      target table
#   MAX_FILES       maximum number of files to load (first N in glob order)
#   SUCCESS_LOG     file that receives one line per imported file
#   ERROR_LOG       file that receives one line per failed attempt
#
# Each file is uncompressed into a private temporary directory and uploaded
# with up to MAX_ATTEMPT tries. An upload counts as successful only when the
# HTTP status is 2xx and the response body reports Status "Success".

# Check if the required arguments are provided
if [[ $# -lt 6 ]]; then
  echo "Usage: $0 <DATA_DIRECTORY> <DB_NAME> <TABLE_NAME> <MAX_FILES> <SUCCESS_LOG> <ERROR_LOG>" >&2
  exit 1
fi

# Arguments
DATA_DIRECTORY="$1"
DB_NAME="$2"
TABLE_NAME="$3"
MAX_FILES="$4"
SUCCESS_LOG="$5"
ERROR_LOG="$6"

# Validate arguments
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist." >&2; exit 1; }
[[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer." >&2; exit 1; }

# Force base 10 so a value with leading zeros is not read as octal
max_files=$((10#$MAX_FILES))

# Private temporary directory for uncompressed files and the server response
TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX) \
  || { echo "Error: Failed to create a temporary directory." >&2; exit 1; }
# Single quotes: the path is expanded (and quoted) when the trap fires
trap 'rm -rf -- "$TEMP_DIR"' EXIT

# Response body of the latest upload attempt. Input files end in ".json", so
# this name cannot collide with an uncompressed data file.
RESPONSE_FILE="$TEMP_DIR/stream_load_response"

# Collect the input files with a glob instead of parsing "ls" output, so
# paths containing whitespace stay intact; keep only the first max_files.
shopt -s nullglob
files=("$DATA_DIRECTORY"/*.json.gz)
shopt -u nullglob
files=("${files[@]:0:max_files}")

if (( ${#files[@]} == 0 )); then
  echo "Warning: no *.json.gz files to load from '$DATA_DIRECTORY'." >&2
fi

MAX_ATTEMPT=10

# Load data
for file in "${files[@]}"; do
  echo "Processing file: $file"

  # Uncompress the file into the TEMP_DIR
  uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")"
  if ! gunzip -c "$file" > "$uncompressed_file"; then
    echo "Error: Failed to uncompress $file" >> "$ERROR_LOG"
    rm -f -- "$uncompressed_file"
    continue
  fi

  attempt=0
  while (( attempt < MAX_ATTEMPT )); do
    # Start from an empty response so a failed connection cannot be judged
    # by the body of a previous attempt
    : > "$RESPONSE_FILE"

    # Attempt the import; curl writes the body straight to the response file,
    # so it is complete by the time curl returns
    http_code=$(curl -s -w "%{http_code}" -o "$RESPONSE_FILE" --location-trusted -u root: \
      -H "strict_mode: true" -H "Expect:100-continue" -H "columns: data" \
      -T "$uncompressed_file" -XPUT "http://127.0.0.1:8030/api/$DB_NAME/$TABLE_NAME/_stream_load")
    response_body="$(cat "$RESPONSE_FILE")"
    response_status="$(jq -r '.Status' "$RESPONSE_FILE")"
    echo "$response_status"

    if [[ "$http_code" -ge 200 && "$http_code" -lt 300 && "$response_status" == "Success" ]]; then
      echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file. Response: $response_body" >> "$SUCCESS_LOG"
      break
    fi

    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $attempt attempt failed for $file with status code $http_code. Response: $response_body" >> "$ERROR_LOG"
    attempt=$((attempt + 1))
    sleep 2
  done

  # Remove the uncompressed copy whether or not the import succeeded, so
  # failed files do not pile up in the temporary directory during long runs
  rm -f -- "$uncompressed_file"
done

starrocks/main.sh

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#!/bin/bash
#
# Interactive entry point of the StarRocks benchmark.
#
# Usage: main.sh [DATA_DIRECTORY] [SUCCESS_LOG] [ERROR_LOG] [OUTPUT_PREFIX]
#   DATA_DIRECTORY  directory with the input files (default: ~/data/bluesky)
#   SUCCESS_LOG     load success log (default: success.log)
#   ERROR_LOG       load error log (default: error.log)
#   OUTPUT_PREFIX   prefix of every result file (default: _m6i.8xlarge)
#
# Asks for a dataset size, runs ./install.sh and then benchmarks the chosen
# size(s) once with the lz4 DDL and once with the zstd DDL.

# Default data directory
DEFAULT_DATA_DIRECTORY=~/data/bluesky

# Allow the user to optionally provide the data directory as an argument
DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}"

# Define success and error log files
SUCCESS_LOG="${2:-success.log}"
ERROR_LOG="${3:-error.log}"

# Define prefix for output files
OUTPUT_PREFIX="${4:-_m6i.8xlarge}"

# Check if the directory exists
if [[ ! -d "$DATA_DIRECTORY" ]]; then
  echo "Error: Data directory '$DATA_DIRECTORY' does not exist." >&2
  exit 1
fi

echo "Select the dataset size to benchmark:"
echo "1) 1m (default)"
echo "2) 10m"
echo "3) 100m"
echo "4) 1000m"
echo "5) all"
read -r -p "Enter the number corresponding to your choice: " choice

./install.sh

#######################################
# Loads one dataset, records size, row count and query plans, runs the
# queries and finally drops the table.
# Globals:   DATA_DIRECTORY, SUCCESS_LOG, ERROR_LOG, OUTPUT_PREFIX (read)
# Arguments: $1 - number of input files to load (also used in the DB name)
#            $2 - DDL variant; selects ddl_<variant>.sql
# Outputs:   result files named <OUTPUT_PREFIX>_bluesky_<size>m_<variant>.*
# Exits:     with status 1 when the data directory holds too few input files
#######################################
benchmark() {
  local size=$1
  local suffix=$2
  local db_name="bluesky_${size}m_${suffix}"
  local out_base="${OUTPUT_PREFIX}_${db_name}"
  local -a data_files
  local file_count

  # Count exactly the files the loader will pick up (top-level *.json.gz);
  # counting every file in the tree could let the check pass without data.
  shopt -s nullglob
  data_files=("$DATA_DIRECTORY"/*.json.gz)
  shopt -u nullglob
  file_count=${#data_files[@]}

  if (( file_count < size )); then
    echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." >&2
    exit 1
  fi

  ./create_and_load.sh "$db_name" bluesky "ddl_${suffix}.sql" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG"
  ./total_size.sh "$db_name" bluesky | tee "${out_base}.total_size"
  ./count.sh "$db_name" bluesky | tee "${out_base}.count"
  ./physical_query_plans.sh "$db_name" | tee "${out_base}.physical_query_plans"
  ./benchmark.sh "$db_name" "${out_base}.results_runtime" "${out_base}.results_memory_usage"
  ./drop_table.sh "$db_name" bluesky
}

# Map the menu choice to the dataset sizes to run; anything else means 1m
case "$choice" in
  2) sizes=(10) ;;
  3) sizes=(100) ;;
  4) sizes=(1000) ;;
  5) sizes=(1 10 100 1000) ;;
  *) sizes=(1) ;;
esac

# Each size is benchmarked with both compression variants, lz4 first
for size in "${sizes[@]}"; do
  benchmark "$size" lz4
  benchmark "$size" zstd
done

starrocks/physical_query_plans.sh

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/bash
#
# Prints the EXPLAIN output for every query in queries.sql (one query per
# line), numbering the queries Q1, Q2, ... in file order.
#
# Usage: physical_query_plans.sh <DB_NAME>

# Check if the required arguments are provided
if [[ $# -lt 1 ]]; then
  echo "Usage: $0 <DB_NAME>" >&2
  exit 1
fi

# Arguments
DB_NAME="$1"

QUERY_NUM=1

# "|| [[ -n ... ]]" keeps a final query that has no trailing newline
while IFS= read -r query || [[ -n "$query" ]]; do
  # Skip blank lines; they are not queries and must not consume a number
  [[ -z "${query//[[:space:]]/}" ]] && continue

  # Print the query number
  echo "------------------------------------------------------------------------------------------------------------------------"
  echo "Physical query plan for query Q$QUERY_NUM:"
  echo
  # stdin is detached so the client can never consume the remaining queries
  mysql -P 9030 -h 127.0.0.1 -u root "$DB_NAME" -e "EXPLAIN $query" < /dev/null

  # Increment the query number
  QUERY_NUM=$((QUERY_NUM + 1))
done < queries.sql

0 commit comments

Comments
 (0)