postgres-cluster/benchmark.sh at master · JGjorgji/postgres-cluster · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#! /usr/bin/env bash

# Strict mode: abort on command failure, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Defaults for the settings that the command line flags may override.
CONCURRENCY=1
SCALE_FACTOR=1
FORCE=false
USER="postgres"
# Host and port have no default; they stay empty until -h / -p are given.
HOST=""
PORT=""

# DBGEN_LOCATION must come from the environment: it is the directory that
# holds the dbgen binary and the *.tbl files it generates.
# The ":-" default is required because nounset is active; without it an
# unset variable aborts with "unbound variable" before this message prints.
if [[ -z ${DBGEN_LOCATION:-} ]]; then
    echo "Need to specify DBGEN_LOCATION env variable" >&2
    exit 1
fi

# Parse the command line flags into the global settings.
#   -s <factor>   scale factor handed to dbgen          (SCALE_FACTOR)
#   -c <clients>  number of concurrent clients per query (CONCURRENCY)
#   -h <host>     database host                          (HOST)
#   -p <port>     database port                          (PORT)
#   -f            regenerate and reload the data         (FORCE)
# Prints a diagnostic to stderr and exits with status 1 on an unknown flag,
# a missing flag argument, or a concurrency that is not a positive integer.
parse_args () {
    local opt
    local OPTIND=1

    # The leading ":" selects silent mode so that a missing argument (":")
    # can be told apart from an unknown flag ("?").
    while getopts ":s:c:h:fp:" opt; do
        case ${opt} in
            s)
                SCALE_FACTOR=${OPTARG}
                ;;
            f)
                FORCE=true
                ;;
            c)
                # Reject bad values here instead of failing later when the
                # client loop is started.
                if [[ ! ${OPTARG} =~ ^[1-9][0-9]*$ ]]; then
                    echo "Concurrency (-c) must be a positive integer, got '${OPTARG}'" >&2
                    exit 1
                fi
                CONCURRENCY=${OPTARG}
                ;;
            h)
                HOST=${OPTARG}
                ;;
            p)
                PORT=${OPTARG}
                ;;
            :)
                echo "Option -${OPTARG} requires an argument" >&2
                exit 1
                ;;
            *)
                echo "Unknown command line option: -${OPTARG}" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Print the psql command line shared by every database call.
# Reads USER, HOST and PORT. -h / -p are only added when a value was given,
# so that an empty HOST or PORT cannot make psql take the next flag as the
# value. Returns 1 with a message on stderr when psql is not in PATH.
_build_psql_command () {
    local psql_bin
    if ! psql_bin=$(command -v psql); then
        echo "psql not found in PATH" >&2
        return 1
    fi

    local cmd="${psql_bin} -U ${USER}"
    if [[ -n ${HOST:-} ]]; then
        cmd+=" -h ${HOST}"
    fi
    if [[ -n ${PORT:-} ]]; then
        cmd+=" -p ${PORT}"
    fi
    printf '%s\n' "${cmd}"
}

# PSQL is kept as a single string because the functions below expand it
# unquoted. A failing command substitution makes the assignment fail, which
# aborts the script through errexit.
PSQL="$(_build_psql_command)"

# Issue a VACUUM FULL statement through the shared psql command line.
vaccum_db () {
    local -r statement="VACUUM FULL;"
    # PSQL is expanded unquoted on purpose: it holds the binary and its flags.
    ${PSQL} -c "${statement}"
}

# Regenerate every *.tbl file inside DBGEN_LOCATION with dbgen at the
# configured SCALE_FACTOR.
# Returns non-zero when the directory cannot be entered or dbgen fails; the
# caller's working directory is restored in both cases.
generate_data () {
    # Never fall through to the rm below when the directory change failed:
    # that would delete *.tbl files in whatever directory we happen to be in.
    pushd "${DBGEN_LOCATION}" || return 1
    # If -f is provided we regenerate all the data
    # since we don't know what the previous scale factor was
    rm -f ./*.tbl
    if ! ./dbgen -s "${SCALE_FACTOR}" -T a -f; then
        popd
        return 1
    fi
    popd
}

# Create the ssb schema and then feed the DDL in SCHEMA_FILE to psql.
# SCHEMA_FILE is not set anywhere in this script, so it has to be provided
# by the environment. Returns 1, before touching the database, when it is
# unset or not readable.
create_schema_and_tables () {
    if [[ -z ${SCHEMA_FILE:-} ]]; then
        echo "Need to specify SCHEMA_FILE env variable" >&2
        return 1
    fi
    if [[ ! -r ${SCHEMA_FILE} ]]; then
        echo "Cannot read schema file: ${SCHEMA_FILE}" >&2
        return 1
    fi

    ${PSQL} -c "CREATE SCHEMA ssb;" || return 1
    # Quoted so that a path containing whitespace is not an ambiguous redirect.
    ${PSQL} < "${SCHEMA_FILE}"
}

# Load every *.tbl file found in DBGEN_LOCATION into the ssb table that has
# the same name as the file (minus the .tbl suffix).
# Arguments: $1 - optional number of lines per COPY batch (default 7000000)
# Each file is sent in batches through "COPY ... FROM stdin", and vaccum_db
# is run after every batch. The final, shorter batch is loaded as well, and
# the .tbl files are only read, never modified.
# Returns 1 on an invalid batch size or when a COPY or vacuum fails.
load_data () {
    local chunk_lines=${1:-7000000}
    local file table total first last

    # A batch size of 0 would never advance through the file.
    if [[ ! ${chunk_lines} =~ ^[1-9][0-9]*$ ]]; then
        echo "load_data: batch size must be a positive integer, got '${chunk_lines}'" >&2
        return 1
    fi

    for file in "${DBGEN_LOCATION}"/*.tbl; do
        # An unmatched glob stays literal; skip it.
        [[ -f ${file} ]] || continue

        table=${file##*/}
        table=${table%.tbl}
        # awk counts a last line that has no trailing newline; wc -l would not.
        total=$(awk 'END { print NR }' "${file}")

        first=1
        while (( first <= total )); do
            last=$(( first + chunk_lines - 1 ))
            # Print lines first..last and stop reading right after the batch.
            # Each batch rescans the file from the top, which trades extra
            # reads for not having to rewrite the file.
            sed -n -e "${first},${last}p" -e "${last}q" "${file}" \
                | ${PSQL} -c "COPY ssb.${table} FROM stdin WITH DELIMITER '|';" \
                || return 1
            vaccum_db || return 1
            first=$(( last + 1 ))
        done
    done
}

# Run one query file with CONCURRENCY parallel psql clients.
# Arguments: $1 - path of the query file
# For client number N the psql output goes to
# "${RESULTDIR}/<query file name>-N" and the `time` report, together with
# anything psql writes to stderr, goes to the same path plus ".time".
# Failed clients are reported on stderr. The function still returns 0 so the
# remaining queries keep running, as they did before.
run_query () {
    local query_location=${1}
    local query_name=${query_location##*/}
    local client out pid
    local pids=""
    local failed=0

    for (( client = 1; client <= CONCURRENCY; client++ )); do
        out="${RESULTDIR}/${query_name}-${client}"
        { time ${PSQL} < "${query_location}" ; } > "${out}" 2> "${out}.time" &
        pids="${pids} $!"
    done

    # pids is a plain space-separated list, expanded unquoted on purpose; an
    # empty list is safe under nounset on every bash version.
    for pid in ${pids}; do
        wait "${pid}" || failed=$(( failed + 1 ))
    done

    if (( failed > 0 )); then
        echo "run_query: ${failed} of ${CONCURRENCY} clients failed for ${query_location}" >&2
    fi
    return 0
}

# Run every query file found in the queries/ directory, which is resolved
# relative to the current working directory, one file after the other.
run_all () {
    local query
    for query in queries/*; do
        # Skip the literal pattern left behind when nothing matches, and
        # anything that is not a regular file.
        [[ -f ${query} ]] || continue
        run_query "${query}"
    done
}

# Run one complete benchmark.
# Reads FORCE, HOST, SCALE_FACTOR, CONCURRENCY and PSQL, plus RESULTDIR
# (and SCHEMA_FILE when -f was given), which this script never sets and
# therefore must come from the environment.
# Steps:
#   1. with -f: regenerate the data, drop and recreate the ssb schema, load
#   2. create a fresh "${RESULTDIR}/${HOST}-${SCALE_FACTOR}-${CONCURRENCY}"
#      directory and point RESULTDIR at it
#   3. start collectl on all ansible hosts, run all queries, stop collectl
#      and fetch the collectl output into the result directory
# Writes "started" and "finished" timestamp files and exits with status 0.
main () {

    # Validate the environment before any long-running work, instead of
    # tripping over nounset after the data has been generated and loaded.
    if [[ -z ${RESULTDIR:-} ]]; then
        echo "Need to specify RESULTDIR env variable" >&2
        exit 1
    fi
    if [[ ${FORCE} == true && -z ${SCHEMA_FILE:-} ]]; then
        echo "Need to specify SCHEMA_FILE env variable when using -f" >&2
        exit 1
    fi

    # Force needs to always be specified when changing scale factor between runs!!!
    if [[ ${FORCE} == true ]]; then
        generate_data
        ${PSQL} -c "DROP SCHEMA IF EXISTS ssb CASCADE;"
        create_schema_and_tables
        load_data
    fi

    # Always have the basedir present
    mkdir -p "${RESULTDIR}"

    # From here on RESULTDIR is the per-run directory; run_query reads it.
    RESULTDIR="${RESULTDIR}/${HOST}-${SCALE_FACTOR}-${CONCURRENCY}"

    # Wipe per scale factor results on rerun
    rm -rf "${RESULTDIR}"
    mkdir -p "${RESULTDIR}"

    date > "${RESULTDIR}/started"

    # Start collectl on all nodes
    ansible all -m shell -a "nohup collectl --all > /root/{{ inventory_hostname }}.collectl &" -u root

    run_all

    # Stop collectl
    ansible all -m shell -a "pkill collectl" -u root

    # Get the results to the control node.
    # With flat=yes the destination needs a trailing slash to be treated as a
    # directory; RESULTDIR exists at this point, so without the slash fetch
    # refuses to write to it.
    ansible all -m fetch -a "src=/root/{{ inventory_hostname }}.collectl dest=${RESULTDIR}/ flat=yes" -u root

    date > "${RESULTDIR}/finished"
    exit 0
}

# Script entry point: every function above is defined, so run the benchmark.
main