4 changes: 2 additions & 2 deletions .github/actions/velox-deps-fetch/action.yml
@@ -49,7 +49,7 @@ runs:
set +e
./fetch_centos_deps_image.sh
status=$?

if [[ $status -eq 0 ]]; then
echo "Successfully fetched and loaded deps image from S3"
echo "fetch_status=success" >> $GITHUB_OUTPUT
@@ -58,7 +58,7 @@ runs:
echo "Deps image not found in S3 or failed to load"
echo "fetch_status=not_found" >> $GITHUB_OUTPUT
echo "needs_build=true" >> $GITHUB_OUTPUT

# Check if we should fail instead of building
if [[ "${{ inputs.fail_on_fetch_error }}" == "true" ]]; then
echo "ERROR: fail_on_fetch_error is enabled. Exiting workflow."
2 changes: 1 addition & 1 deletion .github/actions/velox-setup/action.yml
@@ -32,7 +32,7 @@ runs:
run: |
VELOX_REF="${{ inputs.velox_commit }}"
echo "Input ref: $VELOX_REF"

if [[ "$VELOX_REF" == "main" || "$VELOX_REF" == "master" ]]; then
cd velox
ACTUAL_SHA=$(git rev-parse HEAD)
2 changes: 1 addition & 1 deletion .github/workflows/presto-test.yml
@@ -60,7 +60,7 @@ jobs:
java:
if: ${{ inputs.run_java_tests }}
uses: ./.github/workflows/presto-test-composite.yml
with:
presto_worker_type: 'java'
node_label: 'linux-amd64-cpu8'
presto_repository: ${{ inputs.presto_repository }}
40 changes: 20 additions & 20 deletions .github/workflows/velox-create-staging.yml
@@ -92,7 +92,7 @@ jobs:
GH_TOKEN: ${{ secrets.VELOX_TEST_PAT || secrets.GITHUB_TOKEN }}
run: |
rapids-echo-stderr "Auto-fetch enabled. Scanning for PRs with 'ready-to-merge' label..."

PR_LIST=$(rapids-retry --quiet gh pr list \
--repo facebookincubator/velox \
--label "ready-to-merge" \
@@ -149,23 +149,23 @@ jobs:

for pr_num in $PR_LIST; do
rapids-echo-stderr "Checking PR #$pr_num..."

# Check if this PR touches the cudf directory (skip check for manual PRs)
if [ "$AUTO_FETCH" != "false" ]; then
affected_files=$(rapids-retry --quiet gh pr diff $pr_num --repo facebookincubator/velox --name-only)

if ! echo "$affected_files" | grep -q "velox/experimental/cudf"; then
rapids-echo-stderr " -> PR #$pr_num does not touch cudf. Skipping."
((SKIPPED_COUNT++))
continue
fi
fi

rapids-echo-stderr " -> PR #$pr_num: Attempting merge..."

# Fetch the specific PR head
rapids-retry git fetch upstream pull/$pr_num/head:pr-$pr_num

# Try to merge pr-$pr_num into staging - FAIL IMMEDIATELY on conflict
if ! git merge pr-$pr_num --no-edit; then
rapids-echo-stderr "❌ MERGE CONFLICT in PR #$pr_num!"
@@ -174,14 +174,14 @@ jobs:
git merge --abort
exit 1
fi

rapids-echo-stderr " -> ✅ Merged PR #$pr_num successfully."
((MERGED_COUNT++))
MERGED_PRS="$MERGED_PRS $pr_num"
done

rapids-echo-stderr "Summary: Merged=$MERGED_COUNT, Skipped=$SKIPPED_COUNT"

# Output the list of successfully merged PRs
echo "merged_prs=${MERGED_PRS}" >> $GITHUB_OUTPUT
echo "merged_count=$MERGED_COUNT" >> $GITHUB_OUTPUT
@@ -195,7 +195,7 @@ jobs:
TARGET_REPO: ${{ inputs.target_repository }}
run: |
rapids-echo-stderr "Pushing ${{ inputs.target_branch }} branch to ${TARGET_REPO}..."

# For scheduled runs, always force push since we're resetting to upstream/main
# For manual dispatch, respect the force_push input (defaults to false)
if [ "$EVENT_NAME" == "schedule" ] || [ "$FORCE_PUSH" == "true" ]; then
@@ -209,7 +209,7 @@ jobs:
exit 1
fi
fi

FINAL_COMMIT=$(git rev-parse HEAD)
rapids-echo-stderr "✅ Done! ${{ inputs.target_branch }} branch updated successfully."
echo "Final commit: $FINAL_COMMIT"
@@ -224,29 +224,29 @@ jobs:
TARGET_REPO: ${{ inputs.target_repository }}
run: |
rapids-echo-stderr "Checking for existing ${{ inputs.target_branch }} PR..."

# Check if a PR already exists from staging to main
EXISTING_PR=$(gh pr list --repo "$TARGET_REPO" --head ${{ inputs.target_branch }} --base main --state open --json number,url --jq '.[0]' 2>/dev/null || echo "")

if [ -n "$EXISTING_PR" ] && [ "$EXISTING_PR" != "null" ]; then
PR_NUM=$(echo "$EXISTING_PR" | jq -r '.number')
PR_URL=$(echo "$EXISTING_PR" | jq -r '.url')
rapids-echo-stderr "📌 PR already exists: $PR_URL"
rapids-echo-stderr "Skipping PR creation."
exit 0
fi

rapids-echo-stderr "Creating new PR from ${{ inputs.target_branch }} to main..."

# Build PR body with links to merged PRs
PR_TITLE="🚀 ${{ inputs.target_branch }}: Sync with upstream + cuDF PRs"

PR_BODY="## ${{ inputs.target_branch }} Branch Update

This PR aggregates the following cuDF-related PRs from [facebookincubator/velox](https://github.com/facebookincubator/velox):

"

if [ -n "$MERGED_PRS" ]; then
for pr_num in $MERGED_PRS; do
PR_BODY="${PR_BODY}- https://github.com/facebookincubator/velox/pull/${pr_num}
@@ -259,12 +259,12 @@ jobs:
PR_BODY="${PR_BODY}_No cuDF PRs were merged in this update._
"
fi

PR_BODY="${PR_BODY}
---
_This PR was automatically created by the [Update ${{ inputs.target_branch }} Branch](https://github.com/${{ github.repository }}/actions/workflows/velox-create-staging.yml) workflow._
"

NEW_PR_URL=$(gh pr create --repo "$TARGET_REPO" --head ${{ inputs.target_branch }} --base main --title "$PR_TITLE" --body "$PR_BODY" 2>&1) || {
# PR creation might fail if there is no commit difference
if echo "$NEW_PR_URL" | grep -q "No commits"; then
16 changes: 8 additions & 8 deletions .github/workflows/velox-deps-upload.yml
@@ -66,43 +66,43 @@ jobs:
# Configure AWS region
export AWS_DEFAULT_REGION="${S3_BUCKET_REGION}"
export AWS_REGION="${S3_BUCKET_REGION}"

# Derive writer ARN string from reader ARN
WRITER_ARN_STRING=$(echo "${AWS_ARN_STRING}" | sed 's/reader/writer/g')

echo "Verifying S3 credentials for bucket: ${S3_BUCKET_NAME}"

# Assume WRITER IAM role
echo "Assuming WRITER IAM role..."
WRITER_CREDS_JSON=$(aws sts assume-role \
--role-arn "${WRITER_ARN_STRING}" \
--role-session-name "VerifyS3Access" \
--query "Credentials" \
--output json)

if [ $? -ne 0 ]; then
echo "❌ Failed to assume WRITER IAM role."
exit 1
fi
echo "✅ Successfully assumed WRITER IAM role."

# Set writer credentials
export AWS_ACCESS_KEY_ID=$(echo "$WRITER_CREDS_JSON" | jq -r '.AccessKeyId')
export AWS_SECRET_ACCESS_KEY=$(echo "$WRITER_CREDS_JSON" | jq -r '.SecretAccessKey')
export AWS_SESSION_TOKEN=$(echo "$WRITER_CREDS_JSON" | jq -r '.SessionToken')

# Test LIST access
echo "Testing LIST access..."
if ! aws s3 ls "s3://${S3_BUCKET_NAME}/" > /dev/null; then
echo "❌ LIST access FAILED! Check writer role permissions."
exit 1
fi
echo "✅ LIST access verified."

# Generate unique test file name
TEST_FILE="s3-access-test-$(date +%s)-${RANDOM}.txt"
TEST_CONTENT="S3 access verification test - $(date)"

# Test WRITE access
echo "Testing WRITE access..."
if ! echo "${TEST_CONTENT}" | aws s3 cp - "s3://${S3_BUCKET_NAME}/${TEST_FILE}"; then
2 changes: 1 addition & 1 deletion .github/workflows/velox-test.yml
@@ -106,7 +106,7 @@ jobs:
velox-gpu:
if: ${{ inputs.build_target == 'all' || inputs.build_target == 'gpu' }}
runs-on: linux-amd64-gpu-l4-latest-1

env:
GH_TOKEN: ${{ github.token }}
DOCKER_RUNTIME: nvidia
2 changes: 1 addition & 1 deletion .gitignore
@@ -22,4 +22,4 @@ presto/docker/config/generated*/
# Generated Presto Docker Compose files
presto/docker/docker-compose/generated*/

devstate*
67 changes: 67 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,67 @@
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
ci:
autofix_commit_msg: "[pre-commit.ci] auto code formatting"
autofix_prs: false
autoupdate_branch: ""
autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate"
autoupdate_schedule: quarterly
submodules: false

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v6.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
# These will be enabled in a follow-up PR
# - repo: https://github.com/astral-sh/ruff-pre-commit
# rev: v0.14.10
# hooks:
# - id: ruff-check
# args: ["--fix"]
# - id: ruff-format
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
hooks:
- id: codespell
additional_dependencies: [tomli]
args: ["--toml", "pyproject.toml"]
exclude: |
(?x)^(
^pyproject.toml$|
^presto/testing/common/queries/tpcds/queries.json$
)
- repo: https://github.com/rapidsai/pre-commit-hooks
rev: v1.2.1
hooks:
- id: verify-copyright
args: [--fix, --spdx]
files: |
(?x)
[.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx|rs|java)$|
CMakeLists[.]txt$|
CMakeLists_standalone[.]txt$|
meta[.]yaml$|
pyproject[.]toml$|
setup[.]cfg$|
^[.]pre-commit-config[.]yaml$|
Makefile$|
recipe[.]yaml$|
dependencies[.]yaml$
- repo: https://github.com/shellcheck-py/shellcheck-py
rev: v0.11.0.1
hooks:
- id: shellcheck
# These will be enabled in a follow-up PR
exclude: |
(?x)^(
^benchmark_data_tools/.*$|
^presto/.*$|
^scripts/.*$|
^template_rendering/.*$|
^velox/.*$
)

default_language_version:
python: python3
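
Once this config lands, contributors can exercise the new hooks locally with the standard `pre-commit` CLI; a minimal sketch, assuming `pre-commit` itself is already installed (e.g. `pip install pre-commit`):

```sh
# One-time setup: run the configured hooks automatically on every `git commit`
pre-commit install

# Run every hook (trailing-whitespace, end-of-file-fixer, codespell,
# verify-copyright, shellcheck) across the whole repository once
pre-commit run --all-files
```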
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -36,4 +36,4 @@ conduct. More information can be found at:
8. Wait for other developers to review your code and update code as needed.
9. Once reviewed and approved, a RAPIDS developer will merge your pull request.

If you are unsure about anything, don't hesitate to comment on issues and ask for clarification!
2 changes: 1 addition & 1 deletion LICENSE
@@ -198,4 +198,4 @@
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
6 changes: 3 additions & 3 deletions README.md
@@ -48,7 +48,7 @@ Authentication files are stored in `~/.sccache-auth/` by default and credentials
A Docker-based benchmarking infrastructure has been added to facilitate running Velox benchmarks with support for CPU/GPU execution engines and profiling capabilities. The infrastructure uses a dedicated `velox-benchmark` Docker service with pre-configured volume mounts that automatically sync benchmark data and results. The data follows Hive directory structure, making it compatible with Presto. Currently, only TPC-H is implemented, but the infrastructure is designed to be easily extended to support additional benchmarks in the future.

### Prerequisites
The benchmarking infrastructure requires the same directory structure as Velox Testing, plus benchmark data using Hive directory structure. For TPC-H, the required data layout is shown below.

```
velox-benchmark-data/
@@ -63,7 +63,7 @@ The benchmarking infrastructure requires the same directory structure as Velox T
└─ supplier/
```

By default, the data directory is named `velox-benchmark-data`, but you can specify a different directory using a command-line option. The data must follow the Hive-style partition layout backed by Parquet files.
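
As a sketch of that skeleton, the per-table directories can be created like this; the table names are the standard TPC-H set (an assumption, since the tree above is truncated and only `supplier` is visible in it):

```sh
# Create the Hive-style directory skeleton: one directory per TPC-H table.
mkdir -p velox-benchmark-data/{customer,lineitem,nation,orders,part,partsupp,region,supplier}

# Each table directory then holds that table's Parquet files, e.g.
#   velox-benchmark-data/supplier/part-0.parquet
```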

### Building for Benchmarks
Before running benchmarks, Velox must be built with benchmarking support enabled:
@@ -122,7 +122,7 @@ A number of docker image build and container services infrastructure (using dock
├─ velox-testing
├─ presto
├─ velox
```
Specifically, the `velox-testing`, `presto`, and `velox` repositories have to be checked out as sibling directories under the same parent directory. Once that is done, navigate (`cd`) into the `velox-testing/presto/scripts` directory and execute the start-up script for the needed presto deployment variant. The following scripts: `start_java_presto.sh`, `start_native_cpu_presto.sh`, and `start_native_gpu_presto.sh` can be used to build/deploy "Presto Java Coordinator + Presto Java Worker", "Presto Java Coordinator + Presto Native CPU Worker", and "Presto Java Coordinator + Presto Native GPU Worker" variants respectively. The presto server can then be accessed at http://localhost:8080.
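
For example, bringing up the native GPU variant from that layout (a sketch, assuming the sibling checkouts described above are already in place):

```sh
cd velox-testing/presto/scripts
./start_native_gpu_presto.sh
# Once the containers are up, the Presto UI is reachable at http://localhost:8080
```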

Note that CPU and GPU builds require a local dependencies/run-time base Docker image (`presto/prestissimo-dependency:centos9`). The `start` scripts will not create this automatically. It must be obtained manually. Use the `build_centos_deps_image.sh` script to build an image locally, or the `fetch_centos_deps_image.sh` script to fetch a pre-built image from an external source. Note that the latter script currently requires additional credentials not available to third-parties.
2 changes: 1 addition & 1 deletion benchmark_data_tools/generate_data_files.py
@@ -100,7 +100,7 @@ def generate_data_files_with_tpchgen(args):

# This dictionary maps each table to the number of partitions it should have based on its
# expected file size relative to the SF.
-# We generate a small sample bechmark (sf-0.01) to sample the ratio of how many rows are generated.
+# We generate a small sample benchmark (sf-0.01) to sample the ratio of how many rows are generated.
def get_table_sf_ratios(scale_factor, max_rows):
int_scale_factor = int(scale_factor)
int_scale_factor = 1 if int_scale_factor < 1 else int_scale_factor
2 changes: 1 addition & 1 deletion benchmark_data_tools/rewrite_parquet.py
@@ -79,7 +79,7 @@ def process_file(input_file_path, output_dir, input_dir, verbose, convert_decima

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Alter an exising directory of parquet files",
description="Alter an existing directory of parquet files",
formatter_class=argparse.RawDescriptionHelpFormatter
)

10 changes: 6 additions & 4 deletions presto/docker/presto_profiling_wrapper.sh
@@ -1,15 +1,17 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

set -e

if [[ "$PROFILE" == "ON" ]]; then
mkdir /presto_profiles

if [[ -z $PROFILE_ARGS ]]; then
PROFILE_ARGS="-t nvtx,cuda,osrt,ucx
--cuda-memory-usage=true
--cuda-um-cpu-page-faults=true
--cuda-um-gpu-page-faults=true
PROFILE_ARGS="-t nvtx,cuda,osrt,ucx
--cuda-memory-usage=true
--cuda-um-cpu-page-faults=true
--cuda-um-gpu-page-faults=true
--cudabacktrace=true"
fi
PROFILE_CMD="nsys launch $PROFILE_ARGS"
2 changes: 1 addition & 1 deletion presto/install_gperf.patch
@@ -4,7 +4,7 @@ index 4f0dfcd4c5..404e4ff76a 100755
+++ b/presto-native-execution/scripts/setup-centos.sh
@@ -38,7 +38,8 @@ function install_presto_deps_from_package_managers {
}

function install_gperf {
- wget ${WGET_OPTIONS} https://ftp.gnu.org/pub/gnu/gperf/gperf-3.1.tar.gz &&
+ wget ${WGET_OPTIONS} https://ftp.gnu.org/pub/gnu/gperf/gperf-3.1.tar.gz ||
2 changes: 1 addition & 1 deletion presto/pbench/benchmarks/tpch/sf100.json
@@ -28,5 +28,5 @@
"duckdb_queries/20.sql",
"duckdb_queries/21.sql",
"duckdb_queries/22.sql"
]
}