4 changes: 2 additions & 2 deletions .github/actions/velox-deps-fetch/action.yml
@@ -49,7 +49,7 @@ runs:
set +e
./fetch_centos_deps_image.sh
status=$?

if [[ $status -eq 0 ]]; then
echo "Successfully fetched and loaded deps image from S3"
echo "fetch_status=success" >> $GITHUB_OUTPUT
@@ -58,7 +58,7 @@ runs:
echo "Deps image not found in S3 or failed to load"
echo "fetch_status=not_found" >> $GITHUB_OUTPUT
echo "needs_build=true" >> $GITHUB_OUTPUT

# Check if we should fail instead of building
if [[ "${{ inputs.fail_on_fetch_error }}" == "true" ]]; then
echo "ERROR: fail_on_fetch_error is enabled. Exiting workflow."
2 changes: 1 addition & 1 deletion .github/actions/velox-setup/action.yml
@@ -32,7 +32,7 @@ runs:
run: |
VELOX_REF="${{ inputs.velox_commit }}"
echo "Input ref: $VELOX_REF"

if [[ "$VELOX_REF" == "main" || "$VELOX_REF" == "master" ]]; then
cd velox
ACTUAL_SHA=$(git rev-parse HEAD)
2 changes: 1 addition & 1 deletion .github/workflows/presto-test.yml
@@ -60,7 +60,7 @@ jobs:
java:
if: ${{ inputs.run_java_tests }}
uses: ./.github/workflows/presto-test-composite.yml
with:
presto_worker_type: 'java'
node_label: 'linux-amd64-cpu8'
presto_repository: ${{ inputs.presto_repository }}
40 changes: 20 additions & 20 deletions .github/workflows/velox-create-staging.yml
@@ -92,7 +92,7 @@ jobs:
GH_TOKEN: ${{ secrets.VELOX_TEST_PAT || secrets.GITHUB_TOKEN }}
run: |
rapids-echo-stderr "Auto-fetch enabled. Scanning for PRs with 'ready-to-merge' label..."

PR_LIST=$(rapids-retry --quiet gh pr list \
--repo facebookincubator/velox \
--label "ready-to-merge" \
@@ -149,23 +149,23 @@ jobs:

for pr_num in $PR_LIST; do
rapids-echo-stderr "Checking PR #$pr_num..."

# Check if this PR touches the cudf directory (skip check for manual PRs)
if [ "$AUTO_FETCH" != "false" ]; then
affected_files=$(rapids-retry --quiet gh pr diff $pr_num --repo facebookincubator/velox --name-only)

if ! echo "$affected_files" | grep -q "velox/experimental/cudf"; then
rapids-echo-stderr " -> PR #$pr_num does not touch cudf. Skipping."
((SKIPPED_COUNT++))
continue
fi
fi

rapids-echo-stderr " -> PR #$pr_num: Attempting merge..."

# Fetch the specific PR head
rapids-retry git fetch upstream pull/$pr_num/head:pr-$pr_num

# Try to merge pr-$pr_num into staging - FAIL IMMEDIATELY on conflict
if ! git merge pr-$pr_num --no-edit; then
rapids-echo-stderr "❌ MERGE CONFLICT in PR #$pr_num!"
@@ -174,14 +174,14 @@ jobs:
git merge --abort
exit 1
fi

rapids-echo-stderr " -> ✅ Merged PR #$pr_num successfully."
((MERGED_COUNT++))
MERGED_PRS="$MERGED_PRS $pr_num"
done

rapids-echo-stderr "Summary: Merged=$MERGED_COUNT, Skipped=$SKIPPED_COUNT"

# Output the list of successfully merged PRs
echo "merged_prs=${MERGED_PRS}" >> $GITHUB_OUTPUT
echo "merged_count=$MERGED_COUNT" >> $GITHUB_OUTPUT
@@ -195,7 +195,7 @@ jobs:
TARGET_REPO: ${{ inputs.target_repository }}
run: |
rapids-echo-stderr "Pushing ${{ inputs.target_branch }} branch to ${TARGET_REPO}..."

# For scheduled runs, always force push since we're resetting to upstream/main
# For manual dispatch, respect the force_push input (defaults to false)
if [ "$EVENT_NAME" == "schedule" ] || [ "$FORCE_PUSH" == "true" ]; then
@@ -209,7 +209,7 @@ jobs:
exit 1
fi
fi

FINAL_COMMIT=$(git rev-parse HEAD)
rapids-echo-stderr "✅ Done! ${{ inputs.target_branch }} branch updated successfully."
echo "Final commit: $FINAL_COMMIT"
@@ -224,29 +224,29 @@ jobs:
TARGET_REPO: ${{ inputs.target_repository }}
run: |
rapids-echo-stderr "Checking for existing ${{ inputs.target_branch }} PR..."

# Check if a PR already exists from staging to main
EXISTING_PR=$(gh pr list --repo "$TARGET_REPO" --head ${{ inputs.target_branch }} --base main --state open --json number,url --jq '.[0]' 2>/dev/null || echo "")

if [ -n "$EXISTING_PR" ] && [ "$EXISTING_PR" != "null" ]; then
PR_NUM=$(echo "$EXISTING_PR" | jq -r '.number')
PR_URL=$(echo "$EXISTING_PR" | jq -r '.url')
rapids-echo-stderr "📌 PR already exists: $PR_URL"
rapids-echo-stderr "Skipping PR creation."
exit 0
fi

rapids-echo-stderr "Creating new PR from ${{ inputs.target_branch }} to main..."

# Build PR body with links to merged PRs
PR_TITLE="🚀 ${{ inputs.target_branch }}: Sync with upstream + cuDF PRs"

PR_BODY="## ${{ inputs.target_branch }} Branch Update

This PR aggregates the following cuDF-related PRs from [facebookincubator/velox](https://github.com/facebookincubator/velox):

"

if [ -n "$MERGED_PRS" ]; then
for pr_num in $MERGED_PRS; do
PR_BODY="${PR_BODY}- https://github.com/facebookincubator/velox/pull/${pr_num}
@@ -259,12 +259,12 @@ jobs:
PR_BODY="${PR_BODY}_No cuDF PRs were merged in this update._
"
fi

PR_BODY="${PR_BODY}
---
_This PR was automatically created by the [Update ${{ inputs.target_branch }} Branch](https://github.com/${{ github.repository }}/actions/workflows/velox-create-staging.yml) workflow._
"

NEW_PR_URL=$(gh pr create --repo "$TARGET_REPO" --head ${{ inputs.target_branch }} --base main --title "$PR_TITLE" --body "$PR_BODY" 2>&1) || {
# PR creation might fail if there is no commit difference
if echo "$NEW_PR_URL" | grep -q "No commits"; then
16 changes: 8 additions & 8 deletions .github/workflows/velox-deps-upload.yml
@@ -66,43 +66,43 @@ jobs:
# Configure AWS region
export AWS_DEFAULT_REGION="${S3_BUCKET_REGION}"
export AWS_REGION="${S3_BUCKET_REGION}"

# Derive writer ARN string from reader ARN
WRITER_ARN_STRING=$(echo "${AWS_ARN_STRING}" | sed 's/reader/writer/g')

echo "Verifying S3 credentials for bucket: ${S3_BUCKET_NAME}"

# Assume WRITER IAM role
echo "Assuming WRITER IAM role..."
WRITER_CREDS_JSON=$(aws sts assume-role \
--role-arn "${WRITER_ARN_STRING}" \
--role-session-name "VerifyS3Access" \
--query "Credentials" \
--output json)

if [ $? -ne 0 ]; then
echo "❌ Failed to assume WRITER IAM role."
exit 1
fi
echo "✅ Successfully assumed WRITER IAM role."

# Set writer credentials
export AWS_ACCESS_KEY_ID=$(echo "$WRITER_CREDS_JSON" | jq -r '.AccessKeyId')
export AWS_SECRET_ACCESS_KEY=$(echo "$WRITER_CREDS_JSON" | jq -r '.SecretAccessKey')
export AWS_SESSION_TOKEN=$(echo "$WRITER_CREDS_JSON" | jq -r '.SessionToken')

# Test LIST access
echo "Testing LIST access..."
if ! aws s3 ls "s3://${S3_BUCKET_NAME}/" > /dev/null; then
echo "❌ LIST access FAILED! Check writer role permissions."
exit 1
fi
echo "✅ LIST access verified."

# Generate unique test file name
TEST_FILE="s3-access-test-$(date +%s)-${RANDOM}.txt"
TEST_CONTENT="S3 access verification test - $(date)"

# Test WRITE access
echo "Testing WRITE access..."
if ! echo "${TEST_CONTENT}" | aws s3 cp - "s3://${S3_BUCKET_NAME}/${TEST_FILE}"; then
2 changes: 1 addition & 1 deletion .github/workflows/velox-test.yml
@@ -106,7 +106,7 @@ jobs:
velox-gpu:
if: ${{ inputs.build_target == 'all' || inputs.build_target == 'gpu' }}
runs-on: linux-amd64-gpu-l4-latest-1

env:
GH_TOKEN: ${{ github.token }}
DOCKER_RUNTIME: nvidia
2 changes: 1 addition & 1 deletion .gitignore
@@ -22,4 +22,4 @@ presto/docker/config/generated*/
# Generated Presto Docker Compose files
presto/docker/docker-compose/generated*/

devstate*
67 changes: 67 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,67 @@
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
ci:
autofix_commit_msg: "[pre-commit.ci] auto code formatting"
autofix_prs: false
autoupdate_branch: ""
autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate"
autoupdate_schedule: quarterly
submodules: false

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v6.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
# These will be enabled in a follow-up PR
# - repo: https://github.com/astral-sh/ruff-pre-commit
# rev: v0.14.10
# hooks:
# - id: ruff-check
# args: ["--fix"]
# - id: ruff-format
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
hooks:
- id: codespell
additional_dependencies: [tomli]
args: ["--toml", "pyproject.toml"]
exclude: |
(?x)^(
^pyproject.toml$|
^presto/testing/common/queries/tpcds/queries.json$
)
- repo: https://github.com/rapidsai/pre-commit-hooks
rev: v1.2.1
hooks:
- id: verify-copyright
args: [--fix, --spdx]
files: |
(?x)
[.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx|rs|java)$|
CMakeLists[.]txt$|
CMakeLists_standalone[.]txt$|
meta[.]yaml$|
pyproject[.]toml$|
setup[.]cfg$|
^[.]pre-commit-config[.]yaml$|
Makefile$|
recipe[.]yaml$|
dependencies[.]yaml$
- repo: https://github.com/shellcheck-py/shellcheck-py
rev: v0.11.0.1
hooks:
- id: shellcheck
# These will be enabled in a follow-up PR
exclude: |
(?x)^(
^benchmark_data_tools/.*$|
^presto/.*$|
^scripts/.*$|
^template_rendering/.*$|
^velox/.*$
)

default_language_version:
python: python3
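
Once this config lands, contributors can exercise the new hooks locally with the standard `pre-commit` CLI; a minimal sketch, assuming `pre-commit` itself is already installed (e.g. `pip install pre-commit`):

```sh
# One-time setup: run the configured hooks automatically on every `git commit`
pre-commit install

# Run every hook (trailing-whitespace, end-of-file-fixer, codespell,
# verify-copyright, shellcheck) across the whole repository once
pre-commit run --all-files
```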
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -36,4 +36,4 @@ conduct. More information can be found at:
8. Wait for other developers to review your code and update code as needed.
9. Once reviewed and approved, a RAPIDS developer will merge your pull request.

If you are unsure about anything, don't hesitate to comment on issues and ask for clarification!
2 changes: 1 addition & 1 deletion LICENSE
@@ -198,4 +198,4 @@
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
6 changes: 3 additions & 3 deletions README.md
@@ -48,7 +48,7 @@ Authentication files are stored in `~/.sccache-auth/` by default and credentials
A Docker-based benchmarking infrastructure has been added to facilitate running Velox benchmarks with support for CPU/GPU execution engines and profiling capabilities. The infrastructure uses a dedicated `velox-benchmark` Docker service with pre-configured volume mounts that automatically sync benchmark data and results. The data follows Hive directory structure, making it compatible with Presto. Currently, only TPC-H is implemented, but the infrastructure is designed to be easily extended to support additional benchmarks in the future.

### Prerequisites
The benchmarking infrastructure requires the same directory structure as Velox Testing, plus benchmark data using Hive directory structure. For TPC-H, the required data layout is shown below.

```
velox-benchmark-data/
@@ -63,7 +63,7 @@ The benchmarking infrastructure requires the same directory structure as Velox T
└─ supplier/
```

By default, the data directory is named `velox-benchmark-data`, but you can specify a different directory using a command-line option. The data must follow the Hive-style partition layout backed by Parquet files.
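
As a sketch of that skeleton, the per-table directories can be created like this; the table names are the standard TPC-H set (an assumption, since the tree above is truncated and only `supplier` is visible in it):

```sh
# Create the Hive-style directory skeleton: one directory per TPC-H table.
mkdir -p velox-benchmark-data/{customer,lineitem,nation,orders,part,partsupp,region,supplier}

# Each table directory then holds that table's Parquet files, e.g.
#   velox-benchmark-data/supplier/part-0.parquet
```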

### Building for Benchmarks
Before running benchmarks, Velox must be built with benchmarking support enabled:
@@ -122,7 +122,7 @@ A number of docker image build and container services infrastructure (using dock
├─ velox-testing
├─ presto
├─ velox
```
Specifically, the `velox-testing`, `presto`, and `velox` repositories have to be checked out as sibling directories under the same parent directory. Once that is done, navigate (`cd`) into the `velox-testing/presto/scripts` directory and execute the start-up script for the needed presto deployment variant. The following scripts: `start_java_presto.sh`, `start_native_cpu_presto.sh`, and `start_native_gpu_presto.sh` can be used to build/deploy "Presto Java Coordinator + Presto Java Worker", "Presto Java Coordinator + Presto Native CPU Worker", and "Presto Java Coordinator + Presto Native GPU Worker" variants respectively. The presto server can then be accessed at http://localhost:8080.
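
For example, bringing up the native GPU variant from that layout (a sketch, assuming the sibling checkouts described above are already in place):

```sh
cd velox-testing/presto/scripts
./start_native_gpu_presto.sh
# Once the containers are up, the Presto UI is reachable at http://localhost:8080
```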

Note that CPU and GPU builds require a local dependencies/run-time base Docker image (`presto/prestissimo-dependency:centos9`). The `start` scripts will not create this automatically. It must be obtained manually. Use the `build_centos_deps_image.sh` script to build an image locally, or the `fetch_centos_deps_image.sh` script to fetch a pre-built image from an external source. Note that the latter script currently requires additional credentials not available to third-parties.
2 changes: 1 addition & 1 deletion benchmark_data_tools/generate_data_files.py
@@ -100,7 +100,7 @@ def generate_data_files_with_tpchgen(args):

# This dictionary maps each table to the number of partitions it should have based on its
# expected file size relative to the SF.
-# We generate a small sample bechmark (sf-0.01) to sample the ratio of how many rows are generated.
+# We generate a small sample benchmark (sf-0.01) to sample the ratio of how many rows are generated.
def get_table_sf_ratios(scale_factor, max_rows):
int_scale_factor = int(scale_factor)
int_scale_factor = 1 if int_scale_factor < 1 else int_scale_factor
2 changes: 1 addition & 1 deletion benchmark_data_tools/rewrite_parquet.py
@@ -79,7 +79,7 @@ def process_file(input_file_path, output_dir, input_dir, verbose, convert_decima

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Alter an exising directory of parquet files",
description="Alter an existing directory of parquet files",
formatter_class=argparse.RawDescriptionHelpFormatter
)

10 changes: 6 additions & 4 deletions presto/docker/presto_profiling_wrapper.sh
@@ -1,15 +1,17 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

set -e

if [[ "$PROFILE" == "ON" ]]; then
mkdir /presto_profiles

if [[ -z $PROFILE_ARGS ]]; then
PROFILE_ARGS="-t nvtx,cuda,osrt,ucx
--cuda-memory-usage=true
--cuda-um-cpu-page-faults=true
--cuda-um-gpu-page-faults=true
PROFILE_ARGS="-t nvtx,cuda,osrt,ucx
--cuda-memory-usage=true
--cuda-um-cpu-page-faults=true
--cuda-um-gpu-page-faults=true
--cudabacktrace=true"
fi
PROFILE_CMD="nsys launch $PROFILE_ARGS"
2 changes: 1 addition & 1 deletion presto/install_gperf.patch
@@ -4,7 +4,7 @@ index 4f0dfcd4c5..404e4ff76a 100755
+++ b/presto-native-execution/scripts/setup-centos.sh
@@ -38,7 +38,8 @@ function install_presto_deps_from_package_managers {
}

function install_gperf {
- wget ${WGET_OPTIONS} https://ftp.gnu.org/pub/gnu/gperf/gperf-3.1.tar.gz &&
+ wget ${WGET_OPTIONS} https://ftp.gnu.org/pub/gnu/gperf/gperf-3.1.tar.gz ||
2 changes: 1 addition & 1 deletion presto/pbench/benchmarks/tpch/sf100.json
@@ -28,5 +28,5 @@
"duckdb_queries/20.sql",
"duckdb_queries/21.sql",
"duckdb_queries/22.sql"
]
}