diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
new file mode 100644
index 0000000000..f58d87d112
--- /dev/null
+++ b/.github/workflows/build_and_test.yml
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Reusable workflow invoked by build_main.yml. Modelled on Apache Spark's
+# build_and_test.yml: the heavy work runs wherever the calling push lands,
+# i.e. on the contributor's fork. Delegates the test matrix to the umbrella
+# ci.yml workflow, which fans out to the per-Spark-version reusables.
+
+name: Build and test
+
+on:
+ workflow_call:
+
+jobs:
+ ci:
+ name: CI
+ uses: ./.github/workflows/ci.yml
+ secrets: inherit
+ with:
+ # Force every gated job to run on fork-CI: contributor pushes don't necessarily
+ # touch paths that match ci.yml's filters (e.g. when iterating on the fork-CI
+ # workflow files themselves), so honour the Spark-style "run the full matrix"
+ # contract instead of skipping silently.
+ force_all: true
diff --git a/.github/workflows/build_main.yml b/.github/workflows/build_main.yml
new file mode 100644
index 0000000000..dcc0f41ef6
--- /dev/null
+++ b/.github/workflows/build_main.yml
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Modelled on Apache Spark's build_main.yml. Triggers on push to any branch,
+# which means contributor pushes to a fork run CI on the fork's Actions
+# minutes/runners rather than on apache/datafusion-comet. The bridge to the
+# upstream PR check is notify_test_workflow.yml.
+
+name: "Build"
+
+on:
+ push:
+ branches:
+ - '**'
+
+jobs:
+ call-build-and-test:
+ permissions:
+ packages: write
+ name: Run
+ uses: ./.github/workflows/build_and_test.yml
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b5c13261c2..e50cb6b6f6 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -22,16 +22,30 @@
name: CI
concurrency:
- group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
+ # Use github.ref (branch ref) instead of github.sha for the push-event fallback
+ # so consecutive pushes to the same branch cancel each other; sha would make
+ # every push a new group and stack runs up.
+ group: ${{ github.repository }}-${{ github.head_ref || github.ref }}-${{ github.workflow }}
cancel-in-progress: true
+# CI DISABLED 2026-05-31 — auto-triggers commented out. Active triggers:
+# * workflow_dispatch — manual runs
+# * workflow_call — invoked by build_and_test.yml on contributor forks
+# To rollback: restore the commented `pull_request` and `push` blocks below.
on:
- pull_request:
- types: [opened, synchronize, reopened, labeled]
- push:
- branches:
- - main
workflow_dispatch:
+ workflow_call:
+ inputs:
+ force_all:
+ description: 'Force-enable every gated job (used by fork-CI so the full matrix runs regardless of which paths changed).'
+ required: false
+ type: boolean
+ default: false
+ # pull_request:
+ # types: [opened, synchronize, reopened, labeled]
+ # push:
+ # branches:
+ # - main
jobs:
# ---------------------------------------------------------------------------
@@ -43,7 +57,7 @@ jobs:
# ---------------------------------------------------------------------------
preflight:
name: Preflight
- runs-on: ubuntu-slim
+ runs-on: ${{ github.repository == 'apache/datafusion-comet' && 'ubuntu-slim' || 'ubuntu-24.04' }}
steps:
- uses: actions/checkout@v6
@@ -89,7 +103,7 @@ jobs:
changes:
name: Detect changes
needs: preflight
- runs-on: ubuntu-slim
+ runs-on: ${{ github.repository == 'apache/datafusion-comet' && 'ubuntu-slim' || 'ubuntu-24.04' }}
outputs:
build_linux: ${{ steps.compute.outputs.build_linux }}
build_macos: ${{ steps.compute.outputs.build_macos }}
@@ -118,9 +132,10 @@ jobs:
PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
PUSH_BEFORE: ${{ github.event.before }}
PUSH_AFTER: ${{ github.sha }}
+ FORCE_ALL: ${{ inputs.force_all }}
run: |
set -euo pipefail
- if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
+ if [[ "$EVENT_NAME" == "workflow_dispatch" || "$FORCE_ALL" == "true" ]]; then
for key in build_linux build_macos benchmark docs spark_3_4 spark_3_5 spark_4_0 spark_4_1 iceberg_1_8 iceberg_1_9 iceberg_1_10; do
echo "${key}=true" >> "$GITHUB_OUTPUT"
done
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 76c166bc3b..0cc8f376c2 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -22,13 +22,16 @@ concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
+# CI DISABLED 2026-05-31 — auto-triggers commented out, only manual workflow_dispatch remains.
+# To rollback: remove `workflow_dispatch:` and restore the commented `push`/`pull_request`/`schedule` blocks below.
on:
- push:
- branches: [ "main" ]
- pull_request:
- branches: [ "main" ]
- schedule:
- - cron: '16 4 * * 1'
+ workflow_dispatch:
+ # push:
+ # branches: [ "main" ]
+ # pull_request:
+ # branches: [ "main" ]
+ # schedule:
+ # - cron: '16 4 * * 1'
permissions:
contents: read
diff --git a/.github/workflows/images/workflow-enable-button.png b/.github/workflows/images/workflow-enable-button.png
new file mode 100644
index 0000000000..f7299f233a
Binary files /dev/null and b/.github/workflows/images/workflow-enable-button.png differ
diff --git a/.github/workflows/label_new_issues.yml b/.github/workflows/label_new_issues.yml
index bbb4fb150d..01e263d0e2 100644
--- a/.github/workflows/label_new_issues.yml
+++ b/.github/workflows/label_new_issues.yml
@@ -17,9 +17,12 @@
name: Label new issues with requires-triage
+# CI DISABLED 2026-05-31 — auto-trigger commented out, only manual workflow_dispatch remains.
+# To rollback: remove `workflow_dispatch:` and restore the commented `issues` block below.
on:
- issues:
- types: [opened]
+ workflow_dispatch:
+ # issues:
+ # types: [opened]
permissions:
issues: write
diff --git a/.github/workflows/miri.yml b/.github/workflows/miri.yml
index ad318888c3..7d711225e3 100644
--- a/.github/workflows/miri.yml
+++ b/.github/workflows/miri.yml
@@ -21,10 +21,12 @@ concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
+# CI DISABLED 2026-05-31 — nightly schedule commented out, only manual workflow_dispatch remains.
+# To rollback: restore the commented `schedule` block below.
on:
# nightly safety check
- schedule:
- - cron: '0 4 * * *'
+ # schedule:
+ # - cron: '0 4 * * *'
# manual trigger
# https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
workflow_dispatch:
diff --git a/.github/workflows/notify_test_workflow.yml b/.github/workflows/notify_test_workflow.yml
new file mode 100644
index 0000000000..99be466956
--- /dev/null
+++ b/.github/workflows/notify_test_workflow.yml
@@ -0,0 +1,171 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Intentionally has a general name.
+# because the test status check created in GitHub Actions
+# currently randomly picks any associated workflow.
+# So, the name was changed to make sense in that context too.
+# See also https://github.community/t/specify-check-suite-when-creating-a-checkrun/118380/10
+name: On pull request update
+on:
+ pull_request_target:
+ types: [opened, reopened, synchronize]
+
+jobs:
+ notify:
+ name: Notify test workflow
+ runs-on: ubuntu-slim
+ permissions:
+ actions: read
+ checks: write
+ steps:
+ - name: "Notify test workflow"
+ uses: actions/github-script@v9
+ with:
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+ script: |
+ const endpoint = 'GET /repos/:owner/:repo/actions/workflows/:id/runs?&branch=:branch'
+ const check_run_endpoint = 'GET /repos/:owner/:repo/commits/:ref/check-runs?per_page=100'
+
+ // TODO: Should use pull_request.user and pull_request.user.repos_url?
+ // If a different person creates a commit to another forked repo,
+ // it wouldn't be able to detect.
+ const params = {
+ owner: context.payload.pull_request.head.repo.owner.login,
+ repo: context.payload.pull_request.head.repo.name,
+ id: 'build_main.yml',
+ branch: context.payload.pull_request.head.ref,
+ }
+ const check_run_params = {
+ owner: context.payload.pull_request.head.repo.owner.login,
+ repo: context.payload.pull_request.head.repo.name,
+ ref: context.payload.pull_request.head.ref,
+ }
+
+ console.log('Ref: ' + context.payload.pull_request.head.ref)
+ console.log('SHA: ' + context.payload.pull_request.head.sha)
+
+ // Wait 3 seconds to make sure the fork repository triggered a workflow.
+ await new Promise(r => setTimeout(r, 3000))
+
+ let runs
+ try {
+ runs = await github.request(endpoint, params)
+ } catch (error) {
+ console.error(error)
+ // Assume that runs were not found.
+ }
+
+ const name = 'Build'
+ const head_sha = context.payload.pull_request.head.sha
+ let status = 'queued'
+
+ if (!runs || runs.data.workflow_runs.length === 0) {
+ status = 'completed'
+ const conclusion = 'action_required'
+
+ github.rest.checks.create({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ name: name,
+ head_sha: head_sha,
+ status: status,
+ conclusion: conclusion,
+ output: {
+ title: 'Workflow run detection failed',
+ summary: `
+ Unable to detect the workflow run for testing the changes in your PR.
+
+ 1. If you did not enable GitHub Actions in your forked repository, please enable it by clicking the button as shown in the image below. See also [Managing Github Actions Settings for a repository](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/enabling-features-for-your-repository/managing-github-actions-settings-for-a-repository) for more details.
+ 2. It is possible your branch is based on the old \`main\` branch in Apache Datafusion Comet, please sync your branch to the latest main branch. For example as below:
+ \`\`\`bash
+ git fetch upstream
+ git rebase upstream/main
+ git push origin YOUR_BRANCH --force
+ \`\`\``,
+ images: [
+ {
+ alt: 'enabling workflows button',
+ image_url: 'https://raw.githubusercontent.com/apache/datafusion-comet/main/.github/workflows/images/workflow-enable-button.png'
+ }
+ ]
+ }
+ })
+ } else {
+ const run_id = runs.data.workflow_runs[0].id
+
+ if (runs.data.workflow_runs[0].head_sha != context.payload.pull_request.head.sha) {
+ throw new Error('There was a new unsynced commit pushed. Please retrigger the workflow.');
+ }
+
+ // Here we get check run ID to provide Check run view instead of Actions view, see also SPARK-37879.
+ let retryCount = 0;
+ let check_run_head;
+ while (retryCount < 3) {
+ const check_runs = await github.request(check_run_endpoint, check_run_params);
+ check_run_head = check_runs.data.check_runs.find(r => r.name === "Run / CI / Preflight");
+ if (check_run_head) {
+ break;
+ }
+ retryCount++;
+ if (retryCount < 3) {
+ await new Promise(resolve => setTimeout(resolve, 3000));
+ }
+ }
+ if (!check_run_head) {
+ throw new Error('Failed to retrieve check_run_head after 3 attempts');
+ }
+
+ if (check_run_head.head_sha != context.payload.pull_request.head.sha) {
+ throw new Error('There was a new unsynced commit pushed. Please retrigger the workflow.');
+ }
+
+ const check_run_url = 'https://github.com/'
+ + context.payload.pull_request.head.repo.full_name
+ + '/runs/'
+ + check_run_head.id
+ console.log('Check run URL: ' + check_run_url)
+
+ const actions_url = 'https://github.com/'
+ + context.payload.pull_request.head.repo.full_name
+ + '/actions/runs/'
+ + run_id
+ console.log('Actions URL: ' + actions_url)
+
+ github.rest.checks.create({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ name: name,
+ head_sha: head_sha,
+ status: status,
+ output: {
+ title: 'Test results',
+ summary: '[See test results](' + check_run_url + ')\n\n'
+ + 'If the tests fail for reasons unrelated to this pull request, '
+ + 'please rerun the workflow in your forked repository.\n'
+ + 'If the failures are related to this pull request, '
+ + 'please investigate them and push follow-up changes.',
+ text: JSON.stringify({
+ owner: context.payload.pull_request.head.repo.owner.login,
+ repo: context.payload.pull_request.head.repo.name,
+ run_id: run_id
+ })
+ },
+ details_url: actions_url,
+ })
+ }
\ No newline at end of file
diff --git a/.github/workflows/pr_title_check.yml b/.github/workflows/pr_title_check.yml
index fe3674acb4..46e97df5c5 100644
--- a/.github/workflows/pr_title_check.yml
+++ b/.github/workflows/pr_title_check.yml
@@ -21,9 +21,12 @@ concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
+# CI DISABLED 2026-05-31 — auto-trigger commented out, only manual workflow_dispatch remains.
+# To rollback: remove `workflow_dispatch:` and restore the commented `pull_request` block below.
on:
- pull_request:
- types: [opened, edited, reopened]
+ workflow_dispatch:
+ # pull_request:
+ # types: [opened, edited, reopened]
jobs:
check-pr-title:
diff --git a/.github/workflows/spark_sql_test_reusable.yml b/.github/workflows/spark_sql_test_reusable.yml
index 4e89cda860..610806534d 100644
--- a/.github/workflows/spark_sql_test_reusable.yml
+++ b/.github/workflows/spark_sql_test_reusable.yml
@@ -121,6 +121,21 @@ jobs:
image: amd64/rust
steps:
- uses: actions/checkout@v6
+ - name: Free up disk space (container)
+ # Mirror apache/spark's dev/free_disk_space_container. The amd64/rust
+ # container inherits GitHub's tool-cache via the /__t mount; strip the
+ # entries we never use (CodeQL, Go, Node) to free ~5-10 GB before the
+ # Spark clone, Maven dep tree, sbt build cache, and per-test-suite
+ # warehouse dirs all start fighting for the runner's disk.
+ shell: bash
+ run: |
+ echo "Disk usage before cleanup:"
+ df -h || true
+ rm -rf /__t/CodeQL || true
+ rm -rf /__t/go || true
+ rm -rf /__t/node || true
+ echo "Disk usage after cleanup:"
+ df -h || true
- name: Setup Rust & Java toolchain
uses: ./.github/actions/setup-builder
with:
@@ -152,9 +167,10 @@ jobs:
if [ "${{ inputs.spark-short }}" != "4.0" ] || [ "${{ inputs.java }}" != "21" ]; then
export SERIAL_SBT_TESTS=1
fi
- # Cap parallel forked test JVMs at 1 so that even when
- # SparkParallelTestGrouping is enabled we don't blow the
- # 7 GB runner budget (each forked test JVM has -Xmx2g).
+ # Cap parallel forked test JVMs at 1: on a 7 GB runner the 3 GB SBT
+ # controller and a single 5 GB forked test JVM (HEAP_SIZE override
+ # below) already over-commit RAM; any concurrency on top would push
+ # us straight into OOMs.
NOLINT_ON_COMPILE=true ENABLE_COMET=true ENABLE_COMET_ONHEAP=true ENABLE_COMET_LOG_FALLBACK_REASONS=${{ inputs.collect-fallback-logs }} \
build/sbt -Dsbt.log.noformat=true -mem $SBT_MEM \
'set Global / concurrentRestrictions := Seq(Tags.limit(Tags.ForkedTestGroup, 1))' \
@@ -164,17 +180,35 @@ jobs:
fi
env:
LC_ALL: "C.UTF-8"
- # Standard GitHub runners have 7 GB RAM; cap SBT heap so forked test
- # JVMs fit alongside it.
SBT_MEM: "3072"
- # Mirror Spark's own JDK 21 / 25 CI workaround. apache/spark's
- # build_java21.yml and build_java25.yml set this same env var to
- # process-isolate the V1/V2 Parquet and Orc source suites because
- # they exhibit cross-suite resource interactions (file-stream and
- # thread leaks) under the newer JDKs. project/SparkBuild.scala
- # reads DEDICATED_JVM_SBT_TESTS and forks a separate JVM per
- # listed suite. Empty value is a safe no-op.
- DEDICATED_JVM_SBT_TESTS: ${{ inputs.spark-short == '4.0' && 'org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV1Suite,org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV2Suite,org.apache.spark.sql.execution.datasources.orc.OrcSourceV1Suite,org.apache.spark.sql.execution.datasources.orc.OrcSourceV2Suite' || '' }}
+ HEAP_SIZE: "3g"
+ # Cap forked test JVM direct memory. Without this, -XX:MaxDirectMemorySize
+ # defaults to ≈ -Xmx (5 GB), letting Netty / DirectByteBuffer / shuffle
+ # buffers grow alongside the heap and push host RSS past the 7 GB
+ # runner ceiling. JAVA_TOOL_OPTIONS is honoured by every child JVM;
+ # SBT picks it up too but allocates negligible direct memory itself.
+ JAVA_TOOL_OPTIONS: "-XX:MaxDirectMemorySize=1g"
+ # Force these suites into their own forked JVMs so per-suite state
+ # (codegen classes, native pools, shuffle buffers) is reclaimed on
+ # JVM exit instead of accumulating across a single testOnly run:
+ # * AdaptiveQueryExecSuite — long, codegen- and shuffle-heavy;
+ # observed to SIGKILL the host on 7 GB runners after ~10 tests.
+ # * Parquet/Orc V1/V2 source suites — Spark 4.0 only; mirrors
+ # apache/spark's build_java21.yml / build_java25.yml workaround
+ # for cross-suite file-stream / thread leaks under newer JDKs.
+ # SparkBuild.scala reads this env var and installs the testGrouping
+ # regardless of SERIAL_SBT_TESTS, so listed suites still get their
+ # own JVM in serial-test mode (only run policy changes).
+ DEDICATED_JVM_SBT_TESTS: ${{ inputs.spark-short == '4.0' && 'org.apache.spark.sql.execution.adaptive.AdaptiveQueryExecSuite,org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV1Suite,org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV2Suite,org.apache.spark.sql.execution.datasources.orc.OrcSourceV1Suite,org.apache.spark.sql.execution.datasources.orc.OrcSourceV2Suite' || 'org.apache.spark.sql.execution.adaptive.AdaptiveQueryExecSuite' }}
+ - name: Dump kernel + memory state on failure
+ if: failure()
+ shell: bash
+ run: |
+ echo '== /proc/meminfo =='; cat /proc/meminfo 2>/dev/null || true
+ echo '== /proc/swaps =='; cat /proc/swaps 2>/dev/null || true
+ echo '== top RSS =='; ps -eo pid,rss,cmd --sort=-rss 2>/dev/null | head -20 || true
+ echo '== dmesg (best effort) ==';
+ dmesg -T 2>/dev/null | tail -200 || cat /var/log/kern.log 2>/dev/null | tail -200 || echo 'kernel log unavailable inside container'
- name: Upload fallback log
if: ${{ inputs.collect-fallback-logs }}
uses: actions/upload-artifact@v7
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index 7ad3a4c011..6010856602 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -16,9 +16,12 @@
# under the License.
name: "Close stale PRs"
+# CI DISABLED 2026-05-31 — schedule commented out, only manual workflow_dispatch remains.
+# To rollback: remove `workflow_dispatch:` and restore the commented `schedule` block below.
on:
- schedule:
- - cron: "30 1 * * *"
+ workflow_dispatch:
+ # schedule:
+ # - cron: "30 1 * * *"
jobs:
close-stale-prs:
diff --git a/.github/workflows/take.yml b/.github/workflows/take.yml
index 9e84da8da6..45ae416d64 100644
--- a/.github/workflows/take.yml
+++ b/.github/workflows/take.yml
@@ -16,9 +16,12 @@
# under the License.
name: Assign/unassign the issue via `take` or `untake` comment
+# CI DISABLED 2026-05-31 — auto-trigger commented out, only manual workflow_dispatch remains.
+# To rollback: remove `workflow_dispatch:` and restore the commented `issue_comment` block below.
on:
- issue_comment:
- types: created
+ workflow_dispatch:
+ # issue_comment:
+ # types: created
permissions:
issues: write
diff --git a/.github/workflows/update_build_status.yml b/.github/workflows/update_build_status.yml
new file mode 100644
index 0000000000..c7c66ccb08
--- /dev/null
+++ b/.github/workflows/update_build_status.yml
@@ -0,0 +1,111 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: Update build status workflow
+
+on:
+ schedule:
+ - cron: "*/15 * * * *"
+
+jobs:
+ update:
+ name: Update build status
+ runs-on: ubuntu-22.04
+ permissions:
+ actions: read
+ checks: write
+ steps:
+ - name: "Update build status"
+ uses: actions/github-script@v9
+ with:
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+ script: |
+ const endpoint = 'GET /repos/:owner/:repo/pulls?state=:state'
+ const params = {
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ state: 'open'
+ }
+
+ // See https://docs.github.com/en/graphql/reference/enums#mergestatestatus
+ const maybeReady = ['behind', 'clean', 'draft', 'has_hooks', 'unknown', 'unstable'];
+
+ // Iterate open PRs
+ for await (const prs of github.paginate.iterator(endpoint,params)) {
+ // Each page
+ for await (const pr of prs.data) {
+ console.log('SHA: ' + pr.head.sha)
+ console.log(' Mergeable status: ' + pr.mergeable_state)
+ if (pr.mergeable_state == null || maybeReady.includes(pr.mergeable_state)) {
+ const checkRuns = await github.request('GET /repos/{owner}/{repo}/commits/{ref}/check-runs', {
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ ref: pr.head.sha
+ })
+
+ // Iterate GitHub Checks in the PR
+ for await (const cr of checkRuns.data.check_runs) {
+ if (cr.name == 'Build' && cr.conclusion != "action_required") {
+ // text contains parameters to make request in JSON.
+ const params = JSON.parse(cr.output.text)
+
+ // Get the workflow run in the forked repository
+ let run
+ try {
+ run = await github.request('GET /repos/{owner}/{repo}/actions/runs/{run_id}', params)
+ } catch (error) {
+ console.error(error)
+ // Run not found. This can happen when the PR author removes GitHub Actions runs or
+ // disables GitHub Actions.
+ continue
+ }
+
+ // Keep syncing the status of the checks
+ if (run.data.status == 'completed') {
+ console.log(' Run ' + cr.id + ': set status (' + run.data.status + ') and conclusion (' + run.data.conclusion + ')')
+ const response = await github.request('PATCH /repos/{owner}/{repo}/check-runs/{check_run_id}', {
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ check_run_id: cr.id,
+ output: cr.output,
+ status: run.data.status,
+ conclusion: run.data.conclusion,
+ details_url: run.data.details_url
+ })
+ } else {
+ // PATCH /check-runs accepts only queued | in_progress | completed,
+ // but workflow_run may also report requested / waiting / pending.
+ const status = run.data.status == 'in_progress' ? 'in_progress' : 'queued'
+ console.log(' Run ' + cr.id + ': set status (' + run.data.status + ' -> ' + status + ')')
+ const response = await github.request('PATCH /repos/{owner}/{repo}/check-runs/{check_run_id}', {
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ check_run_id: cr.id,
+ output: cr.output,
+ status: status,
+ details_url: run.data.details_url
+ })
+ }
+
+ break
+ }
+ }
+ }
+ }
+ }
\ No newline at end of file
diff --git a/dev/diffs/4.1.2.diff b/dev/diffs/4.1.2.diff
index f9911ce56d..580d0bcd8f 100644
--- a/dev/diffs/4.1.2.diff
+++ b/dev/diffs/4.1.2.diff
@@ -1662,6 +1662,28 @@ index ede5d285932..c9a8abb5a94 100644
import testImplicits._
before {
+diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionAnsiErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionAnsiErrorsSuite.scala
+index 64bb5a289b3..13fc3c6b0ef 100644
+--- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionAnsiErrorsSuite.scala
++++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionAnsiErrorsSuite.scala
+@@ -20,6 +20,7 @@ import org.apache.spark._
+ import org.apache.spark.SparkBuildInfo
+ import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart}
+ import org.apache.spark.sql.QueryTest
++import org.apache.spark.sql.IgnoreComet
+ import org.apache.spark.sql.catalyst.expressions.{CaseWhen, Cast, CheckOverflowInTableInsert, ExpressionProxy, Literal, SubExprEvaluationRuntime}
+ import org.apache.spark.sql.catalyst.plans.logical.OneRowRelation
+ import org.apache.spark.sql.classic.SparkSession
+@@ -286,7 +287,8 @@ class QueryExecutionAnsiErrorsSuite extends QueryTest
+ )
+ }
+
+- test("INVALID_DATETIME_PATTERN with non-constant pattern") {
++ test("INVALID_DATETIME_PATTERN with non-constant pattern",
++ IgnoreComet("Comet exception type mismatch")) {
+ withTable("patterns") {
+ sql("create table patterns(pattern string) using parquet")
+ sql("insert into patterns values ('yyyyMMddHHMIss')")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala
index fcecaf25d4c..e5a511022cc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala
diff --git a/docs/source/contributor-guide/development.md b/docs/source/contributor-guide/development.md
index 3785358cf3..31dff866b0 100644
--- a/docs/source/contributor-guide/development.md
+++ b/docs/source/contributor-guide/development.md
@@ -660,6 +660,52 @@ make # Build everything and update generated docs
make test # Run tests (optional but recommended)
```
+## Forked-Branch CI
+
+Comet runs PR CI on the contributor's fork rather than on `apache/datafusion-comet`. The heavy build/test matrix consumes the fork owner's GitHub Actions minutes and runners; the upstream repo only hosts a thin `Build` check that links to the fork's run. The design mirrors Apache Spark's fork-CI bridge.
+
+### Flow
+
+```mermaid
+flowchart TD
+ subgraph fork["Forked repo (contributor)"]
+ P([push to any branch]) --> BM["build_main.yml — job: Run"]
+ BM -->|workflow_call| BAT["build_and_test.yml (reusable):
ci.yml fan-out, force_all=true"]
+ BAT --> CR["check runs:
Run / CI / Preflight, per-Spark-version matrix, ..."]
+ end
+
+ subgraph up["Upstream repo (apache/datafusion-comet)"]
+ PRT([pull_request_target]) --> N["notify_test_workflow.yml"]
+ SCH([schedule: every 15 min]) --> U["update_build_status.yml"]
+ N -->|create| B(["Build check on PR"])
+ U -->|"PATCH status / conclusion"| B
+ end
+
+ BM -.->|"① find run (id: build_main.yml)"| N
+ CR -.->|"② link check-run view"| N
+ BM -.->|"③ poll run status"| U
+```
+
+### Workflow responsibilities
+
+- **`build_main.yml`** (fork) — triggers on `push` to any branch and calls `build_and_test.yml`. Contributor pushes to their fork run here, on the fork's runners.
+- **`build_and_test.yml`** (fork, reusable) — invokes `ci.yml` with `force_all: true`, so the full per-Spark-version matrix runs even when changed paths don't match `ci.yml`'s filters (e.g. when iterating on the workflow files themselves).
+- **`notify_test_workflow.yml`** (upstream) — runs on `pull_request_target` for opened/reopened/synchronize. Looks up the matching `build_main.yml` run on the PR head's fork+branch, then creates a `Build` check on the PR pointing at the fork's check-run view (`Run / CI / Preflight`). If the run can't be found, it reports `action_required` with instructions for the contributor to enable GitHub Actions on their fork.
+- **`update_build_status.yml`** (upstream) — runs on a 15-minute cron, walks open PRs, and `PATCH`es each `Build` check's status/conclusion from the corresponding fork run. This is what eventually flips the upstream check from "queued" to "success/failure".
+
+### Contributor checklist
+
+If the upstream `Build` check fails with **"Workflow run detection failed"**:
+
+1. Ensure GitHub Actions is enabled on your fork (Settings → Actions → "Allow all actions").
+2. Rebase onto the latest upstream `main` and force-push — `notify_test_workflow.yml` looks up the run by `(fork owner, fork repo, branch, head sha)`, and a stale base can prevent the match:
+ ```sh
+ git fetch upstream
+ git rebase upstream/main
+ git push origin --force
+ ```
+3. If the fork's run is healthy but the upstream check stays "queued" longer than ~15 minutes, the cron job in `update_build_status.yml` may have skipped a cycle — pushing a follow-up commit re-triggers `notify_test_workflow.yml` immediately.
+
## How to format `.md` document
We are using `prettier` to format `.md` files.