From 8208483ad0d517845914c0fd1d3e8acb5fc4ca0f Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Sun, 7 Dec 2025 15:59:46 -0500 Subject: [PATCH 1/2] ci: get Avik a tpu --- .buildkite/pipeline.yml | 170 ++++++++++++++++----------------- .github/workflows/CI.yml | 62 ++++++------ .github/workflows/CommonCI.yml | 139 +++------------------------ src/Reactant.jl | 2 + 4 files changed, 131 insertions(+), 242 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 064ef33235..fec92f1d91 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,94 +1,94 @@ -steps: - - group: ":test_tube: Tests" - steps: - - label: ":julia: :linux: CUDA Julia v{{matrix.version}} -- {{matrix.group}} -- {{matrix.runtime}}" - matrix: - setup: - version: - - "1.10" - group: - - core - - neural_networks - - integration - runtime: - - "PJRT" - - "IFRT" - plugins: - - JuliaCI/julia#v1: - version: "{{matrix.version}}" - - JuliaCI/julia-coverage#v1: - codecov: true - dirs: - - src - - ext - - lib/ReactantCore/src - commands: | - touch LocalPreferences.toml +# steps: +# - group: ":test_tube: Tests" +# steps: +# - label: ":julia: :linux: CUDA Julia v{{matrix.version}} -- {{matrix.group}} -- {{matrix.runtime}}" +# matrix: +# setup: +# version: +# - "1.10" +# group: +# - core +# - neural_networks +# - integration +# runtime: +# - "PJRT" +# - "IFRT" +# plugins: +# - JuliaCI/julia#v1: +# version: "{{matrix.version}}" +# - JuliaCI/julia-coverage#v1: +# codecov: true +# dirs: +# - src +# - ext +# - lib/ReactantCore/src +# commands: | +# touch LocalPreferences.toml - echo "[Reactant]" >> LocalPreferences.toml - echo "xla_runtime = \"{{matrix.runtime}}\"" >> LocalPreferences.toml +# echo "[Reactant]" >> LocalPreferences.toml +# echo "xla_runtime = \"{{matrix.runtime}}\"" >> LocalPreferences.toml - cat LocalPreferences.toml +# cat LocalPreferences.toml - julia --project=. -e 'println("--- :julia: Instantiating project") - using Pkg - Pkg.develop([PackageSpec(path="lib/ReactantCore")])' +# julia --project=. -e 'println("--- :julia: Instantiating project") +# using Pkg +# Pkg.develop([PackageSpec(path="lib/ReactantCore")])' - julia --project=. -e 'println("--- :julia: Run Tests") - using Pkg - Pkg.test(; coverage="user")' - agents: - queue: "juliagpu" - cuda: "*" - env: - REACTANT_TEST_GROUP: "{{matrix.group}}" - JULIA_DEBUG: "Reactant,Reactant_jll" - CUDA_VISIBLE_DEVICES: 0 - REACTANT_BACKEND_GROUP: "GPU" - if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 120 +# julia --project=. -e 'println("--- :julia: Run Tests") +# using Pkg +# Pkg.test(; coverage="user")' +# agents: +# queue: "juliagpu" +# cuda: "*" +# env: +# REACTANT_TEST_GROUP: "{{matrix.group}}" +# JULIA_DEBUG: "Reactant,Reactant_jll" +# CUDA_VISIBLE_DEVICES: 0 +# REACTANT_BACKEND_GROUP: "GPU" +# if: build.message !~ /\[skip tests\]/ +# timeout_in_minutes: 120 - # - label: ":julia: :linux: AMDGPU Julia v{{matrix.version}} -- {{matrix.group}} -- {{matrix.runtime}}" - # matrix: - # setup: - # version: - # - "1.10" - # group: - # - core - # - neural_networks - # - integration - # runtime: - # - "IFRT" - # plugins: - # - JuliaCI/julia#v1: - # version: "{{matrix.version}}" - # - JuliaCI/julia-coverage#v1: - # codecov: true - # dirs: - # - src - # - ext - # - lib/ReactantCore/src - # agents: - # queue: "juliagpu" - # rocm: "*" - # commands: | - # touch LocalPreferences.toml +# # - label: ":julia: :linux: AMDGPU Julia v{{matrix.version}} -- {{matrix.group}} -- {{matrix.runtime}}" +# # matrix: +# # setup: +# # version: +# # - "1.10" +# # group: +# # - core +# # - neural_networks +# # - integration +# # runtime: +# # - "IFRT" +# # plugins: +# # - JuliaCI/julia#v1: +# # version: "{{matrix.version}}" +# # - JuliaCI/julia-coverage#v1: +# # codecov: true +# # dirs: +# # - src +# # - ext +# # - lib/ReactantCore/src +# # agents: +# # queue: "juliagpu" +# # rocm: "*" +# # commands: | +# # touch LocalPreferences.toml - # echo "[Reactant]" >> LocalPreferences.toml - # echo "xla_runtime = \"{{matrix.runtime}}\"" >> LocalPreferences.toml +# # echo "[Reactant]" >> LocalPreferences.toml +# # echo "xla_runtime = \"{{matrix.runtime}}\"" >> LocalPreferences.toml - # cat LocalPreferences.toml +# # cat LocalPreferences.toml - # julia --project=. -e 'println("--- :julia: Instantiating project") - # using Pkg - # Pkg.develop([PackageSpec(path="lib/ReactantCore")])' +# # julia --project=. -e 'println("--- :julia: Instantiating project") +# # using Pkg +# # Pkg.develop([PackageSpec(path="lib/ReactantCore")])' - # julia --project=. -e 'println("--- :julia: Run Tests") - # using Pkg - # Pkg.test(; coverage="user")' - # env: - # REACTANT_TEST_GROUP: "{{matrix.group}}" - # JULIA_DEBUG: "Reactant,Reactant_jll" - # CUDA_VISIBLE_DEVICES: 0 - # if: build.message !~ /\[skip tests\]/ - # timeout_in_minutes: 120 +# # julia --project=. -e 'println("--- :julia: Run Tests") +# # using Pkg +# # Pkg.test(; coverage="user")' +# # env: +# # REACTANT_TEST_GROUP: "{{matrix.group}}" +# # JULIA_DEBUG: "Reactant,Reactant_jll" +# # CUDA_VISIBLE_DEVICES: 0 +# # if: build.message !~ /\[skip tests\]/ +# # timeout_in_minutes: 120 diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index b60f801cbc..cd39215f1c 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -34,32 +34,32 @@ jobs: fail-fast: false matrix: version: - - "1.10" + # - "1.10" - "1.11" # - 'nightly' os: - - ubuntu-24.04 + # - ubuntu-24.04 # `ubuntu-22.04-arm` is considered more stable than `ubuntu-24.04-arm`: # . - - ubuntu-22.04-arm + # - ubuntu-22.04-arm # Disable `macOS-13` until # is resolved. # - macOS-13 - - macOS-latest - - windows-latest + # - macOS-latest + # - windows-latest - linux-x86-ct6e-180-4tpu test_group: - core - - neural_networks - - integration + # - neural_networks + # - integration runtime: - - "pjrt" + # - "pjrt" - "ifrt" - exclude: - - os: linux-x86-ct6e-180-4tpu - version: "1.10" - - os: linux-x86-ct6e-180-4tpu - runtime: "pjrt" + # exclude: + # - os: linux-x86-ct6e-180-4tpu + # version: "1.10" + # - os: linux-x86-ct6e-180-4tpu + # runtime: "pjrt" uses: ./.github/workflows/CommonCI.yml with: julia_version: ${{ matrix.version }} @@ -86,21 +86,21 @@ jobs: # assertions: true # test_group: ${{ matrix.test_group }} - downgrade: - strategy: - fail-fast: false - matrix: - test_group: - - core - - neural_networks - - integration - runtime: - - "pjrt" - - "ifrt" - uses: ./.github/workflows/CommonCI.yml - with: - julia_version: "1.10" - os: "ubuntu-24.04" - runtime: ${{ matrix.runtime }} - test_group: ${{ matrix.test_group }} - downgrade_testing: true + # downgrade: + # strategy: + # fail-fast: false + # matrix: + # test_group: + # - core + # - neural_networks + # - integration + # runtime: + # - "pjrt" + # - "ifrt" + # uses: ./.github/workflows/CommonCI.yml + # with: + # julia_version: "1.10" + # os: "ubuntu-24.04" + # runtime: ${{ matrix.runtime }} + # test_group: ${{ matrix.test_group }} + # downgrade_testing: true diff --git a/.github/workflows/CommonCI.yml b/.github/workflows/CommonCI.yml index a87f7fc392..5ecd198d37 100644 --- a/.github/workflows/CommonCI.yml +++ b/.github/workflows/CommonCI.yml @@ -67,122 +67,22 @@ jobs: if: ${{ ! inputs.assertions }} with: version: ${{ inputs.julia_version }} - - uses: julia-actions/cache@v2 - id: julia-cache - with: - cache-name: julia-cache;workflow=${{ inputs.julia_version }}-${{ inputs.os }}-${{ inputs.runtime }}-assertions=${{ inputs.assertions }}-${{ github.event_name }}-${{ inputs.test_group }}-${{ inputs.downgrade_testing }}-${{ inputs.localjll }} - - - uses: julia-actions/julia-downgrade-compat@v2 - if: ${{ inputs.downgrade_testing }} - with: - skip: "ReactantCore" - julia_version: ${{ inputs.julia_version }} - - # Local build of libReactant - - uses: bazel-contrib/setup-bazel@0.15.0 - if: ${{ inputs.localjll }} - name: Set up Bazel - with: - # Avoid downloading Bazel every time. - bazelisk-cache: true - # Store build cache per workflow. - disk-cache: ${{ github.workflow }}-${{ inputs.os }}-${{ inputs.julia_version }} - # Share repository cache between workflows. - repository-cache: true - bazelisk-version: 1.x - - name: Prepare build on macOS - if: ${{ startsWith(inputs.os, 'macOS-') && inputs.localjll }} - run: | - echo "SDKROOT=$(xcrun --show-sdk-path)" >> "${GITHUB_ENV}" - - name: Install numpy - if: ${{ startsWith(inputs.os, 'macOS-') && inputs.localjll }} - run: | - python -m pip install numpy - - name: Build libReactant - timeout-minutes: 120 - if: ${{ inputs.localjll }} - run: | - julia --color=yes --project=deps -e 'using Pkg; Pkg.instantiate()' - julia --color=yes --project=deps deps/build_local.jl --cc clang - cp LocalPreferences.toml test/ - - # Compile Julia if assertions are enabled - - uses: actions/checkout@v6 - if: ${{ inputs.assertions }} - with: - repository: "JuliaLang/julia" - ref: release-${{ inputs.julia_version }} - path: "julia" - - name: Compile Julia - if: ${{ inputs.assertions }} - run: | - sed -i.bak 's/exit 2/exit 0/g' julia/deps/tools/jlchecksum - make -C julia -j $(nproc) FORCE_ASSERTIONS=1 LLVM_ASSERTIONS=1 JULIA_PRECOMPILE=0 - echo $PWD/julia/usr/bin >> $GITHUB_PATH - # Install dependencies (specifically ReactantCore subdirectory for 1.10) - - name: "Install Dependencies" + - name: Prepare runtime directory for Upterm socket run: | - import Pkg - Pkg.Registry.update() - # Install packages present in subdirectories - dev_pks = Pkg.PackageSpec[] - for path in ("lib/ReactantCore",) - push!(dev_pks, Pkg.PackageSpec(; path)) - end - Pkg.develop(dev_pks) - shell: julia --color=yes --code-coverage=user --depwarn=yes --project=. {0} - # Only in Julia v1.10 we need to install `ReactantCore` manually. - if: ${{ inputs.julia_version == '1.10' || inputs.julia_version == 'lts' }} + export XDG_RUNTIME_DIR="/run/user/0" + mkdir -p "$XDG_RUNTIME_DIR/upterm" + chmod -R 700 "$XDG_RUNTIME_DIR" - # Run the tests (pjrt or ifrt or both) - - name: "Setup Runtime Preferences (PJRT)" - if: ${{ inputs.runtime == 'pjrt' || inputs.runtime == 'both' }} - uses: "DamianReeves/write-file-action@master" - with: - path: "LocalPreferences.toml" - write-mode: "overwrite" - contents: | - [Reactant] - xla_runtime = "PJRT" - - name: "Run Tests (PJRT)" - if: ${{ inputs.runtime == 'pjrt' || inputs.runtime == 'both' }} - timeout-minutes: 120 + - name: Install tmux run: | - import Pkg - Pkg.Registry.update() - Pkg.test(; - coverage="user", - allow_reresolve=parse(Bool, get(ENV, "ALLOW_RERESOLVE", "true")) - ) - shell: julia --color=yes --code-coverage=user --depwarn=yes --project=. {0} - env: - ALLOW_RERESOLVE: ${{ !inputs.downgrade_testing }} - REACTANT_TEST_GROUP: ${{ inputs.test_group }} - - - name: "Setup Runtime Preferences (IFRT)" - if: ${{ inputs.runtime == 'ifrt' || inputs.runtime == 'both' }} - uses: "DamianReeves/write-file-action@master" + apt-get update && apt-get -y install tmux + - name: Setup upterm session + uses: owenthereal/action-upterm@v1 with: - path: "LocalPreferences.toml" - write-mode: "overwrite" - contents: | - [Reactant] - xla_runtime = "IFRT" - - name: "Run Tests (IFRT)" - if: ${{ inputs.runtime == 'ifrt' || inputs.runtime == 'both' }} - timeout-minutes: 120 - run: | - import Pkg - Pkg.Registry.update() - Pkg.test(; - coverage="user", - allow_reresolve=parse(Bool, get(ENV, "ALLOW_RERESOLVE", "true")) - ) - shell: julia --color=yes --code-coverage=user --depwarn=yes --project=. {0} - env: - ALLOW_RERESOLVE: ${{ !inputs.downgrade_testing }} - REACTANT_TEST_GROUP: ${{ inputs.test_group }} + limit-access-to-actor: true + limit-access-to-users: avik-pal + wait-timeout-minutes: 30 - name: "Upload MLIR modules" uses: actions/upload-artifact@v5 @@ -190,23 +90,10 @@ jobs: if: always() with: name: "mlir-${{ inputs.julia_version }}-${{ inputs.os }}-${{ inputs.runtime }}-assertions=${{ inputs.assertions }}-${{ github.event_name }}-test_group=${{ inputs.test_group }}-${{ inputs.localjll }}" - path: "**/*.mlir" - retention-days: 90 + path: "**/*.zip" + retention-days: 5 overwrite: false - - name: Save Julia depot cache on cancel or failure - if: cancelled() || failure() - uses: actions/cache/save@v4 - with: - path: | - ${{ steps.julia-cache.outputs.cache-paths }} - key: ${{ steps.julia-cache.outputs.cache-key }} - - - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v5 - with: - files: lcov.info - token: ${{ secrets.CODECOV_TOKEN }} env: JULIA_PKG_SERVER_REGISTRY_PREFERENCE: eager diff --git a/src/Reactant.jl b/src/Reactant.jl index e164886fb9..f2bab91c2e 100644 --- a/src/Reactant.jl +++ b/src/Reactant.jl @@ -30,6 +30,8 @@ using EnzymeCore: ReverseMode, ForwardMode +@info "Reactant.jl is loaded" + export allowscalar, @allowscalar # re-exported from GPUArraysCore is_extension_loaded(::Val) = false From 83c05af88dd7397b7592bfe14b7919ca51c2e0ed Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Sun, 7 Dec 2025 18:28:30 -0500 Subject: [PATCH 2/2] perf: generate xprof dumps --- .github/workflows/CommonCI.yml | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/.github/workflows/CommonCI.yml b/.github/workflows/CommonCI.yml index 5ecd198d37..2a6d3e930b 100644 --- a/.github/workflows/CommonCI.yml +++ b/.github/workflows/CommonCI.yml @@ -52,6 +52,9 @@ jobs: if: ${{ startsWith(inputs.os, 'ubuntu-') && inputs.localjll }} - uses: actions/checkout@v6 + with: + repository: LuxDL/Lux.jl + ref: ap/dump_performance_numbers - name: Set TMPDIR and create directory # We have to use `${GITHUB_WORKSPACE}` instead of `github.workspace` because GitHub @@ -74,15 +77,21 @@ jobs: mkdir -p "$XDG_RUNTIME_DIR/upterm" chmod -R 700 "$XDG_RUNTIME_DIR" - - name: Install tmux + - name: Run and dump xprof run: | - apt-get update && apt-get -y install tmux - - name: Setup upterm session - uses: owenthereal/action-upterm@v1 - with: - limit-access-to-actor: true - limit-access-to-users: avik-pal - wait-timeout-minutes: 30 + julia --threads=auto examples/Qwen3/generate_dumps.jl + zip -r traces.zip examples/Qwen3/traces + + # - name: Install tmux + # run: | + # apt-get update && apt-get -y install tmux + + # - name: Setup upterm session + # uses: owenthereal/action-upterm@v1 + # with: + # limit-access-to-actor: true + # limit-access-to-users: avik-pal + # wait-timeout-minutes: 30 - name: "Upload MLIR modules" uses: actions/upload-artifact@v5