Debug Test #19

Workflow file for this run

	name: "Debug Test"

	# Manual trigger only: workflow_dispatch is the sole active event.
	# The pull_request / push triggers are kept below, commented out, and must
	# stay nested under `on:` if they are ever re-enabled.
	on:
	  workflow_dispatch:
	  # pull_request:
	  # push:
	  #   branches: [multinode-firstclass]


	jobs:
	  # Generates the dsr1 multi-node sweep configuration and exposes it as a job
	  # output that the benchmark job expands into a matrix via fromJson().
	  get-dsr1-configs:
	    runs-on: ubuntu-latest
	    outputs:
	      multi-node-search-space-config: ${{ steps.get-dsr1-configs.outputs.multi-node-search-space-config }}
	    steps:
	      - name: Checkout code
	        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

	      - id: get-dsr1-configs
	        run: |
	          pip install pydantic
	          CONFIG_JSON_MULTI_NODE=$(python3 "${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py" \
	            full-sweep \
	            --node-type multinode \
	            --config-files "${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml" \
	            --seq-lens 1k1k \
	            --model-prefix dsr1)
	          # NOTE(review): the name=value form assumes the script prints
	          # single-line JSON - TODO confirm.
	          echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> "$GITHUB_OUTPUT"
	          # Echo the generated config into the step log for debugging.
	          echo "$CONFIG_JSON_MULTI_NODE"

	  # Disabled gptoss counterpart of get-dsr1-configs.
	  # get-gptoss-configs:
	  #   runs-on: ubuntu-latest
	  #   outputs:
	  #     multi-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.multi-node-search-space-config }}
	  #   steps:
	  #     - name: Checkout code
	  #       uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
	  #
	  #     - id: get-gptoss-configs
	  #       run: |
	  #         pip install pydantic
	  #         CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --node-type multinode --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss)
	  #         echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT


	  # Calls the reusable multi-node benchmark workflow once per entry of the
	  # generated config list; fail-fast is off so one failing entry does not
	  # cancel the remaining matrix entries.
	  benchmark-dsr1-multi-node:
	    needs: get-dsr1-configs
	    uses: ./.github/workflows/benchmark-multinode-tmpl.yml
	    name: "dsr1 1k1k /"
	    strategy:
	      fail-fast: false
	      matrix:
	        config: ${{ fromJson(needs.get-dsr1-configs.outputs.multi-node-search-space-config) }}
	    secrets: inherit
	    with:
	      # Values fixed for this workflow.
	      isl: 1024
	      osl: 1024
	      max-model-len: 2048
	      # Values taken from the current matrix entry.
	      runner: ${{ matrix.config.runner }}
	      image: ${{ matrix.config.image }}
	      model: ${{ matrix.config.model }}
	      framework: ${{ matrix.config.framework }}
	      precision: ${{ matrix.config.precision }}
	      exp-name: "dsr1_1k1k"
	      conc: ${{ matrix.config.conc }}
	      mtp-mode: ${{ matrix.config.mtp }}

	      # Passed through re-serialized as a JSON string.
	      additional-settings: ${{ toJson(matrix.config.additional-settings) }}

	      prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
	      prefill-tp: ${{ matrix.config.prefill.tp }}
	      prefill-ep: ${{ matrix.config.prefill.ep }}
	      prefill-batch-size: ${{ matrix.config.prefill.batch-size }}
	      prefill-max-num-tokens: ${{ matrix.config.prefill.max-num-tokens }}
	      prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}

	      decode-num-worker: ${{ matrix.config.decode.num-worker }}
	      decode-tp: ${{ matrix.config.decode.tp }}
	      decode-ep: ${{ matrix.config.decode.ep }}
	      decode-batch-size: ${{ matrix.config.decode.batch-size }}
	      decode-max-num-tokens: ${{ matrix.config.decode.max-num-tokens }}
	      decode-dp-attn: ${{ matrix.config.decode.dp-attn }}

	  # Disabled gptoss counterpart of benchmark-dsr1-multi-node.
	  # benchmark-gptoss-multi-node:
	  #   needs: get-gptoss-configs
	  #   uses: ./.github/workflows/benchmark-multinode-tmpl.yml
	  #   name: gptoss 1k1k /
	  #   strategy:
	  #     fail-fast: false
	  #     matrix:
	  #       config: ${{ fromJson(needs.get-gptoss-configs.outputs.multi-node-search-space-config) }}
	  #   secrets: inherit
	  #   with:
	  #     exp-name: "gptoss_1k1k"
	  #     isl: 1024
	  #     osl: 1024
	  #     max-model-len: 2048
	  #     runner: ${{ matrix.config.runner }}
	  #     image: ${{ matrix.config.image }}
	  #     model: ${{ matrix.config.model }}
	  #     framework: ${{ matrix.config.framework }}
	  #     precision: ${{ matrix.config.precision }}
	  #     tp: ${{ matrix.config.tp }}
	  #     ep: ${{ matrix.config.ep }}
	  #     dp-attn: ${{ matrix.config.dp-attn }}
	  #     conc: ${{ matrix.config.conc }}

	  # Calls the reusable collect-results workflow for the dsr1 experiment.
	  # always() keeps it running even when benchmark matrix entries failed.
	  collect-dsr1-results:
	    needs: benchmark-dsr1-multi-node
	    if: ${{ always() }}
	    uses: ./.github/workflows/collect-results.yml
	    secrets: inherit
	    with:
	      exp-name: "dsr1_1k1k"

	  # Disabled gptoss counterpart of collect-dsr1-results.
	  # collect-gptoss-results:
	  #   needs: benchmark-gptoss-multi-node
	  #   if: ${{ always() }}
	  #   uses: ./.github/workflows/collect-results.yml
	  #   secrets: inherit
	  #   with:
	  #     exp-name: "gptoss_1k1k"

	  # Downloads the results_* artifacts, runs utils/calc_success_rate.py and
	  # uploads <STATS_FILENAME>.json as the "run-stats" artifact. always() keeps
	  # it running regardless of the outcome of the collection job.
	  calc-success-rate:
	    needs: [collect-dsr1-results]
	    if: ${{ always() }}
	    runs-on: ubuntu-latest

	    env:
	      RESULTS_DIR: "results/"
	      STATS_FILENAME: "run_stats"
	      GITHUB_TOKEN: ${{ secrets.REPO_PAT }}

	    steps:
	      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
	        with:
	          token: ${{ secrets.REPO_PAT }}
	          fetch-depth: 0

	      - name: Download results artifacts
	        uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
	        with:
	          path: ${{ env.RESULTS_DIR }}
	          pattern: results_*

	      - name: Install python dependencies
	        run: pip install PyGithub

	      # NOTE(review): presumably the script writes "$STATS_FILENAME.json",
	      # which the upload step below expects - confirm against the script.
	      - name: Calculate success rate
	        run: python3 utils/calc_success_rate.py "$STATS_FILENAME"

	      - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
	        with:
	          name: "run-stats"
	          path: ${{ env.STATS_FILENAME }}.json

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Debug Test #19

Workflow file

Debug Test #19

Uh oh!

Workflow file for this run