Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,23 @@


def classify_log_file(filename):
    """Return (platform, test_config, shard_num) from a log filename like rocm3.txt.

    Commit-vs-commit parity prefixes log files with the short commit SHA
    (for example, 09e0c59b_rocm3.txt). In that mode the SHA label is the
    platform name used by generate_summary.py, so preserve it here.

    Args:
        filename: Log file name or path; only the stem (the final path
            component without its last extension) is inspected.

    Returns:
        A ``(platform, test_config, shard_num)`` tuple. ``platform`` is the
        first 8 characters of the SHA prefix when the stem carries one,
        otherwise the platform mapped in ``LOG_FILE_MAP``. ``shard_num`` is
        the integer that follows the matched prefix. Returns
        ``(None, None, None)`` when no ``LOG_FILE_MAP`` prefix is followed by
        a purely numeric shard suffix.
    """
    stem = Path(filename).stem

    # Optional "<sha>_" prefix: 8-40 lowercase hex characters and an underscore.
    # The label is normalised to 8 characters so short and full SHAs agree.
    label = None
    m = re.match(r"(?P<label>[0-9a-f]{8,40})_(?P<stem>.+)", stem)
    if m:
        label = m.group("label")[:8]
        stem = m.group("stem")

    # Try longer prefixes first so that a prefix which is itself the start of
    # another prefix cannot shadow the more specific entry.
    for prefix, (platform, test_config) in sorted(LOG_FILE_MAP.items(), key=lambda x: -len(x[0])):
        if stem.startswith(prefix):
            remainder = stem[len(prefix):]
            if remainder.isdigit():
                # A SHA label, when present, takes the place of the platform.
                return label or platform, test_config, int(remainder)
    return None, None, None


Expand Down
44 changes: 30 additions & 14 deletions .automation_scripts/pytorch-unit-test-scripts/download_testlogs
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ def download_artifacts(wf, prefixes=[], test_folder=".", allowed_substrings=None
)
os.chdir("..")
# for older runs, add 'created':'<=YYYY-MM-DD'. see https://docs.github.com/en/search-github/getting-started-with-searching-on-github/understanding-the-search-syntax#query-for-dates
def download_workflow_run(created=None, max_pages=10, workflow=None, sha=None, ignore_status=False, status='success', error_msg='Error downloading workflow runs'):
def download_workflow_run(created=None, max_pages=10, workflow=None, sha=None, branch=None, ignore_status=False, status='success', error_msg='Error downloading workflow runs'):
if not workflow:
raise Exception("Workflow must be specified")
for page in range(max_pages):
Expand All @@ -299,8 +299,10 @@ def download_workflow_run(created=None, max_pages=10, workflow=None, sha=None, i
params['created'] = created
if sha:
params['head_sha'] = sha
if branch:
params['branch'] = branch
else:
params['branch'] = "main"
params['branch'] = branch or "main"
print(".")

# Uncomment below for additional debug info
Expand All @@ -316,6 +318,10 @@ def download_workflow_run(created=None, max_pages=10, workflow=None, sha=None, i
raise Exception(response.text)
if not workflow_runs:
continue
if branch:
workflow_runs = [wf for wf in workflow_runs if wf.get('head_branch') == branch]
if not workflow_runs:
continue
# Prefer completed runs over in-progress ones. When multiple
# runs exist for the same SHA, the most recent may still be
# running and have no artifacts yet.
Expand Down Expand Up @@ -496,6 +502,7 @@ def main():
sha = args.sha1
pr_id = None
status = "success"
sha_branch = "main" if args.sha1 else None
print(sha)

# When comparing two commits, prefix log filenames with short SHAs
Expand All @@ -516,7 +523,7 @@ def main():
#https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository
periodic_fallback_used = False
try:
periodic_wf = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=ROCmWorkflowNames["distributed"], sha=periodic_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
periodic_wf = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=ROCmWorkflowNames["distributed"], sha=periodic_sha, branch=sha_branch, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
except (IndexError, Exception):
periodic_wf = None
periodic_fallbacks = {
Expand All @@ -526,7 +533,7 @@ def main():
if periodic_wf is None and arch in periodic_fallbacks:
fallback_wf, fallback_prefix = periodic_fallbacks[arch]
print(f"Distributed not found in {ROCmWorkflowNames['distributed']}, falling back to {fallback_wf}")
periodic_wf = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=fallback_wf, sha=periodic_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
periodic_wf = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=fallback_wf, sha=periodic_sha, branch=sha_branch, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
periodic_fallback_used = True
if periodic_wf is None:
raise Exception(error_msg)
Expand Down Expand Up @@ -578,7 +585,7 @@ def main():
error_msg="Error: rocm workflow not found in scanned workflow runs. Try increasing max_pages."
default_fallback_used = False
try:
rocm_wf = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=ROCmWorkflowNames["default"], sha=rocm_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
rocm_wf = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=ROCmWorkflowNames["default"], sha=rocm_sha, branch=sha_branch, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
except (IndexError, Exception):
rocm_wf = None
default_fallbacks = {
Expand All @@ -587,7 +594,7 @@ def main():
if rocm_wf is None and arch in default_fallbacks:
fallback_wf, fallback_prefix = default_fallbacks[arch]
print(f"Default not found in {ROCmWorkflowNames['default']}, falling back to {fallback_wf}")
rocm_wf = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=fallback_wf, sha=rocm_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
rocm_wf = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=fallback_wf, sha=rocm_sha, branch=sha_branch, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
default_fallback_used = True
rocm_job_prefix['default'] = fallback_prefix
if rocm_wf is None:
Expand Down Expand Up @@ -632,7 +639,7 @@ def main():
print(f"Finding ROCm inductor tests in workflow '{ROCmWorkflowNames['inductor']}' by sha: {inductor_rocm_sha}")
print("===========================================")
error_msg="Error: inductor workflow not found in scanned workflow runs. Try increasing max_pages."
inductor_wf_rocm = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=ROCmWorkflowNames["inductor"], sha=inductor_rocm_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
inductor_wf_rocm = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=ROCmWorkflowNames["inductor"], sha=inductor_rocm_sha, branch=sha_branch, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
print(f"Using workflow '{ROCmWorkflowNames['inductor']}' with id:{inductor_wf_rocm['id']} for ROCm inductor")

folder_list = get_or_create_test_folder(inductor_wf_rocm)
Expand Down Expand Up @@ -678,11 +685,15 @@ def main():
if not args.ignore_status:
params['status'] = status
params['head_sha'] = sha
if sha_branch:
params['branch'] = sha_branch
resp = requests.get(
f"https://api.github.com/repos/pytorch/pytorch/actions/workflows/{CUDAWorkflowNames['default']}.yml/runs",
headers=authentication_headers, params=params,
)
trunk_runs = resp.json().get('workflow_runs', [])
if sha_branch:
trunk_runs = [run for run in trunk_runs if run.get('head_branch') == sha_branch]

for run in trunk_runs:
jobs = get_workflow_jobs(run)
Expand Down Expand Up @@ -717,8 +728,13 @@ def main():
headers=authentication_headers,
)
trunk_wf = resp.json()
print(f"CUDA test jobs are in trunk run {trunk_wf['id']} (found via check-runs)")
all_cuda_jobs = list(cuda_test_jobs)
if sha_branch and trunk_wf.get('head_branch') != sha_branch:
print(f"Skipping CUDA check-run from non-{sha_branch} branch run {trunk_wf.get('id')}")
trunk_wf = None
cuda_test_jobs = []
else:
print(f"CUDA test jobs are in trunk run {trunk_wf['id']} (found via check-runs)")
all_cuda_jobs = list(cuda_test_jobs)

if trunk_wf is None:
trunk_wf = trunk_runs[0] if trunk_runs else None
Expand Down Expand Up @@ -793,7 +809,7 @@ def main():
# find tests in inductor workflow with given sha and success status
#https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository
error_msg="Error: inductor workflow not found in scanned workflow runs. Try increasing max_pages."
inductor_wf_cuda = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=CUDAWorkflowNames["inductor"], sha=inductor_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
inductor_wf_cuda = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=CUDAWorkflowNames["inductor"], sha=inductor_sha, branch=sha_branch, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
print(f"Using workflow '{CUDAWorkflowNames['inductor']}' with id:{inductor_wf_cuda['id']} for CUDA inductor")

folder_list = get_or_create_test_folder(inductor_wf_cuda)
Expand Down Expand Up @@ -840,7 +856,7 @@ def main():
try:
baseline_default_wf = download_workflow_run(
created=args.created, max_pages=args.max_pages,
workflow=ROCmWorkflowNames["default"], sha=baseline_sha,
workflow=ROCmWorkflowNames["default"], sha=baseline_sha, branch="main",
ignore_status=args.ignore_status, status=status,
error_msg=f"Baseline default workflow not found for {baseline_sha}",
)
Expand Down Expand Up @@ -872,7 +888,7 @@ def main():
try:
baseline_dist_wf = download_workflow_run(
created=args.created, max_pages=args.max_pages,
workflow=ROCmWorkflowNames["distributed"], sha=baseline_sha,
workflow=ROCmWorkflowNames["distributed"], sha=baseline_sha, branch="main",
ignore_status=args.ignore_status, status=status,
error_msg=f"Baseline distributed workflow not found for {baseline_sha}",
)
Expand Down Expand Up @@ -904,7 +920,7 @@ def main():
try:
baseline_inductor_wf = download_workflow_run(
created=args.created, max_pages=args.max_pages,
workflow=ROCmWorkflowNames["inductor"], sha=baseline_sha,
workflow=ROCmWorkflowNames["inductor"], sha=baseline_sha, branch="main",
ignore_status=args.ignore_status, status=status,
error_msg=f"Baseline inductor workflow not found for {baseline_sha}",
)
Expand Down Expand Up @@ -943,7 +959,7 @@ def main():
try:
inductor_periodic_wf = download_workflow_run(
created=args.created, max_pages=args.max_pages,
workflow="inductor-periodic", sha=sha,
workflow="inductor-periodic", sha=sha, branch=sha_branch,
ignore_status=args.ignore_status, status=status,
error_msg=error_msg,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -297,10 +297,15 @@ def summarize_xml_files(args):
# test file level running time: ROCm and CUDA
test_file_level_ROCm: Dict[Tuple[str], float] = {}
test_file_level_CUDA: Dict[Tuple[str], float] = {}
test_file_shards_ROCm: Dict[Tuple[str], set] = {}
test_file_shards_CUDA: Dict[Tuple[str], set] = {}
for (k,v) in list(test_cases_set1_running_time.items()):
test_file_name = k[0]
test_config_name = k[2]
tar_tup_rocm = (test_file_name, test_config_name,)
test_file_shards_ROCm.setdefault(tar_tup_rocm, set())
if v.get("shard"):
test_file_shards_ROCm[tar_tup_rocm].add(v["shard"])
if test_file_level_ROCm.get(tar_tup_rocm) == None:
test_file_level_ROCm[ ( test_file_name, test_config_name ) ] = v["running_time_xml"]
else:
Expand All @@ -309,6 +314,9 @@ def summarize_xml_files(args):
test_file_name = k[0]
test_config_name = k[2]
tar_tup_cuda = (test_file_name, test_config_name)
test_file_shards_CUDA.setdefault(tar_tup_cuda, set())
if v.get("shard"):
test_file_shards_CUDA[tar_tup_cuda].add(v["shard"])
if test_file_level_CUDA.get(tar_tup_cuda) == None:
test_file_level_CUDA[ ( test_file_name, test_config_name ) ] = v["running_time_xml"]
else:
Expand Down Expand Up @@ -588,43 +596,56 @@ def sorting_key(e):

# write test file running time to file
test_file_running_time_for_csv = {}
set1_running_time_col = f"{set1_name}_running_time"
set2_running_time_col = f"{set2_name}_running_time"
set1_tests_run_col = f"{set1_name}_tests_run"
set2_tests_run_col = f"{set2_name}_tests_run"
set1_test_shards_col = f"{set1_name}_test_shards"
set2_test_shards_col = f"{set2_name}_test_shards"
set1_passed_col = f"{set1_name}_passed"
set1_skipped_col = f"{set1_name}_skipped"
set1_missed_col = f"{set1_name}_missed"
for key_rocm in test_file_level_ROCm.keys():
item_values = {}
item_values["test_file"] = key_rocm[0]
item_values["test_config"] = key_rocm[1]
item_values["rocm_running_time"] = test_file_level_ROCm[key_rocm]
item_values["cuda_running_time"] = 0.0
item_values[set1_running_time_col] = test_file_level_ROCm[key_rocm]
item_values[set2_running_time_col] = 0.0
if key_rocm in test_file_level_CUDA.keys():
item_values["cuda_running_time"] = test_file_level_CUDA[key_rocm]
item_values["abs_time_diff"] = item_values["rocm_running_time"] - item_values["cuda_running_time"]
item_values[set2_running_time_col] = test_file_level_CUDA[key_rocm]
item_values["abs_time_diff"] = item_values[set1_running_time_col] - item_values[set2_running_time_col]
item_values["relative_time_diff"] = 0.0
if item_values["cuda_running_time"] != 0.0:
item_values["relative_time_diff"] = 100 * (item_values["rocm_running_time"] - item_values["cuda_running_time"]) / item_values["cuda_running_time"]
if item_values[set2_running_time_col] != 0.0:
item_values["relative_time_diff"] = 100 * (item_values[set1_running_time_col] - item_values[set2_running_time_col]) / item_values[set2_running_time_col]
# Add test counts
item_values["rocm_tests_run"] = test_file_counts_ROCm.get(key_rocm, {}).get('tests_run', 0)
item_values["cuda_tests_run"] = test_file_counts_CUDA.get(key_rocm, 0)
item_values["rocm_passed"] = test_file_counts_ROCm.get(key_rocm, {}).get('passed', 0)
item_values["rocm_skipped"] = test_file_counts_ROCm.get(key_rocm, {}).get('skipped', 0)
item_values["rocm_missed"] = test_file_counts_ROCm.get(key_rocm, {}).get('missed', 0)
item_values[set1_tests_run_col] = test_file_counts_ROCm.get(key_rocm, {}).get('tests_run', 0)
item_values[set2_tests_run_col] = test_file_counts_CUDA.get(key_rocm, 0)
item_values[set1_test_shards_col] = len(test_file_shards_ROCm.get(key_rocm, set()))
item_values[set2_test_shards_col] = len(test_file_shards_CUDA.get(key_rocm, set()))
item_values[set1_passed_col] = test_file_counts_ROCm.get(key_rocm, {}).get('passed', 0)
item_values[set1_skipped_col] = test_file_counts_ROCm.get(key_rocm, {}).get('skipped', 0)
item_values[set1_missed_col] = test_file_counts_ROCm.get(key_rocm, {}).get('missed', 0)
test_file_running_time_for_csv[key_rocm] = item_values

for key_cuda in test_file_level_CUDA.keys():
if not key_cuda in test_file_level_ROCm.keys():
item_values = {}
item_values["test_file"] = key_cuda[0]
item_values["test_config"] = key_cuda[1]
item_values["rocm_running_time"] = 0.0
item_values["cuda_running_time"] = test_file_level_CUDA[key_cuda]
item_values["abs_time_diff"] = item_values["rocm_running_time"] - item_values["cuda_running_time"]
item_values[set1_running_time_col] = 0.0
item_values[set2_running_time_col] = test_file_level_CUDA[key_cuda]
item_values["abs_time_diff"] = item_values[set1_running_time_col] - item_values[set2_running_time_col]
item_values["relative_time_diff"] = 0.0
if item_values["cuda_running_time"] != 0.0:
item_values["relative_time_diff"] = 100 * (item_values["rocm_running_time"] - item_values["cuda_running_time"]) / item_values["cuda_running_time"]
if item_values[set2_running_time_col] != 0.0:
item_values["relative_time_diff"] = 100 * (item_values[set1_running_time_col] - item_values[set2_running_time_col]) / item_values[set2_running_time_col]
# Add test counts
item_values["rocm_tests_run"] = test_file_counts_ROCm.get(key_cuda, {}).get('tests_run', 0)
item_values["cuda_tests_run"] = test_file_counts_CUDA.get(key_cuda, 0)
item_values["rocm_passed"] = test_file_counts_ROCm.get(key_cuda, {}).get('passed', 0)
item_values["rocm_skipped"] = test_file_counts_ROCm.get(key_cuda, {}).get('skipped', 0)
item_values["rocm_missed"] = test_file_counts_ROCm.get(key_cuda, {}).get('missed', 0)
item_values[set1_tests_run_col] = test_file_counts_ROCm.get(key_cuda, {}).get('tests_run', 0)
item_values[set2_tests_run_col] = test_file_counts_CUDA.get(key_cuda, 0)
item_values[set1_test_shards_col] = len(test_file_shards_ROCm.get(key_cuda, set()))
item_values[set2_test_shards_col] = len(test_file_shards_CUDA.get(key_cuda, set()))
item_values[set1_passed_col] = test_file_counts_ROCm.get(key_cuda, {}).get('passed', 0)
item_values[set1_skipped_col] = test_file_counts_ROCm.get(key_cuda, {}).get('skipped', 0)
item_values[set1_missed_col] = test_file_counts_ROCm.get(key_cuda, {}).get('missed', 0)
test_file_running_time_for_csv[key_cuda] = item_values

test_file_running_time_for_csv = dict(sorted(test_file_running_time_for_csv.items()))
Expand All @@ -634,24 +655,28 @@ def sorting_key_running_time(e):
return 0
elif e == "test_config":
return 1
elif e == "rocm_running_time":
elif e == set1_running_time_col:
return 2
elif e == "cuda_running_time":
elif e == set2_running_time_col:
return 3
elif e == "abs_time_diff":
return 4
elif e == "relative_time_diff":
return 5
elif e == "rocm_tests_run":
elif e == set1_tests_run_col:
return 6
elif e == "cuda_tests_run":
elif e == set2_tests_run_col:
return 7
elif e == "rocm_passed":
elif e == set1_test_shards_col:
return 8
elif e == "rocm_skipped":
elif e == set2_test_shards_col:
return 9
elif e == "rocm_missed":
elif e == set1_passed_col:
return 10
elif e == set1_skipped_col:
return 11
elif e == set1_missed_col:
return 12
else:
return 100

Expand Down
Loading