Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,16 @@ def scan_logs(logs_dir):
job_total = shard_totals.get((platform, test_config), 0)
job_shard_str = f"{shard_num}/{job_total}" if job_total else str(shard_num)

# If download_testlogs left a "<log>.job_url" file next to this log,
# it contains the URL of the upstream pytorch CI job that produced
# the log. We surface it in the LOG-BASED FAILURES table as a link
# to that job's page. Empty for older runs that predate this.
job_url_file = os.path.join(logs_dir, fname + ".job_url")
job_url = ""
if os.path.isfile(job_url_file):
with open(job_url_file) as f:
job_url = f.read().strip()

filepath = os.path.join(logs_dir, fname)
results, consistent_failures, flaky_tests = parse_log_file(filepath)

Expand All @@ -306,6 +316,7 @@ def scan_logs(logs_dir):
"test_name": ft["method"],
"job_shard": job_shard_str,
"test_shard": ft["test_shard"],
"job_url": job_url,
})

# Record every (test_file, test_shard) observed in this log file,
Expand Down Expand Up @@ -365,6 +376,7 @@ def scan_logs(logs_dir):
"category": "+".join(categories),
"reason": reason,
"exit_codes": ",".join(str(c) for c in info["exit_codes"]),
"job_url": job_url,
})

for test_path, shard_str in consistent_failures:
Expand All @@ -384,6 +396,7 @@ def scan_logs(logs_dir):
"category": "CONSISTENT_FAILURE",
"reason": f"{test_class}::{test_name}" if test_class else "",
"exit_codes": "",
"job_url": job_url,
})

def _sort_shards(vals):
Expand Down Expand Up @@ -420,6 +433,7 @@ def write_csv_report(failures, output_path):
"log_file", "platform", "test_config", "test_file",
"job_shard", "test_shard",
"status", "category", "reason", "exit_codes",
"job_url",
]
with open(output_path, "w", newline="") as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
Expand All @@ -441,6 +455,7 @@ def write_flaky_report(flaky, output_path):
fieldnames = [
"log_file", "platform", "test_config", "test_file",
"test_class", "test_name", "job_shard", "test_shard",
"job_url",
]
with open(output_path, "w", newline="") as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
Expand Down
26 changes: 23 additions & 3 deletions .automation_scripts/pytorch-unit-test-scripts/download_testlogs
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,14 @@ def write_test_log_to_file(filename, test_key, jobs, sha):
with open(filename, "w", encoding="utf-8") as f:
f.write(response.text)

# Save the upstream pytorch CI job's page URL next to the log so
# detect_log_failures.py can later surface it as a link in the
# LOG-BASED FAILURES table of the parity summary.
job_url = js[0].get('html_url', '')
if job_url:
with open(filename + ".job_url", "w", encoding="utf-8") as f:
f.write(job_url)

def get_workflow_jobs(wf):
"""Get all jobs for a workflow run."""
if wf is None:
Expand Down Expand Up @@ -239,11 +247,13 @@ def _shorten_unzipped_dirs():
unzipped-test-reports-runattempt1-test-default-1-6-linux.rocm.gpu.gfx942.1_68613413431.zip
unzipped-test-reports-runattempt1-test-osdc-default-1-5-mt-l-x86aavx2-29-113-l4_73385044118.zip
to:
test-default-1-6
test-default-1-5
test-default-1-6_68613413431
test-default-1-5_73385044118

Preserves the 'test-<config>' prefix so that summarize_xml_testreports.py
can still detect workflow type via substring matching.
can still detect workflow type via substring matching. The trailing
'_<jobid>' is the upstream pytorch CI job id, used to link to the
failing job from the parity summary.
"""
from pathlib import Path
for d in sorted(Path(".").glob("unzipped-*")):
Expand All @@ -252,6 +262,13 @@ def _shorten_unzipped_dirs():
m = re.search(r'test-(?:osdc-)?(default|distributed|inductor)-(\d+)-(\d+)', d.name)
if m:
short_name = f"test-{m.group(1)}-{m.group(2)}-{m.group(3)}"
# The original artifact name ends with "_<jobid>.zip" where
# <jobid> is the upstream pytorch CI job id (e.g.
# ..._68613413431.zip). Carry it onto short_name so
# summarize_xml_testreports.py can link to that job.
job_id_match = re.search(r'_(\d{6,})\.zip$', d.name)
if job_id_match:
short_name += f"_{job_id_match.group(1)}"
if not Path(short_name).exists():
d.rename(short_name)
print(f" Renamed {d.name} -> {short_name}")
Expand Down Expand Up @@ -297,6 +314,9 @@ def download_xml_files(workflow_run_id, workflow_run_attempts, prefixes=[], allo

_shorten_unzipped_dirs()

with open("_wf_run_id", "w") as f:
f.write(str(workflow_run_id))

# Delete raw zip files now that contents are extracted
for path in artifact_paths:
try:
Expand Down
30 changes: 26 additions & 4 deletions .automation_scripts/pytorch-unit-test-scripts/generate_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import argparse
import csv
import os
import re
import sys


Expand Down Expand Up @@ -289,10 +290,12 @@ def collect_failed_tests(arch_data, archs, s1_name, s2_name):
'test_name': r.get('test_name', ''),
'test_config': r.get('test_config', ''),
f'shard_{s1_name}': r.get(f'shard_{s1_name}', ''),
f'job_url_{s1_name}': r.get(f'job_url_{s1_name}', ''),
f'status_{s1_name}': s1,
}
if has_set2:
entry[f'shard_{s2_name}'] = r.get(f'shard_{s2_name}', '')
entry[f'job_url_{s2_name}'] = r.get(f'job_url_{s2_name}', '')
entry[f'status_{s2_name}'] = s2
failed.append(entry)

Expand Down Expand Up @@ -418,6 +421,7 @@ def load_flaky_tests_as_log_failures(filepaths):
'category': 'FLAKY',
'reason': f'{test_class}::{test_name}' if test_class else test_name,
'exit_codes': '',
'job_url': row.get('job_url', ''),
})
return entries

Expand Down Expand Up @@ -691,6 +695,16 @@ def _xml_test_shard(t, platform):
_norm_test_file(t.get('test_file', '')))
return _format_test_shards(shard_lookup.get(key, ''))

def _job_id_link(url):
if not url:
return ''
# Use the job id (digits after "/job/" in the URL) as the visible
# link label so the cell reads e.g. [76905282313](...).
m = re.search(r'/job/(\d+)', url)
if not m:
return ''
return f'[{m.group(1)}]({url})'

cols = ['Arch', 'Test Config', 'Test File', 'Test Class', 'Test Name',
f'Job-Level Shard ({s1_name})',
f'Test-Level Shard ({s1_name})']
Expand All @@ -701,6 +715,9 @@ def _xml_test_shard(t, platform):
if has_set2:
cols.append(f'Status ({s2_name})')
cols.append('Also Failing In')
cols.append(f'Job ID ({s1_name})')
if has_set2:
cols.append(f'Job ID ({s2_name})')

if s1_failed:
lines.append(f'### FAILED TESTS ({len(s1_failed)})')
Expand All @@ -718,7 +735,11 @@ def _xml_test_shard(t, platform):
line += f" | {t[f'status_{s1_name}']}"
if has_set2:
line += f" | {t.get(f'status_{s2_name}', '')}"
line += f" | {t.get('also_failing_in', '')} |"
line += f" | {t.get('also_failing_in', '')}"
line += f" | {_job_id_link(t.get(f'job_url_{s1_name}', ''))}"
if has_set2:
line += f" | {_job_id_link(t.get(f'job_url_{s2_name}', ''))}"
line += ' |'
lines.append(line)
lines.append('')
else:
Expand Down Expand Up @@ -748,8 +769,8 @@ def _xml_test_shard(t, platform):
lines.append('These test failures were detected from CI log files but have no XML report')
lines.append('(typically due to timeouts, crashes, or process kills).')
lines.append('')
lines.append('| Arch | Platform | Test Config | Test File | Test Class | Test Name | Job-Level Shard | Test-Level Shard | Category | Also Failing In |')
lines.append('| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |')
lines.append('| Arch | Platform | Test Config | Test File | Test Class | Test Name | Job-Level Shard | Test-Level Shard | Category | Also Failing In | Job ID |')
lines.append('| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |')
for lf in rocm_log_failures:
test_class, test_name = _parse_log_failure_names(lf)
lines.append(
Expand All @@ -759,7 +780,8 @@ def _xml_test_shard(t, platform):
f"| {lf.get('job_shard', '')} "
f"| {lf.get('test_shard', lf.get('shard', ''))} "
f"| {lf.get('category', '')} "
f"| {lf.get('also_failing_in', '')} |"
f"| {lf.get('also_failing_in', '')} "
f"| {_job_id_link(lf.get('job_url', ''))} |"
)
lines.append('')

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,18 @@ def _extract_shard(dirname):
def parse_xml_reports_as_dict(workflow_run_id, workflow_run_attempt, tag, path="."):
test_config = ""
test_cases = {}

# download_testlogs writes the upstream pytorch CI workflow run id
# into "_wf_run_id" alongside the shard dirs. We combine it with each
# shard dir's trailing "_<job_id>" to form the URL
# https://github.com/pytorch/pytorch/actions/runs/<wf>/job/<job_id>
# surfaced as the "Job ID" column in the FAILED TESTS table.
wf_run_id = ""
wf_id_file = os.path.join(path, "_wf_run_id")
if os.path.isfile(wf_id_file):
with open(wf_id_file) as f:
wf_run_id = f.read().strip()

items_list = os.listdir(path)
for dir in items_list:
new_dir = path + '/' + dir + '/'
Expand All @@ -80,6 +92,11 @@ def parse_xml_reports_as_dict(workflow_run_id, workflow_run_attempt, tag, path="
elif "test-inductor" in new_dir:
test_config = TestConfigName.inductor.name
shard = _extract_shard(dir)
jid = re.search(r'_(\d+)$', dir)
job_url = (
f"https://github.com/pytorch/pytorch/actions/runs/{wf_run_id}/job/{jid.group(1)}"
if wf_run_id and jid else ""
)
for xml_report in Path(new_dir).glob("**/*.xml"):
try:
new_cases = parse_xml_report(
Expand All @@ -94,6 +111,7 @@ def parse_xml_reports_as_dict(workflow_run_id, workflow_run_attempt, tag, path="
continue
for key, case in new_cases.items():
case["shard"] = shard
case["job_url"] = job_url
existing = test_cases.get(key)
if existing is None or _status_priority(case) > _status_priority(existing):
test_cases[key] = case
Expand Down Expand Up @@ -472,6 +490,8 @@ def summarize_xml_files(args):
item_values["test_config"] = config_name
item_values[f"shard_{set1_name}"] = v_values.get('shard', '') if v_values else ''
item_values[f"shard_{set2_name}"] = v1_values.get('shard', '') if v1_values else ''
item_values[f"job_url_{set1_name}"] = v_values.get('job_url', '') if v_values else ''
item_values[f"job_url_{set2_name}"] = v1_values.get('job_url', '') if v1_values else ''
# get test related info
item_values[f"message_{set1_name}"] = get_test_message(v[0])
item_values[f"message_{set2_name}"] = get_test_message(v[1]) if set2_path else ""
Expand Down Expand Up @@ -564,6 +584,10 @@ def sorting_key(e):
return 21
elif e == f"shard_{set2_name}":
return 22
elif e == f"job_url_{set1_name}":
return 23
elif e == f"job_url_{set2_name}":
return 24
elif e == "workflow_run_attempt" or e == "job_id":
return 1000
else:
Expand Down