diff --git a/.automation_scripts/pytorch-unit-test-scripts/detect_log_failures.py b/.automation_scripts/pytorch-unit-test-scripts/detect_log_failures.py index 0156624c35973..bc1e4c0693b5f 100755 --- a/.automation_scripts/pytorch-unit-test-scripts/detect_log_failures.py +++ b/.automation_scripts/pytorch-unit-test-scripts/detect_log_failures.py @@ -292,6 +292,16 @@ def scan_logs(logs_dir): job_total = shard_totals.get((platform, test_config), 0) job_shard_str = f"{shard_num}/{job_total}" if job_total else str(shard_num) + # If download_testlogs left a ".job_url" file next to this log, + # it contains the URL of the upstream pytorch CI job that produced + # the log. We surface it in the LOG-BASED FAILURES table as a link + # to that job's page. Empty for older runs that predate this. + job_url_file = os.path.join(logs_dir, fname + ".job_url") + job_url = "" + if os.path.isfile(job_url_file): + with open(job_url_file) as f: + job_url = f.read().strip() + filepath = os.path.join(logs_dir, fname) results, consistent_failures, flaky_tests = parse_log_file(filepath) @@ -306,6 +316,7 @@ def scan_logs(logs_dir): "test_name": ft["method"], "job_shard": job_shard_str, "test_shard": ft["test_shard"], + "job_url": job_url, }) # Record every (test_file, test_shard) observed in this log file, @@ -365,6 +376,7 @@ def scan_logs(logs_dir): "category": "+".join(categories), "reason": reason, "exit_codes": ",".join(str(c) for c in info["exit_codes"]), + "job_url": job_url, }) for test_path, shard_str in consistent_failures: @@ -384,6 +396,7 @@ def scan_logs(logs_dir): "category": "CONSISTENT_FAILURE", "reason": f"{test_class}::{test_name}" if test_class else "", "exit_codes": "", + "job_url": job_url, }) def _sort_shards(vals): @@ -420,6 +433,7 @@ def write_csv_report(failures, output_path): "log_file", "platform", "test_config", "test_file", "job_shard", "test_shard", "status", "category", "reason", "exit_codes", + "job_url", ] with open(output_path, "w", newline="") as f: writer = 
csv.DictWriter(f, fieldnames=fieldnames) @@ -441,6 +455,7 @@ def write_flaky_report(flaky, output_path): fieldnames = [ "log_file", "platform", "test_config", "test_file", "test_class", "test_name", "job_shard", "test_shard", + "job_url", ] with open(output_path, "w", newline="") as f: writer = csv.DictWriter(f, fieldnames=fieldnames) diff --git a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs index ac4214f99fecd..cd75c602d8e99 100755 --- a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs +++ b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs @@ -90,6 +90,14 @@ def write_test_log_to_file(filename, test_key, jobs, sha): with open(filename, "w", encoding="utf-8") as f: f.write(response.text) + # Save the upstream pytorch CI job's page URL next to the log so + # detect_log_failures.py can later surface it as a link in the + # LOG-BASED FAILURES table of the parity summary. + job_url = js[0].get('html_url', '') + if job_url: + with open(filename + ".job_url", "w", encoding="utf-8") as f: + f.write(job_url) + def get_workflow_jobs(wf): """Get all jobs for a workflow run.""" if wf is None: @@ -239,11 +247,13 @@ def _shorten_unzipped_dirs(): unzipped-test-reports-runattempt1-test-default-1-6-linux.rocm.gpu.gfx942.1_68613413431.zip unzipped-test-reports-runattempt1-test-osdc-default-1-5-mt-l-x86aavx2-29-113-l4_73385044118.zip to: - test-default-1-6 - test-default-1-5 + test-default-1-6_68613413431 + test-default-1-5_73385044118 Preserves the 'test-' prefix so that summarize_xml_testreports.py - can still detect workflow type via substring matching. + can still detect workflow type via substring matching. The trailing + '_{job_id}' is the upstream pytorch CI job id, used to link to the + failing job from the parity summary. 
""" from pathlib import Path for d in sorted(Path(".").glob("unzipped-*")): @@ -252,6 +262,13 @@ def _shorten_unzipped_dirs(): m = re.search(r'test-(?:osdc-)?(default|distributed|inductor)-(\d+)-(\d+)', d.name) if m: short_name = f"test-{m.group(1)}-{m.group(2)}-{m.group(3)}" + # The original artifact name ends with "_{job_id}.zip" where + # {job_id} is the upstream pytorch CI job id (e.g. + # ..._68613413431.zip). Carry it onto short_name so + # summarize_xml_testreports.py can link to that job. + job_id_match = re.search(r'_(\d{6,})\.zip$', d.name) + if job_id_match: + short_name += f"_{job_id_match.group(1)}" if not Path(short_name).exists(): d.rename(short_name) print(f" Renamed {d.name} -> {short_name}") @@ -297,6 +314,9 @@ def download_xml_files(workflow_run_id, workflow_run_attempts, prefixes=[], allo _shorten_unzipped_dirs() + with open("_wf_run_id", "w") as f: + f.write(str(workflow_run_id)) + # Delete raw zip files now that contents are extracted for path in artifact_paths: try: diff --git a/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py b/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py index 406f4b49b78cc..c17322d6e8dbd 100644 --- a/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py +++ b/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py @@ -3,6 +3,7 @@ import argparse import csv import os +import re import sys @@ -289,10 +290,12 @@ def collect_failed_tests(arch_data, archs, s1_name, s2_name): 'test_name': r.get('test_name', ''), 'test_config': r.get('test_config', ''), f'shard_{s1_name}': r.get(f'shard_{s1_name}', ''), + f'job_url_{s1_name}': r.get(f'job_url_{s1_name}', ''), f'status_{s1_name}': s1, } if has_set2: entry[f'shard_{s2_name}'] = r.get(f'shard_{s2_name}', '') + entry[f'job_url_{s2_name}'] = r.get(f'job_url_{s2_name}', '') entry[f'status_{s2_name}'] = s2 failed.append(entry) @@ -418,6 +421,7 @@ def load_flaky_tests_as_log_failures(filepaths): 'category': 'FLAKY', 'reason': 
f'{test_class}::{test_name}' if test_class else test_name, 'exit_codes': '', + 'job_url': row.get('job_url', ''), }) return entries @@ -691,6 +695,16 @@ def _xml_test_shard(t, platform): _norm_test_file(t.get('test_file', ''))) return _format_test_shards(shard_lookup.get(key, '')) + def _job_id_link(url): + if not url: + return '' + # Use the job id (digits after "/job/" in the URL) as the visible + # link label so the cell reads e.g. [76905282313](...). + m = re.search(r'/job/(\d+)', url) + if not m: + return '' + return f'[{m.group(1)}]({url})' + cols = ['Arch', 'Test Config', 'Test File', 'Test Class', 'Test Name', f'Job-Level Shard ({s1_name})', f'Test-Level Shard ({s1_name})'] @@ -701,6 +715,9 @@ def _xml_test_shard(t, platform): if has_set2: cols.append(f'Status ({s2_name})') cols.append('Also Failing In') + cols.append(f'Job ID ({s1_name})') + if has_set2: + cols.append(f'Job ID ({s2_name})') if s1_failed: lines.append(f'### FAILED TESTS ({len(s1_failed)})') @@ -718,7 +735,11 @@ def _xml_test_shard(t, platform): line += f" | {t[f'status_{s1_name}']}" if has_set2: line += f" | {t.get(f'status_{s2_name}', '')}" - line += f" | {t.get('also_failing_in', '')} |" + line += f" | {t.get('also_failing_in', '')}" + line += f" | {_job_id_link(t.get(f'job_url_{s1_name}', ''))}" + if has_set2: + line += f" | {_job_id_link(t.get(f'job_url_{s2_name}', ''))}" + line += ' |' lines.append(line) lines.append('') else: @@ -748,8 +769,8 @@ def _xml_test_shard(t, platform): lines.append('These test failures were detected from CI log files but have no XML report') lines.append('(typically due to timeouts, crashes, or process kills).') lines.append('') - lines.append('| Arch | Platform | Test Config | Test File | Test Class | Test Name | Job-Level Shard | Test-Level Shard | Category | Also Failing In |') - lines.append('| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |') + lines.append('| Arch | Platform | Test Config | Test File | Test Class | Test Name | Job-Level 
Shard | Test-Level Shard | Category | Also Failing In | Job ID |') + lines.append('| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |') for lf in rocm_log_failures: test_class, test_name = _parse_log_failure_names(lf) lines.append( @@ -759,7 +780,8 @@ def _xml_test_shard(t, platform): f"| {lf.get('job_shard', '')} " f"| {lf.get('test_shard', lf.get('shard', ''))} " f"| {lf.get('category', '')} " - f"| {lf.get('also_failing_in', '')} |" + f"| {lf.get('also_failing_in', '')} " + f"| {_job_id_link(lf.get('job_url', ''))} |" ) lines.append('') diff --git a/.automation_scripts/pytorch-unit-test-scripts/summarize_xml_testreports.py b/.automation_scripts/pytorch-unit-test-scripts/summarize_xml_testreports.py index 72e587bbf54bd..97c4424d89b03 100755 --- a/.automation_scripts/pytorch-unit-test-scripts/summarize_xml_testreports.py +++ b/.automation_scripts/pytorch-unit-test-scripts/summarize_xml_testreports.py @@ -69,6 +69,18 @@ def _extract_shard(dirname): def parse_xml_reports_as_dict(workflow_run_id, workflow_run_attempt, tag, path="."): test_config = "" test_cases = {} + + # download_testlogs writes the upstream pytorch CI workflow run id + # into "_wf_run_id" alongside the shard dirs. We combine it with each + # shard dir's trailing "_{job_id}" to form the URL + # https://github.com/pytorch/pytorch/actions/runs/{wf_run_id}/job/{job_id} + # surfaced as the "Job ID" column in the FAILED TESTS table. 
+ wf_run_id = "" + wf_id_file = os.path.join(path, "_wf_run_id") + if os.path.isfile(wf_id_file): + with open(wf_id_file) as f: + wf_run_id = f.read().strip() + items_list = os.listdir(path) for dir in items_list: new_dir = path + '/' + dir + '/' @@ -80,6 +92,11 @@ def parse_xml_reports_as_dict(workflow_run_id, workflow_run_attempt, tag, path=" elif "test-inductor" in new_dir: test_config = TestConfigName.inductor.name shard = _extract_shard(dir) + jid = re.search(r'_(\d+)$', dir) + job_url = ( + f"https://github.com/pytorch/pytorch/actions/runs/{wf_run_id}/job/{jid.group(1)}" + if wf_run_id and jid else "" + ) for xml_report in Path(new_dir).glob("**/*.xml"): try: new_cases = parse_xml_report( @@ -94,6 +111,7 @@ def parse_xml_reports_as_dict(workflow_run_id, workflow_run_attempt, tag, path=" continue for key, case in new_cases.items(): case["shard"] = shard + case["job_url"] = job_url existing = test_cases.get(key) if existing is None or _status_priority(case) > _status_priority(existing): test_cases[key] = case @@ -472,6 +490,8 @@ def summarize_xml_files(args): item_values["test_config"] = config_name item_values[f"shard_{set1_name}"] = v_values.get('shard', '') if v_values else '' item_values[f"shard_{set2_name}"] = v1_values.get('shard', '') if v1_values else '' + item_values[f"job_url_{set1_name}"] = v_values.get('job_url', '') if v_values else '' + item_values[f"job_url_{set2_name}"] = v1_values.get('job_url', '') if v1_values else '' # get test related info item_values[f"message_{set1_name}"] = get_test_message(v[0]) item_values[f"message_{set2_name}"] = get_test_message(v[1]) if set2_path else "" @@ -564,6 +584,10 @@ def sorting_key(e): return 21 elif e == f"shard_{set2_name}": return 22 + elif e == f"job_url_{set1_name}": + return 23 + elif e == f"job_url_{set2_name}": + return 24 elif e == "workflow_run_attempt" or e == "job_id": return 1000 else: