From 34cf4d10f7cd902525438e1fc4a76feb38a4c9eb Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 20 Aug 2025 10:29:01 +0000 Subject: [PATCH 01/12] Support names field in source maps This adds support for the `names` field in source maps, which contains function names. Source map mappings are correspondingly updated and emsymbolizer can now provide function name information using source maps alone. While source maps don't provide the full inlined hierarchies, this provides the name of the original (= pre-inlining) function, which may not exist in the final binary because it was inlined. This is because source maps are primarily intended for user debugging. This also demangles C++ function names using `llvm-cxxfilt`, so the printed names can be human-readable. I tested with `wasm-opt.wasm` from Binaryen, built with the `if (EMSCRIPTEN)` setup here: https://github.com/WebAssembly/binaryen/blob/95b2cf0a4ab2386f099568c5c61a02163770af32/CMakeLists.txt#L311-L372 with `-g -gsource-map`. With this PR and https://github.com/WebAssembly/binaryen/pull/8068, the source map file size increases by 3.5x (8632423 -> 30070042) primarily due to the function name strings. From `llvm-dwarfdump` output, this also requires additional parsing of `DW_TAG_subprogram` and `DW_TAG_inlined_subroutine` tags, which can be at any depth (because functions can be within nested namespaces or classes), so we cannot use `--recurse-depth=0` (#9580) anymore. In the case of `wasm-opt.wasm` built with DWARF info, without `--recurse-depth=0` in the command line, the size of its text output increased by 27.5x, but with the `--filter-child-tag` / `-t` option (https://github.com/llvm/llvm-project/pull/165720), the text output increased only (?) by 3.2x, which I think is tolerable. This disables `names` field generation when the `-t` option is not available in `llvm-dwarfdump`, because it was added recently. 
To avoid this text size problem, we can consider using DWARF-parsing Python libraries like https://github.com/eliben/pyelftools, but this would add another third-party dependency, so I'm not sure if it's worth it at this point. This also increased the running time of `wasm-sourcemap.py`, in the case of `wasm-opt.wasm`, by 2.3x (6.6s -> 15.4s), but compared to the linking time this was not very noticeable. Fixes #20715 and closes #25116. --- test/core/test_dwarf.cpp | 26 +++++ test/test_other.py | 89 ++++++++++----- tools/emsymbolizer.py | 8 +- tools/wasm-sourcemap.py | 233 +++++++++++++++++++++++++++++++++++++-- 4 files changed, 321 insertions(+), 35 deletions(-) create mode 100644 test/core/test_dwarf.cpp diff --git a/test/core/test_dwarf.cpp b/test/core/test_dwarf.cpp new file mode 100644 index 0000000000000..ad91ccda9cd4a --- /dev/null +++ b/test/core/test_dwarf.cpp @@ -0,0 +1,26 @@ +#include <emscripten.h> + +EM_JS(int, out_to_js, (int x), {}) + +class MyClass { +public: + void foo(); + void bar(); +}; + +void __attribute__((noinline)) MyClass::foo() { + out_to_js(0); // line 12 + out_to_js(1); + out_to_js(2); +} + +void __attribute__((always_inline)) MyClass::bar() { + out_to_js(3); + __builtin_trap(); // line 19 +} + +int main() { + MyClass mc; + mc.foo(); + mc.bar(); +} diff --git a/test/test_other.py b/test/test_other.py index 07f1b3f090f0a..e6f7e4168252b 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -9639,12 +9639,50 @@ def check_dwarf_loc_info(address, funcs, locs): for loc in locs: self.assertIn(loc, out) - def check_source_map_loc_info(address, loc): + def check_source_map_loc_info(address, func, loc): out = self.run_process( [emsymbolizer, '-s', 'sourcemap', 'test_dwarf.wasm', address], stdout=PIPE).stdout + self.assertIn(func, out) self.assertIn(loc, out) + def do_tests(src): + # 1. Test DWARF + source map together + # For DWARF, we check for the full inlined info for both function names and + # source locations. Source maps does not provide inlined info. 
So we only + # check for the info of the outermost function. + self.run_process([EMCC, test_file(src), '-g', '-gsource-map', '-O1', '-o', + 'test_dwarf.js']) + check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, + out_to_js_call_loc) + check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_func[0], + out_to_js_call_loc[0]) + check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) + # Source map shows the original (inlined) source location with the original + # function name + check_source_map_loc_info(unreachable_addr, unreachable_func[0], + unreachable_loc[0]) + + # 2. Test source map only + # The addresses, function names, and source locations are the same across + # the builds because they are relative offsets from the code section, so we + # don't need to recompute them + self.run_process([EMCC, test_file(src), '-gsource-map', '-O1', '-o', + 'test_dwarf.js']) + check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_func[0], + out_to_js_call_loc[0]) + check_source_map_loc_info(unreachable_addr, unreachable_func[0], + unreachable_loc[0]) + + # 3. Test DWARF only + self.run_process([EMCC, test_file(src), '-g', '-O1', '-o', + 'test_dwarf.js']) + check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, + out_to_js_call_loc) + check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) + + + # -- C program test -- # We test two locations within test_dwarf.c: # out_to_js(0); // line 6 # __builtin_trap(); // line 13 @@ -9667,31 +9705,32 @@ def check_source_map_loc_info(address, loc): # The first one corresponds to the innermost inlined location. unreachable_loc = ['test_dwarf.c:13:3', 'test_dwarf.c:18:3'] - # 1. Test DWARF + source map together - # For DWARF, we check for the full inlined info for both function names and - # source locations. Source maps provide neither function names nor inlined - # info. So we only check for the source location of the outermost function. 
- check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, - out_to_js_call_loc) - check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_loc[0]) - check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) - check_source_map_loc_info(unreachable_addr, unreachable_loc[0]) - - # 2. Test source map only - # The addresses, function names, and source locations are the same across - # the builds because they are relative offsets from the code section, so we - # don't need to recompute them - self.run_process([EMCC, test_file('core/test_dwarf.c'), - '-gsource-map', '-O1', '-o', 'test_dwarf.js']) - check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_loc[0]) - check_source_map_loc_info(unreachable_addr, unreachable_loc[0]) + do_tests('core/test_dwarf.c') - # 3. Test DWARF only - self.run_process([EMCC, test_file('core/test_dwarf.c'), - '-g', '-O1', '-o', 'test_dwarf.js']) - check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, - out_to_js_call_loc) - check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) + # -- C++ program test -- + # We test two locations within test_dwarf.cpp: + # out_to_js(0); // line 12 + # __builtin_trap(); // line 19 + self.run_process([EMCC, test_file('core/test_dwarf.cpp'), + '-g', '-gsource-map', '-O1', '-o', 'test_dwarf.js']) + # Address of out_to_js(0) within MyClass::foo(), uninlined + out_to_js_call_addr = self.get_instr_addr('call\t0', 'test_dwarf.wasm') + # Address of __builtin_trap() within MyClass::bar(), inlined into main() + unreachable_addr = self.get_instr_addr('unreachable', 'test_dwarf.wasm') + + # Function name of out_to_js(0) within MyClass::foo(), uninlined + out_to_js_call_func = ['MyClass::foo()'] + # Function names of __builtin_trap() within MyClass::bar(), inlined into + # main(). The first one corresponds to the innermost inlined function. 
+ unreachable_func = ['MyClass::bar()', 'main'] + + # Source location of out_to_js(0) within MyClass::foo(), uninlined + out_to_js_call_loc = ['test_dwarf.cpp:12:3'] + # Source locations of __builtin_trap() within MyClass::bar(), inlined into + # main(). The first one corresponds to the innermost inlined location. + unreachable_loc = ['test_dwarf.cpp:19:3', 'test_dwarf.cpp:25:6'] + + do_tests('core/test_dwarf.cpp') def test_emsymbolizer_functions(self): 'Test emsymbolizer use cases that only provide function-granularity info' diff --git a/tools/emsymbolizer.py b/tools/emsymbolizer.py index c71fc26da890d..62f6b7830a0a9 100755 --- a/tools/emsymbolizer.py +++ b/tools/emsymbolizer.py @@ -118,6 +118,7 @@ class Location: def __init__(self): self.version = None self.sources = [] + self.funcs = [] self.mappings = {} self.offsets = [] @@ -129,6 +130,7 @@ def parse(self, filename): self.version = source_map_json['version'] self.sources = source_map_json['sources'] + self.funcs = source_map_json['names'] chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=' vlq_map = {c: i for i, c in enumerate(chars)} @@ -156,6 +158,7 @@ def decodeVLQ(string): src = 0 line = 1 col = 1 + func = 0 for segment in source_map_json['mappings'].split(','): data = decodeVLQ(segment) info = [] @@ -170,7 +173,9 @@ def decodeVLQ(string): if len(data) >= 4: col += data[3] info.append(col) - # TODO: see if we need the name, which is the next field (data[4]) + if len(data) == 5: + func += data[4] + info.append(func) self.mappings[offset] = WasmSourceMap.Location(*info) self.offsets.append(offset) @@ -208,6 +213,7 @@ def lookup(self, offset, lower_bound=None): self.sources[info.source] if info.source is not None else None, info.line, info.column, + self.funcs[info.func] if info.func is not None else None, ) diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index 8d9fefc4fcc7f..bbdd46b130a54 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -24,9 +24,11 
@@ __rootdir__ = os.path.dirname(__scriptdir__) sys.path.insert(0, __rootdir__) -from tools import utils +from tools import utils, shared from tools.system_libs import DETERMINISTIC_PREFIX +LLVM_CXXFILT = shared.llvm_tool_path('llvm-cxxfilt') + EMSCRIPTEN_PREFIX = utils.normalize_path(utils.path_from_root()) logger = logging.getLogger('wasm-sourcemap') @@ -231,7 +233,158 @@ def extract_comp_dir_map(text): return map_stmt_list_to_comp_dir -def read_dwarf_entries(wasm, options): +def demangle_names(names): + # Only demangle names that look mangled + mangled_names = sorted(list({n for n in names if n.startswith('_Z')})) + if not mangled_names: + return {} + if not os.path.exists(LLVM_CXXFILT): + logger.warning('llvm-cxxfilt does not exist') + return {} + + # Gather all mangled names and call llvm-cxxfilt only once for all of them + try: + input_str = '\n'.join(mangled_names) + process = Popen([LLVM_CXXFILT], stdin=PIPE, stdout=PIPE, stderr=PIPE, text=True) + stdout, stderr = process.communicate(input=input_str) + if process.returncode != 0: + logger.warning('llvm-cxxfilt failed: %s' % stderr) + return {} + + demangled_list = stdout.splitlines() + if len(demangled_list) != len(mangled_names): + logger.warning('llvm-cxxfilt output length mismatch') + return {} + + return dict(zip(mangled_names, demangled_list)) + except OSError: + logger.warning('Failed to run llvm-cxxfilt') + return {} + + +class FuncRange: + def __init__(self, name, low_pc, high_pc): + self.name = name + self.low_pc = low_pc + self.high_pc = high_pc + + +# This function parses DW_TAG_subprogram entries and gets low_pc and high_pc for +# each function in a list of FuncRanges. The result list will be sorted in the +# increasing order of low_pcs. +def extract_func_ranges(text): + # This function handles four cases: + # 1. DW_TAG_subprogram with DW_AT_name, DW_AT_low_pc, and DW_AT_high_pc. 
+ # 0x000000ba: DW_TAG_subprogram + # DW_AT_low_pc (0x0000005f) + # DW_AT_high_pc (0x00000071) + # DW_AT_name ("foo") + # ... + # + # 2. DW_TAG_subprogram with DW_AT_linkage_name, DW_AT_low_pc, and + # DW_AT_high_pc. Applies to mangled C++ functions. + # (We parse DW_AT_linkage_name instead of DW_AT_name here.) + # 0x000000ba: DW_TAG_subprogram + # DW_AT_low_pc (0x0000005f) + # DW_AT_high_pc (0x00000071) + # DW_AT_linkage_name ("_ZN7MyClass3fooEv") + # DW_AT_name ("foo") + # ... + # + # 3. DW_TAG_subprogram with DW_AT_specification, DW_AT_low_pc, and + # DW_AT_high_pc. C++ function info can be split into two DIEs (one with + # DW_AT_linkage_name and DW_AT_declaration (true) and the other with + # DW_AT_specification). In this case we parse DW_AT_specification for the + # function name. + # 0x0000006d: DW_TAG_subprogram + # DW_AT_linkage_name ("_ZN7MyClass3fooEv") + # DW_AT_name ("foo") + # DW_AT_declaration (true) + # ... + # 0x00000097: DW_TAG_subprogram + # DW_AT_low_pc (0x00000007) + # DW_AT_high_pc (0x0000004c) + # DW_AT_specification (0x0000006d "_ZN7MyClass3fooEv") + # ... + # + # 4. DW_TAG_inlined_subroutine with DW_AT_abstract_origin, DW_AT_low_pc, and + # DW_AT_high_pc. This represents an inlined function. We parse + # DW_AT_abstract_origin for the original function name. + # 0x0000011a: DW_TAG_inlined_subroutine + # DW_AT_abstract_origin (0x000000da "_ZN7MyClass3barEv") + # DW_AT_low_pc (0x00000078) + # DW_AT_high_pc (0x00000083) + # ... 
+ + func_ranges = [] + dw_tags = re.split(r'\r?\n(?=0x[0-9a-f]+:)', text) + + def get_name_from_tag(tag): + m = re.search(r'DW_AT_linkage_name\s+\("([^"]+)"\)', tag) + if m: + return m.group(1) + m = re.search(r'DW_AT_name\s+\("([^"]+)"\)', tag) + if m: + return m.group(1) + # If name is missing, check for DW_AT_specification annotation + m = re.search(r'DW_AT_specification\s+\(0x[0-9a-f]+\s+"([^"]+)"\)', tag) + if m: + return m.group(1) + return None + + for tag in dw_tags: + is_subprogram = re.search(r"0x[0-9a-f]+:\s+DW_TAG_subprogram", tag) + is_inlined = re.search(r"0x[0-9a-f]+:\s+DW_TAG_inlined_subroutine", tag) + if is_subprogram or is_inlined: + name = None + low_pc = None + high_pc = None + m = re.search(r'DW_AT_low_pc\s+\(0x([0-9a-f]+)\)', tag) + if m: + low_pc = int(m.group(1), 16) + m = re.search(r'DW_AT_high_pc\s+\(0x([0-9a-f]+)\)', tag) + if m: + high_pc = int(m.group(1), 16) + if is_subprogram: + name = get_name_from_tag(tag) + else: # is_inlined + m = re.search(r'DW_AT_abstract_origin\s+\(0x[0-9a-f]+\s+"([^"]+)"\)', tag) + if m: + name = m.group(1) + if name and low_pc is not None and high_pc is not None: + func_ranges.append(FuncRange(name, low_pc, high_pc)) + + # Demangle names + all_names = [item.name for item in func_ranges] + demangled_map = demangle_names(all_names) + for func_range in func_ranges: + if func_range.name in demangled_map: + func_range.name = demangled_map[func_range.name] + + # To correctly identify the innermost function for a given address, + # func_ranges is sorted primarily by low_pc in ascending order and secondarily + # by high_pc in descending order. This ensures that for overlapping ranges, + # the more specific (inner) range appears later in the list. 
+ func_ranges.sort(key=lambda item: (item.low_pc, -item.high_pc)) + return func_ranges + + +# Returns true if the given llvm-dwarfdump has --filter-child-tags / -t option +def has_filter_child_tag_option(dwarfdump): + # To check if --filter-child-tags / -t option is available, run + # `llvm-dwarfdump -t`. If it is available, it will print to stderr: + # ... for the -t option: requires a value! + # If not, it will print: + # ... Unknown command line argument '-t'. + try: + process = Popen([dwarfdump, '-t'], stdout=PIPE, stderr=PIPE, text=True) + _, err = process.communicate() + return 'requires a value' in err + except OSError: + return False + + +def read_dwarf_info(wasm, options): if options.dwarfdump_output: output = Path(options.dwarfdump_output).read_bytes() elif options.dwarfdump: @@ -239,7 +392,24 @@ def read_dwarf_entries(wasm, options): if not os.path.exists(options.dwarfdump): logger.error('llvm-dwarfdump not found: ' + options.dwarfdump) sys.exit(1) - process = Popen([options.dwarfdump, '-debug-info', '-debug-line', '--recurse-depth=0', wasm], stdout=PIPE) + dwarfdump_cmd = [options.dwarfdump, '-debug-info', '-debug-line', wasm] + + # Recently --filter-child-tag / -t option was added to llvm-dwarfdump prune + # tags. Because it is a recent addition, check if it exists in the user's + # llvm-dwarfdump. If not, print only the top-level DW_TAG_compile_units for + # source location info and don't generate 'names' field. + if has_filter_child_tag_option(options.dwarfdump): + # We need only three tags in the debug info: DW_TAG_compile_unit for + # source location, and DW_TAG_subprogram and DW_TAG_inlined_subroutine + # for the function ranges. + dwarfdump_cmd += ['-t', 'DW_TAG_compile_unit', '-t', 'DW_TAG_subprogram', + '-t', 'DW_TAG_inlined_subroutine'] + else: + logger.warning('llvm-dwarfdump does not support -t. 
"names" field will not be generated in the source map.') + # Only print DW_TAG_compile_units + dwarfdump_cmd += ['--recurse-depth=0'] + + process = Popen(dwarfdump_cmd, stdout=PIPE, stderr=PIPE) output, err = process.communicate() exit_code = process.wait() if exit_code != 0: @@ -296,22 +466,61 @@ def read_dwarf_entries(wasm, options): remove_dead_entries(entries) # return entries sorted by the address field - return sorted(entries, key=lambda entry: entry['address']) + entries = sorted(entries, key=lambda entry: entry['address']) + func_ranges = extract_func_ranges(debug_line_chunks[0]) + return entries, func_ranges -def build_sourcemap(entries, code_section_offset, options): + +def build_sourcemap(entries, func_ranges, code_section_offset, options): base_path = options.basepath collect_sources = options.sources prefixes = SourceMapPrefixes(options.prefix, options.load_prefix, base_path) + # Add code section offset to the low/high pc in the function PC ranges + for func_range in func_ranges: + func_range.low_pc += code_section_offset + func_range.high_pc += code_section_offset + sources = [] sources_content = [] + # There can be duplicate names in case an original source function has + # multiple disjoint PC ranges or is inlined to multiple callsites. Make the + # 'names' list a unique list of names, and map the function ranges to the + # indices in that list. + names = sorted(list(set([item.name for item in func_ranges]))) + name_to_id = {name: i for i, name in enumerate(names)} mappings = [] sources_map = {} last_address = 0 last_source_id = 0 last_line = 1 last_column = 1 + last_func_id = 0 + + active_funcs = [] + next_func_range_id = 0 + + # Get the function name ID that the given address falls into + def get_function_id(address): + nonlocal active_funcs + nonlocal next_func_range_id + + # Maintain a list of "active functions" whose ranges currently cover the + # address. 
As the address advances, it adds new functions that start and + # removes functions that end. The last function remaining in the active list + # at any point is the innermost function. + while next_func_range_id < len(func_ranges) and func_ranges[next_func_range_id].low_pc <= address: + # active_funcs contains (high_pc, id) pair + active_funcs.append((func_ranges[next_func_range_id].high_pc, next_func_range_id)) + next_func_range_id += 1 + active_funcs = [f for f in active_funcs if f[0] > address] + + if active_funcs: + func_range_id = active_funcs[-1][1] + name = func_ranges[func_range_id].name + return name_to_id[name] + return None for entry in entries: line = entry['line'] @@ -342,21 +551,27 @@ def build_sourcemap(entries, code_section_offset, options): sources_content.append(None) else: source_id = sources_map[source_name] + func_id = get_function_id(address) address_delta = address - last_address source_id_delta = source_id - last_source_id line_delta = line - last_line column_delta = column - last_column - mappings.append(encode_vlq(address_delta) + encode_vlq(source_id_delta) + encode_vlq(line_delta) + encode_vlq(column_delta)) last_address = address last_source_id = source_id last_line = line last_column = column + mapping = encode_vlq(address_delta) + encode_vlq(source_id_delta) + encode_vlq(line_delta) + encode_vlq(column_delta) + if func_id is not None: + func_id_delta = func_id - last_func_id + last_func_id = func_id + mapping += encode_vlq(func_id_delta) + mappings.append(mapping) return {'version': 3, 'sources': sources, 'sourcesContent': sources_content, - 'names': [], + 'names': names, 'mappings': ','.join(mappings)} @@ -367,12 +582,12 @@ def main(args): with open(wasm_input, 'rb') as infile: wasm = infile.read() - entries = read_dwarf_entries(wasm_input, options) + entries, func_ranges = read_dwarf_info(wasm_input, options) code_section_offset = get_code_section_offset(wasm) logger.debug('Saving to %s' % options.output) - map = 
build_sourcemap(entries, code_section_offset, options) + map = build_sourcemap(entries, func_ranges, code_section_offset, options) with open(options.output, 'w', encoding='utf-8') as outfile: json.dump(map, outfile, separators=(',', ':'), ensure_ascii=False) From 84f14d36f8fe4c14c5d4c2e28cba3dc7d0cb878d Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 26 Nov 2025 02:52:54 +0000 Subject: [PATCH 02/12] ruff fix --- tools/wasm-sourcemap.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index bbdd46b130a54..62a7a4723787c 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -24,7 +24,7 @@ __rootdir__ = os.path.dirname(__scriptdir__) sys.path.insert(0, __rootdir__) -from tools import utils, shared +from tools import shared, utils from tools.system_libs import DETERMINISTIC_PREFIX LLVM_CXXFILT = shared.llvm_tool_path('llvm-cxxfilt') @@ -235,7 +235,7 @@ def extract_comp_dir_map(text): def demangle_names(names): # Only demangle names that look mangled - mangled_names = sorted(list({n for n in names if n.startswith('_Z')})) + mangled_names = sorted({n for n in names if n.startswith('_Z')}) if not mangled_names: return {} if not os.path.exists(LLVM_CXXFILT): @@ -488,7 +488,7 @@ def build_sourcemap(entries, func_ranges, code_section_offset, options): # multiple disjoint PC ranges or is inlined to multiple callsites. Make the # 'names' list a unique list of names, and map the function ranges to the # indices in that list. 
- names = sorted(list(set([item.name for item in func_ranges]))) + names = sorted(set([item.name for item in func_ranges])) name_to_id = {name: i for i, name in enumerate(names)} mappings = [] sources_map = {} From 1e78e2d3965ed0f66ab6b9ee112a5ae1f0719791 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 26 Nov 2025 02:55:30 +0000 Subject: [PATCH 03/12] ruff fix 2 --- tools/wasm-sourcemap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index 62a7a4723787c..7435713399fc5 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -488,7 +488,7 @@ def build_sourcemap(entries, func_ranges, code_section_offset, options): # multiple disjoint PC ranges or is inlined to multiple callsites. Make the # 'names' list a unique list of names, and map the function ranges to the # indices in that list. - names = sorted(set([item.name for item in func_ranges])) + names = sorted(set(item.name for item in func_ranges)) name_to_id = {name: i for i, name in enumerate(names)} mappings = [] sources_map = {} From 161757ac1745e16fbd167fd3d4e445c2bdb2b3ee Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 26 Nov 2025 03:22:43 +0000 Subject: [PATCH 04/12] ruff fix 3 --- tools/wasm-sourcemap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index 7435713399fc5..b7075ded76873 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -488,7 +488,7 @@ def build_sourcemap(entries, func_ranges, code_section_offset, options): # multiple disjoint PC ranges or is inlined to multiple callsites. Make the # 'names' list a unique list of names, and map the function ranges to the # indices in that list. 
- names = sorted(set(item.name for item in func_ranges)) + names = sorted({item.name for item in func_ranges}) name_to_id = {name: i for i, name in enumerate(names)} mappings = [] sources_map = {} From 131827cc899ad0c031ed6834594b96a01bbdd6a0 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 26 Nov 2025 03:25:28 +0000 Subject: [PATCH 05/12] ruff fix 4 (I should install it probably) --- test/test_other.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_other.py b/test/test_other.py index e6f7e4168252b..e419997dda2af 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -9681,7 +9681,6 @@ def do_tests(src): out_to_js_call_loc) check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) - # -- C program test -- # We test two locations within test_dwarf.c: # out_to_js(0); // line 6 From d36dad5f1291eb38b8637133683b883184e11fed Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Thu, 27 Nov 2025 03:52:58 +0000 Subject: [PATCH 06/12] Remove fallbacks in case -t doesn't exist --- tools/wasm-sourcemap.py | 37 ++++++------------------------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index b7075ded76873..7706617d4694f 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -369,21 +369,6 @@ def get_name_from_tag(tag): return func_ranges -# Returns true if the given llvm-dwarfdump has --filter-child-tags / -t option -def has_filter_child_tag_option(dwarfdump): - # To check if --filter-child-tags / -t option is available, run - # `llvm-dwarfdump -t`. If it is available, it will print to stderr: - # ... for the -t option: requires a value! - # If not, it will print: - # ... Unknown command line argument '-t'. 
- try: - process = Popen([dwarfdump, '-t'], stdout=PIPE, stderr=PIPE, text=True) - _, err = process.communicate() - return 'requires a value' in err - except OSError: - return False - - def read_dwarf_info(wasm, options): if options.dwarfdump_output: output = Path(options.dwarfdump_output).read_bytes() @@ -392,23 +377,13 @@ def read_dwarf_info(wasm, options): if not os.path.exists(options.dwarfdump): logger.error('llvm-dwarfdump not found: ' + options.dwarfdump) sys.exit(1) - dwarfdump_cmd = [options.dwarfdump, '-debug-info', '-debug-line', wasm] - - # Recently --filter-child-tag / -t option was added to llvm-dwarfdump prune - # tags. Because it is a recent addition, check if it exists in the user's - # llvm-dwarfdump. If not, print only the top-level DW_TAG_compile_units for - # source location info and don't generate 'names' field. - if has_filter_child_tag_option(options.dwarfdump): - # We need only three tags in the debug info: DW_TAG_compile_unit for - # source location, and DW_TAG_subprogram and DW_TAG_inlined_subroutine - # for the function ranges. - dwarfdump_cmd += ['-t', 'DW_TAG_compile_unit', '-t', 'DW_TAG_subprogram', - '-t', 'DW_TAG_inlined_subroutine'] - else: - logger.warning('llvm-dwarfdump does not support -t. "names" field will not be generated in the source map.') - # Only print DW_TAG_compile_units - dwarfdump_cmd += ['--recurse-depth=0'] + # We need only three tags in the debug info: DW_TAG_compile_unit for + # source location, and DW_TAG_subprogram and DW_TAG_inlined_subroutine + # for the function ranges. 
+ dwarfdump_cmd = [options.dwarfdump, '-debug-info', '-debug-line', wasm, + '-t', 'DW_TAG_compile_unit', '-t', 'DW_TAG_subprogram', + '-t', 'DW_TAG_inlined_subroutine'] process = Popen(dwarfdump_cmd, stdout=PIPE, stderr=PIPE) output, err = process.communicate() exit_code = process.wait() From 3bae4297402175f66b1151b0c8c227c705e40e53 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Thu, 4 Dec 2025 01:16:44 +0000 Subject: [PATCH 07/12] Use shared.check_call for llvm-cxxfilt --- tools/wasm-sourcemap.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index 078778baba966..bdb01e8f2e90e 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -243,13 +243,12 @@ def demangle_names(names): # Gather all mangled names and call llvm-cxxfilt only once for all of them try: input_str = '\n'.join(mangled_names) - process = Popen([LLVM_CXXFILT], stdin=PIPE, stdout=PIPE, stderr=PIPE, text=True) - stdout, stderr = process.communicate(input=input_str) - if process.returncode != 0: - logger.warning('llvm-cxxfilt failed: %s' % stderr) + proc = shared.check_call([LLVM_CXXFILT], input=input_str, stdout=shared.PIPE, stderr=shared.PIPE, text=True) + if proc.returncode != 0: + logger.warning('llvm-cxxfilt failed: %s' % proc.stderr) return {} - demangled_list = stdout.splitlines() + demangled_list = proc.stdout.splitlines() if len(demangled_list) != len(mangled_names): logger.warning('llvm-cxxfilt output length mismatch') return {} From c8442567c45ce4613e5b4c6ff749bff2e41b749b Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Thu, 4 Dec 2025 01:32:59 +0000 Subject: [PATCH 08/12] Remove try/except --- tools/wasm-sourcemap.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index bdb01e8f2e90e..b68e988d8684b 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -241,23 +241,19 @@ def 
demangle_names(names): return {} # Gather all mangled names and call llvm-cxxfilt only once for all of them - try: - input_str = '\n'.join(mangled_names) - proc = shared.check_call([LLVM_CXXFILT], input=input_str, stdout=shared.PIPE, stderr=shared.PIPE, text=True) - if proc.returncode != 0: - logger.warning('llvm-cxxfilt failed: %s' % proc.stderr) - return {} - - demangled_list = proc.stdout.splitlines() - if len(demangled_list) != len(mangled_names): - logger.warning('llvm-cxxfilt output length mismatch') - return {} - - return dict(zip(mangled_names, demangled_list)) - except OSError: - logger.warning('Failed to run llvm-cxxfilt') + input_str = '\n'.join(mangled_names) + proc = shared.check_call([LLVM_CXXFILT], input=input_str, stdout=shared.PIPE, stderr=shared.PIPE, text=True) + if proc.returncode != 0: + logger.warning('llvm-cxxfilt failed: %s' % proc.stderr) return {} + demangled_list = proc.stdout.splitlines() + if len(demangled_list) != len(mangled_names): + logger.warning('llvm-cxxfilt output length mismatch') + return {} + + return dict(zip(mangled_names, demangled_list)) + class FuncRange: def __init__(self, name, low_pc, high_pc): From d9c0232498db12f6b754f8aa8da23839faad82ab Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Thu, 4 Dec 2025 01:41:35 +0000 Subject: [PATCH 09/12] ruff fix --- tools/wasm-sourcemap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index b68e988d8684b..14c1c134d606b 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -252,7 +252,7 @@ def demangle_names(names): logger.warning('llvm-cxxfilt output length mismatch') return {} - return dict(zip(mangled_names, demangled_list)) + return dict(zip(mangled_names, demangled_list, strict=True)) class FuncRange: From 007396bdcfbb31440c868db3f293372de976d4bc Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Thu, 4 Dec 2025 01:47:41 +0000 Subject: [PATCH 10/12] Update ChangeLog --- ChangeLog.md | 4 ++++ 1 
file changed, 4 insertions(+) diff --git a/ChangeLog.md b/ChangeLog.md index d056b7699f631..e474f14d34ffd 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -20,6 +20,10 @@ See docs/process.md for more on how version tagging works. 4.0.22 (in development) ----------------------- +- Source maps now support 'names' field with function name information. + emsymbolizer will show function names when used with a source map. The size + of source maps and source map creation time may increase 2-3x, depending on + applications. (#25870) - The minimum version of python required to run emscripten was updated from 3.8 to 3.10. (#25891) From ecf51183f0ef6d226aeb12138f2966243d1bd8e3 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Thu, 4 Dec 2025 01:49:21 +0000 Subject: [PATCH 11/12] Edit ChangeLog --- ChangeLog.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index e474f14d34ffd..2261f59300115 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -22,8 +22,8 @@ See docs/process.md for more on how version tagging works. ----------------------- - Source maps now support 'names' field with function name information. emsymbolizer will show function names when used with a source map. The size - of source maps and source map creation time may increase 2-3x, depending on - applications. (#25870) + of source maps may increase 2-3x and the link time can increase slightly due + to more processing on source map creation. (#25870) - The minimum version of python required to run emscripten was updated from 3.8 to 3.10. 
(#25891) From 796c42296173db9e996fc2e871c8735678806cd3 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Thu, 4 Dec 2025 03:02:59 +0000 Subject: [PATCH 12/12] Use precompiled regexes --- tools/wasm-sourcemap.py | 52 ++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index 14c1c134d606b..63ae7ea1c8b49 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -219,13 +219,17 @@ def decode_octal_encoded_utf8(str): def extract_comp_dir_map(text): + compile_unit_pattern = re.compile(r"0x[0-9a-f]*: DW_TAG_compile_unit") + stmt_list_pattern = re.compile(r"DW_AT_stmt_list\s+\((0x[0-9a-f]*)\)") + comp_dir_pattern = re.compile(r"DW_AT_comp_dir\s+\(\"([^\"]+)\"\)") + map_stmt_list_to_comp_dir = {} - chunks = re.split(r"0x[0-9a-f]*: DW_TAG_compile_unit", text) + chunks = compile_unit_pattern.split(text) # DW_TAG_compile_unit for chunk in chunks[1:]: - stmt_list_match = re.search(r"DW_AT_stmt_list\s+\((0x[0-9a-f]*)\)", chunk) + stmt_list_match = stmt_list_pattern.search(chunk) # DW_AT_stmt_list if stmt_list_match is not None: stmt_list = stmt_list_match.group(1) - comp_dir_match = re.search(r"DW_AT_comp_dir\s+\(\"([^\"]+)\"\)", chunk) + comp_dir_match = comp_dir_pattern.search(chunk) # DW_AT_comp_dir comp_dir = decode_octal_encoded_utf8(comp_dir_match.group(1)) if comp_dir_match is not None else '' map_stmt_list_to_comp_dir[stmt_list] = comp_dir return map_stmt_list_to_comp_dir @@ -309,39 +313,50 @@ def extract_func_ranges(text): # DW_AT_high_pc (0x00000083) # ... 
+ tag_pattern = re.compile(r'\r?\n(?=0x[0-9a-f]+:)') + subprogram_pattern = re.compile(r"0x[0-9a-f]+:\s+DW_TAG_subprogram") + inlined_pattern = re.compile(r"0x[0-9a-f]+:\s+DW_TAG_inlined_subroutine") + low_pc_pattern = re.compile(r'DW_AT_low_pc\s+\(0x([0-9a-f]+)\)') + high_pc_pattern = re.compile(r'DW_AT_high_pc\s+\(0x([0-9a-f]+)\)') + abstract_origin_pattern = re.compile(r'DW_AT_abstract_origin\s+\(0x[0-9a-f]+\s+"([^"]+)"\)') + linkage_name_pattern = re.compile(r'DW_AT_linkage_name\s+\("([^"]+)"\)') + name_pattern = re.compile(r'DW_AT_name\s+\("([^"]+)"\)') + specification_pattern = re.compile(r'DW_AT_specification\s+\(0x[0-9a-f]+\s+"([^"]+)"\)') + func_ranges = [] - dw_tags = re.split(r'\r?\n(?=0x[0-9a-f]+:)', text) + dw_tags = tag_pattern.split(text) def get_name_from_tag(tag): - m = re.search(r'DW_AT_linkage_name\s+\("([^"]+)"\)', tag) + m = linkage_name_pattern.search(tag) # DW_AT_linkage_name if m: return m.group(1) - m = re.search(r'DW_AT_name\s+\("([^"]+)"\)', tag) + m = name_pattern.search(tag) # DW_AT_name if m: return m.group(1) # If name is missing, check for DW_AT_specification annotation - m = re.search(r'DW_AT_specification\s+\(0x[0-9a-f]+\s+"([^"]+)"\)', tag) + m = specification_pattern.search(tag) if m: return m.group(1) return None for tag in dw_tags: - is_subprogram = re.search(r"0x[0-9a-f]+:\s+DW_TAG_subprogram", tag) - is_inlined = re.search(r"0x[0-9a-f]+:\s+DW_TAG_inlined_subroutine", tag) + is_subprogram = subprogram_pattern.search(tag) # DW_TAG_subprogram + is_inlined = inlined_pattern.search(tag) # DW_TAG_inlined_subroutine + if is_subprogram or is_inlined: name = None low_pc = None high_pc = None - m = re.search(r'DW_AT_low_pc\s+\(0x([0-9a-f]+)\)', tag) + m = low_pc_pattern.search(tag) # DW_AT_low_pc if m: low_pc = int(m.group(1), 16) - m = re.search(r'DW_AT_high_pc\s+\(0x([0-9a-f]+)\)', tag) + m = high_pc_pattern.search(tag) # DW_AT_high_pc if m: high_pc = int(m.group(1), 16) if is_subprogram: name = get_name_from_tag(tag) else: # 
is_inlined - m = re.search(r'DW_AT_abstract_origin\s+\(0x[0-9a-f]+\s+"([^"]+)"\)', tag) + m = abstract_origin_pattern.search(tag) # DW_AT_abstract_origin if m: name = m.group(1) if name and low_pc is not None and high_pc is not None: @@ -380,8 +395,13 @@ def read_dwarf_info(wasm, options): else: utils.exit_with_error('Please specify either --dwarfdump or --dwarfdump-output') + debug_line_pattern = re.compile(r"debug_line\[(0x[0-9a-f]*)\]") + include_dir_pattern = re.compile(r"include_directories\[\s*(\d+)\] = \"([^\"]*)") + file_pattern = re.compile(r"file_names\[\s*(\d+)\]:\s+name: \"([^\"]*)\"\s+dir_index: (\d+)") + line_pattern = re.compile(r"\n0x([0-9a-f]+)\s+(\d+)\s+(\d+)\s+(\d+)(.*?end_sequence)?") + entries = [] - debug_line_chunks = re.split(r"debug_line\[(0x[0-9a-f]*)\]", output) + debug_line_chunks = debug_line_pattern.split(output) map_stmt_list_to_comp_dir = extract_comp_dir_map(debug_line_chunks[0]) for stmt_list, line_chunk in zip(debug_line_chunks[1::2], debug_line_chunks[2::2], strict=True): comp_dir = map_stmt_list_to_comp_dir.get(stmt_list, '') @@ -402,16 +422,16 @@ def read_dwarf_info(wasm, options): # 0x0000000000000011 28 0 1 0 0 is_stmt include_directories = {'0': comp_dir} - for dir in re.finditer(r"include_directories\[\s*(\d+)\] = \"([^\"]*)", line_chunk): + for dir in include_dir_pattern.finditer(line_chunk): include_directories[dir.group(1)] = os.path.join(comp_dir, decode_octal_encoded_utf8(dir.group(2))) files = {} - for file in re.finditer(r"file_names\[\s*(\d+)\]:\s+name: \"([^\"]*)\"\s+dir_index: (\d+)", line_chunk): + for file in file_pattern.finditer(line_chunk): dir = include_directories[file.group(3)] file_path = os.path.join(dir, decode_octal_encoded_utf8(file.group(2))) files[file.group(1)] = file_path - for line in re.finditer(r"\n0x([0-9a-f]+)\s+(\d+)\s+(\d+)\s+(\d+)(.*?end_sequence)?", line_chunk): + for line in line_pattern.finditer(line_chunk): entry = {'address': int(line.group(1), 16), 'line': int(line.group(2)), 
'column': int(line.group(3)), 'file': files[line.group(4)], 'eos': line.group(5) is not None} if not entry['eos']: entries.append(entry)