diff --git a/ChangeLog.md b/ChangeLog.md index d056b7699f631..2261f59300115 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -20,6 +20,10 @@ See docs/process.md for more on how version tagging works. 4.0.22 (in development) ----------------------- +- Source maps now support the 'names' field with function name information. + emsymbolizer will show function names when used with a source map. The size + of source maps may increase 2-3x and the link time can increase slightly due + to more processing on source map creation. (#25870) - The minimum version of python required to run emscripten was updated from 3.8 to 3.10. (#25891) diff --git a/test/core/test_dwarf.cpp b/test/core/test_dwarf.cpp new file mode 100644 index 0000000000000..ad91ccda9cd4a --- /dev/null +++ b/test/core/test_dwarf.cpp @@ -0,0 +1,26 @@ +#include <emscripten.h> + +EM_JS(int, out_to_js, (int x), {}) + +class MyClass { +public: + void foo(); + void bar(); +}; + +void __attribute__((noinline)) MyClass::foo() { + out_to_js(0); // line 12 + out_to_js(1); + out_to_js(2); +} + +void __attribute__((always_inline)) MyClass::bar() { + out_to_js(3); + __builtin_trap(); // line 19 +} + +int main() { + MyClass mc; + mc.foo(); + mc.bar(); +} diff --git a/test/test_other.py b/test/test_other.py index 9f5a0a03ee756..0ac9202d71ac9 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -9631,12 +9631,49 @@ def check_dwarf_loc_info(address, funcs, locs): for loc in locs: self.assertIn(loc, out) - def check_source_map_loc_info(address, loc): + def check_source_map_loc_info(address, func, loc): out = self.run_process( [emsymbolizer, '-s', 'sourcemap', 'test_dwarf.wasm', address], stdout=PIPE).stdout + self.assertIn(func, out) self.assertIn(loc, out) + def do_tests(src): + # 1. Test DWARF + source map together + # For DWARF, we check for the full inlined info for both function names and + # source locations. Source maps do not provide inlined info, so we only + # check for the info of the innermost function (the first entry of each list). 
+ self.run_process([EMCC, test_file(src), '-g', '-gsource-map', '-O1', '-o', + 'test_dwarf.js']) + check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, + out_to_js_call_loc) + check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_func[0], + out_to_js_call_loc[0]) + check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) + # Source map shows the original (inlined) source location with the original + # function name + check_source_map_loc_info(unreachable_addr, unreachable_func[0], + unreachable_loc[0]) + + # 2. Test source map only + # The addresses, function names, and source locations are the same across + # the builds because they are relative offsets from the code section, so we + # don't need to recompute them + self.run_process([EMCC, test_file(src), '-gsource-map', '-O1', '-o', + 'test_dwarf.js']) + check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_func[0], + out_to_js_call_loc[0]) + check_source_map_loc_info(unreachable_addr, unreachable_func[0], + unreachable_loc[0]) + + # 3. Test DWARF only + self.run_process([EMCC, test_file(src), '-g', '-O1', '-o', + 'test_dwarf.js']) + check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, + out_to_js_call_loc) + check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) + + # -- C program test -- # We test two locations within test_dwarf.c: # out_to_js(0); // line 6 # __builtin_trap(); // line 13 @@ -9659,31 +9696,32 @@ def check_source_map_loc_info(address, loc): # The first one corresponds to the innermost inlined location. unreachable_loc = ['test_dwarf.c:13:3', 'test_dwarf.c:18:3'] - # 1. Test DWARF + source map together - # For DWARF, we check for the full inlined info for both function names and - # source locations. Source maps provide neither function names nor inlined - # info. So we only check for the source location of the outermost function. 
- check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, - out_to_js_call_loc) - check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_loc[0]) - check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) - check_source_map_loc_info(unreachable_addr, unreachable_loc[0]) - - # 2. Test source map only - # The addresses, function names, and source locations are the same across - # the builds because they are relative offsets from the code section, so we - # don't need to recompute them - self.run_process([EMCC, test_file('core/test_dwarf.c'), - '-gsource-map', '-O1', '-o', 'test_dwarf.js']) - check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_loc[0]) - check_source_map_loc_info(unreachable_addr, unreachable_loc[0]) + do_tests('core/test_dwarf.c') - # 3. Test DWARF only - self.run_process([EMCC, test_file('core/test_dwarf.c'), - '-g', '-O1', '-o', 'test_dwarf.js']) - check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, - out_to_js_call_loc) - check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) + # -- C++ program test -- + # We test two locations within test_dwarf.cpp: + # out_to_js(0); // line 12 + # __builtin_trap(); // line 19 + self.run_process([EMCC, test_file('core/test_dwarf.cpp'), + '-g', '-gsource-map', '-O1', '-o', 'test_dwarf.js']) + # Address of out_to_js(0) within MyClass::foo(), uninlined + out_to_js_call_addr = self.get_instr_addr('call\t0', 'test_dwarf.wasm') + # Address of __builtin_trap() within MyClass::bar(), inlined into main() + unreachable_addr = self.get_instr_addr('unreachable', 'test_dwarf.wasm') + + # Function name of out_to_js(0) within MyClass::foo(), uninlined + out_to_js_call_func = ['MyClass::foo()'] + # Function names of __builtin_trap() within MyClass::bar(), inlined into + # main(). The first one corresponds to the innermost inlined function. 
+ unreachable_func = ['MyClass::bar()', 'main'] + + # Source location of out_to_js(0) within MyClass::foo(), uninlined + out_to_js_call_loc = ['test_dwarf.cpp:12:3'] + # Source locations of __builtin_trap() within MyClass::bar(), inlined into + # main(). The first one corresponds to the innermost inlined location. + unreachable_loc = ['test_dwarf.cpp:19:3', 'test_dwarf.cpp:25:6'] + + do_tests('core/test_dwarf.cpp') def test_emsymbolizer_functions(self): 'Test emsymbolizer use cases that only provide function-granularity info' diff --git a/tools/emsymbolizer.py b/tools/emsymbolizer.py index 37d50cb6c0a26..a4046ce7c5a81 100755 --- a/tools/emsymbolizer.py +++ b/tools/emsymbolizer.py @@ -117,6 +117,7 @@ class Location: def __init__(self): self.version = None self.sources = [] + self.funcs = [] self.mappings = {} self.offsets = [] @@ -128,6 +129,7 @@ def parse(self, filename): self.version = source_map_json['version'] self.sources = source_map_json['sources'] + self.funcs = source_map_json['names'] chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=' vlq_map = {c: i for i, c in enumerate(chars)} @@ -155,6 +157,7 @@ def decodeVLQ(string): src = 0 line = 1 col = 1 + func = 0 for segment in source_map_json['mappings'].split(','): data = decodeVLQ(segment) info = [] @@ -169,7 +172,9 @@ def decodeVLQ(string): if len(data) >= 4: col += data[3] info.append(col) - # TODO: see if we need the name, which is the next field (data[4]) + if len(data) == 5: + func += data[4] + info.append(func) self.mappings[offset] = WasmSourceMap.Location(*info) self.offsets.append(offset) @@ -207,6 +212,7 @@ def lookup(self, offset, lower_bound=None): self.sources[info.source] if info.source is not None else None, info.line, info.column, + self.funcs[info.func] if info.func is not None else None, ) diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index 31f112f844a19..14c1c134d606b 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -25,6 +25,8 
def demangle_names(names):
  """Demangle the C++ symbols found in `names`.

  Only names starting with '_Z' are treated as mangled. All of them are sent
  to a single llvm-cxxfilt invocation, one name per line.

  Returns a dict mapping each mangled name to its demangled form. An empty dict
  is returned when there is nothing to demangle, when llvm-cxxfilt is missing
  or reports a failure, or when its output does not contain exactly one line
  per input name; callers then simply keep the mangled names.
  """
  # Only demangle names that look mangled. The de-duplicated, sorted list gives
  # a stable order that is paired with the output lines below.
  mangled_names = sorted({n for n in names if n.startswith('_Z')})
  if not mangled_names:
    return {}
  if not os.path.exists(LLVM_CXXFILT):
    logger.warning('llvm-cxxfilt does not exist')
    return {}

  # Gather all mangled names and call llvm-cxxfilt only once for all of them
  input_str = '\n'.join(mangled_names)
  proc = shared.check_call([LLVM_CXXFILT], input=input_str, stdout=shared.PIPE, stderr=shared.PIPE, text=True)
  # NOTE(review): if shared.check_call already treats a non-zero exit status as
  # fatal, this branch can never run - confirm against tools/shared.py.
  if proc.returncode != 0:
    logger.warning('llvm-cxxfilt failed: %s', proc.stderr)
    return {}

  demangled_list = proc.stdout.splitlines()
  if len(demangled_list) != len(mangled_names):
    logger.warning('llvm-cxxfilt output length mismatch')
    return {}

  return dict(zip(mangled_names, demangled_list, strict=True))


class FuncRange:
  """A named function (or inlined instance) and the PC range it occupies.

  The range is consumed as half-open: an address belongs to the function when
  low_pc <= address < high_pc. The attributes are deliberately mutable because
  build_sourcemap rebases low_pc/high_pc by the code section offset.
  """

  def __init__(self, name, low_pc, high_pc):
    self.name = name
    self.low_pc = low_pc
    self.high_pc = high_pc

  def __repr__(self):
    return f'FuncRange({self.name!r}, {self.low_pc:#x}, {self.high_pc:#x})'


# Patterns for the llvm-dwarfdump text that extract_func_ranges reads. They are
# compiled once here rather than once per DIE.
_DIE_BOUNDARY_RE = re.compile(r'\r?\n(?=0x[0-9a-f]+:)')
_SUBPROGRAM_RE = re.compile(r'0x[0-9a-f]+:\s+DW_TAG_subprogram')
_INLINED_RE = re.compile(r'0x[0-9a-f]+:\s+DW_TAG_inlined_subroutine')
_LOW_PC_RE = re.compile(r'DW_AT_low_pc\s+\(0x([0-9a-f]+)\)')
_HIGH_PC_RE = re.compile(r'DW_AT_high_pc\s+\(0x([0-9a-f]+)\)')
_LINKAGE_NAME_RE = re.compile(r'DW_AT_linkage_name\s+\("([^"]+)"\)')
_NAME_RE = re.compile(r'DW_AT_name\s+\("([^"]+)"\)')
_SPECIFICATION_RE = re.compile(r'DW_AT_specification\s+\(0x[0-9a-f]+\s+"([^"]+)"\)')
_ABSTRACT_ORIGIN_RE = re.compile(r'DW_AT_abstract_origin\s+\(0x[0-9a-f]+\s+"([^"]+)"\)')

# Name sources in priority order. The mangled linkage name is preferred over
# the plain name so that C++ functions can be demangled to their full form.
_SUBPROGRAM_NAME_RES = (_LINKAGE_NAME_RE, _NAME_RE, _SPECIFICATION_RE, _ABSTRACT_ORIGIN_RE)
_INLINED_NAME_RES = (_ABSTRACT_ORIGIN_RE,)


# Return the first capture group of the first pattern that matches `tag`, or
# None when none of them match.
def _first_captured_name(tag, patterns):
  for pattern in patterns:
    m = pattern.search(tag)
    if m:
      return m.group(1)
  return None


# This function parses DW_TAG_subprogram and DW_TAG_inlined_subroutine entries
# from llvm-dwarfdump's textual output and gets the name, low_pc and high_pc for
# each of them as a list of FuncRanges. Entries without a name, without both
# PCs, or with an empty range (high_pc <= low_pc) are skipped. Mangled names
# are demangled when llvm-cxxfilt is usable. The result list is sorted in the
# increasing order of low_pcs.
def extract_func_ranges(text):
  # This function handles five cases:
  # 1. DW_TAG_subprogram with DW_AT_name, DW_AT_low_pc, and DW_AT_high_pc.
  #   0x000000ba:   DW_TAG_subprogram
  #                   DW_AT_low_pc  (0x0000005f)
  #                   DW_AT_high_pc (0x00000071)
  #                   DW_AT_name    ("foo")
  #                   ...
  #
  # 2. DW_TAG_subprogram with DW_AT_linkage_name, DW_AT_low_pc, and
  #    DW_AT_high_pc. Applies to mangled C++ functions.
  #    (We parse DW_AT_linkage_name instead of DW_AT_name here.)
  #   0x000000ba:   DW_TAG_subprogram
  #                   DW_AT_low_pc  (0x0000005f)
  #                   DW_AT_high_pc (0x00000071)
  #                   DW_AT_linkage_name    ("_ZN7MyClass3fooEv")
  #                   DW_AT_name    ("foo")
  #                   ...
  #
  # 3. DW_TAG_subprogram with DW_AT_specification, DW_AT_low_pc, and
  #    DW_AT_high_pc. C++ function info can be split into two DIEs (one with
  #    DW_AT_linkage_name and DW_AT_declaration (true) and the other with
  #    DW_AT_specification). In this case we parse DW_AT_specification for the
  #    function name.
  #   0x0000006d:   DW_TAG_subprogram
  #                   DW_AT_linkage_name    ("_ZN7MyClass3fooEv")
  #                   DW_AT_name    ("foo")
  #                   DW_AT_declaration     (true)
  #                   ...
  #   0x00000097:   DW_TAG_subprogram
  #                   DW_AT_low_pc  (0x00000007)
  #                   DW_AT_high_pc (0x0000004c)
  #                   DW_AT_specification   (0x0000006d "_ZN7MyClass3fooEv")
  #                   ...
  #
  # 4. DW_TAG_subprogram with DW_AT_abstract_origin, DW_AT_low_pc, and
  #    DW_AT_high_pc. This is the out-of-line copy of a function that is also
  #    inlined elsewhere; it carries no name attribute of its own, so we parse
  #    DW_AT_abstract_origin for the function name.
  #   0x000000c0:   DW_TAG_subprogram
  #                   DW_AT_low_pc  (0x00000010)
  #                   DW_AT_high_pc (0x00000050)
  #                   DW_AT_abstract_origin (0x000000da "_ZN7MyClass3barEv")
  #                   ...
  #
  # 5. DW_TAG_inlined_subroutine with DW_AT_abstract_origin, DW_AT_low_pc, and
  #    DW_AT_high_pc. This represents an inlined function. We parse
  #    DW_AT_abstract_origin for the original function name.
  #   0x0000011a:   DW_TAG_inlined_subroutine
  #                   DW_AT_abstract_origin (0x000000da "_ZN7MyClass3barEv")
  #                   DW_AT_low_pc  (0x00000078)
  #                   DW_AT_high_pc (0x00000083)
  #                   ...

  func_ranges = []
  # Each chunk starts at a '0x<offset>:' DIE header and runs to the next one.
  for tag in _DIE_BOUNDARY_RE.split(text):
    if _SUBPROGRAM_RE.search(tag):
      name_patterns = _SUBPROGRAM_NAME_RES
    elif _INLINED_RE.search(tag):
      name_patterns = _INLINED_NAME_RES
    else:
      continue

    low_match = _LOW_PC_RE.search(tag)
    high_match = _HIGH_PC_RE.search(tag)
    if not low_match or not high_match:
      # Declarations and entries described by other means have no usable range
      continue
    low_pc = int(low_match.group(1), 16)
    high_pc = int(high_match.group(1), 16)
    if high_pc <= low_pc:
      # An empty or inverted range can never contain an address. Keeping it
      # would only add an unused entry to the source map's 'names' list.
      continue

    name = _first_captured_name(tag, name_patterns)
    if name:
      func_ranges.append(FuncRange(name, low_pc, high_pc))

  # Demangle names. Names without a demangled form are left as they are.
  demangled_map = demangle_names([item.name for item in func_ranges])
  for func_range in func_ranges:
    func_range.name = demangled_map.get(func_range.name, func_range.name)

  # To correctly identify the innermost function for a given address,
  # func_ranges is sorted primarily by low_pc in ascending order and secondarily
  # by high_pc in descending order. This ensures that for overlapping ranges,
  # the more specific (inner) range appears later in the list.
  func_ranges.sort(key=lambda item: (item.low_pc, -item.high_pc))
  return func_ranges
+ func_ranges.sort(key=lambda item: (item.low_pc, -item.high_pc)) + return func_ranges + + +def read_dwarf_info(wasm, options): if options.dwarfdump_output: output = utils.read_file(options.dwarfdump_output) elif options.dwarfdump: logger.debug('Reading DWARF information from %s' % wasm) if not os.path.exists(options.dwarfdump): utils.exit_with_error('llvm-dwarfdump not found: ' + options.dwarfdump) - proc = shared.check_call([options.dwarfdump, '-debug-info', '-debug-line', '--recurse-depth=0', wasm], stdout=shared.PIPE) + # We need only three tags in the debug info: DW_TAG_compile_unit for + # source location, and DW_TAG_subprogram and DW_TAG_inlined_subroutine + # for the function ranges. + dwarfdump_cmd = [options.dwarfdump, '-debug-info', '-debug-line', wasm, + '-t', 'DW_TAG_compile_unit', '-t', 'DW_TAG_subprogram', + '-t', 'DW_TAG_inlined_subroutine'] + proc = shared.check_call(dwarfdump_cmd, stdout=shared.PIPE) output = proc.stdout else: utils.exit_with_error('Please specify either --dwarfdump or --dwarfdump-output') @@ -288,22 +427,61 @@ def read_dwarf_entries(wasm, options): remove_dead_entries(entries) # return entries sorted by the address field - return sorted(entries, key=lambda entry: entry['address']) + entries = sorted(entries, key=lambda entry: entry['address']) + + func_ranges = extract_func_ranges(debug_line_chunks[0]) + return entries, func_ranges -def build_sourcemap(entries, code_section_offset, options): +def build_sourcemap(entries, func_ranges, code_section_offset, options): base_path = options.basepath collect_sources = options.sources prefixes = SourceMapPrefixes(options.prefix, options.load_prefix, base_path) + # Add code section offset to the low/high pc in the function PC ranges + for func_range in func_ranges: + func_range.low_pc += code_section_offset + func_range.high_pc += code_section_offset + sources = [] sources_content = [] + # There can be duplicate names in case an original source function has + # multiple disjoint PC ranges 
or is inlined to multiple callsites. Make the + # 'names' list a unique list of names, and map the function ranges to the + # indices in that list. + names = sorted({item.name for item in func_ranges}) + name_to_id = {name: i for i, name in enumerate(names)} mappings = [] sources_map = {} last_address = 0 last_source_id = 0 last_line = 1 last_column = 1 + last_func_id = 0 + + active_funcs = [] + next_func_range_id = 0 + + # Get the function name ID that the given address falls into + def get_function_id(address): + nonlocal active_funcs + nonlocal next_func_range_id + + # Maintain a list of "active functions" whose ranges currently cover the + # address. As the address advances, it adds new functions that start and + # removes functions that end. The last function remaining in the active list + # at any point is the innermost function. + while next_func_range_id < len(func_ranges) and func_ranges[next_func_range_id].low_pc <= address: + # active_funcs contains (high_pc, id) pair + active_funcs.append((func_ranges[next_func_range_id].high_pc, next_func_range_id)) + next_func_range_id += 1 + active_funcs = [f for f in active_funcs if f[0] > address] + + if active_funcs: + func_range_id = active_funcs[-1][1] + name = func_ranges[func_range_id].name + return name_to_id[name] + return None for entry in entries: line = entry['line'] @@ -334,21 +512,27 @@ def build_sourcemap(entries, code_section_offset, options): sources_content.append(None) else: source_id = sources_map[source_name] + func_id = get_function_id(address) address_delta = address - last_address source_id_delta = source_id - last_source_id line_delta = line - last_line column_delta = column - last_column - mappings.append(encode_vlq(address_delta) + encode_vlq(source_id_delta) + encode_vlq(line_delta) + encode_vlq(column_delta)) last_address = address last_source_id = source_id last_line = line last_column = column + mapping = encode_vlq(address_delta) + encode_vlq(source_id_delta) + encode_vlq(line_delta) + 
encode_vlq(column_delta) + if func_id is not None: + func_id_delta = func_id - last_func_id + last_func_id = func_id + mapping += encode_vlq(func_id_delta) + mappings.append(mapping) return {'version': 3, 'sources': sources, 'sourcesContent': sources_content, - 'names': [], + 'names': names, 'mappings': ','.join(mappings)} @@ -359,12 +543,12 @@ def main(args): with open(wasm_input, 'rb') as infile: wasm = infile.read() - entries = read_dwarf_entries(wasm_input, options) + entries, func_ranges = read_dwarf_info(wasm_input, options) code_section_offset = get_code_section_offset(wasm) logger.debug('Saving to %s' % options.output) - map = build_sourcemap(entries, code_section_offset, options) + map = build_sourcemap(entries, func_ranges, code_section_offset, options) with open(options.output, 'w', encoding='utf-8') as outfile: json.dump(map, outfile, separators=(',', ':'), ensure_ascii=False)