diff --git a/canbench-bin/src/instruction_tracing.rs b/canbench-bin/src/instruction_tracing.rs
index faf2814f..44c5b93b 100644
--- a/canbench-bin/src/instruction_tracing.rs
+++ b/canbench-bin/src/instruction_tracing.rs
@@ -307,65 +307,16 @@ fn inject_tracing(
 /// Renders the tracing to a file. Adapted from
 /// https://github.com/dfinity/ic-repl/blob/master/src/tracing.rs
 pub(super) fn write_traces_to_file(
-    input: Vec<(i32, i64)>,
+    input: InstructionTraceGraphNode,
     names: &BTreeMap<i32, String>,
     bench_fn: &str,
     filename: PathBuf,
 ) -> Result<(), String> {
     use inferno::flamegraph::{from_reader, Options};
-    let mut stack = Vec::new();
-    let mut prefix = Vec::new();
-    let mut result = Vec::new();
-    let mut prev = None;
-    for (id, count) in input.into_iter() {
-        if id >= 0 {
-            stack.push((id, count, 0));
-            let name = if id < i32::MAX {
-                match names.get(&id) {
-                    Some(name) => name.clone(),
-                    None => "func_".to_string() + &id.to_string(),
-                }
-            } else {
-                bench_fn.to_string()
-            };
-            prefix.push(name);
-        } else {
-            let end_id = reverse_func_id(id);
-            match stack.pop() {
-                None => return Err("pop empty stack".to_string()),
-                Some((start_id, start, children)) => {
-                    if start_id != end_id {
-                        return Err("func id mismatch".to_string());
-                    }
-                    let cost = count - start;
-                    let frame = prefix.join(";");
-                    prefix.pop().unwrap();
-                    if let Some((parent, parent_cost, children_cost)) = stack.pop() {
-                        stack.push((parent, parent_cost, children_cost + cost));
-                    }
-                    match prev {
-                        Some(prev) if prev == frame => {
-                            // Add an empty spacer to avoid collapsing adjacent same-named calls
-                            // See https://github.com/jonhoo/inferno/issues/185#issuecomment-671393504
-                            result.push(format!("{};spacer 0", prefix.join(";")));
-                        }
-                        _ => (),
-                    }
-                    result.push(format!("{} {}", frame, cost - children));
-                    prev = Some(frame);
-                }
-            }
-        }
-    }
-    let is_trace_incomplete = !stack.is_empty();
+    let mut result = convert_trace_graph_to_flamegraph(input, names, bench_fn);
     let mut opt = Options::default();
     opt.count_name = "instructions".to_string();
-    let bench_fn = if is_trace_incomplete {
-        bench_fn.to_string() + " (incomplete)"
-    } else {
-        bench_fn.to_string()
-    };
-    opt.title = bench_fn;
+    opt.title = bench_fn.to_string();
     opt.flame_chart = true;
     opt.no_sort = true;
     // Reserve result order to make flamegraph from left to right.
@@ -379,6 +330,49 @@ pub(super) fn write_traces_to_file(
     Ok(())
 }
 
+fn convert_trace_graph_to_flamegraph(
+    input: InstructionTraceGraphNode,
+    names: &BTreeMap<i32, String>,
+    bench_fn: &str,
+) -> Vec<String> {
+    let mut flamegraph = Vec::new();
+    let mut prefix = Vec::new();
+    convert_trace_node_to_flamegraph(input, names, bench_fn, &mut prefix, &mut flamegraph);
+    flamegraph
+}
+
+fn convert_trace_node_to_flamegraph(
+    input: InstructionTraceGraphNode,
+    names: &BTreeMap<i32, String>,
+    bench_fn: &str,
+    prefix: &mut Vec<String>,
+    flamegraph: &mut Vec<String>,
+) {
+    let InstructionTraceGraphNode {
+        func_id,
+        cost,
+        children,
+    } = input;
+    prefix.push(func_id_to_name(func_id, names, bench_fn));
+
+    for child in children {
+        convert_trace_node_to_flamegraph(child, names, bench_fn, prefix, flamegraph);
+    }
+    flamegraph.push(format!("{} {}", prefix.join(";"), cost));
+    prefix.pop();
+}
+
+fn func_id_to_name(func_id: i32, names: &BTreeMap<i32, String>, bench_fn: &str) -> String {
+    if func_id < i32::MAX {
+        match names.get(&func_id) {
+            Some(name) => name.clone(),
+            None => "func_".to_string() + &func_id.to_string(),
+        }
+    } else {
+        bench_fn.to_string()
+    }
+}
+
 /// Extracts function names from the module to be a map from function id to function name.
 fn extract_function_names(module: &Module) -> BTreeMap<i32, String> {
     module
diff --git a/canbench-bin/src/lib.rs b/canbench-bin/src/lib.rs
index 0679d5e0..402388a3 100644
--- a/canbench-bin/src/lib.rs
+++ b/canbench-bin/src/lib.rs
@@ -8,7 +8,7 @@ mod results_file;
 mod summary;
 mod table;
 
-use canbench_rs::{BenchResult, Measurement};
+use canbench_rs::{BenchResult, InstructionTraceGraphNode, Measurement};
 use candid::{Encode, Principal};
 use flate2::read::GzDecoder;
 use instruction_tracing::{prepare_instruction_tracing, write_traces_to_file};
@@ -117,14 +117,17 @@ pub fn run_benchmarks(
         }
 
         if let Some(instruction_tracing_canister_id) = instruction_tracing_canister_id {
-            run_instruction_tracing(
-                &pocket_ic,
-                instruction_tracing_canister_id,
-                bench_fn,
-                function_names_mapping.as_ref().unwrap(),
-                results_file,
-                result.total.instructions,
-            );
+            for aggregate in [true, false] {
+                run_instruction_tracing(
+                    &pocket_ic,
+                    instruction_tracing_canister_id,
+                    bench_fn,
+                    function_names_mapping.as_ref().unwrap(),
+                    results_file,
+                    result.total.instructions,
+                    aggregate,
+                );
+            }
         }
 
         new_results.insert(bench_fn.to_string(), result);
@@ -270,15 +273,16 @@ fn run_instruction_tracing(
     names_mapping: &BTreeMap<i32, String>,
     results_file: &Path,
     bench_instructions: u64,
+    aggregate: bool,
 ) {
-    let traces: Result<Vec<(i32, i64)>, String> = match pocket_ic.query_call(
+    let traces: Result<InstructionTraceGraphNode, String> = match pocket_ic.query_call(
         canister_id,
         Principal::anonymous(),
         &format!("__tracing__{bench_fn}"),
-        Encode!(&bench_instructions).unwrap(),
+        Encode!(&bench_instructions, &aggregate).unwrap(),
     ) {
         Ok(reply) => {
-            let res: Result<Vec<(i32, i64)>, String> =
+            let res: Result<InstructionTraceGraphNode, String> =
                 candid::decode_one(&reply).expect("error decoding tracing result");
             res
         }
@@ -295,7 +299,10 @@ fn run_instruction_tracing(
             traces,
             names_mapping,
             bench_fn,
-            results_file.with_file_name(format!("{bench_fn}.svg")),
+            results_file.with_file_name(format!(
+                "{bench_fn}{}.svg",
+                if aggregate { "_aggregated" } else { "" }
+            )),
         )
         .expect("failed to write tracing results"),
         Err(e) => {
diff --git a/canbench-rs-macros/src/lib.rs b/canbench-rs-macros/src/lib.rs
index 37fea20a..4cde0fa3 100644
--- a/canbench-rs-macros/src/lib.rs
+++ b/canbench-rs-macros/src/lib.rs
@@ -63,9 +63,9 @@ pub fn bench(arg_tokens: TokenStream, item: TokenStream) -> TokenStream {
 
                 #[ic_cdk::query]
                 #[allow(non_snake_case)]
-                fn #tracing_func_name(bench_instructions: u64) -> Result<Vec<(i32, i64)>, String> {
+                fn #tracing_func_name(bench_instructions: u64, aggregate: bool) -> Result<canbench_rs::InstructionTraceGraphNode, String> {
                     #func_name();
-                    canbench_rs::get_traces(bench_instructions)
+                    canbench_rs::get_traces(bench_instructions, aggregate)
                 }
             }
         }
@@ -91,11 +91,11 @@ pub fn bench(arg_tokens: TokenStream, item: TokenStream) -> TokenStream {
 
                 #[ic_cdk::query]
                 #[allow(non_snake_case)]
-                fn #tracing_func_name(bench_instructions: u64) -> Result<Vec<(i32, i64)>, String> {
+                fn #tracing_func_name(bench_instructions: u64, aggregate: bool) -> Result<canbench_rs::InstructionTraceGraphNode, String> {
                     canbench_rs::bench_fn(|| {
                         #func_name();
                     });
-                    canbench_rs::get_traces(bench_instructions)
+                    canbench_rs::get_traces(bench_instructions, aggregate)
                 }
             }
         }
diff --git a/canbench-rs/src/lib.rs b/canbench-rs/src/lib.rs
index 12efb59d..61a7bd20 100644
--- a/canbench-rs/src/lib.rs
+++ b/canbench-rs/src/lib.rs
@@ -467,7 +467,11 @@
 pub use canbench_rs_macros::bench;
 use candid::CandidType;
 use serde::{Deserialize, Serialize};
-use std::{cell::RefCell, collections::BTreeMap};
+use std::{
+    cell::RefCell,
+    collections::{hash_map::Entry, BTreeMap, HashMap},
+    ops::AddAssign,
+};
 
 thread_local! {
     static SCOPES: RefCell<BTreeMap<&'static str, Vec<MeasurementInternal>>> =
@@ -830,7 +834,22 @@ const BUFFER_SIZE: usize = NUM_BYTES_ENABLED_FLAG
     + NUM_BYTES_NUM_ENTRIES
     + MAX_NUM_LOG_ENTRIES * (NUM_BYTES_FUNC_ID + NUM_BYTES_INSTRUCTION_COUNTER);
 const LOGS_START_OFFSET: usize = NUM_BYTES_ENABLED_FLAG + NUM_BYTES_NUM_ENTRIES;
-const MAX_NUM_LOG_ENTRIES_IN_RESPONSE: usize = 131_000;
+/// The maximum number of nodes in the response. This number is chosen since the
+/// `InstructionTraceGraphNode` takes ~13B per node, and the maximum response size is 3MiB. This
+/// number is chosen to be below 3MiB / 13B ~ 241_979.
+const MAX_NUM_NODES_IN_RESPONSE: usize = 240_000;
+
+/// A node in the instruction trace graph. The parent-child relationship is determined by the call
+/// stack - if function `f` calls function `g`, then `g` is a child of `f`.
+#[derive(Debug, Clone, Serialize, Deserialize, CandidType)]
+pub struct InstructionTraceGraphNode {
+    /// The id of the function.
+    pub func_id: i32,
+    /// The instruction cost of the function, not including its descendants.
+    pub cost: i64,
+    /// The children of the graph node.
+    pub children: Vec<InstructionTraceGraphNode>,
+}
 
 #[export_name = "__prepare_tracing"]
 fn prepare_tracing() -> i32 {
@@ -840,7 +859,10 @@ fn prepare_tracing() -> i32 {
     })
 }
 
-pub fn get_traces(bench_instructions: u64) -> Result<Vec<(i32, i64)>, String> {
+pub fn get_traces(
+    bench_instructions: u64,
+    aggregate: bool,
+) -> Result<InstructionTraceGraphNode, String> {
     TRACING_BUFFER.with_borrow(|b| {
         if b[0] == 1 {
             panic!("Tracing is still enabled.");
@@ -878,11 +900,64 @@ pub fn get_traces(bench_instructions: u64) -> Result<Vec<(i32, i64)>, String> {
         traces.push((i32::MIN, unsafe { INSTRUCTIONS_END - instructions_start }));
         let traces = adjust_traces_for_overhead(traces, bench_instructions);
         // TODO(EXC-2020): consider using compression.
-        let traces = truncate_traces(traces);
-        Ok(traces)
+        let graph = convert_traces_to_graph(traces);
+        let graph = if aggregate {
+            agggregate_traces(graph)
+        } else {
+            graph
+        };
+        let graph = truncate_traces_graph(graph);
+        Ok(graph)
     })
 }
 
+/// Converts traces from a list of trace events (start or end of a function and the instruction
+/// count at that point) to a graph, where the parent-child relationship is determined by the call
+/// stack.
+fn convert_traces_to_graph(traces: Vec<(i32, i64)>) -> InstructionTraceGraphNode {
+    let (node, _, _) = convert_traces_range_to_graph(&traces, 0);
+    node
+}
+
+/// Converts `traces[start..]` to a graph. The return value is `(node, next_start, total_cost)`,
+/// where `node` is the graph node which corresponds to the function at `traces[start]`,
+/// `next_start` is the index of the next trace to process, and `total_cost` is the total cost of
+/// the children of the current function. The additional 2 values are used for recursion.
+fn convert_traces_range_to_graph(
+    traces: &[(i32, i64)],
+    start: usize,
+) -> (InstructionTraceGraphNode, usize, i64) {
+    let (func_id, start_counter) = traces[start];
+    assert!(func_id >= 0, "Traces are corrupted {}.", func_id);
+
+    // The following loop tries to find all children of the current function.
+    let mut next_start = start + 1;
+    let mut children = Vec::new();
+    let mut total_children_cost = 0;
+    loop {
+        // If the next trace is an end of a function, we've found all children. The function id
+        // should be `!func_id``, and it will be asserted after exiting the loop.
+        if traces[next_start].0 < 0 {
+            break;
+        }
+        let (child, new_next_start, child_cost) = convert_traces_range_to_graph(traces, next_start);
+        children.push(child);
+        next_start = new_next_start;
+        total_children_cost += child_cost;
+    }
+
+    let (end_func_id, end_counter) = traces[next_start];
+    assert_eq!(func_id, !end_func_id, "Traces are corrupted.");
+    let total_cost = end_counter - start_counter;
+    let cost = total_cost - total_children_cost;
+    let node = InstructionTraceGraphNode {
+        func_id,
+        cost,
+        children,
+    };
+    (node, next_start + 1, total_cost)
+}
+
 fn adjust_traces_for_overhead(traces: Vec<(i32, i64)>, bench_instructions: u64) -> Vec<(i32, i64)> {
     let num_logs = traces.len() - 2;
     let overhead = (traces[num_logs].1 as f64 - bench_instructions as f64) / (num_logs as f64);
@@ -899,48 +974,155 @@ fn adjust_traces_for_overhead(traces: Vec<(i32, i64)>, bench_instructions: u64)
         .collect()
 }
 
-fn truncate_traces(traces: Vec<(i32, i64)>) -> Vec<(i32, i64)> {
-    if traces.len() <= MAX_NUM_LOG_ENTRIES_IN_RESPONSE {
-        return traces;
-    }
-
-    let mut num_traces_by_depth = BTreeMap::new();
+/// Truncates the traces graph by depth, and depth (above which the nodes get truncated) is the
+/// largest one such that the total number of nodes is less than or equal to
+/// `MAX_NUM_NODES_IN_RESPONSE`.
+fn truncate_traces_graph(root: InstructionTraceGraphNode) -> InstructionTraceGraphNode {
+    let mut num_nodes_by_depth = BTreeMap::new();
+    increase_num_nodes_by_depth(&root, 0, &mut num_nodes_by_depth);
 
-    let mut depth = 0;
-    for (func_id, _) in traces.iter() {
-        if *func_id >= 0 {
-            depth += 1;
-            *num_traces_by_depth.entry(depth).or_insert(0) += 1;
-        } else {
-            depth -= 1;
-        }
-    }
-    assert_eq!(depth, 0, "Traces are not balanced.");
     let mut depth_to_truncate = 0;
-    let mut cumulative_traces = 0;
-    for (depth, num_traces) in num_traces_by_depth.iter() {
-        cumulative_traces += num_traces;
-        if cumulative_traces <= MAX_NUM_LOG_ENTRIES_IN_RESPONSE {
+    let mut cumulative_nodes = 0;
+    for (depth, num_nodes) in num_nodes_by_depth.iter() {
+        cumulative_nodes += num_nodes;
+        if cumulative_nodes <= MAX_NUM_NODES_IN_RESPONSE {
             depth_to_truncate = *depth;
         } else {
             break;
         }
     }
 
-    let truncated: Vec<_> = traces
-        .into_iter()
-        .scan(0, |depth, (func_id, instruction_counter)| {
-            if func_id >= 0 {
-                *depth += 1;
-                Some((*depth, func_id, instruction_counter))
-            } else {
-                *depth -= 1;
-                Some((*depth + 1, func_id, instruction_counter))
+    truncate_traces_graph_by_depth(root, depth_to_truncate)
+}
+
+/// A recursive function used to count the number of nodes in the graph by depth.
+fn increase_num_nodes_by_depth(
+    node: &InstructionTraceGraphNode,
+    depth: usize,
+    num_nodes_by_depth: &mut BTreeMap<usize, usize>,
+) {
+    *num_nodes_by_depth.entry(depth).or_insert(0) += 1;
+    for child in &node.children {
+        increase_num_nodes_by_depth(child, depth + 1, num_nodes_by_depth);
+    }
+}
+
+/// A recursive function to truncates the graph by the given depth. When the depth is 0, it means
+/// only the node itself should be kept and its children should be truncated.
+fn truncate_traces_graph_by_depth(
+    node: InstructionTraceGraphNode,
+    depth: usize,
+) -> InstructionTraceGraphNode {
+    let InstructionTraceGraphNode {
+        func_id,
+        cost,
+        children,
+    } = node;
+    let children = if depth == 0 {
+        Vec::new()
+    } else {
+        children
+            .into_iter()
+            .map(|child| truncate_traces_graph_by_depth(child, depth - 1))
+            .collect()
+    };
+    InstructionTraceGraphNode {
+        func_id,
+        cost,
+        children,
+    }
+}
+
+/// An auxiliary struct used to aggregate the traces. Compared to `InstructionTraceGraphNode`, it
+/// has an additional field `total_cost`, which is the total cost of the node, including its
+/// descendants, so that the children can be sorted by the total cost.
+struct AggregatedInstructionTraceGraphNode {
+    func_id: i32,
+    /// The cost of the node, not including its descendants.
+    cost: i64,
+    /// The total cost of the node, including its descendants.
+    total_cost: i64,
+    /// The children of the node. The key is the function id, and the value is the aggregated child.
+    children: HashMap<i32, AggregatedInstructionTraceGraphNode>,
+}
+
+impl AggregatedInstructionTraceGraphNode {
+    /// Aggregates the traces of a function into the aggregated node.
+    fn aggregate(node: InstructionTraceGraphNode) -> Self {
+        let InstructionTraceGraphNode {
+            func_id,
+            cost,
+            children,
+        } = node;
+        let mut aggregated_children = HashMap::new();
+        let mut total_cost = cost;
+        for child in children {
+            let aggregated_child = Self::aggregate(child);
+            total_cost += aggregated_child.total_cost;
+            match aggregated_children.entry(aggregated_child.func_id) {
+                Entry::Occupied(mut entry) => {
+                    *entry.get_mut() += aggregated_child;
+                }
+                Entry::Vacant(entry) => {
+                    entry.insert(aggregated_child);
+                }
             }
-        })
-        .filter(|(depth, _, _)| *depth <= depth_to_truncate)
-        .map(|(_, func_id, instruction_counter)| (func_id, instruction_counter))
-        .collect();
+        }
+        Self {
+            func_id,
+            cost,
+            total_cost,
+            children: aggregated_children,
+        }
+    }
+}
+
+impl AddAssign for AggregatedInstructionTraceGraphNode {
+    fn add_assign(&mut self, other: Self) {
+        assert_eq!(
+            self.func_id, other.func_id,
+            "Cannot aggregate traces with different func_ids."
+        );
+        self.cost += other.cost;
+        self.total_cost += other.total_cost;
+        for (other_func_id, other_child) in other.children {
+            match self.children.entry(other_func_id) {
+                Entry::Occupied(mut entry) => {
+                    *entry.get_mut() += other_child;
+                }
+                Entry::Vacant(entry) => {
+                    entry.insert(other_child);
+                }
+            }
+        }
+    }
+}
+
+impl From<AggregatedInstructionTraceGraphNode> for InstructionTraceGraphNode {
+    /// Converts the aggregated node to the original node.
+    fn from(node: AggregatedInstructionTraceGraphNode) -> Self {
+        let AggregatedInstructionTraceGraphNode {
+            func_id,
+            cost,
+            total_cost: _,
+            children,
+        } = node;
+        let mut children: Vec<_> = children.into_iter().map(|(_, child)| child).collect();
+        children.sort_by_key(|child| -child.total_cost);
+        let children = children
+            .into_iter()
+            .map(InstructionTraceGraphNode::from)
+            .collect();
+        InstructionTraceGraphNode {
+            func_id,
+            cost,
+            children,
+        }
+    }
+}
 
-    truncated
+/// Converts the traces graph from time-based to aggregated (children sorted by total cost).
+fn agggregate_traces(root: InstructionTraceGraphNode) -> InstructionTraceGraphNode {
+    let aggregated = AggregatedInstructionTraceGraphNode::aggregate(root);
+    InstructionTraceGraphNode::from(aggregated)
 }