From 68cf9e8a1ef31295765c490622ba60cb2fc977c5 Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Tue, 3 Mar 2026 09:43:01 +0000 Subject: [PATCH 01/51] Add bolt-demo tutorial --- .wordlist.txt | 3 + .../bolt-demo/_index.md | 61 ++++++ .../bolt-demo/_next-steps.md | 8 + .../bolt-demo/brbe.md | 62 ++++++ .../bolt-demo/bsort.cpp | 113 ++++++++++ .../bolt-demo/good-candidates.md | 81 +++++++ .../bolt-demo/instrumentation.md | 28 +++ .../bolt-demo/orderfile.txt | 10 + .../bolt-demo/overview.md | 29 +++ .../bolt-demo/pmu.md | 41 ++++ .../bolt-demo/setup.md | 198 ++++++++++++++++++ .../bolt-demo/spe.md | 77 +++++++ .../bolt-demo/verify-optimization.md | 81 +++++++ 13 files changed, 792 insertions(+) create mode 100644 content/learning-paths/servers-and-cloud-computing/bolt-demo/_index.md create mode 100644 content/learning-paths/servers-and-cloud-computing/bolt-demo/_next-steps.md create mode 100644 content/learning-paths/servers-and-cloud-computing/bolt-demo/brbe.md create mode 100644 content/learning-paths/servers-and-cloud-computing/bolt-demo/bsort.cpp create mode 100644 content/learning-paths/servers-and-cloud-computing/bolt-demo/good-candidates.md create mode 100644 content/learning-paths/servers-and-cloud-computing/bolt-demo/instrumentation.md create mode 100644 content/learning-paths/servers-and-cloud-computing/bolt-demo/orderfile.txt create mode 100644 content/learning-paths/servers-and-cloud-computing/bolt-demo/overview.md create mode 100644 content/learning-paths/servers-and-cloud-computing/bolt-demo/pmu.md create mode 100644 content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md create mode 100644 content/learning-paths/servers-and-cloud-computing/bolt-demo/spe.md create mode 100644 content/learning-paths/servers-and-cloud-computing/bolt-demo/verify-optimization.md diff --git a/.wordlist.txt b/.wordlist.txt index 74fdeb8d1d..f965774344 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -573,6 +573,7 @@ BMS BoardRenderer BoatAttack Bolt +BOLT 
BOLT's bonza bool @@ -601,6 +602,7 @@ brian brianfrankcooper Broadcom Brossard +BRBE brstack BSON bsp @@ -701,6 +703,7 @@ CDE CDH CDK cdn +cdsort ce cea cebbb diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/_index.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/_index.md new file mode 100644 index 0000000000..96a183e2ab --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/_index.md @@ -0,0 +1,61 @@ +--- +title: "Get started with BOLT" + +minutes_to_complete: 20 + +who_is_this_for: This is an introductory topic for performance‑minded developers + who have a compiled aarch64 Linux program and want to see if BOLT can make it run faster. + +learning_objectives: + - Identify whether a program is a good candidate for code layout optimization + - Apply BOLT to optimize a small program with poor spatial locality + - Use different profiling techniques, including BRBE, Instrumentation, SPE, and PMU events + - Verify the impact of BOLT optimization using performance metrics + + +prerequisites: + - An AArch64 system running Linux with [Perf](/install-guides/perf/) installed + - Linux kernel version 6.17 or later for [BRBE](./brbe) profiling + - Linux kernel version 6.14 or later for [SPE](./spe) profiling + - GCC version 13.3 or later to compile the demo program ([GCC](/install-guides/gcc/) ) + - BOLT version [21.1.8](https://github.com/llvm/llvm-project/releases/tag/llvmorg-21.1.8) or later (download [zip](https://github.com/llvm/llvm-project/releases/download/llvmorg-21.1.8/LLVM-21.1.8-Linux-ARM64.tar.xz)) + - A system with enough performance counters for the [TopDown](/install-guides/topdown-tool) methodology, typically a non-virtualized instance + + +author: Paschalis Mpeis + +### Tags +skilllevels: Introductory +subjects: Performance and Architecture +armips: + - Neoverse + - Cortex-A +tools_software_languages: + - BOLT + - perf + +operatingsystems: + - Linux + +further_reading: + - resource: + title: 
BOLT README + link: https://github.com/llvm/llvm-project/tree/main/bolt + type: documentation + - resource: + title: Arm Statistical Profiling Extension Whitepaper + link: https://developer.arm.com/documentation/109429/latest/ + type: documentation + - resource: + title: Arm Topdown Methodology + link: https://developer.arm.com/documentation/109542/02/Arm-Topdown-methodology + type: documentation + + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/brbe.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/brbe.md new file mode 100644 index 0000000000..da9f4f2e60 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/brbe.md @@ -0,0 +1,62 @@ +--- +title: "BOLT with BRBE" +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +### What is BRBE +BRBE stands for Branch Record Buffer Extension. It is an Arm hardware unit with a circular buffer that captures the most recent 32 or 64 taken branches. The exact size depends on the hardware implementation. + +For BOLT, BRBE provides an effective, low-overhead sampling mechanism that records taken branches directly in hardware without frequent interruptions. Each recorded taken branch represents a control-flow edge, which makes BRBE an edge-based profiling method. + +Taken branches are continuously added to the circular buffer, and the buffer is periodically sampled to keep overheads low. +Recording only taken branches is an efficient use of the buffer, since fall-through paths do not need to be captured at runtime. +During post-processing, fall-through edges between the recorded taken branches are reconstructed, extending the effective branch history beyond what is stored in the buffer. BOLT performs this reconstruction automatically. + +### When to use BRBE +When available, BRBE is the preferred profiling option for BOLT. +It is expected to have the lowest runtime overhead while still providing near-optimal profiles, close to those obtained with instrumentation. + +### Optimizing with BRBE +We check [BRBE availability](#availability) before recording a profile. +We then record a BRBE profile by running our workload under perf, convert it into a format that BOLT understands, and run the BOLT optimization. 
+ +```bash { line_numbers=true } +mkdir -p prof +perf record -j any,u -o prof/brbe.data -- ./out/bsort +perf2bolt -p prof/brbe.data -o prof/brbe.fdata out/bsort +llvm-bolt out/bsort -o out/bsort.opt.brbe --data prof/brbe.fdata \ + -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions \ + --dyno-stats +``` + + +### Availability +BRBE is an optional feature in processors that implement [Armv9.1](https://developer.arm.com/documentation/109697/2025_09/Feature-descriptions/The-Armv9-2-architecture-extension#extension__feat_FEAT_BRBE) or later. To check availability, we record a trace. + +On a successful recording we see: +```bash { command_line="user@host | 2-5"} +perf record -j any,u -o prof/brbe.data -- ./out/bsort +Bubble sorting 10000 elements +421 ms (first=100669 last=2147469841) +[ perf record: Woken up 161 times to write data ] +[ perf record: Captured and wrote 40.244 MB brbe.data (26662 samples) ] +``` + +When unavailable: +```bash { command_line="user@host | 2-3"} +perf record -j any,u -o prof/brbe.data -- ./out/bsort +Error: +cycles:P: PMU Hardware or event type doesn't support branch stack sampling. +``` + +To record a BRBE trace we need a Linux system that is version 6.17 or later. 
We can check the version using: +```bash +perf --version +``` + + +### Further Reading +- [Arm Architecture Reference Manual for A-profile architecture](https://developer.arm.com/documentation/ddi0487/latest) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/bsort.cpp b/content/learning-paths/servers-and-cloud-computing/bolt-demo/bsort.cpp new file mode 100644 index 0000000000..298c2afd3a --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/bsort.cpp @@ -0,0 +1,113 @@ +#include +#include +#include + +#define ARRAY_LEN 10000 +#define FUNC_COPIES 5 +volatile bool Cond = false; +#define COND() (__builtin_expect(Cond, true)) + +#define NOPS(N) \ + asm volatile( \ + ".rept %0\n" \ + "nop\n" \ + ".endr\n" \ + : : "i"(N) : "memory") + +// Swap functionality plus some cold blocks. +#define SWAP_FUNC(ID) \ + static __attribute__((noinline)) \ + void swap##ID(int *left, int *right) { \ + if (COND()) NOPS(300); \ + int tmp = *left; \ + if (COND()) NOPS(300); else *left = *right; \ + if (COND()) NOPS(300); else *right = tmp; \ + } + +// Aligned at 16KiB +#define COLD_FUNC(ID) \ + static __attribute__((noinline, aligned(16384), used)) \ + void cold_func##ID(void) { \ + asm volatile("nop"); \ + } + +// Create copies of swap, and interleave with big chunks of cold code. +SWAP_FUNC(1) COLD_FUNC(1) +SWAP_FUNC(2) COLD_FUNC(2) +SWAP_FUNC(3) COLD_FUNC(3) +SWAP_FUNC(4) COLD_FUNC(4) +SWAP_FUNC(5) COLD_FUNC(5) + +typedef void (*swap_fty)(int *, int *); +static swap_fty const swap_funcs[FUNC_COPIES] = { + swap1, swap2, swap3, swap4, swap5 +}; + + +/* Sorting Logic */ +void bubble_sort(int *a, int n) { + if (n <= 1) + return; + + int end = n - 1; + int swapped = 1; + unsigned idx = 0; + + while (swapped && end > 0) { + swapped = 0; + // pick a different copy of the swap function, in a round-robin fashion + // and call it. 
+ for (int i = 1; i <= end; ++i) { + if (a[i] < a[i - 1]) { + auto swap_func = swap_funcs[idx++]; + idx %= FUNC_COPIES; + swap_func(&a[i - 1], &a[i]); + swapped = 1; + } + } + --end; + } +} + +void sort_array(int *data) { + for (int i = 0; i < ARRAY_LEN; ++i) { + data[i] = rand(); + } + bubble_sort(data, ARRAY_LEN); +} + +/* Timers, helpers, and main */ +static struct timespec timer_start; +static inline void start_timer(void) { + clock_gettime(CLOCK_MONOTONIC, &timer_start); +} + +static inline void stop_timer(void) { + struct timespec timer_end; + clock_gettime(CLOCK_MONOTONIC, &timer_end); + long long ms = (timer_end.tv_sec - timer_start.tv_sec) * 1000LL + + (timer_end.tv_nsec - timer_start.tv_nsec) / 1000000LL; + printf("%lld ms ", ms); +} + +static void print_first_last(const int *data, int n) { + if (n <= 0) + return; + + const int first = data[0]; + const int last = data[n - 1]; + printf("(first=%d last=%d)\n", first, last); +} + +int main(void) { + srand(0); + printf("Bubble sorting %d elements\n", ARRAY_LEN); + int data[ARRAY_LEN]; + + start_timer(); + sort_array(data); + stop_timer(); + + print_first_last(data, ARRAY_LEN); + return 0; +} diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/good-candidates.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/good-candidates.md new file mode 100644 index 0000000000..4043380c52 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/good-candidates.md @@ -0,0 +1,81 @@ +--- +title: Good BOLT Candidates +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Which code is a good BOLT candidate? +A few hardware metrics can indicate whether a program is a good candidate for code-layout optimization. +These metrics are commonly analyzed using general methodologies such as the [Arm TopDown methodology](https://developer.arm.com/documentation/109542/02/Arm-Topdown-methodology). 
+ +Here, we focus on a small set of TopDown indicators related to instruction delivery and code locality. +These indicators describe how efficiently the processor can fetch instructions and keep its execution pipeline busy. +When instruction delivery is inefficient, the workload is said to be **front-end bound**, meaning the CPU often waits for instructions instead of executing them. +This usually points to instruction fetch or code layout issues, where improving code layout can help. + +The L1 instruction cache (L1 I-cache) is the first and fastest cache used to store instructions close to the CPU. +When instructions are not found there, the CPU must fetch them from slower memory, which can stall execution. +MPKI, short for misses per kilo instructions, measures how often an event misses per 1,000 executed instructions, which makes it easier to compare across programs and workloads. +A high **L1 I-cache MPKI** usually indicates poor instruction locality in the binary. + +Based on these observations, the BOLT community suggests the following two indicators of a good candidate: +- Front-End bound workload above 10%. +- More than 30 L1 I-cache misses per kilo instructions (MPKI). + +Higher branch mispredictions or I-TLB misses can also indicate that layout optimization may help. + +We can use the Topdown Methodology (see [installation guide](/install-guides/topdown-tool)) to collect these metrics, which is based on the Linux [perf](/install-guides/perf/) tool. +Alternatively, we can compute only the L1 I-cache MPKI metric manually using plain Linux perf stat. 
+ +{{< tabpane code=true >}} + {{< tab header="topdown-tool" language="bash" output_lines="2-21">}} + topdown-tool ./out/bsort + CPU Neoverse V1 metrics + ├── Stage 1 (Topdown metrics) + │ └── Topdown Level 1 (Topdown_L1) + │ └── ┏━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━┓ + │ ┃ Metric ┃ Value ┃ Unit ┃ + │ ┡━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━┩ + │ │ Backend Bound │ 11.77 │ % │ + │ │ Bad Speculation │ 17.92 │ % │ + │ » │ Frontend Bound │ 55.73 │ % │ « + │ │ Retiring │ 14.88 │ % │ + │ └─────────────────┴───────┴──────┘ + └── Stage 2 (uarch metrics) + ├── Misses Per Kilo Instructions (MPKI) + │ └── ┏━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + │ ┃ Metric ┃ Value ┃ Unit ┃ + │ ┡━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ │ Branch MPKI │ 16.583 │ misses per 1,000 instructions │ + │ » │ L1I Cache MPKI │ 60.408 │ misses per 1,000 instructions │ « + │ └─────────────────────────┴────────┴───────────────────────────────┘ + ... + {{< /tab >}} + {{< tab header="perf stat" language="bash" output_lines="2-10">}} + perf stat -e instructions,L1-icache-misses:u ./out/bsort + Performance counter stats for './out/bsort': + + 957828603 instructions + 58003648 L1-icache-misses + + 0.282472631 seconds time elapsed + + 0.282541000 seconds user + 0.000000000 seconds sys + {{< /tab >}} +{{< /tabpane >}} + +We see that the program is **55%** front-end bound. +At Stage 2, the micro-architectural metrics report **60 L1I MPKI**, which indicates a good candidate for layout optimization. +The branch MPKI of **16** is also relatively high. + +Under the hood, the `topdown-tool` collects perf counters and applies formulas to derive these metrics. 
+To compute the L1 I-cache MPKI manually from the `perf stat` output, we apply: +$$\frac{(\text{L1-icache-misses} \times 1000)}{\text{instructions}}$$ + +### Further Reading +- [Arm Topdown methodology]( https://developer.arm.com/documentation/109542/02/Arm-Topdown-methodology) +- [Optimizing Clang : A Practical Example of Applying BOLT](https://github.com/llvm/llvm-project/blob/main/bolt/docs/OptimizingClang.md) +- [Metrics by metric group in Neoverse V2](https://developer.arm.com/documentation/109528/0200/Metrics-by-metric-group-in-Neoverse-V2?lang=en) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/instrumentation.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/instrumentation.md new file mode 100644 index 0000000000..e7b20fbe14 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/instrumentation.md @@ -0,0 +1,28 @@ +--- +title: "BOLT with Instrumentation" +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +### What is instrumentation + +Instrumentation is a profiling method, not specific to BOLT, that augments code with counters to record exact execution counts. + +For BOLT, Instrumentation provides complete execution counts for the paths that run. This gives a near-optimal profile for code-layout optimization and therefore the highest optimization potential, without requiring special hardware. + +Instrumentation can increase binary size and add significant runtime overhead, making it less attractive for production use. It is mainly used when other profiling methods, such as BRBE, are unavailable, or for comparison to understand the maximum optimization potential. + +### Optimizing with instrumentation +We first build an instrumented binary and then execute the workload to generate a profile. +By default, BOLT writes the profile to `/tmp/prof.fdata`, unless a path is specified using the `--instrumentation-file` flag. 
+Finally, we use the generated profile to optimize the binary with BOLT. + +```bash +llvm-bolt --instrument out/bsort -o out/bsort.instr +./out/bsort.instr +llvm-bolt out/bsort -o out/bsort.opt.instr --data /tmp/prof.fdata \ + -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions \ + --dyno-stats +``` diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/orderfile.txt b/content/learning-paths/servers-and-cloud-computing/bolt-demo/orderfile.txt new file mode 100644 index 0000000000..1e6d1a8faa --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/orderfile.txt @@ -0,0 +1,10 @@ +_ZL5swap1PiS_ +_ZL10cold_func1v +_ZL5swap2PiS_ +_ZL10cold_func2v +_ZL5swap3PiS_ +_ZL10cold_func3v +_ZL5swap4PiS_ +_ZL10cold_func4v +_ZL5swap5PiS_ +_ZL10cold_func5v diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/overview.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/overview.md new file mode 100644 index 0000000000..6d668a8230 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/overview.md @@ -0,0 +1,29 @@ +--- +title: Overview +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +### Tutorial Overview + +This tutorial shows how to apply [BOLT](https://github.com/llvm/llvm-project/blob/main/bolt/README.md) in different configurations. +It is based on a demo from the 2025 LLVM Developers Conference: +[BOLT tutorial on AArch64 and how it competes or complements other PGOs](https://youtu.be/KdHtOMc5_c8?si=249wZTn_YcTFOjcJ&t=1452). + + +The input program is a pathological case based on [BubbleSort](../setup), a workload with poor spatial locality. +First, we check whether the input binary is a good candidate for code layout optimization. +If it is, we can capture a profile using one of several profiling methods: +- **[BRBE](../brbe)**: Samples deep branch stacks with low profiling overheads. 
+- **[Instrumentation](../instrumentation)**: Captures high-quality, complete profiles, but has high collection overhead. +- **[SPE](../spe)**: Samples individual branches. Use it if BRBE is not available, as profile quality can be lower. +- **[PMU](../pmu)**: Samples basic events such as instructions or cycles. This method provides the least profiling information. + + + +ETM and ETE generate data that you can use with BOLT. This tutorial does not cover these tracing methods. + +For each profiling method, we will perform the relevant BOLT optimization steps. +Finally, we will use hardware metrics to confirm how effective the optimization was. diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/pmu.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/pmu.md new file mode 100644 index 0000000000..77cab0c413 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/pmu.md @@ -0,0 +1,41 @@ +--- + +title: "BOLT with PMU" + +weight: 8 + +### FIXED, DO NOT MODIFY + +layout: learningpathall + +--- + +### What is PMU +PMU stands for Performance Monitoring Unit. It is an Arm hardware unit that provides event-based sampling of program execution. +PMU samples microarchitectural events such as instructions, cycles, branches, and other hardware events. This form of profiling is widely available across Arm systems. + +For BOLT, PMU provides samples that capture coarse hotness information. Samples are associated with instruction addresses and therefore with *basic blocks*, which are straight-line sequences of instructions that always execute in full once entered. This indicates how often those blocks run, rather than how control flows between them. +For this reason, PMU profiling is often referred to as *basic sampling* rather than *edge sampling*. 
While it is possible to sample branch events using the PMU, these samples do not include branch target information and therefore still do not provide control-flow edge information. + +Because functions consist of many basic blocks, PMU sampling can provide useful information at the function level. This makes it suitable for coarse-grained optimizations such as function reordering, but can be less effective for fine-grained block layout. Increasing the sampling frequency can improve coverage, but at the cost of higher profile collection overhead. + +### When to use PMU +PMU is most useful when BRBE and SPE are unavailable and instrumentation is not feasible. +It provides the least detailed control-flow information among the available methods, so it is typically used as a fallback option. + +### Optimizing with PMU +We record a PMU profile by running our workload under perf, convert it into a format that BOLT understands, and then run the BOLT optimization. +This tutorial uses instruction sampling. + + +```bash { line_numbers=true } +mkdir -p prof +perf record -e instructions:u -o prof/pmu.data -- ./out/bsort +perf2bolt out/bsort -p prof/pmu.data -o prof/pmu.fdata --ba +llvm-bolt out/bsort -o out/bsort.opt.pmu --data prof/pmu.fdata \ + -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions \ + --dyno-stats +``` + +### Availability +PMU events are available on all Arm systems that support perf. No additional hardware features are required. 
diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md new file mode 100644 index 0000000000..35ee0fc415 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md @@ -0,0 +1,198 @@ +--- +title: Setup and Input +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + + +### Environment setup +We start in an empty directory and place the input program [bsort.cpp](../bsort.cpp) there. +The [last section](#why-bubble-sort) explains why we chose BubbleSort for this tutorial. + +We create and use the following directories as needed throughout this guide: + +- **out**: Stores output binaries +- **prof**: Stores profile data +- **heatmap**: Stores heatmap visualizations and related metrics + +### Compile the input program {#compile} +We now compile the input binary. +Because BOLT and PGO pipelines can include multiple stages, this binary is also called the **stage-0 binary**. + +To keep the example useful, we must keep the original function order. +Small programs like this are simple enough that compilers might reorder functions and improve layout without profile data. +That behavior is rare in real applications, but it can happen here. +To keep our example with poor locality, we pass specific options to the relevant toolchain. + +BOLT works with both LLVM and GNU toolchains. +GNU (gcc) provides a direct flag that preserves the original order: `-fno-toplevel-reorder`. +LLVM (clang) requires an order file that defines the initial layout. +You can find this file here: [orderfile.txt](../orderfile.txt). + +Both approaches are shown below. +Compile with your preferred toolchain, and ensure that relocations are enabled. +We explain why they matter [later](#why-relocations) in this tutorial. 
+ +{{< tabpane code=true >}} + {{< tab header="GNU" language="bash">}} +mkdir -p out +gcc bsort.cpp -o out/bsort -O3 -Wl,--emit-relocs -fno-toplevel-reorder + {{< /tab >}} + {{< tab header="LLVM" language="bash">}} +mkdir -p out +clang bsort.cpp -o out/bsort -O3 -fuse-ld=lld -ffunction-sections -Wl,--emit-relocs -Wl,--symbol-ordering-file=orderfile.txt + {{< /tab >}} +{{< /tabpane >}} + +### Verify the function order +We now verify that the compiler preserved the original function order. +We do this by inspecting the symbols in the `.text` section. +The output should list the swap and cold functions interleaved, matching their order in the source file. + +{{< tabpane code=true >}} + {{< tab header="GNU" language="bash" output_lines="2-13">}} + objdump --syms --demangle out/bsort | grep ".text" | grep ")" | sort + 0000000000014000 l F .text 0000000000000e4c swap1(int*, int*) + 0000000000018000 l F .text 0000000000000008 cold_func1() + 0000000000018008 l F .text 0000000000000e4c swap2(int*, int*) + 000000000001c000 l F .text 0000000000000008 cold_func2() + 000000000001c008 l F .text 0000000000000e4c swap3(int*, int*) + 0000000000020000 l F .text 0000000000000008 cold_func3() + 0000000000020008 l F .text 0000000000000e4c swap4(int*, int*) + 0000000000024000 l F .text 0000000000000008 cold_func4() + 0000000000024008 l F .text 0000000000000e4c swap5(int*, int*) + 0000000000028000 l F .text 0000000000000008 cold_func5() + 0000000000028158 g F .text 00000000000000c0 bubble_sort(int*, int) + 0000000000028218 g F .text 00000000000000d8 sort_array(int*) + {{< /tab >}} + {{< tab header="LLVM" language="bash" output_lines="2-13">}} + llvm-objdump --syms --demangle out/bsort | grep ".text" | grep ")" | sort + 0000000000014000 l F .text 0000000000000e4c swap1(int*, int*) + 0000000000018000 l F .text 0000000000000008 cold_func1() + 0000000000018008 l F .text 0000000000000e4c swap2(int*, int*) + 000000000001c000 l F .text 0000000000000008 cold_func2() + 000000000001c008 l F .text 
0000000000000e4c swap3(int*, int*) + 0000000000020000 l F .text 0000000000000008 cold_func3() + 0000000000020008 l F .text 0000000000000e4c swap4(int*, int*) + 0000000000024000 l F .text 0000000000000008 cold_func4() + 0000000000024008 l F .text 0000000000000e4c swap5(int*, int*) + 0000000000028000 l F .text 0000000000000008 cold_func5() + 0000000000028158 g F .text 00000000000000c0 bubble_sort(int*, int) + 0000000000028218 g F .text 00000000000000d8 sort_array(int*) + {{< /tab >}} +{{< /tabpane >}} + + +### Verify the presence of relocations +We now verify that the binary includes relocations. +This can be seen by checking for `.rel*.*` entries in the section table, such as `.rela.text`. + +{{< tabpane code=true >}} + {{< tab header="GNU" language="bash" output_lines="2-13">}} + readelf -S out/bsort | grep .rel + [ 9] .rela.dyn RELA 0000000000000520 00000520 + [10] .rela.plt RELA 0000000000000658 00000658 + [20] .data.rel.ro PROGBITS 0000000000038560 00018560 + [23] .relro_padding NOBITS 0000000000038750 00018750 + [28] .rela.text RELA 0000000000000000 000187b8 + [29] .rela.eh_frame RELA 0000000000000000 00018ea8 + [30] .rela.init RELA 0000000000000000 00019058 + [31] .rela.data RELA 0000000000000000 00019070 + [32] .rela.fini_array RELA 0000000000000000 00019088 + [33] .rela.init_array RELA 0000000000000000 000190a0 + [35] .rela.data.rel.ro RELA 0000000000000000 00019158 + {{< /tab >}} + {{< tab header="LLVM" language="bash" output_lines="2-12">}} + llvm-readelf -S out/bsort | grep .rel + [ 9] .rela.dyn RELA 0000000000000520 000520 000138 18 A 4 0 8 + [10] .rela.plt RELA 0000000000000658 000658 0000c0 18 AI 4 26 8 + [20] .data.rel.ro PROGBITS 0000000000038560 018560 000028 00 WA 0 0 8 + [23] .relro_padding NOBITS 0000000000038750 018750 0008b0 00 WA 0 0 1 + [28] .rela.text RELA 0000000000000000 0187b8 0006f0 18 I 36 14 8 + [29] .rela.eh_frame RELA 0000000000000000 018ea8 0001b0 18 I 36 12 8 + [30] .rela.init RELA 0000000000000000 019058 000018 18 I 36 15 8 + [31] 
.rela.data RELA 0000000000000000 019070 000018 18 I 36 24 8 + [32] .rela.fini_array RELA 0000000000000000 019088 000018 18 I 36 18 8 + [33] .rela.init_array RELA 0000000000000000 0190a0 000018 18 I 36 19 8 + [35] .rela.data.rel.ro RELA 0000000000000000 019158 000078 18 I 36 20 8 + {{< /tab >}} +{{< /tabpane >}} + + +### Why relocations are important {#why-relocations} +BOLT relies on relocations to update references after it changes the code layout. +Without relocations, BOLT is severely limited. For example, function reordering is disabled, which makes code layout optimizations ineffective. + +Because BOLT runs post-link, it may need to adjust locations that the linker patched in the original binary. +Relocations describe these locations, so they must be preserved for BOLT to be able to apply its full set of layout optimizations. + + +### Why Bubble Sort? +Bubble Sort keeps this tutorial simple. +The code is in one file, has no external dependencies, and runs in a few seconds under instrumentation with a small, fixed workload. +In its original form it is not a good candidate for code layout optimization. +To make it one, we add **cold code** blocks between hot paths. +This reduces code locality, which BOLT improves later. + +The code below shows the changes we introduced to reduce code locality. + +The main sort function is shown below. It rotates through 5 copies of the swap function, selecting a different one each time a swap is performed. +```cpp { line_numbers=true linenos=table line_start=48 } +void bubble_sort(int *a, int n) { + if (n <= 1) + return; + + int end = n - 1; + int swapped = 1; + unsigned idx = 0; + + while (swapped && end > 0) { + swapped = 0; + // pick a different copy of the swap function, in a round-robin fashion + // and call it. 
+ for (int i = 1; i <= end; ++i) { + if (a[i] < a[i - 1]) { + auto swap_func = swap_funcs[idx++]; + idx %= FUNC_COPIES; + swap_func(&a[i - 1], &a[i]); + swapped = 1; + } + } + --end; + } +} +``` + +Each swap function is defined using a macro and includes some nop instructions on a cold path. +```cpp { line_numbers=true linenos=table line_start=18 } +#define SWAP_FUNC(ID) \ + static __attribute__((noinline)) \ + void swap##ID(int *left, int *right) { \ + if (COND()) NOPS(300); \ + int tmp = *left; \ + if (COND()) NOPS(300); else *left = *right; \ + if (COND()) NOPS(300); else *right = tmp; \ + } +``` + +To further reduce code locality, we place larger cold functions between hot ones. +These cold functions are also defined using a macro and consist entirely of nop instructions. +```cpp { line_numbers=true linenos=table line_start=28 } +#define COLD_FUNC(ID) \ + static __attribute__((noinline, aligned(16384), used)) \ + void cold_func##ID(void) { \ + asm volatile("nop"); \ + } +``` + +We use the above two macros to interleave the hot and cold functions in the binary. +Locality is reduced because each call uses a different swap function with large cold code regions placed between them. +```cpp { line_numbers=true linenos=table line_start=35 } +SWAP_FUNC(1) COLD_FUNC(1) +SWAP_FUNC(2) COLD_FUNC(2) +SWAP_FUNC(3) COLD_FUNC(3) +SWAP_FUNC(4) COLD_FUNC(4) +SWAP_FUNC(5) COLD_FUNC(5) +``` diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/spe.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/spe.md new file mode 100644 index 0000000000..fb86411b9e --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/spe.md @@ -0,0 +1,77 @@ +--- + +title: "BOLT with SPE" + +weight: 7 + +### FIXED, DO NOT MODIFY + +layout: learningpathall + +--- + +### What is SPE +SPE stands for Statistical Profiling Extension. It is an Arm hardware unit that provides low-overhead, statistical sampling of program execution. 
+SPE samples microarchitectural events such as instruction execution, memory accesses, and branches. + +For BOLT, SPE branch samples are the relevant input as they provide an edge-based control-flow profile. +Unlike [BRBE](../brbe), SPE does not record sequences of taken branches. +Each sample captures only a single transition between two program locations, representing a single edge in the control-flow graph. + +Some implementations also support the Previous Branch Target (PBT) feature. +This feature records 1 taken branch in addition to the edge. +This provides a depth-1 branch history. It extends standard SPE sampling but remains shallower than BRBE. + +### When to use SPE +SPE provides less detailed control-flow information than BRBE. It can still capture useful branch behavior and guide code layout decisions, making it a good alternative when BRBE is unavailable or instrumentation overhead is prohibitive. + +### Optimizing with SPE +We check [SPE availability](#availability) before recording a profile. +We then record an SPE profile by running our workload under perf, convert it into a format that BOLT understands, and run the BOLT optimization. + +```bash { line_numbers=true } +mkdir -p prof +perf record -e arm_spe/branch_filter=1/u -o prof/spe.data -- ./out/bsort +perf2bolt -p prof/spe.data -o prof/spe.fdata ./out/bsort --spe +llvm-bolt out/bsort -o out/bsort.opt.spe --data prof/spe.fdata \ + -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions \ + --dyno-stats +``` + + +### Availability +SPE is an optional feature in processors that implement [Armv8.1](https://developer.arm.com/documentation/109697/2025_12/Feature-descriptions/The-Armv8-2-architecture-extension#md447-the-armv82-architecture-extension__feat_FEAT_SPE) or later. To check availability, we record a trace. 
+ +On a successful recording we see: +```bash { command_line="user@host | 2-5"} +perf record -e arm_spe/branch_filter=1/u -o prof/spe.data -- ./out/bsort +Bubble sorting 10000 elements +454 ms (first=100669 last=2147469841) +[ perf record: Woken up 7 times to write data ] +[ perf record: Captured and wrote 13.458 MB prof/spe.data ] +``` + +When unavailable: +```bash { command_line="user@host | 2-12"} +perf record -e arm_spe/branch_filter=1/u -o prof/spe.data -- ./out/bsort + +event syntax error: 'arm_spe/branch_filter=1/u' + \___ Bad event or PMU + +Unable to find PMU or event on a PMU of 'arm_spe' +Run 'perf list' for a list of valid events + + Usage: perf record [] [] + or: perf record [] -- [] + + -e, --event event selector. use 'perf list' to list available events +``` + +To record an SPE trace we need a Linux system that is version 6.14 or later. We can check the version using: +```bash +perf --version +``` + + +### Further Reading +- [Arm Statistical Profiling Extension: Performance Analysis Methodology White Paper](https://developer.arm.com/documentation/109429/latest/) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/verify-optimization.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/verify-optimization.md new file mode 100644 index 0000000000..4390b175e0 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/verify-optimization.md @@ -0,0 +1,81 @@ +--- +title: Verify Optimization +weight: 9 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +### Verify with runtime + +{{% notice Note %}} +The example below uses a [BRBE](../brbe) optimized binary. The same verification applies to all BOLT profiling methods. +{{% /notice %}} + +We start by checking the runtime of the original and optimized BubbleSort binaries. A speedup is the first indication that BOLT improved the layout. 
+ +```bash { command_line="user@host | 2-4,6-8"} +time out/bsort + Bubble sorting 10000 elements + 280 ms (first=100669 last=2147469841) + out/bsort 0.28s user 0.00s system 99% cpu 0.282 total +time out/bsort.opt.brbe + Bubble sorting 10000 elements + 147 ms (first=100669 last=2147469841) + out/bsort.opt.brbe 0.15s user 0.00s system 99% cpu 0.148 total +``` + +In this example, we see a first indication of improvement from the speedup. It is large, around 2x, because the input program is intentionally pathological. Real applications may see smaller improvements. + + +### Verify with hardware metrics +We now apply the [TopDown Methodology](https://developer.arm.com/documentation/109542/02/Arm-Topdown-methodology) again to confirm that BOLT improved the layout. +Runtime shows the effect, but TopDown confirms how the changes appear in the hardware metrics. + +We run the same tool that we used when checking whether the input program was a good candidate, but this time we check the optimized binary, for example the BRBE-optimized one. 
+ +{{< tabpane code=true >}} + {{< tab header="topdown-tool" language="bash" output_lines="2-21">}} + topdown-tool ./out/bsort.opt.brbe + CPU Neoverse V1 metrics + ├── Stage 1 (Topdown metrics) + │ └── Topdown Level 1 (Topdown_L1) + │ └── ┏━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━┓ + │ ┃ Metric ┃ Value ┃ Unit ┃ + │ ┡━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━┩ + │ │ Backend Bound │ 11.19 │ % │ + │ │ Bad Speculation │ 24.86 │ % │ + │ » │ Frontend Bound │ 36.10 │ % │ « + │ │ Retiring │ 28.42 │ % │ + │ └─────────────────┴───────┴──────┘ + └── Stage 2 (uarch metrics) + ├── Misses Per Kilo Instructions (MPKI) + │ └── ┏━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + │ ┃ Metric ┃ Value ┃ Unit ┃ + │ ┡━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ │ Branch MPKI │ 9.799 │ misses per 1,000 instructions │ + │ » │ L1I Cache MPKI │ 0.019 │ misses per 1,000 instructions │ « + │ └─────────────────────────┴───────┴───────────────────────────────┘ + ... + {{< /tab >}} + {{< tab header="perf stat" language="bash" output_lines="2-10">}} + perf stat -e instructions,L1-icache-misses:u ./out/bsort.opt.brbe + Performance counter stats for './out/bsort.opt.brbe': + + 982204165 instructions + 3807 L1-icache-misses + + 0.147606245 seconds time elapsed + + 0.147644000 seconds user + 0.000000000 seconds sys + {{< /tab >}} +{{< /tabpane >}} + +We compare these metrics with the earlier results. Front-end bound and L1I MPKI should be lower after optimization. + +We now see that the optimized program is **36%** front-end bound, down from 55%. In addition, the L1I MPKI is close to **0**, showing that code layout improved. This result is unusually low because the input program is intentionally pathological. + +The Branch MPKI also dropped to **10** from 16 because BOLT can improve branch prediction by swapping the fall-through and taken paths based on profile data. 
+ +We can also compute these MPKIs manually using `perf stat`, as described in the [Good BOLT Candidates](../good-candidates) page. From 7aec49783bd098f173beba848271dce6e419f5bc Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 6 Mar 2026 10:14:38 -0500 Subject: [PATCH 02/51] Refine content for DGX Spark robotics learning path Updated the title and descriptions for clarity and consistency. Revised learning objectives and prerequisites for better readability. --- .../dgx_spark_isaac_robotics/_index.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/_index.md b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/_index.md index 52874e8400..b4c4df7876 100644 --- a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/_index.md +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/_index.md @@ -1,5 +1,5 @@ --- -title: Build Robot Simulation and RL Workflows with Isaac Sim and Isaac Lab on DGX Spark +title: Build Robot Simulation and Reinforcement Learning Workflows with Isaac Sim and Isaac Lab on DGX Spark draft: true cascade: @@ -7,20 +7,19 @@ cascade: minutes_to_complete: 90 -who_is_this_for: This learning path is intended for robotics developers, simulation engineers, and AI researchers who want to run high-fidelity robotic simulations and reinforcement learning (RL) pipelines using Isaac Sim and Isaac Lab on Arm-based NVIDIA DGX Spark systems powered by the Grace–Blackwell (GB10) architecture. +who_is_this_for: This learning path is intended for robotics developers, simulation engineers, and AI researchers who want to run high-fidelity robotic simulations and reinforcement learning (RL) pipelines using NVIDIA Isaac Sim and Isaac Lab on Arm-based NVIDIA DGX Spark system powered by the Grace–Blackwell (GB10) architecture. 
learning_objectives: - - Explain the roles of Isaac Sim and Isaac Lab, and describe how DGX Spark accelerates robotic simulation and reinforcement learning workloads - - Build Isaac Sim and Isaac Lab from source on an Arm-based DGX Spark system - - Launch and control a basic robot simulation in Isaac Sim using Python scripts - - Train and evaluate a reinforcement learning policy for the Unitree H1 humanoid robot using Isaac Lab and the RSL-RL interface + - Describe the roles of Isaac Sim and Isaac Lab within a robotics simulation and RL pipeline + - Build and configure Isaac Sim and Isaac Lab on an Arm-based DGX Spark system + - Launch and control a robot simulation in Isaac Sim using Python + - rain and evaluate a reinforcement learning policy for the Unitree H1 humanoid robot using Isaac Lab and RSL-RL prerequisites: - - Access to an NVIDIA DGX Spark system with at least 50 GB of free disk space + - A NVIDIA DGX Spark system with at least 50 GB of free disk space - Familiarity with Linux command-line tools - Experience with Python scripting and virtual environments - Basic understanding of reinforcement learning concepts (rewards, policies, episodes) - - Experience building software from source using CMake and make author: - Johnny Nunez From 003d438bdb0b9a1f5cd190bfa7e0bc9ab441eb53 Mon Sep 17 00:00:00 2001 From: Gabriel Peterson <25187859+gabrieldpeterson@users.noreply.github.com> Date: Fri, 6 Mar 2026 09:13:10 -0800 Subject: [PATCH 03/51] Add a Learning Path for running image classification on an Alif E8 DevKit with ExecuTorch --- .../alif-image-classification/_index.md | 62 ++++ .../alif-image-classification/_next-steps.md | 8 + .../alif-image-classification/_review.md | 40 +++ .../application-code.md | 142 +++++++++ .../aws-ec2-setup.md | 282 ++++++++++++++++++ .../alif-image-classification/board-setup.md | 105 +++++++ .../build-flash-verify.md | 120 ++++++++ .../alif-image-classification/cat.jpg | Bin 0 -> 23009 bytes .../create-project.md | 210 
+++++++++++++ .../image-preparation.md | 126 ++++++++ .../memory-configuration.md | 185 ++++++++++++ 11 files changed, 1280 insertions(+) create mode 100644 content/learning-paths/embedded-and-microcontrollers/alif-image-classification/_index.md create mode 100644 content/learning-paths/embedded-and-microcontrollers/alif-image-classification/_next-steps.md create mode 100644 content/learning-paths/embedded-and-microcontrollers/alif-image-classification/_review.md create mode 100644 content/learning-paths/embedded-and-microcontrollers/alif-image-classification/application-code.md create mode 100644 content/learning-paths/embedded-and-microcontrollers/alif-image-classification/aws-ec2-setup.md create mode 100644 content/learning-paths/embedded-and-microcontrollers/alif-image-classification/board-setup.md create mode 100644 content/learning-paths/embedded-and-microcontrollers/alif-image-classification/build-flash-verify.md create mode 100644 content/learning-paths/embedded-and-microcontrollers/alif-image-classification/cat.jpg create mode 100644 content/learning-paths/embedded-and-microcontrollers/alif-image-classification/create-project.md create mode 100644 content/learning-paths/embedded-and-microcontrollers/alif-image-classification/image-preparation.md create mode 100644 content/learning-paths/embedded-and-microcontrollers/alif-image-classification/memory-configuration.md diff --git a/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/_index.md b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/_index.md new file mode 100644 index 0000000000..d09347e1b3 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/_index.md @@ -0,0 +1,62 @@ +--- +title: Run image classification on an Alif Ensemble E8 DevKit with ExecuTorch and Ethos-U85 + +minutes_to_complete: 120 + +who_is_this_for: This Learning Path is for embedded developers who want to deploy a neural network on an 
Arm Cortex-M55 microcontroller with an Ethos-U85 NPU. You will compile a MobileNetV2 model using ExecuTorch, embed it into bare-metal firmware, and run image classification on the Alif Ensemble E8 DevKit. + +learning_objectives: + - Compile a MobileNetV2 model for the Ethos-U85 NPU using ExecuTorch's ahead-of-time (AOT) compiler on an Arm-based cloud instance. + - Build ExecuTorch static libraries for bare-metal Cortex-M55 targets. + - Configure CMSIS project files, memory layout, and linker scripts for a large ML workload on the Alif Ensemble E8. + - Run real-time image classification inference on the Ethos-U85 NPU and verify results through SEGGER RTT. + +prerequisites: + - An Alif Ensemble E8 DevKit with a USB-C cable. + - A SEGGER J-Link debug probe (the DevKit has one built in). + - A development machine running macOS (Apple Silicon) or Linux. + - (Optional) An AWS account or access to an Arm-based cloud instance (Graviton c7g.4xlarge recommended). You can also build ExecuTorch locally on an Arm-based machine, though the steps will differ. + - Basic familiarity with C/C++ and embedded development concepts. + - VS Code installed on your development machine. 
+ +author: Gabriel Peterson + +### Tags +skilllevels: Advanced +subjects: ML +armips: + - Cortex-M + - Ethos-U +tools_software_languages: + - ExecuTorch + - PyTorch + - GCC + - CMSIS-Toolbox + - Python +operatingsystems: + - Baremetal + +further_reading: + - resource: + title: ExecuTorch Arm Ethos-U NPU Backend Tutorial + link: https://docs.pytorch.org/executorch/1.0/tutorial-arm-ethos-u.html + type: documentation + - resource: + title: Alif Ensemble E8 DevKit Support Page + link: https://alifsemi.com/support/kits/ensemble-e8devkit/ + type: website + - resource: + title: Arm Ethos-U85 NPU Technical Overview + link: https://developer.arm.com/Processors/Ethos-U85 + type: documentation + - resource: + title: CMSIS-Toolbox Documentation + link: https://arm-software.github.io/CMSIS_6/latest/Toolbox/index.html + type: documentation + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/_next-steps.md b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. 
+layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/_review.md b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/_review.md new file mode 100644 index 0000000000..b9f9bfdebb --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/_review.md @@ -0,0 +1,40 @@ +--- +title: Knowledge check +weight: 20 + +layout: "learningpathall" + +review: + - questions: + question: "Which NPU peripheral on the Alif Ensemble E8 is used for the Ethos-U85?" + explanation: "NPU_HG (High-Grade) at base address 0x49042000 is the Ethos-U85. NPU_HP is an Ethos-U55 at a different address. Using the wrong base address causes a product mismatch error." + correct_answer: 2 + answers: + - "NPU_HP" + - "NPU_HG" + - "NPU_HE" + - questions: + question: "Why do some ExecuTorch libraries need to be linked with --whole-archive?" + explanation: "Libraries like libexecutorch and libcortex_m_ops_lib contain static registration constructors that register operators and PAL symbols at startup. Without --whole-archive, the linker sees these constructors as unused and discards them, causing missing operator errors at runtime." + correct_answer: 3 + answers: + - "Because they are too large for normal linking" + - "Because the linker requires it for all C++ libraries" + - "Because they contain static registration constructors that would otherwise be discarded" + - questions: + question: "What does the GOT (Global Offset Table) fix in the linker script address?" + explanation: "The precompiled ExecuTorch libraries use position-independent code (PIC) that relies on the GOT for indirect function calls and vtable lookups. If the GOT isn't copied from flash to RAM at startup, these lookups resolve to address zero, causing BusFaults." 
+ correct_answer: 1 + answers: + - "BusFaults caused by uninitialized indirect function call tables" + - "Stack overflow errors during inference" + - "Incorrect NPU command stream alignment" + - questions: + question: "What input data type does the MobileNetV2 model expect?" + explanation: "The model's first operator is cortex_m::quantize_per_tensor, which converts float32 input to int8 for the NPU. The image is stored as int8 in the header to save flash space, but the application code converts it to float32 before passing it to the model." + correct_answer: 2 + answers: + - "int8" + - "float32" + - "uint8" +--- diff --git a/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/application-code.md b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/application-code.md new file mode 100644 index 0000000000..35d5c95590 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/application-code.md @@ -0,0 +1,142 @@ +--- +title: Add the application code +weight: 5 + +layout: "learningpathall" +--- + +## Overview + +The application code initializes the Ethos-U85 NPU, loads the MobileNetV2 model through ExecuTorch, runs inference on an embedded test image, and prints the classification result over SEGGER RTT. + +Rather than building this code line by line, you download the complete `main.cpp` from a reference repository and walk through its key sections. + +## Download main.cpp + +Download the working `main.cpp` from the workshop repository and place it in your project: + +```bash +cd ~/repo/alif/alif_vscode-template/mv2_runner +curl -L -o main.cpp \ + https://raw.githubusercontent.com/ArmDeveloperEcosystem/workshop-ethos-u/main/main.cpp +``` + +{{% notice Note %}} +If you prefer, you can clone the full repository with `git clone https://github.com/ArmDeveloperEcosystem/workshop-ethos-u.git` and copy `main.cpp` from there. 
+{{% /notice %}} + +The following sections explain what the code does. You don't need to modify anything; the downloaded file is ready to build. + +## Fault handlers + +The fault handlers (HardFault, MemManage, BusFault) print the stacked program counter and link register to SEGGER RTT when a crash occurs. This is essential for debugging on bare-metal systems where you don't have a console or operating system catching exceptions for you. + +```cpp +extern "C" void Fault_Print(uint32_t *frame, uint32_t fault_type) { + const char* names[] = {"HARDFAULT", "MEMMANAGE", "BUSFAULT"}; + SEGGER_RTT_printf(0, "*** %s ***\n", names[fault_type]); + // ... prints CFSR, HFSR, PC, LR, R0-R3 + while (1) { __WFI(); } +} +``` + +If you ever see a fault message in RTT Viewer, the PC value tells you exactly which instruction caused the crash. You can cross-reference it with the `.map` file or use `arm-none-eabi-addr2line` to find the source line. + +## NPU initialization + +The Alif E8 has three NPUs, and you need to use the right one. The model was compiled for Ethos-U85, which is the NPU_HG (High-Grade) peripheral at base address `0x49042000`: + +```cpp +static int npu_init(void) { + if (ethosu_init(ðos_drv, + (void*)NPU_HG_BASE, // Ethos-U85 + 0, 0, // no fast memory + 1, 1)) // secure, privileged + { + SEGGER_RTT_printf(0, "ERROR: ethosu_init failed\n"); + return -1; + } + return 0; +} +``` + +{{% notice Note %}} +NPU_HP (at a different base address) is an Ethos-U55, not the U85. Using the wrong base address results in a product mismatch error from the NPU driver. +{{% /notice %}} + +## NPU polling + +The NPU_HG peripheral has no interrupt line routed to the M55_HP core's NVIC. 
The code works around this by overriding the `ethosu_semaphore_take()` function to poll the NPU status register directly: + +```cpp +extern "C" int ethosu_semaphore_take(void *sem, uint64_t timeout) { + struct ethosu_sem_t *s = (struct ethosu_sem_t *)sem; + while (s->count == 0) { + if (NPU_HG_STATUS & 0x2) { // bit 1 = irq_raised + ethosu_irq_handler(ðos_drv); + } + __NOP(); + } + s->count--; + return 0; +} +``` + +The SysTick handler also polls this status register at 25 Hz as a backup path, and toggles the red LED so you can see the board is alive during inference. + +## ExecuTorch Platform Abstraction Layer + +ExecuTorch requires several platform functions to be implemented. These are thin wrappers that route logging through SEGGER RTT and use the standard library's `malloc`/`free` for the small dynamic allocations that ExecuTorch's initialization needs: + +```cpp +extern "C" { + void et_pal_init(void) {} + ET_NORETURN void et_pal_abort(void) { __BKPT(0); while(1) {} } + void et_pal_emit_log_message(...) { + SEGGER_RTT_printf(0, "[%c] %s\n", (char)level, message); + } + void* et_pal_allocate(size_t size) { return malloc(size); } + void et_pal_free(void* ptr) { free(ptr); } +} +``` + +## Memory pools + +The ExecuTorch runtime uses three memory pools, placed in SRAM using linker section attributes: + +```cpp +// SRAM0 (4 MB total) +__attribute__((section(".bss.at_sram0"), aligned(16))) +static uint8_t method_alloc_pool[1536 * 1024]; // 1.5 MB + +__attribute__((section(".bss.at_sram0"), aligned(16))) +static uint8_t temp_alloc_pool[1536 * 1024]; // 1.5 MB + +__attribute__((section(".bss.at_sram0"), aligned(16))) +static float input_float_buf[3 * 224 * 224]; // ~588 KB + +// SRAM1 (4 MB total) +__attribute__((section(".bss.at_sram1"), aligned(16))) +static uint8_t planned_buffer_pool[4 * 1024 * 1024]; // 4 MB +``` + +The method allocator holds the loaded model graph. 
The temp allocator provides scratch memory for the Ethos-U backend (which needs approximately 1.44 MB). The planned buffer pool holds the intermediate tensors that ExecuTorch pre-plans at model load time. + +## The inference pipeline + +The `run_inference()` function follows a 10-step pipeline: + +1. **Initialize** the ExecuTorch runtime. +2. **Create a data loader** that reads the model directly from flash memory (zero-copy). +3. **Load the program** (parse the `.pte` flatbuffer). +4. **Query method metadata** to find out how many planned buffers the model needs and how large they are. +5. **Set up planned memory** by carving sub-allocations from the SRAM1 pool. +6. **Create the memory manager** that ties together the method, temp, and planned allocators. +7. **Load the method** (the `forward` function of the model). +8. **Prepare the input tensor**: convert the embedded int8 image data to float32 (the model's first operator is `quantize_per_tensor`, which expects float input). +9. **Execute inference**: the quantize op runs on the CPU, the entire MobileNetV2 backbone runs as a single NPU command stream on the Ethos-U85, and the dequantize op runs back on the CPU. +10. **Read the output**: find the argmax of the 1000-class output vector to get the predicted ImageNet class. + +The NPU handles the bulk of the computation. The CPU-side overhead (ExecuTorch loading, input conversion, quantize/dequantize) is small compared to the NPU workload. + +You now have the application code in place. The next section configures the memory layout to accommodate the model and ExecuTorch runtime. 
diff --git a/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/aws-ec2-setup.md b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/aws-ec2-setup.md new file mode 100644 index 0000000000..c19ed3d126 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/aws-ec2-setup.md @@ -0,0 +1,282 @@ +--- +title: Compile the model on an Arm cloud instance +weight: 3 + +layout: "learningpathall" +--- + +## Why an Arm cloud instance? + +ExecuTorch's Arm backend build scripts are designed for native Arm compilation. The Vela compiler, which generates optimized command streams for Ethos-U NPUs, and the CMSIS-NN kernels all target Arm natively. Using an Arm-based EC2 instance avoids the complexity of cross-compilation from x86. + +In this section, you launch a Graviton-based EC2 instance, install ExecuTorch, compile a MobileNetV2 model for the Ethos-U85, and build the ExecuTorch static libraries that your firmware will link against. + +## Launch an EC2 instance + +Create an AWS EC2 instance with the following configuration: + +- **Instance type**: `c7g.4xlarge` (Arm Graviton3, 16 vCPUs, 32 GB RAM) +- **OS**: Ubuntu 22.04 LTS +- **Storage**: 50 GB + +The 16 cores speed up the ExecuTorch build significantly, and the 50 GB disk accommodates the repository, submodules, and build artifacts. + +Set up your SSH config so you can connect with a short alias (for example, `ssh alif`). This makes the `scp` commands later more convenient. 
+ +## Install system dependencies + +Connect to your instance and install the required packages: + +```bash +sudo apt update && sudo apt upgrade -y +sudo apt install -y \ + git \ + cmake \ + ninja-build \ + build-essential \ + python3.10 \ + python3.10-venv \ + python3-pip \ + unzip \ + wget \ + rsync +``` + +Reboot if the kernel was updated: + +```bash +sudo reboot +``` + +After reconnecting, verify that Python 3.10 is available: + +```bash +python3 --version +``` + +The output is similar to: + +```output +Python 3.10.12 +``` + +## Set up the Python environment + +Create an isolated Python environment and install PyTorch: + +```bash +python3 -m venv ~/venv_executorch +source ~/venv_executorch/bin/activate + +pip install --upgrade pip setuptools wheel ninja cmake +pip install pyyaml +pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu +pip install pillow +``` + +You use CPU-only PyTorch because this instance has no GPU. You only need PyTorch for model export and ahead-of-time compilation; the actual inference happens on the microcontroller. + +Verify the installation: + +```bash +python - <<'PY' +import torch, torchvision +print(torch.__version__, torchvision.__version__) +PY +``` + +## Clone and install ExecuTorch + +```bash +mkdir -p ~/alif +cd ~/alif +git clone https://github.com/pytorch/executorch.git +cd executorch +git checkout 40d94b6d62a195a2f46b2baa20383fa4af27f7d4 +git submodule update --init --recursive +``` + +{{% notice Note %}} +The `git checkout` command pins ExecuTorch to a known-working commit. The Arm backend and Vela toolchain integration can change between commits, so pinning avoids unexpected breakage. +{{% /notice %}} + +Install the ExecuTorch Python package: + +```bash +python -m pip install -e . 
--no-build-isolation +``` + +## Set up the Arm/Ethos-U toolchain + +ExecuTorch includes a setup script that downloads the Arm GNU toolchain, CMSIS, and the Vela compiler: + +```bash +cd ~/alif/executorch +./examples/arm/setup.sh --i-agree-to-the-contained-eula +``` + +{{% notice Note %}} +The setup script may fail at the `tosa_serialization_lib` build step due to a pybind11 version incompatibility. If you see an error containing `def_property family does not currently support keep_alive`, run the following commands to complete the setup manually: + +```bash +pip install "pybind11<2.14" scikit-build-core setuptools_scm + +CMAKE_POLICY_VERSION_MINIMUM=3.5 pip install --no-build-isolation \ + --no-dependencies \ + ~/alif/executorch/examples/arm/arm-scratch/tosa-tools/serialization + +pip install --no-dependencies \ + -r ~/alif/executorch/backends/arm/requirements-arm-ethos-u.txt +``` + +The first command installs a pybind11 version that doesn't have the breaking change, along with the build tools that the serialization library needs. The second command builds and installs the serialization library using those local packages instead of downloading new ones. The third command installs the Ethos-U Vela compiler, which the setup script never reached due to the earlier failure. +{{% /notice %}} + +Source the environment paths that the setup script generated: + +```bash +source examples/arm/arm-scratch/setup_path.sh +``` + +After the setup script, reinstall ExecuTorch and its dependencies: + +```bash +python -m pip install -e . 
--no-build-isolation +pip install "torchao==0.15.0" +``` + +## Compile MobileNetV2 for Ethos-U85 + +Source the setup paths and run the ahead-of-time compiler: + +```bash +cd ~/alif/executorch +source examples/arm/arm-scratch/setup_path.sh + +mkdir -p ~/alif/models + +python -m examples.arm.aot_arm_compiler \ + -m mv2 \ + -q \ + -d \ + -t ethos-u85-256 \ + -o ~/alif/models/mv2_ethosu85_256.pte +``` + +The flags are: +- `-m mv2`: MobileNetV2 model +- `-q`: quantize the model (int8) +- `-d`: delegate computation to the NPU +- `-t ethos-u85-256`: target the Ethos-U85 with 256 MAC configuration +- `-o`: output path for the compiled `.pte` file + +Verify the output: + +```bash +ls -lh ~/alif/models/mv2_ethosu85_256.pte +``` + +The file should be approximately 3.7 MB. This `.pte` file contains the model graph, quantized weights, and the Vela-compiled command stream that the Ethos-U85 executes directly. + +## Build ExecuTorch static libraries + +Your firmware needs to link against ExecuTorch's runtime libraries. Build them for bare-metal Cortex-M: + +```bash +cd ~/alif/executorch +source ~/venv_executorch/bin/activate + +rm -rf cmake-out +bash backends/arm/scripts/build_executorch.sh +``` + +This step takes several minutes. When complete, list the output libraries: + +```bash +find arm_test/cmake-out -type f -name "*.a" | sort +``` + +You should see approximately 13 libraries, including `libexecutorch.a`, `libexecutorch_core.a`, `libexecutorch_delegate_ethos_u.a`, `libcortex_m_ops_lib.a`, and `libcmsis-nn.a`. 
+ +## Package headers and libraries + +Bundle the headers and libraries for transfer to your development machine: + +```bash +cd ~/alif/executorch + +rm -rf ~/alif/et_bundle +mkdir -p ~/alif/et_bundle +cp -a arm_test/cmake-out/include ~/alif/et_bundle/ +cp -a arm_test/cmake-out/lib ~/alif/et_bundle/ + +tar -C ~/alif -czf ~/alif/et_bundle.tar.gz et_bundle +ls -lh ~/alif/et_bundle.tar.gz +``` + +## Transfer artifacts to your development machine + +Run these commands on your Mac or Linux development machine (not on the EC2 instance). The paths below use `~/repo/alif/` as the working directory; adjust these to match your own project location: + +```bash +mkdir -p ~/repo/alif/models +mkdir -p ~/repo/alif/third_party/executorch/lib + +scp alif:/home/ubuntu/alif/models/mv2_ethosu85_256.pte ~/repo/alif/models/ +scp alif:/home/ubuntu/alif/et_bundle.tar.gz ~/repo/alif/models/ +scp 'alif:/home/ubuntu/alif/executorch/arm_test/cmake-out/lib/*.a' \ + ~/repo/alif/third_party/executorch/lib/ +``` + +Verify the transfer: + +```bash +ls -lh ~/repo/alif/models/mv2_ethosu85_256.pte +ls ~/repo/alif/third_party/executorch/lib/*.a | wc -l +``` + +You should see the 3.7 MB model file and 13 library files. + +## Convert the model to a C header + +The firmware embeds the model as a byte array in flash memory. Use `xxd` to generate a C header: + +```bash +cd ~/repo/alif/models +xxd -i mv2_ethosu85_256.pte > mv2_ethosu85_256_pte.h +``` + +Open `mv2_ethosu85_256_pte.h` and change the first line from: + +```c +unsigned char mv2_ethosu85_256_pte[] = { +``` + +to: + +```c +#include +const uint8_t __attribute__((aligned(16))) mv2_ethosu85_256_pte[] = { +``` + +The `aligned(16)` attribute is required because the Ethos-U85 needs the Vela command stream data aligned to 16 bytes. Without it, the NPU driver will report an alignment error at runtime. 
+ +## Extract the header bundle + +On your development machine, extract the ExecuTorch headers into the VS Code template project: + +```bash +cd ~/repo/alif/alif_vscode-template +mkdir -p third_party/executorch +tar -C third_party/executorch -xzf ~/repo/alif/models/et_bundle.tar.gz +``` + +Verify the headers are in place: + +```bash +ls third_party/executorch/et_bundle/include/executorch/ +``` + +You should see `runtime/` and other directories. + +You now have the compiled model, prebuilt libraries, and headers on your development machine, ready to integrate into the firmware project. diff --git a/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/board-setup.md b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/board-setup.md new file mode 100644 index 0000000000..213b0106aa --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/board-setup.md @@ -0,0 +1,105 @@ +--- +title: Set up the Alif Ensemble E8 DevKit +weight: 2 + +layout: "learningpathall" +--- + +## Overview + +The Alif Ensemble E8 DevKit features a dual-core Arm Cortex-M55 processor and three neural processing units (NPUs): two Ethos-U55 and one Ethos-U85. In this Learning Path, you use the Cortex-M55 High-Performance (HP) core running at 400 MHz to orchestrate inference on the Ethos-U85 NPU. + +Before writing any ML code, you need to verify that your toolchain, debug probe, and flashing workflow all function correctly. This section walks you through hardware setup, software installation, and a sanity check build. + +## Connect the board + +1. Unplug all USB cables from the board before changing any jumpers. +2. Verify the jumpers are in their factory default positions, as shown in the Alif Ensemble E8 DevKit (DK-E8) User Guide, available on [alifsemi.com](https://alifsemi.com/support/kits/ensemble-e8devkit/). +3. 
Connect a USB-C cable from your computer to the **PRG USB** port on the bottom edge of the board. +4. Confirm that a green LED illuminates near the E1 device and the UART switch (SW4). + +Leave **SW4** in its default position. This routes the on-board USB UART to **SEUART**, which the Alif Security Toolkit uses for programming. + +{{% notice Note %}} +Don't have a terminal application (PuTTY, minicom, screen) attached to SEUART while using the Security Toolkit. There is only one SEUART on the device, and two applications can't share the port. +{{% /notice %}} + +## Install the Alif Security Toolkit + +The Security Toolkit (SETOOLS) programs firmware images onto the board. + +1. Download SETOOLS v1.107.000 from the [Alif Ensemble E8 DevKit support page](https://alifsemi.com/support/kits/ensemble-e8devkit/). +2. Extract it to a stable location, for example `~/alif/app-release-exec-macos/`. +3. Open a terminal in the SETOOLS directory and run: + +```bash +./updateSystemPackage -d +``` + +On macOS, the system blocks this unsigned binary the first time. Open **System Settings > Privacy & Security**, scroll to the **Security** section, and select **Allow Anyway**. Then re-run the command. + +When prompted for a serial port, enter the DevKit's USB modem port. It usually appears as `/dev/cu.usbmodemXXXXXXX`. If SETOOLS detects the Ensemble E8 and asks to set it as default, answer `y`. + +## Install SEGGER J-Link + +SEGGER J-Link provides the debug connection for RTT (Real-Time Transfer) output, which you use later to view inference results. + +On macOS, install it with Homebrew: + +```bash +brew install --cask segger-jlink +``` + +Alternatively, download it from the [SEGGER website](https://www.segger.com/downloads/jlink/). Run J-Link Commander once after installation to update the on-board probe firmware if needed. + +## Set up VS Code and the Alif template + +1. 
Clone the Alif VS Code template repository: + +```bash +cd ~/repo/alif +git clone https://github.com/alifsemi/alif_vscode-template.git +cd alif_vscode-template +git checkout 8b1aa0b09eacf68a28850af00c11f0b5af03c100 +git submodule update --init +``` + +{{% notice Note %}} +The `git checkout` command pins the template to a known-working commit. This avoids breakage if the upstream template is updated. +{{% /notice %}} + +2. Open the `alif_vscode-template/` folder in VS Code. +3. Install the recommended extensions when prompted: + - Arm CMSIS Solution + - Arm Tools Environment Manager + - Cortex-Debug + - Microsoft C/C++ Extension Pack +4. When prompted, select **Always Allow** or **Allow for Selected Workspace**. +5. Restart VS Code if prompted. + +## Install CMSIS packs + +Press **F1** in VS Code, type `Tasks: Run Task`, and select **First time pack installation**. Press **A** to accept all licenses when prompted. + +## Configure VS Code settings + +Press **F1**, select **Preferences: Open User Settings (JSON)**, and add the following entries (update the paths for your system): + +```json +{ + "alif.setools.root": "/path/to/your/app-release-exec-macos", + "cortex-debug.JLinkGDBServerPath": "/Applications/SEGGER/JLink/JLinkGDBServerCLExe" +} +``` + +## Sanity check: build and flash Blinky + +Before moving on to ML code, verify your entire toolchain works end to end with the built-in Blinky example. + +1. In VS Code, select the **CMSIS** icon in the left sidebar. +2. Select the gear icon, then set **Active Target** to **E8-HP** and **Active Project** to **blinky**. +3. Select the **Build** (hammer) icon. +4. Press **F1**, select **Tasks: Run Task**, then select **Program with Security Toolkit (select COM port)**. +5. Choose the DevKit's port when prompted. + +If the board's red LED blinks, your toolchain, SETOOLS, and board connection are all working correctly. You're ready to move on to model compilation. 
diff --git a/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/build-flash-verify.md b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/build-flash-verify.md new file mode 100644 index 0000000000..1cbb5221c9 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/build-flash-verify.md @@ -0,0 +1,120 @@ +--- +title: Build, flash, and verify inference +weight: 8 + +layout: "learningpathall" +--- + +## Clean previous build artifacts + +If you've built other projects (like Blinky), delete the cached build files. CMSIS Toolbox caches aggressively and won't pick up YAML configuration changes unless you clean first: + +```bash +cd ~/repo/alif/alif_vscode-template +rm -rf tmp/ out/ +``` + +You can also clean from VS Code: press **F1** and select **CMSIS: Clean all out and tmp directories**. + +## Build the project + +### Option 1: Build from VS Code + +1. Select the **CMSIS** icon in the left sidebar. +2. Select the gear icon and set **Active Target** to **E8-HP** and **Active Project** to **mv2_runner**. +3. Select the **Build** (hammer) icon. + +### Option 2: Build from the command line + +If you prefer to build from the terminal, set the required environment variables first. 
The exact paths depend on where the Arm Tools Environment Manager installed the tools: + +```bash +export PATH="$HOME/.vcpkg/artifacts/2139c4c6/tools.open.cmsis.pack.cmsis.toolbox/2.12.0/bin:$HOME/.vcpkg/artifacts/2139c4c6/compilers.arm.arm.none.eabi.gcc/13.3.1/bin:$HOME/.vcpkg/artifacts/2139c4c6/tools.kitware.cmake/3.31.5/bin:$HOME/.vcpkg/artifacts/2139c4c6/tools.ninja.build.ninja/1.13.2:$PATH" +export CMSIS_COMPILER_ROOT="$HOME/.vcpkg/artifacts/2139c4c6/tools.open.cmsis.pack.cmsis.toolbox/2.12.0/etc" +export GCC_TOOLCHAIN_13_3_1="$HOME/.vcpkg/artifacts/2139c4c6/compilers.arm.arm.none.eabi.gcc/13.3.1/bin" + +cbuild alif.csolution.yml --context mv2_runner.debug+E8-HP +``` + +{{% notice Note %}} +The `GCC_TOOLCHAIN_13_3_1` variable must include the `/bin` suffix. Without it, the build system can't find the compiler executables. +{{% /notice %}} + +Check the output binary size: + +```bash +ls -lh out/mv2_runner/E8-HP/debug/mv2_runner.bin +``` + +The binary should be approximately 4 MB. + +## Flash to the board + +In VS Code, press **F1**, select **Tasks: Run Task**, then select **Program with Security Toolkit (select COM port)**. Choose the DevKit's port when prompted. + +The flashing process takes about 30 seconds. The Security Toolkit reads the `M55_HP_cfg.json` configuration and writes the binary to the correct MRAM address. + +## View output with SEGGER RTT Viewer + +1. Open **SEGGER J-Link RTT Viewer** on your development machine. +2. Set **Connection** to **USB**. +3. Filter by manufacturer: **AlifSemiconductor**. +4. For **Device**, start typing `AE822F` and select **AE822FA0E5597LS0_M55_HP** (Core: Cortex-M55). +5. Select **OK** to connect. 
+ +The expected output is: + +```output +mv2_runner booted +Ethos-U85 NPU initialized (polling mode) +Model bytes: 3835552 +Image bytes: 150528 +ExecuTorch runtime initialized +Program loaded successfully +Method 'forward': 1 inputs, 1 outputs, 1 planned buffers + Planned buffer[0]: 752640 bytes +Loading method 'forward'... +[I] data:0x800432b0 +Method loaded, 1 inputs, 1 outputs +Converted input to float32 +Input tensor set (1x3x224x224 float32) +Running inference... +[sem_take #1] count=1 +[sem_take #2] count=0 +Inference complete! +Output tensor: 1000 elements +Top-1 class: 283 +Detected: cat! (ImageNet class 283) +``` + +If you see `Detected: cat!` with a class in the 280-285 range, the inference ran successfully on the Ethos-U85 NPU. + +## Understanding the output + +Each line tells you something about what's happening: + +- **"Ethos-U85 NPU initialized (polling mode)"**: The NPU driver connected to the correct peripheral (NPU_HG). "Polling mode" means NPU completion is detected by polling the status register rather than using an interrupt. +- **"Model bytes: 3835552"**: The embedded `.pte` model is 3.7 MB. +- **"Method 'forward': 1 inputs, 1 outputs, 1 planned buffers"**: The model has a single input tensor and single output tensor. ExecuTorch pre-plans one intermediate buffer. +- **"sem_take #1 count=1"** and **"sem_take #2 count=0"**: The NPU semaphore was signaled. Two semaphore takes means the NPU processed the entire model as a single command stream. +- **"Top-1 class: 283"**: ImageNet class 283 is "Persian cat". The 1000-element output vector was scanned for the highest score. + +## Troubleshooting + +If you don't see the expected output, check these common issues: + +**RTT Viewer shows nothing**: The code starts running as soon as it's flashed. If you connect RTT Viewer too late, you might miss the output. Press the board's reset button after connecting RTT Viewer. + +**"ethosu_init failed"**: The NPU base address is wrong. 
Verify the code uses `NPU_HG_BASE` (0x49042000), not `NPU_HP_BASE`. + +**BusFault at a low address**: The GOT sections are missing from the linker script. Verify that `*(.got)` and `*(.got.plt)` are in the `.data.at_dtcm` section. + +**"Missing operator: cortex_m::quantize_per_tensor.out"**: `libcortex_m_ops_lib` is not in the `--whole-archive` block. Check `mv2_runner.cproject.yml`. + +**"Memory allocation failed: 1505280B requested"**: The temp allocator pool is too small. The Ethos-U85 scratch buffer needs approximately 1.44 MB. Verify `TEMP_ALLOC_POOL_SIZE` is at least `1536 * 1024`. + +**MRAM overflow linker error**: Verify `APP_MRAM_HP_SIZE` is set to `0x00580000` in `app_mem_regions.h`. + +**"Vela bin ptr not aligned to 16 bytes"**: The model array in the header needs `__attribute__((aligned(16)))`. + +You've now built, flashed, and verified MobileNetV2 image classification running on the Ethos-U85 NPU through ExecuTorch. The model went from PyTorch, through the Vela compiler, into a `.pte` flatbuffer embedded in firmware, and produced a correct classification result on real hardware. diff --git a/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/cat.jpg b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/cat.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e3a08eeae9837512da49adb91b1cb618934c8e78 GIT binary patch literal 23009 zcmY(q1AHVw*Ec$`HrB@3*c;onZQHhOPByl+v6GE$XM&B*jd}At?|0vOzyEK#y84{c zQ>Ut@^zSpDtDidnc`;8ba{xe24nP9{0AK+SV7LINuN>Id!3>7?zj=OhC1%UrAAii=CkY6bTBoyR- z00j;8UqC~{z`(-5K*Pbo!^6RS`@aDO0RaI82?YlY4TlI12akx1^aW(({~Gc7l04gltFW@f(7%Bi96$}Cu>~jz>_2n`o1lU(4|6jm-`2!0M0S5;E)ujK? 
z{C_F{02~7BOO5hXO-L|EaPY4K;y-GzF9qtC0tV#=8Y~k!F$SX$2{R@ZoFFOL_pidf zd-M1pSQgb|d)EKG`7pbS&rH)qU(a_t-}f_2lNVnRl| zlk7r{_TSRLp&Y-%0N4){Vn!xGB_|;xa5NId>!759K4#L-b-=eThrv-HP`|8t zHkq2dD*6_fARmu3O03{nUo(GWt<@g>XySumlt&l6eA1y9iwQxh@qA7v%rCFfvA zEDxE6ASQXkCQQkRuh0%ibc^1=h}V(@RaHi zHLHpYJV?)}cvE6DDJjL`vdTnj9r!eWlYCH_T;Qd=$bE^PIN;aonBF?PBV1J|Y9$n7 z|McVzM@01+%g3^ZwRo&8Q(&7>PkrFdj^yEkG&#XH*X}CT0l8GnC?ehSs`?`xq4eSeG60`GR#fWj{XHJx@PXA61pMr36B&uFN8#Aq#!) zBKZz4EiK{vSf+1*9Z0z1eIn?I+`>7sH+~|r)HCuGQY^OM&87wOxmXGM%2}gk`SEV% z;>m~OU_5&)R4>U#3IX&gUE-6yA+yaHjZ>oosBr|pd-)B#cw%b1A6)|1DC+$~ZnZ91 zsj2L%O&=47qTOhVvZiK%;F*ZANtBxC2RIi8M+y!kilcNNSP1S^rfPk<2K7A(g(T9^ z4&UoEm|dL}$S><)gpk=)q|m&{@C3wh9+=?qBCoFF*Ix|DI)By~P zjA05>K6vdP2DTrVL|HS0t2ocrH}EK47gm`%L}aNso_1){4XO*Ar5Weggd|of$aXkl zd+3$7ua&wzXGdepZ#~v3xtt3aTPZhpt|rm9wbsF|@j=D?{^gZz-$%6C;g;E?R#93# z>#fu|Vr$%;7aY4v2(ovr>A!0u>6X0v4kAr^1-f8<|C_b^!5)yZet_D|H<|fjx`IM3W&`{Iptbky`da zEpA8uGPQMX*I_G)Lxfojwyx<3BbORe=@;#ecci*YNP$61`k!$_(gGrJu<@_8Dnr)jTVj)tandQOw|o?CXNEvhS;pMV^YM9;wC zn~FK4+19b|Rg>Mm6J@|pXulaA|NR5)E6CV}Bq?nXFZ$)iX{|xQs z>^yY_#tK>{Wui<5=Q!6%pMfP6{<5R{lSAaQ?ylzmX8{D|0H3j%?zevknavZ+E*U+lw$jPV>gMzsG~bce zFNa|(mifI?b^qJUth%Zx@0eEupFLQYJ8SR7Iaeo;PafWBbMmP)4wvS9nJa`Z)r#K-{_Cx7a+lAs4Sw+2LxSOCAjxdGLspL_zM<2g63WtL5S&W4QEIUO|p zbgNUx{j>D^wh04hBx*Qv`d9tfyfy)?oO+-c;BE*SxS|z@P(<^BtL2DjWvc#y%lv@U z6QW5)7az2~e5CKj&kljg6?Gcd`;hsaoc@V^TB~7h(&r%N!Z>3$&P%@-p({!m5brZ{RmwDcc+sxF_jOz^7v4&~#ei8{fX!>q-= zC*guaoK6Ue#xIAO7oVbUiYIJjV4xi@5E&aea$Sz?r+Va3}|%4 zvc>sM3yEQE@KHSu^m|;c&Y8)}5;#e!QN6q}e5i5lWXoCG|43_Bd_yC57St2GnN^>;QO@GPRvkr;Y_R3VX8TDnHh$4e%gAjPV&#lF$?`?YxudW)zQ z@01Ia<|SO(C!jIs>igNm?0e>j7Hl0_v9;|{1IjX@HhU;xgU?i%e_giQ?K=r%#uWz^ zTDY~{;RWa#l}nqr@I;KO zRlA%IE)!H!^g?*RFKnx{33_{_am!U49%B=eNg=%;wG27!9m)^Z7KV>}5KY^!3=%9I z6^=ICc_KhP2y&G@B%N&7px=O_YMxy7lK%aOT1c)ZwBKU zH_>2JVuk!pbp(XTEPD{34k^1M=jq@}(@3^SPm^?nVmhegvU$oq4#c>FIWI)6h*yMc zSWHhnQiFE4qJl59jHY@NQ&TUez?3duFc4jXNWT|#4Hyi>cu?Z22pi(2RE44F=T?r) 
zBvGQNm=P`Pt)VJP7-Ng75gWwniRKc33&qe@Sc@6LGSto!23hXV8ue`a$5RYWUc<{_5UOb#b7t>D8 zK#(r5wvwYBVjJbs8Aq^r(q$QI@O=s{N^J5oEn^G>ufedVY8}HWU$2EiPqI^dMszlb zf-F2GQKT;Ctrtr%CNTJ5c^gtbYStOFJjNBB#Ufr@o|7;jwYMgrN6)N8O8T9;uwGd) zgwlf!?|6PQ8z29anPzir2(=FnxwB7%V-g(um@kj1fwgEwu-?Q%D$-UjF8h?2dD4Hu(uY{I&Lw&BZ6iJg<86;7%!T%EwW}>a6 z1Sl_StST~$Yq?bJ9po3=)in-f1r0wVt$KTY}4 zSmKgI}UlzmlsdS@jD+q%fOH)Cq8i-Ol zDoNhP2L5pD4~?V3{RDu?^Z(eb8WJA|N~qXvv?&%U3913IGa$CKAWW2H!Cl5gSBf22 zIgqTP4Hy}|D<+k=pGv}A-jcaH=9{__5+hU8qbp}qNeN9%i6I*cqH_iw)sd^I4s4z- zExHZ(>UWI_dqBPZ6h9gA?<1YUetb_X{?%I{8jQbjP6waEXiK6JTnJ5)D)EpRV6cJDc|3VFuzq z!wd$2G$K_q>~S!0!odx~5EkaH_ewsdP$WD_26}ENoi#(DSH*{PzBiyt_@$reCz^_m ze41i=je^gXHMrAADG_QfkhT62B?rtVWK{ze~u{r09*kDwlrt1c5`8Uen?`WGyyoCux+`C zDt+frenQ`cZHZG9`ITVP4i+IbseZuSkjx9jQ%y{0iHIIibX4u|bX-ZGDUpMRRF;=l z%=K~QSCEoE2{)hPpHfb0%T#6&TiXmI!crzH!Ll)`)Nm&%!E5#g6^Qe2;TB%*1f3K{ zx&{bF_y=K%N{FawQ?g<_ojg6rDLrCT-pGenROefYH2kaZtSqz?Qo~k)QG-NJiBq%8 zw$5HJV+@mT5@Knvsm$Ep1xEu%2$at*C9y6K)JsEwg@i&CTT)1l>`L435~f!$gOM)v z4bHTz5`KvtxH8mAfg4Fkc&c z96;kr)5OGGRON4^z^d`{YI$iqYKK>`lslR>^XHn8JoROTh4b*^F{&s9E@9Cj%i84y zpUlhP6*lXV^xrv3XT{4veOCG8e0SN)LWJ>1;sj}po7UtSzNI1BC+5?j%b!0*+AW1s zla>UDGen^Pr`|}RZed#tK4Z+YVK*x~YJ5ww>X)wcnaa1pV2pvGfxx>;D&k+;B*)Rj zVYk60Dp~SrZ?vguG3@XuH@+R zC18uWxwNNN%4`MDXtDBQp{|96WxMbW&RYp?5IJH@AG1ELV1%3Ct~Hjvwce(p`G2-j z6)dTQSExzpD6mQTBMVE^Fwvn<1Q*-y$&w{qW-*ReRB6<(=A?c)(iB>f!54t4BFx=2 z=r7=^{52xQ*;DOSLlF^93K0tt&kdD^aVrB=zorw8USS#{R4erhLSmH?OjH6w-YnZw ztWZpDR5dYC3F#k;>5PM?xyd7kJ!Mq8H*eWg+BxqDP{>n#Y7nzf|Lxabqz4pWFG0zZ zC$>I$FT=?l6eFW+ip7!6qJPkA=f4NawLW~eNMR#Xn7R4bp4Q)Uo?$if?6D24bprwH z2Aw&31itgP<`tq1pwmn!YCefzpk!dU2Y~S9pm&l6?0)ZtJ21P!rQGXwNz>xjWHfg- zGqlF9atvVIXKvl=*(34pf6(ur^G{56$Y+>M5=GX!`#!VSx_k z5Z6o2)(h`-2N8g;-_mYzjEpk0i5MiUyz%8U>*P^ifG$-cpC8@-p2b~dx5mxi|2}ka zKG9W4U)_X3JJW8?xuQ32?zB5Ho3h9|@r((#83voY+Sf$Sn-p_5&}14WTsD z0*P@{+a>)IpkHT|YAXVlHyB~_s5}j_nP1kj#i4aywb*tw>1j*~XzJ|A1j+W~+8M08 zO}!7|T3}g5aE-Tj0=MvD+6~y6GGLo^m9QVp56^i~MxgN4G)-esS)^TSoS}+wXQRvC 
zGEEY}vccuddI4%8PA~JjULsCIu?pCQ{V#^evOTI zmITg6@eFH5<9`wt1p+>Whq0^ku6&xaM7R7<2 zzd5IKb1w%yeKskJ?0D3s!`q;ix}R0+wgmgj!7u{=1F&9VHP44}hhJ|#wYPZcv*-MI zcIcjI>6+hls;{_9A5G4~mOeO$re2U-pvF2|3sCpn^*s-CF-t<@S{qlHyeS%mBLNg!M_f|G>icRLsHn@~yUaVgT z!X*e?hJVd7JiXGx51!Tw*W6TxIesxf+qX!Mg*S||kBFTt6>EEsNW!)_`s8b-S#&c7 z)qCRy_jW!%r#h$kM~nhK0oiwD{AWt_@qBrD;oon)FV42(f7&M5EPK9oM%pu6Y{v!S zRr@ONco6<&k72O-8#BeVrX^*?FCH0;H?q_CxAw-4=5pZ7)%`DClvd4Nz}X`KyMXAS zU&`Gj`^Ac9%fQ96?DnPATb@Z(-$3rjF$j|9C$#00&lEUUtx=!N_cvGL+v)o(sOcJ0 zE(n*#iLKpj4;OysJK1BSc9{DuL##mN2RM6*8Fo*d8NzOvfcT#Ju3_DR4JX-fU6=Y{ z9UBlbrE&c?)T7&0Jbsffy6Lk1?3nruGx;j3(eGJjb`or@eIuSebT7@L2IjZAUYS4; zUc^iN_gMK8Xlu_;z^k1{c6_S|UyFf7OUv)ELBoE%0p7M)sQtp*;h`1~YVPG0zTGna z1k4Z*(0N;Kud@qrgvQZ<;PPSP{-ldrSKi8|E%&$qLDIMSXZ42s+*r!_NPIID3-%ku z2j9eTqf>2%(afG(7f(AvoI7F!-{6~_)k`NOYNM8WT*Snd88avK1=}V9WuQI(qh<6? z*2h`=@;Tk^nB5ZypFsu8VHj8B%C2{pvzl>cLmS~6dq&*v;XTLp2oa|u+brDHLBo4n za~tH=4IP)^8k^J!hYh8s)7CB<)+Fs683v~n_8N-S5lhywONSOh8a(sr3y7o|XGV*g zw#h@1E7o(?Uz(0yoHfnW@ng|*o)UP~wT>G@VZB7Qc+JD?ew72>O)+XK&KezWT(X!o zEyjLVY>_e-1%pFS;W6~K3w&-_!f0WmpBg9*6HeV)*xh1@?WEL{IN3_N}~rv z?4%m{Hju@^`|{x{rdQrRms?jU3xrYVURNV)r@_&Ss=x!t!M zC|P^c`i!V;W#^=1AmoAkGSNoIt~>d%!SOmQ%yD5n!_;=sX47T8(NMq1Bbc^~ZQqWT z6%#(gUpLC7=R@Q-xsl?Lc;9H`L2>4NQ7fU`HR3$!DEoAtXV!bk;h|*{XF~cbfjqu- z_@G^T24&f#{fMtx-|E#L&u8Ead%3kmllPDV&VvCN+jqh*03tPu#lemO9S;j>WNocMgO;9HeJ@Acl$__r;e?ryL*DBB zD?tPT#&Vlq9$u{w5-6jkzgB3)i7TciBHU$l&2iO3yYCJytYfFERtId1^OuXUE_$6q z$>!|}ZVT;Nt>Pauk6f0Ai9{xxH;JFdUe@#})H`lTtLK{xBW5-f3`cqQbm0*3@A(Tw zAb~^eEeWhQd4komZ+fXDn;^RMRlB$q_4o;G{uL7zgPa5*oJ;ZFEm8HRPMl{lax3Qz z^{X|%nWh{(ji7Pc`ZgGwyVSBLYtV9lRPMTM+^WfN z{}!56QY!RHOKH>CvV8H%;^mU)<}$yv+1I{9IBap#U?9Tii}bRx2v3*4L}peR@NVY# z%-cQ9umGA`e_#1-$)BaJT|YNtjqhjIK77Q+#Tu*Tv>vw^>l+d54-wGXYUuFpdy}0d zzuj;HZS>HoQ41P*nTWw7NWeLB&tdV0h+*X1Hv*VQH!f@#*KZ{k_&s;t@-}9QP_t(l*AQvqfMbWW$c2E6 zo|U%D*4Z8W3D~ly?B1>QwoN6Bb}Hu9fYH|%Gt_JBbKInrS$@njvLiLCX*uKB8M4oi 
zk+tU^{sfQ)kkTJDHcm>J#nFv@3~f#L*kKue0?3n=axm|HDs8as^plhZbK&*lx845B#>c@Z-C!TqK7E2rwu6s*^Ria_i4H`2U=0R`Vmv{s~w^CNQ|v?k(1D-{txQ zB-^betlWSe`kX9a8hZXZym=Ajcj<)h&FIyE(2cjtve34l@ftl8H{`6KrT+@H$Y!r- z;2~~KHadXR4uBG-s}2_o7Y*ccmx7yh7v96w~cQIfgMlDNTkDrKC= z7fWHO{kYxt>R}*)4gLz#?nr38w~b$lB>ULim7dc!Etj~;;HUXWDYxOo3p#g?8_BfS za6e$MS>>cnxb`&*>FLacxD(Ajp@{X(g9aBE%{xXyO@6 zrLnlCx+;0#u0IB1+eS8K%b^*OT{9hB<6v`2&in*Wy=Ea)SF|;Q9(edHM8>zfA33-` zXowPw% zW`W6LgSMPJ20bgngQhhtllbd77sGYGwUNt6t&7HF2PYS6=vVc02!eIInp`MyOxQJb z^>-SvK*%Il+1U&a)zdrV@y0K*%PWXAop!5VeZ(E@7C$wqe{q6ceLy=~fTh{It})U0 zCNpEHw!RZu1FlZN2htNLf_I)JWxMcGytXLb4&&E1vpbfP zRk+zbIp>=vL>9!#slWw_?lBYFq7ieW+#`uykpg`!O`DF{q4GXMfgK~{k1llt>e$h< zM@93U=9bo90adR=z!8hKujL-F-EaR5vsUXMD0E%$RRT*P0X1TkTk~Leg4?WkcI@{~ z?>?>t_V)LmwiAQD`;eC>Bm?8u=Evx>GMzjSho-!eIy!!@@Ax+G^8}CDZLRf`k7Mb} zhEEPNrySnD9j0f00$lQ%1IH}lt!NWDr_YiYZ*Ih24EOAj+vg55$2Y=tX3#pCV%c{W z`_E$Knqo*p*;`kC`M3Z3UX^3QWo?_s7Z4_QVUsE^Fcc1i`@Cfkej2z297LVk*}1tFuQhHrARUz2Oq|n!}neD zE^K{4DYjhbV*Oig3GTG~BF7ZPZ0T(&hAQ_mx=PG=V#l>ueT2-6Vhz5-!#iPi4ip_{*EvqEU=?lBg^SAx z8MblbfFO%3|Ki2`#o!#E8o2s)ahP+S_MJp^nF(e_KyW_V%7&|tAv`ObNF%(ojxDXE z1i@{y7PX?F>!-a(Y~41;|7^*@z8t@h*^sz9!g1N^j&p2f&s(%VPYSozDNiiiNYpA+ zJiU0ahEyv+7=`D}fV01{bGS&Lx+K8@_OtRXi6$v+rIuk)I{av)jZJ~F zW|cMVDIj=^N#Gmi*q+`gKkj99k5;YD>BZmpt<%4lXdDzK1BO%p0_aLX&;>UusEo$T zDh57~2JDvmOtG8x=604rqR(wGx6F*^;(fgeUHgNGs8{e7?}O#~-z;F(6=p^^BOpcu+#_tPI9jFnz?TDpIL%JF3iWW(F zNKR4}zw4x`tJ=!J#GdAnU?8Ur>-xo`oVk^Bq)&(ltYUZg(7v<#*fR$G)t;sYazheg z;s!R>&BG+P`xDT5ld>dNi_cbw$tSYeZEp%Ql7J|~+Mtls-A0)S3MX6q@F4&7w${{w zmZ9^5c%Sjtaopz4Cm@!xl>508(f@IigY#ozNGI@-W2Irp!?kH6ge+7?)N1i3JFaFY z{OKNfobc?nxqWQ)iU(6Bw)ii4CDp(ZHH}f55&xo{O4Y^XrsXBtU6^z^dn=Ll6FES& zf?&YSlFje4+a=7F69oZabmPTrlHLaqy~c*#$w~P^t6YSV=UbECNa0iF`}&xI zVkxZ`zQ7pVyz+giHZ7 z%NqqG*$V)+>-gXR0@$+WMr{_AzK4I{GHtCw9o5I5GO!6y;#%3+mHbY6MAMSK{sio^ ze&AAK=OqhvR%=`r=f82)VJ&KW?fyA7r@Sl3)NiIpzZXaa;0~r0hTohS5O{=kH)eC% z$X|_!Sfmb(Kx*jkFE_jbS74L9Nq{sB5hPtw+A7Khhb|r(DedGA&gjjXf1b(&7GggE 
ze?sKkm2Fa%WyAK;;w17Yd60oXPsVhdrx1Wcv5Q4a?9*+VBiF(hgocvdPp(8m3_cVFFji_sIbi0D1l@y?` zz=)7)FpRJmE9dN$kiyl9qvC8$ z?2y#IzhbO~mI#1+b@sAr9&1*g8%y6Z<(RKlrD5%rVbz!oQhu1Py13a@xS994NfZ<2 z($0H)WqZ-W%QFtH{H`l5krXd1_;gW(-}2zXiCH0H_k-xB2>yLUcE*9NDHM5uoD9Oh z01xEM|Bff?)|HMv>fYQKo2)Pr{|VsE5h1jGXKI*~Xbf)_@kmVWLHMRt;yVqrV#<5C z??f?9XVIy8Ah01wm6*5ctx5G|rJNjppUG+E1#az2xRG#v$ViU11p1niozz?>N`RIv zyOd2yZ1Q_B+`&1g*o&m)wyZXb(?+s*lDEAUHA~up8nDSC?J|z}i0jGV zQ-0Up&9+Y>Z@ecgv0t6E;(L^}2w;OMa^kqS>}lA+I{~~@?6}~0h8?&#S-QZ`n`>|PL*c}5cJ1{e2HEqg zvc@NXJ=EFU0Gt1u{28DK3@Nt ztG}l@`UxPEg&7bPfx%?ODA7r}RU>dc@$kFarVse=tJk^&yEPV&|Jjssy#QLf8CdNO zkOtSbc={p${8g73&Nj@F7>XR+OG~2G%SKarfMUD>j)jFG^TE~p)0yRG?Td^ecr4Q} z05G-#gX_7pJraH~lurN)ymQ$fRlg2++8ZMNOg8&0j>6zaWRxC!6mR3U0vf9a7$OTNHLI?t}-*tfS+tP2JG1NZ3@ z(9l$H%`f2J_Bt%eXOaqseQABH0I6WcdE#Qk3YXOb1K3v z=Ybi-x3gS>NqSI}hRhuNTTB#T&B!_+jFDki2kR=%ax-`q;?Mnbf_$zS!km z*xR|7Ru_9_ZmM*9L%Sao<BrjVOR46pb<6n+*{^GB zNjY^wTf@eiLBkvRIh>jH+XvbUx}~kjkT6kRiurYk-@pXiyPJhC+Vb(vqD~R{F^NDA z)~??Iw5hT3zM?8pPwHRU=RN^U5SO2T<9|=5IsV+1wg_hSFG`3ch&M*s?`gB~52uoL z_!lOKCh>WPWd^MZbf=K+HyM;Jb7~%BIUM+A zdWHcNMDrOxub;ok4Vsm-nm`5-hFemK2#$_DB?UoR?=Ey-1GN;&fN*BNJxzd|-Q&&* z)`EX;NSYVx&3J5x0>Z;^hHIyzsF}5n)dv3L+;8qpXtFDOIIPJt9)7;`5IWiBYnY*j; zLu2FxT8Q4Nf-*rEmEyK+wwkWJP?oKPrR=+66gPwihYDM!r(2xDBK>hv@i?6dg;N}Kp#5ShccpvQDUWhC); zGUr8BY`W}sVYvX>S?E{R+8RN2V{gTauPBVh{u)wdF2dyEWLQ!=nrGx)k57QnZD~>G zDBTMBGz1uaIIEzTAinrvj?@BzPgSQfa+H?FQ(!=RR6}QL4^i6p?*>CTRGgR~1}g^c-%o&yod>HaWmAL{N8Ps44`<-+#mVd*Op*P#iY+geH>foShx#Va{OhDA zQKlx-avig7CRIK?H={jH=T99*u{}%CX>U|YlKc4ygdIW&BcPFQR{gNMaoxp2vtUX> zx7=!-ZBE2&Aq2MBWEr0Tk~}*pwGT@9)@jn>smAK)- zYaz*<_B2%;o!k(Reym1*`BICxXS_*UZMU3<9Hl-r$edyypJ4oRWJXaH|` z*w?SXY6T<7f>z#9IP5EY72Ot=P_4ABi%GcJ+BiyMgY{%Qjr3wcd;i?7me|nq^)Cuu zUnE8iFN~b7X{Y<};Ujh@W|-_PGMaSR4#a_+y2BXZ=@tedsaTih%6r`HQgXDnvEbdm z(sr5!>o_DnUsleaxVS5OW#f}rvT;i$gMy!cnLS!$a=f!19uPIT`GmQd?Dq_rj9l2X zuh6hF6`_NY&d#H&O-`?6Y^s~PB5@G>DTo&IZ|gCwX0TWKw}j1=ZbiV|WbR5e)h2R& znzi#?=83oh;WBd_V&K4y{3$tLDtk*`nz@$K|+ 
zB@GmFuPOEB{Kjk>E0>FLY(*4@H4b7FIY?*kFK(P`$v-}F1|O-UZVRD_+>2W~gAL5m zL78T)nkzBL{a#Fmy!Y%DFM8{(E3NX`|FGPX$&zU9f8xt_0ecVZ_;Q7pb$EV_X@r}a z5{C%$393iDr~X?sjE?#t+{;w|F-01PTXj$hWu-tMlQz~mZ+t#GLEEx~(5tfCftHE_ zn(Rm5QMS4edbZ{TRV;(A5TI%q1qDk2Vo4{xy7K;cCfk&@v^BNn-a^xbX^Dj5IjZ_a z@v{lgv)dpr4pr6DnvuAue*&0?#b~G(7cN&^%~@D<8N{bAvce{H$#SJuM~gF|i1TZ( z4cW3ISVG^=FpY0&sv)BLDMFJ$9lyPEv%Lv)v`jAaXYelb%=^~`=B(hhvkRmw)WVP35ihtI%gp9xHjfzS@?v4Ne7x^7 zf?KrNiy6m6@~2^t-4a_%{akqH;z2QLsPM@M<73lH4*-+4*5M)C^u5&nu5CUv{4H5Y z$Z+8|ud#|;2*DfJ(w-?Jr~cAQn$=KAvEq~~#3f_{Cm9!vH)8W8YYd%GR0oQv1ZCNg zH*U~1E;L>PFa2&P3lGzA{a|U@^>zTTxUJi*DVUk>gwoMcA%rmS8LSMrj>>D(IW4r6 zOXoo<#ef%>loxta#W!ZO(}%I5tgjH0aS7ySPpqkGT2YZwu!l8ft8QQ)@J6cny{y3U zE-|jT=7W+KfM>&_tFy{Q7=xhPdnQlA_Zm@ta2y4|Z*t<4J?E-E^RKzDPJQ`iDsK&W z9Q6ZPJ}G_Q*NvZ}-PZ0k`3_#ji~Lv1?j|*AkO;}jShL3lLD;WH+x$nf-V5E>o90E- z{6qcC6)#foPSjDW#Q91~t7<`t@_!08#J4mX5!X(|1e)-;mJ3b?6%eDSQU;~{+W8O4 z3hW4Cuw~a30(@HAkNUbhbYzL+CehqePWb+1Ok*tK`M7QcglfwIIH1S$XHI1X6bWK= zH-K`qCY_;g>f1f-M3mM5E_OTBKfBq=AvlEViUsla9;Ol_rj3IFiH!24U9N6K`Frr9 z@yv)HAcvfRJ%*Bix``hi$cQpI*$G&w&U!ofJbHC!AuSb>0`()zJ`3a2oh(Q&|!$Tw+%IRAIkt2$B^8E(2> znQy9SL*IGTMFck$NVx6FO$SYkP{|B8^)_fx<}au9#p_6VMBJXN0^?;o`%}ILNz2g? 
zR3gCF_*NOj7dOY&;+>j48sHhxOgK4Fs@X&Z{CfIHs({9Aefe`ueeNxOszs*n(Jym-^%`< z3&Kf}GUKW;AddQZi&3)TpRwdwX$onQj*b`pFz}&L9n)kXHB>F~$*y=(`Vj{|Mwh58 zeoMIlWw?_Tla+Mv{A(>f0ZaJ4myC~v1s5BSB{T$-0b0WtE!F^*fHa=~E}Ql4l2~O2 zrd3Sjm%vXz$$hOv+xFyW6HEc6A3t6rp)ZeU*(w^xCjd>#z}bSAR{h3qCbzpQZU`u` zg?K(@4=;h9)J$VNa_Diptbo#ef$+%s=Ua$mEzo_u%*Mt9_k8Ki-s2k2`q1lQue+XX z>@}?sMw;N%>H~K?|E`DmW&RHNRvH%vc?(-fvKZ4n(+xuz74lABaa`!lzdMrT< zG%)zahG=&%wzcrn zy)EVjewK?BKw!$#zuQmc_~_vb(p=*!@ElbC!!~%nHy_HrWjgzGdPb4gKH~5d4Wdqs zIU!-=xLg*STrHm5P$#T8-wh2sf4CS{A;+OuwrLo%!c$L!#T(h{Da!f*iLHXFS??%; zfm)6iB3gvf^pAVg*cr3ocVmwJ2P@{qw189B5s;zfr>eH?Ku%xXOv<*&-2^iJbrZGWg;AjdBGBk07Qjxb zvhKd`{>+6iWQHaDxZnV0%@VnZ@sA`cPRPf_W7u4n8BJ^Sf;c|a@}iG0g=sDO5_%j) z4F<&sIY@ z@KSD|^>(eq-R+;2aUar-6n$rXT!10ln!Xpytg7LIl&lD~Vg2T4Lvw)1(?U%K5Ql&5)0u!5kOo#K{Z z*=K~3UwE3o3SSa}D_~OWq*D|Kgg899o+SMXAodtIht0FoP<`aalEn%a7bXm3>>0A) zezm4Q%5>5Uf1KjdFwscYE|G!DQCA^E872=eFOrD3B=b|N6{asBfmtf+y4l&k(c;`*t#%5@uRJc!JaFGi0BhwCi#SD*wEIJ zMk%GY+d!G#Lwf%m{2V`ZA3&)^YUVr#+H&d~Q9uzIUl-J-iDEB@iMj!)Og_hOh<)2J zcfs=;2-vCAUK2@ai_JwB#*O&!uy(!0o!-?<1Y%bk6TLcnr(Z5vv!4ll5o1<_%r{K$ zw78ePnIbo5RXRUdFMoBWw>7o2B#OqSP-(=Ah&)L`xl_7S(Z=xaWFcF{r*Hlq+#3tu zMD!CIvY}0?_kt6kH@i&h;U+#gLl5+ zzFO@{Oqc8ft6Og}US1qd<_q2U@YKk!?jsdf{cnbbK14>)G*@}60`q&Cs)+jD>3WEU z=hNli(VDlsMjeCo?OFz`win}PJYYPk#EWN`7dRY`DtfiuH1y%i9=VNeg@yNE|Gyiv zC}Ey^k>S#^+8QUm%>=I-tlT{*sG`>j7bhaYF=a7q>k#QXywiVNR`^epE$x{)RB(?) 
zqR=&{;Grn6TOHvK*jiq*>25{vo}8zwZIAIf85ucI;`_rmu-4AknB~t_c{sLbKYQjU zr|4ox2?>iM`WeUcW`uf*^0?N~E$l=u_&M;L8#cxW`h+fW+R{*8G|^X#<~t$~*TV@H z#teLt*3HeW$NdwrGYsYj>4H4#!SivStmj@s2)qcAhwR)w0ZvB@l)1E86R={*lE}@t z_4kwtjs)?wwZx45l*Sf9)+C5(pnDoA0AY`J=CT)a_)rws%_NR~FHl2o_j#eTw*Dda zVk+i`O{{MN?|67xxd%mfjxX$4?e4|s?3&r{ku#>SF)g^rB}pFT4lbQl&jAc-NM(jT zQIVyEv5YD?xpNEB%yrhg^+^)LN)N?`50;9uG52vfvGSf~VFHck1aK)LT!uw?nAl*t6OlAs zv;-8b@?l}1+rpZOC4Vw&0(u((~7I|yXg_&tEY|RMuiuFJ1JMa!$ zuVmXTOuO|da;mM}VQXU*?Qkl0Le{0hbS>EP+n?y5T?dC>pMdLSeHl_Ho^SvP6!#fX zTvN1efbE)Yk+@bSUiRX(#IzT+ywo?dh@q&(XW4j~892j-Xml_WJh)P+M@C zm6$nrY(x=!e1~tB^DE=(0G^!dMU9B1|^-L%rF@=Rkynk|s9uWC8Y<5ma6kG-Mp;%1CHe@8quz?UyQPSYe zC)%e2t(UJpw3$HNoEE{qe%sdLWBKi9;+nJ0#MHZRtQ$`@kNBk&&&4g#HhG1^QseV1 zWYwxk+MQ0|ohl>hkCi)cR#$Bls~PPOsL1YtU2Sl&Z@F(UCkHT zSlnwc?-y&-S_8*M6Tc_QD57kq`E$2M=Uz0Ft0pVx3FlWOdOMTdpOxkw@MmyVyu{#q zbNNxGrj6;ys~RU}=aQ(})B|%|mY(V;EqTo4EJlWu_G|?t#Ck|!vP}zh-Ie&Lru3(r z&U_zesoR_QGuWiah4rX&t2NBvaW41y#2aGX zCC<`>jzmm-0snroBvz>FvHX$IiqhsjLs3)wnqc@mGo)nz(?;N=Le#9J7b{dSm!eLJ zKoChW`~RmaUg+P5N(3*JYTpas%sSQD`dbHMDZlK6TCDTjw0dXxJuDj*TwaX-r`PW)B1 zHHX`VcNo^ZGX_8X#LCG%83lNhXeFsvHJGg9JYt{aZnIar*U~mBX5WsatS&y2Z2D2Q^DV%y|2@^c5z9!$_rlKZ#j|K|U3yO=5D^uEq#%V>ebx z<~XHcF;>L^tsVAG78ANL2Mq^RqLIC?b zH5c_0H)l3#PTjPTFnCBp^P2MHdn)os^$3uoGn#Hq0MYDg!g6oguQ__ zV5(I@GL86;o~B!i&tm##oaTGGB18*3wt$IWe%SS2rNZe$5@9D{PFfFeMA z$oq9=2YKJ!;uVQzl~yK=}1_V3ele7ND9ikQoRnAKWa%X6^Pa3kcuBl z?Lkr3S2-zQlIrf>(}lcI%tOYAKk4+SK1Wk!ZNtYLrkA#o)zUXR3XZA{m1IVY^bxk> zZnQhsRUS11RM?*_QnWwASgXf}-Zs}z$s7mpyN3ls?mac?HT#9@BG2uqNjVCu734?2 zF0t2{Cl?somRdyzq=asc!=VlHtvAXj)8ickSk2c?G^8gHfN z6B>&1<`1(2XvwVr zYHv-eUb+_1NenAAPSfOLi5LKWC1!Va zYSOG|4}jNq;$-E?FPh*xDj<00{Xjw~Ax8Zx^V0yNbE$7Cfwg`EUE7$+U@WdMxSLpG zx9(OU?qIr@?G?HTgUxD5BBNImcGgf>BdAD}saZ;o#7k|ghEC4qb00{c+fB)X$sIWv zH~>iKR)mV3J`+2Mc;~x!P&k$sGN2>aW5}LCdLXCB>kV%vGJJLLS6Swi7^e2o$V4KX zhzbP{T4=SDmhhi!@Oh(R%>7Q_ihOIUQusN)bhnYEOsCX)4y?wK1#niew$)bf>8T!L zYdpx+liEO*(Nl09MyYe~6+aDomv8!vhDg?)+M@I_u-xy_Ypb%76vx;r#}lF6S${Kc 
zKmz%mx}F5JDR(5k*%C45aEN&o9}XjL1E{Yhh{uJ!q)%|bx6#P;q;raTpp$S9R62W` zR1(@uyrX4qMl;k9M$9~j>OMNJQ{k%9T{`?Vj2CnLHKDqNZf*xy(8#>mYvKm=(lEJ- zJZu)R2=8QiM#v)tYThcsg~$fFn({mU0Nx_EM!301gP25WhN=MAE9?*;>#6bm!(n%f zvN-D-&fBxQ`_;p=P$4Pus8JiT$%Bz1jB4l6;wgA1z34ArZ+P87^dyxR6uTARmzw`cNIs3DuRR z9?##KBf<&pRbyDLg>`q^_=sc3cK-lsLVK6Fmfm`O9nZW!Zi$gdWoZ}|ru!-I z(UH-mc5YKRt#{vQY1zFO7YeaOW&)B-jk7gJ->Z1ozqsBY;D%oiwKP;ks8R6LmVv~m z(V4E{I|8nx07&?X>$tD4Wfuu4f^1|=H;koG(NB>8H8l7;&KFqRzB!ZGBB6s0bf-#! zYCbw^HGkc^dPbxy{-#wsiXFv3>J1L40=PXYDn^LfwAp|Z`>DRU6jDj+(2u)LIq_Bo zrj<1Ep!3&kwdKTjSuWauV!MXY;fFaM)qg>WYJ?J}sCm+w=WRSzY3eXF{e6awx8l4- zX;DQa9rgVp?!M-MU48-{4+$FyrCZpG-zz>2L$s1u~&;-RQNKS~W1jIRljX~Yb+;HLE$)m>SEg1xk_wWT5 zMwK6mE+tEO?Wl3Rv1G8Ko-KG)J=p*rsxpf2O?IbuSz6j#+jdgj+`eY=_6s1-3uWnb9dFE_aAxX%e>b&x0k$~9$8$H zL~#{WRXj%I3X`T5*8o*&^*m~N0x7+I&0J?PcM0v;QEj4@N8s+@f)K5JVw)d5N0Oc> zqn_D}7|{N>?SMY1WCv%FM*c%l;xQ1IAmFp*a#I3ic5zkYPVDbqodD9o?vuA3N2|O3 zAU?rQzgK9EIEpl{UCA2&$NM#Hi^LP`&jX}yy*<@x*KZO@9eN#OFK&kzu^z7y%sp+^q>Z`*!k@#Ko%U^er?;*pks)Y= zi(A;ZSB%i1G&J=ur0UxnobQ_ma#Lbu&tq+U;L<)eB}Z9!En1X%)2ST?U1Kr*yWe=s zr(@YnF0uWwkD7m~dy-8gyG8*CQhhYo500&3?W_2&vl8bn=I^YGOTtMn?ZQN=0rh)v z#zG}}?tFBeqd#?CT4iJm$&1E7vx+jBG{hAuQAI+t4gB?*!FGprUgEL0IQF)}=WUD4 zcLSKmyDiKt!19r*Nm1!iX}B84cJFb|jn3ntxntZJDeq$XZ>NR7QKXt+#c9x~TArH2 zV{rx>9Bw++1hj@FRY7jC1&L!+00rYp04bvcR_5X3PjYGKIdC0L{%4>X?e5T?;cc=k zrzw_|xQMNz+(3>2<<3HC7pVff4Ph}jhPljQdwC(ddw9qaS#B0{tj5k|y3jC~+b zZ?>X0vZR?Ss?Bhb*N?i}khR~6^wVD4&HCgrT3avRggN0-d|V2X|_4`N@@eJq4cr&9T+d6 zmdNO5u8G60h+V}C- z-nVp?SkR7O`nF-j?ybEJvN%|;EHR69Zt+{%%Xx1d+zcZk39aIG;3#S6I}x#`U43V9 zVX;=ZyJj(7Wiinkb93$bL(3Q}yqI-7jdX)?UC8k|QVkn_ReV3YTXkyG`42rw8%ID_ zg#g&Ej^7j4QQqB4CC%h9#|$?!J2a~tE5gw+k1Ub&u z%p_2E2Wea(G0w3(lB|(6e@;$sQfasV9-6Sve@y=XrY8g5mr_ap0C+Y1(UTi@6+J7! 
zRrwyg)Pg)bwH%n70mU`kY`JWG%%4#QBe&^Dp&FGn27r$pWpcSDDSMfe&35sQ49wq| z!cY_9e9rp9cAs_RQ?|dTeZP;lv%kh&0S%q(cFG`In;DMH$S5rt^&#*hePnxw{`AFJ z<(kpc0L9XEp6Q}zff*I5+=#EsPu&?SXUxsq7Zslr5pE>7mP#osB^09r=^F)6PYpwJ zbBwizIUEr_a%dw{!eU_V0HMN zEMOIrsEklo&a^!TmYN)dQOj-x1*GU7Z-r_`noNJciMV2Ys3Bu4ZQ@S{=m)^*WQI$8 zoN^EmA$}8Y;qAZo9b_;V&fv`U?kgcG&wq=KX<><`RpPP=5EZCtPGjwx{4-jec6e-On<| zE>kmd*76fiTifjc&k|v9gb!L>vpsyPAYSaP-m)yk#xN*GJ%wTeSxS=EXYFkVuYU29t z-RXLSMY=?xpq~vbT+nU^^Z4jUTXI48eZxgV z5ncIwPs77R5*1icSnornr1T(?fNes4L&s6u%Zj$lUCcyw%Wz7N$ME{ltI&LP+uf0C zBq@W%OtR!*vqK%cw_EdI2`oa?`k0>+tisxbj};ZfR)0~q7mFT(FstHIodEMXvc}s{ z@*U567AndPfI%1f%+&m#D>3k(rme6Ta4$1iYl}+@NC#Gy@xnx>%ZzGUMm$x^T;y{% z@pkSHfvl1gWr}8x!1{^bpgZdih{s4C4$!`|hG`>iO7{%S?$C{h4?$lVYaekmhDVm# zIV7(2W>Hc!0)hxUx=_>o{51(Gc=T0KO8oqdb?(ZX44#y!q4Or5I(Z|Aq(O;^smTje zea&4%DF*UMB(1{lKu3#U_ z5oLl)cUcvY#Z>HkLD4e7JV&H}#`{y_O#o^Nni?JT>I`_FFX5oRuT?05r{k!S*;?ZM z?1!+L7T(GvUEbz$N1%QY)`Gh1&e6tx zm7BbCJWj%*D|rc5lA9KmWM*$VYb$dUwZ(P?xWrlkQy$J2vLXQ_))(&-*9d7yXHU~ozwxh#)v;n);|P9=bJu0w1}CoQ!k$_P12Y3x zkA6IljXqj=8phf{)nm;Nl74g~ABA=6?aM(^#abgG@79$J0~%_2W93C0g}8=yQ0+mv z4N?F;14i=1T{dV&eu&7*(Azouz?J-9ZZ@FSKJM!=c|FVBJhsrdjN8XF2a!H(b(6Qm=8ea7_nh}!^TIlao^x4V zB_tN2ng;Mbdd69{Vj1nR?$^y5#UX2)grqr=Q&V0%Q9$WHeD!gAcXo0)T!XV*UAlz| zT=ISjYIi+J-@`z%k>IisP(FTNJv9vp1g$#n;x!#xlMavMPxjYYpV%i$hQ;4XCc<2? 
zSf)uSDhS_m31wm`D!yY;+@wu!8cy(EWA2oPwYZMFqL72tPfhn5jkK`K$RoEIp&Pga z&^MVM5uoeNJ9}Zox!Hh2y$1a@)@q3x;0)4Se)AboA&uW?-Ax%-7NG;?H6BC$>W<7E z>>`Fcgr?iGNUP(ia`_vC$X!msC7mP;w2Y97S#}%xjYUp@OKPRK3Cc0)N2hbW03+mj>UN&n z+f^(XRklM-_vu6T=tmzR?UTPRQX2Iq#M4>JM_}__;2LNNUhmuIvVxSd@v11egwNy-yw{ollnfS*{6_ z*>x$vl~Cjm032#)I_u~gHmaPS71$;D&ufZ6EE`2Dc^LmM{!%|+nj$~CS zU@5=?!-;WHe=kbvYg>r<#d=aChMZ9%Mkm`9{{S%rpB-C60|v3hz>q{;PayOh$?)|b z9T#NxACBRTD5GtTt-nUAFq7&H4G;h*2azZEXvR+}trj|O*Iv^OLuRE0-d}dPDLjJF zf%5oia``y%H@NKOuh86G!!(u#WSNV5Z00Of6N+(kBV7 zV1y)&2E_9Wgq}VcC`@puB}#jYD0&(bzY*}%91=T*NZZ?CQoMHAi5u;<{WWC9CrL6KJSzZvle_M?h5n7=D@6=OVAF}by7(UPgC2X+rHT5uL45MwuHGh}P-C2jo zgqY!Rp`O_!0-mQJSFLIVJO-nJG7-#VOyzpFN{^3COK$MVDHK&`$@P<3mHuJ#&{&is zD^NxQD6J{PAgu`T(KAXzpyTy52ul%PNb&gBu8X>u)rB47V8GX6Sw`P~O$l8+Qc{y9 z>P;#G0ks7+B1sAg fQZr4|{0`LDri))mL$Rp-I$g}%DnR&n&};wMJaamc literal 0 HcmV?d00001 diff --git a/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/create-project.md b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/create-project.md new file mode 100644 index 0000000000..da8b4708ae --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/create-project.md @@ -0,0 +1,210 @@ +--- +title: Create the mv2_runner firmware project +weight: 4 + +layout: "learningpathall" +--- + +## Overview + +You now create a new CMSIS project called `mv2_runner` by duplicating the existing Blinky example and configuring it to include ExecuTorch libraries, the compiled model, and SEGGER RTT for debug output. 
+ +## Duplicate the Blinky project + +Start by copying the working Blinky project as a template: + +```bash +cd ~/repo/alif/alif_vscode-template +cp -R blinky/ mv2_runner +``` + +Rename the project file inside the new directory: + +```bash +mv mv2_runner/blinky.cproject.yml mv2_runner/mv2_runner.cproject.yml +``` + +Replace all internal references from `blinky` to `mv2_runner`: + +```bash +perl -pi -e 's/\bblinky\b/mv2_runner/g' $(grep -RIl "blinky" mv2_runner) +``` + +## Rename main.c to main.cpp + +ExecuTorch is a C++ library, so the source file needs a `.cpp` extension: + +```bash +mv mv2_runner/main.c mv2_runner/main.cpp +``` + +## Copy model assets + +Create an assets directory and copy the model header into the project: + +```bash +mkdir -p mv2_runner/assets +cp ~/repo/alif/models/mv2_ethosu85_256_pte.h mv2_runner/assets/ +``` + +## Create the SEGGER RTT configuration + +Create a file called `mv2_runner/SEGGER_RTT_Conf.h` with the following content: + +```c +#ifndef SEGGER_RTT_CONF_H +#define SEGGER_RTT_CONF_H + +#define SEGGER_RTT_MAX_NUM_UP_BUFFERS (1) +#define SEGGER_RTT_MAX_NUM_DOWN_BUFFERS (1) + +#define SEGGER_RTT_BUFFER_SIZE_UP (1024) +#define SEGGER_RTT_BUFFER_SIZE_DOWN (16) + +#define SEGGER_RTT_MODE_DEFAULT SEGGER_RTT_MODE_NO_BLOCK_SKIP + +#define SEGGER_RTT_PRINTF_BUFFER_SIZE (256) + +#endif +``` + +RTT (Real-Time Transfer) works through the J-Link debug probe. It reads and writes a memory buffer through the debug interface, which is much faster than UART and doesn't need extra wiring. + +## Install additional CMSIS packs + +The project depends on two CMSIS packs that aren't installed by default. Install them from the terminal: + +```bash +cd ~/repo/alif/alif_vscode-template +cpackget add ARM::CMSIS-Compiler@2.1.0 +cpackget add Keil::MDK-Middleware@8.2.0 +``` + +## Update the solution file + +Open `alif.csolution.yml` and make the following changes. 
+ +Update the `created-for` field to match your CMSIS Toolbox version: + +```yaml + created-for: CMSIS-Toolbox@2.12.0 +``` + +Add the required packs under the `packs:` section: + +```yaml + packs: + - pack: AlifSemiconductor::Ensemble@2.0.4 + - pack: ARM::CMSIS@6.0.0 + - pack: ARM::CMSIS-Compiler@2.1.0 + - pack: Keil::MDK-Middleware@8.2.0 + - pack: ARM::ethos-u-core-driver +``` + +Add a `target-set` block to the `E8-HP` target type so the Security Toolkit knows which binary to flash. Find the `type: E8-HP` section and add: + +```yaml + - type: E8-HP + device: Alif Semiconductor::AE822FA0E5597LS0:M55_HP + board: Alif Semiconductor::DevKit-E8 + define: + - "CORE_M55_HP" + target-set: + - set: + images: + - project-context: mv2_runner.debug +``` + +For all other target types (E7-HE, E7-HP, E1C-HE, E8-HE), add a `target-set` pointing to `blinky.debug` instead. + +Add `mv2_runner` to the projects list: + +```yaml + projects: + - project: blinky/blinky.cproject.yml + - project: hello/hello.cproject.yml + - project: hello_rtt/hello_rtt.cproject.yml + - project: mv2_runner/mv2_runner.cproject.yml +``` + +## Configure the project file + +Replace the contents of `mv2_runner/mv2_runner.cproject.yml` with the following configuration: + +```yaml +# yaml-language-server: $schema=https://raw.githubusercontent.com/Open-CMSIS-Pack/devtools/tools/projmgr/2.6.0/tools/projmgr/schemas/cproject.schema.json +project: + groups: + - group: App + files: + - file: main.cpp + - file: SEGGER_RTT_Conf.h + - file: assets/input_image.h + - group: SEGGER_RTT + files: + - file: ../libs/SEGGER_RTT_V796h/RTT/SEGGER_RTT.c + - file: ../libs/SEGGER_RTT_V796h/RTT/SEGGER_RTT_printf.c + - file: ../libs/SEGGER_RTT_V796h/Syscalls/SEGGER_RTT_Syscalls_GCC.c + + output: + base-name: $Project$ + type: + - elf + - bin + + layers: + - layer: ../device/ensemble/alif-ensemble.clayer.yml + + packs: + - pack: AlifSemiconductor::Ensemble + - pack: ARM::ethos-u-core-driver + + components: + - component: ARM::Machine 
Learning:NPU Support:Ethos-U Driver&Generic U85 + + define: + - C10_USING_CUSTOM_GENERATED_MACROS + - ETHOSU85 + + add-path: + - . + - ../libs/SEGGER_RTT_V796h/RTT + - ../third_party/executorch/et_bundle/include + - ../third_party/executorch/et_bundle/include/executorch/runtime/core/portable_type/c10 + + misc: + - for-compiler: GCC + Link: + - -L/absolute/path/to/alif/third_party/executorch/lib + - -Wl,--whole-archive + - -lexecutorch + - -lexecutorch_core + - -lexecutorch_delegate_ethos_u + - -lcortex_m_ops_lib + - -Wl,--no-whole-archive + - -Wl,--start-group + - -lextension_runner_util + - -lcortex_m_kernels + - -lportable_ops_lib + - -lportable_kernels + - -lquantized_ops_lib + - -lquantized_kernels + - -lkernels_util_all_deps + - -lcmsis-nn + - -lflatccrt + - -Wl,--end-group +``` + +{{% notice Warning %}} +You must update the `-L` path to match the absolute path to your `third_party/executorch/lib` directory. Each developer's path is different. +{{% /notice %}} + +There are several important details in this configuration: + +- **`--whole-archive`** is required for `libexecutorch`, `libexecutorch_core`, `libexecutorch_delegate_ethos_u`, and `libcortex_m_ops_lib`. These libraries contain static registration constructors (for operator registration and PAL symbols) that the linker would otherwise discard as unused. +- **Don't add** `portable_ops_lib` or `quantized_ops_lib` to `--whole-archive`. They are large and will overflow the microcontroller's ITCM/MRAM. +- **`--start-group`/`--end-group`** resolves circular dependencies among the remaining libraries. +- **`C10_USING_CUSTOM_GENERATED_MACROS`** tells ExecuTorch to skip looking for a `cmake_macros.h` header that doesn't exist in the bare-metal build. +- The **c10 include path** provides the tensor type definitions that ExecuTorch's headers depend on. + +You now have the project structure ready. The next sections cover the application code, memory configuration, and image preparation before you build and flash. 
diff --git a/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/image-preparation.md b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/image-preparation.md new file mode 100644 index 0000000000..b48f988de3 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/image-preparation.md @@ -0,0 +1,126 @@ +--- +title: Prepare a test image +weight: 7 + +layout: "learningpathall" +--- + +## Image requirements + +MobileNetV2 expects a specific input format: + +- **Resolution**: 224 x 224 pixels +- **Color**: RGB (3 channels) +- **Normalization**: ImageNet mean/std +- **Layout**: NCHW (channels first) +- **Data type**: int8 (stored in the header, converted to float32 at runtime) + +You use a Python script to convert any JPEG or PNG image into a C header that the firmware includes at compile time. + +## Set up a Python environment + +Create a lightweight virtual environment on your development machine: + +```bash +cd ~/repo/alif +python3 -m venv venv_image_prep +source venv_image_prep/bin/activate +pip install --upgrade pip +pip install numpy pillow +``` + +## Create the preprocessing script + +Create a directory for the image and script: + +```bash +mkdir -p ~/repo/alif/image +``` + +Place a test image in this directory. You can use any JPEG or PNG image. 
For this Learning Path, a photo of a cat is used as the example:

![Test image of a cat used for classification#center](cat.jpg "Test image: a cat photo for ImageNet classification")

Create a file called `image/prepare_image.py` with the following content:

```python
from PIL import Image
import numpy as np

IMG_SIZE = 224

img = Image.open("cat.jpg").convert("RGB")
img = img.resize((IMG_SIZE, IMG_SIZE))

x = np.asarray(img).astype(np.float32)

# ImageNet normalization
mean = np.array([0.485, 0.456, 0.406]) * 255
std = np.array([0.229, 0.224, 0.225]) * 255
x = (x - mean) / std

# NHWC -> NCHW
x = np.transpose(x, (2, 0, 1))

# Quantize to int8
x = np.clip(x, -128, 127).astype(np.int8)

# Emit C array
with open("input_image.h", "w") as f:
    f.write("#include <stdint.h>\n\n")
    f.write("const int8_t input_image[3][224][224] = {\n")
    for c in range(3):
        f.write("{\n")
        for row in x[c]:
            f.write("{" + ",".join(map(str, row)) + "},\n")
        f.write("},\n")
    f.write("};\n\n")
    f.write("const unsigned int input_image_len = 3 * 224 * 224;\n")
```

The script performs these transformations:
1. Resizes the image to 224x224 pixels.
2. Applies ImageNet normalization (subtracts the dataset mean, divides by standard deviation).
3. Transposes from HWC (height, width, channels) to NCHW (batch, channels, height, width) layout.
4. Quantizes to int8 range (-128 to 127).
5. Writes a C header with the pixel data as a constant array.

## Run the script

```bash
cd ~/repo/alif/image
python prepare_image.py
```

This generates `input_image.h` in the same directory. The file is approximately 349 KB.

## Copy the header to the project

```bash
cp ~/repo/alif/image/input_image.h \
  ~/repo/alif/alif_vscode-template/mv2_runner/assets/
```

{{% notice Note %}}
The image data is stored as int8 in the header, but the model expects float32 input. The application code handles this conversion at runtime. 
The model's first operator (`cortex_m::quantize_per_tensor`) then re-quantizes the float values back to int8 for the NPU. Storing as int8 in the header saves approximately 450 KB of flash compared to storing as float32. +{{% /notice %}} + +## Try a different image + +To classify a different image, change the filename in `prepare_image.py`: + +```python +img = Image.open("dog.jpg").convert("RGB") +``` + +Re-run the script and copy the updated header to the project. Rebuild and flash to see the new classification result. + +The model classifies among 1000 ImageNet categories. Some common class ranges: +- **Cats**: 281-285 (281=tabby, 282=tiger cat, 283=Persian cat, 284=Siamese, 285=Egyptian cat) +- **Dogs**: 151-268 (various breeds) +- **Birds**: 7-23 +- **Vehicles**: 407=ambulance, 436=beach wagon, 511=convertible, 609=jeep, 817=sports car + +For a complete list, search for "ImageNet 1000 class labels" online. + +The test image is ready. The next section covers building, flashing, and verifying the inference output. diff --git a/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/memory-configuration.md b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/memory-configuration.md new file mode 100644 index 0000000000..020e947130 --- /dev/null +++ b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/memory-configuration.md @@ -0,0 +1,185 @@ +--- +title: Configure memory layout and flash settings +weight: 6 + +layout: "learningpathall" +--- + +## Why memory configuration matters + +The stock Alif VS Code template divides memory equally between the two Cortex-M55 cores and allocates modest stack/heap sizes suitable for simple examples like Blinky. A MobileNetV2 model with ExecuTorch needs significantly more: + +- The embedded model is approximately 3.7 MB (stored in MRAM/flash). +- The ExecuTorch runtime, operator libraries, and application code add another 800 KB of code. 
+- Inference requires approximately 7.6 MB of SRAM for memory pools and intermediate tensors. + +You need to reconfigure the MRAM allocation, stack/heap sizes, and linker script to fit this workload. + +## Edit the memory region configuration + +Open `device/ensemble/RTE/Device/AE822FA0E5597LS0_M55_HP/app_mem_regions.h`. + +Change the following values from their stock defaults: + +| Define | Stock value | New value | Purpose | +|--------|------------|-----------|---------| +| `APP_MRAM_HE_BASE` | `0x80000000` | `0x80580000` | Move HE core out of the way | +| `APP_MRAM_HE_SIZE` | `0x00200000` | `0x00000000` | Give HE core zero MRAM | +| `APP_MRAM_HP_BASE` | `0x80200000` | `0x80000000` | HP core starts at MRAM base | +| `APP_MRAM_HP_SIZE` | `0x00200000` | `0x00580000` | HP core gets full 5.5 MB | +| `APP_HP_STACK_SIZE` | `0x00002000` | `0x00004000` | 16 KB stack (doubled) | +| `APP_HP_HEAP_SIZE` | `0x00004000` | `0x00010000` | 64 KB heap (quadrupled) | + +The stock template splits MRAM 2 MB / 2 MB between the two cores. Since you're only using the HP core, you give it the entire 5.5 MB of available MRAM. The increased stack and heap accommodate ExecuTorch's initialization code, which uses more stack depth and a few small dynamic allocations. + +## Edit the linker script + +Open `device/ensemble/RTE/Device/AE822FA0E5597LS0_M55_HP/linker_gnu_mram.ld.src`. + +You need three changes to this file. + +### Add SRAM1 to the zero-initialization table + +The application code places the 4 MB planned memory pool in SRAM1. The C runtime startup code needs to zero-initialize this region. 
Find the `.zero.table` section: + +```text +#if __HAS_BULK_SRAM + LONG (ADDR(.bss.at_sram0)) + LONG (SIZEOF(.bss.at_sram0)/4) +#endif +``` + +Add two lines for SRAM1 immediately after: + +```text +#if __HAS_BULK_SRAM + LONG (ADDR(.bss.at_sram0)) + LONG (SIZEOF(.bss.at_sram0)/4) + LONG (ADDR(.bss.at_sram1)) + LONG (SIZEOF(.bss.at_sram1)/4) +#endif +``` + +### Add GOT sections to the data copy table + +The precompiled ExecuTorch libraries use position-independent code (PIC), which relies on a Global Offset Table (GOT). The GOT must be copied from flash to RAM at startup, otherwise the table contains zeros and every indirect function call (including C++ vtable lookups) crashes with a BusFault. + +Find the `.data.at_dtcm` section: + +```text + .data.at_dtcm : ALIGN(8) + { + *(vtable) + *(.data) + *(.data*) + *arm_common_tables*(.data* .rodata*) + + KEEP(*(.jcr*)) + + . = ALIGN(8); +``` + +Add the GOT entries after `KEEP(*(.jcr*))`: + +```text + .data.at_dtcm : ALIGN(8) + { + *(vtable) + *(.data) + *(.data*) + *arm_common_tables*(.data* .rodata*) + + KEEP(*(.jcr*)) + + /* GOT for PIC code in precompiled ExecuTorch libraries */ + *(.got) + *(.got.plt) + + . = ALIGN(8); +``` + +{{% notice Note %}} +This was the hardest bug to find during development. Without these two lines, the firmware boots, loads the model, but crashes with a BusFault when ExecuTorch tries to call any virtual function. The GOT is like a phone book for indirect calls. If you don't copy it from flash to RAM at startup, every lookup finds address zero and the CPU faults. +{{% /notice %}} + +### Add SRAM section wildcards + +The application code uses `__attribute__((section(".bss.at_sram0")))` to place memory pools in SRAM. The stock linker script only has specific named sections for LCD and camera buffers. You need wildcard patterns to catch the ExecuTorch pools. 
+ +Find the `.bss.at_sram0` section: + +```text + .bss.at_sram0 (NOLOAD) : ALIGN(8) + { + *(.bss.lcd_crop_and_interpolate_buf) + *(.bss.lcd_frame_buf) + *(.bss.camera_frame_buf) + *(.bss.camera_frame_bayer_to_rgb_buf) + } > SRAM0 +#endif +``` + +Replace it with expanded SRAM0 wildcards and a new SRAM1 section: + +```text + .bss.at_sram0 (NOLOAD) : ALIGN(8) + { + *(.bss.lcd_crop_and_interpolate_buf) + *(.bss.lcd_frame_buf) + *(.bss.camera_frame_buf) + *(.bss.camera_frame_bayer_to_rgb_buf) + *(.bss.at_sram0) + *(.bss.at_sram0.*) + } > SRAM0 + + .bss.at_sram1 (NOLOAD) : ALIGN(8) + { + *(.bss.at_sram1) + *(.bss.at_sram1.*) + } > SRAM1 +#endif +``` + +After these changes, the memory layout is: + +| Region | Size | Usage | +|--------|------|-------| +| MRAM | 5.5 MB | Code + model (~4.5 MB used) | +| ITCM | 256 KB | Fast code (~89% used) | +| DTCM | 1 MB | Stack (16 KB) + heap (64 KB) + GOT + data | +| SRAM0 | 4 MB | Method pool (1.5 MB) + temp pool (1.5 MB) + float input buffer (~588 KB) | +| SRAM1 | 4 MB | Planned memory buffers | + +## Configure the flash settings + +The Security Toolkit needs a JSON configuration file that tells it where to load the binary in MRAM and which CPU should boot it. + +Open (or create) `.alif/M55_HP_cfg.json` and set its contents to: + +```json +{ + "DEVICE": { + "disabled" : false, + "binary": "app-device-config.json", + "version" : "0.5.00", + "signed": true + }, + "USER_APP": { + "binary": "alif-img.bin", + "mramAddress": "0x80000000", + "version": "1.0.0", + "cpu_id": "M55_HP", + "flags": ["boot"], + "signed": false + } +} +``` + +The key fields are: +- **`mramAddress`**: must match `APP_MRAM_HP_BASE` (0x80000000) from `app_mem_regions.h`. +- **`cpu_id`**: `M55_HP` tells the bootloader to start the High-Performance core. +- **`flags: ["boot"]`**: marks this application as the boot image. 
+ +You can view the completed versions of these edited files in the [workshop repository](https://github.com/ArmDeveloperEcosystem/workshop-ethos-u) for reference. + +The memory layout and flash configuration are now ready. The next section covers preparing the test image. From 206e3848f07984deb3e3326407925535a8ec7dc4 Mon Sep 17 00:00:00 2001 From: Jason Andrews Date: Fri, 6 Mar 2026 16:21:29 -0600 Subject: [PATCH 04/51] Change author_primary to author in copilot-instructions and other templates. --- .github/copilot-instructions.md | 2 +- archetypes/install-guide.md | 2 +- archetypes/multi-tool-install-guide/_index.md | 2 +- archetypes/multi-tool-install-guide/tool-1.md | 2 +- archetypes/multi-tool-install-guide/tool-2.md | 2 +- .../layouts/partials/head/social.html | 10 +++++----- tools/report.py | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index a18f430078..8cc8425334 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -77,7 +77,7 @@ Install guides must include: - `title` - `minutes_to_complete` - `official_docs` -- `author_primary` +- `author` - `weight: 1` - `layout: installtoolsall` diff --git a/archetypes/install-guide.md b/archetypes/install-guide.md index 7d12654230..26713326c7 100644 --- a/archetypes/install-guide.md +++ b/archetypes/install-guide.md @@ -2,7 +2,7 @@ title: PLACEHOLDER TITLE minutes_to_complete: 10 official_docs: PLACEHOLDER LINK -author_primary: PLACEHOLDER NAME +author: PLACEHOLDER NAME ### FIXED, DO NOT MODIFY weight: 1 # Defines page ordering. Must be 1 for first (or only) page. 
diff --git a/archetypes/multi-tool-install-guide/_index.md b/archetypes/multi-tool-install-guide/_index.md index 5760a17774..d026c3e0ff 100644 --- a/archetypes/multi-tool-install-guide/_index.md +++ b/archetypes/multi-tool-install-guide/_index.md @@ -1,6 +1,6 @@ --- title: PLACEHOLDER TITLE -author_primary: PLACEHOLDER NAME +author: PLACEHOLDER NAME ### FIXED, DO NOT MODIFY weight: 1 # Defines page ordering. Must be 1 for first (or only) page. diff --git a/archetypes/multi-tool-install-guide/tool-1.md b/archetypes/multi-tool-install-guide/tool-1.md index 1f2d78b130..c5dc82cf7e 100644 --- a/archetypes/multi-tool-install-guide/tool-1.md +++ b/archetypes/multi-tool-install-guide/tool-1.md @@ -2,7 +2,7 @@ title: PLACEHOLDER TOOL 1 minutes_to_complete: 10 official_docs: PLACEHOLDER LINK -author_primary: PLACEHOLDER NAME +author: PLACEHOLDER NAME weight: 2 ### FIXED, DO NOT MODIFY diff --git a/archetypes/multi-tool-install-guide/tool-2.md b/archetypes/multi-tool-install-guide/tool-2.md index b8f23ea35d..306b86278b 100644 --- a/archetypes/multi-tool-install-guide/tool-2.md +++ b/archetypes/multi-tool-install-guide/tool-2.md @@ -2,7 +2,7 @@ title: PLACEHOLDER TOOL 2 minutes_to_complete: 10 official_docs: PLACEHOLDER LINK -author_primary: PLACEHOLDER NAME +author: PLACEHOLDER NAME weight: 3 ### FIXED, DO NOT MODIFY diff --git a/themes/arm-design-system-hugo-theme/layouts/partials/head/social.html b/themes/arm-design-system-hugo-theme/layouts/partials/head/social.html index bdf700dab6..531bef36e3 100644 --- a/themes/arm-design-system-hugo-theme/layouts/partials/head/social.html +++ b/themes/arm-design-system-hugo-theme/layouts/partials/head/social.html @@ -26,30 +26,30 @@ {{ if eq .Parent.Title "Install Guides"}} {{$title = printf "%s: Install Guide" .Title }} {{$desc = printf "Get up and running quickly with the most common tool settings with code snippets, using this %s installation guide." 
.Title}} - {{$author = .Params.author_primary}} + {{$author = .Params.author}} {{else}} {{$title = printf "%s: Install Guide" .Title }} {{$desc = printf "Get up and running quickly with the most common tool settings with code snippets, using this %s installation guide." ($.Site.GetPage (path.Dir (path.Dir .RelPermalink))).Title}} - {{$author = ($.Site.GetPage (path.Dir (path.Dir .RelPermalink))).Params.author_primary}} + {{$author = ($.Site.GetPage (path.Dir (path.Dir .RelPermalink))).Params.author}} {{end}} {{else}} {{$title = printf "%s: %s" ($.Site.GetPage (path.Dir (path.Dir .RelPermalink))).Title .Title}} {{$desc = ($.Site.GetPage (path.Dir (path.Dir .RelPermalink))).Params.who_is_this_for}} - {{$author = ($.Site.GetPage (path.Dir (path.Dir .RelPermalink))).Params.author_primary}} + {{$author = ($.Site.GetPage (path.Dir (path.Dir .RelPermalink))).Params.author}} {{end}} {{else if $is_content}} {{if eq .Layout "installtoolsall"}} {{$title = printf "%s: Install Guide" .Title}} {{$desc = printf "Get up and running quickly with the most common tool settings with code snippets, using this %s installation guide." 
.Title}} - {{$author = .Params.author_primary}} + {{$author = .Params.author}} {{else}} {{$title = .Title}} {{$desc = .Params.who_is_this_for}} - {{$author = .Params.author_primary}} + {{$author = .Params.author}} {{end}} {{end}} diff --git a/tools/report.py b/tools/report.py index 4afe5dd14a..4435d94fcc 100644 --- a/tools/report.py +++ b/tools/report.py @@ -56,7 +56,7 @@ def content_parser(directory, period): logging.debug(f"Last updated: {date}") author = "None" for directory_list in open(directory +"/" + item): - if re.search("author_primary", directory_list): + if re.search("^author:", directory_list): # split and strip out '\n' author = directory_list.split(": ")[1].rstrip() logging.debug(f"Primary author {author}") From e48bbbc8b50c71f562b7487b14f8985e4192515b Mon Sep 17 00:00:00 2001 From: Neethu Elizabeth Simon Date: Mon, 9 Mar 2026 16:28:34 -0700 Subject: [PATCH 05/51] fix: incorporate review comments --- .../container-lifecycle.mmd | 16 ++ .../container-lifecycle.png | Bin 0 -> 37625 bytes .../github-actions-ci.md | 2 +- .../mcp-communication-flow.mmd | 22 ++ .../mcp-communication-flow.png | Bin 0 -> 55743 bytes .../run-testcontainers-example.md | 191 ++++++++++++++ .../setup-environment.md | 2 +- .../write-test-cases.md | 243 +++++++++++------- 8 files changed, 375 insertions(+), 101 deletions(-) create mode 100644 content/learning-paths/cross-platform/automate-mcp-with-testcontainers/container-lifecycle.mmd create mode 100644 content/learning-paths/cross-platform/automate-mcp-with-testcontainers/container-lifecycle.png create mode 100644 content/learning-paths/cross-platform/automate-mcp-with-testcontainers/mcp-communication-flow.mmd create mode 100644 content/learning-paths/cross-platform/automate-mcp-with-testcontainers/mcp-communication-flow.png create mode 100644 content/learning-paths/cross-platform/automate-mcp-with-testcontainers/run-testcontainers-example.md diff --git 
a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/container-lifecycle.mmd b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/container-lifecycle.mmd new file mode 100644 index 0000000000..de7e9859fa --- /dev/null +++ b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/container-lifecycle.mmd @@ -0,0 +1,16 @@ +flowchart TD + A[Start Test] --> B[Create DockerContainer] + B --> C{Image exists?} + C -->|No| D[Pull image] + C -->|Yes| E[Create container] + D --> E + E --> F[Start container] + F --> G[Wait for ready signal] + G --> H[Run test code] + H --> I[Stop container] + I --> J[Remove container] + J --> K[Test complete] + + style A fill:#e1f5fe + style K fill:#c8e6c9 + style H fill:#fff9c4 diff --git a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/container-lifecycle.png b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/container-lifecycle.png new file mode 100644 index 0000000000000000000000000000000000000000..643b196aa94c6c2cea7adf63c81f59813d7089be GIT binary patch literal 37625 zcmb@u1yogC_ddE0M?onSkdRbBLK*>)I*15LN=OUR4bmMVB}yY*qJSbucPdCrN+Ws@ z>5}d{`M&qNEd+Sx%S#1<@_ZlYTt6Cc!ay9x1I%K5~b=P$Sc_U(t^wA3X z&4o*EBi)g&HIEN%FO919Mjf0(8_5xmshzaCPYG`xDLV3c;RylfuhWi|amKk&|z z&MTE}d^~e={%PK#;&zQ)pZ|VJJqYzp7cb@hWAS@|vnGTyBIvaX=`eS0uBPa~YI3M* zk^fL5E`m6e@RjiQ*Rd#A+07!z5iapnF5jK&{`%S~s;Z>4mr+RgH!k_>C2w=*x)(=k zQ><$g^{kg@ns0=9AFJueX!JDlth5X)d|re-B!a!cGWc0wKT^|Po~sE?!Jkws5O54rYyfM&e`*N4Ap@sKEgLJ zB_ku9Yh^y>yl}1c;N|T?Sg2B@s)*a)x#NWMK1pNg$Asx46*s26uOPc8qV$O353hv1 zUq?zlQ!gkAiySqJo+gK<(IJO*OJ@88d505A*w1&uT!zp=;e;3J6s+$f1~U>O)m4=O zCJ8me1xuQ<+{gV#i~a-}A0Iw^sI@+jvK!ym9H{U4Dxtx8rh3W;*DvJ2m2qXNY;kaA zbu##^$EV)2f(X)RSaiMX@c8%fq~q6Dn@6_Wk$vJcTolOb{1eytovH6bLj_Zglt_Yn zc9r`06Y+YN)ant}d5@EOYwrU*Kk17u9!y#rimm=&+Lb@H{aqOv7l$KdMCx%POx1{|fR8Ta z(eI-0G;!oi(DQww1CLF&hIU6EV;)Mr`WRM%bn2tIaW;ORBgd@jpg+G-bzb|N=((8U zBDw$iyV!8l%*`9nlu-3Eo@>(6*hiLMpyG_*4qH1mF86)jfns`oex8-FVTS=v0tY#g 
zd$?+rjQumSf$%s?+$B=TPfAMi|009NbPce0;07UUg1l+^+Wj>RW;wQ#p-!P!6Xmb- zvRI-fh`q2wpBfxLi4Sw9h?P!Hy}IEuSt_B7HEi9%O%ZVk*?ooQSKP0%UQu0BMcNS0 z`TFl`JS5LIO=IVdq;%E$s+O+6upPV6TeIU)Szas(3F~4-@9yLW6CL6uchd>mVrzs# zckF`JXN7C}iG(Ex%^oDR>-(_>GN?Ne{5NI5`y;mG~a<-M-Ck_%nnz=<9W) zcc4yxBFqftXT*8I+Q1q+(J6*Ko)l?{9}{t^CiN1Q5Mq%K%6XfcyRwmHI5|0?L^tC2 zw~?;rR~98M(|+lwhQsY2)c8DpP8+O;@9^!uDg2@Q%@EI z*rt=t-uEPx%6jc=cu-EQp7Or-V1$|+`Is( zoT%yRkA)qvqtk`N#Hm;8{M-y>rh3i0%f(RC$F*JylEW9@GtJT4uZ&uS;UPizrCy1IQsG-spo8^@CRaE|+m1x_?3tXW$GfuAQn%6m&? z_0ZdrQ;>gfxYZnBfc%V)EmT(zZRyzcA`koC#yH3)-cX)>PfjK$^DGlK{`Jt%;NWDW4aaQ8rZES7+-<;b2!ONWby#ML)gKe#j7lu!4 zRnCGJlrzx^z>CPlHL=9VMdW|LnU88QzR+)7R#w*D-o89u!QvVDy0_Q(gN5LhW>kl4 zV)%{B+Fh^uii(}?WTAGtGxz->@c!fege}I)IY${8+6z&_?HwK0JkKyu2{N$6#l;y% z_%VIK{rxs>_7A2wfT->M~sV@XSwB8^Aq zdOfCtUQWXj9v&Q2K0L?VK>#O>jEst9Fc2s>M90KPZJ(3n`&XR6z(Pz+{Lgrqu3q*3 zS68P&1V(XXJ`M@$e9R{{6d(Kq6(PIf6qv0 zXy`u|o4ZUZ_c)OaxuC1m#O5*R2w>9$8xaBRpF?2nbwB31mbNxY4%5cg7PBXf z8oeNcqs@5v^6)46fF??Dugahx+|lWbeI^N^E0?0vQ-fyJ&i;f3eJ#wF-J3cT8A&5sdO^l1B zN)_?*w*}9*IXZG9J3iPCy!;=R?tevd|I^C;uge8ZJyGr2PoF+iQTZHv;r-ZHwO-9O z4-XHM6$vjL_f-jrit6eYUw&z8mJ0Cl8u2A^>Zm(7^vB&WlX-PzxIpjVk)m`5ZCuj6 zA`?lOkJ~$pBKiXT53QQ*;p+IMQAcg<9XEG(Ew^2<3lWmAN4xi%_|1qV-0XD@-^Imk zVO#Q+mu{CC1WimV{ru_Zz3(;ILCxGE@_?1B-fg|PtLyXU&kJD$+&Y3vtRKa__a6}y z$j(RhVvLPP#>yJ5T)A?xIZ#YYyCnDEg^^*}gLS2Eqa`v*DYts_;$*T0MKtO=A=Gr^PRov%y zejOj_pbQ4Gy-iL3AO;2oOvlWg2irvH`j&Q6ds~YMDJdV*#5}35Tp1o5to1qi`z2Tn zjm`?Er7bHh4Gjzn3&TzuZTu}>UalD#yM2+u)HK_7HxeJ|>w9Qvxj=jS*4*4233YH~ z<=nGpqWt_D-%?XGKjPZmDq@#uUD2y{(l0d7#x#C?MZI%z%Hu>lur7K58dBZR|ma*O}u_` z8$fRa&8;p9a4RKr`RJe{V7JS}z|Q8roi!sJz_3$H%gY3#O#MKf12=-tZAE ziHP&U&j7rN8kgnuzcc*Et*2P+hYx=x34~2!--Mop5L_dFzqP%6=I9_ck-znYZ|~61 zljX7H>S~V&x=H43v!Rb#ILPAAvXhI8oV?e}YQrsV?qJQ4vFd<^6LOna9|d!pE6 zx$>#1=jNQcj!uo^Y*3y?K=V|pfYq;&q8OCHOouc%`J<|;QEBPGM~{Z!zgH6vq5Z`n z&ajtWto16sC!BgUw^^>&YZn>%C+kS8R|FAKm6`X^uTJIA2p1C|A;~-z{atLT^$pI8 z3)u>J&5JZV26NHO$Aq}k6;nQE{OH6yj<$xwJ03I@n`}%>?BBZ=bSW$`S?BDTl%Eb& 
z3bmZu+o_jpQSK7lv59*qCi&f))SW&MTn4ZDOI#2B=bv4j~+i> zS#3z@HvObm!^t|g@;R_GCME_4M^Mm2+pv|9e?!X2NzLGA+VK&del=OSQESn+)S9zr ztmPHFH6A~%_t@fw`AHUBCO97|>bCazjkNDY9lATPa5;5`wpWh6ms&WY7?^$^e$SH> z7mtaczc5-fod6%J-do45_Ke{ZI_{=UtFA=;ijzY}kxS}Qu4>B42FAa?(EIE%-@0YE zwcsmE^KE~t^TrJcI&uYu+;fDr9&^1dFo?;@r@}%)<4u{JNdmB)w-&muUXAkcaYPzN ztA=|sjfV2at9*SUYdlYCTtBrmi=Oo>FFyE;ZJzS|k&DrJgrn?5ZT*E^{P2-*<$n)SSSYrwC@%)y9x4>w~Umuv5 z=>KArvD?beFY!^csIjBNqdi)nnMYF69~Q)yAa@N74J%7aBAQz?NXRR?OAYB$G_f*iwe?u`tH7FrVI(px4uv#N zem2O3QDeQ%;o=4nP;d#|ym`EKH7@t)=zt74-J2z&&Yx_5HvVcB|B)fdh%hitdk@%! zxlzbHY4_9**+s{srw{kyz^DWT%bOcQPnco14`jb8sF9bJK8w)YF5MV>FV!4@AnE$- zPXoaqg?Rf_d-5U5s;V0;IpQTH{77SS^UH+huYac9x0aSg(3nS$s+Bc~kb(kl=HuP% zWna<@(jnYuaZu~S-d_n5iKynKTEQf3!+W$@K2T}le^M;Oy zeueWXigl$l&23>YGgGQnKIPWH$cWRvQ@a(T;5nqwIMjD? zN)0z)y~XSm4B*S>74Ttsc~nCMz-zMzVJ$(2?@Nb@xxw)wKek~F8@hu;( zsgGI&A@?EF6S5Oun@T=hS2MBR&WXBGl@&tO(IGCNudPjp=-M7l({z!YRayZp#p*qGDVskj*fYimFDYczu6wf7T>qjcP2QSZ3(Yl!}{mve`$-F z0^@}rcn8VN&$j_?%IfrMbgAb(^7gU74yd3V1r+qk-_l!L#tnF4V&c8=X9l))+-vgL zKYu2wot-;hOXTI{d2G%dt^HbFTI#ZENWH_+HRAv$)5O9;Lfrvmr-{06t~OY~k^aWtj|Cnc(x|I{gRqH<%R5aXBIJcf zY+BlNGW<{a4V7j+4QXj##7`Qwpu53A*O{5a!a|XQ9XszW_OIRfQ=9V}YHDg-T^wBc z;zx(>XM(@nSMqHz>oHYO=nf=^*U=iiD;+3(FKFFu{aV!ApY9td6m?zgS7S4?=RuF@ zMcs^(1<6xyJJ)y!u!@VDuKj$No&6^>b2#e_-TA=Jv9Ye6JGN|`ob!Y4Zy+hRokQFj z2VzAYNJ;UF9{vcUP`Y{ZCgU|a-71Hn?)+7uDS3G%U-~N^lLL;L&}_46I+lBDlaTAN z=+?3K3;0rt!`u+4Wxr#V@ZPWV_N?jZ+FTw#VP|c-!OcxAxI~Rqx~<2)ejT1OTez_B zVIb#%>wJHmr>lk`_uS_Em+DprnFWoErRcI&|}&26ToBIkK*ski6P zKizrUdbbsHvA=kGTU(W|pTSIPaty6cAB5(}K-ZUii$q0nrDbFQYQ>FgrSKXZ`=iIx zB_a;@7q7awq}JB4q)GT*Q@(iCllv@1VRmN2^Rww1AM&h_#)a=X;O3qOPQH5S zWo~M^vM?6k!~MSIj*U&~SpuJvgViaiJKrL%2-wc|OSrE7xGQgKYO1&tBD53!ca!Fk zwzk9XwgQP0U3}!|;6j>^orTom)$Nd5G~a678&<|EAWrdRc_d&vG5GQ0=Jz~b6hfbV ze|gNg(s|KRQSo(A5nE9N%cIrE^=5oMy@Q)a<*a<|v-FjvH@*z1D-E@#a&dEWv$OLE z3Laqta)1c>{MicvIuE(muU{9AerSI0+u5-ze_hV&%Fkjx9|Z-X^^QxK#l_J*+}6G=-Bn}*jLb1^6^Q7ST0FBX60AV3Tu?CnsQ`Le!kACMan%j 
z4UPC%vc-jk(DVzetZm?lt`6yHYG!0*jZefd-CQh}m3yFT@$g>Y>=`zizO*jfi zf_|&1_x?Y!zyFQPt=0l+1GVDbXrHGqxhs2#Y*y{Qm3lNaPstzCoH&ZN@p6ln<^|c= zg3QcS-k#bi{da7~Tw*`_t6w>k7mMZ+PhVYL27{nbj3-I*bNWAYH%4)++(zzSU=*WR zTwMGQjt7Zb*_&;KVB{%=?vNqQsit0H-j?DPLWP&lw5 z|6pYmb>=c7e}?(bxWAi6MiTx(y}y6|j{Gk)9}s~451N;iW%w^Nf5`FfGdy`lLhk)3 zuC^G+Ci)NQ(AM_fH>#?t{?p;J*3?Iu0RKnE$Nzb=nC9lY|6DXQGn4sevVPRM|C^Wg&8QK`(6 zMqJ}O*fP!)rG&U&Iy=J#LzTTp{HzsHZ%cGX{mrY#`ETCv zdF|Q*K>r8DPyB9=;pE^5iRjzi-;Wj}{&0ahQuxM=8{l$q19;(91qFrTR>`ejp;bhRd?Rb1^ z8qCLwmIfB)^(_P|*M&hnLG!6`BRHlR=7lJ&C`tsyugp5)Cx4UV^oVA}Z_}Ii^eRIk zip|OpIn#x?WPt7;{D_=OK+PAT-rx68Cm-)R{Z&3Bm8YTiGSyB{H@O{gq<5U`V|WMjB-vT!EP~C$dIj zPv553a0I!G1J7|v?clPKo?b#}3t@_2#FCDE6{rWNoh1ck6Di+9o(*&4{gg^PJF0Gc z;mw)35o8935H;69_=43;D>6XqUBT0#*LolK}eLv(eEr6gbO%UcyTer zmS!0=L>Ez&M#%3>d)x=Tm_>BREAOUhW?5&}z0gGK%w(YPzo|Yr5K5mkfk}@(HOeA%lt)2&h zMc$qVF3V&5iCwmn9ymy{&D*>@4K=mi{hB_j?C%2jHbi8lC@LEi`;D`mue%rfTiCd` zk}8|O6{uQRJT*1FjpV+6|4Dl*A|@tGnKecHD7m(l#*j)+ukP{>#8KlO=rWyh& zx4@wx+6NC_5D^lRlIrgraFd^elsV%8t?>2h)@*!bLVf+OUtdtCY{D)3Z;g#}O#$P+ z$5l>$e~$vVg09hi^8>Sc`-))^5qgztBI_TB>1x+@*X!^u*rFSg1mqBd$?8-H&B7>p z$fv3$1MVIE?PSMJu_)1Tu(NM&(s0FjMn9lp#nwlDboCealASDD{0%@{Pj_Of)_wWs zPf9W;S0KrN3^0bY`p-UB#iW}0pHV`-|) zD&jn8c7>f?gVpf;^@wx{-;~|&5&jY1(#2W_N+gpZ)enCtNRHO5!il`P2&hewQFWeFI zQzyTI&m)CS^V;NLErvx-PGnbdFaJji;M5{fP*8vZ_E1S)VchFrnG6Ac+Du0g({9X{ zh-7@SJBQrAzwYI8zriOhJbW@VeX@L++iH=x#Ox$JcC|h6(WPPh{uuVRHQK_emEKna z1V4ONFLR&Ak#a2f_$h3+?h?PC;Hyyg2ZtgYODjM_v9ou49~`{F-hsjZt7~J0M(n1p zZ_oARRaf5@*wPy=5JMOQwrG(vxGU!gCsbVn|Mfm^)?4m{A8v0i=ZQZ4{C8%Ni14L& zx=Tka{f~NuiE^9CD#zAS8fy02q_{q$1H$Nibpfd2N;F&o+9z9n&%bj9|ID+}(%)5% z6nVL_drFTjb=iWm41Y)}ROiL<2U_{Hn7%w}7d6+ZHu6Oa8#6q#wcUo%!X$F)R!)5V zD#&0kJ=1B1H2$a$OV!Ik zu1ekGr+f(pR5AUyJMwCljr9@=};TWVNNO?bb%m4K$9RI8Y?Oy8(Q4+pPmzw75u(i|DnHW1y+?pM3vR_AyL+?hC1$=grrp>& z&mIy9*(HkiA3pr~ZCs$=a7RPKZr7e3D@-p^qxjix^bCvIx@R?VWM;cfBn}D)s_q?@A-#?@(eCs|h;6->a7ifk=iwQegi~ zMl+ECZn0Dz5?h1n!jR9A`}GT*Mv_G6j5}~^>D%UhUgX71KdREl53DVoT&G26D54W2 ztuCSn-sE~&xsIH8slo-8O=+FEd 
zuJVOA>PABGV3b!l7H)BVf{;9HKy~>+fHztucIOd*I-j zb%rEIlB)U zC7iZS+UTCE6#C-=5n>04j~6AXcb9O8QVmz;=iRrLv>bvnNti`sDkj$UyzJ~&lZEVF za9`co{OL@O_7bPIB$^+Rc!H)m~pqn0M-J_N8(LIh)Cb7k1V&&jSxoM#y zE88|Z6@4;;@1|?(lU=1ul^oQO7U)>+%WqzuPYD zy_B@6q|N1KVL469#B=Fc8dL3A6J7D81z)l?Kfod?s+-N<3jk~ZUO-=p8pSJYO;o>o zhZ0mUyhdM-j}AAGQ?&(9NMghm=!-oJkzK__;F#{e_ndv}5A=1v_( zTwFZ(0zLUTGOUR;b0Cq|HJ>MyT5l61JAV)gSq*)x^V}&v9jZ9CJjKmF5p8UXV(dv1(@|3!#0vCMft1PqXQMwAziFwD7-t(sn?58_ z*zu1)8c(~5#N#tA19UGYc9i$2KDuKpDbAlv;?ke#X~)@bsgM^P`t-uK5OoV0fZWKt z%FowL8MY$seoGN1)fcoyRSQMN#x@)Cy(_zP@xTzbcvrr#qQdpjqeno$k`3R-bF%vh zV>vq5iEl}wZpXOLzgPvO3f-UohWO~mg)uXDWRnFbP4B$!90ym9lxqWToe8he3)_6 z&1eb$d+e$xCnqJ>d?Qc~Un$BT zv$nlF{&f0l`|_2#?4-(w-uNEpz5V@rckcpm+BUloORGSA%jd8%CWaKmiII@W_Zh1U zHQ#qLfJucf)8E_6;CYeC!PV7wbM8_=6a0{tXkyV2XSqpES1bud?2z<>{AWW`3trVQYNh8jFXA$M*JiQ&SUQ*WS7B z7)UFP6dARr-6^=Pm=Yp0YeVzu%0>J@xFJg>;=1R>s^ao;+kxzE_|-;t_i10QKYa5( z8%9jRO$!oR{40`KxX+$F^YZd)OM7-^5=icXg7B~~M$e09rQ>n0F){)t0)Jp9Xy7~5 zc^?Rls76LWKO08{OB&*i6l4mi`= zm9A;uyhuYM3M@)-4nyLnPpk6-IWXi)eQ*5X+lEu}K3igw{c2eB^(ECnv+9jr{4MH` zSFb>OXL~Ac35e_&9;TQQ-xOXD@`F82KoADG%)58*Mv8?P0s!AN$SCkR+CO{tY?N4_ zVWt~e$aZ4)PqJNt{B@Yu7m({;bE{2Bp()0D$UtWIbAM~m`(T^d6TgMev@0Quf;(C) z5GT_COqv82*SGHOS4)y)1cImbztVA5vGZI0T^pG6$!g~s-xG0Y5T8ALc>7zbh>wqt zu~MczFrI(9lUXjFOFGDH9MPo1I~OC-VT*(%gR@4Ay zO-lFk>_%2r*1Y>|eoMiNmo7aqFu-h+y|}^0=P+7qaxF@>mQ5DIy>e?RF$t@CqqrVh z3)0X$C+z&WzASL@#ux60Tl>}9PE4B_lrTfO87#1ZMbz6d>59{te z=FXjGVDZw@=uX$>jT?Dj%*MvYE3AeKR%T8U)5D*9|NcI|{w^tL?1Oq9r2p<;QT@YO zNgEOp(zqjYv*mLjfxyk1<<_GrcB=NrhkGnaX<%Gt|IT#Y&?o?v79>;|1Jm!mF+fK} zMU7xoVbO3ruH6NN6N_R>uBY(R@bX_@f=e_xgoG-exIP(mPHH6G0`Qr3yP+grEqP&|xrZ!Go>$d)E*EcAst*58w z>(_f;47afw4Ie+s*J3Oz*q$=E{xyx|{2*|Vf?~MJ(X2Q}`8p)^9shJ)@@)s5Sa-L6 zod>fIvW9ITK+J*mbm#9NKTDml#!t!DUv4uFnPbVrncekil)-%ey9@N~d)GEP!9ecMru#zbI+B+x(Ov{2!cGJM=6yLitGOC1#b9pm zZ6QKxb7ncY&eGDGU;|LnSAe$w#aP9^EFrV0ot@HNgu;WnQMf-Ee3dp-SIW|0Vb!>< zo|+{x3kF6;bOg=F16*9(y}dnSB}tqYHz9!xbnt_1vX1WF-iy@AhG2tkdZ{Q?n|m@5 
zLm)WrKcuRmku%H1Ea)fR*WZ6lTulDP2(tecCJ5L9W2LKhYUTi*%CobBKBPND=j^Ob z!Ri6)Tl+B6e_}#+y~$qLs&LfW8pC#(f?{!QE>i65WphhQtdZe6@ESnW$ZeCUDZ>u_ zq6F;h9I1y7{sn4Gy0Niw?^?`FH8d!ygr(;PKRd17Q}$Oj9kcs81p{bt685A5q-{BuJO4-Xe?Wu>o0 zL4GkmpYy#``-RI_WEB)(L&H~O9s-q`?PPUm!%ZO}mhBMpXZS?1@$q31(V^O=;1;or zH?|?t0HbHD#3Vsxfzi`T1#hIYvocxZG=GgG6mr2{-rntWG6k&tey371Ro1#tQkI3e zxy_}~m`Bv87c`#UBZW_3V>0Y8KP!Fym)c*8iJ0N?_wVW)!69w2)nbB>MsogIdi3>)00 zY^9uuS`Q_?WQlrb2lIqajmgj4B!aw>5js~Ow%&hiOj~pru_S`4z%(#21}w2flBn+E zT3T7HZ=H5t+u`nUmP{t-qxSaP6-HM;EizedqbJ$ax*@+ZTC%sl zB}c?bLw#_kmWhZ>5}lZs*!h~X#^PhY3{f( zHb&-Mkq?kXG5eNsmVg}6!j{`hqvy|`hiHCld%L`}l!>@BAt50*FK;|k3UA$F54Z|g z<$B7>KY#u-Fg3lm%^69oql2fT6AWq}3i3aojIawz$sr>3|_hV5N z{i#B|nru3T+or{NZ7(#-}$RL!+Th zZQ+(7r=VzQZT&N2GCuBFROWo~lQO6Qz4kanL`tOaf+r?Ef=~tsnM+d-6cxeu{dD+l zZ6opm@6nkDy$CWgHs;~(u2<{U*V$8HHLOocN@~`VmjC{}`|?;Be2l&<8N4%RtjEjqJ$F_Pc2*$@z$YTghbog& z^FD}2eRqEagQZF~$RXwdQtSqafw1pIp2mGgGto&SXS9 zSD-?=_ywfec@daM$c5|IUHLuDzqh*{32RYUqy`XFV5^+@(VSAAN&NlMDLG!1$Y+SL8l*^TwYkDu%eY;8w#dJu$?-?TO;=uhF3AIKW0D%Rip z{=aos_pD?4`+uO22#O0x*1bK4s)>hLUM-z?XU~3k|Na8?A3Z%im|y|>pZ19P9B=g@ zTWkHrrtjaKAuhJHwUxiF%5c%x#01p{9)FRG@#49A;WSjVw2)Tr1`P2nNuU$*n6**w zLR$=hp|-QM`~j*1GczG!U86}MMu#e0Ao#&KU^}IPUD9qlpF6EBLizP@pGB;!#tRE; z-oLjKadGlG5eEqr4b52`$iI!+gTu+SX=W%fE53Ea+F^DqjnE783)oN;fTqL??x#;1b3T4 zD9OdinO|9{3F~svH)JRH#ocRoc)`#y#0Fw+Yko^lOr*g=ASx=+A>5$UYMD}b_=KeA zM+kR&XJ^#V){koEvxsTeZu`iH8WV~7Bm5Bl>ZC-iO^}AZd6Uu_wmR3p+c%`cc<_Pf+8zGO-S$Ki(dXz}f*oy^J(p5EB!{{n{)l=Hu0%_T$bX zwjU3HD4UFQ!+QE_oV&X)2xvje`}xZ%gkF8auG+w_fB2BHyBp8U+=B3cTp7_UGL!^4 zqHJ4WOW16Rh->YZk!;xCWrEPE%1REcVq-8tt>Lt21f~mXeY3fs4>lz#1MKZUwgRK4 z3RTjrr}u1Zx_{K2z#2T*R}}HO<+rPr3RxcgDT^E>dY`{cU7)DzN;nTXQ<;br$ZuR< zH*!_*TIF0Xp`#0Ula&P^86V-|S^?xNVQ+8%RPJO=jmyf!GAM8U{CNa!xT$G2C3OiEZ5l zBd?*cvr(gd$eKtTf`^-)=UO4V!$G92J%;rqO|0{{ym7F)0W5f#on3m=`Z0K2$nqs6 zrIYjXC^ruTvwNd=QCC_rKXlBG^sSE+c{e*|0|WY!{v`EdMPq)>CJ{r+6RX8S+sSV* zEs%91CFM5}9zqHFy_Hv2FH@iP4{7xXO*HZC^fphogsSS!WZ|)h3MPGVnhg6_* 
z<6`Tf1S{jwn>*-cYS&AaR1gtCcOb_3C%_p5jdXA%Cig7d$Q6I%;yJPt+Mlvd*u2rr zBF~?y`f0z!CjzG&A0Hnhc2?z)nz`TZF3G4WgNbw?E=X9y#!-WY57yGExJmE>0bdv^ zp_tv(KhX=gSYnv)Or*Pg29V2FdItuq4r^ooCN$ms{!NP%Qbi9nxDMnMrH5>Rky+&M(Niof(n8vlj-)XCc}YWvjv9BOW&8Rv)? zW}ttvC;AZMvEMW>Hg2BXfbviLQ&v5z_aXIV+rrS$`{0;HRh9k2!-)kmV!OJ!`1tvg z3mNvGZEiZ52(w0#J z$RN*zjgBU+(`tXAQf-uU2*E-C=jIt&I*BC(~3`stotu zOUkEP3+ksxm>v?fii*IL(gO<Lr7RSaY-`vYfBhKj(p;sZ8GCG+GvEw zxGnNa@CCq64sC61=ZJ{jZG>MGpr#acvo|y}w6tUgBEerFS@7s@CpRECK761o`60d9 z;41+`sBUYMH8Pq8Dk(!Nit$*A~@0MeJ#5^{kMc`da(b*J}lV#o^+*chPR^)zP>g#oVW~^ILpz`JL9=$fB)`E z5sq1s{A~kP9(X9Q%)mXKmCnLNAnrjUn7pYVh)Ss`DWNzfN-PBXrlme|dZZBG zCXjt_hHx2bU}R+Q(B}BJ;cLMv0ebL9xlQtJG)xnXgip#~rZ#YNHjq=sa{C@ER{$fT zm?oMym>G+!K#d?05)uRi1XvrgI*%CO{Z%P=bs= zQ!C_20yhRL5-KVxK&Ax*3d{roQBs}T`WuK$jeEX}LO{~l-j4G7`_}>X3{XwQcP0@BjSyS+{AJ&{@DunVFf;J0bK=njWdj zL;_hnsNMoR5G@wZmpQ$NcCPc<16i{mcrY=sy23&p2?@v{n!w(N=hW)!>mhXvY}jCb ze{o@ki&!4fcG>#Zhg9@E?;}5wY zk!ME;i(B{j^cc4*2sqo)uoA|{waUT`rd>)ygxaf)nGx&{0x$HKMvF;%|8=vC?S9yV zdl6C7V8^cdU;FkvQuLoq;I*)>%T8Q{6z_#n9A-)Ef;GWKYxk6 zlM@sKv3Oo{AR{NQ^-b?D%j1X5M>S)c0W#vYC_M;0$o1c3XZsuGNlB=(xqj#1sIz z8p+5&z={34v#JW&dGatDtMR_BE*j+dohR~L15m8x;8+5=>i2n8w}h)mJe_!XA0m+u zjzHT)L<)GpT!^x+E*U+2a(ujv-ITN_5)rSgqB2-tC-yW83d4wqUTJZ-K7@InI~GTrJcb4IzRBQA`C<0JGQ z_5wfu<3~I~%A|zdjw#=#0N9HB{v!>_2GRaxsYTo_9^6oE94ZmebjyK>K}?JYdCi%O zhX8EjP3%e*`gFL_W4ZpBjt+ea@@5-u!16d={4~YKKM=|d-Vy4cK4xWkE{)VTHi}_e z{(RFD^(eJ7d1o+M>~3r8&B*vAq_ev_9|uR#zG@=w1_cVEIWh@AhL~8d+*a)H*t?@E z(Wu3$u*?~P^_qf$#|A%bv8{F!Zup3ZYk?V`7F2eyvy0QoJ3v+%luk%SX0rXlU=!#u zT$jiGCFOu5)Y{O|$-^rT4r`S`t$`GlSgj>kiavf8Kb*jCW_z+XyA6@P?@5cbAPDkp zBqd!bxOCG}QpheAUP4k*Jx|dNXq*mq@IZdl;JKp>O9)xpnKFc=JqVbYnE05OqC02m zJzEcE)4xL$E#g8$Kv;Vl5DE+f@T}UZ2jDu;SiH;4wFj*eR1o1^c&u4EcY&66a&9gc zrUumA^-gToAc+H4L~w2-A!$E+_dR$T>rn@YIaZ*qW?*38+BG_)ac^S>F<8=Uh>UFW zJrO&Hz@mrpGbr&YlL^<|-bns3%?z4+a>NHd2$aq&ghxeQ6!JX=d_%{fy@`ihbo5!j z4oHh$%wy(f6T!{VbF8lDW_>4G+PWZeD396MNQP8{m9_Q$riI(%$KM+L896w{{_ed4 
zkr(LoKxp*o{d-;>9^Fd&s?5xFs9XR#fyG4w!GKoF&AiE&FSITLDlw6t%VPIcRr9j5 zzjj&?BBH)@TB{z6jP!qIW?o(q_|{h>64QQDsDg>^;Y2+xAp!!PD&2zo{D6ubruCth zyESr4e3u$J?*5X$9+vR@RszjagP)-4aRq6Om{`67wHzWV=ZOb|UD6hDgc*E_0mp>u z>gt?Xp7mGzScL>Zn}X2L#Hy-yr6;azT+FtUgQY|W8TlV8t3_Ge#+DYn!`7F(j?|Qu zIV%O{*1J^P-A6$nllA6vs5O#NTPu&nkBM z-3V;U!Xp`7&G;Ais)n-7Qx=)Zvawn!DjdgR$Ey)SG2NvX1u zk(d=T_&iXLgY|Wh0tXXJOpIKlKB!%J035`E9>sbuzi90-??h;4Of*Rdw~S$Q$3Aj=jp3%_bW|ol9r}=qKZ^SM`s+WbMM?)DlxmHQ%(a(S|`_4-&sXj5Z951d2h~f z8Pa=yem;*t5pOGypv1Ie^FMoA+S;01%~WLD_9kmiAV*PNwoG~HsXi-Nfo4(W59g}t zZQu_pU6u_cC4E2>luVmOWP3bsWRN-~pqo>T_~4S`+~>PGc_m%q1VHVD&% zC+jq{My%>rIlS&vcXU~q!S=I&2<{biJdlcz?qcKSX5r?}sjQThk*Q#F)h4yW$p^u1 zGwJo}>iuBiWv1=FU@2J?lIE8t7B`r+IEFgnPL{^XcyDfcn9n&QbK|al7>3^7UK$}= z%SVs)7rr-KY`FH{RUY$xC@Gk5V4-mQfKUX%rp-hp?3A+EZ?udBwxBm!uUD8{)UNZ8 z!!!ngd5wy?0J1|U3HiIn4E3kJC$xpmi|OMPCkmpbM|-nz3kz3FOb&Zo{X;kr1S-(7 zG8kN2!?zHKwH`qgHq;;=Y}+u?t#u{vo;3O|_oU1C9BoBbTN&`EEqLEQ{LRi z$k^xDUr+9bPfcar#|RNUO-*H0#+y7mBqs|u-~OkJuTyVgWl96>H6BI1$z>8kn#X2FRT)ecnS&%;O&TBnWk`FmypoV(TVRquAF|Xt^L_F z4=6H(^a;f!1_0zs26tzD96V$_?^|=pn{GXw0XrO8)^#Jp=WlWTKmS8S|0zXb ziRe+OC+0rv&+)p%sirX$E%xjTpIxI#q>s%jYl-qtyD$bw&Hp=6XPVCCOR0}u%-K!8 z^QScqgEY==)Ya92mj&!1i{mBwpMQ3u?YvD7S?>SU+*gN1)pq@E8fg_lKtd2uQbD>w z#2}QCM(IYn1yn>tx}^oAM5IeVMY_AYySvW9$LD;{SLZtCd;d9HjEpmT_UxH`@B3bB z{bKEbM&s7)+t>bh9~c`O|LuM76SXxWBLSY?UmO3U_4w`ZFt#cF+h<{8;~;fK0tXe$ z8nC=ETa5zhyg-K{E{^(_+~fDQ`R17_jN{@uAnr*1^eOm%>qq{!WwM5B@4{+-AT&91x3GR0pQ3msAQ>~faft*xE%qW+o}E1$;Pd;xOiYeX zn6DpE(=iqlymKU{1x*b|W^iA;01Cp{i>V*|elZ^}G0AUJj=&H=jsLcIXarR5$5X@u zu++zoudXaZK|Dx^^Z@-#V*)j_GmXicJuVIx7neW!kRHg~e3lWIv9?%z{*h>( zN7MZLJcJaAK#4orTi0uSd5vbgq`i%k@BS@)>k9c?A2|PtN_=_&S1pGHZ8f~-LWHVG zzfQS_g-sWwh^_pjH%4x>SVabX0I z=+(IY3y>1^0_d0_;JQQm7%HdrRCnedXL-g7@Mu)H-aVQI z@_%gXFyc}B@H+b}Og?!Si%pGTGL=@e=L3g0SV-a7A|n5+y{BWr8pu@EIA2>^ABe5;?Vkug_A1CS~`{d_~B=YQe5 zkW1x5aJ?H?sOOQJW|6Sc-az*Dq-og#e0)$n-oM{%jgtgGkDu2T#bax0S5V7UQqrk4 zuH5gv^jms}&L$_d3(ZY|ma%_e_r9?3OGU*qW8?cUBw&d_#-^rz>T!rvE`0n5S%5G* 
zd+pDUS+68WxufwBRL?Oma{w(Uw>b;c-httO;W06B2A7=a_OJa)`BcGq>y{0q3pX#r zoC;tM@_n)z_zemQHa0d7gYNyBtWHh-rk|$-bX$-N9<7Gog+E5d^F-cVNd8>+ z`a<2pk=LF3Jn$q(36cgJY(1vPvw@kFM5BtA)2Mi00 z5Q<0x*EZlC4weV7>QYlv&z~}AloA7Nc~NsUR^TcH_tDi8;9pxr^H2lexf~!u0y6jW zq`+YRtX9z9*0xt4A_dUabDKp{vi&vpNk6sxIpF>lW8i-BdBX9RZNccb{?i)2#tB^a zBdhwf$CJN`(!hDZ{kS0M^ZV81;nC(SOFg~IIguqLN}}J-K{fc_Y!pl*|F%(Rx=aWJ zXm9|56ZE7P05CK2VK;V^RpXpTkA@xcKgN4_%MYTc5BfuN4uOc<@xe4`XAidaKk&|M zLO}hk{4juWk;m;oaN>gDtd@kg|N7cx!1Jxw#{7XBg-^!OP+#w*BB!Xh*OB6Wf$&;> zem*HnB*TSnARnV6!lT%LY{AztKGrr3ZV2QQ(F_a~7kuu3>UPxq7$WzReM5JMn1EJ1 zQesneay%H$rupzFRP@pL%Z^Idi7f{5H{eSER`QiArJq&v?5f{kWInO9x0;u@@^Nc1 z`?RuLAi#9sA)f`e81oa%mGPCPnRz?17H>CV2kkA3-h9uWP?)vuKO zuw|^Jwe#*0E)YEBH8t@OGj1_q`sikZkiqcu^nSqGnwwZ4Nv%_0hN5GJz4P|Q>L3r$ zKYj1$2nh*UU*L|;Dj>DpF}sYsrxs}*7zn4qcG&f&qKez!FR8Eh5r#BeP|!(mq%HA` z6;4OjdgJ8ujeu^AtJ$+>#^&aWi-V*H4$emX0g6X;hi+cKo-#Oy@LNsbrO2-z?Q>#b z8FokBy-S;0mNKy_QrDbSo|yO;g2H-#YUK5smDx=UG%QSzuaaNBd~isJ>QUyPEsXNI zl~g+NrslN^(H}nWtP3P{s&&2a7?PI||Aa?=TW7Xqo0L6{o5zw%zhUW3l9FO*wrXDF z($v~MDw39QFuN0wK`E}QXCRE~K|sl$N$9%6LXC70{(!;~*uX%x&c#raXM>KEe7kEp z+22n#>VBG%A%TH#sCF+d=H%pHc?_sI0nNWN)d{(&|KRHZc6T?&29~9z_BIurZb6e0 z%>|#?=0i5&lY>|RD@2XstA3;0`ueRr1GnW_QbkQx+;ivTPgzSwjjmZw4XBkISfP3+ zW#^QKhkip4wkMk^;>YJG$%%2Q2MQhnqNSWYGEhh_6@*8|@!1 zEyZf)6^g!3GU~|NfXXkzZ36?c<-KBYd+=q~)MS^Tu#z$6Qb??zcXUv}t1}c*zo3f+ zmhaTWS3zDQD{Bq~p9c?pp)#U)TGy#?FGX@03k-G+_Gd3*5#W*d%6!o(rwLJVJK5)% zx)|u{>O4|xwcVAjD@Zlkm>HdI8UHZir>@QyBJ|k+kwO$v_+VC(45sFV>P1=(s|-+X zm!w_4$zv%Qr>3;ORuy=Wk>HDJ-ZefxP;zBQM&9F&Rw-Wd_V7rYZ9P4!dWuvgq^0@7 z$pLQ?ZSBfC6F+8gfpx(^5)DfPCOY0FU%!hNC!v(g%*qN!atRswMz%aY-d}f#!)arQ z{^1OZDu?0PSP-hGnBG(gOf%`SWOVYr4Wg>5>3MmJKVGpr9K&!PD%(XiW&;CnR9Cmc z?1F|cE1O*wa+Vhp8v(nQ`t0*pCM^8iv?X_R^b+7hO7hZpGX**{5|VHz z%;#851{k(n0oHNxor(4oUrQ(rzbD&-LP}~`L{PaQzeovH7f#3C;;QEBgg@g~*E zwarJEGF@%0X5e>SAynPmqS^S@+uKQ3(aibDh(*HC9=TC+`b}ByuTR$2`cd@0wNfXb zVEmF2G9y%NON8Is#Tlr`gikfM)pv^AfBmGDUcMJ?lUu52> z`HA=F8sJmsb+3{?#k{U#YWmRO+&zINMAF*EW}Xej_^%I>fE`ELp&E(uPEo0RT-Et4 
z8{EBj?Lz58(Sc`}*MmQNc<|S&bok+wz!#YN{mO)U%E~0e^fb#i(caGAL?gaRo&V03 z2;TcaO7O~sKzrDUike!7CTa%(r)UtUX#UzCQ77zkVqxLWJ|&@&{@4D_zS;!rli{=$ zIBN7$3q`|cn7=oE*(<-m0`I)0_8`^^Hlez>`1`++DzL@%kE!07YEt7A5<$S@`o~+m z>?NH0z1acE?^g-GlK6Xv@`{Rg&OVW=uvO>A#)eLg3i3!>J3BR1>dW%$*XX5-;-}w_ zMLR(n1W&P$Jlv)v(^5<~y@@8neDmg`WBQ0j{B!3Lv#%s$7$g|`Q3bw z0@~zroNf6K`HG4n{pSHA1@I-c?DwNncm2qxmzJbg&@l4o%#S3CdPKlIh0Nfe*dt+xd#yTwK;{5f2^6n(Ozxc3jbzEP7#J=x zRLpc^zQ^5oin8af?%8tyDpibf&wsQbyUxn0Rs4L$3)emK4ag>U{V6Lmv_5@eVAtT? zJlL*#Qb0uMHXm&%TxQSu(lshB?%<#qOg`H58)rs~t+?o?so+wTYh0&-sWOhwMv!^2 z-re|)uK721!`2hKT?^b2(~K5K6r5soOcp3ecmm0bV8r&ZzF)7S`7u8UyZ*6?wIU->~=Hhyww+=%@{#A@`)e?f<|kuYcjIVm9>2FZuK5+w{{~kK6MK&4n~Hw1kg~ z5@a;Q5-}5#-@8V&unFj{?EMm#(--oO+SY#k+Jp7ohWASD z#o!Ol*}3uybA5NaHzsS1<&7l=3s(rKu3aJ_v7O)WgjC7d^#1*#_wRo(Fc`oc6Vqux zDM3B}nk@)&fC{Iq%)PP_hrVv@1w`rO)YLjXCCyUTrxqg*k}{^I1kAVxhmNKvqdPhk zQaUzdV@oq??#sw1Unhn@iQH6c58ZWuJM9T z?HhcTZo|-EYlr^a{i(R)rT$9LgczEb0GqGeq?o<8~EU+t_%#?F4kAr3u>k?nV(NCh^2a*RK6S6$Lpglv>~5Q*3VciYyK8 z?!G<;NrQr1EI%d!mZ346v+7C;jItUi9zk@Buf1!J4;C%_$edgpfLTe1xNg)NN(I~n z)+JoYU(^cfe*Ied?z#2~s}&Q&?J}7YR4+R-?(ez&pmvWLC>Lk~zrknI^YiJRo>c)< zekvu_A<_)$6-Uej-kVi~6cQ5bt+8dibd&8pS*G< z1_ja3B$O1+@(OCmD|lQVfIaT5+r%hPOC|RC#X`c&w(>z*naijRo0Ah4ii|!N4G~ej zDZcJ&)b4_LO3cg@6aracoBJ<8E)H1-3>@_$9RP}skFf>^`0u4$f-8iLEtrDm3scww z1%*5SC!(WN0ixvNFU0hKQ;+bvaN{90e$f*Xv{$h|LL8}=H#3t3k>?w4@A_$Lq!?zu zR#MxQ&;8Y;*st|cfvoP_mF9!rd-p+ukk79qh;TvSE+$5e>akGV(_NqmMSI}%!M{q| z-%RL>ZkG0uqI#?hAH(+I6#>EOw6WqLr8B2#7t zQ}1WhE7a6UOcfa!7Z46k>L8brH>_9~`0y?y)b7KbodQfuw7k4`xah)sE{Z@^YrMuU z)1O*oZ93xN@UVc%`?k^oQ&@C?*&R^Z*yM@_R5J_mNh0*|30h#II}Aa9=S+-Tkx7*hL_j$RQ-=t{R;FSD2P&# zO1dHt2hs?LMjfjj5lCrob4G(jGzM;Xre!BA^V-_(!==;zA=9CChe84_Qc$q*hSkNz z0dfq8+cg!ygWl}`{q~jGt}e5xH*R3y2;8XUZ%ztzy2$oVD3DW=cR{MjnBz;e$Yl zdz&cHFUV^rC%#Y$mG(9?d@@(xeKXnB+nd+FB4;J7O6+Xy2v~0a<^t4b=pN6=O6{T{ZAsqRu-0uT#0xr4ZeqY0q78~X(s=J* z;d*j(pESZAs4C#-aQ4g0Oz)ogbK2VNKjcv9^NVp8mnJ(q8yXB&N3u9n!@Q+ZL0Ea0 
zO1@I!ShIBR!8IxXj)Br_x7fqXwXFpR&cU_|#PZss!tg}xFO93%1bn`ck<3uwJI~kenCt{A;gvlYrRl3*t!>}=uV5)gCuN+< z5Z!_N9+W2!E0(}X3)UG3gH*mg1uFP_=#4y-v7Ke!?x_sAG03&xvTu&=J%U9ffLf$0 z(zhl;=L*4WEp?Hjqazhxu0wq=ALzQ(OScTOs7o#H5dG>%soYwarY3W;=+2OQ_RJej zeO;w0NjN!uA*O=j(7Vqv1vEQ5vasAE zZIJP0*}&OmXK^hxXby%f^A)BaHumLDIo!aahFKX{CRcB*&lH$OgtJ6lB(W!g^03hM zMNF3P5sD`pYvYsOzPX<4>AK)Lewh~#c2BsO%COOAb9%o}=23^pJ6~;BNHVmPl^^t* zy%eNjfI@{0@4-~yn@XkKiE0}cbQmuZr$pvQ@2{j zrK3CVWTXyw(5d;s7no$Q(f!o%EoZxCn%cGd&b_zY+ehnqM2H~RUYO)`m&wV?XNnf`<67MD zsRDL0c*|MYA{wVnqRI|Bj9<$DsU3Oz`26|vhH=8c4RP98G~E3#6vt~d))Y1Qmo-@w?r^k07u*`hIeyUY|#LNspfIKV^yNYUg-eg8pCoYKMYw=gTyA)A( z9a?d(-obA|&XTPE=Ub^mu{%(~Jsgs>-?>)@hedTK zEu>jG20NvPF|_1o31Muk%%3#U%shH$Fz(YKbJ|LH7H@@cCJ~`8!1h{@m}Bww2LBO-oCQ zg#;H_SX|uR+Nvvv4Ga`@(43<#rZ!TOm6Zi`RJBfe1znXdlN~-JlNgwo8I@DkC0DOr z-3oh^{~hmk7N{>WN6mYG{CHLucr{=7_3dYvA3&3dgN;oiATFNxe5lyU_Q~%@hteJ= z#l*(e$9@MaYMu?t_$S_NKHyAxcp$>@B7iQHnu@8lxm#>)wG0{db8Ypsb-`gxTk}OU zrj!LT%q~4^bC6~(&2$t%AI1>13!{CikyGOLaz(t&J#0I0b4qNpk?)ucLEN;2A_CmC zMwhI%=D&=^j;;T{kJtY+#$${9Kl^lb)mhRD1}pW}P2+fa4A%4y^n~BwaUUt3wxlra zJWp+(S{7)KKyR3kpKslAG}`6;e(&qylAQPD%WJ1DlYp^3`M9#P+IwF+UgCJl(w0f^ zi$ZNrq-TxMzFT5~2b+t-i}m4;R&kWE0p6OGj`J-p#ybIM5#e8WfpYP%acWM;r2Y^k z*sG+jYPcPK^O9`VBEz0!)4TIF?XHKUvTKDpY+F3^4~aje6_|Bmlr!X3Use7|*XrMb z(fVLTr0xQalV({l4!WQp_vXgC++17~8sV#&-)A2T47{txL=qI4R!16}8rW|0TrAjq zev{)u@FZx1fHKr#kSE~C3)^IW(2%ez5Txf~6i97r+KTUi6BM#)sINy78F{(T!GYN4 z^};0Mu-nGR;*X}J&4UNR&N1-^HzQhm9ZXF@`E*dxYqi<4!I>bhrGZ@GSb83%L#_>8^xS<`!{^6kDPM>}IPoXjC}!Ojy$* zEGqdFNa5$xuGGxrre=gsb$+zGZ!#ob1}^$> zKHKfBUv}MrC3)U%w_8?)vbw?(rVVM>PbH7C@}=bB;`k^C5wruRa;xBcDJK_u*YT&8 z6?U~zZ5P3RGz4q!)HEZYW}B5cbEr&&mew^T@oIdt0XR-+i^K7<9CIf-$OX96t1`Ru z=NipdDAYz!@m{%aaexq!+4Z)xXe>@G0FFBAp7)R`YplcBMoAH$ZvQrJ${+a~l6tNdLX$BBF+mRsvw3+=n7;)aD*MjH1<4WsKm$sS$0 z=sa;CHF@yn{k(#Pe2(#8`gXLe{knBfet}DNW(k?;el6Phh>Ie}u8r)Aa>>F?swHMR zJk-r{LvJuKoQrm!-{!bLK(K&=@RhW3-34|(`VUNmTRmXtWxNQxZ*k1rU~_Y0o}Hvu zQ~QsPme~aK^c{?37~)q65lH6q3b~_rE8=_VxirdNRDS(xyY)6MR!}{!-%LQj#=TM& 
zdA@Y#Hu4^CMV9DJ>k1XhiI$gF%8m3ApZjvbtA_Y^OC3cHQZJvn*qP{>9p%~-oVMwE zdQb=s6%{H!Gx|pr-}mnJb});|a=z9?!SzCV!^TbTc|g=DMVhe2^{JfxCAUgIf4H%;JL)cHi9tB=fO0??$8Y5H z_%0~#38_jZ`hWD-ma+ER@H+AD3~V1tVC_dn`^v_=udjD&8UHF#^@W!W24{Zkkfck+%clmjd3{62)R5Qf#T-InoN2Hi=(Rk_{Pr#5m3OC$_8{`;Aja|W z)3%zW7>{H!JHcZ7+8kEjtmyggr%_a_U{!_kGYY6s?Z`SqTO&)RS0+@KWT#i#1q zhI8lGts+X8ev@Xh-^|@X^ z%*>=Qj%FN8=NKeUH?hQd^mfSQjh;QM;`szvMsDuiW7AJX^kEI54gA|Ap%12gNiQ)l zywS6iN+~WDXCX1U^7M9$#w!gi<}l^x$jcdopR=>UzL(V3>x%!3l#0GSm)M+DiCXC) zG#U}V8Za7)Qs?;HEz*c=Z+$SVvZ2&5P zX!Z|O@;jo#!^;=+ygZNkAHvy@BWaa801^G)hqLy1-&z9CLA>~wDat0%md^+y?%}V8X-i{sV&nnfKoq3`m~-`bfVqm>(`?^Jkz3ENOo> zz}9-0{yRSX6V^IA7iZWJSKJ~7KsD)~f$*`%Z`>uDy6p@J8sp@(CHVs?fgFwnW>AS7(85Oiqp!04wpW)B^S_!v!>~Mglyg4yvC%hV!NBFPfT6fq~}=M;aXm z{av?bQVy5=#M`(gGjOWCwRt|{QLg@L|9C=0_RA`b9l`sKcyMvB^@GKWS$Sa+B98-c(O+@V%X z66u+i=1WEv08UuKdUa_)7)VVXx`>01&;ZmjQ5_G;R4pt{nvi;f<^!Xxx%+=NZQT>#y@v1`vJekCaUZaL_VwqvO4atyxQaSEPfFShhL?z5u7EuY*@BwG9mntWJ*VlcaXji}-++(VoP}74t-G;1#v|(ad~j z#P0d=O^m~xW7Z2k&^d&WkFKLBN*jdA&~eJS{i7=2OhEOj9|t#3^&+aNbw?v5`LtL26e@lH8|ZM|wLnF-=Eis9lwEDQQCo4h0fC27+wlQ4I2~H3sp0TF zqFo0yF&e_6UdSdWXaN*xIJ1=}rGo72hY{K@M4UJd@>)p%CptMfysIQI`l5`Vg~g;k zI355RFpPDEuU7XiP;J5Ym7SFpw7Ie4<5p0>hq8R^y=$`4??)?7i=f09BS=>6v>Cs$ zLJ4fD=&77nT=#Pw4 zMDv`MxS!6CSJmE?lnkt-UEhBxa>85S;t=WS`AngLq8U(qkCc?{Pcnyx+Z2~cNuj&Y z>&aTez}*fy0Ya8=Du4lURQ_9j5HU_}ESsh)6xai(UgR0DD%$0AflPpziRlWzt?l7X zEQoXc{4}$*P7tE=R+z}$_j_+^CrgU&y?j-R26T~2)=ciFVtNriL!q0SHi$sII-ZJrQqU!w5Lu{AfXb(n(^+dFa@~P`zKohD5z*1-5X~}^LTL-HmKOH-g7(5J z0fd;?XvCqpriKaufH7O`-5f!x5Os{P3*3zCpLg@G8kHe?jezp2{Z>O&qWDW%_v zxuTHxD9|NtV}qmH{%EycyXd07U%CC>S{z)pSFb97@ff2VfyI_J8bXAQh1jnWaB^HQ z#=Az1v7ww*V8;5DJsB#83k!Ae^{ukjob-njdZTDa9Ra43#yyK%u<`>m1Im)*_}~(N zhFqakH^pNl6w=k9Eg*2;x*p~V6=U=6mMNt=!|bf0UM?Ra2@A#eK!D>a;ZMoW5eqQ5ruoyXAqZYDw#zC2auI4z|c(g1!Jy zr}8uGTf z2AA~9wMsd#&;Vvwxy|fBcV;Dh>!p^+Tk+L8pibi8*tl}03DDhJbEi80d}Hc-ZtArl z$ENsGAtzdGN-?6w@T}3&?Nm4*E34kmpUq0`m#>C}4-O1uYn9toy$J)0ScU-A$|LJ^ 
zQJT-6L%EkwgIu7tdoD^|R{HlR<%Cq9xRjt*3)UFhW9AVTdDi_Pun9c2X1Q~0)5EN?g2>j zyYJowk>y_$p?))NzB`GJLc_u?Q$%NQe}bI@kiyQw@_-=|D36v);&sy-4G)Z~WEE9a zZ-^016?OS-{Z;z*-prlUdi=O$d~VZGaKluXJDSGbLNGmF9EOec>Uu>TQ)tA_thCxj z=}P>HQ4Yx9i|$wh*ff^Ry^z@RRAg&tMOKxXBta&cpta3>gwk@BJGCgE+JUI1J z8@qK&+%lzl1?$9jY-}A6k2Mw?iKm$P(4)gi=?PAOU>~XY|Cp;JHM@;=gXe)!jDQJ0 zl#*>D)<}#RMI%law(YhwTI7UEG+s>n@&U#Qx6V+w2i;SHbkdQP2_fEYn&JxG-n$eTnvaq#dI-2pq65Eq|g#xnLKGg1RlxpL*; zmshW<0}Wy|M3v^K2RQ^z_4O-}W}A8+&;e&^QfT(-oO+q-XY< zhLhdo%nQ6Hc>2qZ5$nw6HThllrMGeIxLYeGUGFoou6VzlePh@FZ}Q&0#2`>3smbMNF`D=GBMWQQOH zcB7rE=kM*Ey6^;IY5=x_M1V^H+E-B)ZcGorc_Hw#%&0MJIWp<{dJ72WHPxB|&bP&{ zT|fi+PzoTOY}yqgEiG1!;qkKZ1WJA)<^w}PbRs4yQ$S@BIo^t0NCBkPZqdG8FBewT z^oFrbjK+v;Xv*OGS-JctEZdtiZV@3NC6=Sy2n{n>94r9m6`EHaZ?=wRs_kszs(pRR z=UaU+6WxlO`E!e$6qo$FB%s~QY^lgXsBUr{8nkK^U_&a9S5V-7a-?c(>{DK@0IZGC z+QF2h57SPGFJFEG7TDLX5+I<)^cX3}LcWKyng)klghkQVXl@#MH~`d{smz-3<%Lrw zJ6iX*)Baj_{3K&OdcVW9E@Kh9uvbse1}AZu*TbAq08t~vS?w2;vMY2fo` zMrc)l>S5YTTW9~XN)9BaKq?b+-17m3uGIUzIiLtNdsHciAjpMR_wR@)MxSnNZ1k7d zxH*^3yefuq=zcIS;PXqjHLkKH#z(WrqIar34~6Eot-Hohft*OJ`;7DwaQUGQz-3AT zO-ut7l%6y~f6j8lR&B@(TH{X-VWh(_e|L!j0l~K&moND*Y3Tf0gS#sRZbMBh0e}6eBF_^sDT}#rapLZ06j#(k{6-DMm+=U zxua!vq@`t=K{Ib=U=S1?Eg6RjDim}C@b&&-`So4uJN5*JyVQ8d{yv``fp#9YYsFK@ z_-KRretz^4d7P)Z;{{z9!GHI~aH&4fz03#lmCAR42Lt>jX*cO8=})(^oLA#(Cw;1G zKsVX_GHb(Z#N)j)wjpV`YJmE*n1s0Cdd^qdgv3U0 zM5L0|@Ug0DA~|UPFu_?{TX&9eZ#;XVe8U@*WPDUO`d98-$N$W}!s zkUI6tD!^;bWxoTm1#~QL=$oze&2eC4#7FSyCG6L4-rUV_Ux+`*o}KE#M5u(E`zb0M zRtpz&``X%!Ky3xS2rxIl78LxaNTJ&>H9NaxIIt{zZ*Oq{?cB?+IXQB&vWk=lXyH>2 z0$*I>kk#XPLrg(2JwZ#xp%?ndDB;m{Bmw+Zf*J?+B#qWTKn6ON6mY@_B)q|$Jo)r1 zoHLV9aRh0VcJcFtw#4BOMy%9N3_KO-kW8R`&rtG(iP7)9V5P&V^6vyBLQWnB=O9q9 zQ~V7+QYSG8k^@k-K;Nswh0J+qUv1R4yIBg!hyQCe2b<MonOZ*S9qxZbE}w*(OmzVNAQ`^L2P@<_RHON`~Ym*7Y4uyF+%ND6Eut^3ujK;uTx z8mRM_sgxLZnpkhJS86a3Vb@zh%>u42Z~Vhq6jabG!N;E7Sr!5E3km{RN1cBNCQveD zqEGsYA#*nLha=LUT?d=RBKQ`yD?MRILVZZR#QG|-HP899=d)L~wh3E=fN4)JSW9S^ zAE+a9-Hl-TeIFVDK|ut|1KPL>@E4XiuG>C)<_Q2>+(@OpM+RG<^D94%*Cg5Z<3xx` 
zuNJ+EFG=UZ-_?4;6h=?a{!1ssK{yUvpXjkaj#Fy`%bM)+WfL=LFcy{!5~Z2i9qnR4O@LA&}Db zXJGi05%1?GGm^n2Pe+=VVlq;ThJEbtuhPlSUN2+Esy|b{@$+-IeIkQ%Kto((f2kG4 zC_}&W;QnQ$(Cf&^Y4<^Ms0JIz^(Z(E@P@tSh&Uhmr-#W7HKyHPVfMWJ-!fJ{p3IRZ zD?pqNCH^h9&$eX1H^$C{9FOO;|7at&*9p%$5nQ|Wy60UW1LPs( zLeBZ1LtjWKFF*cIX~&4sVbzc&t{;4%pXgMFvN?L-Oh4o!lN*JMW4H8nLT zC@8MN?n`|+w!JU&6E=s8<~Ii>DQp%Jr{iO?va`WvWPIJxmQ);r$_h_rCs7IT$ETAPLiT;KqJEJ z-Wlwt=FVk3P*)*wOy2ocJkc?7t8J*G>F1~JL$)rdW$krWM!RM>dWPI+Z`^inux1FvA@`vunp%2R*Art_Q=?l=Y&M#}&@0 z@{fURmJF||2plcdG~F=tuMpN=Yki2+H8wRm#Z*nt4;JO{nCssqQ0!ugOBQ5fzTsf9 zsApizkQu7~Ngq{oC&`Ta2e-Mt$+N8KtK?8B=99C=$&J5o#s5|2tMuwdk4sa;dLk_h z*gq@$%A=+mRrQh$E=N=J_I9T-rmpV^GV&NECf=muN=h3m`PC_-M@#D&13vXlYf*EV<(XCP^w#Q*7VDXg zsEp4UilItNM^?3ySON7B8o~|-MVtfnW-cbD=|8`TBQ%5iT(-aoXpwFlDlw5VZf_jy zQxQ;eFV!>P6k>B;jSTagvuW@BnRMVbH|k(C`Y!ifh|4L?pxe)aSEkBp%6^{>;8Rd( z3SRg!9PIeu*QW`&h?b#3v&F4y&Y`kmQ@7H!9LiexmE`yF$3YI%s~;9VB==p5*kL0K zp^D>kEVXHEp>B?P6sYjrtTsniz`t<0ucu)I2eGCyl-qYUDQj6~^OAf`jl=HtL%Yz% z2_$}&B4(9az4WI~-E;LLcD{s=N`DDf9xf$!-&yOg+&STzUQ8#q5y?HSgAdHu6xBA8 zDp6DQUP7Ha41rBo_=oLIeDuU!*AkMj+dmyNSK*mGYLuVaXN2E-Hgug-34p^QDY=J#%#7b*_u zQ_Rp=>D90MxXD?qSv!!l}rFk12WQDY?O1Qgysc@rk zsT_ZS1QJKzk)0l!7=!Dnc6NVfl>njMbAM`JZ1BvyJ*B89|MP_E3a9mLL!XeKzPgSc z>$SbB%lu`d{uNniN@=a-Taxp!bA#|m(facsr3MyrRss9X&~PQe<`)^MskI%tLE9m| zYc%6x#*fnjw2PFvv-HAp3n>Q%3JNPJ97w4NC3V_fh-%TXGhIjZ(B}+4Hg&Y~>m3_` zGc@wH*y`|L`MiRZJ$2i$U)H6WGYcIHO0GAgQzVFoUkLtwM|<^oM2px+_^r>7w9K+1ay~=pks23uXh+2QeBwDFyhH3C*NQvv0pqwD8ADA8+9o8K6$n)6) z$~ceI3KqRt&>_cf@~S5~_Pga;<)e6n3fzEX01tgtS-!FAv)s#PA#W_z3+)CsvV>ka zZ+IQ;Wfl!Mn2%aIR(jtjB(xOha(P58RH)_mxu&TB54oaAS>kG;=op$<_f<`KnC@$; zv%=jP+RA4yBILBRHmk`;Q7XEXx_eutPA%I{oe3B0rWL1F3d+##V&J?!QaC*2CL*(+ zAD+%y_;%5z_DjL=W@>A8Jjl*oiDTxee1>81%V46el8*H=>o;ddunB#Bb?fMjBy=A+ z3@3@G*7T3vs=iiRD2udJm>zDq7^m)>F3&Ao=jOENEs51tuZ<4kSzKHgbTE%!zezVO z+i+B%z3z3iTmCYd{;r;AKr@5;+T6r?{gIsiG9LcmMNGsX+(1y66fv1DKt=ljR~t%# zZjpZ#BR*ppwXVT?z8nFw)>aoJLBWB21N~b=n@<(*LCj`t3w3{Z^R$h?o^5Dv*8p#S 
zi`M#nA}dyanSeAEy8CTIRPFCrh>%eD!p($nvBHC4lg4+AFSL&Z`hCbbo?Uk`QIeCh z{1qBx81P!kOUh|`jhvX{t;}4kkZZW0(8Ab6f?|bW%-YK2@pV3-i96^B-et0}t{&rq zqD~l%B<@_oCnkc3uiDX0}-rID$>WF;jJ7|sUmi>Eu6#h-;EKG zYu~$Jaw+Um7$z;+&MDS?2~AB+r~4&!can2dA2ueNy*w@)jQT9k?Y?PJZ7WsHGG%@6N4|bl&%k-EwOF z6y5yxSX5#vB9}bhKIj}lqMhC)FdNv=$iPGk`r)1)*f?o&M^W+;vVoqK6P44`jwkts zTmqK2?1P4=`VTTH=S^twhC!an7GLJrcp)+9>-ou#yJ(yV>RZ4OuFMb({ZT>muY@7g zflshZVQO3^emvX;t3S$MRqbWb!|AVSQMq&YW*^UG_q(soeal7{FGc>1 i^ZhsX`~T0s$JCijk(6IE-lrk(b6-;S&KL2gFaH-4VW3+8 literal 0 HcmV?d00001 diff --git a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/github-actions-ci.md b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/github-actions-ci.md index 595bd41535..556fbff5b7 100644 --- a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/github-actions-ci.md +++ b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/github-actions-ci.md @@ -1,6 +1,6 @@ --- title: Configure GitHub Actions for CI/CD -weight: 5 +weight: 6 ### FIXED, DO NOT MODIFY layout: learningpathall diff --git a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/mcp-communication-flow.mmd b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/mcp-communication-flow.mmd new file mode 100644 index 0000000000..cc6f32e9a4 --- /dev/null +++ b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/mcp-communication-flow.mmd @@ -0,0 +1,22 @@ +sequenceDiagram + participant Test as Pytest Test + participant TC as Testcontainers + participant MCP as MCP Server Container + + Test->>TC: Create DockerContainer + TC->>MCP: Start container + TC-->>Test: Container ready + + Test->>MCP: initialize request + MCP-->>Test: capabilities response + + Test->>MCP: initialized notification + + Test->>MCP: tools/call (check_image) + MCP-->>Test: tool result + + Test->>MCP: tools/call (knowledge_base_search) + MCP-->>Test: tool result + + Test->>TC: Exit context 
manager + TC->>MCP: Stop & remove container diff --git a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/mcp-communication-flow.png b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/mcp-communication-flow.png new file mode 100644 index 0000000000000000000000000000000000000000..85683e96559cf235cb2a5edf1cd7621a1255ae9d GIT binary patch literal 55743 zcmcG$1yq!6+b%p7f*>L&U8983f^<3}APv%~ba$u903t2YEe%rAjUp{w(kKnmjl_TQ zzF+VCt@VFv?>*~z)~GOXUvZvCU3Y-ItoY3vq&E-<#LedtqKXK_*?uy1h>?n ziAZv2XJ_EVV#43rZi#NjgQ^}1KgApx_;>_q^Ptnk^ClL`?-D%6`Qu!Hr^mNQFyP6A zk{t)0dIb!K;VH$njSZeecEVo4)9Z!eG76D=(dIAP?7r~gucVhSB9`pcv|e!S>`16* z;QMB)si?n+D6#GB)3*syJ0_1Rzj9P0>B}CVRuE>&k9&wvv-o7)_B-zJgp^Y-@l6|P zIrWU~BZ5a#(apog5q?CJO06#+F?KR$7_!%tlMDLD33NXAk^kUs$6h5Lj$irHH^+Js zoeX=tKa6nQ#zN+N!ju_~13k6 z8m<23*mF*Jl=dq|zp9;y@reQ+{tet^+XZd}Nkco=$eh!1^ zyPahe&DRwa3PjwtczfrII-74;if<8;YWbyx5%_97A(gq0rC}b175YhGY|K$rU zEU%|{@;g~Du(0SAt8sDd@9*OZB=JXg4ymyD`m%;nPU_Cm+`m8jJi^Fh_npX&P+c0+ z9YPA8qVXQ<+Th^Ca{V{w_M0*a0xp{y zRrb*_F)=E!A3tmtsTPKCvKU%g#(&&BIaS?y5?Rkw@nwK>=8MpRq;O_*52L}2OiI;7 zPX^OF;#H-kbIIOsT?DwfxkI_T$w}SpI_@+?GIDn(otCuR@B1hjphXuoR?gMyL`_N; z{jlW4cRDUV<&F%+oU*d10_}5;gGnBZ?P+i4;{)d!Z{ZSGdvOi?+qR}FgW)SwuXh$s zkH@Xe2l^=wmVSNKiO({g&e!x=TcgPRZ1^MQt7?JPoLxK~O7v5|pnC`|mC#C)#gZmo zfTZ_{?e@`57A8{5ilF)p3CXjKeOXiajccz$vY%bIl4;j@Iw;~gy<;(2yUxvc z^b4n}EANX;k1Y8IX7TED5nVBl)*H^(Yj;*qDBSB91w?_JxpH=P$J@&M6L(M& z4i5O9t~(3&gFjy`El?nzPfx$z8GW7*uUY6|-1r)Keq!(R4NWB^O0#K|BqVbmoxo4@ zi6P)!r6}feS=kg;4h{~n;B6#2LBN$)xIvLxUhEmIY%IGb=ee9Pv)8eb%Yt-z^7LU$ zjJf$RQ{I>Nb93JrRartP1s^iYge7X*oq0H%pZN$4?+?vaOfMu0dz*cK_ed`3j*)Tn z=8<7)D)$uc?N;|+iN9E|km#kA6+51U?!+3W^$AWsK6bhV4K1zR2>MVMg0T!KZoW_v zd16wxPcP_4uHRbkcQ&=z#Kmp3*;8$ClAQg>*4;Zeknr(bjuj}#ZDQ3>+&$agS zEL{6>kwZjQR)XVY=E)y-XQt{j(P8(+_v7QZ_ICe_X9hHLu0#>o(%G~jxt(~PfFW0^s>Ra8XPpaMB6c{+f{pkU>k?ASa z!U88+Dk@?-3n8b+H@}pX(WB=U7k&Mc$@rZ*V5&bv5HK;v2a|I4eg9sXLVfo<| zKJv}?n`*vB+1li|W~%)6?^-p^8&0;i;nAw+Q+0Kw@%DChdLAB3mQ+^#-YR?MEaua;XTUcW{8 
z^_LAMGCcfOPqOhXiWTRxlbjzvcK6r#Z(Q}&_2iUy+;3@qMo%|I#cB2_$Mi+9n#Y-k zgxSh{d;7p7!N1j&4moQU3c4E+^bgO^$o~B9HykPOUP`3E^`HIKr^)#=&7IZbb;+#0 z0vDTIEc^XpC2D!EQg7Hm+}yW@q^vgzv5+GLlNCip4vvnhk>`6OPN=pjdj(uv=B0v? z^76H*$p~3N#}x)k`t)-Glx%Vc9_Ggg`W_|LA6=0aQ?=3crA0-h<>jXb)4gMtyV7RZyiTt=4(t(O!k6%V9ogsGU`TL0i~u z2U~MNM8Xj<&YH4n`Vn^IW{nEbj|s9ko+AN-aj|gd5ly7{aGyvi5!?H^4<$ zjhD*_VqEtpjE!x+{Kl@yFR^s)XEECwkt&;1;e7EjECM>|JonY#zZEf&mg9T_&(p-^ za_j3h8uE1K+LE#Y$~-YllRhhkhd+RS6+SJES+A>e4~5Uae(TG!6la_ee&y~Zj33Wn-DM{erXZH5#^Bx9cDLcCYkreVrdN@e* z!VPp-SXdr9Od*Xfm(XH-cyw%x3$6h9OT9D7alhU$^7|u^@2}!ium|EN2RpQ&5YxmC2 zx15}GpQ?_O#EgAxDhA`evzUg3Y;OKW!qK9k?rA$KhEf?`Eh>tKmoE>rN@Vn({&g;c z(c~!M;pIKp{AKUv<~G|Zm&jp!8%e{y?FDb~=8ESk!MM2EH4|-1arIPra=uc3yblQp zfB(cTZ8U^BtWT7+q}93~EbJ`Gb;a%N3}k&vPhTu9ZqHM({Lnd%O?X0$AKB7EM^7tw z>t>xSW~}h}^;l`K5p(N}m_+UEL))*}3IxeQGJ%1CX4Cch^1WCn36aJmc#qDgd?fnt z*sH>zJvS!C-@|LP*0sUN*f@#L5sk<0y8XlZ{AT_a<}@)Gk7K37L)#}XmbesSFH}@i zT8I0dB?&y|bzCtsGTN$k9M7}f5Emv5)=}*D9v&`-|A<%;KIwin<+9nBvh+nM$Zx*I z`~CY{Uh#P*pOCQoY>V??3IFEJeuBS4i~-Q;3pBjz-_hn)K!;JxJjXR2;oztu~3wr1;ij}h?9bsS=* z@Q>3CR6PdlHIAzaYKPm+ffG<-V66EaSKj&h4!yqgbc**|Xwi+{>amdRoyDH|lYU>k z?SpB}NXhwkx6{-=$NlZcxW18+xVMA@BT-RY+}YhNEq&56{&&@BZHxdVAW$nH_*cj? zE-LZ#BMYb9+*Ou}fH!Y033}Vv9dfF4&)a#Q9aW4J1_BsCX}WQ64498c73*gesWAcC zsrS6zOSi;V9>T)@vM~%wb*yw|YU(S&3ZfO5mNn>O7k%*Yu7SGxXoghk+PL${dd(J0 z3;-i}HBrn*oSb=SemBWfczM+wmw#7Uj_bn0R?g#6j(yW^P@vl|3k#H(grrct#G(yP z&TTtE;D{CZy>&5uVq)7}_0iHC^^2Z$@5#Y>9VT*@&M~qiLsIG4v&(La#m9u4Ee9xy>)F4a z3ge*2(yE=-y63ryE1b!NJWA8_rBD*EA8MQrP7bD1ngdn)OU~o1#fB?g+>Z58ZQY5N z_?7pPl`c$davf4&1zdfj&agYHfaP@9qvZHtzjB*chv{ z9jTZO$*-?eW9shgOr`2rndjy0fp2ytZe=*X6^3lcy@1nt>Mh>ANR>h2BnhSCwTjb? 
z$r>3lFD#^?;U{jZJBcqC^0W0dKnfa7{(U%0zas&(lEo9Zfa0GSk_)MeZc0g>0+u@A`bM-8@|L{Q+GnUuB_Sgag%y|1J>a%3gY)XKCgb59m2k5dGPz0RixR?n^ovA z)xsOfw7WYW_Yumf&OS$DWet-x&ivFbdv+G}+$t)Cl%BE<7CSm%?6II7Em#MyuPrS9 znfTSos;yCOu`?SQfra$JxS2+!7Z7j|94u2gL3jIh^!xWZLn`gg)4}5n!Um_u6fd&6 zbC~N=c6WCddUVIC>_-3{+Epba+1;Wv$fH{N_|aqNWaWxo(deJ$fyL(5X&g*78SBT{ zZX$9qZ229dpH|d$Tif80OX<*}qG;vhLE>rbU{dy=Y8=wX33y*DMr*%+KW_;-N+Rs% z*w~u&&mFBl`9gr48_a16I)BDt#T75;^ygr8WP=(#QE96j**OsJySMv~VLDLA46de^ zMH8UDH9gIvGUSS1``u5=&%e``Mx&f4nbF@@d>1*l&_$MuI{qcqX`WVCD2LhJCZQZj z&ikdoQgacDz>AHIEjv3q#fQ|?U^O?-);;x%Qc#ixU1~X=M0aD?T?sZy{NqPI9{-Eh_Z^Rne_RNHX= zYHjUtA9=RpsuMbFAS+ZwNog<(!||zK(2QNY#m^$i(_p;Yw$B+NxnR4d`QfCokmDL} z&&BQ%cx=wxE)8qeyGbR)Aau4+Zs|=S=teClSmtbPE!R^IEQFlHXcLC(BpNxa8mqSuV#;Mjp3K|^=OZT7LJInbDp zB_TF3kypP(;Ke|u#ODFQReobv|ILGkbkx7Qy zvO01;l_h5^-0-G!z|eDUZZk3AGhFnau`<^s`ND7CqOpmNS65f%l7&J$Ex+hBf4hdA znJ93y0oU7b{9aXEU0qYq$Y{agWsK>-LO;H~egv(QN`Pp_6^zPj*EfDvJ0`|!j=#tX z8K!4|p8<`;L%AI|sn>h_D$tJ4k|7g>Ywk{rC!KXqRro6!rze!)b&9bz-H_*f{zOHk zOUW1T8M>z@Q#-A~0?TZ&TH9tb;9?eje;0;XN}rupR+2DakIm0N&3Pg2v@!L2rdO~1 zgYxdW#&~(R@cGGjrc9poQ8Cv23wFZWTitJSI1c`;(a=2!(fJsV{-o z-YZzVN-p{A`mLn1y^-XlQ*A(k`}-m2u)B=I9K=C&?pKy$q&SSbb;~XC88DQ6Y$D57 z9aqc${$-8fwq`)4s}>e6{7~Qhqdpu&GK+~MpgWma7`7iTZ&;rWRSkZeGDTJ5mgs1q zy+^oA3TG_w8lvFGdrEK1Sz*q#7s7Ox7&fL=M9aa@4!3s^$zSTWq3k9tV?+ z)Ti!M6LdmC(;uS)9UX$qYyPsC_6M1duz2qci72-N#`-e)XDBb4MMuCtY0W`2m|N%s zg&E6ZTU}K(|Nbtv^ZG=3Zt|43Dd1DFn`wRxPh1YGY-jy|-oc{JofO#F*$E1k8LDA@ zp3x8Gl9@R^Cs{`_<4g8ZWNfT!Wb?Co;i8IGMn-2U!TMCM?rvO3^D`F>7FXvCl8baN z<)k!xlM>eSC_TB2fj#!m5aSzo(dou^hXMPWnT)Ec472cPR_r(~&NSA*_c|oRHiOfb zH%k7o#5exflW}S#ipZN-St*Pw#Y(p<8f$83k`JK!ICI)8a;_Hmyz(mpH zQ^orCk#lFKpBbozS}N-A4!f~ za3nen+%STduSBIPaaFP!u;+X@7yZ$1xJ>$`FQ)GSTMSN)8r@yhvGU(bWL)1%m3?%C zqTfpz9Q*O%sOY=pd3S%5H1w6o%<3t%O3hkR6M854k2e@LT(?U{9Ax&g$F4K%Ear~;*H|{=C(Ito+Bdug%YgLq|5nQW&olNv(7^vsU#M@+kLV4mt*x!B ztZWUX0*ES`$X8WU!)-n+J^mzoaBvWC+xz!7FnnGa8;@kl!~-#?v$sSdk?`BoHO_?r z0qyP>nfa4;;|9J1-iCQo!Zw+iPNCVjFmVBGhdi9Y8QQiazsotrt{`@oJV^ 
z{MtFXf`Rk0WNY}!YdLbE1=703p5&fH{^+J(7Sr{ULqm~WMp;=|WMpLKHg{{B);fB6 zy!~-}ZeU_A4d;&o1M>0lVb!Xnj+7iJ(9Zk1IrYB9c|>z90a{{oboBi?#Ot0*Ld&Fr zV-iJfduGng&KmvIj^@X$t0RT1y7d#q`mNjB+lPlPP)XV&=m}6aZru3v>61>a%PW2T zfjm`KDJdz`J5*|=`@wpX|1Bv=$%)BHAZKE%=)S(bqoX5K>gM+V0yG{K6%|XgrY0?C z|G>b&$cXi1^j@cZJ`m6g3l0ZU6uaCzfDfA0PneQ~Pt40`VA--Dg) zZ3_zvx80>zmX`TC#;&eMzSu;d=J?}MNMb55L~>hCNz2L_6|2e0%2HBN;$ZmT+3o*P zx3{;~)Eoz3*5BX%NWW#UxF-AX@Gw1{CPQ&~Zf>x|SWcq{h9REQ-00;?JeW&G#r`sL zRc`KzF{cpGdjTOKvQkp|#cH0%y9$%UH?Xi29^9d$qod$+2$E_-re$&G@80U-pFf{dcQ6qmO<}JdTix)zCm@iZ zlawdN%)p@0;8mBFrmvzx=-;$FUcp-TmPMmff{DA;Ab$IQYFLs_v z{wHioUc2utEeo5!nzUHB85kJ2xufyyW*RS{B$9Q#xnE1Iuc;KLZ;%H}lo!&6_u9JTfxik6JHY3=|cy?d9uK*%^w8 zeussRiDZwp===2S(W6JXByRhEpec%R#Z%znHO|{TyLTIj?wps6=gJS6e1>^68!C)? z`cy_n#utS|e(&rwKRw=)rrUv!K&7(5x3t=LbLF~9p^l)+bIE zg_;#13vW@Wy`!U5-sfkLk&&yz`Pf)k?QLx-j_Xr(e}BYsI5{~vTF9!Yskyiuj+Pj| zc<}-Y3G2-}KH?f;Chc1ZO>n#jH8mPEpGUD}lpUWxf2QDf0;sRQ^}CstmbT$^KfbE! z2-X=a-nTwJyyV?q3_I`K@U5sg2qNJKq2MPZBjY#iPoJ2W*xa;1y=!O?4s9(dF2=&X zcmOiZPW!-(VWF@86w`J8PplS;Zk=118k3OMiHn-r&-F=Qm2ai#q$DITad76gTwt$Z z-@I7|;@5D#rlW&{Qsf_)+k5x))+eg!+z;r`z@d5Q=!_PA#KJZS4-fb9@+vMa&NV;j zjAm^Bea3vGAW^{e3sk6($-D3%C0$uvbwApkZ)=m#)tx5ew$iF|iY6OK1f^~l zrW^}NFPHS$b`cNN5=?f`$=ZPS_t&lW7@eFHym#;3{rj*h_Ev^e@-=b_i)(97U{t{m zQ2Sy4GJA{G!2%x2!qT!(yN2-YT|;AI)rNtq2oday^$O#M4y&%Y?c(MJfay`J^pz_< z;}zD}z9>1lp7-xT;5VR)7%kQpPnAFX-HeUv4-oB6xWpYwO4*Ij&7%`i#?Kdn#}v_y ziNwIbsH&=hB`N5(Ycl$U7WsW>2zYFPhmbHm{UR*#T|OCpbX;6q?lkthpr8pDtj0H2 zpzg{^$tDYR{jzZpm>S5EJMs{+lzc8No&Ex5@@KLBs6{!*ER!{i?Cc-#*e!mt(M6bA zSo~RB6tD5^&jDWyJ}$*s`On&#`8Y2+jLUND@c1}0GZPALKffd|FK-zIDL+3yUBvSI z{IiG{ZYx}*!SC+@@MAweKkJEc1VRH{18XcY@-8xcU8)Au^OcRypP7!Ck6^!*h z{>zsyurTTA=?C&2B1QK~pWSs-+HQFb(9u*1BRF=7AeGPQ5Aof*%JT9etE#s&Cy)15 zv5=#qqwKLkT}Dzg_-?O22FyLS$Q74$5lG5COJN9dobpCsY&1O?nM4F zOMN%Dn2#PmPLF`%CUc(s3D_7Ex;)iF*ake72`LXdV>yfmugIU~ zdp%8uZ$C~Zdhd3ksJ{DnF*mT6hK2_D2ROL6>%FN@6%`dDwNkCDClcJ&jS-0aChZSm z*8S4F z3U>DP%DUfIv(v5M`vkv#Z)a<}JXXfI_SEM5%yUS|P54`PcgQdm1x0>b`g`<+SnwtF 
zseg5KwbfKDFDq+kml1zsJon1KW&!y57Z(;X)6)m(WB8niJhRc|zwuDerbRGQxoubE z4HpfJjg1Wqf=D?#nwx#w4SbuLlaCcez zzeQE=b$ZOKS^fh$4?uia1yqmp(2)Sg!e;1Rzqgb?$!5Pui z)U^It^rl<);H>bAJt&1{rlurBL_}m{jQ9ad11wMvceb{27B6|`bzMT36hT>W(_%SD z_C8YwUMDIlib7ppdsggmWXBRMmHb$t)qs5@Uy}$A508)#Dzj7#fA7ZVY-?ywSJ&Rb z0X^Ddr;9t0&ymk%^QD~~duS^RWxf3(ZXKKV>E6it`g)nNqOaMV>tGSh%{_MAo(mwL z?~lKY4omhr4*2?#T-Zy1g(U=!9pDG-*#TLszx!*NP^}{wfo)gy7ijYX5K_*WvnlEG+MOqK|=i zd@ty@V*1-(MptF^#zHo(k>b*U0hf6ojxfVA&~1BOU_gt6cKPFn^y#fRC(5rKv=phL7v8{efee;w*i`Ktk)+e6~jcLJN?HG5PxCE3{$s;a7j->OHzTL)Tb z$VR;P*+2J-{hb@n6coZ^V@bNOu3o;h)_J5dWoG~Q-p;F7txSv2@J^3|S**=RwQfk} zj_5e`Rj#uBAtxqzMgwB1q3d02h$#Yg91|v(*BeXASOEI}L#U;3@f*N_@9%EIV+{8y zLI@kEn-I~x@CW?-?$D9IRoER-Mj(X9fhQ#2#_&Pn2fPmq1XjMhus}#mtfrxnyOx5O z89?J)nXykjn^ALhtr zV!k9ODXgyU6u=@>x+Gz5mgvsq^v%u9r~bIXXZId^ij16^Ia#tBf_ zM+q80U#6y}OoVwL{ed5p#sQ`Mf|>#52YBJJyQBaY>FDSPZQK|23(5#4Qb2at z3~~?~(+$bd(Z;}p(!_6>o13HFDKXS}ow{jg#6gJ%kWf;>ffQwlPUf|5>Fi|W=l?rZ z=WfCEH?OeX&cT5OEh;Xqsv9jr{VpKj0#(Mwj>3XwVrIrd0&Q9u%0uIMdU^sl|Mu-0 z&=~9bBRY0=c4p?TA3rQMXTIh-NaiR_)w*)<^2Rpn0f=#RabY9ILA60m17HXpCGFEE zNhzt;Ad-Pn(}7Tz?=u|VVcZB=btZsbKtJ&J_03rTby%muiUK(Yh|+xcv*}Q- zvX~eKat?;kdvoTAiOB;bT4xMax5=wl3Wh@v2*AO?p+W!r`Ezbz0id3ruP^A|J+U10 zw6wm2_^5@RWZ2tk0iyjt(W-CY`rijK?Y8#k&)MN@D8uR}bzk`hO8KkNx?Wp5JHS85 zIE-+R1^^!H7k&hUhJvzK?v@Cck6t$6WeLEme`PSF|Hfb@fw;0nLu-Pd4a))G3Ja&% zzqv;%zNHUz&5ZKL2PIXGi5QipHTT~=ZjrSu-Furv*BP)wBvIC&&j!W&U zCO<<%!{;;~IyxjX6nQPj9s{HwQ#qdbdWABS+j6XQq)-=r8K??$?1zyZuU{i5Lt#hp zW+=u+Msfn@pbU+Uh=T8;putgQ4i%1_F8D`c%3_Ue*SSa!2LpXqr?zdVLiRD<$_Jfgcl$>KR?f?@_A;U%Oy266`YWPfr!>wP^}xjd{Lyq_k4Y2`w`|g1B{4gq%J6ADJ&PIKR7-# zvG-q0A8JXotyfHC$n;8+*J0G8>5nXQ?%qn2o+8rPt0G?|%=yxI-Yg0(82Y|CO zxV+F6@4PX^$-x0U7apA5ye7TWPoF-8{UawQhm!E}5<<@HE)S67`nS#sdLDo5S#(|< z4#xWs6C*+N%Z5k4HN>!F^dDef@wM~tA!#cDk!$1Rgo}$C^8Wqt(PDCzT(TF`0@4yR zU{3WCBP?GOuuu&}9i4)>v78rK$Aym=83XZtudL+iJP4NnbRHic9}5xkjaD~w%AtA}tNysfQVvqeXJ+1rT4)AE_rC#L2hdCYBV%j|d zxwNlbxq@l~4QH9^j*PiZom&RO!v_yiCMl-Oo`RkOJ#+ofpIW0XAU%K(_t&7`1xw}p z&Uyu#5 
zwB^3E>vi0}e*G$TTpb3N7t({Q`MgfypUR+e|nPk5JEdyClz`OzPKx`wLRTqAE z1iR>%tA=pKGxefRJ_?r@gmnvI*G)1mttz|b$w^Q6JO{-650gW|L4*@l%0dLHtS6~jnJNbz*Xax6m22M0(1 z09LMY9&E<@_wOrnW?%oDD_R%UIt#<7ATCb-;DJe39H)m+!|^~lf{L`I=`fp(663K! zu&n@&Bun(-{5&U=vg4-Pdc~EWK7ITC9j){F?!SUe&xd%JCRv?hcZi5eM*r~fDZXp| z4b@Ozf0kh$pe>L9J~e~`@zuig~KrR1ZIR59Vd@%mf=Fu;coUtm^%n?bp#a{80D_HL#Na@>vQJT9BB zC20Dc?jShv0-Nos1Ox=gNyO4#18_NOEo&nqw#6>(24O@J%fHDKmJe&gNk5cSRaUdp zU+yM67g=x-pUfmp6MjOM3owmH`3f>94n`Bu&aYoz{rcwH#!UJz7X{HxFP zf&PA>ZZ{5gc3g5Ec?E?nTa~Qa?d|Q;o(I|<9<}9HE+Kl^%ge94_FF&NS+rm5p5Hk_ zMOhjV4+UPL?ayomVtnwMYSf^Z5<3j6BiHzJ$ecm4eRsrVdB=Gwyd+S$Age_g+I z>lTzMXB`Si>tnOyWOqXA0u`b)n2g)#2eF2p)y1PA4hhj~ko-}pMb82Wbv^eY>+7Kj zflzn(O=EL2EkHSX`Yh!g!VgUr}50*T>U(qZ{K5i6m9>;F)_-&~x88S5Wh_J^Jq`)RAT0yKdpvf#xi-=)fb3qt zgvoKmq4EBsHEoQRv`$P2fOP)gLEF@n4mIApW<8Vy3iS;%Sj87F2=MVSFRYc3zP{ow zUjP_ufhM4)Mx4*a!}D`|+( zkcS0VO*sz)#UOwso51trB8_$ThOa{sSVd63JEx|Sj8Fh4VWol50E%s#pcrUA1a599 zZFUY0P$G$RJrq3q;j2QE8impcun@rQ#j5iV^4^%L%OwGlccI#V&Hz^iv>+3N*m&!D z=rfm*bJGxgz{Xyktl>r)fOZYyj}*w=0JA42C-2|8x3uR4Y#ES>vaUX8I^~w*pVHDo zA!&li3)l}y^XQQ+6n|*v@FtKw)GXF}2J7(U%M`PRIZCAX_)zv`;zE?NgqmPP^!4|3yy{*co`=6V!Bfe-nvYL0OZx4 zien&BQh`r8-oD`rIx44ZwdY?h%a8j~}#DdYC8Ff$069%TH>H+Pd!+@%fkB<*@XGODU-$T2&xJk;xBuJFCJ-DNzyB zFHaATZ%s|$P+=lrdQxrcyVkEGOuj?2Sb^@C92dtB-I?c1(bm~{9}R+OZ!|0i7;J#b zQ4%ymjHxo$I1BCx3VIHIR$s}H_N4r7()XzuG_s}N=rEY0Jh`EPffuKf00pb_6uYRy zC7>i=Azx(4Dknc-i5|Ia3>~4a?!xTZivw8N^zB;#jIYb)3<#+Ez$BkX&{oFPS|0g85{8q@yP zmTR8O6@l7T~x7#R0Kyef))@@V$WPEIp-VxN;qLqTZ7~xrYi&-WuPlh+YXmds8!^!e=pDmT~eAF3b#Ynt(6Sq6TI^J>39GMD8`^V3HN)8xVMlVK=EZ}dE)$_S zR33B~7|i*gUw}OUyBp+4AZkiODY@dOT<8@uOEa@->uDh(qUaLi9v0nt0Y1KJ=su;a zZDgA4xjl`uTcG!6r==0xxl;=8Untr7)vG6}+L{R<c<^W{_ZjKbi}t*Z%U-_4RUsLPI)X;lY9IuIB=F>23ryRo+;`F;Z{ zbx@3SyK|!%jYR;eMGs{Etq!b&GJp6R+Qoh`6lWoL`u0iF{=bv-T(eeI6P{iR>^)^o zRY}-V$0v2jf+@c!M~DHy8mh*dRJ8%3t6b!kV%`X zD-0T3DlK%ZQW}t0qO!g6FMvs@!P&`C>0C!7W2r&&`@5wyG&Ja_UL-pf62Ng5yzI?% zwId@lFW6Sh7xu0R)~SLc3XTZ`nqX0MC*ZMP^oCEQgC0L(9+g5Rh>wCeLrhey+ur-A 
z-IbxhsNSvmIXhlYkfY@Ip4!?zVph1x+p)M~+McbDX7U__d*-8G{!}K zee0j^NWy7$hZ)2)5@wReCV0%Q+V?(taa~5#l9j?J*ShXpzH$W`u~gxzvIFq{GcPp= z5tuxOPPa7k0m?%vEp+F!2Auv{S9=Y1nDJwd)Z90Wez&M(y*acw$%g?vEwct?`=!M^ zHpLHqbEb#s^ED+}Y$zob4jPn-a*mekaXm~&M&qT$s4|Qoe)IN|~6;IzWc2UARFVkY4*k~THXIWCA1aAiNeqz3hnFNFT4Uw!(0?_T^N39Xiv7WhKM zei<3G%*Ey9nY3V{mtOSIs8X|`+;Z*IsTGt|DBxi3fJla@jMpM5qt{ zZZyEQ?KCQZV{$81KG5+^`4tbzUrEcEFtKGCr@gMB88fi5st0xp`XSKl$j&pa6vkzHh;{=EdR5;M>Qq9#aD=2M^BXUl;(wMb`v2C5Q#36c9&i z{QEyYNc(?*$ZWncG%z6Nu}J};ZhaIy(K_qt`jeBBgB-xoi!~edz%0{P0Z&5g0H1(h zn$rpK+8J_AGfj{s8LjtphZQqfGFj*D41^oJk;fdB;00FMFHV&w0X+fz?ZXHBV?}Db zt5>i7nQqV}81E~Zo13#3FPACHK=dNuEHD(3b$==^kjFu*DsbllLmo&7)Kq-boLv53~qEu1toG{CJxrl+Sv`~xiO_juqbMu6o=OiYYJ!k#WEh#x@1 z$H&La9Net82ha?f)sQOlog1*pBr*F&M&Qr|FOV5&x;*yK#zl*# zrEUKCvl6xqbZ=8&xM^v#V42;797b*Jk_D_E2{}2iGOq?U-$UQIb4T>qGZ0O_wX|fa z7V1DY28|ah9TgMfaV5GLJk*DX2#98wT3QOi@i2zH&)PLAL!NnFobhYCva*RNkMGO&57fM`HG zhjYu|VN}}sw9YoSv@k@%x|gI3y%4*`R=~Gy-l%wO46od9=$n&?(Js2{hy5vfT0Ipy?I)nkSR)c6|_?cM=` z8GSN7hX;YpW@DwMfPPK&o5qKGGiAuYIsp6(u^@2H-Gzqh-a)>g{^D^4av{VlR6eUg zE7}661EdC4H3=CR!0Ae*-D=BKNPR~ZCwPD>>e7z76bg8r7wKn zYq1pC+K$~;zIz2}05iM70Xj5zBZGrVpk#m|acr#sAv)w7a8JlEsIaO&xC2B$v*_hl zxOAVlZ-Ed&7(nUV?23z zXz!>tC^zO>HW2575EUrWX8q|{%(gZ*DU-|@iVvU4c)m@y#DX*#oOz+|3n3kfhV#ZU zQc`JhWgu$~ae-nGHKmiIL)N9_hf4tl7N{^D9s-t3NHh>Tw+$u4;m=;bOT9CoJ3}Nz zx5{oF+#nEuAo7@!gP31hGWp*t?4dxf>*_ouB;F+^PD1qmaDAtPfx4Q5o( zO%xj{IX_@(I9nu_z+-#3`3p?T5q*}fIKfl%@Db#1u~&S`8aa3J%UgO;Zcp zvO!L{(C~M9gaJ!IX;uefJU>qxE>Wpfkd?Ivni-e%)P*yBj9Fmz5OjU;ql$`(AZ)w~ zVl=p57uBQ&mj1uTXa0JCXx?V<#i%RJ7sO0Kx51Vcqn03&QdWt7S`PT+>3Fog0iw#lyV%jF*qePqs5ei!P6{^W+@LE)9M_Y^-{u9V}8ja2;@xggp5SJ3d0( z2AA~lJK5aqY_O#{#njZ*L0|`Wd+@IZ$daIKg3hiKtIkBIs-y%3yjC-m(4_Xi#?0?R za)6WtB(E_#$R_T6M4ehIx3(WcHG`)9=`DNJpZo?pV$zF;3ac?2@nf&OeX{rapR&Wj zoulX|kyIWUz~)OJ@^5k-e|>3YRzyw?U}vvDDJb>+L(l?&0yhk8g1(O>LWY6xPy;pz z8^UC)G^hN^n-&A06JxlSrid_Wr7k-0Xl~#@tpd~Xf z?!k9WNgI4R`_}71jTdE8!OneuRN3-1cIWQpykmA 
z7l950>Epc#)Oc~D=0EO#ax(;sJT_u2mJwJz|HWeGEkT<-~f5;cX-~RiX<~%l0%y9sH3Mg%%t(llu3j|9b zHjRmdA*SHAN-*mO+;p%u4!BBJvumaEGQxx$_CJP9s=R@L0dU#emK;d_L$C|vYXg(XF|Mgxq#(}+8Qda!>>(EP2gJ|)BHEEe6@rLxC^ML zoAsa{|JN`H*ig{tp_fH{{8(92b75C^84WO=LdFtYJ@8Dx+COe4(uE)&xVF7+pet=m zRJ~ABgCadz<7}RALu6)e!$6K}ZBk=C7&eZ=M!Cd|(M+c#7vjTi3Vi)$UC;~=>?I_^P@$GLeEoEq>alLcJ$UcO{AkkHI%{{DS%WQ2^A z6lUti)ysab;BFj^cOfK}n2^AP9{@P|5!f^kWkEomL>NFL7zYr{jZ99S1}O(xv;M7- z!9fT|GF}{?+L(e|5xkRHH{gmFii(h6fP_w!9VXKI;oT_1@tT|GPru*o|4@q|sP7nI zl{i+zEXsRwdU}h3?N>CSpbFUdY^Ju1j|2woL8v`b#`H)vy&4BLMWLv zTw-$)-kzV25B73*%%i8p1`y+f9$Yce46n{g_F9>&IRG92vd1NA&@!p8 zun!h9lSXN_`Ykv+sYQ(kI1=$%gc=hUmxYzJwZD?9LLm-r_yM5%_a)G(A(@(iumS7^ zJqz#xV95Z{dq9;yBnMD=(N$oXO-z7>Mec0t>^xoTS|Uvs&+lv{D~n4wKR4$9VW2zA4%=3e1j8bWKWpzNx@urWmIy05B)>opr{QP4o)79^CNUm)3r(_}GGSqxk~H5JLj6Z0wt(x>&2w=j?LJ`!;moatyQ zO!Q9OL|UfCCZTJq!ou(nYDF*K6cn(8u*{TV2qiZF$bpC!=nLd^JwU|wzm%53GMU>s zf~x`FQ0K{$OZEKTXC4q)h8RD9pG%D(j#p{_=<0%TgXlPPGy_x9Bgi(9N65d2qd0H? z2Q<<*BHWA1X@Cu$_+VUW1bGLx|^%WMlW*|`lmD1FVXOxkXyExHO zsCbdB0|5-qXOxs^mYLl}oWR*uu!s?Cru{Sklc6$z>acTk0(iBcA3%3*P7VnvDI7Yz zLtp*dF$3lVx&hQyvcht-ukWSCw($F43js<6iu`qP$2J`@^wFPOiD5(%637vaqaWX! znZwm-2l|}D{zNot6PRL5GqbR)_oev*Yk^```O?yI4ci%fyl=n=p*X`CsLYRPZCzd0 zUPGZPE#-W_1v7g4z8IXMI6gX(iDhpd7_bNay|S{xEcKGuFVH@NS*P~!rsOJw*AdWX z^o+Q`jtJUu5dd}rO9?b)qnek_oLpRS@$mpU{z4EN&S4e*Qy_q_jx<3GK_6kB!P;j2ox2EQA!Yv`Q55gAK&qqc}ASmz0YxPWH01aNAsQ>cigL__45gy|4<2YUCR_1U>MuHOtd>Rz0 z542JM(a~#FpgovhVS_skVQSMN=eqXR#)i;?0g@TP3^$5K zq3{D#AOFWWJ}ezu6Eal|{n_}Be&Qn_zSgG0?PmmoqNAb~=I5cVaMqY-lE6(o^73jR z*9jsm;5&#G?5~d8LcnHLIvba|g~r>a?<0n>wu0CQcy9;W+u&c2aasHXEe>b{Gy@<% zWFR>Kc*x6h|NR{cQ3(o%ak8hM`!jKI*bUK2$?ELvze%njD5d@=YHPjz2EOy-&)^#*VDS! 
zeO(vTd7j5{{P$towtd@g0K6Lx9Qd4;84Hb*d~eMHVjM>_Bq*r<(>nulSc?3&?!KAwz~37?czicIm1!{^p`>zi-HRt^vPB!uf=P6nK#K zj^J|ghtP!n>z$@#JqwQF@+el5rDwU ztL*7hQHfRNmQBx?ACR+6wr3(E;)mvB@XVunc-ANTer-096^o5ZFfPF5blFzJ&k25{ zKzsZC70DS61>LzBc|IJ8qQ{Sae@XEDc6e;g-HC1d0t4IH+Ln_S1;3=EPc(uHqZgz< zBoi*Ym`r-Co~wvpgqaGvB*r#=-ZiMC3*@Wu+fU;0j$V4fWdTZes@e`j;)LS1p|b@z z;Y{OSd-7|0N+h()HK57jpa^QPX7As`D~cX4Mjaz*|L}PaHcH$S`%fRJ3TH z-SU+y)3TCHurs?qZ6+R_{%Xo$_G=9}Kpc;iALoT<`*WVi#te{o8JucWn&=Q>GtBMd zJh}0EcIt$7CfC1Yh>?eHkNrhBH~wa& z;2^2f7&Qtx@7Bc^C2KmGK|=cf96KxUfvZ;9S>QQlm!54Azf-n+>O&)=@2^jCFpO#S z1NKe?|Eu3d+y4Y!8fH4nLw-P6s!Q%OF<*5#m;vdzp=7be zt(9A(1lpYbruF@M2wF(CLn#gHu2ZbvyXSf{ynl0Ie0*mE3G!@HfS~EOMQLLWJkwArc^=Nvu$ibBOH`Y-?KG(SaEfS*NlUtnZt+s ze6@*u`qcVaUQvl~aBN0_EMmu`tb9B2QWP?&!c z0zezEMn05@P00ZKC$8PxQ0cQR0wiw9a}i6Q?Ch*zN-WwIfqbXW0AQHwGHIdTfjzZA zU$;b{_gdngc~!EAQ&dxM;6SfX_oUNIQEQ3xX(&CWoxhjNGe}JB#)f?5B?EPI6b7Ul zDG5Iz+$17Ds#Wvq)wT#=w8hUEoRffJ$(mmI)wCeA>eD8vs*vGVa%9qCejU(TskJLA zDlmdBoZlFpaPXj#X}wf-`Yt6?kdJPe*KXZvzbH6wap<&tT2cd0sY^s2*%VgIPu|s{ zm?6$k@5Wn$C1_@QnWnl`?ug){*ewNvk~Dnz^6Tr-+@8w)`t%L6Pu{7wvY4yU422Bt zr&;-8-kkNTS3i94KuYjVl@^$c>!55NHq#|%uuDy*B>#+c?U2n2e%b-Rszgj)bpiKQ8#mexXx``cbGYn+Yh$g~fTYh)U^vs%bZ6hN!k1N-79)?CKPc_Q3+_Yx zg9;*>jb&AC@`WW*c07C@<6jgz;cR1t_$;kEd8@nr&IOU<`pG31D?U5XnT}Ob-M8Oj zd;87uuC7an$l6<0`V3v4$BEX~S$b2>rs|>i1_-R_Kf4(zw#k(4(IYK6`T9+8!tVeL zPeex5!>nrFn|))w^o_3jk9JSnzGmRAQ4{8La`oH^;$QG$wX6NC54t&V%a5Nt$$_kI zRgfQa?%eXE_#D&4qqQH>z=wi|>AedEuvqMGF9}63Ry)3jjU_OCRoF-dRIc9Kr@h2| z+*((w&tQqw@1K6H@ai%E$|s%sVc2z52`2FRVJs_T%GoT}{&~ zYAgi=n3Navs zc;)=*(}6p;U%Q+B@vwi^^qC(@h8iu14DTFL{Mmc(eT!X9OA)((1rrhB`Kb2|7L1I* zhWU-3TCS5X{0jXdj0>~UBlPs5+v&)Um-(i2%u~`Q2xUHqF~`%(z@H!ZmgFRRGPl~% zsVn*6FYPtRRuAoNqz;WVVyf`VlRs4(FVtvhDkR@dQ<`oBt z_}k2l{%`yOPgQDI${)31!(6>|qZ|s6<z|GM1T%3Prq9{vazGDBhrdOS7Dm9^jBMM&lUO8(yGKUVztu!&3&IZC2 zhWFR*eA9PeG_tjUA^*Ipt+T*V#4PE@!{stkx^8v44$-5y;;bve{!$?bKl;- zrYkhArqpe|3_#qm^%mpb%_p<6ysL6H1Ar6X#JZG0&z<>Qx0}$rwW2Bj)RQPHFS4-# 
z1_G*TCopz5R6}E~#-w3;&zwH}SKkN-18J9CKZ4RrZEpe7((pIo`}Scz@Wgp5;T)tO zNy7_97uHNqFNMPbjxufH#5M*_&d#BjgWS3E9_)H$$t2oal@kL@4X0yZbs+Na@})~h zFdA=bYwMi`CWF2fX$k#lHaNjE^``%!4z!i7E`7YBXQLqXx;`wk(S zH(y2Gh4dDt1l@0XI^{Af2kh#6jo;6A&qRGz@amJ4p!DqF?!hy&d`o-C_t|4v?_c-| zaT!1wVWhz$i)L3*ect4Os48|pd^j7Jf`<^Ta+;4HS+-fXv#vAf{GTf)Er1Lh^=aDA zD0+v0`H#8cR7pIA1$xdj(P*N8_j_zNK?BgnkBJQWg&9Z2lwQ4h+-fs45#C7cP?SDC zhhnXJK08X@#1o*ju4P)RtX?Uh~;X0u&m)xX3vR7I^(^Zb0!tDT;)hzyJNy zGs*!_^P1Kj-g3re$HF6*-K)n(IZ#EHx0Gf38s};u{FM$RE=;C>!2|8lG zkMZOvhs#-6lE8214BfvX#nqAh1UBZ_j$!{hT6b zt(&-N0E-f*6sRBs=$qevf{^&@dBs~@oGjz#BWxtL? zFxo1?$;rtsNy-kSipf$UP3}S;Zf0z^psz95|4e}ix3wi*zro{T)TjexRfR3LW%?)V zh~FjGxNQHygS9r>O3KkY5w26{Vr}}z*s{LP0C@pNR7W~&x<6X-}={llMi z{Mw;Yr?8-)`Cx+^E~6~5TCku8B65%Ve9ip|xSykX zxcv~yiYdE_wQW1NtK~-g@=v6+@3C zeQ~-dBV+%*eKuJh4Jn<+fge$86XL|7n#@eE)vH$nmr$XRuoiCgb3sS)nlKKQvR89~ zK0GW?2ZKXARg8Cx(a~w-TcP0LD?Aj301F^UM&HBHp#kG9AgDsr{vxV0M;d*^uV23y ze1=&eYPJ_hV?J@Us{U#n#Y&@XUa9OegC%+WH^{D*x3nzbDQeqB{F;8qFk-aSl96;6 z8ft2g@an>}gZg(ljg^(gW5*u0-?!7~`%N?;6goD6n^lz!-GxO%<4=_)EHd`;`av=mzgM(_t&@FC7B_-;cCiyBdNmz3K zzLV8n)N*lgd7HNtkI7h6HgJj8Q<=GoMySsnwA*6dJgkAzY|h5iwelXKQA#Y(R_h+3 zH)2G6{kyksPt6{Z_LHBc2;vULY3{dJv}lo~-mdqPNwOo(E_gky|ztp#D4G9>4AdSoZ)WxTAM}oGYh6nVr z_Ufheruk3V{IHtZ`_>E5iW%8QLsFqp0b0V3-6Kj&#p*eH$K;6<101Zpug7oOmM>Z| zl|-rbQ7(`BKCikwCOUHGBi-F)nVSA11`l1nr{L=Sxf24eYz%K&PkeQhk8)t}nY%_0 z;Cw#tL3vCc<+wMcopq9wQ-9V{nLlg`4z}Hd8q=GcaB%u?kAcJAI62AI#+^`1n>@)> z-8Evz)&)&{FKTdUT^t=#DVhT9o$}%~NQa4~|C{s!FS^&2_ipU z(3a1dIm{GB^!jeIBIXXgd+S!U^^b=e4<0yR^y~8@pr6!>(;X}(0v?m%l#UQngC9(uj4*y~8cH+dt#3vMQbecC`G(u?cDo`A2 z+*Ah+6mkeEifP&OsibFWCsh6~YHC#EWPnYX@S*yoaV8dDzC3p4*QdT2s3QqWqVh8$ z!hGdAsh^p@2m(8I>_FiPa7ykOvB8FlYSpS$Kw0CDyTcVw{o9y}v&!h7+Imx`_uGU zv%+zun>&eus}`cd88vbw#B*%{ZwzdmNy$&&ycun> zwuV!?#GU+)wj>S6>_6KQkGPXeyr-I|^we28S!rF|zHG%x$i^W@j!W55YGEI^Z-zMF ztP)ZG&3n5^BEN=a`Bp_mU*X?LeZPZ!woJ)E)`r=FXab>c?ZLbURH?vhz%FuscDC=h zaeR=PjZYXt8a#Mg;j16tzVRr-pLA&FxbVqHP8F~CMDlDk^M2{@@|7^085?)@8U3=Z 
zt{mn=kQlo66HH7^X9tc8QusgG4N-n8o68V&?bQ9#^r2w`?KzoNQC#MAJRlo3>@@@N zhq0zwJPCHqcqLSnY%XPVl18TIC&xhh3;xKkbY$mPTL)L3anR{$iQ93{w2qBmzBou6 zfxgnMs0GZ;n8+?(O+YFmIb?O@4FNR-0Iv6?S zYGFwM#=qFA32h}8w_QOZ1g5=&dCA^hY=+o9ysiboEbrUa_HN>Nd|q<3_c9oepU7bd=kmy-mdks1Eqa^#C!@eD}l|z zVMxBCfOUVxd4B{uf-p#m#T~Qe%&D!ora&K+&N7l$Le8T{T@Btbvbrz=$4|;AlmRqP z`BMc7odhop0;-S`wl{g6IB1M+nn#}Ba`V=$AsQNZUv}SaLU^DuqgE%zVEnEUfn$ss z$iC2BWB#$&*p33}Ke+1MjCV&)cX4yjQ@FE3nlJ!VjX(#0wXyK7C7bEx*eRUO%{IgG z)3g+wZWR=C6u4FnI;Bd}l*ip2lx~-#-QFo~4R4}H&z@)l^%q|c!u4{yNljIi-=oEX z9Xv~L8_YzeG=$SQB7ASc_&tp3X7U=1f|@QK-}9#kXHT7KEhML;tm_pU=T<~K#=&wvGP5+*BR-2fVxJSv5NY97I%ZXk2~Yo+Iah|k zB~Fuh8Xv1?*kd&%`;$%BjRXTo3xw$EE_V$_kKWmPr1&>(@X3 z>3SX&j`uM8BgL6c|9vnN) zF23vk-g0FaRz%IVw!R&iQ&grwWWVWND*1dJh{uNj|&G3CUMa81Ve`c*)r+2i_H%XC6 z-C#@LMQeIcjc zLW^$(a-}250Tzc2@2VV^eXR%k4%5JQ0mR!qR5({f!4bWf@~0Yq%+LJdJJv( zE!h8R@yv}e$c7-my^7dDvw|>xH|HASlLq>p${`&GGbZx+MHK9;mtt})MEs3_9J~bq zFU4OFv5tF{+{f2+G5*<4P6j{DUohFgFg6y8k%N*Zc=pAX6@A>;F=HsjP>ux$-=3uG zgRo{5ovP!P)G;Hb9YOy4biM7_(y?ihQi3?72|!-<_qVZ_jxdVfb?;XKm=R}$7iI1p zGU~{|gDc(K3h#EidIDh)P4Hn*&OUw0O%Hz7 zdvNW$&m9i#ZlX@1&}Gml>};}@@6mjd8XP&DMa$dz|rc#}K&5R~JK z=MIRjsZ(Q1|M8)_IdY_L_u2f=xpTK%s9y_zouBWf%NPi^K^(k^iy!=T<-w)SGnF>4 zc(-i*&nUO0OCL9WMiNNtHlx{T`l?k89gohv252?(#z9Z}SCgY>$1Brqdm^U;U9@SC zJ`;q5Qa#?dtw2J?JoY0pvW$bSlk?}AKW@%Se{geJ?)|wZFPDwCo~S#v{qh}!uhyh_ z^vhcJDM0Dn>(^OYJJMWFq|UsgdAWXA+q^aQNkgK8GlCXhzKe8BzQ=Ke4goGDMQw>! zdC_AO_1-1-00qNF%DHEc4go*{2}KGMHAlDGYle$|wJ0B%Kff7iQsX!8yj8t0A>%7= zJcuv(hJO5?H*NZxja!0ZAKdA`S{xRjBG0lOUoXLij)}H zw=ef+un0Qu(AM#z`rX#5X#Y9Q#3o+9ZjyN zPCmafk?DUbmm1g5v+)%0wB;h7fDOHNZ|MAo-WjEW_2AwDP){77MeiWW*5nN#H{Ts^ zKOOdG)G+UtE*v|`di>&;{h((k%QSx;d4F1AT8CvD2UT?VR#ND-(0&aqb<5Sn^Wc8y zc(1g)9DLwSX~P4F>FC#wVk-%AbdLYE?HzV>*MGLXDapwwJi96PoIFaW7uVsntHr~J zU%x)gQ~fwB@0B#tL^&C#)62fUKOL2v$+W0!KSJy}Pi_#2Oy-6Y)R!zd+4%I! 
z>e=T;3?4j5qkHYtC8M0J?J6QyQwa|uu|=oXVTfjMO(t_plv5!lC-}t_ zz@~HNe2PwK-NyE<^WtZqfpEIio21H~y?P<{#%u-XqJ6+Mx&cNYs%{Y1cnxgFS+$dx znmW(YvP-8K|Vxf3N*+h9uxLrfg6SY78|1SF7DoYO*@p!TF)=6sHpHgVJXwUPdD-T zD69VvieOIg3kb+VPAOFd+V}siKz(gFWOpH9+4yLFEmEY~iMX`xrFVrpq)dcLjF`-9 z&SQiPf}vNFMnU4==HSP>E_>w~?;VRB=%1AfzQ=|UZWaWoDf)sg0-b*|uJ4=+OLHsed3Y zdt5D>)>?WTY6xNOym_nLus4%N({^KOh=s+k!qy!2bQ)-RL#&CtotsP>t#{ zc=*NEA8e6Qbpf(#9L9(|$zG;I2OT{<7bmBE)Rq)YVl4xG^w(IwUE8)zZ;J|WU3aj; zJkr;<1!)kZ4?#pa2Z3G2)WOUH+<*9RvE+1JFRT-3k0?(#`%8eZiM!zV%v|E_Pa*2K zbm_E1hVq{z{3by%cH>$zv)j~akgi|?8F^PhSTZs^?ddh4&^?!C*es*`sJiXDJ_+p zrm93C$`_9vK@IqLH_9{cJJCnaR*kPer;m( z?0KE9R-eHWa>I0VU3qZOP}0EAtP!UEC4X(ecwxi{c1T^NT^@$5EBBxC#1xwu*-xz- zBVR${mkgRnJ_YU9um!1@#l>-@G2j9o->8AT7`zYZP2@-;7^J~3F7B|zNW9ps$;ipf(>{wWt;58dtj#p8Q zbE#8Gbb>xjv|hQhrXZ>vNx|zF8v1|Jf2`HuLvtQnL^&Z7Bi--jzE*>3MDj|tdM@2W zf$Z0~&}K|b@Ph3@e81-z^S}p35Xgx>Hx?EF8EwSxP8-+L^oQs{ zO{38ar3;`M_3?CcG~RvE|NJdIr0ghYK(ku;F*S!xv>+M1USn9tK7ITC{`_W+fMLXH z`WdNyxO}FkOO(T0oSkI_vIr!BAU0Ac6a+)&BwSoH1(bd@VF83cFeM%jaG4y(pSa0+ zlpUu$6CM{AGu77N(Zn% zv9am`>5(KU_|f5qgme_1L1+&d(q3Tm>{2(kol_TqJShpid-uMe4s=4b+*#ns&WYZS z{@I|PFcxls!$J( z3%4S8AiSWcTNUvUxL3$Q5(ZE#El}QPxBMP07^3pCKEy8qwA!}VlA~4KSOyiHAE`~1 zL6j4!wstYn0-Svx%s4B%9Hny5P6VH;B*$`8bj zQnH;u$E!4Kq0yR`)6nMjSRcTK3?r2tKju;uggTT)*)ljm?I9zeuAnl$S{Ar!%Ec&K z^I5ax6cnTdYJ&WtLY{-wbx-w#*H~oY0n;t>gm24l=BjgBZK5=AnYA?QN2FLb4=f1@@4n$SG{`GPCzUHPAv#58)m{*Aka}r4vOEn8m35i z&2p3(dIjiPI(4#m9}MU@$<%ajNMQWGv;Z~ZzaSKE?MEAZBR{`Y4#pVn%MIhzzt4ZV zzV7GmMc=Tnm%gp_G#_Xce49wM&||uZc5d^Rl0^;O?m0-BlRp5eukMMIRYiF6^y%uJ zZR2akcsIKb%gq^VHSNflp1Pu@p^frhmR*jyqlN1o-JoC>XwIF-0YkX|;6b&HY$*N* z`~=9oqN79%_zI+jDWCyt1+=VXHnmc&~j``%DbgZ30(8HUIqdTnF+=R{W=|_xK8eCvQDl zW>ux>1#Pegu9`d(8RE+zI1yil+E4sLYDq_1ZO_J~ceF$yZv@Q{ALZ4(KA{@>GLzpq z-PRg4N|aU3wt4i4W6h?(PXJn={6`AY@77ma_-8q|B)pjd?Yna0m9DQ{i`FA9nTZsS zBSv&Od+uDy#kMM9*130F_T=F=@7}pROK(7O#yCweeME8bX>(OEv9;P`zqT7E3Mcc@ zyxqzI3VRV~)0|@$DCEEf~BKa)1E7puxzpy6nb}pZN=;%8U&%Afx%bKvD>_lO8BJ}+`P#vlCGdKXcYI9?Mw51UpH~sUUrd? 
zy|U`dfxmw=H)nk@m2cdOKy~db=n|F}d95spj*gCuirS%U{JU=M1~4N;DC~a8I{bF> zBz+7S2xK&;X&WZUnCmR`GVlxU5wvsX@L|KC6g)~7(GF0u|84ttWBgyFx3EWrg*XFNAcGEk*-qGkHx0VJ1$Us*b-YZj?QB zu(TR?G2GFXdQUXKWE9qF{`{&2NKq7~3Tfgnl{gxNolI2Ze>T_aq-)2uI<#?%pO?C- z>I=#|5GGzfI1`sXoC*o10rvY)dU!GqG9ge_%8zQFM9N&#Uz?-nia!SbEtLU|knH8% zsMVSYfXaOc3k`)eal;F!f!YjA6z_MGm}P&&W_w2GV7bB=03AbKq3KFKE!#l)?=5IY z<->Y2#zUSc{$+J-qQ$n~twKXxT~}MXCremnsaO#IBG&2wS>YG1GlR#q%`VGG@8FIO@feVF92<<=$_w7e!B_BN=iUn zMr%B4mY+-0r%z?HpXf$~ssP)owG9wama}I!zBzNWMhb|lC3MFQ*#9u_7(oZ$1J7MI=k);{q3^b{GB~ z`>rkI)FrIv#xRO`xqi2}QA-f32lm@oYCOx&AIt{1!dI>Qc))N9b$4g|_>D!~`}=m) z@C=E$t)%OlFzgl^;sdi!-y(v zfu(5_u#(UF`HccA!Tqc`C zf0*MWjf4C5-_p9nXz0@qVwIeJ>5{&l9&dHiuZ@BB{Ofdn@cE zrUMmL=?xk<5CJ@>GLVs$EK%>#4YyJNb?ms|PUGXn5eB$c+1x!*Q8mL~@LvkspL5_i zxcsq3K}E)zZKK1%2`IWQY8;G;x}kN4-kkws&InhTnQ@3hr7y^771Vc49!OKKv;Vx6 z<*DfCjP!K(!=C>kq#c^;TD|FkY%Yt+xc)x_uGH_IV`;hVa1_EKN`9)=JeT=VV^=?y z$<-3csgrk2%suTVFf^~2*NLI|husfs8&}Z@c)%kA=i>mQb~8;g-AeuA6eld!HImwou~E+RpySS#8w&zK?7OmT6VJw_N9a5$v>K=qitsNLbRHV>xP zMCF09@6U*%N008=r}cjUkUaWO^274UHyvpMijI2gABrxnH^WZv)~7eS`y73+zd54p!@F zO2`|UnHb5n&6E}hj}SO(VXJ|n1dpH1+Lqmp0{{6M?aY5vT>@n5swpkxt~pI24o{4(D0YWx^&wWsiA z2+l6Kl<5;;;7?O2CM9e|BR3EOM?*Q^6!*R)_9Tb!a1&J_+9jUlW$LImwAam&Wxm-y z+N0%d$1krR)Kby8texoNb2&El9e^8<1KpCqdhC^y!PGK5r2II2;@#WV-$MLx^OxtP z+YBqbbB7zt=#z8mR8pK&6`>iNfD6KZUgUWWAtmo2ZWrh&yo zwQe;pM41e%q#NHZ#x^=RbmxrVEnd2GPh?~p;qJ>6(FZszSX}dT@Sqo#a7+Qc>v&9L<% zm*;tQL4K4@-U3)$`1PL)S2YVWxxlQ~0KXOWI2lq~OWs2Cb~--m33CZmRaL;f1hLRQ zqnmi%s8FbFnm&Ja)?cuD^h0{^n)(~Nf22&;Nj($xPuQ6)WBr!<*9@b|gS`Ctcku`& z-r=yO*c*^yog;A!uKC$kn008zkJ;2uhIRB_?x+_;{p_Y6E?|w=mp8!iRO=I!z~LF? 
zfcFm2zf*Ti|WiW&N^Vf z|Kfv+KaySyeif~?bGiQci8nJ*d4zY40o_6%@JwbLrJ*2XYl{->merna_+4;I}7_u(U1;kZ z(h*HrnJ08k*eUYS!y5p>Si;)rqhigBT%G;XSZEHqM)Kg3)5PXgqqR93HR(aG_x5N2 zsJg1})aYMAg&;j#LeEs2RaG49Qn&ll?D>OsECsNMfB%GbWT&eeY%rvpE5rE-3Cb*jQC3cfj;2nD z0~Q9VdUtZC(QM2QSZ@~+!psUq)l(eE$jBh3oCbu9qD%}`H^)#Eff9%en?=qaZ?1(l z_>hQ|>cu77IIeU$a1IUgZa{H*BY@^f&?wlUWPqiFdcS_+l3A)i=7r{*?@{w91S>2pdr2e!e0@f)Zk;e| z?Z*nD;v1qL+9Z5Tm_MX81Y;1#cKG5{OrIWx;RRYaluz9z^*BJ+BT5qw9I$4AEBS`U z)VR5-RbIRd{;+gDL<=5>Ra*uxT+g>6Y&{zn$BUh!(LuH`OYM2!SnD6p$u(c#p!oTG zD?N{|W*qT%c1#ew_>i`@e?%_A(!5sO^Qpgj(ZxDl2W@Qu4u)( zs8Mf+sst?v>32e=c=Oqyn+jIr`-RHMEdBZ0Fx$=O+l65LUO6QqvH#Nj*@$W1ro&&DJRxS4 zPkLKZgYx$>!cv}oarxQp(xOAKdyIP=EO=48PVI@sKEx$(ZAo`nQls8Igof<=rIAgC zH&l=zpi18HgM3tQIH;EpjhS;4Y^*oDe^;J=`}RY~VsxTpS)66){n4~Sr#Q6VdN&!M zXYlS@;*8SPt?)PG9q8&-V32D8=VY&)ED23C|piY=W02#;w3AaI!RVKsFYU^9VL+|s}d4wNfBEe=~Kaex8% z#t+ri$1f>moc8zk7w3YwT=2h)`)lZ)Qw}Cpto`szESMiY6!l7UPT;4Q8(=t54aQ0h z&26S5HP^I%yzXVzr+cHp&Aq5#e|&zE+dVc)i?hP(h)~tW+45&6Tys(|?DYP6t1z`8 zXE#Rl*Gno+aj~k@8Zm3g(4qLrzTHZcA4;QP(Mq!jo`~>rV2-B6+_@LVulF7F?wzX) zfvsByM8`#+wv9iohND(p>8 z0F$wBaU5G;WDPpK*vaW^uM^NPB$wc%PLprBlojp!;hPydcWCLon1lru7SUH0QyGmJ zGX{P-ev}b|ozI>f^2sS!ns~`ER$M~IY)8js?+q>AeLQ#eETNV;b7yud?i;&-)I0lJ z^2=EV2{>$?w^)hM7+G&c@yf-EuLV}old==WB6d%o-~+oufsOD!y_h?n`^^s)K2uCN zGICwSt)&zu^``WPa*938MICm91GuE@y z#E`iI#%6rkq!B;=BA@`(Y6E2+xE?TvD6C-YX!g5nJeK5-_%AalE;KV!n)N{Rvc88; zi-ZC9ZpiDAkWjIA9drTv2}S!YJ>9O4+q6G9#mVHOtKhbiv%mry9?}WF`bG^JH)uC! 
zaRtII_(A(zBi3CdVhVRD3Vb=icSUf?U_-R+y$C!M{2meI9A`hfCh`z-U2# z1V{t#Ui5WYZl~Nfvn;AyrQpLCKi>&H8x@`UeYPLbUSiSuSRa-%zS$JdC}*;BsY1i} z0S7)TueWAc2sJ}(w4?LZ8*ST*)`o}45u_wCyGSriFYY|Ovx4ZY3&{mEnFv>tv6TRdQFLZwlr z&(}dNX>EjKZ&;X&_`25j%C}Af#wr-J$@z>V|O8)BuiT*4M zcVMzM#Of<@i3C4WtR%q%kXvfnUfSc((P@0RxE|_(y}0Y9JEwYFs?&+Tf3#}-`h_?G zBu?BN=r4uO`aSMcN_p$%G_9q*u~Zz)O4$K5N*?4+d7t8BIYOc<-oj z%OLeOf+kO|liBsf4TxgEAb#I91&m;5osLbfB;I$Pg#|F@yxFs_^~FY-x(_##@6%JI zgo=R!W&KVg86S-Jp@58-TpsQ_6+2ej%qD{IQ&$-wYuFeU1D**pwQi;+CUOtR6ygG% zlocIp`6SoUwt83Q*3frOHbg}6Il4Y0jsT(NRi=;R@RUQD6|4xVYH*YA)DRN=#koAF zgK;_zJh;#XW@{Pv=BmCg&rIvDNI&LNf3+RP7C0n?Paa7B#Q?1lWGCN(<@nLUMbz-) zkK<%G=md`fMVb##(OaLvEkR%GF+iy%PSJ?{=rO5{&})eJe1w%B)@IsRW%c^?ef6Cs z@?sM`Xi#7fn;%>kFFwJ<18X<`I$DRj;M>GCZSWXFR$L#Em!CgrXe5Rx=(8B&wHT;? z*8J#iTXFb(-7HV|`tZ{NE&h~-6>eV=LU{ZUylUElCCmQqYC1bx?GVM-9y z=6Ws&J)b@0GYUt|+?ck0X}#lj2_sa2zD@QVIdZQrcm#|SRxw3my0RB(&d75NDyE0A55Zj>Hdf z%LU(-O-Lgtq6ZHe)LBVC^u<0tTh|9LyMQ6tKR%3RK_tqbqUUxTECe?^tJG(`agaqs zf(T$ZGOMZ_DRuQ&-0TcTkIs&mAS1X1-Q&~7)#+zVJoO?tadC#!qWqh(rJWbH*;|m3 zmX^(ZL=}sqVOuAqL2@$xXZ>sE?0o0NYR1g=9yqWO=S|8zvOki$;7&3uJFQ)vL9!-g ziN4x@Y;SeUx??E#r>xh&@G-XQ=GFN2{oG+S&Dd0$;^zkK=r@NjoB5RxYr z-Ltb{DQ6zTFg_VrCip$2IoF%7O~eSG02cgk%#q%(L~$&@yvho+xcszeP{}kkm^-SN zx;~api*M&OuD*E4?ZgtN(1yIQ9+S?^U6I+zEn?S}m>xZ6>#npdi~9c5wc`jN&uIe{ zplL3=>k~S8pw5U9=SQBI2kC;5SU|wUyQ}w4!evL!Hh`m!7InP}yY`Z5Ewa-2MuO>Tv;knx{{j)=NiG;%D~13ZQc1o$AsqR}Gf? 
zo!*vT%ZTh;_4E7tO9Bc9i1P298HZxN>!EoWlDN3N}&F@QfReHe*F{vjX5E#f;(wcs>p|!NdacFBPV2J~cG-KfxOYXzpqb zf-a$vwH=p1=)kqIv>aDJkQ~+--}AY&?7>Kz&2> zLsH+)5^=BhqaKq$xtl?+Q92RjO_J@dN~hMopXHwMjFWRtxdS(lw5GteDM#an$47Mo zj^SLIEM2*>VAHuVEMXyuIu~!FV2D-jK;&ZJQ{e&6Eiza$6Nc~_o0=TwxFZ(Y9Yja+ z0RKmo?{urRDe&94F}s9z57TdPDs8W4OX5wsvBeTB@Zurv2xgysZ%q(zv=u)c>-@MXO3j#pu*gB zY9A`I3pZs=?MZL9|&<9-qJf2S?ZQUuN|l;}QPLFXZj{CrQh8&OUeEQ?1#XNrLQ|Ch*7a2fxwO z(0Jmntk-2Cb9ES!Y(EPz1plLt*1oY))G*LagRvLqGpJswP-xK8b9+J~1; zkKTOAw#d!i;(xpkt?@lZ?MNyqmE*_SvpMdNkAJK}+aH>DNt`hXHxN~9KTGzB@xS_V zzQ{^h(6#H1WszQ#78GNbrX}4$J{{T*=-2NU^1h(?;y!n~?9VQg7G{$3B7+^NHGey~ zbE<@!e*SuH?pD=MAi9eR#>U1L*X&e@Jj>b-Ffuw4c?ZTZQtQd^jQ|#B6-@2y`gtYI zQw^oCP3+^$Je2k7R(zQQ(PYUWYN{c$)YWwcpU8hPoz~@hq;l-B%0h-%>Ya|OK=W=as(FutE z$=XkgYqNZIUG&H0GSBE5`dq7P>8oFK_>uGB{_(AEXI(h;kaHZR#UrF_ylI*VJ>FbZ z*`^0nu=BZ)%E|O~OVdg|lqPNSIiEP;_^5U~DOT6iphs0PbQ510k@}yzQdYJ=d}VPR z&NY$AQTODW{&^FTW<|$Y;@i*5Gtpnf8!=99Y7}2)b2-u4;#&H;cip`_V-!Svm-x@N zKYks6BaBrDWU)?*{JrsQXYQ=(?dSjCiT#Z@12dl+al~LzTX*9I{g1_uXtkzC3WD4GU%IYRBl_-J z6+0#8*!Z~Rl~;b@#pDhC$(vYXcxa=e8f&whR)=dP-+B6UBfYyXsz$7V5g|Q3<4zmw zbSYe`R|w7`6HQG42eyPolhT|XYCTL^)ziq^P)U&lcK_7zF8|{#qS%t|JX6;;XXPG7 zz`Ayw6L*1R!`C{oCllRrhRiwr`rW(i-)*IZf06?H9~>IH>1Q+kG_mv2u@2a%V9NSP zzJTn&=4g6m9v)RhP#}O_{&0H5sd%VcI2%6temKG8b%Id;FkI_EHX>+r<5Ry}_#lDl`qf`Gp;KF$_5lMxGS0%?5Tk`HgXR9c^v%;UoWSy;6b- zV)KMNx9uY+AEg`6Fo)h`;NAj+)WB>(ui@iBk-;t5QF1n;0VoLV3HC}VZ=Yo(A-mmM zpsI3wmIqHBjnrsELk}-6VDl|OOlHxN0Pc8kPcho29A)SfK!Bw@*6)BVm?H6Lu0OtG z#|*wnq@t?b8IeJDUk;wgmkJ|^BD{(dA%Hn${q~@-;#7HZkEnLaWq%`UY!`@y}TAW-a+&h*h|~%VOF8*8J_8%aaS6gMd$=SYJJV&J|Tv zXfH25B?~WKI$}XJV7%;JZxx2J#lR#_a}pZ#_UY)pJH_Ui08gE_XSNUD7C0|qetvkZHHoD3Log>Q?Gvhq>zj?~*cmTjcR z!9C4BmxYPKKmF3#A!J)7cJll=%NxBUN=ggZ;ci0?Y;$efktDnoD4&&V`II+7Nj?et|s))apKbD?_XW{ zf9cJ394jt4!-Z?kN$8shVQD zoP=}^ScoL;d=d9}3I8sUKrzUfi^N7*m(HDEe)=Z8xgR6>SnzX>&QJ9V;>?Kie=y~Q zeX8(#Vqc$ouqY^u37+n1f8asI9N>xMZ@$0z1pS z&`w74rt4k8&drjwPW8mn3%LyYA@fd+h-o- za|y5ji`FlCd~0bU)J(9+2|qtSaDeOi`IZGzdP;K&hf@B9ZQtILxqbM#CtCOp&ulug 
zu_@kBVB^ec_sHj(Ur+{9Ya=C_uM(lL=t>r6{iV+PE5#T?S*KOKe?QFxfjugO^Zq`` zX)F^B!pixibt4aMOyi}wu_5`j*nlB~r^I%Dd?6dOa?_V0%fq8LFeNCiod=>ge?`7H zqwzK}8H7!M{O@rke8F0#H^!%-&*rwjJ=gFEc#6 z`PpNUU~3K;GLp0kAhqnyK9Ls1#!47R*-Pkh`dhfn7P1u74$v~O> z+>=sqVe>gc_ag6xfN6s$yZgxdaK6}soH~9f%sDT2uLjdXWHMRvm|%m;~AmX(pyxtUNbt$64Q%308ams1qJ> zt65c)f8egE6Vj$;-YJN_{b4nXvL8VK1KWs)Sl{I-3g*EetEd19UzHrtdeW`m7aGrR zWHt1pSxavz7@&0 z#{m>_b^YyT6rVc~4k>s@MYc0s7!Zn%uIovcWXTlO|Ggu3rZ=GGf-FG$u*l`@pWq_W zH0%GcxWlY>_Mkc-a9U%d6*k#9!TdadZglZ{Jy%`sAdiy{&lqF zXMRb^SDvS*uAPMu3nohhw?2F#bMyY)LtxEH(cX*b{MoaU%fh7nUbg)NKjv%{`Yx@| zwes?mG8d`l7Nk9fz3-8(7wk)4VkGONprvfO+b^Qd`hG=pEl*=|c?XYoB9>mf8uJoh z7(RJ-hszE6QvZAcqiR!UWAmeF15j60+#F4Ncf;PIG{HE#(CA0#FApE{c~<2sTM1dc z0nXC`u3RztQC@POlybY4lDDEek}eu_G%_Ng${g;m?DZXm`r-ZUg)B`b_7MIe=Gx&b zCAbL+ZF9=ldfHssGN`xkPb+&Peqo!MHbT}EJ~a@T=l|G0{t!S1;<&c>xJ*@vAj)*= zHR5Jo-it3qs@+3a2muG&yH6jUVme7SsDJt%v$AQA*$al6nKFHj5oKNKKXU?%WCagR zN|aQrY{kwL-zAl5PO0~|NeKr~J+H<%7IL4ib|1!RzQ2L$lZ;st(7^LT&vArgd^#os zUy7+yK~{>03Jy9}pbW(O)A|*M;8b-{VuJU!q@M~4yGt?2#+=H@na4b#*Mz(38X5yn zi1>}<0gD*t9iduNR-P#;vbFW}>i#wYr3No~hhu$zKSSnHR+b5|XtIYzzVh-9xERzX z9Clo30IR50iq5mZXuDXE1O?^S(ozkxN?eK+2;EOYiYMqb>@p&y4qdtw-n=Op;l$wZ z0$?J{mRGEZ0SZPyM#W+EaWZ>pL4K5R;qTF7*^(unVA6W^>^brODelU{sa)T6?Z!!OBcB&m~i|(j;W7Eg@u%SPLs;o0$qh;8XTBDfu2gv#~BDKC4gFela9F8 z1}VB(_9w-~RgW;C6US5ys1)P?S9v{9C%~ja+VT=^apvYqXzL6>&c{X5t0GxIe*uLX z?g~8hc#a=OU&{)?RMXiNiu@o05@J){QSG*}VGzZ_;-K+i8>Y%D>W7=#$&h*oRF?>rN_Jzq}zB<$yu z@Z3`8&|(45$ltPx)DUW8SQPmC`wMD(Id=H))4fd95b}Jw8dStO($xhRAg&o>j4xqs zGDM3|{9Wg|d#Ve6-~=&d&efeph#wyZ;aCI;R~jx((^URq4$r-cef9P0S&lcL+Wf1y ztKSErj<{v>x9-Vy(!4f!x`}F62xzPU5$u~3nF~(+ujkOMmdzEn3fNRm#9hhMT;ht9 zZ(-ONb6zp~%%D}QR&+o-Qn70=&e{BomKMW&f|v{H{nN8p@?{)943f}NL@9*s8ip3n zcep^#^B0?V0=$`Ee{yIycJ8fM6|i;__cq3Bs+!wzl4iWa#Av2h855|%xBQV0n*y9f zs7^4Z7)lK&`f=0%7Q)oSZJdvAWt4MZ%HSu#ECcceDFZx(H~*0jq7LlYP;$|DWGgG6 zyvaD{r)p_vkX86Nr5*8`G_Q7LulhhI6zWgU3L$p@&W+)<@Ol}j=0-7vf*j-hKnsq# z32&`AoT{*7hxC~E9Oy@n%_53oTnMW9Uk&Iz&~&M@qqG6D*eoEyWBd|*pW$`T} 
zGI-k~hs^&L(hRgs`*KK?+*I+*NDust?0pa+$k`73jfqy+ZSeG?4NV-ri|~u$2Ve|B zHxG>w46uhEyHHrz=VTQcTw*8gAl*i>hTes!_&-)%ztkKY=FygpQ%j^ZFbF;4<9QgQ z}_b{QAZ#^R;(iXpjxH3_a zlk-vdtB9tD+^2#KG=PcS$jaB&maX~u`C)~M^)UPvtY(xo@JX1Ub!}jQ#zptvRFNL> z1^v^sJ=HyFmXo@rf}==p!`I=izP^1ZnGYPufQtsasDN$Mjqrj7bC~9A5~B!{21MmW zBuas7w)XaDlp6V?EeB9b&0ltT0^$Nt!7LDihB6fkVv1xSqfcHwK|_^upP1R2X{$ZL zgaL5I>_%OIkiwSL#AEoGvvYMy3KyY z4jgEUZx^()?g90n<}SU`ex~QIghdaZyWu8tME%qX5Ul*P`+f#%4(nh%8Iiv8jB zsH%8XG?#UpxU>$pTc+9O{bh7R3LF2U3Z$eS7gJPGrY=P}%$1U52X5xP%KW_1gJ57{ za*2Qj*D}&;%E{~4Mt_syj0G_k9Q`X7c-uXBvVkk`%>BEoqPJU4YbK?qzh8Z>cP<{D z&>D7#B@CwViWpP|kzsFN-yEh-p}3nu0O-KEA(!wSlbeG&fB%MzI-=dCK98tW(DJ9U z92W9WXhqyb?V~=7-ble14Gj!F%O$p4siTIYhEiCaWA;UJ2A&RZtq5d4Jf1xb$T8E- zQ^w$dW&Mi0E(`EAE+E}EQ7~S-TIi6vmUBnda_7M_zdJjtQ9SvR6VWjk5!+4lxEV{s z^;)CMG#Kqk1iNBIZB&3z;p4V86+-#bFkjeQnkl)5?);|t!2eQ(W;fq|-p4@Nk33E> z$_AAYAPbak8MwpfG3R7wt0_OmE=En6XG20adJ1n_N+96N?I5!3vVQAdT?qqYN?lxE%z4kVrcN^R|9C}|(! zx=5_l#O~@}WbFbuUmK?>jxF9R@9!8JL#`KfJ6?kXIvs-+d!YD{9hg9zsMd-Qg#-fd z9g{bPFTf=IS165fga8AC0jMJCIJ3H#|EwK-NKrJLFYB@|*ur;)!BYXg#;{)~%*YNW zIStjMtw&V;Nt2uil`wMw33dka`vQc_9#@8ln)`vxfMf)9Z;9k4B(ob0t6 z_(B$d7X>t^_A4t9SqiOXFrK?S_AkBwrY_FRyt{M+ah#S!<46(jc%(o;L`AY#UtO*D zr$UY%L>aSHm6hY*R)Eui4^WGiG3W?`hi7QkVL49Fao7XIM{$J1;ZKGt>LQ<- zjkzj9`BK5A$*Kb-A*PJov>T0qbPyy{joQoF__L1ES%k-u8#Wv`b(wHZCB&=>yg*kK zK0{5vQUeU^T=Nb|!GIAg0_Ucuhp0QIK~e-y?@1$y%};&z=w4R#t7dV&DDF;V<1_>F z*Q(?q=1|{IK8$5A*m4J_gHfcD;q1uy`1uO4>>@KarSAm=T|6z9>i>9wy}%a+wzH}` z3lQ_{UZEOhT62U(+hfHry)^6wit_1=)^C7VS&DC5JBc-A@)0z9Z2Bp);JK>0(O-#$ z7z~Uph5JLdc zECQTCju19_Az$fh&XElhayvOBud91JHZ>stI>QGtT~syw0|N^faOGg)r7j-00k36& zlg4oH{N8M}nZ?2a z&HmQkxB98y<-1X(;}x@%?L+$=`5v;_8v6&oc@F7;ZNY7bGuYG=nwJ`EJhQBm|9S`2 zr!e{^F%eA2e}6~SvBM6w;eX~oqD|u-IHV)8>3^f@^qNm2M)kg$x|ty#UT3c$a+%MY z*;^&zSb2P%y{3pq;bBtl&JM!G7KVok?-cp=mKyldS&9=1-_db#ReLI4n_e0Uk1RVS z);&0JrrD}wn8)|nkFp-RhsjQraY`9#*9dQKN*oWWcQ(~dmprJT&rT{lI`^aJn6KBE ztxCypVc@8@h7Jto@=*PvUsDQW`TR@y_r|v_NVgfS$?Ck*^$k5PD%8MZCEQVK^t>?q 
zRSh4=I&9lyEYo^Zu9plh*4nsg#TGYO8g=}5(m8aGuZ(Q#c<>^Nf>rGzU*mq}BF`PB zzJKYPG)Q;Q2PTqj)D6{rZ*5bs6z>szOFSMbNay;q3IhW+QN0xtJq7`}0Sv)UHSXJ;q06{gZTs`?2jBl& z;NgG1)&NnQT~4)L2l8_rT($gsu;$bcO~7uh;kG_dsMqW*y2GJYu2vXTXe<dY4`wBLC>UgfPxy3AWyI5~m*=tCV3lO?tHPU&6#>W&M1cgUiw&--8F9 zW-f}C9Kom6HrFm|Z$BlwwwOolO5v;EPcx4-Hi?@)%1`o{3Q;noOUO+e%WJyiPr8qBgc8zmqJe`i3*yH=WSA;%h0dH2a2!USH($72e~rsPlm{3-|zOmK;95<&Qxjh zXeEc5tKsKcAC@j`P!a>vT2cL??8a&)M^VgPC)h68{!Kn|V&o4-nMeDJIijPFf(8G< z{o050cR`vwd-%Ay$14M-6U`RCd#Xl%MA@IPu$;%ackkNjJcU98vawp`UkUeE{^OV5 zhv?Y!m8axCznC4nN~r{W1;flF^a|>Dm8YQBJ$?FzyaF}XW_Pi9r*9|UpN!scOvTKl z_8isuu^d#4(EW0-lSWH@E`otPR2?a-L9AQ|ib|>VLH{$4W6~=TmsEat)r9NH$+^md-12zY{>qIT;neZpk5^aJLZ=Je z=JoZd$!mJd%(c$*YmX;5@DDXRh|;X9<;4<`gbkFAwmEja7!D58s|X_$H{>&0V;to8_|pv!_%T z#4;YH(g5Le9v*Mv;vtOMmYy{qGi-5#I$!GZqR?$qYn$^(>*BVoq{4n0XZw>zSZU5a zox-Nc=woaLGOh$X%2PR}lAE~xm)BKk=_NA95`-4u#7l{ZrOG%vk$=F5OPouRBw5px z!60QjH}{&fwD37EM0$~BnAc#PH{$niq44Vx&U0=<`mp6sg72nV;B`v=x0|ILA>oF^XpV;Gi7AB|V!)+F)QY@s%+VIbKyz0!t9(CqbD^>vt^GG>4&vUTQ`Oc3;CA;P3K7S!^s1x!* z#^d3`sleeR z<$C<8a<^iyjb-QqhL@Bh>pV<*w{GLyMV|$Bb4`MFv1x5zYcBirj49Xhum`gn?i+~e z*6ml0M5rh4D=L-)6_OSE!DFAm2P5P8pY@dcrluG$+#w_;21L>f$HUsH6y(pLff_Rt zdJW#(pP*FjaB&%nI3G$|g$`Zap}W{6GY?xjZz4WvsGmK-zvq>L+uZKzLO1ow5D$lO z+2$N)Djpp=T__NzBH#E$eFUdHR5x$!-s(~)a55NuFncOdbzSyzu1kxEJahLG2nI~3 zmUHb|JYpCagm7UhfA{X=z(C2h;LNcWmP>>n@o|qgF1b##^EB4wMyRku+&9(4S37tw6bu%C3SQcd-E+Mw3Y3N#1QGr@FAww)kQF{ zJ*H&jS+6viOd)tRw@Bd=IMrLy7GqQB?)SUO!@F!JV@4Le8GBx=05>bL-(k10T$gme#1&bbx*PHPJBduZ* z8A38S-x@KL+xpjJ_ckTPSL&Zie0cxlBz=K;`=oBJ5D(AbKy@+>ECj^ai2+E2b&v4M z>1ke<(#$jYVD7_HZ?4ihixCx;NDm zK-jZ?VVg6o3)qD%o42LpvT}0?^!N8O95{_O;>yopH39#YtTfffk9DqHyY05q3P%*L zG6!VZkTBHor%f;*P^&g3udFO4{|s*PFoBQ?kVCyFbP*XD;t{?h;QvsZQYUJaho$VlKpQ%nYAMIc78Iwu{vc&b zy4F~cR|}RW>KNccl#5#*dbivC{of=bAsW!Pu;}Wk!>T+Ef)WoGi~mjp&x5kr*1bQhe>KV3k2t3! ztj!GSb2TUGBmWGlhg;#pvuA(L3`W8>cNH!8wwz^?G^=N-=ACRYe|$*@F-96h4UxUN zGTZ0{5mD(j}czC|qy<9to#g0v>&0nuD~>kvrx4*H@PvfD4thwsOg? 
z2{5*09}u-Q;hgn~kz}i@^fUWq`XMMtd-`#8^n!*-O_kfZk!DTYriS>9Wq$E`lg|PW z*B?J_ucx>`ef+I0XIqxoZJ+SNq3-tMvVo!0>+963Y%ExU!N!pY)R9H$MW<_#tZF}R@>|A=xM9@ENQ!*%rZSg{>z7%8FquM zO*xh=bD@t;2f2ewSKR&$d^3GCG%YNcu&_RV4omRDtV7cJT9_AGT4iunOoC%RSeSy^ zONL0|A|eWUZ6u_6$J&%sT_Tv7Y=~tP7$6vR{_OYwkwj0Y?dxJPWdUGg&<*k8=P;noZ`M%=m= z(|KxLXxY_9lRn??wx^MM1c`peEc9#x|3rxOj9v!l%!OgHXLoQhRx{`+JMo2c0{ z>~LkdcvY9fSjCWPSRkYxW%x6aW?eluva3l7L>y){usVM(O#0~STU)fubz^WTw%G{6 zcd;=hTJ@s>fvCc*Az?L6|6^EOW7 zM)vjU|Jbx;ew^(tE+d1r=a%PI6$zuwc5$=nI>~)<1d+Qh=2dl1O4;;>&J({m7!w0hF1!f=)LyKt=Q7j~&pT}T zUdq8c`9Tn}#G==u>nME0tyVH!*2mV?6q>@yUY3-X5B#dO8f~t$A8S2r^p`dE*i^4M zyvMQd?pjzRS+z}7$_BY~ee2vpAI)&^7+l=R=2DP2SloTTI4x($gPsg(UKUR!w;hw) zxBr5|XX~yvJ8|@wU!s60oVVD=%Qe(kxk{n>p^s#^wsPEucUMPZcmF>dJ@B$>`LM7_ zl{=yXEPo4rCb*I@pOkpuz@UZ6j9}h~dE-yfLx=8F1V%l5>Y|nV=yin)ukY;=gVTTq zkspjT&wAqRA#+Hkm?PmrZ>O(6G%xF+@T;~rD(3YTwrsj!oj=&-R?WBnf}nm3xCj z^>Q)IVMDtU8XR);3a!l>AEz3WYH>Nuw>|aTM_Lmh2Vmef_p{%dh& z(Hmzc-;1WHSGL!Wcqja`52W3Pn`(XRQqeX~y!9!;V&@K;qylZWr8qhg zleRS(4_9!81qi5GnmPj2^`eyFp~5(I`ocm%&px#mQ}ap&vK%3i9!lQkwTkw7 zmEpDAq%_Nhdv@}AxhW}EwOufL4{HRoA;=jAbY2m-N-HYtZEeXC{DEp(e6+s(C~H#i z%1U-%o@s$q9IO?gV{vg!^&q&y9~9*7dfAMiR`bq0=emzZ3>OPxZdqB`U$628u+LZ> zEM;2X9v6Q5(*7!woQgj;1$ssM3I$#gPU$T2Dxv1-RFjE8$;MP}9pGTud;ThnxwXFb z{MJnZS=ST)^W-Js@U0Y6+P}j(q3gxh{|~7+?XuKro*e@Ij!u$*kh8fGcDj}Inq=*z zGWuUsbiE1WP!)mNZkDZ0sy(l`)5g)M;$B|ZZD|>HtJ;UDY`RJk-)ehR6b+iKwq=WC zTscfp8JK44xEv)>WR$Jwp+)=s8|p{9gG36g1+@eQCniD`Z^yhVv7HU9Q9Toq%66UJ z+&Sk1cnUYetv^Ppm}H4o?LPQAPF~<7CCM`O(`a_Q=~Jjv2l%~`(!Bmky2bO-V>-XG!>}Ad4c;XjbByuQM6yFkM< bytes: + +def encode_mcp_message(payload: dict) -> bytes: """Encode an MCP message for stdio transport.""" return (json.dumps(payload) + "\n").encode("utf-8") +``` + +This function converts a Python dictionary to a JSON string, adds a newline, and encodes it as bytes. +2. 
Add a function to read Docker's multiplexed stream format: -def _read_docker_frame(sock, timeout: float) -> bytes: +```python +def read_docker_frame(sock, timeout: float) -> bytes: """Read a Docker multiplexed frame from the socket.""" deadline = time.time() + timeout header = b"" + + # Read the 8-byte header while len(header) < 8: if time.time() > deadline: raise TimeoutError("Timed out waiting for docker frame header.") @@ -108,7 +140,7 @@ def _read_docker_frame(sock, timeout: float) -> bytes: continue header += chunk - # Docker frame format: 8-byte header + # Docker frame format: # byte 0: stream type (0x01 = stdout, 0x02 = stderr) # bytes 1-3: Reserved (\x00\x00\x00) # bytes 4-7: Payload size (big-endian uint32) @@ -126,17 +158,23 @@ def _read_docker_frame(sock, timeout: float) -> bytes: continue payload += chunk return payload +``` +3. Add a function to parse MCP JSON-RPC messages: -def _read_mcp_message(sock, timeout: float = 10.0) -> dict: +```python +def read_mcp_message(sock, timeout: float = 10.0) -> dict: """Read and parse an MCP JSON-RPC message.""" deadline = time.time() + timeout buffer = b"" + while True: if time.time() > deadline: raise TimeoutError("Timed out waiting for MCP response line.") - frame = _read_docker_frame(sock, timeout) + + frame = read_docker_frame(sock, timeout) buffer += frame + while b"\n" in buffer: line, buffer = buffer.split(b"\n", 1) if not line: @@ -144,6 +182,7 @@ def _read_mcp_message(sock, timeout: float = 10.0) -> dict: try: return json.loads(line.decode("utf-8")) except json.JSONDecodeError: + # Try to find JSON object in the line idx = line.find(b"{") if idx != -1: try: @@ -152,9 +191,15 @@ def _read_mcp_message(sock, timeout: float = 10.0) -> dict: continue ``` -## Write the main test function +Save `helpers.py`. You now have the communication utilities needed for testing. + +**Understanding the code**: The Docker socket uses a multiplexed format where each frame has an 8-byte header. 
The helper functions handle this low-level detail so your tests can focus on MCP logic. + +## Step 4: Write the test function -Create the main test function in `test_mcp.py` that uses testcontainers to manage the MCP server lifecycle: +Create the main test file `test_mcp.py`: + +1. Start with imports: ```python import os @@ -162,20 +207,30 @@ from pathlib import Path import pytest from testcontainers.core.container import DockerContainer from testcontainers.core.waiting_utils import wait_for_logs + import constants +from helpers import encode_mcp_message, read_mcp_message +``` + +2. Create the test function that starts the container: -def test_mcp_stdio_transport_responds(): +```python +def test_mcp_server_initializes(): + """Test that the MCP server starts and responds to initialization.""" image = os.getenv("MCP_IMAGE", constants.MCP_DOCKER_IMAGE) - repo_root = Path(__file__).resolve().parents[1] with ( DockerContainer(image) - .with_volume_mapping(str(repo_root), "/workspace") .with_kwargs(stdin_open=True, tty=False) ) as container: # Wait for MCP server to start wait_for_logs(container, "Starting MCP server", timeout=60) - + print("MCP server started successfully") +``` + +3. 
Add socket attachment and initialization: + +```python # Attach to container stdin/stdout socket_wrapper = container.get_wrapped_container().attach_socket( params={"stdin": 1, "stdout": 1, "stderr": 1, "stream": 1} @@ -183,101 +238,91 @@ def test_mcp_stdio_transport_responds(): raw_socket = socket_wrapper._sock raw_socket.settimeout(10) - # Initialize MCP session - raw_socket.sendall(_encode_mcp_message(constants.INIT_REQUEST)) - response = _read_mcp_message(raw_socket, timeout=20) + # Send initialize request + raw_socket.sendall(encode_mcp_message(constants.INIT_REQUEST)) + response = read_mcp_message(raw_socket, timeout=20) - # Verify initialization - assert response.get("id") == 1 - assert "result" in response - assert "serverInfo" in response["result"] + # Verify the response + assert response.get("id") == 1, "Response ID should match request ID" + assert "result" in response, "Response should contain result" + assert "serverInfo" in response["result"], "Result should contain serverInfo" + print(f"Server info: {response['result']['serverInfo']}") +``` + +4. 
Complete the initialization handshake: + +```python # Send initialized notification raw_socket.sendall( - _encode_mcp_message({ - "jsonrpc": "2.0", - "method": "initialized", + encode_mcp_message({ + "jsonrpc": "2.0", + "method": "initialized", "params": {} }) ) + print("MCP session initialized successfully") ``` -## Add tool-specific tests - -Extend the test function to verify individual MCP tools: - -```python - def _read_response(expected_id: int, timeout: float = 10.0) -> dict: - """Helper to read a specific response by ID.""" - deadline = time.time() + timeout - while time.time() < deadline: - message = _read_mcp_message(raw_socket, timeout=timeout) - if message.get("id") == expected_id: - return message - raise TimeoutError(f"Timed out waiting for response id={expected_id}.") - - # Test check_image tool - raw_socket.sendall(_encode_mcp_message(constants.CHECK_IMAGE_REQUEST)) - check_image_response = _read_response(2, timeout=60) - assert check_image_response.get("result")["structuredContent"] == \ - constants.EXPECTED_CHECK_IMAGE_RESPONSE - - # Test knowledge_base_search tool - raw_socket.sendall(_encode_mcp_message(constants.CHECK_NGINX_REQUEST)) - check_nginx_response = _read_response(4, timeout=60) - urls = json.dumps(check_nginx_response["result"]["structuredContent"]) - assert any( - expected in urls - for expected in constants.EXPECTED_CHECK_NGINX_RESPONSE - ) -``` +Save the file. You now have a basic test that verifies the MCP server starts and initializes correctly. 
-## Run the tests +## Step 5: Run your test -Execute the test suite using pytest: +Execute the test to verify your implementation: ```bash -python -m pytest -v mcp-local/tests/test_mcp.py +python -m pytest -v test_mcp.py ``` -The output shows each test assertion: +If successful, you see output similar to: ```output ============================= test session starts ============================== platform linux -- Python 3.11.0, pytest-8.0.0 collected 1 item -mcp-local/tests/test_mcp.py::test_mcp_stdio_transport_responds PASSED [100%] +test_mcp.py::test_mcp_server_initializes PASSED [100%] ============================== 1 passed in 45.32s ============================== ``` -For more verbose output that shows the test progress: +Use the `-s` flag to see print statements: ```bash -python -m pytest -s mcp-local/tests/test_mcp.py +python -m pytest -s test_mcp.py ``` -The `-s` flag displays print statements, showing each tool test as it completes. +## Step 6: Add a tool test (challenge) + +Now extend your test to verify an MCP tool. This is a hands-on challenge. + +**Your task**: Add code to your test function that: + +1. Sends the `CHECK_IMAGE_REQUEST` from your constants file +2. Reads the response +3. Verifies the response matches `EXPECTED_CHECK_IMAGE_RESPONSE` + +**Hints**: +- Use `raw_socket.sendall(encode_mcp_message(...))` to send requests +- Use `read_mcp_message(raw_socket, timeout=60)` to read responses (tool calls take longer) +- The response structure is `response["result"]["structuredContent"]` + +After attempting this yourself, you can compare your solution with the implementation in [mcp-local/tests/test_mcp.py](https://github.com/arm/mcp/blob/main/mcp-local/tests/test_mcp.py) -## How Testcontainers handle container lifecycle +## Troubleshooting -The `with DockerContainer(image) as container` pattern: +**Container fails to start**: Verify the Docker image exists by running `docker images arm-mcp`. -1. Pulls the image if not present locally. -2. 
Creates and starts a new container. -3. Waits for the "Starting MCP server" log message. -4. Yields the container for your test code. -5. Automatically stops and removes the container when the test completes. +**Timeout errors**: Increase the timeout values. The MCP server can take 30-60 seconds to initialize on first run. -This ensures every test run starts with a clean environment. +**Socket connection errors**: Ensure `stdin_open=True` is set in `with_kwargs()`. ## What you've accomplished and what's next In this section: -- You learned how MCP servers communicate using JSON-RPC over stdio. +- You built a test suite from scratch, understanding each component. +- You learnt how MCP servers communicate using JSON-RPC over stdio. - You created helper functions to handle Docker socket communication. -- You wrote integration tests that verify MCP tool responses. -- You ran the test suite locally using pytest. +- You wrote and ran integration tests using pytest. In the next section, you will configure GitHub Actions to run these tests automatically in your CI/CD pipeline. 
From 020c1f649842ab8fd3ed5ad1579da2dc58eea62a Mon Sep 17 00:00:00 2001 From: Jason Andrews Date: Tue, 10 Mar 2026 16:49:24 -0500 Subject: [PATCH 06/51] Final tech review of Testcontainers --- .../cross-platform/automate-mcp-with-testcontainers/_index.md | 2 +- .../run-testcontainers-example.md | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/_index.md b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/_index.md index cb9e4e21fb..15ce1d5fef 100644 --- a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/_index.md +++ b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/_index.md @@ -7,7 +7,7 @@ cascade: minutes_to_complete: 60 -who_is_this_for: This is an introductory topic for software developers and QA engineers who want to automate integration testing of MCP (Model Context Protocol) servers using Testcontainers and PyTest. +who_is_this_for: This is an introductory topic for software developers and QA engineers who want to automate integration testing of Model Context Protocol (MCP) servers using Testcontainers and PyTest. 
learning_objectives: - Set up Testcontainers with PyTest for containerized testing of MCP servers diff --git a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/run-testcontainers-example.md b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/run-testcontainers-example.md index 5cf9c3badb..78ef07faba 100644 --- a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/run-testcontainers-example.md +++ b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/run-testcontainers-example.md @@ -42,7 +42,9 @@ When the `with` block exits, Testcontainers automatically stops and removes the ## Verify container architecture -Since you're running on an Arm machine, verify the container uses the correct architecture: +Since you're running on an Arm machine, verify the container uses the correct architecture. + +Create a file `verify_arch.py` with code below: ```python from testcontainers.core.container import DockerContainer From 33681259fac8b3a20fd36393c795a4d8c121137a Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Wed, 11 Mar 2026 14:15:03 -0400 Subject: [PATCH 07/51] Update draft status for NXP FRDM i.MX 93 guide Removed draft status from the NXP FRDM i.MX 93 board guide. 
--- .../embedded-and-microcontrollers/linux-nxp-board/_index.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/content/learning-paths/embedded-and-microcontrollers/linux-nxp-board/_index.md b/content/learning-paths/embedded-and-microcontrollers/linux-nxp-board/_index.md index d39d381bd6..6ec6c59a9e 100644 --- a/content/learning-paths/embedded-and-microcontrollers/linux-nxp-board/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/linux-nxp-board/_index.md @@ -1,9 +1,5 @@ --- title: Use Linux on the NXP FRDM i.MX 93 board - -draft: true -cascade: - draft: true minutes_to_complete: 120 From 91271bfb636d89ad5c7c135ee031fca3d87d2aff Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Wed, 11 Mar 2026 14:28:09 -0400 Subject: [PATCH 08/51] Revise introduction for DGX Spark and Isaac tools Enhanced the introduction section to clarify the capabilities of the DGX Spark system and the workflow of Isaac Sim and Isaac Lab. Improved descriptions of the tools and their integration for robotic simulation and reinforcement learning. --- .../1_introduction_isaac.md | 44 ++++++++++--------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/1_introduction_isaac.md b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/1_introduction_isaac.md index 89b55e8863..230702e538 100644 --- a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/1_introduction_isaac.md +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/1_introduction_isaac.md @@ -8,17 +8,20 @@ layout: learningpathall ## Overview -In this Learning Path, you will build, configure, and run robotic simulation and [reinforcement learning (RL)](https://en.wikipedia.org/wiki/Reinforcement_learning) workflows using NVIDIA Isaac Sim and Isaac Lab on an Arm-based DGX Spark system. 
The NVIDIA DGX Spark is a personal AI supercomputer powered by the GB10 [Grace Blackwell](https://learn.arm.com/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/1_gb10_introduction/) Superchip, combining an Arm-based Grace CPU with a Blackwell GPU in a compact desktop form factor. +In this Learning Path, you will build, configure, and run robotic simulation and [reinforcement learning (RL)](https://en.wikipedia.org/wiki/Reinforcement_learning) workflows using NVIDIA Isaac Sim and Isaac Lab on an Arm-based DGX Spark system. The NVIDIA DGX Spark is a personal AI supercomputer powered by the GB10 [Grace Blackwell](https://learn.arm.com/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/1_gb10_introduction/) Superchip. The system couples an Arm CPU cluster with a Blackwell GPU and a unified memory architecture to accelerate simulation orchestration, sensor preprocessing, physics, rendering, and RL training. -Isaac Sim and Isaac Lab are NVIDIA's core tools for robotics simulation and reinforcement learning. Together they provide an end-to-end pipeline: simulate robots in physically accurate environments, train control policies using reinforcement learning, and evaluate those policies before deploying them to real hardware. - -This section introduces both tools and explains why the DGX Spark platform is an ideal development environment for these workloads. +NVIDIA's Isaac Sim and Isaac Lab tools together provide an end-to-end robotics development workflow: + 1. Simulate robots in physically realistic environments. + 2. Train control policies using reinforcement learning. + 3. Evaluate trained policies before deployment to physical robots. + +This section introduces both tools and explains how DGX Spark supports high-performance robotic simulation and RL experimentation. ## What is Isaac Sim? -[Isaac Sim](https://docs.isaacsim.omniverse.nvidia.com/latest/index.html) is a robotics simulation platform built on NVIDIA Omniverse. 
It provides GPU-accelerated physics simulation to enable fast, realistic robot simulations that can run faster than real time. +[Isaac Sim](https://docs.isaacsim.omniverse.nvidia.com/latest/index.html) is a robotics simulation platform built on NVIDIA Omniverse. It provides GPU-accelerated physics and rendering to enable high-fidelity robot simulation. -Key capabilities of Isaac Sim include: +Core capabilities include: | **Capability** | **Description** | |----------------|-----------------| @@ -28,7 +31,7 @@ Key capabilities of Isaac Sim include: | Parallel environments | Run thousands of simulation instances simultaneously on a single GPU for massive data throughput | | Python API | Full programmatic control of scenes, robots, and simulations through Python scripting | -Isaac Sim enables you to create detailed virtual worlds where robots can learn, be tested, and be validated without the cost, time, or risk of physical experiments. +Isaac Sim lets you prototype and validate robot behavior in a controlled virtual environment before physical testing. ## What is Isaac Lab? 
@@ -38,10 +41,10 @@ Isaac Lab supports two task design workflows: | **Workflow** | **Description** | **Best for** | |--------------|-----------------|--------------| -| Manager-Based | Modular design where observations, actions, rewards, and terminations are defined through separate manager classes | Structured environments with reusable components | +| Manager-Based | Modular environment components (observations, rewards, terminations) defined through separate manager classes | Structured environments with reusable components | | Direct | A single class defines the entire environment logic, similar to traditional Gymnasium environments | Rapid prototyping and full control over environment logic | -Isaac Lab includes out-of-the-box integration with multiple reinforcement learning libraries: +Isaac Lab integrates with multiple reinforcement learning libraries, including: | **RL Library** | **Supported Algorithms** | |----------------|--------------------------| @@ -50,7 +53,7 @@ Isaac Lab includes out-of-the-box integration with multiple reinforcement learni | skrl | PPO, IPPO, MAPPO, AMP (Adversarial Motion Priors) | | Stable Baselines3 (sb3) | PPO | -In this Learning Path you will use the **RSL-RL** library, which is a lightweight and efficient PPO implementation commonly used for locomotion tasks. +In this Learning Path, you will use RSL-RL, a lightweight and efficient PPO implementation commonly used for locomotion training. ## Why DGX Spark for robotic simulation? @@ -68,18 +71,18 @@ Traditional robotics development requires separate machines for simulation, trai ## How Isaac Sim and Isaac Lab work together -The following describes the typical workflow when using Isaac Sim and Isaac Lab together on DGX Spark: +A typical robotics workflow using Isaac Sim and Isaac Lab on DGX Spark follows these steps: -1. **Define the environment**: Isaac Lab provides pre-built environment configurations for common tasks (locomotion, manipulation, navigation). 
You can also create custom environments. -2. **Launch the simulation**: Isaac Sim initializes the physics engine, loads robot models (URDF/USD), and sets up the scene on the Blackwell GPU. +1. **Define the environment**: Isaac Lab provides pre-built environment configurations for common tasks (locomotion, manipulation, navigation). You can also create custom environments tailored to specific robots or tasks. +2. **Launch the simulation**: Isaac Sim initializes the physics engine, loads the robot models (URDF/USD), and constructs the simulation scene. Physics simulation and rendering run on the Blackwell GPU. 3. **Train a policy**: Isaac Lab's training scripts use RL algorithms (such as PPO via RSL-RL) to optimize a neural network policy. The GPU runs thousands of parallel environments simultaneously. 4. **Evaluate and iterate**: Trained policies can be tested in simulation with visualization enabled or exported for deployment to real hardware. -The entire pipeline runs locally on DGX Spark. Headless mode (without visualization) maximizes GPU utilization for training, while visualization mode lets you inspect robot behavior interactively. +The full workflow can run locally on a DGX Spark system. Running Isaac Sim in headless mode (without visualization) maximizes GPU utilization for training, while enabling visualization allows interactive inspection of robot behavior during debugging or validation. 
## Available environment categories -Isaac Lab ships with a comprehensive set of pre-built environments across several categories: +Isaac Lab includes a large set of pre-built environments organized by task type: | **Category** | **Examples** | **Description** | |--------------|-------------|-----------------| @@ -90,13 +93,12 @@ Isaac Lab ships with a comprehensive set of pre-built environments across severa | Navigation | Anymal C navigation | Point-to-point navigation with heading control | | Multi-agent | Cart-Double-Pendulum, Shadow-Hand-Over | Tasks that require coordination among multiple agents | -You will be able to list all available environments after setting up Isaac Lab in the next section. The following command will be available once installation is complete: +After installing Isaac Lab in the next section, you can list the available environments using: ```bash ./isaaclab.sh -p scripts/environments/list_envs.py ``` - -You can also filter by keyword: +You can also filter environments by keyword. For example, to list locomotion environments: ```bash ./isaaclab.sh -p scripts/environments/list_envs.py --keyword locomotion @@ -106,11 +108,11 @@ For the complete list of environments, see the [Isaac Lab Available Environments ## What you will accomplish in this Learning Path -In the learning path that follow you will: +In this Learning Path you will: 1. **Set up Isaac Sim and Isaac Lab** on your DGX Spark by building both tools from source 2. **Run a basic robot simulation** in Isaac Sim and interact with it through Python 3. **Train a reinforcement learning policy** for the Unitree H1 humanoid robot on rough terrain using RSL-RL -4. **Explore advanced RL scenarios** including diverse locomotion tasks and robot configurations +4. **Explore additional RL environments** to understand how the workflow generalizes to other robots and tasks. 
-By the end, you will have a fully functional Isaac Sim and Isaac Lab development environment on DGX Spark and hands-on experience with the complete robotics RL pipeline. +By the end of the Learning Path, you will have a working Isaac Sim and Isaac Lab development environment on DGX Spark and practical experience running a complete robotics reinforcement learning pipeline. From 824a1d945cc34ade162f6c8a9495f9a2dc251702 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Wed, 11 Mar 2026 14:53:52 -0400 Subject: [PATCH 09/51] Update 2_isaac_installation.md --- .../2_isaac_installation.md | 85 ++++++++++++------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/2_isaac_installation.md b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/2_isaac_installation.md index 5fdaef4a60..18703657f8 100644 --- a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/2_isaac_installation.md +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/2_isaac_installation.md @@ -8,13 +8,19 @@ layout: learningpathall ## Set up your development environment -Before running robotic simulations and reinforcement learning tasks, you need to build Isaac Sim and Isaac Lab from source on your DGX Spark system. This section walks you through verifying your system, installing dependencies, building Isaac Sim, and then setting up Isaac Lab on top of it. +Before running robotic simulations and reinforcement learning workloads, you need to prepare your DGX Spark development environment and install the dependencies required for Isaac Sim and Isaac Lab. -The build process takes approximately 15-20 minutes on the Grace CPU and requires around 50 GB of available disk space. 
+In this section you will: + * Verify the DGX Spark system configuration + * Install required build dependencies + * Build and configure Isaac Sim + * Set up Isaac Lab on top of the Isaac Sim environment + +The full setup typically takes 15–20 minutes on a DGX Spark system and requires approximately 50 GB of available disk space. ## Step 1: Verify your system -Start by confirming that your DGX Spark system has the required hardware and software configuration. +Begin by confirming that the DGX Spark system has the expected hardware and software configuration. Check the CPU architecture: @@ -31,8 +37,9 @@ Architecture: aarch64 CPU(s): 20 On-line CPU(s) list: 0-19 ``` +The Architecture field should report aarch64, indicating that the system is running on Arm. -Verify the Blackwell GPU is recognized: +Check that the Blackwell GPU is detected by the NVIDIA driver: ```bash nvidia-smi @@ -49,6 +56,7 @@ You will see output similar to: | 0 NVIDIA GB10 On | 0000000F:01:00.0 Off | N/A | +-----------------------------------------+------------------------+----------------------+ ``` +The GPU name should appear as NVIDIA GB10, confirming that the Grace–Blackwell GPU is available. Confirm the CUDA toolkit is installed: @@ -68,21 +76,25 @@ Isaac Sim requires GCC/G++ 11, Git LFS, and CUDA 13.0 or later. If any of these ## Step 2: Install GCC 11 and Git LFS -Isaac Sim requires GCC/G++ version 11 for compilation. Install it and set it as the default compiler: +Isaac Sim requires GCC/G++ version 11 when building components from source. Install the required compiler version and configure it as the system default. +Update the package index and install the GCC 11 toolchain: ```bash sudo apt update && sudo apt install -y gcc-11 g++-11 +``` +Register GCC 11 as the default compiler using update-alternatives. 
This allows multiple compiler versions to coexist while prioritizing GCC 11 for builds: +```bash sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 200 sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 200 ``` -Install Git LFS, which is needed to pull large binary assets from the Isaac Sim repository: +Next, install Git LFS (Large File Storage). Isaac Sim repositories use Git LFS to manage large binary assets such as models and simulation data. ```bash sudo apt install -y git-lfs ``` -Verify both installations: +After installation, verify that the compiler and Git LFS are available: ```bash gcc --version @@ -90,33 +102,36 @@ g++ --version git lfs version ``` -The GCC output should show version 11.x. Git LFS should report a version number confirming it is installed. +The gcc and g++ commands should report version 11.x, and git lfs version should display the installed Git LFS version. ## Step 3: Clone and build Isaac Sim -Clone the Isaac Sim repository from GitHub. The `--depth=1` flag creates a shallow clone to reduce download time, and `--recursive` fetches all submodules: +Next, download the Isaac Sim source repository and its required assets. +Start by cloning the repository. The --depth=1 option performs a shallow clone to reduce download size, and --recursive ensures all required submodules are fetched. ```bash cd ~ git clone --depth=1 --recursive https://github.com/isaac-sim/IsaacSim cd IsaacSim +``` +Isaac Sim stores large simulation assets (such as USD environments, textures, and prebuilt components) using Git Large File Storage (LFS). Initialize Git LFS and download the required assets: +```bash git lfs install git lfs pull ``` {{% notice Note %}} -The Git LFS pull downloads several gigabytes of simulation assets (USD files, textures, and pre-built libraries). Ensure you have a stable network connection. +The Git LFS download retrieves several gigabytes of simulation assets. 
Ensure you have a stable internet connection and sufficient disk space before running this step. {{% /notice %}} -Build Isaac Sim by running the build script. This compiles the simulation engine and all its components: +Once the repository and assets are downloaded, build Isaac Sim using the provided build script: ```bash ./build.sh ``` +By default, the build uses all available CPU cores on the Grace processor. On DGX Spark, compilation typically takes 10-15 minutes. -The build uses all available CPU cores on the Grace processor. On DGX Spark, compilation typically takes 10-15 minutes. - -When the build succeeds, you will see output similar to: +When the build finishes successfully, you will see output similar to: ```output BUILD (RELEASE) SUCCEEDED (Took 674.39 seconds) @@ -124,14 +139,15 @@ BUILD (RELEASE) SUCCEEDED (Took 674.39 seconds) ## Step 4: Set Isaac Sim environment variables -After the build completes, configure your shell to recognize the Isaac Sim installation. Run the following commands from inside the `IsaacSim` directory: +After the build completes, configure environment variables so that your shell can locate the Isaac Sim binaries and Python runtime. +Navigate to the IsaacSim directory if you are not already there, then export the following variables: ```bash export ISAACSIM_PATH="${PWD}/_build/linux-aarch64/release" export ISAACSIM_PYTHON_EXE="${ISAACSIM_PATH}/python.sh" ``` -The table below explains each variable: +These variables are used by Isaac Lab and other tools to locate the Isaac Sim runtime. 
| **Variable** | **Purpose** | |--------------|-------------| @@ -139,44 +155,56 @@ The table below explains each variable: | `ISAACSIM_PYTHON_EXE` | References the Python wrapper script that runs Python with Isaac Sim's dependencies preloaded | {{% notice Tip %}} -Add these `export` lines to your `~/.bashrc` file so they persist across terminal sessions: - +To make these environment variables persist across terminal sessions, add them to your shell configuration file. +Run the following commands: ```bash echo 'export ISAACSIM_PATH="$HOME/IsaacSim/_build/linux-aarch64/release"' >> ~/.bashrc echo 'export ISAACSIM_PYTHON_EXE="${ISAACSIM_PATH}/python.sh"' >> ~/.bashrc source ~/.bashrc ``` +After this step, the variables will be available automatically whenever you open a new terminal. + {{% /notice %}} ## Step 5: Validate the Isaac Sim build -Launch Isaac Sim to verify the build was successful. The `LD_PRELOAD` setting resolves a library compatibility issue on aarch64: +Launch Isaac Sim to verify the build was successful. On some aarch64 systems, Isaac Sim may require preloading the GNU OpenMP runtime (libgomp) to avoid library compatibility issues. Setting the LD_PRELOAD environment variable ensures the correct library is loaded before Isaac Sim starts. +Run the following command to launch Isaac Sim: ```bash export LD_PRELOAD="$LD_PRELOAD:/lib/aarch64-linux-gnu/libgomp.so.1" ${ISAACSIM_PATH}/isaac-sim.sh ``` -If the build is correct, Isaac Sim opens its viewer window (or starts in headless mode if no display is available). You should see initialization messages confirming that the Blackwell GPU is detected and the physics engine is ready. +If the installation is correct, Isaac Sim opens its viewer window (or starts in headless mode if no display is available). During startup, the console output should report initialization of the Blackwell GPU and the physics simulation engine. 
+ +Once you confirm that Isaac Sim starts successfully, stop the application by pressing: +`Ctrl + C` -Press `Ctrl+C` in the terminal to close Isaac Sim after verifying it starts successfully. +This returns you to the terminal and confirms that the build and runtime environment are functioning correctly. ## Step 6: Clone and install Isaac Lab -With Isaac Sim successfully built and validated, you can now set up Isaac Lab to enable RL training workflows. Clone the repository into your home directory: +After confirming that Isaac Sim runs correctly, you can install Isaac Lab, which provides the reinforcement learning environments and training pipelines used in this learning path. +Start by cloning the Isaac Lab repository into your home directory: ```bash cd ~ git clone --recursive https://github.com/isaac-sim/IsaacLab cd IsaacLab ``` +Isaac Lab expects to locate an Isaac Sim installation in a directory named `_isaac_sim` inside the repository. Instead of copying files, create a symbolic link pointing to the Isaac Sim build directory that you configured earlier. -Create a symbolic link so Isaac Lab can find your Isaac Sim installation: +First confirm that the ISAACSIM_PATH variable is set: ```bash echo "ISAACSIM_PATH=$ISAACSIM_PATH" +``` +Then create the symbolic link +```bash ln -sfn "${ISAACSIM_PATH}" "${PWD}/_isaac_sim" ``` +This links the Isaac Lab repository to the Isaac Sim installation that was built in the previous steps. Verify the symbolic link is correct: @@ -186,7 +214,7 @@ ls -l "${PWD}/_isaac_sim/python.sh" You should see the symlink pointing to your Isaac Sim build directory. 
-Install Isaac Lab and all its dependencies: +Next, install Isaac Lab and its Python dependencies: ```bash ./isaaclab.sh --install @@ -196,14 +224,13 @@ This command installs the Isaac Lab Python packages, RL libraries (RSL-RL, rl_ga ## Step 7: Validate the Isaac Lab installation -Verify that Isaac Lab is installed correctly by listing the available RL environments: +Confirm that Isaac Lab is installed correctly by listing the available RL environments: ```bash export LD_PRELOAD="$LD_PRELOAD:/lib/aarch64-linux-gnu/libgomp.so.1" ./isaaclab.sh -p scripts/environments/list_envs.py ``` - -You should see a list of available environments, including entries such as: +If the installation is successful, the command prints a list of available environments. The output will include entries similar to: ```output Isaac-Cartpole-v0 @@ -225,9 +252,9 @@ In this section you have: - Verified your DGX Spark system has the required Grace CPU, Blackwell GPU, and CUDA 13 environment - Installed GCC 11 and Git LFS as build prerequisites -- Cloned and built Isaac Sim from source, producing aarch64-optimized binaries for the Grace-Blackwell platform +- Cloned and built Isaac Sim, producing binaries for the aarch64 Grace–Blackwell platform - Configured environment variables so Isaac Lab can locate the Isaac Sim installation - Cloned and installed Isaac Lab with all RL library dependencies - Validated both installations by launching Isaac Sim and listing available environments -Your development environment is now fully configured for robot simulation and RL workflows. In the next module, you will run your first robot simulation and begin interacting with Isaac Sim through Python scripts. +Your development environment is now fully configured for robot simulation and RL workflows. In the next section, you will run your first robot simulation and begin interacting with Isaac Sim through Python scripts. 
From 71dc90d363e2773321871152c0f2982d9fbf893d Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Wed, 11 Mar 2026 15:07:20 -0400 Subject: [PATCH 10/51] Refine documentation for Isaac Lab robot simulation Updated the documentation for the Isaac Lab robot simulation tutorial, enhancing clarity and detail in the descriptions of various steps and concepts. --- .../3_isaac_small_project.md | 41 +++++++++++-------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/3_isaac_small_project.md b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/3_isaac_small_project.md index 28511d9657..720d8626a5 100644 --- a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/3_isaac_small_project.md +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/3_isaac_small_project.md @@ -8,9 +8,9 @@ layout: learningpathall ## Deploy a basic robot simulation -With Isaac Sim and Isaac Lab installed, you can now run your first robot simulation. In this section you will launch a pre-built simulation scene, interact with it programmatically, and understand the key concepts behind Isaac Sim's simulation loop. +With Isaac Sim and Isaac Lab installed, you can now run your first robot simulation. In this section you will launch a pre-built simulation scene, interact with it programmatically, and explore the key concepts behind Isaac Sim's simulation loop. -You will work with the Cartpole environment, a classic control benchmark where a cart must balance a pole by applying horizontal forces. This environment is simple enough to understand quickly but demonstrates all the core simulation concepts required for more complex robotics tasks. +The example environment used here is Cartpole, a classic control benchmark in which a cart must balance an upright pole by applying horizontal forces. 
Although simple, this environment demonstrates the core mechanics of simulation environments used in robotics and reinforcement learning. ## Step 1: Launch a sample scene from Isaac Lab @@ -24,14 +24,15 @@ export LD_PRELOAD="$LD_PRELOAD:/lib/aarch64-linux-gnu/libgomp.so.1" This script creates an empty simulation world with a ground plane and default lighting. It validates that the Isaac Sim rendering and physics engines are working on your DGX Spark system. -If a display is connected, a viewer window should open; otherwise, log messages will confirm that the simulation initialized successfully in headless mode. +If a display is available, a viewer window opens showing the simulation scene. On systems without a graphical display, the simulation runs in headless mode, and initialization messages appear in the terminal. Press `Ctrl+C` to exit the simulation. ## Step 2: Spawn and simulate a robot -Next, run a more complete example that spawns articulated robots into the scene. This tutorial demonstrates how Isaac Sim handles multi-body physics: - +Next, run a tutorial that loads an articulated robot into the simulation and advances the physics engine. +This example demonstrates how Isaac Sim handles multi-body dynamics, including loading robot assets, configuring actuators, and stepping the physics simulation. +Run the following command: ```bash ./isaaclab.sh -p scripts/tutorials/01_assets/run_articulation.py ``` @@ -46,7 +47,10 @@ This script loads a robot model, advances the physics simulation, and prints joi ## Step 3: Run the Cartpole environment -Now run a complete environment that combines scene, action, observation, and event managers. The `create_cartpole_base_env.py` tutorial creates a Cartpole base environment and applies random actions: +Next, run a complete Isaac Lab environment that combines a simulation scene with environment management components such as action, observation, and event managers. 
+ +The `create_cartpole_base_env.py` tutorial creates a Cartpole environment and applies random actions to the cart. Running multiple environments in parallel allows reinforcement learning algorithms to collect experience more efficiently. +Run the following command: ```bash ./isaaclab.sh -p scripts/tutorials/03_envs/create_cartpole_base_env.py --num_envs 32 @@ -62,7 +66,8 @@ This tutorial script uses a hardcoded `CartpoleEnvCfg` configuration. It does no ## Step 4: Run the Cartpole RL environment -The previous script creates a base environment without rewards or terminations. To see the full RL environment (with reward computation and episode resets), run: +The previous tutorial created a base simulation environment that advances physics and applies actions but does not include reinforcement learning components such as rewards or episode termination. +To run the full reinforcement learning version of the environment, execute the following command: ```bash ./isaaclab.sh -p scripts/tutorials/03_envs/run_cartpole_rl_env.py --num_envs 32 @@ -70,7 +75,7 @@ The previous script creates a base environment without rewards or terminations. This script wraps the Cartpole scene in a `ManagerBasedRLEnv`, which includes reward computation, termination conditions, and the standard Gymnasium `step()` interface that returns `(obs, reward, terminated, truncated, info)`. -The key difference between the two scripts: +Key differences between the base and RL environments: | **Script** | **Environment type** | **Returns from step()** | |-----------|---------------------|------------------------| @@ -79,11 +84,11 @@ The key difference between the two scripts: ## Step 5: Understand the simulation code -To understand what happens inside an Isaac Lab environment, examine the Cartpole environment source code. The key elements are: +To better understand how Isaac Lab environments operate, examine the Cartpole environment source code. 
Isaac Lab environments are typically defined through configuration classes that specify the scene layout, action interfaces, observation space, and environment events. ### Environment configuration -Every Isaac Lab environment starts with a configuration class that defines the simulation parameters. The `CartpoleEnvCfg` in the tutorial specifies: +Every Isaac Lab environment starts with a configuration class that defines the simulation parameters. In the Cartpole tutorial, the `CartpoleEnvCfg` configuration specifies the scene layout and simulation timing: ```python @configclass @@ -103,7 +108,7 @@ class CartpoleEnvCfg(ManagerBasedEnvCfg): self.sim.dt = 0.005 # sim step every 5ms: 200Hz ``` -The table below explains each parameter: +The table below summarizes the key parameters: | **Parameter** | **Value** | **Description** | |---------------|-----------|-----------------| @@ -114,7 +119,7 @@ The table below explains each parameter: ### Actions, observations, and events -The configuration defines three manager groups: +Isaac Lab environments organize functionality into manager groups that define how the agent interacts with the simulation. **Actions** — how the agent controls the robot: @@ -158,12 +163,15 @@ class EventCfg: reset_cart_position = EventTerm(func=mdp.reset_joints_by_offset, mode="reset", ...) reset_pole_position = EventTerm(func=mdp.reset_joints_by_offset, mode="reset", ...) ``` - -Events introduce variability that improves training robustness. Randomizing the pole mass on startup means the agent must learn to balance poles of different weights. Randomizing joint positions on reset ensures each episode starts from a different state. +Events introduce controlled randomness into the environment. +For example: + * The pole mass is randomized during initialization + * Cart and pole positions are randomized on reset +This variability helps the trained policy generalize to slightly different system dynamics. 
### The simulation loop -The core simulation loop in Isaac Lab follows a standard Gymnasium-style interface. From `run_cartpole_rl_env.py`: +The core simulation loop in Isaac Lab follows a standard Gymnasium-style interface. The example below is taken from `run_cartpole_rl_env.py`: ```python # Create the RL environment @@ -200,8 +208,9 @@ All computations happen in parallel across all environments using PyTorch tensor ## Step 6: Run with headless mode -For reinforcement learning tasks, headless mode is preferred to maximize GPU throughput. You can test it now using the Cartpole RL environment. +For reinforcement learning workflows, it is common to run Isaac Sim without rendering. Disabling the viewer allows more GPU resources to be used for physics simulation and neural network computation. +You can test headless execution using the Cartpole RL environment: ```bash ./isaaclab.sh -p scripts/tutorials/03_envs/run_cartpole_rl_env.py --num_envs 64 --headless ``` From 51bb3bb77f9cca8e4f935434f85f6256315359d8 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Wed, 11 Mar 2026 15:26:45 -0400 Subject: [PATCH 11/51] Refine training section for Isaac Lab and RSL-RL Enhanced clarity and detail in the training workflow for reinforcement learning policy using Isaac Lab and RSL-RL. Added specific stages of the RL training pipeline and improved explanations of training parameters and evaluation. 
---
 .../dgx_spark_isaac_robotics/4_isaac_rfl.md   | 61 +++++++++++--------
 1 file changed, 36 insertions(+), 25 deletions(-)

diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/4_isaac_rfl.md b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/4_isaac_rfl.md
index 8746a0a4fe..4e43864e34 100644
--- a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/4_isaac_rfl.md
+++ b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/4_isaac_rfl.md
@@ -8,7 +8,11 @@ layout: learningpathall
 
 ## Train a reinforcement learning policy using Isaac Lab and RSL-RL
 
-In this section you will train a reinforcement learning (RL) policy for the [Unitree] (https://www.unitree.com/) H1 humanoid robot to walk over rough terrain. You will use Isaac Lab's RSL-RL integration, which implements the Proximal Policy Optimization (PPO) algorithm. By the end of this section you will understand the full training pipeline, including task configuration, PPO hyperparameters, and policy evaluation.
+In this section you will train a reinforcement learning (RL) policy for the [Unitree](https://www.unitree.com/) H1 humanoid robot to walk over rough terrain. The training workflow uses Isaac Lab’s integration with the RSL-RL library, which implements the Proximal Policy Optimization (PPO) algorithm. This integration connects Isaac Sim’s physics simulation with an efficient RL training pipeline. By the end of this section you will understand the key stages of the RL training pipeline, including:
+ * Task configuration and environment selection
+ * PPO training parameters and rollout collection
+ * Monitoring training progress
+ * Evaluating the trained policy in simulation
 
 ## What is RSL-RL?
@@ -19,11 +23,10 @@ RSL-RL (Robotic Systems Lab Reinforcement Learning) is a lightweight RL library - Asymmetric actor-critic support (the critic can observe more than the actor) - Minimal dependencies and tight integration with Isaac Lab -Isaac Lab provides ready-to-use training scripts for RSL-RL under `scripts/reinforcement_learning/rsl_rl/`. +Isaac Lab includes ready-to-use training scripts for RSL-RL under `scripts/reinforcement_learning/rsl_rl/`. ## Step 1: Understand the training task - -The task you will train is **Isaac-Velocity-Rough-H1-v0**. This is a locomotion task where the [Unitree H1](https://www.unitree.com/h1/) humanoid robot must track a velocity command while navigating rough terrain. +In this section you will train the **Isaac-Velocity-Rough-H1-v0** environment. This is a locomotion task where the [Unitree H1](https://www.unitree.com/h1/) humanoid robot must track a velocity command while navigating rough terrain. The task details are: @@ -44,7 +47,7 @@ This setup provides a high-dimensional control problem ideal for testing locomot ## Step 2: Launch the training -Navigate to the Isaac Lab directory and start training in headless mode for maximum performance: +Navigate to the Isaac Lab directory and start the training job. Running in headless mode disables visualization so that more GPU resources can be used for physics simulation and neural network computation: ```bash cd ~/IsaacLab @@ -54,9 +57,9 @@ export LD_PRELOAD="$LD_PRELOAD:/lib/aarch64-linux-gnu/libgomp.so.1" --headless ``` -Once the training starts, you will see log messages reporting iteration progress, rewards, and performance statistics. - -``` +Once training begins, the terminal displays iteration progress, reward statistics, and performance metrics. 
+Example output: +```output Learning iteration 15/3000 Computation: 65955 steps/s (collection: 1.256s, learning 0.235s) @@ -92,13 +95,17 @@ Metrics/base_velocity/error_vel_yaw: 0.4705 ETA: 01:21:49 ``` - -This command launches the training with default hyperparameters. The Blackwell GPU runs thousands of parallel H1 environments simultaneously while the Grace CPU handles logging and orchestration. +During training: + * The Blackwell GPU accelerates physics simulation, neural network inference, and PPO training updates. + * The Grace CPU manages environment orchestration, logging, and experiment control. + * Multiple simulation environments run in parallel, enabling efficient rollout collection for reinforcement learning. + +Each PPO iteration collects experience from all parallel environments, then updates the policy network using the gathered trajectories. {{% notice Warning %}} **Known issue: NVRTC GPU architecture error on DGX Spark** -When running RL training on the Blackwell GPU (GB10, compute capability 12.1), you may encounter: +When running RL training on the Blackwell GPU (GB10, compute capability 12.1), you may encounter an error similar to: ``` RuntimeError: nvrtc: error: invalid value for --gpu-architecture (-arch) @@ -108,11 +115,12 @@ This error occurs because the NVRTC runtime compiler inside PyTorch does not yet **Workaround**: Make sure you are using the Isaac Sim build from source (as described in the setup section) rather than a pip-installed version. The source build includes the correct CUDA 13 runtime for Blackwell. If the error persists, try running with `--headless` mode, which avoids some NVRTC code paths used by the renderer. Also ensure your NVIDIA driver is up to date (`nvidia-smi` should show driver 580.x or later). -This issue is expected to be resolved in future Isaac Sim and PyTorch releases with full Blackwell support. +Support for Blackwell GPUs is expected to improve in upcoming PyTorch and Isaac Sim releases. 
{{% /notice %}} +### Adjusting training parameters You can also override default parameters from the command line: - +For example: ```bash ./isaaclab.sh -p scripts/reinforcement_learning/rsl_rl/train.py \ --task=Isaac-Velocity-Rough-H1-v0 \ @@ -121,7 +129,6 @@ You can also override default parameters from the command line: --max_iterations=1500 \ --seed=42 ``` - ### Command-line arguments The following table explains the key command-line arguments: @@ -134,8 +141,10 @@ The following table explains the key command-line arguments: | `--max_iterations` | 1500 | Total number of PPO training iterations. Each iteration collects a batch of experience and updates the policy | | `--seed` | 0 | Random seed for reproducibility. Set this to get deterministic results across runs | +Each environment runs an independent simulation of the robot, allowing the RL algorithm to collect experience efficiently. + {{% notice Tip %}} -On DGX Spark, 2048 to 4096 parallel environments work well for locomotion tasks. Higher values increase sample throughput but require more GPU memory. Start with 2048 if you want faster iteration cycles during development. +On DGX Spark, 2048 - 4096 parallel environments typically work well for locomotion tasks. Higher values increase sample throughput but require more GPU memory. Start with 2048 if you want faster iteration cycles during development. {{% /notice %}} ## Step 3: Understand the PPO hyperparameters @@ -177,14 +186,13 @@ PPO (Proximal Policy Optimization) is the RL algorithm used by RSL-RL. Understan | `save_interval` | `50` | Save a model checkpoint every N iterations. Useful for resuming training or evaluating intermediate policies | ### How the hyperparameters interact - -The total amount of experience collected per training iteration is: - +During training, each iteration collects experience from all parallel environments. 
+The total batch size per iteration is: ``` batch_size = num_envs × num_steps_per_env ``` -For example, with `num_envs=4096` and `num_steps_per_env=24`: +For example, with `num_envs=4096` and `num_steps_per_env=24`, the batch size per iteration is: ``` batch_size = 4096 × 24 = 98,304 environment steps per iteration @@ -194,7 +202,7 @@ This batch is then split into `num_mini_batches` (4) mini-batches of ~24,576 ste ## Step 4: Monitor the training -During training, RSL-RL prints statistics to the terminal at regular intervals. A typical output looks like: +During training, RSL-RL periodically prints progress statistics to the terminal. A typical log output looks like: ```output Learning iteration 100/1500 @@ -207,7 +215,7 @@ Learning iteration 100/1500 fps: 48523 ``` -Interpreting these values helps track convergence and diagnose training instability, such as stagnating rewards or exploding losses. +These metrics help you track learning progress and detect issues such as unstable gradients or stagnating policies. The following table explains each metric: @@ -241,7 +249,7 @@ For evaluation, use the inference task name `Isaac-Velocity-Rough-H1-Play-v0` in The play script loads the most recent checkpoint and runs the policy in real time. You will observe the Unitree H1 humanoid walking over procedurally generated rough terrain, responding to live velocity commands. -You can also specify a particular checkpoint manually, which is useful for comparing intermediate policy performance. +You can also run inference with a specific checkpoint. This is useful for comparing policy performance at different stages of training. ```bash ./isaaclab.sh -p scripts/reinforcement_learning/rsl_rl/play.py \ @@ -249,10 +257,13 @@ You can also specify a particular checkpoint manually, which is useful for compa --num_envs=512 \ --checkpoint=logs/rsl_rl/h1_rough//model_1500.pt ``` +This command loads the specified checkpoint and runs the policy using the same simulation environment. 
### Understanding the evaluation -During evaluation, you can observe how the robot's behavior improves over the course of training: +During evaluation, you can observe how the robot’s behavior evolves as training progresses. + +Typical training behavior follows three stages: - **Early training (iterations 0–200)**: The robot often collapses immediately or performs erratic, uncoordinated motions. - **Mid training (iterations 200–800)**: The robot begins to walk forward with some success, though it may still stumble or lose balance on rough terrain. @@ -268,7 +279,7 @@ At iteration 50, the policy is still in its exploration phase. Most robots exhib ![img3 alt-text#center](isaaclab_h1_512_0050.gif "Figure 3: Early Stage") -*** Iteration 1250 (Late Stage, num_envs=512) *** +*** Iteration 1350 (Late Stage, num_envs=512) *** By iteration 1350, the policy has matured. Most robots demonstrate coordinated walking behavior, balance maintenance, and accurate velocity tracking, even on rough terrain. The improvement in foot placement and heading stability is clearly visible. @@ -283,4 +294,4 @@ In this module, you have: - Monitored training progress using reward curves, episode statistics, and performance metrics - Evaluated the trained policy through interactive visualization and behavior analysis -You have now completed the end-to-end workflow of training and validating a reinforcement learning policy for humanoid locomotion on DGX Spark. \ No newline at end of file +You have now completed the end-to-end workflow of training and validating a reinforcement learning policy for humanoid locomotion on DGX Spark. From ca5266882ed680b04ea8bbbeb5dc073bf3703aa8 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Thu, 12 Mar 2026 09:41:47 -0400 Subject: [PATCH 12/51] Mark BOLT introduction page as draft Set the draft status for the BOLT introduction page. 
--- .../servers-and-cloud-computing/bolt-demo/_index.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/_index.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/_index.md index 96a183e2ab..7720b875a1 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/_index.md @@ -1,6 +1,10 @@ --- title: "Get started with BOLT" +draft: true +cascade: + draft: true + minutes_to_complete: 20 who_is_this_for: This is an introductory topic for performance‑minded developers From 3f12d28d60ce5891a63613835a2ccaa6772b6cae Mon Sep 17 00:00:00 2001 From: Brendan Long Date: Thu, 12 Mar 2026 15:50:34 -0500 Subject: [PATCH 13/51] Add Prince Agyeman from asct project as co-author on install guide --- assets/contributors.csv | 3 ++- content/install-guides/asct.md | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/assets/contributors.csv b/assets/contributors.csv index 42918ac77b..dedf06545e 100644 --- a/assets/contributors.csv +++ b/assets/contributors.csv @@ -113,4 +113,5 @@ Steve Suzuki,Arm,,,, Qixiang Xu,Arm,,,, Phalani Paladugu,Arm,phalani-paladugu,phalani-paladugu,, Richard Burton,Arm,Burton2000,,, -Asier Arranz,NVIDIA,,asierarranz,,asierarranz.com \ No newline at end of file +Asier Arranz,NVIDIA,,asierarranz,,asierarranz.com +Prince Agyeman,Arm,,,, \ No newline at end of file diff --git a/content/install-guides/asct.md b/content/install-guides/asct.md index a051417099..a4f81382ec 100644 --- a/content/install-guides/asct.md +++ b/content/install-guides/asct.md @@ -17,7 +17,9 @@ test_maintenance: false # No official documentation official_docs: https://learn.arm.com/install-guides/asct/ -author: Jason Andrews +author: +- Jason Andrews +- Prince Agyeman ### PAGE SETUP weight: 1 # Defines page ordering. Must be 1 for first (or only) page. 
From fae5e36c347ed297bd9e3cb5f3f56c0700a982df Mon Sep 17 00:00:00 2001 From: Madeline Underwood Date: Thu, 12 Mar 2026 21:50:09 +0000 Subject: [PATCH 14/51] Refactor introduction section for MCP server testing documentation --- .../introduction.md | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/introduction.md b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/introduction.md index 1a46d6c394..e6ad80ca94 100644 --- a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/introduction.md +++ b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/introduction.md @@ -16,10 +16,10 @@ The Arm MCP Server provides AI assistants with tools and knowledge specifically MCP servers expose multiple tools that AI assistants can invoke. As these tools evolve, you need reliable automated tests to: -- Verify that each tool responds correctly to valid requests. -- Catch regressions when updating server code or dependencies. -- Validate container startup and communication protocols. -- Ensure compatibility across different environments. +- Verify that each tool responds correctly to valid requests +- Catch regressions when updating server code or dependencies +- Validate container startup and communication protocols +- Ensure compatibility across different environments ## Understanding Testcontainers @@ -29,24 +29,24 @@ Testcontainers is a Python library that provides lightweight, throwaway instance This approach offers several benefits: -- **Realistic testing**: Tests run against the actual server implementation. -- **Isolation**: Each test run gets a fresh container instance. -- **Reproducibility**: Tests behave consistently across development machines and CI environments. -- **No external dependencies**: Tests don't require a pre-deployed server. 
+- **Realistic testing**: Tests run against the actual server implementation +- **Isolation**: Each test run gets a fresh container instance +- **Reproducibility**: Tests behave consistently across development machines and CI environments +- **No external dependencies**: Tests don't require a pre-deployed server ## What you will build -In this Learning Path, you will create an integration test suite that: +In this Learning Path, you'll create an integration test suite that: -1. Starts the Arm MCP server in a Docker container using Testcontainers. -2. Communicates with the server using the MCP stdio transport protocol. -3. Tests multiple MCP tools including container image checking, knowledge base search, and code analysis. -4. Integrates with GitHub Actions for continuous testing. +- Starts the Arm MCP server in a Docker container using Testcontainers +- Communicates with the server using the MCP stdio transport protocol +- Tests multiple MCP tools including container image checking, knowledge base search, and code analysis +- Integrates with GitHub Actions for continuous testing ## What you've accomplished and what's next In this section: -- You learned what MCP servers are and why automated testing matters. -- You discovered how Testcontainers enable realistic integration testing. +- You learned what MCP servers are and why automated testing matters +- You discovered how Testcontainers enable realistic integration testing -In the next section, you will set up your development environment and install the required dependencies. +In the next section, you'll set up your development environment and install the required dependencies. From 7256753fd4a09b418c284e6dec77c7fb61e40c71 Mon Sep 17 00:00:00 2001 From: Madeline Underwood Date: Thu, 12 Mar 2026 21:50:24 +0000 Subject: [PATCH 15/51] Refine documentation for MCP server testing, including title case adjustments and improved clarity in prerequisites and test case instructions. 
--- .../_index.md | 7 +-- .../github-actions-ci.md | 32 ++++------ .../introduction.md | 16 ++--- .../run-testcontainers-example.md | 30 +++++----- .../setup-environment.md | 24 ++++---- .../write-test-cases.md | 58 +++++++++---------- 6 files changed, 77 insertions(+), 90 deletions(-) diff --git a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/_index.md b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/_index.md index 15ce1d5fef..c902d60a1f 100644 --- a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/_index.md +++ b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/_index.md @@ -1,9 +1,5 @@ --- -title: Automate MCP Server testing using Pytest and Testcontainers - -draft: true -cascade: - draft: true +title: Automate MCP server testing using Pytest and Testcontainers minutes_to_complete: 60 @@ -62,6 +58,7 @@ further_reading: link: https://docs.pytest.org/ type: documentation + ### FIXED, DO NOT MODIFY # ================================================================================ weight: 1 # _index.md always has weight of 1 to order correctly diff --git a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/github-actions-ci.md b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/github-actions-ci.md index 556fbff5b7..12294ca252 100644 --- a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/github-actions-ci.md +++ b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/github-actions-ci.md @@ -8,12 +8,12 @@ layout: learningpathall ## Why use GitHub Actions for MCP testing? -GitHub Actions provide automated CI/CD directly in your repository. For MCP server testing, it offers: +GitHub Actions provides automated CI/CD directly in your repository. For MCP server testing, it offers: -- **Arm runner support**: GitHub provides native Arm64 runners for building and testing. 
-- **Docker integration**: Runners come with Docker pre-installed. -- **Automatic triggers**: Tests run on every push and pull request. -- **Parallel execution**: Multiple jobs can run simultaneously. +- **Arm runner support**: GitHub provides native Arm64 runners for building and testing +- **Docker integration**: Runners come with Docker pre-installed +- **Automatic triggers**: Tests run on every push and pull request +- **Parallel execution**: Multiple jobs can run simultaneously ## Create the workflow file @@ -143,7 +143,7 @@ This runs the integration tests in parallel on both architectures. ## Monitor workflow runs -After pushing the workflow file, navigate to the Actions tab in your GitHub repository. Each workflow run shows: +After you push the workflow file, navigate to the Actions tab in your GitHub repository. Each workflow run shows: - Build steps and their status - Execution time for each step @@ -154,7 +154,7 @@ After pushing the workflow file, navigate to the Actions tab in your GitHub repo If the workflow fails, check these common causes: -**Docker build timeout**: The initial image build can take 10+ minutes. GitHub Actions has a default timeout of 360 minutes per job, but individual steps might need explicit timeouts: +**Docker build timeout**: The initial image build can take more than 10 minutes. GitHub Actions has a default timeout of 360 minutes per job, but individual steps might need explicit timeouts: ```yaml - name: Build MCP Docker image @@ -177,21 +177,11 @@ If the workflow fails, check these common causes: ## What you've accomplished and what's next -In this section: -- You created a GitHub Actions workflow for automated testing. -- You learned how to use Arm64 runners for native execution. -- You added test artifacts and multi-platform support. -- You explored troubleshooting techniques for CI failures. - -You now have a complete CI/CD pipeline that automatically tests your MCP server on every code change. 
- -## Summary - In this Learning Path, you learned how to: -- Set up testcontainers for Docker-based integration testing. -- Write pytest tests that communicate with MCP servers over stdio transport. -- Parse MCP JSON-RPC responses and validate tool outputs. -- Configure GitHub Actions with Arm64 runners for automated testing. +- Set up Testcontainers for Docker-based integration testing +- Write pytest tests that communicate with MCP servers over stdio transport +- Parse MCP JSON-RPC responses and validate tool outputs +- Configure GitHub Actions with Arm64 runners for automated testing These techniques apply to any MCP server implementation, not just the Arm MCP Server. Use this foundation to build comprehensive test suites that ensure your MCP tools work correctly across updates and deployments. diff --git a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/introduction.md b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/introduction.md index e6ad80ca94..c024551846 100644 --- a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/introduction.md +++ b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/introduction.md @@ -1,5 +1,5 @@ --- -title: Introduction to MCP Server Testing +title: Introduction to MCP server testing weight: 2 ### FIXED, DO NOT MODIFY @@ -21,18 +21,18 @@ MCP servers expose multiple tools that AI assistants can invoke. As these tools - Validate container startup and communication protocols - Ensure compatibility across different environments -## Understanding Testcontainers +## Understanding Testcontainers for Python Testcontainers is a Python library that provides lightweight, throwaway instances of Docker containers for testing. Instead of mocking your MCP server, you can spin up the actual Docker container, run tests against it, and tear it down automatically. 
-![Diagram showing Testcontainers workflow: test code creates a Docker container, runs tests against it, and automatically tears it down after completion#center](testcontainers.png "Figure 1. Testcontainers Flow") +![Diagram showing Testcontainers workflow: test code creates a Docker container, runs tests against it, and automatically tears it down after completion alt-txt#center](testcontainers.png "Testcontainers flow") This approach offers several benefits: -- **Realistic testing**: Tests run against the actual server implementation -- **Isolation**: Each test run gets a fresh container instance -- **Reproducibility**: Tests behave consistently across development machines and CI environments -- **No external dependencies**: Tests don't require a pre-deployed server +- Realistic testing: tests run against the actual server implementation +- Isolation: each test run gets a fresh container instance +- Reproducibility: tests behave consistently across development machines and CI environments +- No external dependencies: tests don't require a pre-deployed server ## What you will build @@ -43,7 +43,7 @@ In this Learning Path, you'll create an integration test suite that: - Tests multiple MCP tools including container image checking, knowledge base search, and code analysis - Integrates with GitHub Actions for continuous testing -## What you've accomplished and what's next +## What you've learned and what's next In this section: - You learned what MCP servers are and why automated testing matters diff --git a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/run-testcontainers-example.md b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/run-testcontainers-example.md index 78ef07faba..dbcc4b7745 100644 --- a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/run-testcontainers-example.md +++ b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/run-testcontainers-example.md @@ -6,9 +6,9 
@@ weight: 4 layout: learningpathall --- -## Understand Testcontainers basics +## Understand Testcontainers for Python testing -Before writing full integration tests, explore how Testcontainers manages containers programmatically. This section walks you through basic examples that demonstrate the core concepts. +Before writing full integration tests, see how Testcontainers manages containers programmatically. This section walks you through basic examples that demonstrate the core concepts. ## Run a simple container @@ -40,11 +40,11 @@ Hello from Testcontainers on Arm! When the `with` block exits, Testcontainers automatically stops and removes the container. -## Verify container architecture +## Verify Arm container architecture Since you're running on an Arm machine, verify the container uses the correct architecture. -Create a file `verify_arch.py` with code below: +Create a file `verify_arch.py` with the code below: ```python from testcontainers.core.container import DockerContainer @@ -118,7 +118,7 @@ Container automatically stopped and removed. The following diagram illustrates how Testcontainers manages the complete container lifecycle: -![Container lifecycle flowchart showing: Start Test leads to Create DockerContainer, which checks if Image exists. If No, it Pulls image then Creates container. If Yes, it directly Creates container. Then it Starts container, Waits for ready signal, Runs test code, Stops container, Removes container, and finally Test complete.#center](container-lifecycle.png "Figure 2. Testcontainers Container Lifecycle") +![Container lifecycle flowchart showing: Start Test leads to Create DockerContainer, which checks if Image exists. If No, it Pulls image then Creates container. If Yes, it directly Creates container. 
Then it Starts container, Waits for ready signal, Runs test code, Stops container, Removes container, and finally Test complete.alt-txt#center](container-lifecycle.png "Testcontainers container lifecycle") The `DockerContainer` context manager handles four phases automatically: @@ -131,7 +131,7 @@ The `DockerContainer` context manager handles four phases automatically: This ensures tests always start with a clean environment and never leave orphaned containers. -## Configure container options +## Configure Testcontainers options Testcontainers provides methods to configure container settings: @@ -150,9 +150,9 @@ with ( Common configuration options include: -- **with_env()**: Set environment variables inside the container -- **with_volume_mapping()**: Mount host directories into the container -- **with_kwargs()**: Pass additional arguments to the Docker SDK +- **with_env()**: sets environment variables inside the container +- **with_volume_mapping()**: mounts host directories into the container +- **with_kwargs()**: passes additional arguments to the Docker SDK For MCP testing, `stdin_open=True` enables communication over the stdio transport. @@ -182,12 +182,12 @@ except TimeoutError: print("Check the Docker logs for more details.") ``` -## What you've accomplished and what's next +## What you've learned and what's next In this section: -- You ran basic Testcontainers examples to understand the container lifecycle. -- You verified container architecture matches your Arm system. -- You started the MCP server container and waited for it to initialize. -- You learned how to configure containers and handle errors. 
+- You ran basic Testcontainers examples to understand the container lifecycle +- You verified container architecture matches your Arm system +- You started the MCP server container and waited for it to initialize +- You learned how to configure containers and handle errors -In the next section, you will write full integration tests that communicate with the MCP server using the JSON-RPC protocol. +In the next section, you'll write full integration tests that communicate with the MCP server using the JSON-RPC protocol. diff --git a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/setup-environment.md b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/setup-environment.md index 81702d1f74..091ea6e748 100644 --- a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/setup-environment.md +++ b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/setup-environment.md @@ -6,7 +6,7 @@ weight: 3 layout: learningpathall --- -## Prerequisites +## Prerequisites for MCP testing with Docker and Python Before you begin, ensure you have the following installed on your machine: @@ -14,7 +14,7 @@ Before you begin, ensure you have the following installed on your machine: - Docker Engine or Docker Desktop - Git -If you are on Linux, you need the Python virtual environment package. +If you're on Linux, you need the Python virtual environment package. For Debian or Ubuntu, run: @@ -85,7 +85,7 @@ The test framework requires Pytest and Testcontainers. 
Install them using the pr pip install -r mcp-local/tests/requirements.txt ``` -The requirements file contains: +The requirements file includes: ```text testcontainers @@ -106,7 +106,7 @@ The output confirms Testcontainers can interact with Docker: Testcontainers ready ``` -## Understanding the test directory structure +## Understanding the MCP test directory structure The test files are located in `mcp-local/tests/`: @@ -118,15 +118,15 @@ mcp-local/tests/ └── test_mcp.py # Main test file ``` -- **constants.py**: Contains MCP request payloads and expected responses for each tool being tested. -- **test_mcp.py**: The main test file that uses Testcontainers to spin up the MCP server and run assertions. -- **sum_test.s**: A sample Arm assembly file used to test the LLVM-MCA analysis tool. +- **constants.py**: Contains MCP request payloads and expected responses for each tool being tested +- **test_mcp.py**: The main test file that uses Testcontainers to spin up the MCP server and run assertions +- **sum_test.s**: A sample Arm assembly file used to test the LLVM-MCA analysis tool -## What you've accomplished and what's next +## What you've learned and what's next In this section: -- You cloned the Arm MCP repository and built the server Docker image. -- You set up a Python virtual environment with Pytest and Testcontainers. -- You explored the test directory structure. +- You cloned the Arm MCP repository and built the server Docker image +- You set up a Python virtual environment with Pytest and Testcontainers +- You explored the test directory structure -In the next section, you will run basic Testcontainers examples to understand how containers are managed programmatically. +In the next section, you'll run basic Testcontainers examples to understand how containers are managed programmatically. 
\ No newline at end of file diff --git a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/write-test-cases.md b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/write-test-cases.md index f1f4c1f7cb..53e848b060 100644 --- a/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/write-test-cases.md +++ b/content/learning-paths/cross-platform/automate-mcp-with-testcontainers/write-test-cases.md @@ -6,9 +6,9 @@ weight: 5 layout: learningpathall --- -## Overview +## MCP integration testing overview -In this section, you will build an integration test suite for the Arm MCP server step by step. You will create the test files yourself and understand each component as you go. +In this section, you'll build an integration test suite for the Arm MCP server step by step. You'll create the test files yourself and understand each component as you go. The Arm MCP repository already includes a complete test implementation in [mcp-local/tests/](https://github.com/arm/mcp/tree/main/mcp-local/tests). You can reference those files at any point, but this tutorial guides you through building a simplified version to understand the key concepts. @@ -17,7 +17,7 @@ The Arm MCP repository already includes a complete test implementation in [mcp-l Before writing tests, you need to understand how MCP servers communicate. MCP uses JSON-RPC 2.0 over standard input/output (stdio transport). The following diagram shows the communication flow between your test code, Testcontainers, and the MCP server: -![MCP communication sequence diagram showing: Pytest Test creates DockerContainer via Testcontainers, which starts the MCP Server Container. The test then sends initialize request, receives capabilities response, sends initialized notification, then makes tools/call requests for check_image and knowledge_base_search, receiving results for each. 
Finally, the test exits the context manager, triggering Testcontainers to stop and remove the container.#center](mcp-communication-flow.png "Figure 1. MCP JSON-RPC Communication Flow") +![MCP communication sequence diagram showing: Pytest Test creates DockerContainer via Testcontainers, which starts the MCP Server Container. The test then sends initialize request, receives capabilities response, sends initialized notification, then makes tools/call requests for check_image and knowledge_base_search, receiving results for each. Finally, the test exits the context manager, triggering Testcontainers to stop and remove the container. alt-txt#center](mcp-communication-flow.png "MCP JSON-RPC Communication Flow") The communication follows this sequence: | Step | Direction | Message Type | @@ -29,7 +29,7 @@ The communication follows this sequence: Each message is a JSON object followed by a newline character. -## Step 1: Create the test directory +## Step 1: create the test directory Create a directory for your test files: @@ -38,19 +38,19 @@ mkdir -p my-mcp-tests cd my-mcp-tests ``` -## Step 2: Define test constants +## Step 2: define test constants Create a file called `constants.py` to hold the MCP request payloads and expected responses. Open your editor and create `constants.py` with the following content: -1. First, define the Docker image name: +First, define the Docker image name: ```python MCP_DOCKER_IMAGE = "arm-mcp:latest" ``` -2. Add the initialization request. This follows the MCP protocol specification: +Add the initialization request. This follows the MCP protocol specification: ```python INIT_REQUEST = { @@ -65,7 +65,7 @@ INIT_REQUEST = { } ``` -3. Add a test request for the `check_image` tool. This tool verifies if a Docker image supports Arm architecture: +Add a test request for the `check_image` tool. This tool verifies if a Docker image supports Arm architecture: ```python CHECK_IMAGE_REQUEST = { @@ -85,7 +85,7 @@ CHECK_IMAGE_REQUEST = { } ``` -4. 
Define what response you expect from the tool: +Define what response you expect from the tool: ```python EXPECTED_CHECK_IMAGE_RESPONSE = { @@ -100,15 +100,15 @@ EXPECTED_CHECK_IMAGE_RESPONSE = { Save the file. This gives you a complete `constants.py` with one test case. -**Try it yourself**: Follow the same format to add another test request for a different MCP tool. Look at the [Arm MCP documentation](https://github.com/arm/mcp/tree/main) to find other available tools like `knowledge_base_search`. +Try it yourself by following the same format to add another test request for a different MCP tool. Look at the [Arm MCP documentation](https://github.com/arm/mcp/tree/main) to find other available tools such as `knowledge_base_search`. -## Step 3: Create helper functions +## Step 3: create helper functions for JSON-RPC communication The MCP server runs inside a Docker container that communicates over an attached socket. You need helper functions to encode and decode messages. Create a new file called `helpers.py`: -1. Start with the imports and the message encoding function: +Start with the imports and the message encoding function: ```python import json @@ -122,7 +122,7 @@ def encode_mcp_message(payload: dict) -> bytes: This function converts a Python dictionary to a JSON string, adds a newline, and encodes it as bytes. -2. Add a function to read Docker's multiplexed stream format: +Add a function to read Docker's multiplexed stream format: ```python def read_docker_frame(sock, timeout: float) -> bytes: @@ -160,7 +160,7 @@ def read_docker_frame(sock, timeout: float) -> bytes: return payload ``` -3. Add a function to parse MCP JSON-RPC messages: +Add a function to parse MCP JSON-RPC messages: ```python def read_mcp_message(sock, timeout: float = 10.0) -> dict: @@ -193,13 +193,13 @@ def read_mcp_message(sock, timeout: float = 10.0) -> dict: Save `helpers.py`. You now have the communication utilities needed for testing. 
-**Understanding the code**: The Docker socket uses a multiplexed format where each frame has an 8-byte header. The helper functions handle this low-level detail so your tests can focus on MCP logic. +**Understanding the code**: The Docker socket uses a multiplexed format where each frame has an 8-byte header. When you attach to a container's stdin/stdout, Docker wraps the data in frames that need to be parsed. The helper functions handle this low-level detail so your tests can focus on MCP logic. ## Step 4: Write the test function Create the main test file `test_mcp.py`: -1. Start with imports: +Start with imports: ```python import os @@ -212,7 +212,7 @@ import constants from helpers import encode_mcp_message, read_mcp_message ``` -2. Create the test function that starts the container: +Create the test function that starts the container: ```python def test_mcp_server_initializes(): @@ -228,7 +228,7 @@ def test_mcp_server_initializes(): print("MCP server started successfully") ``` -3. Add socket attachment and initialization: +Attach to the container's stdio streams to communicate with the MCP server. The MCP protocol uses JSON-RPC 2.0 messages over stdin/stdout: ```python # Attach to container stdin/stdout @@ -250,7 +250,7 @@ def test_mcp_server_initializes(): print(f"Server info: {response['result']['serverInfo']}") ``` -4. Complete the initialization handshake: +Complete the initialization handshake: ```python # Send initialized notification @@ -298,13 +298,13 @@ Now extend your test to verify an MCP tool. This is a hands-on challenge. **Your task**: Add code to your test function that: -1. Sends the `CHECK_IMAGE_REQUEST` from your constants file -2. Reads the response -3. 
Verifies the response matches `EXPECTED_CHECK_IMAGE_RESPONSE` +- Sends the `CHECK_IMAGE_REQUEST` from your constants file +- Reads the response +- Verifies the response matches `EXPECTED_CHECK_IMAGE_RESPONSE` **Hints**: - Use `raw_socket.sendall(encode_mcp_message(...))` to send requests -- Use `read_mcp_message(raw_socket, timeout=60)` to read responses (tool calls take longer) +- Use `read_mcp_message(raw_socket, timeout=60)` to read responses (tool calls take longer than initialization) - The response structure is `response["result"]["structuredContent"]` After attempting this yourself, you can compare your solution with the implementation in [mcp-local/tests/test_mcp.py](https://github.com/arm/mcp/blob/main/mcp-local/tests/test_mcp.py) @@ -317,12 +317,12 @@ After attempting this yourself, you can compare your solution with the implement **Socket connection errors**: Ensure `stdin_open=True` is set in `with_kwargs()`. -## What you've accomplished and what's next +## What you've learned and what's next In this section: -- You built a test suite from scratch, understanding each component. -- You learnt how MCP servers communicate using JSON-RPC over stdio. -- You created helper functions to handle Docker socket communication. -- You wrote and ran integration tests using pytest. +- You built a test suite from scratch, understanding each component +- You learned how MCP servers communicate using JSON-RPC over stdio +- You created helper functions to handle Docker socket communication +- You wrote and ran integration tests using pytest -In the next section, you will configure GitHub Actions to run these tests automatically in your CI/CD pipeline. +In the next section, you'll configure GitHub Actions to run these tests automatically in your CI/CD pipeline. 
From 504ffd0655684e010a543e0b364a9169d75c6c3a Mon Sep 17 00:00:00 2001 From: Madeline Underwood Date: Thu, 12 Mar 2026 22:22:35 +0000 Subject: [PATCH 16/51] Refine language and improve clarity in Isaac Sim documentation --- .../1_introduction_isaac.md | 19 ++++++++---- .../2_isaac_installation.md | 29 +++++++++---------- .../3_isaac_small_project.md | 21 +++++++------- .../dgx_spark_isaac_robotics/4_isaac_rfl.md | 26 ++++++++++------- .../dgx_spark_isaac_robotics/_index.md | 5 +--- 5 files changed, 54 insertions(+), 46 deletions(-) diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/1_introduction_isaac.md b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/1_introduction_isaac.md index 230702e538..77cc59e0c7 100644 --- a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/1_introduction_isaac.md +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/1_introduction_isaac.md @@ -8,7 +8,7 @@ layout: learningpathall ## Overview -In this Learning Path, you will build, configure, and run robotic simulation and [reinforcement learning (RL)](https://en.wikipedia.org/wiki/Reinforcement_learning) workflows using NVIDIA Isaac Sim and Isaac Lab on an Arm-based DGX Spark system. The NVIDIA DGX Spark is a personal AI supercomputer powered by the GB10 [Grace Blackwell](https://learn.arm.com/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/1_gb10_introduction/) Superchip. The system couples an Arm CPU cluster with a Blackwell GPU and a unified memory architecture to accelerate simulation orchestration, sensor preprocessing, physics, rendering, and RL training. +In this Learning Path, you'll build, configure, and run robotic simulation and [reinforcement learning (RL)](https://en.wikipedia.org/wiki/Reinforcement_learning) workflows using NVIDIA Isaac Sim and Isaac Lab on an Arm-based DGX Spark system. 
The NVIDIA DGX Spark is a personal AI supercomputer powered by the GB10 [Grace Blackwell](https://learn.arm.com/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/1_gb10_introduction/) Superchip. The system couples an Arm CPU cluster with a Blackwell GPU and a unified memory architecture to accelerate simulation orchestration, sensor preprocessing, physics, rendering, and RL training. NVIDIA's Isaac Sim and Isaac Lab tools together provide an end-to-end robotics development workflow: 1. Simulate robots in physically realistic environments. @@ -106,13 +106,22 @@ You can also filter environments by keyword. For example, to list locomotion env For the complete list of environments, see the [Isaac Lab Available Environments](https://isaac-sim.github.io/IsaacLab/main/source/overview/environments.html) documentation. -## What you will accomplish in this Learning Path +## What you'll accomplish in this Learning Path -In this Learning Path you will: +In this Learning Path you'll: 1. **Set up Isaac Sim and Isaac Lab** on your DGX Spark by building both tools from source 2. **Run a basic robot simulation** in Isaac Sim and interact with it through Python 3. **Train a reinforcement learning policy** for the Unitree H1 humanoid robot on rough terrain using RSL-RL -4. **Explore additional RL environments** to understand how the workflow generalizes to other robots and tasks. +4. **Explore additional RL environments** to understand how the workflow generalizes to other robots and tasks -By the end of the Learning Path, you will have a working Isaac Sim and Isaac Lab development environment on DGX Spark and practical experience running a complete robotics reinforcement learning pipeline. +By the end of the Learning Path, you'll have a working Isaac Sim and Isaac Lab development environment on DGX Spark and practical experience running a complete robotics reinforcement learning pipeline. 
+ +## What you've learned and what's next + +In this section: +- You learned what Isaac Sim and Isaac Lab are and how they work together for robotics development +- You discovered why DGX Spark's unified memory architecture is ideal for simulation and RL training +- You explored the available environment categories for different robotics tasks + +In the next section, you'll set up your development environment and install Isaac Sim and Isaac Lab on your DGX Spark system. diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/2_isaac_installation.md b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/2_isaac_installation.md index 18703657f8..73ee34e7fb 100644 --- a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/2_isaac_installation.md +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/2_isaac_installation.md @@ -8,9 +8,9 @@ layout: learningpathall ## Set up your development environment -Before running robotic simulations and reinforcement learning workloads, you need to prepare your DGX Spark development environment and install the dependencies required for Isaac Sim and Isaac Lab. +Before you run robotic simulations and reinforcement learning workloads, you need to prepare your DGX Spark development environment and install the dependencies required for Isaac Sim and Isaac Lab. -In this section you will: +In this section you'll: * Verify the DGX Spark system configuration * Install required build dependencies * Build and configure Isaac Sim @@ -18,7 +18,7 @@ In this section you will: The full setup typically takes 15–20 minutes on a DGX Spark system and requires approximately 50 GB of available disk space. -## Step 1: Verify your system +## Step 1: Verify your DGX Spark system Begin by confirming that the DGX Spark system has the expected hardware and software configuration. 
@@ -37,7 +37,7 @@ Architecture: aarch64 CPU(s): 20 On-line CPU(s) list: 0-19 ``` -The Architecture field should report aarch64, indicating that the system is running on Arm. +The Architecture field should report `aarch64`, indicating that the system is running on Arm. Check that the Blackwell GPU is detected by the NVIDIA driver: @@ -70,9 +70,7 @@ The expected output includes: Cuda compilation tools, release 13.0, V13.0.88 ``` -{{% notice Note %}} -Isaac Sim requires GCC/G++ 11, Git LFS, and CUDA 13.0 or later. If any of these checks fail, resolve the issue before proceeding. -{{% /notice %}} +{{% notice Note %}}Isaac Sim requires GCC/G++ 11, Git LFS, and CUDA 13.0 or later. If any of these checks fail, resolve the issue before you proceed.{{% /notice %}} ## Step 2: Install GCC 11 and Git LFS @@ -82,7 +80,8 @@ Update the package index and install the GCC 11 toolchain: ```bash sudo apt update && sudo apt install -y gcc-11 g++-11 ``` -Register GCC 11 as the default compiler using update-alternatives. This allows multiple compiler versions to coexist while prioritizing GCC 11 for builds: + +Register GCC 11 as the default compiler using `update-alternatives`. This allows multiple compiler versions to coexist while prioritizing GCC 11 for builds. The priority value of 200 ensures GCC 11 takes precedence over other installed versions: ```bash sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 200 sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 200 @@ -120,9 +119,7 @@ git lfs install git lfs pull ``` -{{% notice Note %}} -The Git LFS download retrieves several gigabytes of simulation assets. Ensure you have a stable internet connection and sufficient disk space before running this step. -{{% /notice %}} +{{% notice Note %}}The Git LFS download retrieves several gigabytes of simulation assets. 
Ensure you have a stable internet connection and sufficient disk space before you run this step.{{% /notice %}} Once the repository and assets are downloaded, build Isaac Sim using the provided build script: @@ -166,7 +163,7 @@ After this step, the variables will be available automatically whenever you open {{% /notice %}} -## Step 5: Validate the Isaac Sim build +## Step 5: Validate your Isaac Sim build Launch Isaac Sim to verify the build was successful. On some aarch64 systems, Isaac Sim may require preloading the GNU OpenMP runtime (libgomp) to avoid library compatibility issues. Setting the LD_PRELOAD environment variable ensures the correct library is loaded before Isaac Sim starts. @@ -244,11 +241,11 @@ Isaac-Reach-Franka-v0 If the environment list displays without errors, both Isaac Sim and Isaac Lab are correctly installed and ready for use. -You are now ready to run and train RL tasks using Isaac Lab environments. +You're now ready to run and train RL tasks using Isaac Lab environments. -## What you have accomplished +## What you've learned and what's next -In this section you have: +In this section you've: - Verified your DGX Spark system has the required Grace CPU, Blackwell GPU, and CUDA 13 environment - Installed GCC 11 and Git LFS as build prerequisites @@ -257,4 +254,4 @@ In this section you have: - Cloned and installed Isaac Lab with all RL library dependencies - Validated both installations by launching Isaac Sim and listing available environments -Your development environment is now fully configured for robot simulation and RL workflows. In the next section, you will run your first robot simulation and begin interacting with Isaac Sim through Python scripts. +Your development environment is now fully configured for robot simulation and RL workflows. In the next section, you'll run your first robot simulation and begin interacting with Isaac Sim through Python scripts. 
diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/3_isaac_small_project.md b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/3_isaac_small_project.md index 720d8626a5..65b71a5433 100644 --- a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/3_isaac_small_project.md +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/3_isaac_small_project.md @@ -8,7 +8,7 @@ layout: learningpathall ## Deploy a basic robot simulation -With Isaac Sim and Isaac Lab installed, you can now run your first robot simulation. In this section you will launch a pre-built simulation scene, interact with it programmatically, and explore the key concepts behind Isaac Sim's simulation loop. +With Isaac Sim and Isaac Lab installed, you can now run your first robot simulation. In this section you'll launch a pre-built simulation scene, interact with it programmatically, and explore the key concepts behind Isaac Sim's simulation loop. The example environment used here is Cartpole, a classic control benchmark in which a cart must balance an upright pole by applying horizontal forces. Although simple, this environment demonstrates the core mechanics of simulation environments used in robotics and reinforcement learning. @@ -30,9 +30,8 @@ Press `Ctrl+C` to exit the simulation. ## Step 2: Spawn and simulate a robot -Next, run a tutorial that loads an articulated robot into the simulation and advances the physics engine. -This example demonstrates how Isaac Sim handles multi-body dynamics, including loading robot assets, configuring actuators, and stepping the physics simulation. -Run the following command: +Next, run a tutorial that loads an articulated robot into the simulation and advances the physics engine. This example demonstrates how Isaac Sim handles multi-body dynamics, including loading robot assets, configuring actuators, and stepping the physics simulation. 
+ ```bash ./isaaclab.sh -p scripts/tutorials/01_assets/run_articulation.py ``` @@ -56,7 +55,7 @@ Run the following command: ./isaaclab.sh -p scripts/tutorials/03_envs/create_cartpole_base_env.py --num_envs 32 ``` -This command launches 32 parallel Cartpole environments on the Blackwell GPU. Each environment runs its own independent simulation with random joint efforts applied to the cart. You will see the pole joint angle printed to the terminal for each step. +This command launches 32 parallel Cartpole environments on the Blackwell GPU. Each environment runs its own independent simulation with random joint efforts applied to the cart. You'll see the pole joint angle printed to the terminal for each step. ![img2 alt-text#center](32_cartpole.gif "Figure 2: 32 parallel Cartpole") @@ -66,7 +65,8 @@ This tutorial script uses a hardcoded `CartpoleEnvCfg` configuration. It does no ## Step 4: Run the Cartpole RL environment -The previous tutorial created a base simulation environment that advances physics and applies actions but does not include reinforcement learning components such as rewards or episode termination. +The previous tutorial created a base simulation environment that advances physics and applies actions but doesn't include reinforcement learning components such as rewards or episode termination. + To run the full reinforcement learning version of the environment, execute the following command: ```bash @@ -206,7 +206,7 @@ Each call to `env.step(action)` performs these operations on the GPU: All computations happen in parallel across all environments using PyTorch tensors on the GPU. This is what makes Isaac Lab efficient: thousands of environments run in parallel without Python loop overhead. -## Step 6: Run with headless mode +## Step 6: Run in headless mode For reinforcement learning workflows, it is common to run Isaac Sim without rendering. Disabling the viewer allows more GPU resources to be used for physics simulation and neural network computation. 
@@ -221,9 +221,9 @@ In headless mode, all GPU resources are dedicated to physics simulation and tens When running headless on DGX Spark, the Blackwell GPU handles both the physics simulation and neural network computation. The unified memory architecture means there is no performance penalty for sharing GPU memory between these workloads. {{% /notice %}} -## What you have accomplished +## What you've learned and what's next -In this section you have: +In this section you've: - Launched your first Isaac Sim scene on DGX Spark and verified the rendering and physics engines work correctly - Spawned articulated robots and observed multi-body physics simulation @@ -232,4 +232,5 @@ In this section you have: - Tested headless mode for maximum training performance You now understand the core components of an Isaac Lab simulation environment, including scene creation, robot articulation, observation and action structures, and simulation loop execution. -In the next section, you will use these concepts to train a reinforcement learning policy for a humanoid robot. + +In the next section, you'll use these concepts to train a reinforcement learning policy for a humanoid robot. diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/4_isaac_rfl.md b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/4_isaac_rfl.md index 4e43864e34..e913a81a57 100644 --- a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/4_isaac_rfl.md +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/4_isaac_rfl.md @@ -8,7 +8,7 @@ layout: learningpathall ## Train a reinforcement learning policy using Isaac Lab and RSL-RL -In this section you will train a reinforcement learning (RL) policy for the [Unitree] (https://www.unitree.com/) H1 humanoid robot to walk over rough terrain. The training workflow uses Isaac Lab’s integration with the RSL-RL library, which implements the Proximal Policy Optimization (PPO) algorithm. 
This integration connects Isaac Sim’s physics simulation with an efficient RL training pipeline. By the end of this section you will understand the key stages of the RL training pipeline, including: +In this section you'll train a reinforcement learning (RL) policy for the [Unitree](https://www.unitree.com/) H1 humanoid robot to walk over rough terrain. The training workflow uses Isaac Lab’s integration with the RSL-RL library, which implements the Proximal Policy Optimization (PPO) algorithm. This integration connects Isaac Sim’s physics simulation with an efficient RL training pipeline. By the end of this section you'll understand the key stages of the RL training pipeline, including: * Task configuration and environment selection * PPO training parameters and rollout collection * Monitoring training progress @@ -26,7 +26,8 @@ RSL-RL (Robotic Systems Lab Reinforcement Learning) is a lightweight RL library Isaac Lab includes ready-to-use training scripts for RSL-RL under `scripts/reinforcement_learning/rsl_rl/`. ## Step 1: Understand the training task -In this section you will train the **Isaac-Velocity-Rough-H1-v0** environment. This is a locomotion task where the [Unitree H1](https://www.unitree.com/h1/) humanoid robot must track a velocity command while navigating rough terrain. + +In this section you'll train the **Isaac-Velocity-Rough-H1-v0** environment. This is a locomotion task where the [Unitree H1](https://www.unitree.com/h1/) humanoid robot must track a velocity command while navigating rough terrain. The task details are: @@ -58,6 +59,7 @@ export LD_PRELOAD="$LD_PRELOAD:/lib/aarch64-linux-gnu/libgomp.so.1" ``` Once training begins, the terminal displays iteration progress, reward statistics, and performance metrics. 
+ Example output: ```output Learning iteration 15/3000 @@ -118,8 +120,10 @@ This error occurs because the NVRTC runtime compiler inside PyTorch does not yet Support for Blackwell GPUs is expected to improve in upcoming PyTorch and Isaac Sim releases. {{% /notice %}} -### Adjusting training parameters -You can also override default parameters from the command line: +### Adjust training parameters + +You can also override default parameters from the command line. + For example: ```bash ./isaaclab.sh -p scripts/reinforcement_learning/rsl_rl/train.py \ @@ -186,8 +190,8 @@ PPO (Proximal Policy Optimization) is the RL algorithm used by RSL-RL. Understan | `save_interval` | `50` | Save a model checkpoint every N iterations. Useful for resuming training or evaluating intermediate policies | ### How the hyperparameters interact -During training, each iteration collects experience from all parallel environments. -The total batch size per iteration is: + +During training, each iteration collects experience from all parallel environments. The total batch size per iteration is: ``` batch_size = num_envs × num_steps_per_env ``` @@ -273,25 +277,25 @@ This progression—from falling to stable walking—demonstrates how PPO gradual The following visualizations compare two training stages using `num_envs=512`, showcasing the benefit of large-scale parallel training on DGX Spark. -*** Iteration 50 (Early Stage, num_envs=512) *** +**Iteration 50 (Early Stage, num_envs=512)** At iteration 50, the policy is still in its exploration phase. Most robots exhibit noisy joint actions, lack coordination, and frequently fall. There is no observable response to the velocity command, and no stable gait has emerged. ![img3 alt-text#center](isaaclab_h1_512_0050.gif "Figure 3: Early Stage") -*** Iteration 1350 (Late Stage, num_envs=512) *** +**Iteration 1350 (Late Stage, num_envs=512)** By iteration 1350, the policy has matured. 
Most robots demonstrate coordinated walking behavior, balance maintenance, and accurate velocity tracking, even on rough terrain. The improvement in foot placement and heading stability is clearly visible. ![img4 alt-text#center](isaaclab_h1_512_1350.gif "Figure 4: Late Stage") -## What you have accomplished +## What you've learned -In this module, you have: +In this section, you've: - Trained a reinforcement learning policy for the Unitree H1 humanoid robot using RSL-RL and the PPO algorithm - Understood key hyperparameters in the training pipeline, including policy architecture, rollout strategy, and PPO optimization settings - Monitored training progress using reward curves, episode statistics, and performance metrics - Evaluated the trained policy through interactive visualization and behavior analysis -You have now completed the end-to-end workflow of training and validating a reinforcement learning policy for humanoid locomotion on DGX Spark. +You've now completed the end-to-end workflow of training and validating a reinforcement learning policy for humanoid locomotion on DGX Spark. 
diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/_index.md b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/_index.md index b4c4df7876..a674ef45bc 100644 --- a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/_index.md +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/_index.md @@ -1,10 +1,7 @@ --- title: Build Robot Simulation and Reinforcement Learning Workflows with Isaac Sim and Isaac Lab on DGX Spark -draft: true -cascade: - draft: true - + minutes_to_complete: 90 who_is_this_for: This learning path is intended for robotics developers, simulation engineers, and AI researchers who want to run high-fidelity robotic simulations and reinforcement learning (RL) pipelines using NVIDIA Isaac Sim and Isaac Lab on Arm-based NVIDIA DGX Spark system powered by the Grace–Blackwell (GB10) architecture. From b0450c915bbf7176b771ad72b929d94d41849485 Mon Sep 17 00:00:00 2001 From: Madeline Underwood Date: Thu, 12 Mar 2026 22:31:26 +0000 Subject: [PATCH 17/51] Refactor section titles and improve clarity in the introduction of the Isaac Sim learning path --- .../1_introduction_isaac.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/1_introduction_isaac.md b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/1_introduction_isaac.md index 77cc59e0c7..35d5f48338 100644 --- a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/1_introduction_isaac.md +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/1_introduction_isaac.md @@ -106,16 +106,16 @@ You can also filter environments by keyword. For example, to list locomotion env For the complete list of environments, see the [Isaac Lab Available Environments](https://isaac-sim.github.io/IsaacLab/main/source/overview/environments.html) documentation. 
-## What you'll accomplish in this Learning Path +## What you'll build -In this Learning Path you'll: +In this Learning Path, you'll: -1. **Set up Isaac Sim and Isaac Lab** on your DGX Spark by building both tools from source -2. **Run a basic robot simulation** in Isaac Sim and interact with it through Python -3. **Train a reinforcement learning policy** for the Unitree H1 humanoid robot on rough terrain using RSL-RL -4. **Explore additional RL environments** to understand how the workflow generalizes to other robots and tasks +1. Set up Isaac Sim and Isaac Lab on your DGX Spark by building both tools from source +2. Run a basic robot simulation in Isaac Sim and interact with it through Python +3. Train a reinforcement learning policy for the Unitree H1 humanoid robot on rough terrain using RSL-RL +4. Explore additional RL environments to understand how the workflow generalizes to other robots and tasks -By the end of the Learning Path, you'll have a working Isaac Sim and Isaac Lab development environment on DGX Spark and practical experience running a complete robotics reinforcement learning pipeline. +By the end, you'll have a working Isaac Sim and Isaac Lab development environment on DGX Spark and practical experience running a complete robotics reinforcement learning pipeline. 
## What you've learned and what's next From 2932cbcdf9d262333d5e5961173126c768d23604 Mon Sep 17 00:00:00 2001 From: Madeline Underwood Date: Thu, 12 Mar 2026 22:31:36 +0000 Subject: [PATCH 18/51] Improve clarity and consistency in documentation by refining image captions and updating the audience description for the Isaac Sim learning path --- .../dgx_spark_isaac_robotics/3_isaac_small_project.md | 4 ++-- .../dgx_spark_isaac_robotics/4_isaac_rfl.md | 6 +++--- .../laptops-and-desktops/dgx_spark_isaac_robotics/_index.md | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/3_isaac_small_project.md b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/3_isaac_small_project.md index 65b71a5433..2a1cdc7f52 100644 --- a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/3_isaac_small_project.md +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/3_isaac_small_project.md @@ -42,7 +42,7 @@ This script loads a robot model, advances the physics simulation, and prints joi - Configuring joint actuators and control modes - Stepping the physics simulation and reading back joint positions and velocities -![img1 alt-text#center](run_articulation.gif "Figure 1: run_articulation.py") +![img1 alt-text#center](run_articulation.gif "run_articulation.py") ## Step 3: Run the Cartpole environment @@ -57,7 +57,7 @@ Run the following command: This command launches 32 parallel Cartpole environments on the Blackwell GPU. Each environment runs its own independent simulation with random joint efforts applied to the cart. You'll see the pole joint angle printed to the terminal for each step. -![img2 alt-text#center](32_cartpole.gif "Figure 2: 32 parallel Cartpole") +![img2 alt-text#center](32_cartpole.gif "32 parallel Cartpole") {{% notice Note %}} This tutorial script uses a hardcoded `CartpoleEnvCfg` configuration. It does not accept a `--task` argument. 
The `--num_envs` flag controls how many parallel environments are spawned on the GPU. diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/4_isaac_rfl.md b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/4_isaac_rfl.md index e913a81a57..02e9a9e946 100644 --- a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/4_isaac_rfl.md +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/4_isaac_rfl.md @@ -281,13 +281,13 @@ The following visualizations compare two training stages using `num_envs=512`, s At iteration 50, the policy is still in its exploration phase. Most robots exhibit noisy joint actions, lack coordination, and frequently fall. There is no observable response to the velocity command, and no stable gait has emerged. -![img3 alt-text#center](isaaclab_h1_512_0050.gif "Figure 3: Early Stage") +![img3 alt-text#center](isaaclab_h1_512_0050.gif "Early Stage") **Iteration 1350 (Late Stage, num_envs=512)** By iteration 1350, the policy has matured. Most robots demonstrate coordinated walking behavior, balance maintenance, and accurate velocity tracking, even on rough terrain. The improvement in foot placement and heading stability is clearly visible. -![img4 alt-text#center](isaaclab_h1_512_1350.gif "Figure 4: Late Stage") +![img4 alt-text#center](isaaclab_h1_512_1350.gif "Late Stage") ## What you've learned @@ -298,4 +298,4 @@ In this section, you've: - Monitored training progress using reward curves, episode statistics, and performance metrics - Evaluated the trained policy through interactive visualization and behavior analysis -You've now completed the end-to-end workflow of training and validating a reinforcement learning policy for humanoid locomotion on DGX Spark. +You've now completed the end-to-end workflow of training and validating a reinforcement learning policy for humanoid locomotion on DGX Spark. 
\ No newline at end of file diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/_index.md b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/_index.md index a674ef45bc..bada253138 100644 --- a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/_index.md +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/_index.md @@ -4,7 +4,7 @@ title: Build Robot Simulation and Reinforcement Learning Workflows with Isaac Si minutes_to_complete: 90 -who_is_this_for: This learning path is intended for robotics developers, simulation engineers, and AI researchers who want to run high-fidelity robotic simulations and reinforcement learning (RL) pipelines using NVIDIA Isaac Sim and Isaac Lab on Arm-based NVIDIA DGX Spark system powered by the Grace–Blackwell (GB10) architecture. +who_is_this_for: This is an advanced topic for robotics developers, simulation engineers, and AI researchers who want to run high-fidelity robotic simulations and reinforcement learning (RL) pipelines using NVIDIA Isaac Sim and Isaac Lab on Arm-based NVIDIA DGX Spark system powered by the Grace–Blackwell (GB10) architecture. 
learning_objectives: - Describe the roles of Isaac Sim and Isaac Lab within a robotics simulation and RL pipeline From b85d746fea5bad2a72745b1e1896aba14f1d845a Mon Sep 17 00:00:00 2001 From: Jason Andrews Date: Thu, 12 Mar 2026 17:45:10 -0500 Subject: [PATCH 19/51] Start tech review on Alif image classification --- .../alif-image-classification/_index.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/_index.md b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/_index.md index d09347e1b3..83256c0b0c 100644 --- a/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/alif-image-classification/_index.md @@ -1,6 +1,10 @@ --- title: Run image classification on an Alif Ensemble E8 DevKit with ExecuTorch and Ethos-U85 +draft: true +cascade: + draft: true + minutes_to_complete: 120 who_is_this_for: This Learning Path is for embedded developers who want to deploy a neural network on an Arm Cortex-M55 microcontroller with an Ethos-U85 NPU. You will compile a MobileNetV2 model using ExecuTorch, embed it into bare-metal firmware, and run image classification on the Alif Ensemble E8 DevKit. From 2303aef691208230a5259205cd45a28cc45a1760 Mon Sep 17 00:00:00 2001 From: Joana Cruz Date: Fri, 13 Mar 2026 10:55:21 +0000 Subject: [PATCH 20/51] Fix small incongruencies in libamath reproducible article. Some inconsistencies were found in the technical content of the Multi-accuracy libamath's pathway. These have been reviewed and corrected.
--- .../multi-accuracy-libamath/floating-point-rep.md | 14 +++++++------- .../multi-accuracy-libamath/ulp.md | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/floating-point-rep.md b/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/floating-point-rep.md index 610aa1be31..e756582d02 100644 --- a/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/floating-point-rep.md +++ b/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/floating-point-rep.md @@ -23,7 +23,7 @@ Floating-point numbers are a finite, discrete approximation of real numbers. The A floating-point number is typically expressed as: ```output -± d.dddd...d × B^e +(−1)^s × d.dddd...d × B^e ``` where: @@ -31,16 +31,16 @@ where: * e is the exponent * d.dddd...d is the mantissa (or significand) * *p* is the number of bits used for precision -* the +/- sign is stored separately +* s is the sign bit -The precision of a floating-point format refers to the number of binary digits used to represent the mantissa. This is denoted by *p*, and a system with *p* bits of precision can distinguish between \( 2^p \) different fractional values. +The precision of a floating-point format refers to the number of binary digits used to represent the mantissa. This is denoted by *p*, and a system with *p* bits of precision can represent approximately 2^p distinct mantissa values. If the leading digit is non-zero, the number is said to be normalized (also called a *normal number*). {{% notice Example 1%}} Fixing `B = 2, p = 24` -`0.1 = 1.10011001100110011001101 × 2^4` is a normalized representation of 0.1 +`0.1 = 1.10011001100110011001101 × 2^-4` is a normalized representation of 0.1 `0.1 = 0.000110011001100110011001 × 2^0` is a non-normalized representation of 0.1 @@ -76,7 +76,7 @@ For any exponent, *n*, numbers are evenly spaced between 2ⁿ and 2ⁿ⁺¹. 
How ## Bitwise representation of floating-point numbers -Since there are \( B^p \) possible mantissas and `emax-emin+1` possible exponents, then `log2(B^p) + log2(emax-emin+1) + 1` (sign) bits are needed to represent a given floating-point number in a system. +Since there are \( B^p \) possible mantissas and `emax-emin+1` possible exponents, then `log2(B^p) + log2(emax-emin+1) + 1` digits are needed to represent a given floating-point number in a system. In Example 2, 3+2+1=6 bits are needed. @@ -111,7 +111,7 @@ In this format: * The sign is represented using 1 bit * The exponent uses 8 bits -* The mantissa uses 23 bits +* The mantissa uses 24 bits (including the implicit leading bit for normalized numbers) The value of a normalized floating-point number in IEEE-754 can be represented as: @@ -119,7 +119,7 @@ The value of a normalized floating-point number in IEEE-754 can be represented a x = (−1)^S × (1.M) × 2^(E−127) ``` -The exponent bias of 127 allows storage of exponents from -126 to +127. The leading digit is implicit in normalized numbers, giving a total of 24 bits of precision. +The exponent bias of 127 allows storage of exponents from -126 to +127. The leading digit is implicit in normalized numbers, meaning the actual number of bits used to store the mantissa in this format is 23. {{% notice Special cases in IEEE-754 single precision %}} Since the exponent field uses 8 bits, E ranges between 0 and 2^8-1=255. However not all these 256 values are used for normal numbers. 
diff --git a/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/ulp.md b/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/ulp.md index 1f0e77fde5..e0f6b7e97d 100644 --- a/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/ulp.md +++ b/content/learning-paths/servers-and-cloud-computing/multi-accuracy-libamath/ulp.md @@ -56,7 +56,7 @@ ULP(x) = 2^(e-p+1) Where: * `e` is the unbiased exponent (in the IEEE-754 definition of single precision this is `E-127`) -* `p` is the precision (23 for IEEE-754 single-precision) +* `p` is the precision (24 for IEEE-754 single-precision) When computing the ULP of IEEE-754 floats, this expression becomes: ``` From bfd373444373f736bc883a7cf22973391b22dab9 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 10:13:06 -0400 Subject: [PATCH 21/51] Update _index.md --- .../cross-platform/tinkerblox_ultraedge/_index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/learning-paths/cross-platform/tinkerblox_ultraedge/_index.md b/content/learning-paths/cross-platform/tinkerblox_ultraedge/_index.md index b355bd3638..27c7ec9993 100644 --- a/content/learning-paths/cross-platform/tinkerblox_ultraedge/_index.md +++ b/content/learning-paths/cross-platform/tinkerblox_ultraedge/_index.md @@ -1,5 +1,5 @@ --- -title: Deploy UltraEdge HPC-I for AI and mixed workloads on Arm +title: Deploy Tinkerblox UltraEdge HPC-I for AI and mixed workloads on Arm minutes_to_complete: 60 who_is_this_for: This is an advanced topic for business, R&D, and engineering teams seeking to optimize CPU and GPU infrastructure utilization while reducing total cost of ownership on edge and constrained environments. It's ideal for innovation and development teams building next-generation AI workloads using alternative runtime environments and packaging technologies. 
From 177a29d5056708252ee8d26be34538bff671392e Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 10:24:55 -0400 Subject: [PATCH 22/51] Revise who_is_this_for and prerequisites sections --- .../bolt-demo/_index.md | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/_index.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/_index.md index 7720b875a1..2f08fa0764 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/_index.md @@ -7,23 +7,23 @@ cascade: minutes_to_complete: 20 -who_is_this_for: This is an introductory topic for performance‑minded developers - who have a compiled aarch64 Linux program and want to see if BOLT can make it run faster. +who_is_this_for: This learning path is intended for developers who have compiled an AArch64 Linux application and want to evaluate whether LLVM BOLT can improve its runtime performance. 
+ learning_objectives: - Identify whether a program is a good candidate for code layout optimization - - Apply BOLT to optimize a small program with poor spatial locality - - Use different profiling techniques, including BRBE, Instrumentation, SPE, and PMU events - - Verify the impact of BOLT optimization using performance metrics + - Use LLVM BOLT to perform profile-guided post-link optimization of an AArch64 binary with poor spatial locality + - Collect profile data using multiple techniques, including BRBE, instrumentation, SPE, and PMU event sampling + - Evaluate the impact of BOLT optimizations using performance metrics and profiling data prerequisites: - - An AArch64 system running Linux with [Perf](/install-guides/perf/) installed - - Linux kernel version 6.17 or later for [BRBE](./brbe) profiling - - Linux kernel version 6.14 or later for [SPE](./spe) profiling - - GCC version 13.3 or later to compile the demo program ([GCC](/install-guides/gcc/) ) - - BOLT version [21.1.8](https://github.com/llvm/llvm-project/releases/tag/llvmorg-21.1.8) or later (download [zip](https://github.com/llvm/llvm-project/releases/download/llvmorg-21.1.8/LLVM-21.1.8-Linux-ARM64.tar.xz)) - - A system with enough performance counters for the [TopDown](/install-guides/topdown-tool) methodology, typically a non-virtualized instance + - An AArch64 system running Linux with [perf](/install-guides/perf/) installed + - Linux kernel version 6.17 or later to enable Branch Record Buffer Extension [BRBE](./brbe) profiling + - Linux kernel version 6.14 or later for Arm Statistical Profiling Extension [SPE](./spe) support + - GCC version 13.3 or later to compile the example program ([GCC](/install-guides/gcc/) ) + - LLVM BOLT version [21.1.8](https://github.com/llvm/llvm-project/releases/tag/llvmorg-21.1.8) or later (download [zip](https://github.com/llvm/llvm-project/releases/download/llvmorg-21.1.8/LLVM-21.1.8-Linux-ARM64.tar.xz)) + - A system with sufficient hardware performance
counters to use the [TopDown](/install-guides/topdown-tool) methodology. This typically requires running on bare metal rather than a virtualized environment. author: Paschalis Mpeis From e3ece937545180d7e1a27d489f12adb2cbb8870c Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 10:35:20 -0400 Subject: [PATCH 23/51] Revise overview of BOLT tutorial and profiling methods Updated tutorial overview to clarify the use of BOLT for performance optimization in AArch64 applications. Expanded on profiling methods and their implications for code layout optimization. --- .../bolt-demo/overview.md | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/overview.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/overview.md index 6d668a8230..975061a46c 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/overview.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/overview.md @@ -6,24 +6,24 @@ weight: 2 layout: learningpathall --- -### Tutorial Overview +### Overview -This tutorial shows how to apply [BOLT](https://github.com/llvm/llvm-project/blob/main/bolt/README.md) in different configurations. -It is based on a demo from the 2025 LLVM Developers Conference: +This Learning Path demonstrates how to use [BOLT](https://github.com/llvm/llvm-project/blob/main/bolt/README.md), a post-link binary optimizer from LLVM, to improve the performance of AArch64 applications using profile-guided code layout optimization. +The example used in the Learning Path is based on a demonstration from the 2025 LLVM Developers Conference: [BOLT tutorial on AArch64 and how it competes or complements other PGOs](https://youtu.be/KdHtOMc5_c8?si=249wZTn_YcTFOjcJ&t=1452). -The input program is a pathological case based on [BubbleSort](../setup), a workload with poor spatial locality.
-First, we check whether the input binary is a good candidate for code layout optimization. -If it is, we can capture a profile using one of several profiling methods: -- **[BRBE](../brbe)**: Samples deep branch stacks with low profiling overheads. -- **[Instrumentation](../instrumentation)**: Captures high-quality, complete profiles, but has high collection overhead. -- **[SPE](../spe)**: Samples individual branches. Use it if BRBE is not available, as profile quality can be lower. -- **[PMU](../pmu)**: Samples basic events such as instructions or cycles. This method provides the least profiling information. +The input program is a deliberately inefficient implementation based on [BubbleSort](../setup). This workload exhibits poor instruction locality, making it a useful example for demonstrating how BOLT can improve performance by reorganizing code layout. - +The tutorial first evaluates whether the input binary is a good candidate for code layout optimization. If the program shows signs of poor spatial locality, you will then collect runtime profiles that BOLT can use to guide optimization. +Several profiling methods are supported: +- **[BRBE](../brbe)**: Uses the Arm Branch Record Buffer Extension to sample branch history with low runtime overhead. +- **[Instrumentation](../instrumentation)**: Inserts counters into the binary to record execution frequencies. This produces highly accurate profiles but introduces runtime overhead during profile collection. +- **[SPE](../spe)**: Uses the Arm Statistical Profiling Extension to sample microarchitectural events. BOLT can infer control-flow behavior from these samples, although the resulting profile quality may be lower than BRBE. +- **[PMU](../pmu)**: Uses standard performance monitoring unit events such as instructions or cycles. This method provides the least detailed information about control flow and is typically used when other profiling options are unavailable. 
-ETM and ETE generate data that you can use with BOLT. This tutorial does not cover these tracing methods. +Arm trace extensions such as **ETM** and **ETE** can also generate traces that are usable by BOLT, but these tracing mechanisms are not covered in this tutorial. -For each profiling method, we will perform the relevant BOLT optimization steps. -Finally, we will use hardware metrics to confirm how effective the optimization was. +For each profiling method, you will walk through the process of collecting a profile, converting it into a format usable by BOLT, and applying BOLT to generate an optimized binary. + +Finally, you will use hardware performance metrics to evaluate how effective the optimization was. From 6348420fd22c5f0b3c1394559ac98b1bdd73a042 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 10:39:29 -0400 Subject: [PATCH 24/51] Update environment setup instructions in setup.md Clarify the instructions for creating the bsort.cpp file. --- .../servers-and-cloud-computing/bolt-demo/setup.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md index 35ee0fc415..dcc6036eb5 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md @@ -8,7 +8,9 @@ layout: learningpathall ### Environment setup -We start in an empty directory and place the input program [bsort.cpp](../bsort.cpp) there. +In your home directory (or another empty working directory), create a file named `bsort.cpp` with the following content: + + The [last section](#why-bubble-sort) explains why we chose BubbleSort for this tutorial. 
We create and use the following directories as needed throughout this guide: From 537256290b9d97afea98c49d0d5cbcd73ac6d420 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 10:40:04 -0400 Subject: [PATCH 25/51] Implement Bubble Sort in C++ with timing Added C++ implementation of Bubble Sort with swap functionality and timing. --- .../bolt-demo/setup.md | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md index dcc6036eb5..61169bb8db 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md @@ -10,6 +10,121 @@ layout: learningpathall ### Environment setup In your home directory (or another empty working directory), create a file named `bsort.cpp` with the following content: +```cpp +#include +#include +#include + +#define ARRAY_LEN 10000 +#define FUNC_COPIES 5 +volatile bool Cond = false; +#define COND() (__builtin_expect(Cond, true)) + +#define NOPS(N) \ + asm volatile( \ + ".rept %0\n" \ + "nop\n" \ + ".endr\n" \ + : : "i"(N) : "memory") + +// Swap functionality plus some cold blocks. +#define SWAP_FUNC(ID) \ + static __attribute__((noinline)) \ + void swap##ID(int *left, int *right) { \ + if (COND()) NOPS(300); \ + int tmp = *left; \ + if (COND()) NOPS(300); else *left = *right; \ + if (COND()) NOPS(300); else *right = tmp; \ + } + +// Aligned at 16KiB +#define COLD_FUNC(ID) \ + static __attribute__((noinline, aligned(16384), used)) \ + void cold_func##ID(void) { \ + asm volatile("nop"); \ + } + +// Create copies of swap, and interleave with big chunks of cold code. 
+SWAP_FUNC(1) COLD_FUNC(1) +SWAP_FUNC(2) COLD_FUNC(2) +SWAP_FUNC(3) COLD_FUNC(3) +SWAP_FUNC(4) COLD_FUNC(4) +SWAP_FUNC(5) COLD_FUNC(5) + +typedef void (*swap_fty)(int *, int *); +static swap_fty const swap_funcs[FUNC_COPIES] = { + swap1, swap2, swap3, swap4, swap5 +}; + + +/* Sorting Logic */ +void bubble_sort(int *a, int n) { + if (n <= 1) + return; + + int end = n - 1; + int swapped = 1; + unsigned idx = 0; + + while (swapped && end > 0) { + swapped = 0; + // pick a different copy of the swap function, in a round-robin fashion + // and call it. + for (int i = 1; i <= end; ++i) { + if (a[i] < a[i - 1]) { + auto swap_func = swap_funcs[idx++]; + idx %= FUNC_COPIES; + swap_func(&a[i - 1], &a[i]); + swapped = 1; + } + } + --end; + } +} + +void sort_array(int *data) { + for (int i = 0; i < ARRAY_LEN; ++i) { + data[i] = rand(); + } + bubble_sort(data, ARRAY_LEN); +} + +/* Timers, helpers, and main */ +static struct timespec timer_start; +static inline void start_timer(void) { + clock_gettime(CLOCK_MONOTONIC, &timer_start); +} + +static inline void stop_timer(void) { + struct timespec timer_end; + clock_gettime(CLOCK_MONOTONIC, &timer_end); + long long ms = (timer_end.tv_sec - timer_start.tv_sec) * 1000LL + + (timer_end.tv_nsec - timer_start.tv_nsec) / 1000000LL; + printf("%lld ms ", ms); +} + +static void print_first_last(const int *data, int n) { + if (n <= 0) + return; + + const int first = data[0]; + const int last = data[n - 1]; + printf("(first=%d last=%d)\n", first, last); +} + +int main(void) { + srand(0); + printf("Bubble sorting %d elements\n", ARRAY_LEN); + int data[ARRAY_LEN]; + + start_timer(); + sort_array(data); + stop_timer(); + + print_first_last(data, ARRAY_LEN); + return 0; +} +``` The [last section](#why-bubble-sort) explains why we chose BubbleSort for this tutorial. 
From 2cfef8b3367fa7cde9b6c9f32ab4ac7723114013 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 10:46:51 -0400 Subject: [PATCH 26/51] Update setup.md --- .../bolt-demo/setup.md | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md index 61169bb8db..c4fb2b20fc 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md @@ -8,7 +8,7 @@ layout: learningpathall ### Environment setup -In your home directory (or another empty working directory), create a file named `bsort.cpp` with the following content: +On your AArch64 Linux bare-metal instance, navigate to your home directory (or another empty working directory) and create a file named `bsort.cpp` with the following content: ```cpp #include @@ -126,26 +126,26 @@ int main(void) { } ``` -The [last section](#why-bubble-sort) explains why we chose BubbleSort for this tutorial. - -We create and use the following directories as needed throughout this guide: +The [last section](#why-bubble-sort) explains why this tutorial uses BubbleSort as the demonstration workload. +Create the following directories to organize generated files from this example: +```bash +mkdir -p out prof heatmap +``` - **out**: Stores output binaries - **prof**: Stores profile data - **heatmap**: Stores heatmap visualizations and related metrics ### Compile the input program {#compile} -We now compile the input binary. -Because BOLT and PGO pipelines can include multiple stages, this binary is also called the **stage-0 binary**. +Next, compile the input program. +Because BOLT and other profile-guided optimization pipelines often involve multiple build stages, you will refer to this initial binary as the **stage-0 binary**. 
-To keep the example useful, we must keep the original function order. -Small programs like this are simple enough that compilers might reorder functions and improve layout without profile data. -That behavior is rare in real applications, but it can happen here. -To keep our example with poor locality, we pass specific options to the relevant toolchain. +For this example, you must preserve the original function order from the source file. +Small programs like this one are simple enough that modern compilers may reorder functions automatically to improve instruction locality, even without profile data. That behavior rarely affects large real-world applications, but it can occur in this example. To ensure the program retains its intentionally poor layout, pass specific options to the compiler and linker. BOLT works with both LLVM and GNU toolchains. -GNU (gcc) provides a direct flag that preserves the original order: `-fno-toplevel-reorder`. -LLVM (clang) requires an order file that defines the initial layout. +GNU (gcc) provides a flag that preserves the original order: `-fno-toplevel-reorder`. +LLVM Clang does not provide an equivalent flag, so it relies on a symbol ordering file that explicitly defines the initial function layout. You can find the file used in this tutorial here: You can find this file here: [orderfile.txt](../orderfile.txt). Both approaches are shown below. 
From 0c8545179466deab9b9851f292c704a20eae2ca1 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 11:00:00 -0400 Subject: [PATCH 27/51] Update setup.md --- .../bolt-demo/setup.md | 50 +++++++++++-------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md index c4fb2b20fc..dd3bc67eaf 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md @@ -145,12 +145,22 @@ Small programs like this one are simple enough that modern compilers may reorder BOLT works with both LLVM and GNU toolchains. GNU (gcc) provides a flag that preserves the original order: `-fno-toplevel-reorder`. -LLVM Clang does not provide an equivalent flag, so it relies on a symbol ordering file that explicitly defines the initial function layout. You can find the file used in this tutorial here: -You can find this file here: [orderfile.txt](../orderfile.txt). +LLVM Clang does not provide an equivalent flag, so it relies on a symbol ordering file that explicitly defines the initial function layout. Using a file editor of your choice copy the contents below into a file named `orderfile.txt`. +```txt +_ZL5swap1PiS_ +_ZL10cold_func1v +_ZL5swap2PiS_ +_ZL10cold_func2v +_ZL5swap3PiS_ +_ZL10cold_func3v +_ZL5swap4PiS_ +_ZL10cold_func4v +_ZL5swap5PiS_ +_ZL10cold_func5v +``` -Both approaches are shown below. -Compile with your preferred toolchain, and ensure that relocations are enabled. -We explain why they matter [later](#why-relocations) in this tutorial. +Both approaches to compile the binary are shown. Compile with your preferred toolchain, and ensure that relocations are enabled. +You will look at why relocations matter [later](#why-relocations) in this Learning Path. 
{{< tabpane code=true >}} {{< tab header="GNU" language="bash">}} @@ -164,9 +174,8 @@ clang bsort.cpp -o out/bsort -O3 -fuse-ld=lld -ffunction-sections -Wl,--emit-rel {{< /tabpane >}} ### Verify the function order -We now verify that the compiler preserved the original function order. -We do this by inspecting the symbols in the `.text` section. -The output should list the swap and cold functions interleaved, matching their order in the source file. +Verify that the compiler preserved the intended function order by inspecting the symbols in the `.text` section of the binary. +Run the following command: {{< tabpane code=true >}} {{< tab header="GNU" language="bash" output_lines="2-13">}} @@ -201,10 +210,14 @@ The output should list the swap and cold functions interleaved, matching their o {{< /tab >}} {{< /tabpane >}} +The output should show the **swap** and **cold** functions interleaved. +This layout matches the order in the source file and creates poor instruction locality, which makes the program a good candidate for BOLT optimization. ### Verify the presence of relocations -We now verify that the binary includes relocations. -This can be seen by checking for `.rel*.*` entries in the section table, such as `.rela.text`. +Verify that the binary contains relocation information. +BOLT relies on relocation records to safely modify the binary layout after linking. + +Check the ELF section table and confirm that relocation sections such as `.rela.text` appear in the output. {{< tabpane code=true >}} {{< tab header="GNU" language="bash" output_lines="2-13">}} @@ -237,21 +250,18 @@ This can be seen by checking for `.rel*.*` entries in the section table, such as {{< /tab >}} {{< /tabpane >}} +Look for relocation sections such as **`.rela.text`** in the output. Their presence confirms that the linker preserved relocation information required by BOLT. 
### Why relocations are important {#why-relocations} -BOLT relies on relocations to update references after it changes the code layout. -Without relocations, BOLT is severely limited. For example, function reordering is disabled, which makes code layout optimizations ineffective. - -Because BOLT runs post-link, it may need to adjust locations that the linker patched in the original binary. -Relocations describe these locations, so they must be preserved for BOLT to be able to apply its full set of layout optimizations. +BOLT uses relocation records to update references after it changes the code layout. When BOLT reorders functions or basic blocks, it must update addresses used by instructions such as calls, branches, and references to code or data. Relocation records identify these locations in the binary so that BOLT can safely rewrite them. +Without relocations, BOLT cannot reliably adjust these references. As a result, many optimizations become unavailable. For example, BOLT disables function reordering when relocation information is missing, which prevents most code layout optimizations. +Because BOLT operates on fully linked binaries, it must modify addresses that the linker already resolved. Relocations preserve the information needed to update those addresses correctly. ### Why Bubble Sort? -Bubble Sort keeps this tutorial simple. -The code is in one file, has no external dependencies, and runs in a few seconds under instrumentation with a small, fixed workload. -In its original form it is not a good candidate for code layout optimization. -To make it one, we add **cold code** blocks between hot paths. -This reduces code locality, which BOLT improves later. +Bubble Sort is a simple program with all the code in one file. The program has no external dependencies, and runs in a few seconds under instrumentation with a small, fixed workload. +In its original form, the program does not benefit much from code layout optimization. 
To create a more interesting example, instruction locality is intentionally reduced. +We introduce **cold code paths** between frequently executed code. These cold blocks separate hot instructions in memory and degrade spatial locality. BOLT later improves performance by reorganizing the binary so that hot code paths appear closer together. The code below shows the changes we introduced to reduce code locality. From 0e5de1d57fc40f6a9359249dbfe972a7e3935938 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 11:34:45 -0400 Subject: [PATCH 28/51] Update setup.md --- .../servers-and-cloud-computing/bolt-demo/setup.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md index dd3bc67eaf..0be10ed92b 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md @@ -263,9 +263,9 @@ Bubble Sort is a simple program with all the code in one file. The program has n In its original form, the program does not benefit much from code layout optimization. To create a more interesting example, instruction locality is intentionally reduced. We introduce **cold code paths** between frequently executed code. These cold blocks separate hot instructions in memory and degrade spatial locality. BOLT later improves performance by reorganizing the binary so that hot code paths appear closer together. -The code below shows the changes we introduced to reduce code locality. +The code below shows how the program was modified to reduce code locality. -The main sort function is shown below. It rotates through 5 copies of the swap function, selecting a different one each time a swap is performed. +The main sort function rotates through five copies of the swap function. 
Each time the algorithm performs a swap, it selects the next swap implementation in a round-robin fashion. ```cpp { line_numbers=true linenos=table line_start=48 } void bubble_sort(int *a, int n) { if (n <= 1) @@ -291,8 +291,8 @@ void bubble_sort(int *a, int n) { } } ``` +Each swap function is defined using a macro and contains a small cold path that includes several nop instructions. -Each swap function is defined using a macro and includes some nop instructions on a cold path. ```cpp { line_numbers=true linenos=table line_start=18 } #define SWAP_FUNC(ID) \ static __attribute__((noinline)) \ @@ -304,8 +304,8 @@ Each swap function is defined using a macro and includes some nop instructions o } ``` -To further reduce code locality, we place larger cold functions between hot ones. -These cold functions are also defined using a macro and consist entirely of nop instructions. +To further reduce code locality, we place larger cold functions between frequently executed functions. These cold functions occupy space in the instruction layout and push hot code farther apart in memory. +We define these cold functions using a macro. Each function contains only a nop instruction and does not participate in the program’s hot execution path. ```cpp { line_numbers=true linenos=table line_start=28 } #define COLD_FUNC(ID) \ static __attribute__((noinline, aligned(16384), used)) \ From be7aba9a8b04f26ce94b6e6a79bbb5d9fdbe8242 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 13:05:32 -0400 Subject: [PATCH 29/51] Revise good BOLT candidate criteria and metrics Updated the content to clarify what makes code a good BOLT candidate and improved the explanation of metrics and indicators. 
---
 .../bolt-demo/good-candidates.md              | 45 +++++++++++--------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/good-candidates.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/good-candidates.md
index 4043380c52..bc3991a13d 100644
--- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/good-candidates.md
+++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/good-candidates.md
@@ -6,28 +6,30 @@ weight: 4
 layout: learningpathall
 ---
 
-## Which code is a good BOLT candidate?
-A few hardware metrics can indicate whether a program is a good candidate for code-layout optimization.
-These metrics are commonly analyzed using general methodologies such as the [Arm TopDown methodology](https://developer.arm.com/documentation/109542/02/Arm-Topdown-methodology).
+## What makes the code a good BOLT candidate?
+Hardware performance metrics can help determine whether a program is a good candidate for code layout optimization with BOLT. Developers often analyze these metrics using methodologies such as the [Arm TopDown methodology](https://developer.arm.com/documentation/109542/02/Arm-Topdown-methodology).
 
-Here, we focus on a small set of TopDown indicators related to instruction delivery and code locality.
-These indicators describe how efficiently the processor can fetch instructions and keep its execution pipeline busy.
-When instruction delivery is inefficient, the workload is said to be **front-end bound**, meaning the CPU often waits for instructions instead of executing them.
+In this tutorial, you will focus on a small set of TopDown indicators related to instruction delivery and code locality. These indicators describe how efficiently the processor fetches instructions and keeps the execution pipeline busy.
+
+When instruction delivery is inefficient, the workload is referred to as **front-end bound**, meaning the CPU often waits for instructions instead of executing them.
 This usually points to instruction fetch or code layout issues, where improving code layout can help.
 
 The L1 instruction cache (L1 I-cache) is the first and fastest cache used to store instructions close to the CPU.
 When instructions are not found there, the CPU must fetch them from slower memory, which can stall execution.
 
 MPKI, short for misses per kilo instructions, measures how often an event misses per 1,000 executed instructions, which makes it easier to compare across programs and workloads.
-A high **L1 I-cache MPKI** usually indicates poor instruction locality in the binary.
+A high L1 I-cache MPKI usually indicates poor instruction locality in the binary.
+
+Based on these observations, the BOLT community typically considers a program a good candidate for layout optimization when:
+- The workload is more than 10% front-end bound
+- The L1I cache misses per kilo instructions (MPKI) exceeds 30
 
-Based on these observations, the BOLT community suggests the following two indicators of a good candidate:
-- Front-End bound workload above 10%.
-- More than 30 L1 I-cache misses per kilo instructions (MPKI).
+Higher branch mispredictions or I-TLB misses can also indicate that code layout optimization may improve performance.
 
-Higher branch mispredictions or I-TLB misses can also indicate that layout optimization may help.
+## Collecting the metrics
 
-We can use the Topdown Methodology (see [installation guide](/install-guides/topdown-tool)) to collect these metrics, which is based on the Linux [perf](/install-guides/perf/) tool.
-Alternatively, we can compute only the L1 I-cache MPKI metric manually using plain Linux perf stat.
+You can collect these metrics using the Topdown Methodology (see [installation guide](/install-guides/topdown-tool)) which builds on the Linux [perf](/install-guides/perf/) profiling tool. + +Alternatively, you can compute only the L1 I-cache MPKI metric manually using a basic Linux `perf stat` command. {{< tabpane code=true >}} {{< tab header="topdown-tool" language="bash" output_lines="2-21">}} @@ -67,12 +69,19 @@ Alternatively, we can compute only the L1 I-cache MPKI metric manually using pla {{< /tab >}} {{< /tabpane >}} -We see that the program is **55%** front-end bound. -At Stage 2, the micro-architectural metrics report **60 L1I MPKI**, which indicates a good candidate for layout optimization. -The branch MPKI of **16** is also relatively high. +## Interpreting the results + +In this example, the program is **55% front-end bound**, which indicates that the processor frequently stalls while waiting for instructions. +At Stage 2, the microarchitectural metrics report an **L1I cache MPKI of about 60**, which strongly suggests poor instruction locality. This value exceeds the typical threshold of 30 MPKI for good BOLT candidates. + +The **branch MPKI of 16** also indicates frequent branch mispredictions, which code layout optimization may improve. + +## Computing MPKI manually + +The `topdown-tool` collects performance counters using `perf` and applies formulas to derive higher-level metrics. + +To compute the **L1I cache MPKI** manually from the `perf stat` output, apply the following formula: -Under the hood, the `topdown-tool` collects perf counters and applies formulas to derive these metrics. -To compute the L1 I-cache MPKI manually from the `perf stat` output, we apply: $$\frac{(\text{L1-icache-misses} \times 1000)}{\text{instructions}}$$ ### Further Reading From 8c0b58547e10a4492ac068e455dc0843611b8d6e Mon Sep 17 00:00:00 2001 From: Arnaud de Grandmaison Date: Fri, 13 Mar 2026 18:14:17 +0100 Subject: [PATCH 30/51] [SME2] Add a new device to the list. 
--- .../multiplying-matrices-with-sme2/1-get-started.md | 1 + 1 file changed, 1 insertion(+) diff --git a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/1-get-started.md b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/1-get-started.md index 0088d07e5e..4a6cc42f99 100644 --- a/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/1-get-started.md +++ b/content/learning-paths/cross-platform/multiplying-matrices-with-sme2/1-get-started.md @@ -319,6 +319,7 @@ These Android phones support SME2 natively. |-------------------------------------|--------------|---------------------------| | Vivo X300 | 2025 | MediaTek Dimensity 9500 featuring an 8-core Arm C1 CPU cluster and Arm G1-Ultra GPU | | OPPO Find X9 | 2025 | MediaTek Dimensity 9500 featuring an 8-core Arm C1 CPU cluster and Arm G1-Ultra GPU | +| Samsung S26 | 2026 | Samsung Exynos 2600 featuring a SME2-enabled C1-Ultra and C1-Pro CPU cluster | These Apple devices support SME2 natively. From e7f8187cecd8d290a0e967206e43961f880909f8 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 13:16:06 -0400 Subject: [PATCH 31/51] Revise BRBE documentation for clarity and detail Updated the explanation of BRBE's profiling mechanism and its availability. Added details on how to check for BRBE support and clarified the steps for recording a BRBE profile. --- .../bolt-demo/brbe.md | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/brbe.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/brbe.md index da9f4f2e60..0eb0a9a460 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/brbe.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/brbe.md @@ -9,7 +9,7 @@ layout: learningpathall ### What is BRBE BRBE stands for Branch Record Buffer Extension. 
It is an Arm hardware unit with a circular buffer that captures the most recent 32 or 64 taken branches. The exact size depends on the hardware implementation. -For BOLT, BRBE provides an effective, low-overhead sampling mechanism that records taken branches directly in hardware without frequent interruptions. Each recorded taken branch represents a control-flow edge, which makes BRBE an edge-based profiling method. +For BOLT, BRBE provides an efficient and low-overhead way to collect profiling data. The hardware records taken branches directly without requiring frequent interrupts or instrumentation. Each recorded taken branch represents a control-flow edge, which makes BRBE an edge-based profiling method. Taken branches are continuously added to the circular buffer, and the buffer is periodically sampled to keep overheads low. Recording only taken branches is an efficient use of the buffer, since fall-through paths do not need to be captured at runtime. @@ -19,24 +19,12 @@ During post-processing, fall-through edges between the recorded taken branches a When available, BRBE is the preferred profiling option for BOLT. It is expected to have the lowest runtime overhead while still providing near-optimal profiles, close to those obtained with instrumentation. -### Optimizing with BRBE -We check [BRBE availability](#availability) before recording a profile. -We then record a BRBE profile by running our workload under perf, convert it into a format that BOLT understands, and run the BOLT optimization. 
- -```bash { line_numbers=true } -mkdir -p prof -perf record -j any,u -o prof/brbe.data -- ./out/bsort -perf2bolt -p prof/brbe.data -o prof/brbe.fdata out/bsort -llvm-bolt out/bsort -o out/bsort.opt.brbe --data prof/brbe.fdata \ - -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions \ - --dyno-stats -``` - - ### Availability -BRBE is an optional feature in processors that implement [Armv9.1](https://developer.arm.com/documentation/109697/2025_09/Feature-descriptions/The-Armv9-2-architecture-extension#extension__feat_FEAT_BRBE) or later. To check availability, we record a trace. +BRBE is an optional processor feature called **FEAT_BRBE** (Branch Record Buffer Extension), introduced in the [Armv9.1 architecture](https://developer.arm.com/documentation/109697/2025_09/Feature-descriptions/The-Armv9-2-architecture-extension#extension__feat_FEAT_BRBE). +To check whether your system supports BRBE, attempt to record a branch profile using `perf`. + +If BRBE is available, the command records the branch samples successfully: -On a successful recording we see: ```bash { command_line="user@host | 2-5"} perf record -j any,u -o prof/brbe.data -- ./out/bsort Bubble sorting 10000 elements @@ -45,17 +33,32 @@ Bubble sorting 10000 elements [ perf record: Captured and wrote 40.244 MB brbe.data (26662 samples) ] ``` -When unavailable: +If the processor or kernel does not support BRBE, perf reports an error similar to the following: + ```bash { command_line="user@host | 2-3"} perf record -j any,u -o prof/brbe.data -- ./out/bsort Error: cycles:P: PMU Hardware or event type doesn't support branch stack sampling. ``` -To record a BRBE trace we need a Linux system that is version 6.17 or later. We can check the version using: +Recording BRBE profiles requires a Linux kernel version 6.17 or later. 
+Check your kernel version with: ```bash perf --version ``` +### Optimizing with BRBE +We then record a BRBE profile by running our workload under perf, convert it into a format that BOLT understands, and run the BOLT optimization. + +```bash { line_numbers=true } +mkdir -p prof +perf record -j any,u -o prof/brbe.data -- ./out/bsort +perf2bolt -p prof/brbe.data -o prof/brbe.fdata out/bsort +llvm-bolt out/bsort -o out/bsort.opt.brbe --data prof/brbe.fdata \ + -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions \ + --dyno-stats +``` + + ### Further Reading From 77047f667611ade2d4b9aa64abf4396ec16ad6ce Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Mar 2026 17:19:14 +0000 Subject: [PATCH 32/51] Initial plan From f9c8b29375a145020a994e44e0c310d2e13b4a99 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Mar 2026 17:26:00 +0000 Subject: [PATCH 33/51] Replace NEON with Neon (Arm approved trademark) in markdown content Co-authored-by: pareenaverma <59843121+pareenaverma@users.noreply.github.com> --- content/install-guides/claude-code.md | 2 +- content/install-guides/github-copilot.md | 2 +- .../cross-platform/adler32/_index.md | 8 ++--- .../cross-platform/adler32/about-2.md | 20 ++++++------- .../cross-platform/adler32/build-6.md | 2 +- .../cross-platform/adler32/more-11.md | 4 +-- .../cross-platform/adler32/neon-7.md | 12 ++++---- .../cross-platform/adler32/neon-debug-9.md | 10 +++---- .../cross-platform/adler32/neon-run-8.md | 12 ++++---- .../cross-platform/adler32/summary-10.md | 22 +++++++------- .../eigen-on-arm-part1.md | 2 +- .../cross-platform/intrinsics/_index.md | 2 +- .../cross-platform/intrinsics/intro.md | 2 +- .../cross-platform/intrinsics/simde.md | 2 +- .../cross-platform/intrinsics/sse2neon.md | 6 ++-- .../kleidiai-explainer/_index.md | 2 +- .../simd-info-demo/conclusion.md | 2 +- 
.../simd-info-demo/simdinfo-description.md | 6 ++-- .../simd-info-demo/simdinfo-example1-cont.md | 8 ++--- .../simdinfo-example1-porting.md | 10 +++---- .../simd-info-demo/simdinfo-example2.md | 2 +- .../cross-platform/simd-loops/1-about.md | 8 ++--- .../cross-platform/simd-loops/2-using.md | 6 ++-- .../cross-platform/simd-loops/3-example.md | 4 +-- .../cross-platform/simd-loops/4-conclusion.md | 2 +- .../cross-platform/simd-loops/_index.md | 6 ++-- .../03-model-onboarding-and-profiling.md | 2 +- .../1-vectorization.md | 30 +++++++++---------- .../2-code-examples.md | 16 +++++----- .../vectorization-comparison/_index.md | 4 +-- .../a-more-complex-problem-revisited.md | 2 +- .../embedded-and-microcontrollers/_index.md | 2 +- .../migration/3_porting_analysis.md | 2 +- .../migration/5_application_porting.md | 2 +- .../migration/7_alternative.md | 4 +-- .../migration/_index.md | 2 +- .../raspberry-pi-smart-home/1-overview.md | 2 +- .../laptops-and-desktops/_index.md | 2 +- .../dgx_spark_llamacpp/3_gb10_llamacpp_cpu.md | 2 +- .../dgx_spark_llamacpp/4_gb10_processwatch.md | 6 ++-- .../kleidicv-on-mac/build-1.md | 2 +- .../kleidicv-on-mac/run-test-2.md | 6 ++-- .../mobile-graphics-and-gaming/_index.md | 2 +- .../aot-and-cross-compilation.md | 2 +- .../android_halide/intro.md | 2 +- .../android_neon/_index.md | 2 +- .../android_sve2/part1.md | 2 +- .../litert-sme/1-litert-kleidiai-sme2.md | 8 ++--- .../litert-sme/3-build-tool.md | 4 +-- .../onnx/01_fundamentals.md | 6 ++-- .../kleidiai_integration.md | 2 +- .../_index.md | 2 +- .../servers-and-cloud-computing/_index.md | 2 +- .../arm-mcp-server/3-simd-migration.md | 8 ++--- .../migration.md | 2 +- .../bitmap_scan_sve2/01-introduction.md | 8 ++--- .../03-scalar-implementations.md | 4 +-- .../04-vector-implementations.md | 14 ++++----- .../05-benchmarking-and-results.md | 22 +++++++------- .../06-application-and-best-practices.md | 14 ++++----- .../bitmap_scan_sve2/_index.md | 6 ++-- .../deepseek-cpu/deepseek-chatbot.md | 6 
++-- .../docker-mcp-toolkit/1-overview.md | 4 +-- .../3-understand-the-demo.md | 18 +++++------ .../docker-mcp-toolkit/4-run-migration.md | 10 +++---- .../5-validate-and-next-steps.md | 4 +-- .../docker-mcp-toolkit/_index.md | 4 +-- .../llama-cpu/llama-chatbot.md | 6 ++-- .../llama_cpp_streamline/2_llama.cpp_intro.md | 2 +- .../3_llama.cpp_annotation.md | 2 +- .../4_analyze_token_prefill_decode.md | 4 +-- .../migrate-ease/_index.md | 2 +- .../migration/_index.md | 2 +- .../migration/c.md | 6 ++-- .../migration/java.md | 2 +- .../migration/migration.md | 2 +- .../processwatch/running-processwatch.md | 8 ++--- .../processwatch/using-processwatch.md | 22 +++++++------- .../reproducible-libamath/applications.md | 6 ++-- .../reproducible-libamath/examples.md | 14 ++++----- .../reproducible-libamath/reproducibility.md | 6 ++-- .../reproducibility_libamath.md | 12 ++++---- .../servers-and-cloud-computing/sve/_index.md | 6 ++-- .../sve/sve_basics.md | 14 ++++----- .../sve2-match/_index.md | 2 +- .../triggering-pmu-events-2/operation.md | 4 +-- .../using-and-porting-performance-libs/1.md | 2 +- .../using-and-porting-performance-libs/3.md | 6 ++-- 88 files changed, 269 insertions(+), 269 deletions(-) diff --git a/content/install-guides/claude-code.md b/content/install-guides/claude-code.md index 5f8c92b3c3..9fa5be4f64 100644 --- a/content/install-guides/claude-code.md +++ b/content/install-guides/claude-code.md @@ -260,7 +260,7 @@ Here are some example prompts that use the Arm MCP Server tools: - `Scan my workspace for code that needs updating for Arm compatibility` - `Check if the postgres:latest container image supports Arm64 architecture` -- `Search the Arm knowledge base for NEON intrinsics examples` +- `Search the Arm knowledge base for Neon intrinsics examples` - `Find learning resources about migrating from x86 to Arm` - `Analyze this assembly code for performance on Arm processors` diff --git a/content/install-guides/github-copilot.md 
b/content/install-guides/github-copilot.md index 1a2ab2a64c..299ca09f2a 100644 --- a/content/install-guides/github-copilot.md +++ b/content/install-guides/github-copilot.md @@ -332,7 +332,7 @@ Example prompts that use the Arm MCP Server: - `Scan my workspace for code that needs updating for Arm compatibility` - `Check if the postgres:latest container image supports Arm64 architecture` -- `Search the Arm knowledge base for NEON intrinsics examples` +- `Search the Arm knowledge base for Neon intrinsics examples` - `Find learning resources about migrating from x86 to Arm` ## Troubleshooting MCP Server connections diff --git a/content/learning-paths/cross-platform/adler32/_index.md b/content/learning-paths/cross-platform/adler32/_index.md index f038fe9c70..cd9294380e 100644 --- a/content/learning-paths/cross-platform/adler32/_index.md +++ b/content/learning-paths/cross-platform/adler32/_index.md @@ -1,12 +1,12 @@ --- -title: Write NEON intrinsics using GitHub Copilot to improve Adler32 performance +title: Write Neon intrinsics using GitHub Copilot to improve Adler32 performance minutes_to_complete: 45 -who_is_this_for: This is an introductory topic for C/C++ developers who are interested in using GitHub Copilot to improve performance using NEON intrinsics. +who_is_this_for: This is an introductory topic for C/C++ developers who are interested in using GitHub Copilot to improve performance using Neon intrinsics. learning_objectives: - - Use GitHub Copilot to write NEON intrinsics that accelerate the Adler32 checksum algorithm. + - Use GitHub Copilot to write Neon intrinsics that accelerate the Adler32 checksum algorithm. prerequisites: - An Arm computer running Linux with the GNU compiler (gcc) installed. 
@@ -43,7 +43,7 @@ further_reading: link: https://en.wikipedia.org/wiki/Adler-32 type: Article - resource: - title: NEON Programming Quick Reference + title: Neon Programming Quick Reference link: https://developer.arm.com/documentation/den0018/a type: Documentation diff --git a/content/learning-paths/cross-platform/adler32/about-2.md b/content/learning-paths/cross-platform/adler32/about-2.md index 3c2eb862bd..e1d75177f3 100644 --- a/content/learning-paths/cross-platform/adler32/about-2.md +++ b/content/learning-paths/cross-platform/adler32/about-2.md @@ -1,5 +1,5 @@ --- -title: About NEON and Adler32 +title: About Neon and Adler32 weight: 2 ### FIXED, DO NOT MODIFY @@ -10,23 +10,23 @@ layout: learningpathall In computing, optimizing performance is crucial for applications that process large amounts of data. This Learning Path guides you through implementing and optimizing the Adler32 checksum algorithm using Arm advanced SIMD (Single Instruction, Multiple Data) instructions. You'll learn how to leverage GitHub Copilot to simplify the development process while achieving significant performance improvements. -## Simplifying Arm NEON Development with GitHub Copilot +## Simplifying Arm Neon Development with GitHub Copilot -Developers recognize that Arm NEON SIMD instructions can significantly boost performance for computationally intensive applications, particularly in areas like image processing, audio/video codecs, and machine learning. However, writing NEON intrinsics directly requires specialized knowledge of the instruction set, careful consideration of data alignment, and complex vector operations that can be error-prone and time-consuming. Many developers avoid implementing these optimizations due to the steep learning curve and development overhead. 
+Developers recognize that Arm Neon SIMD instructions can significantly boost performance for computationally intensive applications, particularly in areas like image processing, audio/video codecs, and machine learning. However, writing Neon intrinsics directly requires specialized knowledge of the instruction set, careful consideration of data alignment, and complex vector operations that can be error-prone and time-consuming. Many developers avoid implementing these optimizations due to the steep learning curve and development overhead. -The good news is that AI developer tools such as GitHub Copilot make working with NEON intrinsics much more accessible. By providing intelligent code suggestions, automated vectorization hints, and contextual examples tailored to your specific use case, GitHub Copilot can help bridge the knowledge gap and accelerate the development of NEON-optimized code. This allows developers to harness the full performance potential of Arm processors - without the usual complexity and overhead. +The good news is that AI developer tools such as GitHub Copilot make working with Neon intrinsics much more accessible. By providing intelligent code suggestions, automated vectorization hints, and contextual examples tailored to your specific use case, GitHub Copilot can help bridge the knowledge gap and accelerate the development of Neon-optimized code. This allows developers to harness the full performance potential of Arm processors - without the usual complexity and overhead. -You can demonstrate writing NEON intrinsics with GitHub Copilot by creating a full project from scratch and comparing the C implementation to a NEON-optimized version. +You can demonstrate writing Neon intrinsics with GitHub Copilot by creating a full project from scratch and comparing the C implementation to a Neon-optimized version. 
While you may not create complete projects from scratch - and you shouldn't blindly trust the generated code - it's helpful to see what's possible using an example so you can apply the principles to your own projects. -## Accelerating Adler32 with Arm NEON +## Accelerating Adler32 with Arm Neon -This project demonstrates how to accelerate Adler32 checksum calculations using Arm NEON instructions. +This project demonstrates how to accelerate Adler32 checksum calculations using Arm Neon instructions. -### What is Arm NEON? +### What is Arm Neon? -Arm NEON is an advanced SIMD architecture extension for Arm processors. It provides a set of instructions that can process multiple data elements in parallel using specialized vector registers. NEON technology enables developers to accelerate computationally intensive algorithms by performing the same operation on multiple data points simultaneously, rather than processing them one at a time. This parallelism is particularly valuable for multimedia processing, scientific calculations, and cryptographic operations where the same operation needs to be applied to large datasets. +Arm Neon is an advanced SIMD architecture extension for Arm processors. It provides a set of instructions that can process multiple data elements in parallel using specialized vector registers. Neon technology enables developers to accelerate computationally intensive algorithms by performing the same operation on multiple data points simultaneously, rather than processing them one at a time. This parallelism is particularly valuable for multimedia processing, scientific calculations, and cryptographic operations where the same operation needs to be applied to large datasets. ## What Is the Adler32 Algorithm? @@ -47,7 +47,7 @@ This project walks you through building the following components using GitHub Co - A test program to validate outputs for various input sizes. - A Makefile to build and run the program. 
- Performance measurement code to record how long the algorithm takes. -- A NEON-optimized version of Adler32. +- A Neon-optimized version of Adler32. - A performance comparison table for both implementations. Continue to the next section to start creating the project. diff --git a/content/learning-paths/cross-platform/adler32/build-6.md b/content/learning-paths/cross-platform/adler32/build-6.md index 5639b7d00d..87e10c6d02 100644 --- a/content/learning-paths/cross-platform/adler32/build-6.md +++ b/content/learning-paths/cross-platform/adler32/build-6.md @@ -59,4 +59,4 @@ The results confirm that your Adler-32 checksum implementation is correct for al The results from GitHub Copilot confirm that the Adler32 checksum calculations are correct and provide initial performance benchmarks. These results offer a solid baseline, but a meaningful comparison requires an optimized implementation. -In the next section, you’ll implement Adler32 using NEON intrinsics and compare its performance against this baseline. \ No newline at end of file +In the next section, you’ll implement Adler32 using Neon intrinsics and compare its performance against this baseline. \ No newline at end of file diff --git a/content/learning-paths/cross-platform/adler32/more-11.md b/content/learning-paths/cross-platform/adler32/more-11.md index 446a2ea136..6687a0e9a1 100644 --- a/content/learning-paths/cross-platform/adler32/more-11.md +++ b/content/learning-paths/cross-platform/adler32/more-11.md @@ -13,6 +13,6 @@ GitHub Copilot can help you explore additional performance and optimization idea - Test different compiler flags using Agent mode to automate iteration and identify the best combinations. - Add Clang support to your Makefile and compare performance against GCC — performance can differ significantly depending on your code structure. - Generate a wider range of data sizes and random patterns to stress-test functionality and measure performance under varied conditions. 
-- Explore alternative algorithm structures that rely on compiler autovectorization instead of NEON intrinsics — you might discover better performance simply by restructuring the C code. +- Explore alternative algorithm structures that rely on compiler autovectorization instead of Neon intrinsics — you might discover better performance simply by restructuring the C code. -AI tools won’t always generate high-performance code out of the box, but they can rapidly accelerate your experimentation and learning — especially in new areas of programming like NEON-based performance optimization. +AI tools won’t always generate high-performance code out of the box, but they can rapidly accelerate your experimentation and learning — especially in new areas of programming like Neon-based performance optimization. diff --git a/content/learning-paths/cross-platform/adler32/neon-7.md b/content/learning-paths/cross-platform/adler32/neon-7.md index cc4c9ee3cf..2ad8890255 100644 --- a/content/learning-paths/cross-platform/adler32/neon-7.md +++ b/content/learning-paths/cross-platform/adler32/neon-7.md @@ -1,16 +1,16 @@ --- -title: Create a NEON version of Adler32 +title: Create a Neon version of Adler32 weight: 7 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## How Can I Create a NEON Version of Adler32 To Improve Performance? +## How Can I Create a Neon Version of Adler32 To Improve Performance? -Writing NEON intrinsics is an advanced programming topic which requires time and effort to learn and apply. GitHub Copilot is a good way to get started with NEON, both to write new code and to debug issues during development. +Writing Neon intrinsics is an advanced programming topic which requires time and effort to learn and apply. GitHub Copilot is a good way to get started with Neon, both to write new code and to debug issues during development. 
-You can instruct Github Copilot to create a NEON-optimized version of Adler32 using the prompt below: +You can instruct Github Copilot to create a Neon-optimized version of Adler32 using the prompt below: ```console Add a second implementation of adler32 that processes data in blocks and uses Arm NEON intrinsics to improve performance. @@ -553,6 +553,6 @@ Now that I've created the NEON implementation and updated the test code and Make ``` -GitHub Copilot generated a NEON-optimized implementation of Adler32, along with updates to the test program and Makefile. These changes make it easy to compare the performance of both versions. +GitHub Copilot generated a Neon-optimized implementation of Adler32, along with updates to the test program and Makefile. These changes make it easy to compare the performance of both versions. -Continue to the next section to compile and run the NEON version, and to compare its performance against the original implementation. +Continue to the next section to compile and run the Neon version, and to compare its performance against the original implementation. diff --git a/content/learning-paths/cross-platform/adler32/neon-debug-9.md b/content/learning-paths/cross-platform/adler32/neon-debug-9.md index 9c69ef7689..c2ff5e216a 100644 --- a/content/learning-paths/cross-platform/adler32/neon-debug-9.md +++ b/content/learning-paths/cross-platform/adler32/neon-debug-9.md @@ -1,14 +1,14 @@ --- -title: Debug the NEON version to match the standard C version +title: Debug the Neon version to match the standard C version weight: 9 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## How Can I Debug the NEON Version Adler32 and Compare Performance? +## How Can I Debug the Neon Version Adler32 and Compare Performance? -In the previous step, GitHub Copilot revealed that the NEON implementation did not produce the same checksum results as the standard C version. 
+In the previous step, GitHub Copilot revealed that the Neon implementation did not produce the same checksum results as the standard C version. Use the prompt below to instruct GitHub Copilot to try to debug and fix the issue. @@ -17,7 +17,7 @@ Can you modify adler32-neon.c to produce the correct checksum results for the ad The NEON version should produce the same checksum as adler32-simple.c but run faster. ``` -If the LLM you’re using cannot resolve the NEON functional issues, consider trying another model, such as Gemini 2.5 Pro. +If the LLM you’re using cannot resolve the Neon functional issues, consider trying another model, such as Gemini 2.5 Pro. The output is: @@ -94,6 +94,6 @@ I then checked for errors in the modified file and rebuilt the project using `ma You can now run the test program adler32-test to verify that the NEON implementation produces the same results as the simple implementation. ``` -GitHub Copilot successfully modified the NEON version to produce correct checksum results matching the standard C implementation. +GitHub Copilot successfully modified the Neon version to produce correct checksum results matching the standard C implementation. Continue to the next section to summarize the project, highlight the performance improvements, and generate a full project description. \ No newline at end of file diff --git a/content/learning-paths/cross-platform/adler32/neon-run-8.md b/content/learning-paths/cross-platform/adler32/neon-run-8.md index ff2a761cc7..f2076c9a6c 100644 --- a/content/learning-paths/cross-platform/adler32/neon-run-8.md +++ b/content/learning-paths/cross-platform/adler32/neon-run-8.md @@ -1,16 +1,16 @@ --- -title: Compare the NEON version to the standard C version of Adler32 +title: Compare the Neon version to the standard C version of Adler32 weight: 8 ### FIXED, DO NOT MODIFY layout: learningpathall --- -## How Can I Run the NEON Version of Adler32 and Compare Performance? 
+## How Can I Run the Neon Version of Adler32 and Compare Performance? -Before comparing performance, you need to ensure that the NEON implementation builds, runs, and produces the same results as the standard C version. +Before comparing performance, you need to ensure that the Neon implementation builds, runs, and produces the same results as the standard C version. -To build the NEON version and compare it to the C version use the prompt below in your GitHub Copilot Agent session: +To build the Neon version and compare it to the C version use the prompt below in your GitHub Copilot Agent session: ```console Use the Makefile to compile and run the new test program and compare the checksum values and performance of each implementation. @@ -123,6 +123,6 @@ The NEON implementation offers significant performance improvements (up to 5x fa If you'd like, I can analyze the code to identify and fix the issue in the NEON implementation so you can maintain both correctness and the performance benefits. ``` -GitHub Copilot determined that while the NEON version delivers better performance, it currently produces incorrect checksums. This must be addressed before the NEON implementation can be considered correct or production-ready. +GitHub Copilot determined that while the Neon version delivers better performance, it currently produces incorrect checksums. This must be addressed before the Neon implementation can be considered correct or production-ready. -In the next section, you'll fix the functional issues in the NEON implementation and re-run the performance comparison. \ No newline at end of file +In the next section, you'll fix the functional issues in the Neon implementation and re-run the performance comparison. 
\ No newline at end of file diff --git a/content/learning-paths/cross-platform/adler32/summary-10.md b/content/learning-paths/cross-platform/adler32/summary-10.md index 251238c601..34b72acb11 100644 --- a/content/learning-paths/cross-platform/adler32/summary-10.md +++ b/content/learning-paths/cross-platform/adler32/summary-10.md @@ -35,9 +35,9 @@ The Adler-32 checksum is an algorithm invented by Mark Adler, used in the zlib c This project provides and compares two implementations of the Adler-32 checksum algorithm: 1. A simple, standard C implementation. -2. An optimized implementation using ARM NEON SIMD instructions. +2. An optimized implementation using ARM Neon SIMD instructions. -The goal is to demonstrate the performance benefits of using NEON intrinsics for this type of computation on compatible ARM architectures, such as the Neoverse N1. +The goal is to demonstrate the performance benefits of using Neon intrinsics for this type of computation on compatible ARM architectures, such as the Neoverse N1. The project includes: * Source code for both implementations (`adler32-simple.c`, `adler32-neon.c`). @@ -51,11 +51,11 @@ The project includes: The code in `adler32-simple.c` is a straightforward C implementation following the standard Adler-32 algorithm definition. It processes the input data byte by byte, updating two 16-bit accumulators (`a` and `b`) modulo 65521 (the largest prime smaller than 2^16). -#### 2. NEON-Optimized Implementation +#### 2. Neon-Optimized Implementation -The code in `adler32-neon.c` leverages ARM NEON SIMD (Single Instruction, Multiple Data) instructions to accelerate the checksum calculation. Key aspects include: +The code in `adler32-neon.c` leverages ARM Neon SIMD (Single Instruction, Multiple Data) instructions to accelerate the checksum calculation. Key aspects include: * Processing data in blocks (16 bytes at a time). -* Using NEON intrinsics (`vld1q_u8`, `vmovl_u8`, `vaddq_u16`, `vpaddlq_u16`, `vmulq_u16`, etc.) 
to perform parallel operations on data vectors. +* Using Neon intrinsics (`vld1q_u8`, `vmovl_u8`, `vaddq_u16`, `vpaddlq_u16`, `vmulq_u16`, etc.) to perform parallel operations on data vectors. * Calculating the sums `S1` (sum of bytes) and `S2` (weighted sum) for each block using vector operations. * Updating the scalar `a` and `b` accumulators based on the block results. * Falling back to the standard implementation for data lengths smaller than the block size or for the remaining bytes after processing full blocks. @@ -64,9 +64,9 @@ The code in `adler32-neon.c` leverages ARM NEON SIMD (Single Instruction, Multip The performance of both implementations was measured on an **AWS Graviton2 processor (based on Arm Neoverse N1 cores)**. The benchmark program (`adler32-test`) calculates the checksum for various data sizes and measures the time taken and throughput (in MB/s). -The following table summarizes the throughput results and the performance improvement factor of the NEON version compared to the simple version: +The following table summarizes the throughput results and the performance improvement factor of the Neon version compared to the simple version: -| Data Size | Simple Throughput (MB/s) | NEON Throughput (MB/s) | Speedup Factor | +| Data Size | Simple Throughput (MB/s) | Neon Throughput (MB/s) | Speedup Factor | | :-------- | :----------------------- | :--------------------- | :------------- | | 1 KB | 244.14 | 976.56 | 4.00x | | 10 KB | 295.93 | 3255.21 | 11.00x | @@ -74,7 +74,7 @@ The following table summarizes the throughput results and the performance improv | 1 MB | 298.33 | 3215.43 | 10.78x | | 10 MB | 298.37 | 3194.89 | 10.71x | -**Note:** Performance results can vary based on the specific hardware, compiler, and system load. The results above demonstrate a significant performance improvement (around **10-11x** for larger data sizes) when using NEON optimization on the Neoverse N1 architecture. 
+**Note:** Performance results can vary based on the specific hardware, compiler, and system load. The results above demonstrate a significant performance improvement (around **10-11x** for larger data sizes) when using Neon optimization on the Neoverse N1 architecture. ### Building and Running @@ -91,17 +91,17 @@ make run make clean ``` -The table summarizes the speedup obtained by the NEON version. +The table summarizes the speedup obtained by the Neon version. Using Agent mode in GitHub Copilot is a significant benefit when you are actively building and running software. Agent mode can create files and modify them to make needed improvements. ### Tips for Using GitHub Copilot Effectively -This project was completed using GitHub Copilot Agent without modifying the generated files. While that might not be practical in every case, the demonstration shows how NEON intrinsics can significantly boost performance. +This project was completed using GitHub Copilot Agent without modifying the generated files. While that might not be practical in every case, the demonstration shows how Neon intrinsics can significantly boost performance. GitHub Copilot is especially useful for: * Generating vectorized versions of scalar code. -* Writing and adapting NEON intrinsics. +* Writing and adapting Neon intrinsics. * Identifying and fixing bugs in complex low-level code, even for developers who aren’t SIMD experts. Make sure to try different LLMs with Copilot as the results will vary greatly depending on the model. 
diff --git a/content/learning-paths/cross-platform/eigen-linear-algebra-on-arm/eigen-on-arm-part1.md b/content/learning-paths/cross-platform/eigen-linear-algebra-on-arm/eigen-on-arm-part1.md index cc70e81d24..80f9677d2f 100644 --- a/content/learning-paths/cross-platform/eigen-linear-algebra-on-arm/eigen-on-arm-part1.md +++ b/content/learning-paths/cross-platform/eigen-linear-algebra-on-arm/eigen-on-arm-part1.md @@ -63,7 +63,7 @@ sys 0m0.000s ### Testing on ASIMD -To compare with the SIMD (ASIMD/NEON in the case of Arm), remove the define from compilation and run the same way: +To compare with the SIMD (ASIMD/Neon in the case of Arm), remove the define from compilation and run the same way: ```bash { output_lines = "3-7" } diff --git a/content/learning-paths/cross-platform/intrinsics/_index.md b/content/learning-paths/cross-platform/intrinsics/_index.md index d6eea54443..88d0dd032b 100644 --- a/content/learning-paths/cross-platform/intrinsics/_index.md +++ b/content/learning-paths/cross-platform/intrinsics/_index.md @@ -34,7 +34,7 @@ armips: operatingsystems: - Linux tools_software_languages: - - NEON + - Neon - SVE - Intrinsics - Runbook diff --git a/content/learning-paths/cross-platform/intrinsics/intro.md b/content/learning-paths/cross-platform/intrinsics/intro.md index 68c715617e..e39c5d14c7 100644 --- a/content/learning-paths/cross-platform/intrinsics/intro.md +++ b/content/learning-paths/cross-platform/intrinsics/intro.md @@ -16,7 +16,7 @@ Intrinsics are functions which are built into the compiler and not part of a lib One use of intrinsics is to access SIMD (single-instruction, multiple-data) instructions directly from C/C++ for improved application performance. Intrinsics are easier to work with compared to assembly language, but they often pose a challenge when porting source code to a new architecture. 
-Intel Streaming SIMD Extensions (SSE) and [Arm NEON](https://developer.arm.com/documentation/dht0002/a/Introducing-NEON/NEON-architecture-overview/NEON-instructions) SIMD instructions increase processor throughput by performing multiple computations with a single instruction. Over the years, Intel and Arm have introduced a variety of SIMD extensions. NEON is used in many of the Arm Cortex-A, Cortex-R, and Neoverse processors.
+Intel Streaming SIMD Extensions (SSE) and [Arm Neon](https://developer.arm.com/documentation/dht0002/a/Introducing-NEON/NEON-architecture-overview/NEON-instructions) SIMD instructions increase processor throughput by performing multiple computations with a single instruction. Over the years, Intel and Arm have introduced a variety of SIMD extensions. Neon is used in many of the Arm Cortex-A, Cortex-R, and Neoverse processors.

There are generally 3 ways to program SIMD hardware:
- The C/C++ compiler recognizes opportunities to use SIMD instructions and inserts them automatically (with or without some guidance)
diff --git a/content/learning-paths/cross-platform/intrinsics/simde.md b/content/learning-paths/cross-platform/intrinsics/simde.md
index 92775e866e..78b6c4c9e0 100644
--- a/content/learning-paths/cross-platform/intrinsics/simde.md
+++ b/content/learning-paths/cross-platform/intrinsics/simde.md
@@ -17,7 +17,7 @@ If the code being migrated has `MMX` or `SSE` code then either `sse2neon` or `SI
To make the example application compile and run on Arm there are four steps:
- Identify the appropriate header file from SIMDe (use the table in the [SIMDEverywhere wiki](https://wiki.debian.org/SIMDEverywhere) to find the right portable header)
- Define `SIMDE_ENABLE_NATIVE_ALIASES` macro before the include to enable original `_mm` intrinsics to be recognized
-- Replace the x86-specific header file with the SIMDe one to map the intrinsics to NEON instructions
+- Replace the x86-specific header file with the SIMDe one to map the intrinsics to Neon 
instructions - Change the g++ compiler flags for the Arm architecture {{% notice Note %}} diff --git a/content/learning-paths/cross-platform/intrinsics/sse2neon.md b/content/learning-paths/cross-platform/intrinsics/sse2neon.md index 8e60ed12fd..5e78eef0b8 100644 --- a/content/learning-paths/cross-platform/intrinsics/sse2neon.md +++ b/content/learning-paths/cross-platform/intrinsics/sse2neon.md @@ -6,16 +6,16 @@ weight: 4 ## sse2neon -The [sse2neon project](https://github.com/DLTcollab/sse2neon) is a quick way to get C/C++ applications compiling and running on Arm. The `sse2neon.h` header file provides NEON implementations for x64 intrinsics so no further source code changes are needed. +The [sse2neon project](https://github.com/DLTcollab/sse2neon) is a quick way to get C/C++ applications compiling and running on Arm. The `sse2neon.h` header file provides Neon implementations for x64 intrinsics so no further source code changes are needed. -Each intrinsic is replaced with NEON code and so will run on an appropriate Arm platform. +Each intrinsic is replaced with Neon code and so will run on an appropriate Arm platform. ## Porting with sse2neon To make this application compile and run on Arm there are three steps. - Adjust the SSE specific header file usage for the Arm architecture -- Include `sse2neon.h` to map the intrinsics to NEON instructions +- Include `sse2neon.h` to map the intrinsics to Neon instructions - Change the g++ compiler flags for the Arm architecture Here is the new code (`neon.cpp`). The only change is related to the include files. 
diff --git a/content/learning-paths/cross-platform/kleidiai-explainer/_index.md b/content/learning-paths/cross-platform/kleidiai-explainer/_index.md index adf1f9d19c..7fa92d0378 100644 --- a/content/learning-paths/cross-platform/kleidiai-explainer/_index.md +++ b/content/learning-paths/cross-platform/kleidiai-explainer/_index.md @@ -24,7 +24,7 @@ armips: tools_software_languages: - CPP - Generative AI - - NEON + - Neon - Runbook operatingsystems: diff --git a/content/learning-paths/cross-platform/simd-info-demo/conclusion.md b/content/learning-paths/cross-platform/simd-info-demo/conclusion.md index 14538a712f..e016027268 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/conclusion.md +++ b/content/learning-paths/cross-platform/simd-info-demo/conclusion.md @@ -14,7 +14,7 @@ The primary focus of this Learning Path is to optimize the existing algorithm di If you are interested in data layout strategies to further enhance performance on Arm, see the Learning Path *Optimize SIMD code with vectorization-friendly data layout* linked to in the **Next Steps** section at the of this Learning Path. -Using SIMD.info can be instrumental in reducing the amount of time spent in this process, providing a centralized and user-friendly resource for finding NEON equivalents to intrinsics of other architectures. It saves considerable time and effort by offering detailed descriptions, prototypes, and comparisons directly, eliminating the need for extensive web searches and manual lookups. +Using SIMD.info can be instrumental in reducing the amount of time spent in this process, providing a centralized and user-friendly resource for finding Neon equivalents to intrinsics of other architectures. It saves considerable time and effort by offering detailed descriptions, prototypes, and comparisons directly, eliminating the need for extensive web searches and manual lookups. 
While porting between vectors of different sizes is more complex, work is underway to complete the integration of SVE and SVE2 Arm extensions and allow matching them with AVX512 intrinsics, as they both use predicate masks. diff --git a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-description.md b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-description.md index bbee960d78..931dc09da8 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-description.md +++ b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-description.md @@ -48,8 +48,8 @@ This organized structure enables you to browse through SIMD instruction sets acr - Boolean AND NOT 32-bit float - Boolean AND NOT 32-bit signed integers - AVX512: mm512_andnot_epi32 - - NEON: vbic_s32 - - NEON: vbicq_s32 + - Neon: vbic_s32 + - Neon: vbicq_s32 - VSX: vec_andc - Bit Clear - XOR @@ -57,7 +57,7 @@ This organized structure enables you to browse through SIMD instruction sets acr #### Advanced search functionality With its robust search engine, SIMD.info allows you to either search for a specific intrinsic, for example `vaddq_f64`, or enter more general terms, for example "How to add 2 vectors," and it returns a list of the corresponding intrinsics. -You can also filter results based on the specific engine you're working with, such as NEON, SSE4.2, AVX, AVX512, or VSX. This functionality streamlines the process of finding the right commands tailored to your needs. +You can also filter results based on the specific engine you're working with, such as Neon, SSE4.2, AVX, AVX512, or VSX. This functionality streamlines the process of finding the right commands tailored to your needs. #### Comparison tools This feature lets you directly compare SIMD instructions from different, or the same, platforms side by side, offering a clear view of the similarities and differences. 
It's a helpful tool for porting code across architectures, as it ensures accuracy and efficiency. diff --git a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-cont.md b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-cont.md index 85190e5cea..f11cf79897 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-cont.md +++ b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-cont.md @@ -6,7 +6,7 @@ weight: 5 layout: learningpathall --- -### Using SIMD.info to find NEON Equivalents +### Using SIMD.info to find Neon Equivalents Now that you have a clear view of the example, you can start the process of porting the code to Arm Neon/ASIMD. This is where [SIMD.info](https://simd.info/) comes in. @@ -22,14 +22,14 @@ For the operations in your SSE4.2 example, you have the following intrinsics: To gain a deeper understanding of how these intrinsics work and to surface detailed descriptions, you can use the search feature on SIMD.info. Simply enter the name of the intrinsic in the search bar. You can either select from the suggested results or perform a direct search to retrieve information about each intrinsic. -1. By searching for [**`_mm_add_ps`**](https://simd.info/c_intrinsic/_mm_add_ps) you will retrieve information about its purpose, the result type, assembly instructions, prototypes, and an example demonstration. By clicking the **engine** option **"NEON"** you can find its [equivalents](https://simd.info/eq/_mm_add_ps/NEON/) for this engine. The equivalents are: **`vaddq_f32`**, **`vadd_f32`**. [Intrinsics comparison](https://simd.info/c-intrinsics-compare?compare=vaddq_f32:vadd_f32) helps you find the right one. Based on the prototype provided, you can choose [**`vaddq_f32`**](https://simd.info/c_intrinsic/vaddq_f32) as it works with 128-bit vectors which is the same as **SSE4.2**. +1. 
By searching for [**`_mm_add_ps`**](https://simd.info/c_intrinsic/_mm_add_ps) you will retrieve information about its purpose, the result type, assembly instructions, prototypes, and an example demonstration. By clicking the **engine** option **"Neon"** you can find its [equivalents](https://simd.info/eq/_mm_add_ps/NEON/) for this engine. The equivalents are: **`vaddq_f32`**, **`vadd_f32`**. [Intrinsics comparison](https://simd.info/c-intrinsics-compare?compare=vaddq_f32:vadd_f32) helps you find the right one. Based on the prototype provided, you can choose [**`vaddq_f32`**](https://simd.info/c_intrinsic/vaddq_f32) as it works with 128-bit vectors which is the same as **SSE4.2**.
2. Moving to the next intrinsic, **`_mm_mul_ps`**, you can use the [Intrinsics Tree](https://simd.info/tag-tree) on SIMD.info to find the equivalent. Start by expanding the **Arithmetic** branch and then navigate to the branch **Vector Multiply**. As you are working with 32-bit floats, open the **Vector Multiply 32-bit floats** branch, where you will find several options. The recommended choice is [**`vmulq_f32`**](https://simd.info/c_intrinsic/vmulq_f32), following the same reasoning as before; it operates on 128-bit vectors.
-3. For the third intrinsic, **`_mm_sqrt_ps`**, the easiest way to find the corresponding NEON intrinsic is by typing **"Square Root"** in the search bar on SIMD.info. From the [search results](https://simd.info/search?search=Square+Root&simd_engines=1&simd_engines=2&simd_engines=3&simd_engines=4&simd_engines=5), look for the float-specific version and select [**`vsqrtq_f32`**](https://simd.info/c_intrinsic/vsqrtq_f32), which, like the others, works with 128-bit vectors. In the equivalents section about **SSE4.2**, you can see that **`_mm_sqrt_ps`** has its place as a direct match for this operation.
+3. For the third intrinsic, **`_mm_sqrt_ps`**, the easiest way to find the corresponding Neon intrinsic is by typing **"Square Root"** in the search bar on SIMD.info. 
From the [search results](https://simd.info/search?search=Square+Root&simd_engines=1&simd_engines=2&simd_engines=3&simd_engines=4&simd_engines=5), look for the float-specific version and select [**`vsqrtq_f32`**](https://simd.info/c_intrinsic/vsqrtq_f32), which, like the others, works with 128-bit vectors. In the equivalents section about **SSE4.2**, you can see that **`_mm_sqrt_ps`** has its place as a direct match for this operation. 4. For the last intrinsic, **`_mm_cmpgt_ps`**, follow a similar approach as before. Inside the intrinsics tree, start by expanding the **Comparison** folder. Navigate to the subfolder **Vector Compare Greater Than**, and as you are working with 32-bit floats, proceed to **Vector Compare Greater Than 32-bit floats**. The recommended choice is again the 128-bit variant[**`vcgtq_f32`**](https://simd.info/c_intrinsic/vcgtq_f32). -Now that you have found the NEON equivalents for each SSE4.2 intrinsic, you're ready to begin porting the code. Understanding these equivalents is key to ensuring that the code produces the correct results in the calculations as you switch between SIMD engines. +Now that you have found the Neon equivalents for each SSE4.2 intrinsic, you're ready to begin porting the code. Understanding these equivalents is key to ensuring that the code produces the correct results in the calculations as you switch between SIMD engines. diff --git a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-porting.md b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-porting.md index 6e4ad875bc..c664cd1baf 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-porting.md +++ b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-porting.md @@ -10,13 +10,13 @@ layout: learningpathall Follow this step-by-step process to porting: -1. Change the loading process to follow NEON's method for initializing vectors. 
The SSE4.2 intrinsic **`_mm_set_ps`** is in reality a macro, in NEON you can do the same thing with curly braces **`{}`** initialization. -2. Next, replace the SSE4.2 intrinsics with the NEON equivalents that you identified earlier. The key is to ensure that the operations perform the same tasks, such as comparison, addition, multiplication, and square root calculations. -3. Finally, modify the storing process to match NEON's way of moving data from vectors to memory. In NEON, you use functions like [**`vst1q_f32`**](https://simd.info/c_intrinsic/vst1q_f32) for storing 128-bit floating-point vectors and [**`vst1q_u32`**](https://simd.info/c_intrinsic/vst1q_u32) for storing 128-bit integer vectors. +1. Change the loading process to follow Neon's method for initializing vectors. The SSE4.2 intrinsic **`_mm_set_ps`** is in reality a macro, in Neon you can do the same thing with curly braces **`{}`** initialization. +2. Next, replace the SSE4.2 intrinsics with the Neon equivalents that you identified earlier. The key is to ensure that the operations perform the same tasks, such as comparison, addition, multiplication, and square root calculations. +3. Finally, modify the storing process to match Neon's way of moving data from vectors to memory. In Neon, you use functions like [**`vst1q_f32`**](https://simd.info/c_intrinsic/vst1q_f32) for storing 128-bit floating-point vectors and [**`vst1q_u32`**](https://simd.info/c_intrinsic/vst1q_u32) for storing 128-bit integer vectors. -After identifying the NEON intrinsics that you require in the ported program, it's now time to write the code. +After identifying the Neon intrinsics that you require in the ported program, it's now time to write the code. 
-This time on your Arm Linux machine, create a new file for the ported NEON code named `calculation_neon.c`, populating with the contents as shown below: +This time on your Arm Linux machine, create a new file for the ported Neon code named `calculation_neon.c`, populating with the contents as shown below: ```C #include diff --git a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example2.md b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example2.md index 057559a64b..c737e9136e 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example2.md +++ b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example2.md @@ -10,7 +10,7 @@ layout: learningpathall During the porting process, you can see that certain instructions translate seamlessly. However, there are cases where direct equivalents for some intrinsics might not be readily available across architectures. -For example, the [**`_mm_madd_epi16`**](https://simd.info/c_intrinsic/_mm_madd_epi16) intrinsic from SSE2, which performs multiplication of 16-bit signed integer elements in a vector and then does a pairwise addition of adjacent elements increasing the element width, does not have a direct counterpart in NEON. However, it can be emulated using another intrinsic. Similarly its 256 and 512-bit counterparts, [**`_mm256_madd_epi16`**](https://simd.info/c_intrinsic/_mm256_madd_epi16) and [**`_mm512_madd_epi16`**](https://simd.info/c_intrinsic/_mm512_madd_epi16), can be emulated by a sequence of instructions, but here you will see the 128-bit variant. +For example, the [**`_mm_madd_epi16`**](https://simd.info/c_intrinsic/_mm_madd_epi16) intrinsic from SSE2, which performs multiplication of 16-bit signed integer elements in a vector and then does a pairwise addition of adjacent elements increasing the element width, does not have a direct counterpart in Neon. However, it can be emulated using another intrinsic. 
Similarly its 256 and 512-bit counterparts, [**`_mm256_madd_epi16`**](https://simd.info/c_intrinsic/_mm256_madd_epi16) and [**`_mm512_madd_epi16`**](https://simd.info/c_intrinsic/_mm512_madd_epi16), can be emulated by a sequence of instructions, but here you will see the 128-bit variant. You might already know the equivalent operations for this particular intrinsic, but let's assume that you don't. In this particular use case, reading **`_mm_madd_epi16`** on **SIMD.info** might indicate that a key characteristic of the instruction involved is the widening of the result elements, from 16-bit to 32-bit signed integers. Unfortunately, this is not the case. This particular instruction does not increase the size of the element holding the result values. You will see how this affects the result in the example. diff --git a/content/learning-paths/cross-platform/simd-loops/1-about.md b/content/learning-paths/cross-platform/simd-loops/1-about.md index 8081261413..ee66239926 100644 --- a/content/learning-paths/cross-platform/simd-loops/1-about.md +++ b/content/learning-paths/cross-platform/simd-loops/1-about.md @@ -8,9 +8,9 @@ layout: learningpathall ## Introduction to SIMD on Arm and why it matters for performance on Arm CPUs -Writing high-performance software on Arm often means using single-instruction, multiple-data (SIMD) technologies. Many developers start with NEON, a familiar fixed-width vector extension. As Arm architectures evolve, so do the SIMD capabilities available to you. +Writing high-performance software on Arm often means using single-instruction, multiple-data (SIMD) technologies. Many developers start with Neon, a familiar fixed-width vector extension. As Arm architectures evolve, so do the SIMD capabilities available to you. -This Learning Path uses the Scalable Vector Extension (SVE) and the Scalable Matrix Extension (SME) to demonstrate modern SIMD patterns. They are two powerful, scalable vector extensions designed for modern workloads. 
Unlike NEON, these architecture extensions are not just wider; they are fundamentally different. They introduce predication, vector-length-agnostic (VLA) programming, gather/scatter, streaming modes, and tile-based compute with ZA state. The result is more power and flexibility, but there can be a learning curve to match. +This Learning Path uses the Scalable Vector Extension (SVE) and the Scalable Matrix Extension (SME) to demonstrate modern SIMD patterns. They are two powerful, scalable vector extensions designed for modern workloads. Unlike Neon, these architecture extensions are not just wider; they are fundamentally different. They introduce predication, vector-length-agnostic (VLA) programming, gather/scatter, streaming modes, and tile-based compute with ZA state. The result is more power and flexibility, but there can be a learning curve to match. ## What is the SIMD Loops project? @@ -18,14 +18,14 @@ The SIMD Loops project offers a hands-on way to climb the learning curve. It is Visit the [SIMD Loops Repo](https://gitlab.arm.com/architecture/simd-loops). -This open-source project (BSD-3-Clause) teaches SIMD development on modern Arm CPUs with SVE, SVE2, SME, and SME2. It’s aimed at developers who know NEON intrinsics and want to explore newer extensions. The goal of SIMD Loops is to provide working, readable examples that demonstrate how to use the full range of features available in SVE, SVE2, and SME2. Each example is a self-contained loop kernel - a small piece of code that performs a specific task like matrix multiplication, vector reduction, histogram, or memory copy. These examples show how that task can be implemented across different vector instruction sets. +This open-source project (BSD-3-Clause) teaches SIMD development on modern Arm CPUs with SVE, SVE2, SME, and SME2. It’s aimed at developers who know Neon intrinsics and want to explore newer extensions. 
The goal of SIMD Loops is to provide working, readable examples that demonstrate how to use the full range of features available in SVE, SVE2, and SME2. Each example is a self-contained loop kernel - a small piece of code that performs a specific task like matrix multiplication, vector reduction, histogram, or memory copy. These examples show how that task can be implemented across different vector instruction sets. Unlike a cookbook that attempts to provide a recipe for every problem, SIMD Loops takes the opposite approach. It aims to showcase the architecture rather than the problem itself. The loop kernels are chosen to be realistic and meaningful, but the main goal is to demonstrate how specific features and instructions work in practice. If you are trying to understand scalability, predication, gather/scatter, streaming mode, ZA storage, compact instructions, or the mechanics of matrix tiles, this is where you can see them in action. The project includes: - Many numbered loop kernels, each focused on a specific feature or pattern - Reference C implementations to establish expected behavior -- Inline assembly and/or intrinsics for scalar, NEON, SVE, SVE2, SVE2.1, SME2, and SME2.1 +- Inline assembly and/or intrinsics for scalar, Neon, SVE, SVE2, SVE2.1, SME2, and SME2.1 - Build support for different instruction sets, with runtime validation - A simple command-line runner to execute any loop interactively - Optional standalone binaries for bare-metal and simulator use diff --git a/content/learning-paths/cross-platform/simd-loops/2-using.md b/content/learning-paths/cross-platform/simd-loops/2-using.md index 3772a55822..5474c80315 100644 --- a/content/learning-paths/cross-platform/simd-loops/2-using.md +++ b/content/learning-paths/cross-platform/simd-loops/2-using.md @@ -86,7 +86,7 @@ Each loop is implemented in several SIMD extension variants. 
Conditional compila The native C implementation is written first, and it can be generated either when building natively with `-DHAVE_NATIVE` or through compiler auto-vectorization with `-DHAVE_AUTOVEC`. -When SIMD ACLE is supported (SME, SVE, or NEON), the code is compiled using high-level intrinsics. If ACLE support is not available, the build process falls back to handwritten inline assembly targeting one of the available SIMD extensions, such as SME2.1, SME2, SVE2.1, SVE2, and others. +When SIMD ACLE is supported (SME, SVE, or Neon), the code is compiled using high-level intrinsics. If ACLE support is not available, the build process falls back to handwritten inline assembly targeting one of the available SIMD extensions, such as SME2.1, SME2, SVE2.1, SVE2, and others. The overall code structure also includes setup and cleanup code in the main function, where memory buffers are allocated, the selected loop kernel is executed, and results are verified for correctness. @@ -108,7 +108,7 @@ Build all loops for all targets: make all ``` -Build all loops for a single target, such as NEON: +Build all loops for a single target, such as Neon: ```console make neon @@ -118,7 +118,7 @@ As a result of the build, two types of binaries are generated. The first is a single executable named `simd_loops`, which includes all loop implementations. -Select a specific loop by passing parameters to the program. For example, to run loop 1 for 5 iterations using the NEON target: +Select a specific loop by passing parameters to the program. 
For example, to run loop 1 for 5 iterations using the Neon target: ```console build/neon/bin/simd_loops -k 1 -n 5 diff --git a/content/learning-paths/cross-platform/simd-loops/3-example.md b/content/learning-paths/cross-platform/simd-loops/3-example.md index 54e1512df9..15bdcc454a 100644 --- a/content/learning-paths/cross-platform/simd-loops/3-example.md +++ b/content/learning-paths/cross-platform/simd-loops/3-example.md @@ -50,7 +50,7 @@ This layout helps optimize memory access patterns across the targeted SIMD archi Loop attributes are specified per target architecture: - **SME targets** — `inner_loop_202` is invoked with the `__arm_streaming` attribute and uses a shared `ZA` register context (`__arm_inout("za")`). These attributes are wrapped in the `LOOP_ATTR` macro -- **SVE or NEON targets** — no additional attributes are required +- **SVE or Neon targets** — no additional attributes are required This design enables portability across SIMD extensions. @@ -229,7 +229,7 @@ For instruction semantics and SME/SME2 optimization guidance, see the [SME Progr Beyond the SME2 and SVE implementations, this loop also includes additional optimized versions that leverage architecture-specific features: -- **NEON**: the NEON version (lines 612–710) uses structure load/store combined with indexed `fmla` to vectorize the computation. +- **Neon**: the Neon version (lines 612–710) uses structure load/store combined with indexed `fmla` to vectorize the computation. - **SVE2.1**: the SVE2.1 version (lines 355–462) extends the base SVE approach using multi-vector loads and stores. 
diff --git a/content/learning-paths/cross-platform/simd-loops/4-conclusion.md b/content/learning-paths/cross-platform/simd-loops/4-conclusion.md index ccb0de356c..a3ef28358c 100644 --- a/content/learning-paths/cross-platform/simd-loops/4-conclusion.md +++ b/content/learning-paths/cross-platform/simd-loops/4-conclusion.md @@ -10,7 +10,7 @@ layout: learningpathall SIMD Loops is a practical way to learn the intricacies of SVE and SME across modern Arm architectures. By providing small, runnable loop kernels with reference code and optimized variants, it closes the gap between architectural specifications and real applications. -Whether you are moving from NEON or starting directly with SVE and SME, the project offers: +Whether you are moving from Neon or starting directly with SVE and SME, the project offers: - A broad catalog of kernels that highlight specific features (predication, VLA programming, gather/scatter, streaming mode, ZA tiles) - Clear, readable implementations in C, ACLE intrinsics, and selected inline assembly - Flexible build targets and a simple runner to execute and validate loops diff --git a/content/learning-paths/cross-platform/simd-loops/_index.md b/content/learning-paths/cross-platform/simd-loops/_index.md index 3d3caff91a..ed90246494 100644 --- a/content/learning-paths/cross-platform/simd-loops/_index.md +++ b/content/learning-paths/cross-platform/simd-loops/_index.md @@ -7,14 +7,14 @@ who_is_this_for: This is an advanced topic for software developers who want to l learning_objectives: - Improve SIMD code performance using Scalable Vector Extension (SVE) and Scalable Matrix Extension (SME) - - Describe what SIMD Loops contains and how kernels are organized across scalar, NEON, SVE,SVE2, and SME2 variants + - Describe what SIMD Loops contains and how kernels are organized across scalar, Neon, SVE, SVE2, and SME2 variants - Build and run a selected kernel with the provided runner and validate correctness against the C reference - - Choose the
appropriate build target to compare NEON, SVE/SVE2, and SME2 implementations + - Choose the appropriate build target to compare Neon, SVE/SVE2, and SME2 implementations prerequisites: - An AArch64 computer running Linux or macOS. You can use cloud instances, refer to [Get started with Arm-based cloud instances](/learning-paths/servers-and-cloud-computing/csp/) for a list of cloud service providers. - - Some familiarity with SIMD programming and NEON intrinsics. + - Some familiarity with SIMD programming and Neon intrinsics. - Recent toolchains that support SVE/SME (GCC 13+ or Clang 16+ recommended) author: diff --git a/content/learning-paths/cross-platform/sme-executorch-profiling/03-model-onboarding-and-profiling.md b/content/learning-paths/cross-platform/sme-executorch-profiling/03-model-onboarding-and-profiling.md index 32799d1d8a..f5b3d795b5 100644 --- a/content/learning-paths/cross-platform/sme-executorch-profiling/03-model-onboarding-and-profiling.md +++ b/content/learning-paths/cross-platform/sme-executorch-profiling/03-model-onboarding-and-profiling.md @@ -269,5 +269,5 @@ Typical outcome after SME2: If data movement dominates after SME2, focus optimizations on transpose elimination to reduce layout changes, layout optimization to choose layouts that minimize copies, and memory access patterns to reduce cache misses. -Use trace-enabled runs to confirm which kernel variants were selected (for example, SME2-enabled kernels vs NEON). Remember: trace-enabled runs are evidence-gathering only and shouldn't be used for direct latency comparisons. +Use trace-enabled runs to confirm which kernel variants were selected (for example, SME2-enabled kernels vs Neon). Remember: trace-enabled runs are evidence-gathering only and shouldn't be used for direct latency comparisons. 
diff --git a/content/learning-paths/cross-platform/vectorization-comparison/1-vectorization.md b/content/learning-paths/cross-platform/vectorization-comparison/1-vectorization.md index b83ed43fbc..ffade06ac8 100644 --- a/content/learning-paths/cross-platform/vectorization-comparison/1-vectorization.md +++ b/content/learning-paths/cross-platform/vectorization-comparison/1-vectorization.md @@ -10,11 +10,11 @@ layout: "learningpathall" Migrating SIMD (Single Instruction, Multiple Data) code from x86 extensions to Arm extensions is a key task for software developers aiming to optimize performance on Arm platforms. -Understanding the mapping from x86 instruction sets such as SSE, AVX, and AMX to Arm’s NEON, SVE, and SME extensions is essential for achieving portability and high performance. This Learning Path provides an overview to help you design a migration plan in which you can leverage Arm features such as scalable vector lengths and advanced matrix operations to adapt your code effectively. +Understanding the mapping from x86 instruction sets such as SSE, AVX, and AMX to Arm’s Neon, SVE, and SME extensions is essential for achieving portability and high performance. This Learning Path provides an overview to help you design a migration plan in which you can leverage Arm features such as scalable vector lengths and advanced matrix operations to adapt your code effectively. Vectorization is a key optimization strategy where one instruction processes multiple data elements simultaneously. It drives performance in High-Performance Computing (HPC), AI and ML, signal processing, and data analytics. -Both x86 and Arm processors offer rich SIMD capabilities, but they differ in philosophy and design. The x86 architecture provides fixed-width vector units of 128, 256, and 512 bits. The Arm architecture offers fixed-width vectors for NEON and scalable vectors for SVE and SME, ranging from 128 to 2048 bits. 
+Both x86 and Arm processors offer rich SIMD capabilities, but they differ in philosophy and design. The x86 architecture provides fixed-width vector units of 128, 256, and 512 bits. The Arm architecture offers fixed-width vectors for Neon and scalable vectors for SVE and SME, ranging from 128 to 2048 bits. If you are migrating SIMD software to Arm, understanding these differences will help you write portable, high-performance code. @@ -22,9 +22,9 @@ If you are migrating SIMD software to Arm, understanding these differences will This section provides some more information about the Arm vector and matrix extensions and shows you when to use each, how they map from SSE/AVX/AMX, and what changes in your programming model (predication, gather/scatter, tiles, streaming mode). -### NEON +### Neon -NEON is a 128-bit SIMD extension available across Armv8-A cores, including Neoverse and mobile. It is well suited to multimedia, DSP, and packet processing. Conceptually, NEON is closest to x86 SSE and AVX used in 128-bit mode, making it the primary target when migrating many SSE workloads. Compiler auto-vectorization to NEON is mature, reducing the need for manual intrinsics. +Neon is a 128-bit SIMD extension available across Armv8-A cores, including Neoverse and mobile. It is well suited to multimedia, DSP, and packet processing. Conceptually, Neon is closest to x86 SSE and AVX used in 128-bit mode, making it the primary target when migrating many SSE workloads. Compiler auto-vectorization to Neon is mature, reducing the need for manual intrinsics. ### Scalable Vector Extension (SVE) @@ -36,17 +36,17 @@ SME accelerates matrix multiplication and is similar in intent to AMX. Unlike AM ## x86 vector and matrix extensions -Here is a brief overview of the x86 families you’ll likely port from: SSE (128-bit), AVX/AVX-512 (256/512-bit with masking), and AMX (tile-based matrix compute). Use this to identify feature equivalents before mapping kernels to NEON, SVE/SVE2, or SME on Arm. 
+Here is a brief overview of the x86 families you’ll likely port from: SSE (128-bit), AVX/AVX-512 (256/512-bit with masking), and AMX (tile-based matrix compute). Use this to identify feature equivalents before mapping kernels to Neon, SVE/SVE2, or SME on Arm. ### Streaming SIMD Extensions (SSE) The SSE instruction set provides 128-bit XMM registers and supports both integer and floating-point SIMD operations. Despite being an older technology, SSE remains a baseline for many libraries due to its widespread adoption. -However, its fixed-width design can constrain throughput compared with newer extensions like AVX. When migrating code from SSE to Arm, developers will find that SSE maps well to Arm NEON, enabling a relatively straightforward transition. +However, its fixed-width design can constrain throughput compared with newer extensions like AVX. When migrating code from SSE to Arm, developers will find that SSE maps well to Arm Neon, enabling a relatively straightforward transition. ### Advanced Vector Extensions (AVX) -AVX provides 256-bit YMM registers, and AVX-512 adds 512-bit ZMM registers. Features include FMA, per-lane masking in AVX-512, and VEX or EVEX encodings. When moving AVX workloads to Arm, 128-bit paths often translate to NEON, while algorithms that scale with vector width are good candidates for SVE. Because SVE is vector-length agnostic, refactor for predication and scalable loops to maintain portability and performance. +AVX provides 256-bit YMM registers, and AVX-512 adds 512-bit ZMM registers. Features include FMA, per-lane masking in AVX-512, and VEX or EVEX encodings. When moving AVX workloads to Arm, 128-bit paths often translate to Neon, while algorithms that scale with vector width are good candidates for SVE. Because SVE is vector-length agnostic, refactor for predication and scalable loops to maintain portability and performance. 
### Advanced Matrix Extensions (AMX) @@ -54,11 +54,11 @@ AMX accelerates matrix operations with tile registers configured using a tile pa ## Comparison tables -Use these side-by-side tables to pick the right Arm target and plan refactors. They compare register width, predication/masking, gather/scatter, key operations, typical workloads, and limitations for SSE ↔ NEON, AVX/AVX-512 ↔ SVE/SVE2, and AMX ↔ SME. +Use these side-by-side tables to pick the right Arm target and plan refactors. They compare register width, predication/masking, gather/scatter, key operations, typical workloads, and limitations for SSE ↔ Neon, AVX/AVX-512 ↔ SVE/SVE2, and AMX ↔ SME. -### A comparison of SSE and NEON +### A comparison of SSE and Neon -| Feature | SSE | NEON | +| Feature | SSE | Neon | |---|---|---| | **Register width** | 128-bit (XMM) | 128-bit (Q) | | **Vector length model** | Fixed 128 bits | Fixed 128 bits | @@ -104,11 +104,11 @@ The most significant changes when porting include moving from fixed-width SIMD t ### Vector length model -x86 SIMD (SSE, AVX, and AVX-512) uses fixed widths of 128, 256, or 512 bits. This often requires multiple code paths or dispatch strategies. Arm NEON is also fixed at 128-bit and is a familiar baseline. SVE and SME introduce vector-length agnostic execution from 128 to 2048 bits so the same binary scales across implementations. +x86 SIMD (SSE, AVX, and AVX-512) uses fixed widths of 128, 256, or 512 bits. This often requires multiple code paths or dispatch strategies. Arm Neon is also fixed at 128-bit and is a familiar baseline. SVE and SME introduce vector-length agnostic execution from 128 to 2048 bits so the same binary scales across implementations. ### Programming and intrinsics -x86 intrinsics are extensive, and AVX-512 adds masks and lane controls that increase complexity. NEON intrinsics look familiar to SSE developers. SVE and SME use predication and scalable loops. 
Prefer auto-vectorization and VLA-friendly patterns over heavy hand-written intrinsics when portability matters. +x86 intrinsics are extensive, and AVX-512 adds masks and lane controls that increase complexity. Neon intrinsics look familiar to SSE developers. SVE and SME use predication and scalable loops. Prefer auto-vectorization and VLA-friendly patterns over heavy hand-written intrinsics when portability matters. ### Matrix acceleration @@ -116,7 +116,7 @@ AMX provides fixed-geometry tile compute optimized for dot products. SME extends ## Summary -Migrating from x86 SIMD to Arm entails adopting Arm’s scalable and predicated programming model with SVE and SME for forward-portable performance, while continuing to use NEON for fixed-width SIMD similar to SSE. +Migrating from x86 SIMD to Arm entails adopting Arm’s scalable and predicated programming model with SVE and SME for forward-portable performance, while continuing to use Neon for fixed-width SIMD similar to SSE. ## Migration tools @@ -124,8 +124,8 @@ Several libraries help translate or abstract SIMD intrinsics to speed up migrati Here are some of the tools available and their key features: -- Sse2neon: an open-source header that maps many SSE2 intrinsics to NEON equivalents. Good for getting code building quickly. Review generated code for performance. See the [sse2neon GitHub repository](https://github.com/DLTcollab/sse2neon). +- Sse2neon: an open-source header that maps many SSE2 intrinsics to Neon equivalents. Good for getting code building quickly. Review generated code for performance. See the [sse2neon GitHub repository](https://github.com/DLTcollab/sse2neon). - SIMD Everywhere (SIMDe): a header-only portability layer that implements many x86 and Arm intrinsics across ISAs, with scalar fallbacks when SIMD is unavailable. See the [simde-everywhere GitHub repository](https://github.com/simd-everywhere/simde). 
-- Google Highway (hwy): a portable SIMD library and APIs that target multiple ISAs, including NEON, SVE where supported, and AVX, without per-ISA code paths. See the [Google highway GitHub repository](https://github.com/google/highway). +- Google Highway (hwy): a portable SIMD library and APIs that target multiple ISAs, including Neon, SVE where supported, and AVX, without per-ISA code paths. See the [Google highway GitHub repository](https://github.com/google/highway). For more on cross-platform intrinsics, see the Learning Path [Porting architecture-specific intrinsics](/learning-paths/cross-platform/intrinsics/). diff --git a/content/learning-paths/cross-platform/vectorization-comparison/2-code-examples.md b/content/learning-paths/cross-platform/vectorization-comparison/2-code-examples.md index 5d4881fa2d..0847df648c 100644 --- a/content/learning-paths/cross-platform/vectorization-comparison/2-code-examples.md +++ b/content/learning-paths/cross-platform/vectorization-comparison/2-code-examples.md @@ -8,7 +8,7 @@ layout: "learningpathall" ## SAXPY example code -This page walks you through a SAXPY (Single-Precision A·X Plus Y) kernel implemented in plain C and with vector extensions on both Arm (NEON, SVE) and x86 (AVX2, AVX-512). You will see how to build and run each version and how the vector width affects throughput. +This page walks you through a SAXPY (Single-Precision A·X Plus Y) kernel implemented in plain C and with vector extensions on both Arm (Neon, SVE) and x86 (AVX2, AVX-512). You will see how to build and run each version and how the vector width affects throughput. SAXPY computes `y[i] = a * x[i] + y[i]` across arrays `x` and `y`. It is widely used in numerical computing and is an accessible way to compare SIMD behavior across ISAs. @@ -66,11 +66,11 @@ gcc -O3 -o saxpy_plain saxpy_plain.c You can use Clang for any of the examples by replacing `gcc` with `clang` on the command line. 
-## Arm NEON version (128-bit SIMD, 4 floats per operation) +## Arm Neon version (128-bit SIMD, 4 floats per operation) -NEON uses fixed 128-bit registers, processing four `float` values per instruction. It is available on most Armv8-A devices and is excellent for accelerating loops and signal processing tasks in mobile and embedded workloads. +Neon uses fixed 128-bit registers, processing four `float` values per instruction. It is available on most Armv8-A devices and is excellent for accelerating loops and signal processing tasks in mobile and embedded workloads. -The example below processes 16 floats per iteration using four separate NEON operations to improve instruction-level parallelism and reduce loop overhead: +The example below processes 16 floats per iteration using four separate Neon operations to improve instruction-level parallelism and reduce loop overhead: ```c #include @@ -123,13 +123,13 @@ int main() { Use a text editor to copy the code to a file `saxpy_neon.c`. -First, verify your system supports NEON: +First, verify your system supports Neon: ```bash grep -m1 -ow asimd /proc/cpuinfo ``` -If NEON is supported, you should see `asimd` in the output. If no output appears, NEON is not available. +If Neon is supported, you should see `asimd` in the output. If no output appears, Neon is not available. Then build and run the code using: @@ -139,14 +139,14 @@ gcc -O3 -march=armv8-a+simd -o saxpy_neon saxpy_neon.c ``` {{% notice Note %}} -On AArch64, NEON is mandatory; the flag is shown for clarity. +On AArch64, Neon is mandatory; the flag is shown for clarity. {{% /notice %}} ## x86 AVX2 version (256-bit SIMD, 8 floats per operation) -AVX2 doubles the SIMD width compared to NEON, processing 8 single-precision floats at a time in 256-bit registers. +AVX2 doubles the SIMD width compared to Neon, processing 8 single-precision floats at a time in 256-bit registers. 
This wider SIMD capability enables higher data throughput for numerical and HPC workloads on Intel and AMD CPUs. diff --git a/content/learning-paths/cross-platform/vectorization-comparison/_index.md b/content/learning-paths/cross-platform/vectorization-comparison/_index.md index c1f37d0f0c..c1fa557730 100644 --- a/content/learning-paths/cross-platform/vectorization-comparison/_index.md +++ b/content/learning-paths/cross-platform/vectorization-comparison/_index.md @@ -6,13 +6,13 @@ minutes_to_complete: 30 who_is_this_for: This is an advanced topic for developers migrating vectorized (SIMD) code from x86-64 to Arm64. learning_objectives: - - Identify how Arm vector extensions including NEON, Scalable Vector Extension (SVE), and Scalable Matrix Extension (SME) map to vector extensions from other architectures + - Identify how Arm vector extensions including Neon, Scalable Vector Extension (SVE), and Scalable Matrix Extension (SME) map to vector extensions from other architectures - Plan a migration strategy using autovectorization, intrinsics, or library substitution prerequisites: - Familiarity with vector extensions, SIMD programming, and compiler intrinsics - - Access to Linux systems with NEON and SVE support + - Access to Linux systems with Neon and SVE support author: - Jason Andrews diff --git a/content/learning-paths/cross-platform/vectorization-friendly-data-layout/a-more-complex-problem-revisited.md b/content/learning-paths/cross-platform/vectorization-friendly-data-layout/a-more-complex-problem-revisited.md index 449794fbc0..6e6b9fb884 100644 --- a/content/learning-paths/cross-platform/vectorization-friendly-data-layout/a-more-complex-problem-revisited.md +++ b/content/learning-paths/cross-platform/vectorization-friendly-data-layout/a-more-complex-problem-revisited.md @@ -220,4 +220,4 @@ Here are some rules for optimal performance with SIMD/vectorized code: So far you have been using Neon/ASIMD instructions, but newer Arm processors also offer the Scalable 
Vector Extension (SVE). -Proceed to the next section to find out how to use SVE and compare the performance with NEON. +Proceed to the next section to find out how to use SVE and compare the performance with Neon. diff --git a/content/learning-paths/embedded-and-microcontrollers/_index.md b/content/learning-paths/embedded-and-microcontrollers/_index.md index f7af108f4e..6b81e641e5 100644 --- a/content/learning-paths/embedded-and-microcontrollers/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/_index.md @@ -85,7 +85,7 @@ tools_software_languages_filter: - MCP: 1 - MPS3: 1 - MXNet: 1 -- NEON: 1 +- Neon: 1 - NumPy: 1 - Ollama: 1 - Paddle: 1 diff --git a/content/learning-paths/embedded-and-microcontrollers/migration/3_porting_analysis.md b/content/learning-paths/embedded-and-microcontrollers/migration/3_porting_analysis.md index 580f62f3cd..d432db9697 100644 --- a/content/learning-paths/embedded-and-microcontrollers/migration/3_porting_analysis.md +++ b/content/learning-paths/embedded-and-microcontrollers/migration/3_porting_analysis.md @@ -79,6 +79,6 @@ You can draw the following conclusions: * see [aarch64 options](https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html) for compatible compiler options * The AVX intrinsics need to ported to utilize Arm SIMD intrinsics * Arm has three SIMD technologies - * [NEON](https://developer.arm.com/documentation/den0018/a) + * [Neon](https://developer.arm.com/documentation/den0018/a) * Scalable Vector Extension ([SVE](https://developer.arm.com/documentation/102131/0100/?lang=en)) * Scalable Vector Extension version 2 ([SVE2](https://developer.arm.com/documentation/102340/0100/Introducing-SVE2?lang=en)) \ No newline at end of file diff --git a/content/learning-paths/embedded-and-microcontrollers/migration/5_application_porting.md b/content/learning-paths/embedded-and-microcontrollers/migration/5_application_porting.md index 97e71b6fa6..7200521aca 100644 --- 
a/content/learning-paths/embedded-and-microcontrollers/migration/5_application_porting.md +++ b/content/learning-paths/embedded-and-microcontrollers/migration/5_application_porting.md @@ -24,7 +24,7 @@ cd sobel-simd-opencv ## x86 intrinsics porting -To port the AVX intrinsics, you can use SIMD Everywhere ([SIMDe](https://github.com/simd-everywhere/simde)). By using SIMDe you can keep the AVX intrinsics in the source code and the intrinsics will be replaced by NEON instructions. +To port the AVX intrinsics, you can use SIMD Everywhere ([SIMDe](https://github.com/simd-everywhere/simde)). By using SIMDe you can keep the AVX intrinsics in the source code and the intrinsics will be replaced by Neon instructions. Start by cloning the SIMDe repository: diff --git a/content/learning-paths/embedded-and-microcontrollers/migration/7_alternative.md b/content/learning-paths/embedded-and-microcontrollers/migration/7_alternative.md index 298145a09b..ba3a3ad789 100644 --- a/content/learning-paths/embedded-and-microcontrollers/migration/7_alternative.md +++ b/content/learning-paths/embedded-and-microcontrollers/migration/7_alternative.md @@ -28,12 +28,12 @@ AWS EC2 instances with Graviton processors use the `aarch64` architecture. Gravi * Ubuntu 22.04.2 LTS, * 16GB storage (default is 8GB) * [Arm Neoverse N1](https://www.arm.com/products/silicon-ip-cpu/neoverse/neoverse-n1) - * 2 x NEON engine 128b vector width + * 2 x Neon engine 128b vector width * Graviton3 (c7g.medium) * Ubuntu 22.04.2 LTS * 16GB storage (default is 8GB) * [Arm Neoverse V1](https://www.arm.com/products/silicon-ip-cpu/neoverse/neoverse-v1) - * 2 x SVE engine 256b vector width (4 x NEON engine 128b vector width support) + * 2 x SVE engine 256b vector width (4 x Neon engine 128b vector width support) For more information on Graviton, refer to [Getting Started with AWS](/learning-paths/servers-and-cloud-computing/csp/aws/) and the [AWS Graviton Technical Guide](https://github.com/aws/aws-graviton-getting-started). 
diff --git a/content/learning-paths/embedded-and-microcontrollers/migration/_index.md b/content/learning-paths/embedded-and-microcontrollers/migration/_index.md index 97dd27e923..a0860e1c4d 100644 --- a/content/learning-paths/embedded-and-microcontrollers/migration/_index.md +++ b/content/learning-paths/embedded-and-microcontrollers/migration/_index.md @@ -29,7 +29,7 @@ tools_software_languages: - GCC - Arm Compiler for Linux - Docker - - NEON + - Neon further_reading: - resource: diff --git a/content/learning-paths/embedded-and-microcontrollers/raspberry-pi-smart-home/1-overview.md b/content/learning-paths/embedded-and-microcontrollers/raspberry-pi-smart-home/1-overview.md index 66c0786a8f..befde3ff4f 100644 --- a/content/learning-paths/embedded-and-microcontrollers/raspberry-pi-smart-home/1-overview.md +++ b/content/learning-paths/embedded-and-microcontrollers/raspberry-pi-smart-home/1-overview.md @@ -18,7 +18,7 @@ You will create a fully local, privacy-first smart home system that leverages th The Raspberry Pi 5's Arm Cortex-A76 processor can manage high-performance computing tasks like AI inference. 
Key architectural features include: - **Superscalar architecture**: Executes multiple instructions in parallel, improving throughput for compute-heavy tasks -- **128-bit NEON SIMD support**: Accelerates matrix and vector operations, common in the inner loops of language model inference +- **128-bit Neon SIMD support**: Accelerates matrix and vector operations, common in the inner loops of language model inference - **Multi-level cache hierarchy**: Reduces memory latency and improves data access efficiency during runtime - **Thermal efficiency**: Enables sustained performance without active cooling, making it ideal for compact or always-on smart home setups diff --git a/content/learning-paths/laptops-and-desktops/_index.md b/content/learning-paths/laptops-and-desktops/_index.md index d16cefdcfd..87833fcbd2 100644 --- a/content/learning-paths/laptops-and-desktops/_index.md +++ b/content/learning-paths/laptops-and-desktops/_index.md @@ -61,7 +61,7 @@ tools_software_languages_filter: - llvm-mca: 1 - MSBuild: 1 - MTE: 1 -- NEON: 1 +- Neon: 1 - Neovim: 1 - Node.js: 3 - ONNX Runtime: 1 diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/3_gb10_llamacpp_cpu.md b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/3_gb10_llamacpp_cpu.md index 944f942fa2..b0a87f6683 100644 --- a/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/3_gb10_llamacpp_cpu.md +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/3_gb10_llamacpp_cpu.md @@ -142,4 +142,4 @@ In this section you have: - Tested quantized model inference using the TinyLlama Q8_0 model. - Used monitoring tools (htop) to confirm efficient CPU utilization. -You have now successfully built and validated the CPU-only version of llama.cpp on the Grace CPU. 
In the next section, you will learn how to use the Process Watch tool to visualize instruction-level execution and better understand how Armv9 vectorization (SVE2 and NEON) accelerates quantized LLM inference on the Grace CPU. +You have now successfully built and validated the CPU-only version of llama.cpp on the Grace CPU. In the next section, you will learn how to use the Process Watch tool to visualize instruction-level execution and better understand how Armv9 vectorization (SVE2 and Neon) accelerates quantized LLM inference on the Grace CPU. diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/4_gb10_processwatch.md b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/4_gb10_processwatch.md index d57eb5d3d0..54862890d4 100644 --- a/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/4_gb10_processwatch.md +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_llamacpp/4_gb10_processwatch.md @@ -156,11 +156,11 @@ ALL ALL 2.52 8.37 0.00 0.00 100.00 26566 ``` Here is an interpretation of the values: -- NEON: 7–15% for SIMD integer and floating-point operations +- Neon: 7–15% for SIMD integer and floating-point operations - FPARMv8: 2-5% for scalar FP operations such as activation and normalization - SVE/SVE2: 0%, the kernel does not issue SVE instructions -This confirms that the Grace CPU performs quantized inference primarily using NEON. +This confirms that the Grace CPU performs quantized inference primarily using Neon. ## Why are SVE and SVE2 inactive? 
@@ -201,7 +201,7 @@ Throughout this Learning Path, you have learned how to: - Download and run quantized TinyLlama models for efficient testing and benchmarking - Monitor GPU utilization and performance using tools like nvtop - Analyze CPU instruction mix with Process Watch to understand how Armv9 vector instructions are used during inference -- Interpret the impact of NEON, SVE, and SVE2 on AI workloads, and recognize current kernel limitations for vector execution +- Interpret the impact of Neon, SVE, and SVE2 on AI workloads, and recognize current kernel limitations for vector execution By completing these steps, you are now equipped to: diff --git a/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/build-1.md b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/build-1.md index 14956d052c..f8a8cee458 100644 --- a/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/build-1.md +++ b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/build-1.md @@ -159,7 +159,7 @@ Here are the most important options for Arm systems: You can set these options when running `cmake` to customize your build for your hardware and use case. -KleidiCV automatically selects the fastest available code path for your hardware. If the library detects that SVE2 (Scalable Vector Extension 2) or SME2 (Scalable Matrix Extension 2) is slower than NEON for a specific function, it defaults to NEON—unless you explicitly turn off this behavior by setting `-DKLEIDICV_LIMIT_SVE2_TO_SELECTED_ALGORITHMS=OFF` or `-DKLEIDICV_LIMIT_SME2_TO_SELECTED_ALGORITHMS=OFF`. +KleidiCV automatically selects the fastest available code path for your hardware. If the library detects that SVE2 (Scalable Vector Extension 2) or SME2 (Scalable Matrix Extension 2) is slower than Neon for a specific function, it defaults to Neon—unless you explicitly turn off this behavior by setting `-DKLEIDICV_LIMIT_SVE2_TO_SELECTED_ALGORITHMS=OFF` or `-DKLEIDICV_LIMIT_SME2_TO_SELECTED_ALGORITHMS=OFF`. 
## Build the KleidiCV standalone diff --git a/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/run-test-2.md b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/run-test-2.md index 4f1327d74a..1134703819 100644 --- a/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/run-test-2.md +++ b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/run-test-2.md @@ -197,7 +197,7 @@ The KleidiCV library detects the platform hardware at runtime and selects the ba - SME2 backend implementation - SME backend implementation - SVE backend implementation -- NEON backend implementation +- Neon backend implementation The following code shows how the library resolves which implementation to use: @@ -428,7 +428,7 @@ kleidicv API:: kleidicv_remap_f32_u8_resolver,NEON backend. kleidicv API:: kleidicv_remap_f32_u16_resolver,NEON backend. kleidicv API:: kleidicv_warp_perspective_stripe_u8_resolver,NEON backend. ``` -The output is truncated for brevity, but you will see detailed performance metrics for each operation at 1280x720 resolution. Look for lines showing the operation name, sample count, mean and median times, and standard deviation. These results help you compare the performance of different backends and confirm that SME or NEON acceleration is active. +The output is truncated for brevity, but you will see detailed performance metrics for each operation at 1280x720 resolution. Look for lines showing the operation name, sample count, mean and median times, and standard deviation. These results help you compare the performance of different backends and confirm that SME or Neon acceleration is active. ## Use lldb to check the SME backend implementation @@ -566,5 +566,5 @@ kleidicv-api-test`kleidicv::sme::saturating_add_abs_with_threshold: ## Summary -In this Learning Path, you tested the KleidiCV build and verified its functionality. You ran both the KleidiCV API tests and the OpenCV performance tests. 
You also explored how KleidiCV's multiversion support works, enabling it to select the optimal backend like SME, SVE, or NEON at runtime. Finally, you learned how to enable debug output and use the `lldb` debugger to confirm that the SME backend is being used and to inspect the assembly code. +In this Learning Path, you tested the KleidiCV build and verified its functionality. You ran both the KleidiCV API tests and the OpenCV performance tests. You also explored how KleidiCV's multiversion support works, enabling it to select the optimal backend like SME, SVE, or Neon at runtime. Finally, you learned how to enable debug output and use the `lldb` debugger to confirm that the SME backend is being used and to inspect the assembly code. diff --git a/content/learning-paths/mobile-graphics-and-gaming/_index.md b/content/learning-paths/mobile-graphics-and-gaming/_index.md index 4c71f19d5b..45814a60c4 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/_index.md @@ -61,7 +61,7 @@ tools_software_languages_filter: - llvm-mca: 1 - MediaPipe: 2 - MTE: 2 -- NEON: 1 +- Neon: 1 - ONNX: 1 - ONNX Runtime: 2 - OpenGL ES: 1 diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/aot-and-cross-compilation.md b/content/learning-paths/mobile-graphics-and-gaming/android_halide/aot-and-cross-compilation.md index b74495cba1..56cb1ffa3e 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/android_halide/aot-and-cross-compilation.md +++ b/content/learning-paths/mobile-graphics-and-gaming/android_halide/aot-and-cross-compilation.md @@ -88,7 +88,7 @@ int main(int argc, char** argv) { } ``` -In the original implementation constants 128, 255, and 0 were implicitly treated as integers. Here, the threshold value (128) and output values (255, 0) are explicitly cast to uint8_t. This approach removes ambiguity and clearly specifies the types used, ensuring compatibility and clarity. 
Both approaches result in identical functionality, but explicitly casting helps emphasize the type correctness and may avoid subtle issues during cross-compilation or in certain environments. Additionally, explicit uint8_t casts help avoid implicit promotion to 32-bit integers (and the corresponding narrowings back to 8-bit) in the generated code, reducing redundant cast operations and potential vector widen/narrow overhead—especially on Arm/NEON. +In the original implementation constants 128, 255, and 0 were implicitly treated as integers. Here, the threshold value (128) and output values (255, 0) are explicitly cast to uint8_t. This approach removes ambiguity and clearly specifies the types used, ensuring compatibility and clarity. Both approaches result in identical functionality, but explicitly casting helps emphasize the type correctness and may avoid subtle issues during cross-compilation or in certain environments. Additionally, explicit uint8_t casts help avoid implicit promotion to 32-bit integers (and the corresponding narrowings back to 8-bit) in the generated code, reducing redundant cast operations and potential vector widen/narrow overhead—especially on Arm/Neon. The program takes at least one command-line argument, the output base name used to generate the files (for example, "blur_threshold_android"). 
Here, the target architecture is explicitly set within the code to Android ARM64: diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_halide/intro.md b/content/learning-paths/mobile-graphics-and-gaming/android_halide/intro.md index e2535f65a6..7af3189836 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/android_halide/intro.md +++ b/content/learning-paths/mobile-graphics-and-gaming/android_halide/intro.md @@ -59,7 +59,7 @@ Halide offers several powerful scheduling strategies for maximum performance: - Parallelism is the execution of computations concurrently across multiple CPU cores, reducing execution time for large datasets -- Vectorization enables simultaneous processing of multiple data elements using SIMD (Single Instruction, Multiple Data) instructions, such as Arm NEON, enhancing performance on Arm CPUs and GPUs +- Vectorization enables simultaneous processing of multiple data elements using SIMD (Single Instruction, Multiple Data) instructions, such as Arm Neon, enhancing performance on Arm CPUs and GPUs - Tiling divides computations into smaller blocks optimized for cache efficiency, improving memory locality and reducing transfer overhead diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_neon/_index.md b/content/learning-paths/mobile-graphics-and-gaming/android_neon/_index.md index 234ac62fa3..7b69cafb65 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/android_neon/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/android_neon/_index.md @@ -37,7 +37,7 @@ further_reading: type: blog - resource: title: Using Neon Intrinsics - link: https://developer.arm.com/documentation/den0018/a/NEON-Intrinsics/Using-NEON-intrinsics + link: https://developer.arm.com/documentation/den0018/a/NEON-Intrinsics/Using-NEON-intrinsics type: documentation - resource: title: Intrinsics diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_sve2/part1.md
b/content/learning-paths/mobile-graphics-and-gaming/android_sve2/part1.md index ee953e8e9d..0e28b6f060 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/android_sve2/part1.md +++ b/content/learning-paths/mobile-graphics-and-gaming/android_sve2/part1.md @@ -24,7 +24,7 @@ You will also need a Armv8 powered smartphone running Android. We tested the app You can find the [complete source code on GitHub](https://github.com/dawidborycki/Arm.SVE2). ## Create a project and enable SVE2 support -The process of creating and configuring this project is similar to that used for NEON. Follow these steps: +The process of creating and configuring this project is similar to that used for Neon. Follow these steps: 1. Open Android Studio on your development machine and then click the **+ New Project** icon: diff --git a/content/learning-paths/mobile-graphics-and-gaming/litert-sme/1-litert-kleidiai-sme2.md b/content/learning-paths/mobile-graphics-and-gaming/litert-sme/1-litert-kleidiai-sme2.md index 9bc2d89b68..42e65aea2d 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/litert-sme/1-litert-kleidiai-sme2.md +++ b/content/learning-paths/mobile-graphics-and-gaming/litert-sme/1-litert-kleidiai-sme2.md @@ -24,14 +24,14 @@ To understand how KleidiAI SME2 micro-kernels work in LiteRT, think about a Lite ### LiteRT → XNNPACK workflow -![Diagram showing the workflow for a fully connected operator in LiteRT using XNNPACK. The diagram depicts the flow from LiteRT to XNNPACK, highlighting the use of NEON instructions for matrix multiplication and weight packing on Arm platforms. The technical environment emphasizes operator traversal, hardware detection, and parallel computation. alt-text #center](./litert-xnnpack-workflow.png "LiteRT, XNNPACK workflow") +![Diagram showing the workflow for a fully connected operator in LiteRT using XNNPACK. 
The diagram depicts the flow from LiteRT to XNNPACK, highlighting the use of Neon instructions for matrix multiplication and weight packing on Arm platforms. The technical environment emphasizes operator traversal, hardware detection, and parallel computation. alt-text #center](./litert-xnnpack-workflow.png "LiteRT, XNNPACK workflow") For batch sizes greater than 1, a fully connected operator performs a matrix multiplication between the input activations (LHS) and the weights (RHS). When LiteRT loads a model, it reads the operators and builds a computation graph. If you select the CPU as the accelerator, LiteRT uses XNNPACK by default. -XNNPACK scans the computation graph and looks for operators it can optimize. XNNPACK also checks the hardware compatibility and chooses the best available micro-kernel. Then, it packs the weight matrix to prepare for efficient computation. On Arm platforms, XNNPACK uses NEON instructions to speed up this packing. +XNNPACK scans the computation graph and looks for operators it can optimize. XNNPACK also checks the hardware compatibility and chooses the best available micro-kernel. Then, it packs the weight matrix to prepare for efficient computation. On Arm platforms, XNNPACK uses Neon instructions to speed up this packing. -During model inference, it splits the matrices into smaller tiles and runs the multiplications in parallel across multiple threads, using NEON instructions for faster processing. +During model inference, it splits the matrices into smaller tiles and runs the multiplications in parallel across multiple threads, using Neon instructions for faster processing. ### LiteRT → XNNPACK → KleidiAI workflow @@ -45,7 +45,7 @@ During model inference, the LHS packing micro-kernel is invoked. After the LHS i ## What you've accomplished and what's next -In this section, you explored how LiteRT leverages XNNPACK and KleidiAI to accelerate fully connected operators on Arm platforms. 
You learned how XNNPACK uses NEON instructions for efficient matrix operations and how enabling KleidiAI with SME2 further optimizes performance by introducing specialized micro-kernels for packing and matrix multiplication. +In this section, you explored how LiteRT leverages XNNPACK and KleidiAI to accelerate fully connected operators on Arm platforms. You learned how XNNPACK uses Neon instructions for efficient matrix operations and how enabling KleidiAI with SME2 further optimizes performance by introducing specialized micro-kernels for packing and matrix multiplication. You have completed the overview of LiteRT, XNNPACK, KleidiAI, and SME2 integration. Next, you’ll dive deeper into building and benchmarking models with these technologies. diff --git a/content/learning-paths/mobile-graphics-and-gaming/litert-sme/3-build-tool.md b/content/learning-paths/mobile-graphics-and-gaming/litert-sme/3-build-tool.md index b1ce2614a2..7cec7b86e8 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/litert-sme/3-build-tool.md +++ b/content/learning-paths/mobile-graphics-and-gaming/litert-sme/3-build-tool.md @@ -12,7 +12,7 @@ LiteRT provides a standalone performance measurement utility called `benchmark_m In this section, you will build two versions of the benchmark tool: - With KleidiAI and Scalable Matrix Extension version 2 (SME2) enabled, which uses Arm-optimized micro-kernels -- Without KleidiAI and SME2, which provides baseline performance using NEON micro-kernels +- Without KleidiAI and SME2, which provides baseline performance using Neon micro-kernels This comparison demonstrates the performance gains provided by SME2 acceleration. @@ -129,7 +129,7 @@ ${XNNPACK_OPTIONS} "${BENCHMARK_TOOL_PATH}" \ --repo_env=HERMETIC_PYTHON_VERSION=3.12 ``` -This build of the `benchmark_model` disables all SME2 micro-kernels and forces fallback to XNNPACK's NEON micro-kernels. 
+This build of the `benchmark_model` disables all SME2 micro-kernels and forces fallback to XNNPACK's Neon micro-kernels. You can then use Android Debug Bridge (ADB) to push the benchmark tool to your Android device: diff --git a/content/learning-paths/mobile-graphics-and-gaming/onnx/01_fundamentals.md b/content/learning-paths/mobile-graphics-and-gaming/onnx/01_fundamentals.md index 4902e1202f..0500ac3846 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/onnx/01_fundamentals.md +++ b/content/learning-paths/mobile-graphics-and-gaming/onnx/01_fundamentals.md @@ -33,7 +33,7 @@ ONNX addresses this challenge by acting as a universal exchange format that sepa The key reasons ONNX matters are: - Interoperability: ONNX decouples training from inference. Models trained in PyTorch or TensorFlow can be exported into a common format and executed in a different runtime environment without embedding the original framework. -- Performance: ONNX Runtime includes highly optimized execution backends, supporting hardware acceleration through Arm NEON, CUDA, DirectML, and Android NNAPI. This means the same model can run efficiently across a wide spectrum of hardware. +- Performance: ONNX Runtime includes highly optimized execution backends, supporting hardware acceleration through Arm Neon, CUDA, DirectML, and Android NNAPI. This means the same model can run efficiently across a wide spectrum of hardware. - Portability: a single `.onnx` model file can be deployed across Arm-based cloud servers, embedded Arm devices, and mobile applications, provided the required operators are supported by the target runtime. - Ecosystem: the ONNX Model Zoo and broad industry adoption make it easier to reuse validated architectures across platforms. - Extensibility: custom operators and execution providers allow researchers and hardware vendors to extend ONNX without breaking compatibility with the broader ecosystem. 
@@ -76,7 +76,7 @@ ONNX Runtime provides: **Cross-platform support** – ORT runs on Windows, Linux, and macOS, as well as mobile platforms like Android and iOS. It supports both x86 and Arm64 architectures, making it suitable for deployment from cloud servers to edge devices such as Raspberry Pi boards and smartphones. **Hardware acceleration** – ORT integrates with a wide range of execution providers (EPs) that tap into hardware capabilities: -* Arm Kleidi kernels accelerated with Arm NEON, SVE2, and SME2 instructions for efficient CPU execution on Arm64 +* Arm Kleidi kernels accelerated with Arm Neon, SVE2, and SME2 instructions for efficient CPU execution on Arm64 * CUDA for NVIDIA GPUs * DirectML for Windows * NNAPI on Android, enabling direct access to mobile accelerators (DSPs, NPUs) @@ -92,7 +92,7 @@ One of ONNX’s greatest strengths is how naturally it integrates into a modern A typical ONNX workflow looks like this: - Train the model: you first use your preferred framework (e.g., PyTorch, TensorFlow, or scikit-learn) to design and train a model. At this stage, you benefit from the flexibility and ecosystem of the framework of your choice. - Export to ONNX: once trained, the model is exported into the ONNX format using built-in converters (such as torch.onnx.export for PyTorch). This produces a portable .onnx file describing the network architecture, weights, and metadata. -- Run inference with ONNX Runtime: the ONNX model can now be executed on different devices using ONNX Runtime. On Arm64 hardware, ONNX Runtime can take advantage of Arm Kleidi kernels accelerated with NEON, SVE2, and SME2 instructions, while on Android devices it can leverage NNAPI to access mobile accelerators (where available). +- Run inference with ONNX Runtime: the ONNX model can now be executed on different devices using ONNX Runtime. 
On Arm64 hardware, ONNX Runtime can take advantage of Arm Kleidi kernels accelerated with Neon, SVE2, and SME2 instructions, while on Android devices it can leverage NNAPI to access mobile accelerators (where available). - Optimize the model: apply graph optimizations like layer fusion, constant folding, or quantization to improve performance and reduce memory usage, making the model more suitable for edge and mobile deployments. - Deploy: finally, the optimized ONNX model is packaged into its target environment. This could be an Arm64-based embedded system (e.g., Raspberry Pi), a server powered by Arm CPUs (e.g., AWS Graviton), or an Android application distributed via the Play Store. diff --git a/content/learning-paths/mobile-graphics-and-gaming/performance_onnxruntime_kleidiai_sme2/kleidiai_integration.md b/content/learning-paths/mobile-graphics-and-gaming/performance_onnxruntime_kleidiai_sme2/kleidiai_integration.md index 2b11f945bc..891ee1629a 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/performance_onnxruntime_kleidiai_sme2/kleidiai_integration.md +++ b/content/learning-paths/mobile-graphics-and-gaming/performance_onnxruntime_kleidiai_sme2/kleidiai_integration.md @@ -10,7 +10,7 @@ layout: learningpathall ONNX Runtime automatically detects and uses KleidiAI when SME2 support is available: - Detection: MLAS checks the CPU capabilities for SME2 support at runtime. -- Dispatch: when SME2 is detected, MLAS replaces its default kernels with KleidiAI micro-kernels. For example, a Gemm operation that normally uses NEON instructions dispatches to a KleidiAI SME2 micro-kernel instead. +- Dispatch: when SME2 is detected, MLAS replaces its default kernels with KleidiAI micro-kernels. For example, a Gemm operation that normally uses Neon instructions dispatches to a KleidiAI SME2 micro-kernel instead. Currently, KleidiAI in MLAS provides `ArmKleidiAI::MlasConv`, `ArmKleidiAI::MlasGemmBatch`, and `ArmKleidiAI::MlasDynamicQGemmBatch` kernels. 
diff --git a/content/learning-paths/mobile-graphics-and-gaming/using-neon-intrinsics-to-optimize-unity-on-android/_index.md b/content/learning-paths/mobile-graphics-and-gaming/using-neon-intrinsics-to-optimize-unity-on-android/_index.md index 5caa77dcdb..a65a398431 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/using-neon-intrinsics-to-optimize-unity-on-android/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/using-neon-intrinsics-to-optimize-unity-on-android/_index.md @@ -26,7 +26,7 @@ armips: - aarch64 - arm64 - arm architecture - - NEON + - Neon tools_software_languages: - Unity - C# diff --git a/content/learning-paths/servers-and-cloud-computing/_index.md b/content/learning-paths/servers-and-cloud-computing/_index.md index b97d2da0d8..5481f854d4 100644 --- a/content/learning-paths/servers-and-cloud-computing/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/_index.md @@ -174,7 +174,7 @@ tools_software_languages_filter: - mongotop: 1 - mpi: 1 - MySQL: 10 -- NEON: 7 +- Neon: 7 - Networking: 1 - Nexmark: 1 - NGINX: 5 diff --git a/content/learning-paths/servers-and-cloud-computing/arm-mcp-server/3-simd-migration.md b/content/learning-paths/servers-and-cloud-computing/arm-mcp-server/3-simd-migration.md index 812b45722a..090f933d6f 100644 --- a/content/learning-paths/servers-and-cloud-computing/arm-mcp-server/3-simd-migration.md +++ b/content/learning-paths/servers-and-cloud-computing/arm-mcp-server/3-simd-migration.md @@ -9,9 +9,9 @@ layout: learningpathall {{% notice Note %}} This section uses Visual Studio Code with GitHub Copilot. If you're using a different AI assistant, skip to the next section, where you'll configure the same migration workflow using other agentic systems.{{% /notice %}} -When migrating applications from x86 to Arm, you might encounter SIMD (Single Instruction, Multiple Data) code that is written using architecture-specific intrinsics. 
On x86 platforms, SIMD is commonly implemented with SSE, AVX, or AVX2 intrinsics, while Arm platforms use NEON and SVE intrinsics to provide similar vectorized capabilities. Updating this code manually can be time-consuming and challenging. By combining the Arm MCP Server with GitHub Copilot, you can create an Arm Cloud Migration Agent that automates much of this work and guides the AI assistant through a structured, architecture-aware migration of your codebase. GitHub Copilot supports two file formats for this: **prompt files** (`.prompt.md`) that reference an already-configured MCP server, and **agent files** (`.agent.md`) that create agents which can be assigned to issues directly in the GitHub interface. +When migrating applications from x86 to Arm, you might encounter SIMD (Single Instruction, Multiple Data) code that is written using architecture-specific intrinsics. On x86 platforms, SIMD is commonly implemented with SSE, AVX, or AVX2 intrinsics, while Arm platforms use Neon and SVE intrinsics to provide similar vectorized capabilities. Updating this code manually can be time-consuming and challenging. By combining the Arm MCP Server with GitHub Copilot, you can create an Arm Cloud Migration Agent that automates much of this work and guides the AI assistant through a structured, architecture-aware migration of your codebase. GitHub Copilot supports two file formats for this: **prompt files** (`.prompt.md`) that reference an already-configured MCP server, and **agent files** (`.agent.md`) that create agents which can be assigned to issues directly in the GitHub interface. -This section walks through both approaches and uses them to migrate a sample x86 application with AVX2 SIMD code to Arm NEON. +This section walks through both approaches and uses them to migrate a sample x86 application with AVX2 SIMD code to Arm Neon. 
## Sample x86 code with AVX2 intrinsics @@ -258,7 +258,7 @@ If you're using the **agent file**, select the `arm-migration-agent` from the ag The assistant will: * Detect x86-specific intrinsics - * Rewrite SIMD code using NEON + * Rewrite SIMD code using Neon * Remove architecture-specific build flags * Update container and dependency configurations as needed @@ -288,6 +288,6 @@ If compilation or runtime issues occur, feed the errors back to the AI assistant ## What you've accomplished and what's next -In this section, you've created an Arm Cloud Migration Agent in GitHub Copilot using either a prompt file or an agent file that can be assigned to issues in the GitHub interface, and used it to perform a fully automated migration of x86 AVX2 SIMD code to Arm NEON. You've seen how structured instructions enable the assistant to analyze, transform, and verify architecture-specific code. +In this section, you've created an Arm Cloud Migration Agent in GitHub Copilot using either a prompt file or an agent file that can be assigned to issues in the GitHub interface, and used it to perform a fully automated migration of x86 AVX2 SIMD code to Arm Neon. You've seen how structured instructions enable the assistant to analyze, transform, and verify architecture-specific code. In the next section, you'll learn how to configure other agentic AI systems with the same migration workflow. diff --git a/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/migration.md b/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/migration.md index 2a8482de99..70922dc97a 100644 --- a/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/migration.md +++ b/content/learning-paths/servers-and-cloud-computing/arm-soc-migration-learning-path/migration.md @@ -71,7 +71,7 @@ Compare Graviton3 and BCM2712 architecture capabilities. What are the key differences I need to handle for cloud-to-edge migration? 
``` -For Graviton3 to Pi 5, the Power identifies key differences such as CPU architecture (Neoverse-V1 vs Cortex-A76), available memory (cloud 64 GB+ vs edge 4–8 GB), SIMD capabilities (SVE vs NEON), and peripheral requirements (none vs GPIO/SPI/I2C). The Power analyzes these differences for any Arm SoC pair and identifies the migration challenges you need to address. +For Graviton3 to Pi 5, the Power identifies key differences such as CPU architecture (Neoverse-V1 vs Cortex-A76), available memory (cloud 64 GB+ vs edge 4–8 GB), SIMD capabilities (SVE vs Neon), and peripheral requirements (none vs GPIO/SPI/I2C). The Power analyzes these differences for any Arm SoC pair and identifies the migration challenges you need to address. ### Design the Hardware Abstraction Layer (HAL) diff --git a/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/01-introduction.md b/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/01-introduction.md index e5bbe5ecbb..135202d128 100644 --- a/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/01-introduction.md +++ b/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/01-introduction.md @@ -1,6 +1,6 @@ --- # User change -title: "Optimize bitmap scanning in databases with SVE and NEON on Arm servers" +title: "Optimize bitmap scanning in databases with SVE and Neon on Arm servers" weight: 2 @@ -10,12 +10,12 @@ layout: "learningpathall" Bitmap scanning is a core operation in many database systems. It's essential for powering fast filtering in bitmap indexes, Bloom filters, and column filters. However, these scans can become performance bottlenecks in complex analytical queries. -In this Learning Path, you’ll learn how to accelerate bitmap scanning using Arm’s vector processing technologies - NEON and SVE - on Neoverse V2–based servers like AWS Graviton4. 
+In this Learning Path, you’ll learn how to accelerate bitmap scanning using Arm’s vector processing technologies - Neon and SVE - on Neoverse V2–based servers like AWS Graviton4. Specifically, you will: * Explore how to use SVE instructions on Arm Neoverse V2–based servers like AWS Graviton4 to optimize bitmap scanning -* Compare scalar, NEON, and SVE implementations to demonstrate the performance benefits of specialized vector instructions +* Compare scalar, Neon, and SVE implementations to demonstrate the performance benefits of specialized vector instructions ## What is bitmap scanning in databases? @@ -35,7 +35,7 @@ Here's how vector processing has evolved to improve bitmap scanning performance: * **Generic scalar processing**: traditional bit-by-bit processing with conditional branches * **Optimized scalar processing**: byte-level skipping to avoid processing empty bytes -* **NEON**: fixed-width 128-bit SIMD processing with vector operations +* **Neon**: fixed-width 128-bit SIMD processing with vector operations * **SVE**: scalable vector processing with predication and specialized instructions like MATCH ## Set up your Arm development environment diff --git a/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/03-scalar-implementations.md b/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/03-scalar-implementations.md index 320549b0f6..d0d076f141 100644 --- a/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/03-scalar-implementations.md +++ b/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/03-scalar-implementations.md @@ -76,7 +76,7 @@ size_t result_count = 0; ``` Instead of iterating through each bit individually, this implementation processes one byte (8 bits) at a time. The main optimization over the previous scalar implementation is checking if an entire byte is zero and skipping it entirely. For sparse bitmaps, this can dramatically reduce the number of bit checks. 
-## Next up: accelerate bitmap scanning with NEON and SVE +## Next up: accelerate bitmap scanning with Neon and SVE You’ve now implemented two scalar scanning routines: @@ -86,4 +86,4 @@ You’ve now implemented two scalar scanning routines: These provide a solid foundation and performance baseline—but scalar methods can only take you so far. To unlock real throughput gains, it’s time to leverage SIMD (Single Instruction, Multiple Data) execution. -In the next section, you’ll explore how to use Arm NEON and SVE vector instructions to accelerate bitmap scanning. These approaches will process multiple bytes at once and significantly outperform scalar loops—especially on modern Arm-based CPUs like AWS Graviton4. +In the next section, you’ll explore how to use Arm Neon and SVE vector instructions to accelerate bitmap scanning. These approaches will process multiple bytes at once and significantly outperform scalar loops—especially on modern Arm-based CPUs like AWS Graviton4. diff --git a/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/04-vector-implementations.md b/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/04-vector-implementations.md index 3de8fba739..f08825fb22 100644 --- a/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/04-vector-implementations.md +++ b/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/04-vector-implementations.md @@ -1,6 +1,6 @@ --- # User change -title: "Vectorized bitmap scanning with NEON and SVE" +title: "Vectorized bitmap scanning with Neon and SVE" weight: 5 @@ -8,13 +8,13 @@ layout: "learningpathall" --- -Modern Arm CPUs like Neoverse V2 support SIMD (Single Instruction, Multiple Data) extensions that allow processing multiple bytes in parallel. In this section, you'll explore how NEON and SVE vector instructions can dramatically accelerate bitmap scanning by skipping over large regions of unset data and reducing per-bit processing overhead. 
+Modern Arm CPUs like Neoverse V2 support SIMD (Single Instruction, Multiple Data) extensions that allow processing multiple bytes in parallel. In this section, you'll explore how Neon and SVE vector instructions can dramatically accelerate bitmap scanning by skipping over large regions of unset data and reducing per-bit processing overhead. -## NEON implementation +## Neon implementation -This implementation uses NEON SIMD (Single Instruction, Multiple Data) instructions to process 16 bytes (128 bits) at a time, significantly accelerating the scanning process. +This implementation uses Neon SIMD (Single Instruction, Multiple Data) instructions to process 16 bytes (128 bits) at a time, significantly accelerating the scanning process. -Copy the NEON implementation shown below into the same file: +Copy the Neon implementation shown below into the same file: ```c // NEON implementation of bit vector scanning @@ -84,7 +84,7 @@ size_t scan_bitvector_neon(bitvector_t* bv, uint32_t* result_positions) { return result_count; } ``` -This NEON implementation processes 16 bytes at a time with vector instructions. For sparse bitmaps, entire 16-byte chunks can be skipped at once, providing a significant speedup over byte-level skipping. After vector processing, it falls back to scalar code for any remaining bytes that don't fill a complete 16-byte chunk. +This Neon implementation processes 16 bytes at a time with vector instructions. For sparse bitmaps, entire 16-byte chunks can be skipped at once, providing a significant speedup over byte-level skipping. After vector processing, it falls back to scalar code for any remaining bytes that don't fill a complete 16-byte chunk. 
## SVE implementation @@ -159,6 +159,6 @@ The SVE implementation efficiently scans bitmaps by using `svcmpne_u8` to identi ## Next up: apply vectorized scanning to database workloads -With both NEON and SVE implementations in place, you’ve now unlocked the full power of Arm’s vector processing capabilities for bitmap scanning. These SIMD techniques allow you to process large bitvectors more efficiently—especially when filtering sparse datasets or skipping over large blocks of empty rows. +With both Neon and SVE implementations in place, you’ve now unlocked the full power of Arm’s vector processing capabilities for bitmap scanning. These SIMD techniques allow you to process large bitvectors more efficiently—especially when filtering sparse datasets or skipping over large blocks of empty rows. In the next section, you’ll learn how to apply these optimizations in the context of real database operations like bitmap index scans, Bloom filter probes, and column filtering. You’ll also explore best practices for selecting the right implementation based on bit density, and tuning for maximum performance on AWS Graviton4. 
\ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/05-benchmarking-and-results.md b/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/05-benchmarking-and-results.md index 0c5edf7bba..e0c3d6077d 100644 --- a/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/05-benchmarking-and-results.md +++ b/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/05-benchmarking-and-results.md @@ -142,7 +142,7 @@ When running on a Graviton4 c8g.large instance with Ubuntu 24.04, the results sh ### Execution time (ms) -| Density | Set Bits | Scalar Generic | Scalar Optimized | NEON | SVE | +| Density | Set Bits | Scalar Generic | Scalar Optimized | Neon | SVE | |---------|----------|----------------|------------------|-------|------------| | 0.0000 | 0 | 7.169 | 0.456 | 0.056 | 0.093 | | 0.0001 | 1,000 | 7.176 | 0.477 | 0.090 | 0.109 | @@ -152,7 +152,7 @@ When running on a Graviton4 c8g.large instance with Ubuntu 24.04, the results sh ### Speed-up vs generic scalar -| Density | Scalar Optimized | NEON | SVE | +| Density | Scalar Optimized | Neon | SVE | |---------|------------------|---------|------------| | 0.0000 | 15.72x | 127.41x | 77.70x | | 0.0001 | 15.05x | 80.12x | 65.86x | @@ -172,27 +172,27 @@ The optimized scalar implementation shows significant improvements over the gene * **Reduced Function Calls**: accessing bits directly rather than through function calls * **Better Cache Utilization**: more sequential memory access patterns -### Optimized scalar vs NEON +### Optimized scalar vs Neon -The NEON implementation shows further improvements over the optimized scalar implementation for sparse bit vectors due to: +The Neon implementation shows further improvements over the optimized scalar implementation for sparse bit vectors due to: * **Chunk-level Skipping**: quickly skipping 16 empty bytes at once * **Vectorized Comparison**: checking multiple bytes in parallel * 
**Early Termination**: quickly determining if a chunk contains any set bits -### NEON vs SVE +### Neon vs SVE -The performance comparison between NEON and SVE depends on the bit density: +The performance comparison between Neon and SVE depends on the bit density: * **Very Sparse Bit Vectors (0% - 0.01% density)**: - - NEON performs better for empty bitvectors due to lower overhead - - NEON achieves up to 127.41x speedup over generic scalar + - Neon performs better for empty bitvectors due to lower overhead + - Neon achieves up to 127.41x speedup over generic scalar - SVE performs better for very sparse bitvectors (0.001% density) - SVE achieves up to 29.07x speedup over generic scalar at 0.001% density * **Higher Density Bit Vectors (0.1% - 10% density)**: - - SVE consistently outperforms NEON - - SVE achieves up to 1.66x speedup over NEON at 0.01% density + - SVE consistently outperforms Neon + - SVE achieves up to 1.66x speedup over Neon at 0.01% density ### Key optimizations in SVE implementation @@ -210,7 +210,7 @@ The SVE implementation includes several key optimizations: ## Next up: apply what you’ve learned to real-world workloads -Now that you’ve benchmarked all four bitmap scanning implementations—scalar (generic and optimized), NEON, and SVE—you have a data-driven understanding of how vectorization impacts performance across different bitmap densities. +Now that you’ve benchmarked all four bitmap scanning implementations—scalar (generic and optimized), Neon, and SVE—you have a data-driven understanding of how vectorization impacts performance across different bitmap densities. 
In the next section, you’ll explore how to apply these techniques in real-world database workloads, including: diff --git a/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/06-application-and-best-practices.md b/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/06-application-and-best-practices.md index 1f5bdc05dd..a6757f0763 100644 --- a/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/06-application-and-best-practices.md +++ b/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/06-application-and-best-practices.md @@ -11,24 +11,24 @@ layout: "learningpathall" Optimized bitmap scanning can accelerate several core operations in modern database engines, particularly those used for analytical and vectorized workloads. ### Bitmap index scans -Bitmap indexes are widely used in analytical databases to accelerate queries with multiple filter predicates across large datasets. The NEON and SVE implementations can significantly speed up the scanning of these bitmap indexes, especially for queries with low selectivity. +Bitmap indexes are widely used in analytical databases to accelerate queries with multiple filter predicates across large datasets. The Neon and SVE implementations can significantly speed up the scanning of these bitmap indexes, especially for queries with low selectivity. ### Bloom filter checks -Bloom filters are probabilistic structures used to test set membership, commonly employed in join filters or subquery elimination. Vectorized scanning via NEON or SVE accelerates these checks by quickly rejecting rows that don’t match, reducing the workload on subsequent stages of the query. +Bloom filters are probabilistic structures used to test set membership, commonly employed in join filters or subquery elimination. Vectorized scanning via Neon or SVE accelerates these checks by quickly rejecting rows that don’t match, reducing the workload on subsequent stages of the query. 
### Column filtering -Columnar databases frequently use bitmap filters to track which rows satisfy filter conditions. These bitmaps can be scanned in a vectorized fashion using NEON or SVE instructions, substantially speeding up predicate evaluation and minimizing CPU cycles spent on row selection. +Columnar databases frequently use bitmap filters to track which rows satisfy filter conditions. These bitmaps can be scanned in a vectorized fashion using Neon or SVE instructions, substantially speeding up predicate evaluation and minimizing CPU cycles spent on row selection. ## Best practices Based on the benchmark results, here are some best practices for optimizing bitmap scanning operations: * Choose the right implementation based on the expected bit density**: - For empty bit vectors: NEON is optimal + - For empty bit vectors: Neon is optimal - For very sparse bit vectors (0.001% - 0.1% set bits): SVE is optimal due to efficient skipping - - For medium to high densities (> 0.1% density): SVE still outperforms NEON + - For medium to high densities (> 0.1% density): SVE still outperforms Neon * Implement Early Termination**: Always include a fast path for the no-hits case, as this can provide dramatic performance improvements. @@ -36,13 +36,13 @@ Based on the benchmark results, here are some best practices for optimizing bitm * Consider Memory Access Patterns**: Optimize memory access patterns to improve cache utilization. -* Leverage Vector Instructions**: Use NEON or SVE/SVE2 instructions to process multiple bytes in parallel. +* **Leverage Vector Instructions**: Use Neon or SVE/SVE2 instructions to process multiple bytes in parallel. ## Conclusion Scalable Vector Extension (SVE) instructions provide a powerful and portable way to accelerate bitmap scanning in modern database systems. When implemented on Arm Neoverse V2–based servers like AWS Graviton4, they deliver substantial performance improvements across a wide range of bit densities. 
-The SVE implementation shows particularly impressive performance for sparse bitvectors (0.001% - 0.1% density), where it outperforms both scalar and NEON implementations. For higher densities, it maintains a performance advantage by amortizing scan costs across wider vectors. +The SVE implementation shows particularly impressive performance for sparse bitvectors (0.001% - 0.1% density), where it outperforms both scalar and Neon implementations. For higher densities, it maintains a performance advantage by amortizing scan costs across wider vectors. These performance improvements can translate directly to faster query execution times, especially for analytical workloads that involve multiple bitmap operations. diff --git a/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/_index.md b/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/_index.md index a58b39fa60..312dd025d5 100644 --- a/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/bitmap_scan_sve2/_index.md @@ -1,5 +1,5 @@ --- -title: Accelerate Bitmap Scanning with NEON and SVE Instructions on Arm servers +title: Accelerate Bitmap Scanning with Neon and SVE Instructions on Arm servers minutes_to_complete: 20 @@ -8,7 +8,7 @@ who_is_this_for: This is an introductory topic for database developers, performa learning_objectives: - Understand bitmap scanning operations in database systems - - Implement bitmap scanning with scalar, NEON, and SVE instructions + - Implement bitmap scanning with scalar, Neon, and SVE instructions - Compare performance between different implementations - Measure performance improvements on Graviton4 instances @@ -33,7 +33,7 @@ operatingsystems: - Linux tools_software_languages: - SVE -- NEON +- Neon - Runbook further_reading: diff --git a/content/learning-paths/servers-and-cloud-computing/deepseek-cpu/deepseek-chatbot.md 
b/content/learning-paths/servers-and-cloud-computing/deepseek-cpu/deepseek-chatbot.md index e66eb3d64d..a59cb921b4 100644 --- a/content/learning-paths/servers-and-cloud-computing/deepseek-cpu/deepseek-chatbot.md +++ b/content/learning-paths/servers-and-cloud-computing/deepseek-cpu/deepseek-chatbot.md @@ -151,9 +151,9 @@ This model is `DeepSeek-R1-Q4_0-00001-of-00010.gguf`, so what does each componen As of [llama.cpp commit 0f1a39f3](https://github.com/ggerganov/llama.cpp/commit/0f1a39f3), Arm has contributed code for performance optimization with three types of GEMV/GEMM kernels corresponding to three processor types: -* AWS Graviton2, where you only have NEON support (you will see less improvement for these GEMV/GEMM kernels). +* AWS Graviton2, where you only have Neon support (you will see less improvement for these GEMV/GEMM kernels). * AWS Graviton3, where the GEMV/GEMM kernels exploit both SVE 256 and MATMUL INT8 support. -* AWS Graviton4, where the GEMV/GEMM kernels exploit NEON/SVE 128 and MATMUL_INT8 support. +* AWS Graviton4, where the GEMV/GEMM kernels exploit Neon/SVE 128 and MATMUL_INT8 support. With the latest commits in `llama.cpp` you will see improvements for these Arm optimized kernels directly on your Arm-based server. You can run the pre-quantized Q4_0 model as is and do not need to re-quantize the model. @@ -380,7 +380,7 @@ llama_perf_context_print: total time = 42340.53 ms / 531 tokens The `system_info` printed from llama.cpp highlights important architectural features present on your hardware that improve the performance of the model execution. In the output shown above from running on an AWS Graviton4 instance, you will see: - * NEON = 1 This flag indicates support for Arm's Neon technology which is an implementation of the Advanced SIMD instructions. + * NEON = 1 This flag indicates support for Arm's Neon technology which is an implementation of the Advanced SIMD instructions. 
* ARM_FMA = 1 This flag indicates support for Arm Floating-point Multiply and Accumulate instructions. * MATMUL_INT8 = 1 This flag indicates support for Arm int8 matrix multiplication instructions. * SVE = 1 This flag indicates support for the Arm Scalable Vector Extension. diff --git a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/1-overview.md b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/1-overview.md index 8d70d6472a..6c5e46b01c 100644 --- a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/1-overview.md +++ b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/1-overview.md @@ -27,7 +27,7 @@ When architecture-specific optimizations are present, migration may involve: - Identifying x86-specific intrinsics or assembly - Updating compiler flags and build configurations -- Mapping AVX2 operations to appropriate NEON equivalents +- Mapping AVX2 operations to appropriate Neon equivalents - Rewriting vectorized code and adjusting loop structures - Updating Dockerfiles, base images, and compiler flags - Validating correctness and performance on Arm systems @@ -76,7 +76,7 @@ This Learning Path uses a real-world example: a matrix multiplication benchmark The demo repository is available at [github.com/JoeStech/docker-blog-arm-migration](https://github.com/JoeStech/docker-blog-arm-migration). -By the end of this Learning Path, you'll have a working Arm64 container with NEON-optimized code and an automated pull request containing all migration changes. +By the end of this Learning Path, you'll have a working Arm64 container with Neon-optimized code and an automated pull request containing all migration changes. 
## What you've learned and what's next diff --git a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/3-understand-the-demo.md b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/3-understand-the-demo.md index 7e18d7c2f3..c3098fe4ea 100644 --- a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/3-understand-the-demo.md +++ b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/3-understand-the-demo.md @@ -24,7 +24,7 @@ Open the `Dockerfile`. There are two areas that require updates for Arm compatib Modern multi-architecture base images typically publish both `linux/amd64` and `linux/arm64` manifests. Updating the base image is the first step toward portability. -**Update compiler flags**: The `-mavx2` flag enables AVX2 vector instructions on x86. Arm processors use different SIMD instruction sets (NEON or SVE), so this flag must be removed or replaced when compiling for Arm. +**Update compiler flags**: The `-mavx2` flag enables AVX2 vector instructions on x86. Arm processors use different SIMD instruction sets (Neon or SVE), so this flag must be removed or replaced when compiling for Arm. Here is the full Dockerfile for reference: ```dockerfile @@ -77,12 +77,12 @@ To run this code on Arm, several adjustments are required: 2. **Intrinsic mapping**: Each AVX2 intrinsic must be mapped to an Arm equivalent. For example: - - `_mm256_setzero_pd()` creates a 256-bit zero vector of four doubles. Arm NEON uses 128-bit registers. - - `_mm256_loadu_pd()` loads 4 doubles at once (NEON loads 2 with `vld1q_f64`). - - `_mm256_add_pd()` and `_mm256_mul_pd()` are 256-bit operations (NEON uses 128-bit equivalents). - - `_mm256_extractf128_pd()` extracts the high 128 bits (not needed on NEON). + - `_mm256_setzero_pd()` creates a 256-bit zero vector of four doubles. Arm Neon uses 128-bit registers. + - `_mm256_loadu_pd()` loads 4 doubles at once (Neon loads 2 with `vld1q_f64`). 
+ - `_mm256_add_pd()` and `_mm256_mul_pd()` are 256-bit operations (Neon uses 128-bit equivalents). + - `_mm256_extractf128_pd()` extracts the high 128 bits (not needed on Neon). -3. **Vector width differences**: AVX2 operates on 256-bit registers (four double-precision values). NEON operates on 128-bit registers (two double-precision values). This affects: +3. **Vector width differences**: AVX2 operates on 256-bit registers (four double-precision values). Neon operates on 128-bit registers (two double-precision values). This affects: - Loop stride - Accumulation logic - Horizontal reduction patterns @@ -93,10 +93,10 @@ To run this code on Arm, several adjustments are required: _mm256_extractf128_pd(...) _mm256_castpd256_pd128(...) ``` -is specific to x86 register structure. On Arm, reduction is implemented using NEON reduction or pairwise-add instructions instead. +is specific to x86 register structure. On Arm, reduction is implemented using Neon reduction or pairwise-add instructions instead. {{% notice Note %}} -On newer Arm platforms supporting SVE or SVE2 (for example Neoverse V1/V2 based platforms), wider vector lengths may be available. SVE uses a vector-length-agnostic (VLA) model, which differs from fixed-width AVX2 and NEON programming. The Arm MCP Server knowledge base can help determine the appropriate approach for your target platform. +On newer Arm platforms supporting SVE or SVE2 (for example Neoverse V1/V2 based platforms), wider vector lengths may be available. SVE uses a vector-length-agnostic (VLA) model, which differs from fixed-width AVX2 and Neon programming. The Arm MCP Server knowledge base can help determine the appropriate approach for your target platform. 
{{% /notice %}} ## What you've learned and what's next @@ -104,6 +104,6 @@ On newer Arm platforms supporting SVE or SVE2 (for example Neoverse V1/V2 based You have: - Examined a legacy x86 application with AVX2 intrinsics - Identified the architecture-specific elements: base image, compiler flags, SIMD headers, and intrinsic functions -- Understood how vector width differences between AVX2 (256-bit) and NEON (128-bit) affect the migration approach +- Understood how vector width differences between AVX2 (256-bit) and Neon (128-bit) affect the migration approach Next, you'll use GitHub Copilot with the Docker MCP Toolkit to automate the migration process. diff --git a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/4-run-migration.md b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/4-run-migration.md index 1c77d14169..dd81516b7e 100644 --- a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/4-run-migration.md +++ b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/4-run-migration.md @@ -94,14 +94,14 @@ Searching knowledge base for: AVX2 to NEON intrinsic conversion The Arm MCP knowledge base provides documented guidance on intrinsic mapping and architecture considerations. 
Example mappings: -| x86 AVX2 Intrinsic | Arm NEON Equivalent | +| x86 AVX2 Intrinsic | Arm Neon Equivalent | |---------------------|---------------------| | `_mm256_setzero_pd()` | Two `vdupq_n_f64(0.0)` operations | | `_mm256_loadu_pd()` | Two `vld1q_f64()` loads | | `_mm256_add_pd()` | Two `vaddq_f64()` operations | | `_mm256_mul_pd()` | Two `vmulq_f64()` operations | -Because AVX2 operates on 256-bit vectors (four doubles) and NEON operates on 128-bit vectors (two doubles), Copilot adjusts: +Because AVX2 operates on 256-bit vectors (four doubles) and Neon operates on 128-bit vectors (two doubles), Copilot adjusts: - Loop stride - Accumulation logic - Horizontal reduction pattern @@ -142,9 +142,9 @@ After migration, you should see: **Source code updates**: - Added `#ifdef __aarch64__` architecture guards -- Replaced all `_mm256_*` AVX2 intrinsics with NEON equivalents (`vld1q_f64`, `vaddq_f64`, `vmulq_f64`) -- Adjusted loop strides from 4 (AVX2) to 2 (NEON) -- Rewrote horizontal reduction using NEON pair-wise addition +- Replaced all `_mm256_*` AVX2 intrinsics with Neon equivalents (`vld1q_f64`, `vaddq_f64`, `vmulq_f64`) +- Adjusted loop strides from 4 (AVX2) to 2 (Neon) +- Rewrote horizontal reduction using Neon pair-wise addition ## What you've learned and what's next diff --git a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/5-validate-and-next-steps.md b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/5-validate-and-next-steps.md index 52dbb54c99..7711df8d9e 100644 --- a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/5-validate-and-next-steps.md +++ b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/5-validate-and-next-steps.md @@ -105,8 +105,8 @@ Not all AI models produce equal results for migration tasks. While the Arm MCP S - Always use a current foundational model for best results. - Test any performance predictions the model makes against actual benchmarks. 
-- Review the generated NEON code for correctness, especially horizontal reductions and lane indexing. -- NEON lane indices must be compile-time constants, not variables. +- Review the generated Neon code for correctness, especially horizontal reductions and lane indexing. +- Neon lane indices must be compile-time constants, not variables. ## Explore further diff --git a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/_index.md b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/_index.md index 9208107622..d4edfe75ae 100644 --- a/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/docker-mcp-toolkit/_index.md @@ -1,7 +1,7 @@ --- title: Automate x86 to Arm Migration with Docker MCP Toolkit, VS Code and GitHub Copilot -description: Learn how to use the Docker MCP Toolkit with the Arm MCP Server and GitHub Copilot to automate container and code migration from x86 to Arm64. Through a hands-on example, migrate a legacy C++ application with AVX2 intrinsics to Arm NEON. +description: Learn how to use the Docker MCP Toolkit with the Arm MCP Server and GitHub Copilot to automate container and code migration from x86 to Arm64. Through a hands-on example, migrate a legacy C++ application with AVX2 intrinsics to Arm Neon. 
minutes_to_complete: 45 @@ -13,7 +13,7 @@ learning_objectives: - Install and configure the Docker MCP Toolkit with the Arm MCP Server, GitHub MCP Server, and Sequential Thinking MCP Server - Connect the MCP Gateway to VS Code with GitHub Copilot - Use AI agents to scan codebases for x86-specific dependencies and intrinsics - - Automate the conversion of x86 AVX2 intrinsics to Arm NEON equivalents using the Arm MCP Server knowledge base + - Automate the conversion of x86 AVX2 intrinsics to Arm Neon equivalents using the Arm MCP Server knowledge base - Create and manage pull requests with migrated code using the GitHub MCP Server prerequisites: diff --git a/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-chatbot.md b/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-chatbot.md index f7de4cec2c..18ff4e0972 100644 --- a/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-chatbot.md +++ b/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-chatbot.md @@ -168,9 +168,9 @@ In this guide, you will not use any other quantization methods, because Arm has As of [llama.cpp commit 0f1a39f3](https://github.com/ggerganov/llama.cpp/commit/0f1a39f3), Arm has contributed code for performance optimization with three types of GEMV/GEMM kernels corresponding to three processor types: -* AWS Graviton2, where you only have NEON support (you will see less improvement for these GEMV/GEMM kernels), +* AWS Graviton2, where you only have Neon support (you will see less improvement for these GEMV/GEMM kernels), * AWS Graviton3, where the GEMV/GEMM kernels exploit both SVE 256 and MATMUL INT8 support, and -* AWS Graviton4, where the GEMV/GEMM kernels exploit NEON/SVE 128 and MATMUL_INT8 support +* AWS Graviton4, where the GEMV/GEMM kernels exploit Neon/SVE 128 and MATMUL_INT8 support With the latest commits in `llama.cpp` you will see improvements for these Arm optimized kernels directly on your Arm-based server. 
You can run the pre-quantized Q4_0 model as is and do not need to re-quantize the model. @@ -232,7 +232,7 @@ llama_perf_context_print: total time = 8427.77 ms / 525 tokens The `system_info` printed from llama.cpp highlights important architectural features present on your hardware that improve the performance of the model execution. In the output shown above from running on an AWS Graviton4 instance, you will see: - * NEON = 1 This flag indicates support for Arm's Neon technology which is an implementation of the Advanced SIMD instructions + * NEON = 1 This flag indicates support for Arm's Neon technology which is an implementation of the Advanced SIMD instructions * ARM_FMA = 1 This flag indicates support for Arm Floating-point Multiply and Accumulate instructions * MATMUL_INT8 = 1 This flag indicates support for Arm int8 matrix multiplication instructions * SVE = 1 This flag indicates support for the Arm Scalable Vector Extension diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/2_llama.cpp_intro.md b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/2_llama.cpp_intro.md index 40a8e87fd0..5e435940f2 100644 --- a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/2_llama.cpp_intro.md +++ b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/2_llama.cpp_intro.md @@ -57,7 +57,7 @@ The architecture of llama.cpp includes several key components that work together ![Architecture diagram showing llama.cpp components including backends, ggml-cpu library, and KleidiAI integration alt-text#center](images/llama_components.jpg "llama.cpp components") -llama.cpp provides optimized support for Arm CPUs through its `ggml-cpu` library, which leverages Arm-specific vector instructions such as NEON and SVE, and includes an AArch64 trait that accelerates inference using 8-bit integer multiply (i8mm) instructions. 
The `ggml-cpu` library also integrates the Arm [KleidiAI](https://github.com/ARM-software/kleidiai) library as an additional trait. In addition to Arm CPU support, llama.cpp offers backends for GPU, CUDA, and OpenCL to enable inference on a variety of hardware platforms. +llama.cpp provides optimized support for Arm CPUs through its `ggml-cpu` library, which leverages Arm-specific vector instructions such as Neon and SVE, and includes an AArch64 trait that accelerates inference using 8-bit integer multiply (i8mm) instructions. The `ggml-cpu` library also integrates the Arm [KleidiAI](https://github.com/ARM-software/kleidiai) library as an additional trait. In addition to Arm CPU support, llama.cpp offers backends for GPU, CUDA, and OpenCL to enable inference on a variety of hardware platforms. ## Prefill and Decode in autoregressive LLMs diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/3_llama.cpp_annotation.md b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/3_llama.cpp_annotation.md index f6288f05ce..a44a59576d 100644 --- a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/3_llama.cpp_annotation.md +++ b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/3_llama.cpp_annotation.md @@ -182,7 +182,7 @@ Next, configure the project: Set `CMAKE_C_COMPILER` and `CMAKE_CXX_COMPILER` to your cross compiler path. Make sure that -march in `CMAKE_C_FLAGS` and `CMAKE_CXX_FLAGS` matches your Arm CPU hardware. -With the flags above you can run `llama-cli` on an Arm CPU that supports NEON dot product and 8-bit integer multiply (i8mm) instructions. +With the flags above you can run `llama-cli` on an Arm CPU that supports Neon dot product and 8-bit integer multiply (i8mm) instructions. 
The `-static` and `-g` options are also specified to produce a statically linked executable, so it can run on different Arm64 Linux/Android environments without needing shared libraries and to include debug information, which makes source code and function-level profiling in Streamline much easier. diff --git a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/4_analyze_token_prefill_decode.md b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/4_analyze_token_prefill_decode.md index 58838ce1dc..8f1bd01ec7 100644 --- a/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/4_analyze_token_prefill_decode.md +++ b/content/learning-paths/servers-and-cloud-computing/llama_cpp_streamline/4_analyze_token_prefill_decode.md @@ -120,7 +120,7 @@ By checking the string of Annotation Marker, the first token generation at Prefi You can see that the first token generated at the Prefill stage takes more time since 78 input tokens have to be processed at the Prefill stage, performing lots of GEMM operations. At the Decode stage, tokens are generated one by one at mostly equal speed; one token takes less time than that of the Prefill stage, thanks to the effect of KV cache. At the Decode stage, it performs many GEMV operations. -You can further investigate it with PMU event counters that are captured by Streamline. At the Prefill stage, the amount of computation, which is indicated by PMU event counters that count the number of Advanced SIMD (NEON), floating-point, and integer data processing instructions, is large. However, the memory access is relatively low. Especially, the number of L3 cache refill/miss is much lower than that of the Decode stage. +You can further investigate it with PMU event counters that are captured by Streamline. 
At the Prefill stage, the amount of computation, which is indicated by PMU event counters that count the number of Advanced SIMD (Neon), floating-point, and integer data processing instructions, is large. However, the memory access is relatively low. Especially, the number of L3 cache refill/miss is much lower than that of the Decode stage. At Decode stage, the amount of computation is relatively less (since the time of each token is less), but the number of L3 cache refills/misses increases significantly. @@ -137,7 +137,7 @@ In the Functions view of Streamline, you can see the overall percentage of runni ![Screenshot of Streamline Functions view displaying execution time percentages for different functions during llama.cpp execution alt-text#center](images/annotation_prefill_functions.png "Functions view") -As you can see, the function, graph_compute, takes the largest portion of the running time. It shows that large amounts of GEMM and GEMV operations take most of the time. With the `Qwen1_5-0_5b-chat-q4_0` model, the computation (GEMM and GEMV) of Q, K, V vectors and most of FFN layers: their weights are with Q4_0 data type and the input activations are with FP32 data type. The computation is forwarded to KleidiAI trait by `ggml_cpu_extra_compute_forward`. KleidiAI microkernels implemented with NEON dot product and i8mm vector instructions accelerate the computation. +As you can see, the function, graph_compute, takes the largest portion of the running time. It shows that large amounts of GEMM and GEMV operations take most of the time. With the `Qwen1_5-0_5b-chat-q4_0` model, the computation (GEMM and GEMV) of Q, K, V vectors and most of FFN layers: their weights are with Q4_0 data type and the input activations are with FP32 data type. The computation is forwarded to KleidiAI trait by `ggml_cpu_extra_compute_forward`. KleidiAI microkernels implemented with Neon dot product and i8mm vector instructions accelerate the computation. 
At the Prefill stage, `kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm` KleidiAI ukernel is used for GEMM (Matrix Multiply) operators. It takes advantage of i8mm instructions. Since the Prefill stage only takes a small percentage of the whole time, the percentage of this function is small as shown in figures above. However, if you focus only on the Prefill stage with Samplings view in Timeline, you see `kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm` takes the largest portion of the Prefill stage. diff --git a/content/learning-paths/servers-and-cloud-computing/migrate-ease/_index.md b/content/learning-paths/servers-and-cloud-computing/migrate-ease/_index.md index 7390563c20..7149f14564 100644 --- a/content/learning-paths/servers-and-cloud-computing/migrate-ease/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/migrate-ease/_index.md @@ -27,7 +27,7 @@ armips: operatingsystems: - Linux tools_software_languages: - - NEON + - Neon - SVE - Go - Runbook diff --git a/content/learning-paths/servers-and-cloud-computing/migration/_index.md b/content/learning-paths/servers-and-cloud-computing/migration/_index.md index 9aec54e89a..9c3a09d745 100644 --- a/content/learning-paths/servers-and-cloud-computing/migration/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/migration/_index.md @@ -29,7 +29,7 @@ armips: operatingsystems: - Linux tools_software_languages: - - NEON + - Neon - SVE - Go - Runbook diff --git a/content/learning-paths/servers-and-cloud-computing/migration/c.md b/content/learning-paths/servers-and-cloud-computing/migration/c.md index 1052d1447d..64d7a899cb 100644 --- a/content/learning-paths/servers-and-cloud-computing/migration/c.md +++ b/content/learning-paths/servers-and-cloud-computing/migration/c.md @@ -115,11 +115,11 @@ Using LSE can result in significant performance improvement for your application Refer to the Learning Path [Learn about Large System 
Extensions](/learning-paths/servers-and-cloud-computing/lse/) for more information. -### Porting code with SSE/AVX intrinsics to NEON +### Porting code with SSE/AVX intrinsics to Neon You may have applications which include x86_64 intrinsics. These need special treatment when recompiling. -Refer to the Learning Path [Porting Architecture Specific Intrinsics](/learning-paths/cross-platform/intrinsics/) for the available options to migrate x86_64 intrinsics to NEON. +Refer to the Learning Path [Porting Architecture Specific Intrinsics](/learning-paths/cross-platform/intrinsics/) for the available options to migrate x86_64 intrinsics to Neon. ### Signed and unsigned char data types @@ -139,5 +139,5 @@ Both a compiler and a Linux kernel that supports SVE is required. You will need GCC-11 or newer or LLVM-14 or newer and Linux kernel 4.15 or newer for SVE support. -Refer to the Learning Path [From Arm NEON to SVE](/learning-paths/servers-and-cloud-computing/sve/sve_basics/) for more information and examples. +Refer to the Learning Path [From Arm Neon to SVE](/learning-paths/servers-and-cloud-computing/sve/sve_basics/) for more information and examples. diff --git a/content/learning-paths/servers-and-cloud-computing/migration/java.md b/content/learning-paths/servers-and-cloud-computing/migration/java.md index 915914a840..f49d1bdd12 100644 --- a/content/learning-paths/servers-and-cloud-computing/migration/java.md +++ b/content/learning-paths/servers-and-cloud-computing/migration/java.md @@ -87,7 +87,7 @@ uint UseSVE = 0 bool UseSimpleArrayEquals = false {ARCH product} {default} ``` -Depending on your application, you may want to investigate the vector processing flags for SIMD, NEON, SVE, and CRC. +Depending on your application, you may want to investigate the vector processing flags for SIMD, Neon, SVE, and CRC. You can try [Process Watch](/learning-paths/servers-and-cloud-computing/processwatch/) to monitor the usage of SIMD and CRC instructions. 
diff --git a/content/learning-paths/servers-and-cloud-computing/migration/migration.md b/content/learning-paths/servers-and-cloud-computing/migration/migration.md index 972497e244..8c10b938d2 100644 --- a/content/learning-paths/servers-and-cloud-computing/migration/migration.md +++ b/content/learning-paths/servers-and-cloud-computing/migration/migration.md @@ -81,7 +81,7 @@ Here are a number of real-world migration scenarios. They cover migrations rangi | Application migration | Migration results | |-----------------------|--------------------| |Node.js application | Just works! nothing special for Arm and you may not even notice it is running on Arm| -|C++ application has some x86_64 intrinsics | [Migrate to NEON](/learning-paths/cross-platform/intrinsics/) using sse2neon or SIMDe| +|C++ application has some x86_64 intrinsics | [Migrate to Neon](/learning-paths/cross-platform/intrinsics/) using sse2neon or SIMDe| |Pandoc (documentation tool) has a filter not available on Arm|Rebuild dependency library from source (and ask maintainers for Arm support)| |Encryption in a Java app is slow | Use -XX:+UnlockDiagnosticVMOptions -XX:+UseAESCTRIntrinsics flags to improve Arm crypto performance| |Dependent container not available for Arm|Build the container yourself (and ask the maintainers for Arm support)| diff --git a/content/learning-paths/servers-and-cloud-computing/processwatch/running-processwatch.md b/content/learning-paths/servers-and-cloud-computing/processwatch/running-processwatch.md index f2f530e2f0..d2eedbbd43 100644 --- a/content/learning-paths/servers-and-cloud-computing/processwatch/running-processwatch.md +++ b/content/learning-paths/servers-and-cloud-computing/processwatch/running-processwatch.md @@ -45,7 +45,7 @@ By default, Process Watch: * Prints results until the tool is killed (via Ctrl+c). * Prints all results in a table format on `stdout`. * Profiles all running processes. 
- * Displays counts for the default filters, which are 'FPARMv8', 'NEON', 'SVE', and 'SVE2'. + * Displays counts for the default filters, which are 'FPARMv8', 'Neon', 'SVE', and 'SVE2'. * Sets the sample period to every 10000 events. ## Default Process Watch output @@ -57,7 +57,7 @@ sudo ./processwatch The output should look like: ```output -PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL +PID NAME FPARMv8 Neon SVE SVE2 %TOTAL TOTAL ALL ALL 0.00 0.29 0.00 0.00 100.00 346 17400 processwatch 0.00 0.36 0.00 0.00 80.64 279 254 systemd-journal 0.00 0.00 0.00 0.00 13.01 45 @@ -66,14 +66,14 @@ ALL ALL 0.00 0.29 0.00 0.00 100.00 346 560 snapd 0.00 0.00 0.00 0.00 1.16 04 296 multipathd 0.00 0.00 0.00 0.00 0.58 02 -PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL +PID NAME FPARMv8 Neon SVE SVE2 %TOTAL TOTAL ALL ALL 3.57 12.86 0.00 0.00 100.00 140 17400 processwatch 3.73 13.43 0.00 0.00 95.71 134 4939 sshd 0.00 0.00 0.00 0.00 2.86 04 296 multipathd 0.00 0.00 0.00 0.00 0.71 01 560 snapd 0.00 0.00 0.00 0.00 0.71 01 -PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL +PID NAME FPARMv8 Neon SVE SVE2 %TOTAL TOTAL ALL ALL 1.18 5.12 0.00 0.00 100.00 254 17400 processwatch 1.19 5.16 0.00 0.00 99.21 252 6651 packagekitd 0.00 0.00 0.00 0.00 0.39 01 diff --git a/content/learning-paths/servers-and-cloud-computing/processwatch/using-processwatch.md b/content/learning-paths/servers-and-cloud-computing/processwatch/using-processwatch.md index 450ff887b2..bd4b434138 100644 --- a/content/learning-paths/servers-and-cloud-computing/processwatch/using-processwatch.md +++ b/content/learning-paths/servers-and-cloud-computing/processwatch/using-processwatch.md @@ -7,7 +7,7 @@ layout: learningpathall --- ## Using Process Watch -You can use Process Watch to determine the presence or absence of certain instructions. In this section, you will use Process Watch to detect the use of NEON and SVE instructions by this example workload. 
Start by saving the simple workload shown below in a file name `workload.c`: +You can use Process Watch to determine the presence or absence of certain instructions. In this section, you will use Process Watch to detect the use of Neon and SVE instructions by this example workload. Start by saving the simple workload shown below in a file named `workload.c`: ```C #include #define LEN 1024 @@ -29,7 +29,7 @@ Compile the workload without applying any optimizations: ```bash aarch64-linux-gnu-gcc workload.c -o workload_none -O0 ``` -Now, run the workload in the background and launch `processwatch` on the workload to detect the use of NEON and SVE instructions: +Now, run the workload in the background and launch `processwatch` on the workload to detect the use of Neon and SVE instructions: ```bash ./workload_none & [1] 126958 @@ -50,15 +50,15 @@ ALL ALL 0.00 0.00 100.00 26006 ^C ``` -You can see that in this case, the workload is not making use of NEON or SVE instructions. +You can see that in this case, the workload is not making use of Neon or SVE instructions. -## Case 2: Use NEON instructions -Now recompile the same workload to make use of NEON instructions: +## Case 2: Use Neon instructions +Now recompile the same workload to make use of Neon instructions: ```bash aarch64-linux-gnu-gcc workload.c -o workload_neon -O2 -ftree-vectorize -march=armv8.6-a ``` -Run the workload in the background and launch `processwatch` on the workload to detect the use of NEON and SVE instructions: +Run the workload in the background and launch `processwatch` on the workload to detect the use of Neon and SVE instructions: ```bash ./workload_neon & [1] 126987 @@ -78,9 +78,9 @@ ALL ALL 32.45 0.00 100.00 26143 126987 workload_neon 32.45 0.00 100.00 26143 ^C ``` -You can now see the workload is retiring NEON instructions as you would expect. +You can now see the workload is retiring Neon instructions as you would expect. 
-You can run `objdump` on the binary to view the disassembled NEON instructions: +You can run `objdump` on the binary to view the disassembled Neon instructions: ```bash objdump -S workload_neon @@ -108,7 +108,7 @@ Recompile the workload again. This time include support for SVE instructions: ```bash aarch64-linux-gnu-gcc workload.c -o workload_sve -O2 -ftree-vectorize -march=armv8.5-a+sve ``` -Run the workload in the background and launch `processwatch` on the workload to detect the use of NEON and SVE instructions: +Run the workload in the background and launch `processwatch` on the workload to detect the use of Neon and SVE instructions: ```bash ./workload_sve & [1] 126997 @@ -119,11 +119,11 @@ You will need to change the PID in the `processwatch` command with the PID of th The output should look like: ```output -PID NAME NEON SVEorSME %TOTAL TOTAL +PID NAME Neon SVEorSME %TOTAL TOTAL ALL ALL 0.00 96.68 100.00 24914 126997 workload_sve 0.00 96.68 100.00 24914 -PID NAME NEON SVEorSME %TOTAL TOTAL +PID NAME Neon SVEorSME %TOTAL TOTAL ALL ALL 0.00 96.74 100.00 26137 126997 workload_sve 0.00 96.74 100.00 26137 ^C diff --git a/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/applications.md b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/applications.md index 1f1d1b2afe..8d923d44fb 100644 --- a/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/applications.md +++ b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/applications.md @@ -15,13 +15,13 @@ Reproducibility is not required for every application, but it is critical in sev ## Auto-vectorization -Modern compilers automatically vectorize scalar loops when possible. Depending on compiler decisions, the same source code can be executed as a scalar loop, a NEON vectorized loop, or an SVE vectorized loop. Vectorized loops also often include scalar tail handling for leftover elements that don't fill an entire vector. 
+Modern compilers automatically vectorize scalar loops when possible. Depending on compiler decisions, the same source code can be executed as a scalar loop, a Neon vectorized loop, or an SVE vectorized loop. Vectorized loops also often include scalar tail handling for leftover elements that don't fill an entire vector. -Reproducibility across math routines guarantees that vectorized loops (NEON or SVE) match regardless of which path the compiler selects. It also ensures that loops over scalar routines produce the same results as their vectorized counterparts, so changing vector width or enabling/disabling auto-vectorization does not change the final output. +Reproducibility across math routines guarantees that vectorized loops (Neon or SVE) match regardless of which path the compiler selects. It also ensures that loops over scalar routines produce the same results as their vectorized counterparts, so changing vector width or enabling/disabling auto-vectorization does not change the final output. ## Distributed computing -In distributed or parallel workloads, computations are often decomposed across multiple machines or execution units. Different nodes can execute scalar, NEON, or SVE code paths, and the decomposition of work can change between runs. Without reproducible math routines, small numerical differences accumulate and lead to divergent final results. +In distributed or parallel workloads, computations are often decomposed across multiple machines or execution units. Different nodes can execute scalar, Neon, or SVE code paths, and the decomposition of work can change between runs. Without reproducible math routines, small numerical differences accumulate and lead to divergent final results. 
## Embedded and real-time systems diff --git a/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/examples.md b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/examples.md index 38e73a05b2..50c129851b 100644 --- a/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/examples.md +++ b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/examples.md @@ -1,5 +1,5 @@ --- -title: Verify reproducible results across scalar, NEON, and SVE +title: Verify reproducible results across scalar, Neon, and SVE weight: 5 ### FIXED, DO NOT MODIFY @@ -90,9 +90,9 @@ y = 2.613692045211792 [0x1.4e8d76p+1] {{< /tabpane >}} -### NEON usage +### Neon usage -Next, replace the contents of `app.c` with the following NEON application that invokes the reproducible NEON implementation of the single-precision exponential function `armpl_vexpq_f32()`. Compile and run it again with reproducibility enabled and disabled to compare the results. +Next, replace the contents of `app.c` with the following Neon application that invokes the reproducible Neon implementation of the single-precision exponential function `armpl_vexpq_f32()`. Compile and run it again with reproducibility enabled and disabled to compare the results. {{< tabpane code=true >}} {{< tab header="C code" language="C" output_lines="15">}} @@ -181,14 +181,14 @@ y (lane 7): 2.613692045211792 [0x1.4e8d76p+1] {{< /tab >}} {{< /tabpane >}} -All active lanes of `y` are guaranteed to match the scalar and NEON results exactly. +All active lanes of `y` are guaranteed to match the scalar and Neon results exactly. ### Scope and limitations -In this section you observed that, when reproducibility is enabled (`AMATH_REPRO` enabled), `expf()` produces bitwise-identical results whether it is executed as a scalar, NEON or SVE function. 
+In this section you observed that, when reproducibility is enabled (`AMATH_REPRO` enabled), `expf()` produces bitwise-identical results whether it is executed as a scalar, Neon or SVE function. -This behavior extends to other reproducible math routines in Libamath. Scalar, NEON, and SVE implementations are numerically aligned for all functions listed in `amath_repro.h`. Reproducible symbols are always prefixed by `armpl_` and are not provided with `ZGV` mangling. Reproducibility is available on Linux platforms, and results are independent of vector width or instruction selection. Reproducible routines prioritize determinism over peak performance. +This behavior extends to other reproducible math routines in Libamath. Scalar, Neon, and SVE implementations are numerically aligned for all functions listed in `amath_repro.h`. Reproducible symbols are always prefixed by `armpl_` and are not provided with `ZGV` mangling. Reproducibility is available on Linux platforms, and results are independent of vector width or instruction selection. Reproducible routines prioritize determinism over peak performance. ## What you've learned and what's next -In this Learning Path, you learned what numerical reproducibility means in floating-point software and explored real-world applications where it is critical. You then enabled cross-vector-extension reproducibility in Libamath and verified that scalar, NEON, and SVE code paths produce bitwise-identical results for the `expf()` function. You can now apply these techniques to your own applications using Arm Performance Libraries. \ No newline at end of file +In this Learning Path, you learned what numerical reproducibility means in floating-point software and explored real-world applications where it is critical. You then enabled cross-vector-extension reproducibility in Libamath and verified that scalar, Neon, and SVE code paths produce bitwise-identical results for the `expf()` function. 
You can now apply these techniques to your own applications using Arm Performance Libraries. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/reproducibility.md b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/reproducibility.md index 4af1998e4a..58fdc15265 100644 --- a/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/reproducibility.md +++ b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/reproducibility.md @@ -8,7 +8,7 @@ layout: learningpathall ## What is reproducibility? -In numerical software, reproducibility (also referred to as determinism) means you get the exact same floating-point bits for the same inputs, even if you run a different implementation (scalar vs NEON vs SVE). +In numerical software, reproducibility (also referred to as determinism) means you get the exact same floating-point bits for the same inputs, even if you run a different implementation (scalar vs Neon vs SVE). In pure mathematics, two functions `𝑓(𝑥)` and `𝑔(𝑥)` are equivalent if, for all `𝑥` in their domain `𝑓(𝑥) = 𝑔(𝑥)`. @@ -33,9 +33,9 @@ Reproducibility can be defined at different levels, depending on how similar or Reproducibility across different processor architectures, such as x86 and AArch64. * **Cross-vector-extension reproducibility** - Reproducibility across different vector execution paths on the same architecture, such as scalar, NEON, and SVE on AArch64. + Reproducibility across different vector execution paths on the same architecture, such as scalar, Neon, and SVE on AArch64. -This Learning Path focuses on cross-vector-extension reproducibility (scalar, NEON, SVE on AArch64). +This Learning Path focuses on cross-vector-extension reproducibility (scalar, Neon, SVE on AArch64). 
Now that you understand what numerical reproducibility means and the different levels it can operate at, the next section covers real-world applications where this property is critical. diff --git a/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/reproducibility_libamath.md b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/reproducibility_libamath.md index 2f1f8b309d..39f69f2f8d 100644 --- a/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/reproducibility_libamath.md +++ b/content/learning-paths/servers-and-cloud-computing/reproducible-libamath/reproducibility_libamath.md @@ -8,9 +8,9 @@ layout: learningpathall ## Cross-vector-extension reproducibility -On Linux platforms, Libamath supports bitwise-reproducible results across scalar, NEON (AdvSIMD), and SVE implementations for a subset of math functions. +On Linux platforms, Libamath supports bitwise-reproducible results across scalar, Neon (AdvSIMD), and SVE implementations for a subset of math functions. -When reproducibility is enabled, the same input values produce identical floating-point results, regardless of whether a supported function is executed using the scalar, NEON, or SVE code path. This keeps your results deterministic even if your app takes different vector paths. +When reproducibility is enabled, the same input values produce identical floating-point results, regardless of whether a supported function is executed using the scalar, Neon, or SVE code path. This keeps your results deterministic even if your app takes different vector paths. Reproducible Libamath routines operate in the default accuracy mode, guaranteeing results within 3.5 ULP of the correctly rounded value. @@ -18,7 +18,7 @@ Reproducible routines prioritize determinism over peak performance. ## Reproducible symbols -When reproducibility is enabled, reproducible functions use the same public function names as their non-reproducible counterparts. 
The linker resolves calls to the reproducible implementations when you build with `-DAMATH_REPRO=1` and link `-lamath_repro`, and the scalar, NEON, and SVE variants of a function all produce bitwise-identical results. +When reproducibility is enabled, reproducible functions use the same public function names as their non-reproducible counterparts. The linker resolves calls to the reproducible implementations when you build with `-DAMATH_REPRO=1` and link `-lamath_repro`, and the scalar, Neon, and SVE variants of a function all produce bitwise-identical results. Unlike the symbols in `amath.h` (which don't guarantee reproducibility), reproducible symbols in `amath_repro` are not provided in `ZGV` mangling. Only the `armpl_` notation is used. @@ -38,12 +38,12 @@ Then compile and link with reproducibility enabled: gcc app.c -DAMATH_REPRO=1 -lamath_repro -o app ``` -The `-DAMATH_REPRO=1` flag enables reproducibility at compile time, and `-lamath_repro` links against the reproducible Libamath library. When you follow these steps, calls to supported functions resolve to the reproducible scalar, NEON, or SVE implementations. +The `-DAMATH_REPRO=1` flag enables reproducibility at compile time, and `-lamath_repro` links against the reproducible Libamath library. When you follow these steps, calls to supported functions resolve to the reproducible scalar, Neon, or SVE implementations. -With reproducibility configured, the next section walks through hands-on examples using `expf` across scalar, NEON, and SVE code paths. +With reproducibility configured, the next section walks through hands-on examples using `expf` across scalar, Neon, and SVE code paths. ## What you've learned and what's next You've learned how to enable reproducible math routines in Libamath through compile-time configuration and library linking. You can now compile code with the reproducible library variant and understand the trade-offs between reproducibility and peak performance. 
-Next, you'll verify reproducible behavior through hands-on examples that compare scalar, NEON, and SVE implementations of the exponential function. \ No newline at end of file +Next, you'll verify reproducible behavior through hands-on examples that compare scalar, Neon, and SVE implementations of the exponential function. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/sve/_index.md b/content/learning-paths/servers-and-cloud-computing/sve/_index.md index 86c07faa09..5a43876fff 100644 --- a/content/learning-paths/servers-and-cloud-computing/sve/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/sve/_index.md @@ -6,12 +6,12 @@ minutes_to_complete: 30 who_is_this_for: This is an introductory topic for software developers using SIMD instructions for High-Performance Computing, Machine Learning, Digital Signal Processing, Audio and Video Codec applications. learning_objectives: - - Understand the differences between SVE and NEON for vectorization + - Understand the differences between SVE and Neon for vectorization - Compile code for SVE-capable Arm processors - Run SVE instructions on any Armv8-A processor prerequisites: - - General knowledge about SIMD processing, vectorization or Arm NEON. + - General knowledge about SIMD processing, vectorization or Arm Neon. - An Arm computer running Linux. Cloud instances can be used, refer to the list of [Arm cloud service providers](/learning-paths/servers-and-cloud-computing/csp/). 
author: Florent Lebeau @@ -31,7 +31,7 @@ operatingsystems: - Linux tools_software_languages: - SVE - - NEON + - Neon - armie - GCC - armclang diff --git a/content/learning-paths/servers-and-cloud-computing/sve/sve_basics.md b/content/learning-paths/servers-and-cloud-computing/sve/sve_basics.md index 17cf661a79..ed85ea51ba 100644 --- a/content/learning-paths/servers-and-cloud-computing/sve/sve_basics.md +++ b/content/learning-paths/servers-and-cloud-computing/sve/sve_basics.md @@ -1,6 +1,6 @@ --- # User change -title: "From Arm NEON to SVE" +title: "From Arm Neon to SVE" weight: 2 # 1 is first, 2 is second, etc. @@ -8,18 +8,18 @@ weight: 2 # 1 is first, 2 is second, etc. layout: "learningpathall" --- -## Arm NEON +## Arm Neon Modern CPUs have vector units that operate in a SIMD fashion. This greatly improves application performance, depending on the vector width. -The Armv7-A Instruction Set Architecture (ISA) introduced Advanced SIMD or Arm NEON instructions. These instructions are supported on the latest Armv8-A and Armv9-A architectures. NEON registers are composed of 32 128-bit registers V0-V31 and support multiple data types: integer, single-precision (SP) floating-point and double-precision (DP) floating-point. +The Armv7-A Instruction Set Architecture (ISA) introduced Advanced SIMD or Arm Neon instructions. These instructions are supported on the latest Armv8-A and Armv9-A architectures. Neon registers are composed of 32 128-bit registers V0-V31 and support multiple data types: integer, single-precision (SP) floating-point and double-precision (DP) floating-point. ## Arm SVE In order to reduce restrictions regarding fixed-length vector sizes, Arm introduced the Scalable Vector Extension (SVE). Arm SVE is vector-length agnostic, allowing vector width from 128 up to 2048 bits. This enables software to scale dynamically to any SVE capable Arm hardware. 
-SVE is not an extension of NEON but a separate, optional extension of Arm v8-A with a new set of instruction encodings. +SVE is not an extension of Neon but a separate, optional extension of Armv8-A with a new set of instruction encodings. SVE is used in HPC and general-purpose server software. SVE2 adds capabilities to enable more data-processing domains. ## SVE Vector Length @@ -67,7 +67,7 @@ If the hardware doesn't support SVE the program will crash with an illegal instr SVE is a predicate-centric architecture with: - Scalable vector registers - - Z0-Z31 extends NEON's 128-bit V0-V31 + - Z0-Z31 extends Neon's 128-bit V0-V31 - Supported data types - packed DP, SP, half-precision (HP) floating-point elements - packed 64, 32, 16 and 8-bit integer elements @@ -78,11 +78,11 @@ SVE is a predicate-centric architecture with: ### Simple addition example -Take a look at the example code compiled for SVE (left) and for NEON (right): +Take a look at the example code compiled for SVE (left) and for Neon (right): {{< godbolt width="100%" height="700px" mode="diff" lopt="-O3 -march=armv8-a" ropt="-O3 -march=armv8-a+sve" src="int fun(double * restrict a, double * restrict b, int size)\n{\n for (int i=0; i < size; ++i)\n {\n b[i] += a[i];\n }\n}" >}} -Notice how small the SVE assembly is in comparison to NEON. This is due to the predicate behavior which avoids generating assembly for remainder loops (scalar operations performed when the iteration domain is not a multiple of the vector length. +Notice how small the SVE assembly is in comparison to Neon. This is due to the predicate behavior which avoids generating assembly for remainder loops (scalar operations performed when the iteration domain is not a multiple of the vector length). 
Observe what the SVE assembly instructions are doing: diff --git a/content/learning-paths/servers-and-cloud-computing/sve2-match/_index.md b/content/learning-paths/servers-and-cloud-computing/sve2-match/_index.md index d8b4e77243..8f0adad89a 100644 --- a/content/learning-paths/servers-and-cloud-computing/sve2-match/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/sve2-match/_index.md @@ -32,7 +32,7 @@ operatingsystems: - Linux tools_software_languages: - SVE2 -- NEON +- Neon - Runbook further_reading: diff --git a/content/learning-paths/servers-and-cloud-computing/triggering-pmu-events-2/operation.md b/content/learning-paths/servers-and-cloud-computing/triggering-pmu-events-2/operation.md index 07f3fb7d4b..7f1e50d10a 100644 --- a/content/learning-paths/servers-and-cloud-computing/triggering-pmu-events-2/operation.md +++ b/content/learning-paths/servers-and-cloud-computing/triggering-pmu-events-2/operation.md @@ -18,7 +18,7 @@ The operation mix comprises these groups: ### SIMD percentage -To trigger `ASE_SPEC` and `ASE_INST_SPEC`, create a function using NEON instructions: +To trigger `ASE_SPEC` and `ASE_INST_SPEC`, create a function using Neon instructions: ```C .global simd @@ -44,7 +44,7 @@ ASE_SPEC is 1 ASE_INST_SPEC is 3 ``` -The results show `ASE_SPEC` is 1 and `ASE_INST_SPEC` is 3. `ASE_INST_SPEC` counts speculatively executed Advanced SIMD operations. Meanwhile, `ASE_SPEC` counts speculatively executed Advanced SIMD operations, excluding load, store, and move micro-operations that move data to or from the SIMD registers. `ASE_INST_SPEC` counts 1 from LD2 and 2 from ADD: adding, then storing. `ASE_SPEC` only counts 1 from the actual NEON add operation. +The results show `ASE_SPEC` is 1 and `ASE_INST_SPEC` is 3. `ASE_INST_SPEC` counts speculatively executed Advanced SIMD operations. 
Meanwhile, `ASE_SPEC` counts speculatively executed Advanced SIMD operations, excluding load, store, and move micro-operations that move data to or from the SIMD registers. `ASE_INST_SPEC` counts 1 from LD2 and 2 from ADD: adding, then storing. `ASE_SPEC` only counts 1 from the actual Neon add operation. ## Scalar floating point percentage diff --git a/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/1.md b/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/1.md index 58c9ebe668..45111b313f 100644 --- a/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/1.md +++ b/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/1.md @@ -31,7 +31,7 @@ Crafted through extensive benchmarking and optimization, performance libraries c Performance libraries for Arm CPUs - such as the Arm Performance Libraries (APL) - provide highly optimized mathematical functions for scientific computing. An analogous library for accelerating routines on a GPU is cuBLAS, which is available for NVIDIA GPUs. -These libraries can be linked dynamically at runtime or statically during compilation, offering flexibility in deployment. They are designed to support multiple versions of the Arm architecture, including those with NEON and SVE. Generally, only minimal source code changes are required to use these libraries, making them ideal for porting and optimizing applications. +These libraries can be linked dynamically at runtime or statically during compilation, offering flexibility in deployment. They are designed to support multiple versions of the Arm architecture, including those with Neon and SVE. Generally, only minimal source code changes are required to use these libraries, making them ideal for porting and optimizing applications. ### How do I choose the right version of a performance library? 
diff --git a/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/3.md b/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/3.md index 747a377c3a..b3145122a5 100644 --- a/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/3.md +++ b/content/learning-paths/servers-and-cloud-computing/using-and-porting-performance-libs/3.md @@ -7,7 +7,7 @@ layout: learningpathall --- ## Libamath -The `libamath` library from Arm is an optimized subset of the standard library math functions for Arm-based CPUs, providing both scalar and vector functions at different levels of precision. It includes vectorized versions (NEON and SVE) of common math functions found in the standard library, such as those in the `<cmath>` header. +The `libamath` library from Arm is an optimized subset of the standard library math functions for Arm-based CPUs, providing both scalar and vector functions at different levels of precision. It includes vectorized versions (Neon and SVE) of common math functions found in the standard library, such as those in the `<cmath>` header. The trivial snippet below uses the `<cmath>` standard cmath header to calculate the base exponential of a scalar value. @@ -106,7 +106,7 @@ In the context of Arm's AArch64 architecture, vector name mangling follows the s Where the values are given below: - `original_name` - the name of scalar `libm` function. - `isa` - 'n' for Neon, 's' for SVE. -- `mask` - 'M' for masked/predicated version, 'N' for unmasked. Only masked routines are defined for SVE, and only unmasked for NEON. +- `mask` - 'M' for masked/predicated version, 'N' for unmasked. Only masked routines are defined for SVE, and only unmasked for Neon. 
+- `vlen` - the integer number representing vector length expressed as number of lanes. For Neon, ='2' in double-precision and ='4' in single-precision. For SVE, ='x'. - `signature` - 'v' for 1 input floating point or integer argument, 'vv' for 2. For further information, see AArch64's vector function ABI. From 05cd9dabf0c30f2289818265372a44b46e49004b Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 13:33:11 -0400 Subject: [PATCH 34/51] Revise BRBE profile recording and optimization steps Updated instructions for recording BRBE profiles and optimizing with BOLT. --- .../servers-and-cloud-computing/bolt-demo/brbe.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/brbe.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/brbe.md index 0eb0a9a460..a5f89c0972 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/brbe.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/brbe.md @@ -44,10 +44,10 @@ cycles:P: PMU Hardware or event type doesn't support branch stack sampling. Recording BRBE profiles requires a Linux kernel version 6.17 or later. Check your kernel version with: ```bash -perf --version +uname -r ``` ### Optimizing with BRBE -We then record a BRBE profile by running our workload under perf, convert it into a format that BOLT understands, and run the BOLT optimization. +After confirming the availability of BRBE on your system, you can now collect a BRBE profile by running the workload under `perf`. Then convert the collected profile into the format that BOLT expects and run the BOLT optimizer. 
```bash { line_numbers=true } mkdir -p prof @@ -58,8 +58,5 @@ llvm-bolt out/bsort -o out/bsort.opt.brbe --data prof/brbe.fdata \ --dyno-stats ``` - - - ### Further Reading - [Arm Architecture Reference Manual for A-profile architecture](https://developer.arm.com/documentation/ddi0487/latest) From 47b13ddefd1f905bcad6732d770f4bc7702026dd Mon Sep 17 00:00:00 2001 From: Nina Drozd Date: Fri, 13 Mar 2026 17:48:10 +0000 Subject: [PATCH 35/51] Align learning path with recent move of voice assistant project to github * update links in overview, build and performance sections Signed-off-by: Nina Drozd --- .../voice-assistant/2-overview.md | 8 ++++---- .../mobile-graphics-and-gaming/voice-assistant/3-build.md | 2 +- .../voice-assistant/7-performance.md | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/content/learning-paths/mobile-graphics-and-gaming/voice-assistant/2-overview.md b/content/learning-paths/mobile-graphics-and-gaming/voice-assistant/2-overview.md index 5752122390..70b1799fc3 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/voice-assistant/2-overview.md +++ b/content/learning-paths/mobile-graphics-and-gaming/voice-assistant/2-overview.md @@ -36,7 +36,7 @@ This process includes the following stages: The voice assistant pipeline imports and builds a separate module to provide this STT functionality. You can access this at: ``` -https://gitlab.arm.com/kleidi/kleidi-examples/speech-to-text +https://github.com/Arm-Examples/STT-Runner ``` You can build the pipeline for various platforms and independently benchmark the STT functionality: @@ -50,7 +50,7 @@ You can build the pipeline for various platforms and independently benchmark the Currently, this module uses [whisper.cpp](https://github.com/ggml-org/whisper.cpp) and wraps the backend library with a thin C++ layer. The module also provides JNI bindings for developers targeting Android based applications. 
{{% notice %}} -You can get more information on how to build and use this module in the [speech-to-text README](https://gitlab.arm.com/kleidi/kleidi-examples/speech-to-text/-/blob/main/README.md?ref_type=heads) +You can get more information on how to build and use this module in the [speech-to-text README](https://github.com/Arm-Examples/STT-Runner/blob/main/README.md) {{% /notice %}} ## Large Language Model @@ -64,7 +64,7 @@ By default, the LLM runs asynchronously, streaming tokens as they are generated. The voice assistant pipeline imports and builds a separate module to provide this LLM functionality. You can access this at: ``` -https://gitlab.arm.com/kleidi/kleidi-examples/large-language-models +https://github.com/Arm-Examples/LLM-Runner ``` You can build this pipeline for various platforms and independently benchmark the LLM functionality: @@ -86,7 +86,7 @@ Currently, this module provides a thin C++ layer as well as JNI bindings for dev {{% notice %}} -You can get more information on how to build and use this module in the [large-language-models README](https://gitlab.arm.com/kleidi/kleidi-examples/large-language-models/-/blob/main/README.md?ref_type=heads) +You can get more information on how to build and use this module in the [large-language-models README](https://github.com/Arm-Examples/LLM-Runner/blob/main/README.md) {{% /notice %}} ## Text-to-Speech diff --git a/content/learning-paths/mobile-graphics-and-gaming/voice-assistant/3-build.md b/content/learning-paths/mobile-graphics-and-gaming/voice-assistant/3-build.md index 440746ec06..e548fb0af7 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/voice-assistant/3-build.md +++ b/content/learning-paths/mobile-graphics-and-gaming/voice-assistant/3-build.md @@ -11,7 +11,7 @@ layout: learningpathall Start by cloning the repository with the complete example application code: ```bash -git
clone https://github.com/Arm-Examples/Real-Time-Voice-Assistant.git voice-assistant.git ``` ## Build the Voice Assistant diff --git a/content/learning-paths/mobile-graphics-and-gaming/voice-assistant/7-performance.md b/content/learning-paths/mobile-graphics-and-gaming/voice-assistant/7-performance.md index 20181d2f85..b9b003d453 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/voice-assistant/7-performance.md +++ b/content/learning-paths/mobile-graphics-and-gaming/voice-assistant/7-performance.md @@ -15,7 +15,7 @@ layout: learningpathall You can also benchmark the LLM functionality on Android phone outside of RTVA application. For this, you can use the Large Language Models repository: ``` -https://gitlab.arm.com/kleidi/kleidi-examples/large-language-models +https://github.com/Arm-Examples/LLM-Runner ``` and build for your chosen LLM backend, ensure that `NDK_PATH` is set properly. SME kernels are enabled by default, so let's first build with SME disabled: @@ -26,7 +26,7 @@ cmake --build ./build ``` {{% notice %}} -For troubleshooting any build issues, refer to [large-language-models README](https://gitlab.arm.com/kleidi/kleidi-examples/large-language-models/-/blob/main/README.md?ref_type=heads) +For troubleshooting any build issues, refer to [large-language-models README](https://github.com/Arm-Examples/LLM-Runner/blob/main/README.md) {{% /notice %}} ### Phone setup From 8b5ab96f08fb49c8b480e56cdf2332d1af20a01c Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 14:14:32 -0400 Subject: [PATCH 36/51] Enhance instrumentation section in documentation Refine the explanation of instrumentation, emphasizing its profiling technique and implications for binary size and runtime overhead. Clarify the process for optimizing binaries with BOLT using instrumentation. 
--- .../bolt-demo/instrumentation.md | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/instrumentation.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/instrumentation.md index e7b20fbe14..6043b97cff 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/instrumentation.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/instrumentation.md @@ -6,18 +6,23 @@ weight: 6 layout: learningpathall --- -### What is instrumentation +### What is instrumentation? -Instrumentation is a profiling method, not specific to BOLT, that augments code with counters to record exact execution counts. +Instrumentation is a profiling technique that inserts counters into a program to record how often different parts of the code execute. Unlike sampling-based methods, instrumentation collects exact execution counts. -For BOLT, Instrumentation provides complete execution counts for the paths that run. This gives a near-optimal profile for code-layout optimization and therefore the highest optimization potential, without requiring special hardware. +BOLT can instrument a binary to record control-flow behavior such as how often basic blocks and edges execute. This produces a complete execution profile, which allows BOLT to make highly accurate code layout decisions. -Instrumentation can increase binary size and add significant runtime overhead, making it less attractive for production use. It is mainly used when other profiling methods, such as BRBE, are unavailable, or for comparison to understand the maximum optimization potential. +Because instrumentation records exact execution counts, it often produces near-optimal profiles for code layout optimization. It does not require specialized hardware support. + +However, instrumentation increases the size of the binary and adds extra instructions that update profiling counters. 
These changes can introduce significant runtime overhead, which makes instrumentation less suitable for production workloads. + +Developers typically use instrumentation when other profiling methods, such as BRBE, are unavailable or when they want to measure the maximum optimization potential of BOLT. ### Optimizing with instrumentation -We first build an instrumented binary and then execute the workload to generate a profile. -By default, BOLT writes the profile to `/tmp/prof.fdata`, unless a path is specified using the `--instrumentation-file` flag. -Finally, we use the generated profile to optimize the binary with BOLT. +First, generate an instrumented version of the binary. BOLT inserts counters into the program to record how often different code paths execute. +Next, run the instrumented program to collect the execution profile. +By default, BOLT writes the profile to `/tmp/prof.fdata`. You can specify a different location using the `--instrumentation-file` option. +Finally, run BOLT again and provide the collected profile to produce an optimized binary. ```bash llvm-bolt --instrument out/bsort -o out/bsort.instr @@ -26,3 +31,4 @@ llvm-bolt out/bsort -o out/bsort.opt.instr --data /tmp/prof.fdata \ -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions \ --dyno-stats ``` +This process produces an optimized binary named out/bsort.opt.instr, which uses the collected execution profile to improve code layout. From 88cc576049c54809fd196f4ae36f12f34d2f840f Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 14:28:36 -0400 Subject: [PATCH 37/51] Revise SPE documentation for clarity and detail Updated the explanation of SPE, its features, and usage instructions. Improved clarity and detail regarding branch sampling and optimization steps. 
--- .../bolt-demo/spe.md | 61 +++++++++++-------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/spe.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/spe.md index fb86411b9e..71c033550d 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/spe.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/spe.md @@ -11,38 +11,26 @@ layout: learningpathall --- ### What is SPE -SPE stands for Statistical Profiling Extension. It is an Arm hardware unit that provides low-overhead, statistical sampling of program execution. -SPE samples microarchitectural events such as instruction execution, memory accesses, and branches. +SPE (Statistical Profiling Extension) is an Arm hardware profiling unit that collects statistical samples of program execution with very low runtime overhead. +SPE periodically samples microarchitectural events such as instruction execution, memory accesses, and branches. The processor records information about the sampled event in a trace buffer, which profiling tools later decode. For BOLT, SPE branch samples are the relevant input as they provide an edge-based control-flow profile. -Unlike [BRBE](../brbe), SPE does not record sequences of taken branches. -Each sample captures only a single transition between two program locations, representing a single edge in the control-flow graph. +Unlike [BRBE](../brbe), SPE does not record sequences of taken branches. Instead, each sample describes only a single branch transition between two program locations, representing a single edge in the control-flow graph. Because of this limited context, SPE typically produces less detailed control-flow profiles than BRBE. -Some implementations also support the Previous Branch Target (PBT) feature. -This feature records 1 taken branch in addition to the edge. -This provides a depth-1 branch history. 
It extends standard SPE sampling but remains shallower than BRBE. +Some processors also support the Previous Branch Target (PBT) feature. PBT records the target of the most recently taken branch in addition to the sampled edge. This provides a depth-1 branch history, which slightly improves the quality of the reconstructed control-flow profile. -### When to use SPE -SPE provides less detailed control-flow information than BRBE. It can still capture useful branch behavior and guide code layout decisions, making it a good alternative when BRBE is unavailable or instrumentation overhead is prohibitive. - -### Optimizing with SPE -We check [SPE availability](#availability) before recording a profile. -We then record an SPE profile by running our workload under perf, convert it into a format that BOLT understands, and run the BOLT optimization. - -```bash { line_numbers=true } -mkdir -p prof -perf record -e arm_spe/branch_filter=1/u -o prof/spe.data -- ./out/bsort -perf2bolt -p prof/spe.data -o prof/spe.fdata ./out/bsort --spe -llvm-bolt out/bsort -o out/bsort.opt.spe --data prof/spe.fdata \ - -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions \ - --dyno-stats -``` +Even with PBT, SPE provides less branch history than BRBE, but it remains a useful profiling option when BRBE is not available. +### When to use SPE +SPE provides less detailed control-flow information than BRBE because it samples individual branch events rather than recording full branch histories. Despite this limitation, SPE can still capture useful branch behavior and guide code layout decisions. +Use SPE when BRBE is unavailable or when instrumentation overhead is too high for the workload. In these cases, SPE offers a practical compromise between profiling overhead and profile quality. 
### Availability -SPE is an optional feature in processors that implement [Armv8.1](https://developer.arm.com/documentation/109697/2025_12/Feature-descriptions/The-Armv8-2-architecture-extension#md447-the-armv82-architecture-extension__feat_FEAT_SPE) or later. To check availability, we record a trace. +SPE is an optional processor feature called **FEAT_SPE (Statistical Profiling Extension)**, introduced in the [Armv8.1 architecture](https://developer.arm.com/documentation/109697/2025_12/Feature-descriptions/The-Armv8-2-architecture-extension#md447-the-armv82-architecture-extension__feat_FEAT_SPE). +To check whether your system supports SPE, attempt to record an SPE trace using `perf`. + +If SPE is available, the command records the trace successfully: -On a successful recording we see: ```bash { command_line="user@host | 2-5"} perf record -e arm_spe/branch_filter=1/u -o prof/spe.data -- ./out/bsort Bubble sorting 10000 elements @@ -51,7 +39,7 @@ Bubble sorting 10000 elements [ perf record: Captured and wrote 13.458 MB prof/spe.data ] ``` -When unavailable: +If the processor or kernel does not support SPE, perf reports an error similar to the following: ```bash { command_line="user@host | 2-12"} perf record -e arm_spe/branch_filter=1/u -o prof/spe.data -- ./out/bsort @@ -66,12 +54,31 @@ Run 'perf list' for a list of valid events -e, --event event selector. use 'perf list' to list available events ``` +This error indicates that the system does not expose the arm_spe PMU, which usually means that the processor or kernel does not support SPE profiling. -To record an SPE trace we need a Linux system that is version 6.14 or later. We can check the version using: +Recording SPE traces requires a Linux kernel version 6.14 or later. Check the kernel version with: ```bash -perf --version +uname -r +``` +### Optimizing with SPE +Next, collect an SPE profile by running the workload under `perf`. 
Then convert the recorded trace into a format that BOLT can use and run the BOLT optimizer. +The process consists of three steps: +* Record an SPE profile using perf +* Convert the profile into BOLT’s .fdata format +* Run BOLT to generate an optimized binary. + +```bash { line_numbers=true } +mkdir -p prof +perf record -e arm_spe/branch_filter=1/u -o prof/spe.data -- ./out/bsort +perf2bolt -p prof/spe.data -o prof/spe.fdata ./out/bsort --spe +llvm-bolt out/bsort -o out/bsort.opt.spe --data prof/spe.fdata \ + -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions \ + --dyno-stats ``` +The `perf record` command collects branch samples using the SPE hardware profiler. +The `perf2bolt` tool converts the SPE trace into BOLT’s .fdata profile format, using the --spe option to interpret the samples correctly. +Finally, `llvm-bolt` uses the generated profile to reorganize functions and basic blocks in the binary, producing an optimized binary named `out/bsort.opt.spe`. ### Further Reading - [Arm Statistical Profiling Extension: Performance Analysis Methodology White Paper](https://developer.arm.com/documentation/109429/latest/) From c08c067e79ccb5171445cda10b574ce7c9ae3dfe Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 14:46:30 -0400 Subject: [PATCH 38/51] Clarify PMU definitions and usage in BOLT Updated the PMU section to clarify definitions and usage. Improved explanations of PMU profiling and its application in BOLT optimization. 
--- .../bolt-demo/pmu.md | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/pmu.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/pmu.md index 77cab0c413..456c5f0ded 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/pmu.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/pmu.md @@ -11,22 +11,28 @@ layout: learningpathall --- ### What is PMU -PMU stands for Performance Monitoring Unit. It is an Arm hardware unit that provides event-based sampling of program execution. -PMU samples microarchitectural events such as instructions, cycles, branches, and other hardware events. This form of profiling is widely available across Arm systems. +The PMU (Performance Monitoring Unit) is a hardware component that records microarchitectural events during program execution. It supports event-based sampling of events such as instructions, cycles, cache accesses, and branches. Most Arm processors provide a PMU, which makes this profiling method widely available. -For BOLT, PMU provides samples that capture coarse hotness information. Samples are associated with instruction addresses and therefore with *basic blocks*, which are straight-line sequences of instructions that always execute in full once entered. This indicates how often those blocks run, rather than how control flows between them. -For this reason, PMU profiling is often referred to as *basic sampling* rather than *edge sampling*. While it is possible to sample branch events using the PMU, these samples do not include branch target information and therefore still do not provide control-flow edge information. +For BOLT, PMU provides samples that capture coarse hotness information. Samples are associated with instruction addresses and therefore with basic blocks, which are straight-line sequences of instructions that always execute in full once entered. 
This indicates how often those blocks run, rather than how control flows between them. +For this reason, PMU profiling is often referred to as *basic sampling* rather than *edge sampling*. While it is possible to sample branch events using the PMU, these samples do not include branch target information and therefore do not provide control-flow edge information. Because functions consist of many basic blocks, PMU sampling can provide useful information at the function level. This makes it suitable for coarse-grained optimizations such as function reordering, but can be less effective for fine-grained block layout. Increasing the sampling frequency can improve coverage, but at the cost of higher profile collection overhead. ### When to use PMU -PMU is most useful when BRBE and SPE are unavailable and instrumentation is not feasible. -It provides the least detailed control-flow information among the available methods, so it is typically used as a fallback option. +Use PMU profiling when BRBE and SPE are unavailable and instrumentation is not practical for the workload. +PMU provides the least detailed control-flow information among the profiling methods described in this tutorial. Because it samples instruction addresses rather than control-flow edges, it mainly reveals which parts of the program execute frequently. +For this reason, PMU profiling typically serves as a fallback option for BOLT when more informative profiling methods are not available. -### Optimizing with PMU -We record a PMU profile by running our workload under perf, convert it into a format that BOLT understands, and then run the BOLT optimization. -This tutorial uses instruction sampling. +### Availability +All Arm systems that support the Linux perf tool provide access to PMU events. PMU profiling does not require any additional hardware features beyond standard performance monitoring support. +### Optimizing with PMU +Record a PMU profile by running the workload under `perf`. 
Then convert the recorded profile into a format that BOLT understands and run the BOLT optimizer. +This tutorial uses instruction sampling, which attributes samples to the instructions that were executing when the sampling event occurred. +The process consists of three steps: + * Record a PMU profile using perf + * Convert the profile into BOLT’s .fdata format + * Run BOLT to generate an optimized binary ```bash { line_numbers=true } mkdir -p prof @@ -37,5 +43,6 @@ llvm-bolt out/bsort -o out/bsort.opt.pmu --data prof/pmu.fdata \ --dyno-stats ``` -### Availability -PMU events are available on all Arm systems that support perf. No additional hardware features are required. +The `perf record` command collects samples of the instructions event in user space. +The `perf2bolt` tool converts the collected samples into BOLT’s .fdata profile format using the --ba option, which interprets the samples as basic-block counts. +Finally, `llvm-bolt` uses the generated profile to reorganize functions and basic blocks in the binary, producing an optimized binary named `out/bsort.opt.pmu`. From 98e7b0aba422abc820f34f6b21f5c83242898879 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 14:56:08 -0400 Subject: [PATCH 39/51] Clarify BOLT optimization verification steps Updated language for clarity and precision regarding BOLT optimization verification. 
--- .../bolt-demo/verify-optimization.md | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/verify-optimization.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/verify-optimization.md index 4390b175e0..9cac3313b7 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/verify-optimization.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/verify-optimization.md @@ -9,10 +9,10 @@ layout: learningpathall ### Verify with runtime {{% notice Note %}} -The example below uses a [BRBE](../brbe) optimized binary. The same verification applies to all BOLT profiling methods. +The example below uses a [BRBE](../brbe) optimized binary. You can apply the same verification steps to binaries optimized using the other BOLT profiling methods. {{% /notice %}} -We start by checking the runtime of the original and optimized BubbleSort binaries. A speedup is the first indication that BOLT improved the layout. +First, compare the runtime of the original and optimized BubbleSort binaries. A shorter runtime provides an initial indication that BOLT improved the code layout. ```bash { command_line="user@host | 2-4,6-8"} time out/bsort @@ -25,14 +25,14 @@ time out/bsort.opt.brbe out/bsort.opt.brbe 0.15s user 0.00s system 99% cpu 0.148 total ``` -In this example, we see a first indication of improvement from the speedup. It is large, around 2x, because the input program is intentionally pathological. Real applications may see smaller improvements. +In this example, the optimized binary runs in about 147 ms, compared with 280 ms for the original binary. This corresponds to roughly a 2× speedup. +The improvement is large because the example program intentionally creates poor code locality. Real applications typically show smaller but still meaningful improvements after BOLT optimization. 
### Verify with hardware metrics -We now apply the [TopDown Methodology](https://developer.arm.com/documentation/109542/02/Arm-Topdown-methodology) again to confirm that BOLT improved the layout. -Runtime shows the effect, but TopDown confirms how the changes appear in the hardware metrics. - -We run the same tool that we used when checking whether the input program was a good candidate, but this time we check the optimized binary, for example the BRBE-optimized one. +Next, apply the [TopDown Methodology](https://developer.arm.com/documentation/109542/02/Arm-Topdown-methodology) again to verify that BOLT improved the code layout. +The runtime comparison shows the performance impact, but the TopDown metrics reveal how the optimization affects processor behavior. +Run the same tool used earlier when evaluating whether the program was a good BOLT candidate. This time, run it on the optimized binary, for example, the BRBE-optimized version. {{< tabpane code=true >}} {{< tab header="topdown-tool" language="bash" output_lines="2-21">}} @@ -72,10 +72,12 @@ We run the same tool that we used when checking whether the input program was a {{< /tab >}} {{< /tabpane >}} -We compare these metrics with the earlier results. Front-end bound and L1I MPKI should be lower after optimization. +Compare these metrics with the earlier results collected from the original binary. After optimization, both frontend bound and L1I MPKI should decrease. + +In this example, the optimized program is 36% frontend bound, down from 55%. The L1I cache MPKI drops to nearly 0, which indicates a significant improvement in instruction locality. -We now see that the optimized program is **36%** front-end bound, down from 55%. In addition, the L1I MPKI is close to **0**, showing that code layout improved. This result is unusually low because the input program is intentionally pathological. +This value is unusually low because the tutorial program intentionally creates poor code locality. 
-The Branch MPKI also dropped to **10** from 16 because BOLT can improve branch prediction by swapping the fall-through and taken paths based on profile data. +The Branch MPKI also decreases—from 16 to about 10—because BOLT can improve branch prediction. It uses profile data to adjust code layout and swap fall-through and taken paths when beneficial. -We can also compute these MPKIs manually using `perf stat`, as described in the [Good BOLT Candidates](../good-candidates) page. +You can also compute the MPKI values manually using `perf stat`, as described in the [Good BOLT Candidates](../good-candidates) section. From ecaeeb51eb9cd55e2d769f85e44814b7aa7629c7 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 14:58:23 -0400 Subject: [PATCH 40/51] Update verify-optimization.md --- .../bolt-demo/verify-optimization.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/verify-optimization.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/verify-optimization.md index 9cac3313b7..bb6b5a7547 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/verify-optimization.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/verify-optimization.md @@ -81,3 +81,7 @@ This value is unusually low because the tutorial program intentionally creates p The Branch MPKI also decreases—from 16 to about 10—because BOLT can improve branch prediction. It uses profile data to adjust code layout and swap fall-through and taken paths when beneficial. You can also compute the MPKI values manually using `perf stat`, as described in the [Good BOLT Candidates](../good-candidates) section. + +## Summary + +In this learning path, you learned how to use BOLT to optimize binary code layout using several profiling methods. The optimized binaries improved instruction locality, reduced frontend stalls, and delivered measurable performance gains. 
From 3aa43b914c6d491ca191dd3f6c129e9656e5cc2d Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 15:01:33 -0400 Subject: [PATCH 41/51] Update environment setup instructions for AArch64 --- .../servers-and-cloud-computing/bolt-demo/setup.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md index 0be10ed92b..ee445ad72d 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md @@ -8,7 +8,7 @@ layout: learningpathall ### Environment setup -On your AArch64 Linux bare-metal instance, navigate to your home directory (or another empty working directory) and create a file named `bsort.cpp` with the following content: +On your AArch64 Linux machine, navigate to your home directory (or another empty working directory) and create a file named `bsort.cpp` with the following content: ```cpp #include From ff3a83fa1565d3f6ed89bb7bece9b932f495347b Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 15:05:47 -0400 Subject: [PATCH 42/51] Remove duplicate mkdir commands in setup.md --- .../servers-and-cloud-computing/bolt-demo/setup.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md index ee445ad72d..96e7383b1e 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md +++ b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md @@ -164,11 +164,9 @@ You will look at why relocations matter [later](#why-relocations) in this Learni {{< tabpane code=true >}} {{< tab header="GNU" language="bash">}} -mkdir -p out gcc bsort.cpp -o out/bsort -O3 -Wl,--emit-relocs -fno-toplevel-reorder {{< /tab >}} {{< tab header="LLVM" 
language="bash">}} -mkdir -p out clang bsort.cpp -o out/bsort -O3 -fuse-ld=lld -ffunction-sections -Wl,--emit-relocs -Wl,--symbol-ordering-file=orderfile.txt {{< /tab >}} {{< /tabpane >}} From 86d08a88a42f716eccaebfba2db856cd72887ce3 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 15:15:41 -0400 Subject: [PATCH 43/51] Fix link casing for Neon Intrinsics documentation --- .../mobile-graphics-and-gaming/android_neon/_index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/learning-paths/mobile-graphics-and-gaming/android_neon/_index.md b/content/learning-paths/mobile-graphics-and-gaming/android_neon/_index.md index 7b69cafb65..234ac62fa3 100644 --- a/content/learning-paths/mobile-graphics-and-gaming/android_neon/_index.md +++ b/content/learning-paths/mobile-graphics-and-gaming/android_neon/_index.md @@ -37,7 +37,7 @@ further_reading: type: blog - resource: title: Using Neon Intrinsics - link: https://developer.arm.com/documentation/den0018/a/Neon-Intrinsics/Using-Neon-intrinsics + link: https://developer.arm.com/documentation/den0018/a/NEON-Intrinsics/Using-NEON-intrinsics type: documentation - resource: title: Intrinsics From 4678fcf5c0a3e4b1c6a922bb10c2cdc61e1684fb Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 15:20:40 -0400 Subject: [PATCH 44/51] Fix formatting of SIMD instructions section in intro.md --- content/learning-paths/cross-platform/intrinsics/intro.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/learning-paths/cross-platform/intrinsics/intro.md b/content/learning-paths/cross-platform/intrinsics/intro.md index e39c5d14c7..79d2c83d5c 100644 --- a/content/learning-paths/cross-platform/intrinsics/intro.md +++ b/content/learning-paths/cross-platform/intrinsics/intro.md @@ -16,7 +16,7 @@ Intrinsics are functions which are built into the compiler and not part of a lib One use of intrinsics is to access SIMD (single-instruction, multiple-data) instructions directly 
from C/C++ for improved application performance. Intrinsics are easier to work with compared to assembly language, but they often pose a challenge when porting source code to a new architecture. -Intel Streaming SIMD Extensions (SSE) and [Arm Neon](https://developer.arm.com/documentation/dht0002/a/Introducing-Neon/Neon-architecture-overview/Neon-instructions) SIMD instructions increase processor throughput by performing multiple computations with a single instruction. Over the years, Intel and Arm have introduced a variety of SIMD extensions. Neon is used in many of the Arm Cortex-A, Cortex-R, and Neoverse processors. +Intel Streaming SIMD Extensions (SSE) and [Arm Neon](https://developer.arm.com/documentation/dht0002/a/Introducing-NEON/NEON-architecture-overview/NEON-instructions) SIMD instructions increase processor throughput by performing multiple computations with a single instruction. Over the years, Intel and Arm have introduced a variety of SIMD extensions. Neon is used in many of the Arm Cortex-A, Cortex-R, and Neoverse processors. There are generally 3 ways to program SIMD hardware: - The C/C++ compiler recognizes opportunities to use SIMD instructions and inserts them automatically (with or without some guidance) From 194f33295b2db21a8964f99fd15a18e528762919 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 15:21:59 -0400 Subject: [PATCH 45/51] Fix NEON equivalents link for _mm_add_ps intrinsic Updated the link for the NEON equivalents of the _mm_add_ps intrinsic to ensure it points to the correct resource. 
--- .../cross-platform/simd-info-demo/simdinfo-example1-cont.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-cont.md b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-cont.md index f11cf79897..3a1cf00695 100644 --- a/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-cont.md +++ b/content/learning-paths/cross-platform/simd-info-demo/simdinfo-example1-cont.md @@ -22,7 +22,7 @@ For the operations in your SSE4.2 example, you have the following intrinsics: To gain a deeper understanding of how these intrinsics work and to surface detailed descriptions, you can use the search feature on SIMD.info. Simply enter the name of the intrinsic in the search bar. You can either select from the suggested results or perform a direct search to retrieve information about each intrinsic. -1. By searching for [**`_mm_add_ps`**](https://simd.info/c_intrinsic/_mm_add_ps) you will retrieve information about its purpose, the result type, assembly instructions, prototypes, and an example demonstration. By clicking the **engine** option **"Neon"** you can find its [equivalents](https://simd.info/eq/_mm_add_ps/Neon/) for this engine. The equivalents are: **`vaddq_f32`**, **`vadd_f32`**. [Intrinsics comparison](https://simd.info/c-intrinsics-compare?compare=vaddq_f32:vadd_f32) helps you find the right one. Based on the prototype provided, you can choose [**`vaddq_f32`**](https://simd.info/c_intrinsic/vaddq_f32) as it works with 128-bit vectors which is the same as **SSE4.2**. +1. By searching for [**`_mm_add_ps`**](https://simd.info/c_intrinsic/_mm_add_ps) you will retrieve information about its purpose, the result type, assembly instructions, prototypes, and an example demonstration. By clicking the **engine** option **"Neon"** you can find its [equivalents](https://simd.info/eq/_mm_add_ps/NEON/) for this engine. 
The equivalents are: **`vaddq_f32`**, **`vadd_f32`**. [Intrinsics comparison](https://simd.info/c-intrinsics-compare?compare=vaddq_f32:vadd_f32) helps you find the right one. Based on the prototype provided, you can choose [**`vaddq_f32`**](https://simd.info/c_intrinsic/vaddq_f32) as it works with 128-bit vectors which is the same as **SSE4.2**. 2. Moving to the next intrinsic, **`_mm_mul_ps`**, you can use the [Intrinsics Tree](https://simd.info/tag-tree) on SIMD.info to find the equivalent. From 8c6e38f232a48c570c1cf46bfc308e0972f8cc79 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 15:27:28 -0400 Subject: [PATCH 46/51] Fix typo in llama-chatbot.md regarding NEON flag --- .../servers-and-cloud-computing/llama-cpu/llama-chatbot.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-chatbot.md b/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-chatbot.md index 18ff4e0972..ca780a6a86 100644 --- a/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-chatbot.md +++ b/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-chatbot.md @@ -232,7 +232,7 @@ llama_perf_context_print: total time = 8427.77 ms / 525 tokens The `system_info` printed from llama.cpp highlights important architectural features present on your hardware that improve the performance of the model execution. 
In the output shown above from running on an AWS Graviton4 instance, you will see: - * Neon = 1 This flag indicates support for Arm's Neon technology which is an implementation of the Advanced SIMD instructions + * NEON = 1 This flag indicates support for Arm's Neon technology which is an implementation of the Advanced SIMD instructions * ARM_FMA = 1 This flag indicates support for Arm Floating-point Multiply and Accumulate instructions * MATMUL_INT8 = 1 This flag indicates support for Arm int8 matrix multiplication instructions * SVE = 1 This flag indicates support for the Arm Scalable Vector Extension From 5c0cfa063bd68da835fe95a394fc0aeb3504f111 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 15:29:21 -0400 Subject: [PATCH 47/51] Correct spelling of 'Neon' to 'NEON' in output --- .../processwatch/running-processwatch.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/content/learning-paths/servers-and-cloud-computing/processwatch/running-processwatch.md b/content/learning-paths/servers-and-cloud-computing/processwatch/running-processwatch.md index d2eedbbd43..f2f530e2f0 100644 --- a/content/learning-paths/servers-and-cloud-computing/processwatch/running-processwatch.md +++ b/content/learning-paths/servers-and-cloud-computing/processwatch/running-processwatch.md @@ -45,7 +45,7 @@ By default, Process Watch: * Prints results until the tool is killed (via Ctrl+c). * Prints all results in a table format on `stdout`. * Profiles all running processes. - * Displays counts for the default filters, which are 'FPARMv8', 'Neon', 'SVE', and 'SVE2'. + * Displays counts for the default filters, which are 'FPARMv8', 'NEON', 'SVE', and 'SVE2'. * Sets the sample period to every 10000 events. 
## Default Process Watch output @@ -57,7 +57,7 @@ sudo ./processwatch The output should look like: ```output -PID NAME FPARMv8 Neon SVE SVE2 %TOTAL TOTAL +PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL ALL ALL 0.00 0.29 0.00 0.00 100.00 346 17400 processwatch 0.00 0.36 0.00 0.00 80.64 279 254 systemd-journal 0.00 0.00 0.00 0.00 13.01 45 @@ -66,14 +66,14 @@ ALL ALL 0.00 0.29 0.00 0.00 100.00 346 560 snapd 0.00 0.00 0.00 0.00 1.16 04 296 multipathd 0.00 0.00 0.00 0.00 0.58 02 -PID NAME FPARMv8 Neon SVE SVE2 %TOTAL TOTAL +PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL ALL ALL 3.57 12.86 0.00 0.00 100.00 140 17400 processwatch 3.73 13.43 0.00 0.00 95.71 134 4939 sshd 0.00 0.00 0.00 0.00 2.86 04 296 multipathd 0.00 0.00 0.00 0.00 0.71 01 560 snapd 0.00 0.00 0.00 0.00 0.71 01 -PID NAME FPARMv8 Neon SVE SVE2 %TOTAL TOTAL +PID NAME FPARMv8 NEON SVE SVE2 %TOTAL TOTAL ALL ALL 1.18 5.12 0.00 0.00 100.00 254 17400 processwatch 1.19 5.16 0.00 0.00 99.21 252 6651 packagekitd 0.00 0.00 0.00 0.00 0.39 01 From 2ced1e3c333986e9b51798718e2143bf978562f5 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 15:31:52 -0400 Subject: [PATCH 48/51] Fix references to Neon instructions in documentation Updated documentation to remove references to 'Neon' instructions and corrected workload names in processwatch commands. 
--- .../processwatch/using-processwatch.md | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/content/learning-paths/servers-and-cloud-computing/processwatch/using-processwatch.md b/content/learning-paths/servers-and-cloud-computing/processwatch/using-processwatch.md index bd4b434138..c80a121be8 100644 --- a/content/learning-paths/servers-and-cloud-computing/processwatch/using-processwatch.md +++ b/content/learning-paths/servers-and-cloud-computing/processwatch/using-processwatch.md @@ -29,61 +29,61 @@ Compile the workload without applying any optimizations: ```bash aarch64-linux-gnu-gcc workload.c -o workload_none -O0 ``` -Now, run the workload in the background and launch `processwatch` on the workload to detect the use of Neon and SVE instructions: +Now, run the workload in the background and launch `processwatch` on the workload to detect the use of Neon and SVE instructions: ```bash ./workload_none & [1] 126958 -sudo ./processwatch -p 126958 -f HasNEON -f HasSVEorSME +sudo ./processwatch -p 126958 -f HasNEON -f HasSVEorSME ``` You will need to change the PID in the `processwatch` command with the PID of the workload running in the background. The output should look like: ```output -PID NAME NEON SVEorSME %TOTAL TOTAL +PID NAME NEON SVEorSME %TOTAL TOTAL ALL ALL 0.00 0.00 100.00 24726 126958 workload_none 0.00 0.00 100.00 24726 -PID NAME NEON SVEorSME %TOTAL TOTAL +PID NAME NEON SVEorSME %TOTAL TOTAL ALL ALL 0.00 0.00 100.00 26006 126958 workload_none 0.00 0.00 100.00 26006 ^C ``` -You can see that in this case, the workload is not making use of Neon or SVE instructions. +You can see that in this case, the workload is not making use of Neon or SVE instructions. 
-## Case 2: Use Neon instructions +## Case 2: Use Neon instructions -Now recompile the same workload to make use of Neon instructions: +Now recompile the same workload to make use of Neon instructions: ```bash -aarch64-linux-gnu-gcc workload.c -o workload_neon -O2 -ftree-vectorize -march=armv8.6-a +aarch64-linux-gnu-gcc workload.c -o workload_neon -O2 -ftree-vectorize -march=armv8.6-a ``` -Run the workload in the background and launch `processwatch` on the workload to detect the use of Neon and SVE instructions: +Run the workload in the background and launch `processwatch` on the workload to detect the use of Neon and SVE instructions: ```bash -./workload_neon & +./workload_neon & [1] 126987 -sudo ./processwatch -p 126987 -f HasNEON -f HasSVEorSME +sudo ./processwatch -p 126987 -f HasNEON -f HasSVEorSME ``` You will need to change the PID in the `processwatch` command with the PID of the workload running in the background. The output should look like: ```output -PID NAME NEON SVEorSME %TOTAL TOTAL +PID NAME NEON SVEorSME %TOTAL TOTAL ALL ALL 31.75 0.00 100.00 24828 -126987 workload_neon 31.75 0.00 100.00 24828 +126987 workload_neon 31.75 0.00 100.00 24828 -PID NAME NEON SVEorSME %TOTAL TOTAL +PID NAME NEON SVEorSME %TOTAL TOTAL ALL ALL 32.45 0.00 100.00 26143 -126987 workload_neon 32.45 0.00 100.00 26143 +126987 workload_neon 32.45 0.00 100.00 26143 ^C ``` -You can now see the workload is retiring Neon instructions as you would expect. +You can now see the workload is retiring Neon instructions as you would expect. -You can run `objdump` on the binary to view the disassembled Neon instructions: +You can run `objdump` on the binary to view the disassembled Neon instructions: ```bash -objdump -S workload_neon +objdump -S workload_neon ``` The output should look like: ```output @@ -108,22 +108,22 @@ Recompile the workload again. 
This time include support for SVE instructions: ```bash aarch64-linux-gnu-gcc workload.c -o workload_sve -O2 -ftree-vectorize -march=armv8.5-a+sve ``` -Run the workload in the background and launch `processwatch` on the workload to detect the use of Neon and SVE instructions: +Run the workload in the background and launch `processwatch` on the workload to detect the use of Neon and SVE instructions: ```bash ./workload_sve & [1] 126997 -sudo ./processwatch -p 126997 -f HasNEON -f HasSVEorSME +sudo ./processwatch -p 126997 -f HasNEON -f HasSVEorSME ``` You will need to change the PID in the `processwatch` command with the PID of the workload running in the background. The output should look like: ```output -PID NAME Neon SVEorSME %TOTAL TOTAL +PID NAME NEON SVEorSME %TOTAL TOTAL ALL ALL 0.00 96.68 100.00 24914 126997 workload_sve 0.00 96.68 100.00 24914 -PID NAME Neon SVEorSME %TOTAL TOTAL +PID NAME NEON SVEorSME %TOTAL TOTAL ALL ALL 0.00 96.74 100.00 26137 126997 workload_sve 0.00 96.74 100.00 26137 ^C From f42f59578bad84722307f68cc01d1a09f4 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 15:41:21 -0400 Subject: [PATCH 49/51] Mark Isaac Sim guide as draft Set the draft status for the Isaac Sim robotics workflow guide. 
--- .../laptops-and-desktops/dgx_spark_isaac_robotics/_index.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/_index.md b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/_index.md index bada253138..d4b5893f5f 100644 --- a/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/_index.md +++ b/content/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/_index.md @@ -1,6 +1,9 @@ --- title: Build Robot Simulation and Reinforcement Learning Workflows with Isaac Sim and Isaac Lab on DGX Spark +draft: true +cascade: + draft: true minutes_to_complete: 90 From 99f5b0926f5cdf7a30da769b9adbdc607fdabb63 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 15:58:01 -0400 Subject: [PATCH 50/51] Add new entries to .wordlist.txt --- .wordlist.txt | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.wordlist.txt b/.wordlist.txt index 9fe8dba959..58dcaf384b 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -5829,3 +5829,19 @@ svfloat svptrue svst vexpq +Agyeman +ConnMan +DockerContainer +Neethu +PyTest +Pytest +Testcontainers +abcdef +cdef +fedcba +kwargs +pytest +reconnection +stdin +testcontainers +ttyUSB From c4827cd8c47ae3b21578b55d2c2e4669491d8c43 Mon Sep 17 00:00:00 2001 From: pareenaverma Date: Fri, 13 Mar 2026 20:06:48 +0000 Subject: [PATCH 51/51] Bolt LP review --- .../bolt-demo/bsort.cpp | 113 ------------------ .../bolt-demo/orderfile.txt | 10 -- .../bolt-demo/setup.md | 2 +- 3 files changed, 1 insertion(+), 124 deletions(-) delete mode 100644 content/learning-paths/servers-and-cloud-computing/bolt-demo/bsort.cpp delete mode 100644 content/learning-paths/servers-and-cloud-computing/bolt-demo/orderfile.txt diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/bsort.cpp b/content/learning-paths/servers-and-cloud-computing/bolt-demo/bsort.cpp deleted file mode 100644 index 298c2afd3a..0000000000 --- 
a/content/learning-paths/servers-and-cloud-computing/bolt-demo/bsort.cpp +++ /dev/null @@ -1,113 +0,0 @@ -#include -#include -#include - -#define ARRAY_LEN 10000 -#define FUNC_COPIES 5 -volatile bool Cond = false; -#define COND() (__builtin_expect(Cond, true)) - -#define NOPS(N) \ - asm volatile( \ - ".rept %0\n" \ - "nop\n" \ - ".endr\n" \ - : : "i"(N) : "memory") - -// Swap functionality plus some cold blocks. -#define SWAP_FUNC(ID) \ - static __attribute__((noinline)) \ - void swap##ID(int *left, int *right) { \ - if (COND()) NOPS(300); \ - int tmp = *left; \ - if (COND()) NOPS(300); else *left = *right; \ - if (COND()) NOPS(300); else *right = tmp; \ - } - -// Aligned at 16KiB -#define COLD_FUNC(ID) \ - static __attribute__((noinline, aligned(16384), used)) \ - void cold_func##ID(void) { \ - asm volatile("nop"); \ - } - -// Create copies of swap, and interleave with big chunks of cold code. -SWAP_FUNC(1) COLD_FUNC(1) -SWAP_FUNC(2) COLD_FUNC(2) -SWAP_FUNC(3) COLD_FUNC(3) -SWAP_FUNC(4) COLD_FUNC(4) -SWAP_FUNC(5) COLD_FUNC(5) - -typedef void (*swap_fty)(int *, int *); -static swap_fty const swap_funcs[FUNC_COPIES] = { - swap1, swap2, swap3, swap4, swap5 -}; - - -/* Sorting Logic */ -void bubble_sort(int *a, int n) { - if (n <= 1) - return; - - int end = n - 1; - int swapped = 1; - unsigned idx = 0; - - while (swapped && end > 0) { - swapped = 0; - // pick a different copy of the swap function, in a round-robin fashion - // and call it. 
- for (int i = 1; i <= end; ++i) { - if (a[i] < a[i - 1]) { - auto swap_func = swap_funcs[idx++]; - idx %= FUNC_COPIES; - swap_func(&a[i - 1], &a[i]); - swapped = 1; - } - } - --end; - } -} - -void sort_array(int *data) { - for (int i = 0; i < ARRAY_LEN; ++i) { - data[i] = rand(); - } - bubble_sort(data, ARRAY_LEN); -} - -/* Timers, helpers, and main */ -static struct timespec timer_start; -static inline void start_timer(void) { - clock_gettime(CLOCK_MONOTONIC, &timer_start); -} - -static inline void stop_timer(void) { - struct timespec timer_end; - clock_gettime(CLOCK_MONOTONIC, &timer_end); - long long ms = (timer_end.tv_sec - timer_start.tv_sec) * 1000LL + - (timer_end.tv_nsec - timer_start.tv_nsec) / 1000000LL; - printf("%lld ms ", ms); -} - -static void print_first_last(const int *data, int n) { - if (n <= 0) - return; - - const int first = data[0]; - const int last = data[n - 1]; - printf("(first=%d last=%d)\n", first, last); -} - -int main(void) { - srand(0); - printf("Bubble sorting %d elements\n", ARRAY_LEN); - int data[ARRAY_LEN]; - - start_timer(); - sort_array(data); - stop_timer(); - - print_first_last(data, ARRAY_LEN); - return 0; -} diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/orderfile.txt b/content/learning-paths/servers-and-cloud-computing/bolt-demo/orderfile.txt deleted file mode 100644 index 1e6d1a8faa..0000000000 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/orderfile.txt +++ /dev/null @@ -1,10 +0,0 @@ -_ZL5swap1PiS_ -_ZL10cold_func1v -_ZL5swap2PiS_ -_ZL10cold_func2v -_ZL5swap3PiS_ -_ZL10cold_func3v -_ZL5swap4PiS_ -_ZL10cold_func4v -_ZL5swap5PiS_ -_ZL10cold_func5v diff --git a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md index 96e7383b1e..e64004961e 100644 --- a/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md +++ 
b/content/learning-paths/servers-and-cloud-computing/bolt-demo/setup.md @@ -126,7 +126,7 @@ int main(void) { } ``` -The [last section](#why-bubble-sort) explains why this tutorial uses BubbleSort as the demonstration workload. +The [Why Bubble Sort?](#why-bubble-sort) section explains why this tutorial uses BubbleSort as the demonstration workload. Create the following directories to organize generated files from this example: ```bash