Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 13 additions & 12 deletions src/port/amd/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,25 +106,26 @@ RX is the board's UART `~B/s` line (host -> board); TX is host-measured
|------------------------------|-----------------|--------:|--------:|
| VMK180 (Versal, A72 @ EL3) | DDR (JTAG) | ~300 | ~334 |
| ZCU102 (ZynqMP, A53 @ EL3) | DDR (SD boot) | ~126 | ~194 |
| ZC702 (Zynq-7000, A9 @ SVC) | OCM (JTAG) | ~22 | ~19 |
| ZC702 (Zynq-7000, A9 @ SVC) | DDR (JTAG) | ~59 | ~53 |
| ZCU102 (ZynqMP, A53 @ EL3) | OCM (JTAG) | ~10 | ~9 |
| ZC702 (Zynq-7000, A9 @ SVC) | OCM (JTAG) | ~22 | ~19 |

The single dominant factor is the **memory layout**: the OCM layout runs *all*
code (and the rings) from Normal non-cacheable OCM, so every instruction fetch
and frame copy is uncached. The DDR layout keeps code+data in cacheable DDR and
maps only the GEM DMA region non-cacheable - ~13-30x faster, as the two ZCU102
rows show directly (same SoC/core, OCM ~10/9 vs DDR ~126/194 Mbps). The faster
A72 (Versal) reaches ~300/334 on DDR.
maps only the GEM DMA region non-cacheable. The same-SoC OCM-vs-DDR rows show
the effect directly: ~13x on the ZCU102 A53 (10 -> 126 Mbps) and ~2.7x on the
slower ZC702 A9 (22 -> 59 Mbps). The faster A72 (Versal) reaches ~300/334 on DDR.

How each DDR number was loaded: Versal's PLM trains DDR from a boot PDI, so the
DDR app loads cleanly over JTAG. On ZynqMP, JTAG writes into DDR after a bare
`psu_init` are unreliable (the load goes through the A53 with a cache flush and
either errors or lands corrupt - DDR itself is fine, a direct DAP memtest passes),
so the ZCU102 DDR figure is from an **SD boot**: `FSBL_ELF=.../zynqmp_fsbl.elf
make bootbin` produces a DDR-layout `BOOT.BIN` that the FSBL trains DDR for and
DMA-loads (no JTAG memory writes). Copy it to the SD card's FAT boot partition
and set SW6 = SD. The same applies to ZC702 (its OCM-only port has no DDR layout
yet; a DDR profile is future work).
DDR app loads cleanly over JTAG. On ZC702 the FSBL trains DDR and the JTAG loader
then `dow`s the app to its link address (`0x10000000`), also clean. On ZynqMP,
JTAG writes into DDR after a bare `psu_init` are unreliable (the load goes through
the A53 with a cache flush and either errors or lands corrupt - DDR itself is
fine, a direct DAP memtest passes), so the ZCU102 DDR figure is from an **SD
boot**: `FSBL_ELF=.../zynqmp_fsbl.elf make bootbin` produces a DDR-layout
`BOOT.BIN` that the FSBL trains DDR for and DMA-loads (no JTAG memory writes).
Copy it to the SD card's FAT boot partition and set SW6 = SD.

What it took to get here:

Expand Down
36 changes: 28 additions & 8 deletions src/port/amd/arch/aarch64/mmu_aarch64.c
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,24 @@ static void mmu_build_tables(void)
L1[i] = 0;
}

/* EL-specific system-register names. Default EL3; the wolfBoot demo build
* passes -DWOLFIP_EL2 (the image is chain-loaded at EL2). TCR_EL2/MAIR_EL2
* (E2H=0) share the single-range format of their EL3 counterparts, so the
* same TCR/MAIR values apply. */
#ifdef WOLFIP_EL2
#define SR_MAIR "mair_el2"
#define SR_TCR "tcr_el2"
#define SR_TTBR0 "ttbr0_el2"
#define SR_SCTLR "sctlr_el2"
#define INS_TLBI "tlbi alle2"
#else
#define SR_MAIR "mair_el3"
#define SR_TCR "tcr_el3"
#define SR_TTBR0 "ttbr0_el3"
#define SR_SCTLR "sctlr_el3"
#define INS_TLBI "tlbi alle3"
#endif

void mmu_enable(void)
{
uint64_t mair;
Expand All @@ -195,7 +213,7 @@ void mmu_enable(void)
* ATTR1 = 0x00 (Device-nGnRnE)
* ATTR2 = 0x44 (Normal Inner+Outer Non-Cacheable, for DMA buffers) */
mair = (0xFFULL << 0) | (0x00ULL << 8) | (0x44ULL << 16);
__asm__ volatile ("msr mair_el3, %0" :: "r"(mair));
__asm__ volatile ("msr " SR_MAIR ", %0" :: "r"(mair));

/* TCR_EL3: 32-bit VA (T0SZ=32, start level L1), 4 KB granule,
* IRGN0=WB-RA-WA, ORGN0=WB-RA-WA, SH0=Inner shareable, IPS=40 bit.
Expand All @@ -210,15 +228,15 @@ void mmu_enable(void)
| ((uint64_t)2 << 16) /* PS = 40 bit PA */
| ((uint64_t)1 << 23) /* RES1 */
| ((uint64_t)1 << 31); /* RES1 */
__asm__ volatile ("msr tcr_el3, %0" :: "r"(tcr));
__asm__ volatile ("msr " SR_TCR ", %0" :: "r"(tcr));

/* TTBR0_EL3 = &L1. */
__asm__ volatile ("msr ttbr0_el3, %0" :: "r"((uint64_t)(uintptr_t)L1));
/* TTBR0_ELx = &L1. */
__asm__ volatile ("msr " SR_TTBR0 ", %0" :: "r"((uint64_t)(uintptr_t)L1));

__asm__ volatile ("isb" ::: "memory");

/* Invalidate TLBs and I-cache before turning the MMU on. */
__asm__ volatile ("tlbi alle3" ::: "memory");
__asm__ volatile (INS_TLBI ::: "memory");
__asm__ volatile ("ic iallu" ::: "memory");
__asm__ volatile ("dsb sy" ::: "memory");
__asm__ volatile ("isb" ::: "memory");
Expand All @@ -231,12 +249,14 @@ void mmu_enable(void)
* here). Newlib aarch64 memset uses DC ZVA for fast bulk zero
* writes; without DZE=1 the instruction traps UNDEF and the
* exception loop wedges the CPU. */
__asm__ volatile ("mrs %0, sctlr_el3" : "=r"(sctlr));
__asm__ volatile ("mrs %0, " SR_SCTLR : "=r"(sctlr));
sctlr |= (1ULL << 0); /* M */
sctlr |= (1ULL << 2); /* C */
sctlr |= (1ULL << 12); /* I */
sctlr |= (1ULL << 14); /* DZE - allow DC ZVA */
#ifndef WOLFIP_EL2
sctlr |= (1ULL << 14); /* DZE - allow DC ZVA (EL3; RES0 in SCTLR_EL2) */
#endif
sctlr &= ~(1ULL << 1); /* A off */
__asm__ volatile ("msr sctlr_el3, %0" :: "r"(sctlr));
__asm__ volatile ("msr " SR_SCTLR ", %0" :: "r"(sctlr));
__asm__ volatile ("isb" ::: "memory");
}
64 changes: 46 additions & 18 deletions src/port/amd/arch/aarch64/startup_aarch64.S
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,31 @@
*/
#ifndef UART_EARLY_TX_OFF
#define UART_EARLY_TX_OFF 0x30 /* Cadence TX FIFO; PL011 boards pass 0x00 */
#endif

/* Exception level of the entry. The default (JTAG / FSBL / PLM->BL31) drops
* us at EL3. wolfBoot hands a chain-loaded image off at EL2, so the demo
* build passes -DWOLFIP_EL2 (to CFLAGS *and* ASFLAGS) to retarget the
* EL-specific system registers. SCR_ELx exists only at EL3; at EL2 IRQs are
* taken at the current EL once PSTATE.I is unmasked, so that block is
* skipped. The GIC (GICv2 mem-mapped / GICv3 ICC_*_EL1) and the generic
* timer (CNTPCT_EL0) work unchanged at EL2. */
#ifdef WOLFIP_EL2
#define SCTLR_ELx sctlr_el2
#define CPTR_ELx cptr_el2
#define VBAR_ELx vbar_el2
#define SPSR_ELx spsr_el2
#define ELR_ELx elr_el2
#define ESR_ELx esr_el2
#define FAR_ELx far_el2
#else
#define SCTLR_ELx sctlr_el3
#define CPTR_ELx cptr_el3
#define VBAR_ELx vbar_el3
#define SPSR_ELx spsr_el3
#define ELR_ELx elr_el3
#define ESR_ELx esr_el3
#define FAR_ELx far_el3
#endif

/* A loader (FSBL, wolfBoot, ...) that respects the ELF entry
Expand Down Expand Up @@ -106,16 +131,17 @@ _start:
and x0, x0, #0xff /* Aff0 */
cbnz x0, _park_secondary

/* Disable MMU + caches in case FSBL left them on. */
mrs x0, sctlr_el3
/* Disable MMU + caches in case the loader left them on. */
mrs x0, SCTLR_ELx
bic x0, x0, #(1 << 0) /* M - MMU off */
bic x0, x0, #(1 << 2) /* C - D-cache off */
bic x0, x0, #(1 << 12) /* I - I-cache off */
msr sctlr_el3, x0
msr SCTLR_ELx, x0
isb

/* Allow FP/SIMD at EL3 (FSBL does this too, but be explicit). */
msr cptr_el3, xzr
/* Allow FP/SIMD (the loader usually does this; be explicit). Unused
* here (-mgeneral-regs-only) but harmless. */
msr CPTR_ELx, xzr

/* Force SPSel = 1 (use SP_ELx). The IRQ vector at offset 0x280
* (Current EL with SPx) is what we wired el3_irq_trampoline to.
Expand All @@ -134,14 +160,16 @@ _start:
* bit 2 FIQ = 1 (route FIQ to EL3)
* bit 3 EA = 1 (route SError/abort to EL3)
* bit 10 RW = 0 (no lower EL64; we never drop to lower EL) */
#ifndef WOLFIP_EL2
mov x0, #((1 << 1) | (1 << 2) | (1 << 3))
msr scr_el3, x0
isb
#endif

/* Vector base. */
adrp x0, _vectors
add x0, x0, :lo12:_vectors
msr vbar_el3, x0
msr VBAR_ELx, x0

/* Stack pointer. After 'msr spsel, #1' this writes SP_EL3. */
ldr x0, =_stack_top
Expand Down Expand Up @@ -223,15 +251,15 @@ el3_irq_trampoline:
str x30, [sp, #(15 * 16)]
/* Snapshot exception return state in case irq_dispatch (or any
* nested exception inside it) clobbers SPSR_EL3 / ELR_EL3. */
mrs x0, spsr_el3
mrs x1, elr_el3
mrs x0, SPSR_ELx
mrs x1, ELR_ELx
stp x0, x1, [sp, #(16 * 16)]

bl irq_dispatch

ldp x0, x1, [sp, #(16 * 16)]
msr spsr_el3, x0
msr elr_el3, x1
msr SPSR_ELx, x0
msr ELR_ELx, x1
ldp x0, x1, [sp, #(0 * 16)]
ldp x2, x3, [sp, #(1 * 16)]
ldp x4, x5, [sp, #(2 * 16)]
Expand Down Expand Up @@ -271,19 +299,19 @@ irq_disable:
* ------------------------------------------------------------------- */
.type el3_sync_trampoline, %function
el3_sync_trampoline:
mrs x0, esr_el3
mrs x1, elr_el3
mrs x2, far_el3
mrs x3, spsr_el3
mrs x0, ESR_ELx
mrs x1, ELR_ELx
mrs x2, FAR_ELx
mrs x3, SPSR_ELx
bl exception_report
b _hang

.type el3_serror_trampoline, %function
el3_serror_trampoline:
mrs x0, esr_el3
mrs x1, elr_el3
mrs x2, far_el3
mrs x3, spsr_el3
mrs x0, ESR_ELx
mrs x1, ELR_ELx
mrs x2, FAR_ELx
mrs x3, SPSR_ELx
mov x4, #1 /* indicate SError to C */
bl exception_report_serror
b _hang
16 changes: 14 additions & 2 deletions src/port/amd/arch/armv7/mmu_armv7.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,26 @@ static void mmu_build_tables(void)
{
uint32_t i;
uint32_t addr;
uint32_t dma_lo = (uint32_t)(uintptr_t)_dma_buffers_start;
uint32_t dma_hi = (uint32_t)(uintptr_t)_dma_buffers_end;

for (i = 0; i < 4096; i++)
L1[i] = SEC_INVALID;

/* DDR 0x00000000 - 0x3FFFFFFF (1 GB) as Normal WB. */
/* DDR 0x00000000 - 0x3FFFFFFF (1 GB) Normal WB cacheable, except any
* 1 MB section overlapping the GEM DMA region, which is Normal-NC. In
* the OCM layout the DMA buffers live in OCM (mapped NC below), so no
* DDR section is carved and all of DDR stays cacheable. In the DDR
* layout (the wolfBoot / cached-code path) the rings sit in DDR and
* MUST be NC: the 8-byte GEM BDs share 32-byte cache lines, so a
* cacheable ring lets a cache-line clean write stale neighbour BDs back
* over MAC-set OWN bits and wedges RX under sustained load (HIGH-2). */
for (i = 0; i < 1024; i++) {
addr = i * 0x100000u;
L1[i] = SEC_NORMAL_WB(addr);
if (addr + 0x100000u <= dma_lo || addr >= dma_hi)
L1[i] = SEC_NORMAL_WB(addr);
else
L1[i] = SEC_NORMAL_NC(addr);
}

/* PS peripherals at 0xE0000000 - 0xFEFFFFFF (Device). */
Expand Down
66 changes: 66 additions & 0 deletions src/port/amd/boards/zcu102/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,17 @@ CFLAGS += $(CFLAGS_EXTRA)

ASFLAGS := -mcpu=cortex-a53 -DUART_EARLY_TX_OFF=0x30

# Entry exception level: 3 (default - JTAG / FSBL / PLM->BL31 drop us at EL3)
# or 2 (wolfBoot chain-loads the image at EL2). EL=2 retargets the
# EL-specific system registers in startup_aarch64.S + mmu_aarch64.c.
EL ?= 3
ifeq ($(EL),2)
CFLAGS += -DWOLFIP_EL2
ASFLAGS += -DWOLFIP_EL2
else ifneq ($(EL),3)
$(error EL must be 2 or 3)
endif

# Layout selector. Default ocm keeps the OCM-only layout that the JTAG
# iteration scripts depend on (everything in OCM @ 0xFFFC0000). Pass
# LAYOUT=ddr to relink for DDR @ 0x10000000 -- this is the layout
Expand Down Expand Up @@ -63,6 +74,61 @@ LOCAL_OBJS := $(LOCAL_C:.c=.o) $(LOCAL_S:.S=.o)
WOLFIP_OBJ := wolfip.o
OBJS := $(LOCAL_OBJS) $(WOLFIP_OBJ)

# Optional network-delivered update (OTA=1). The running, wolfBoot-verified
# wolfIP app fetches a newer signed image over TFTP and stages it to the SD
# OFP_B partition, then resets so wolfBoot boots the higher version. To avoid
# any runtime hand-off, the SAME wolfBoot SD/disk driver SOURCE is compiled
# straight into the app (sdhci.c/disk.c/gpt.c) and fed platform glue from
# sdhci_shim.c. The caller must set WOLFBOOT=/path/to/wolfBoot (no default).
OTA ?= 0
ifeq ($(OTA),1)
# OTA needs the DDR layout: the multi-MB staging buffer + the compiled-in
# wolfBoot drivers do not fit the OCM layout. Fail fast with a clear message
# instead of a generic link-time region overflow.
ifeq ($(LAYOUT),ocm)
$(error OTA=1 requires LAYOUT=ddr (staging buffer + drivers do not fit OCM))
endif
# WOLFBOOT must point at a wolfBoot tree - its src/{sdhci,disk,gpt}.c and
# include/ are compiled into the app. No developer-local default, so OTA
# builds are portable; the demo's build.sh passes WOLFBOOT explicitly.
ifeq ($(strip $(WOLFBOOT)),)
$(error OTA=1 requires WOLFBOOT=/path/to/wolfBoot)
endif
# OFP_B = MBR partition index 2 (matches wolfBoot zynqmp_sdcard.config).
CFLAGS += -DWOLFIP_OTA -DBOOT_PART_B=2 -I$(ROOT)/src/tftp
# App-side OTA glue + the wolfIP TFTP client. syscalls_stub.o satisfies
# the newlib reent/stdio hooks that snprintf (in the TFTP client) pulls in.
OTA_OBJS := ota.o sdhci_shim.o syscalls_stub.o wolftftp.o
# wolfBoot SD-host + disk drivers, compiled from source (no hand-off).
WB_DRV_OBJS := sdhci.o disk.o gpt.o
OTA_OBJS += $(WB_DRV_OBJS)
OBJS += $(OTA_OBJS)
vpath %.c $(ROOT)/src/tftp
vpath %.c $(WOLFBOOT)/src
# SDHCI_DEBUG=1 routes the driver's DEBUG_SDHCI tracing to the app UART
# (printf.h DEBUG_UART path -> uart_printf/uart_write in sdhci_shim.c).
# Default off keeps wolfBoot_printf a no-op (WOLFBOOT_NO_PRINTF). Note the
# per-block read/write traces are verbose during a full image transfer.
SDHCI_DEBUG ?= 0
ifeq ($(SDHCI_DEBUG),1)
WB_PRINT := -DDEBUG_UART -DDEBUG_SDHCI
else
WB_PRINT := -DWOLFBOOT_NO_PRINTF
endif
# The wolfBoot driver objects need the wolfBoot include tree and its
# build defines, scoped to just those objects. They are vendored source,
# so relax -Werror (benign unused-var/function warnings) for them only.
# SDHCI_FORCE_CARD_DETECT: the ZCU102 carrier does not wire the SD card-
# detect pin to the controller, so the SRS09 Card-Inserted/Card-State-
# Stable bits are unreliable (they only settle given enough delay). The
# card is unquestionably present here - we booted from it - so skip the
# present-state check rather than depend on it stabilizing in time.
$(WB_DRV_OBJS): CFLAGS += -I$(WOLFBOOT)/include -I$(WOLFBOOT)/src \
-DDISK_SDCARD $(WB_PRINT) -DTARGET_zynq -DWOLFBOOT_UPDATE_DISK \
-DARCH_AARCH64 -DBOOT_PART_A=1 -DBOOT_PART_B=2 -DSDHCI_FORCE_CARD_DETECT \
-Wno-error -Wno-unused-function -Wno-unused-variable
endif

# Shared sources live outside this board dir; find them by vpath so the
# .o files still land here (keeps clean + JTAG app.elf-in-place working).
vpath %.c $(COMMON):$(ARCH):$(IP)
Expand Down
7 changes: 7 additions & 0 deletions src/port/amd/boards/zcu102/board.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,15 @@

const char *board_banner(void)
{
/* The entry exception level depends on the build: EL3 for JTAG/FSBL
* boot, EL2 when wolfBoot chain-loads the image (EL=2 / -DWOLFIP_EL2). */
#ifdef WOLFIP_EL2
return "\n\n=== wolfIP ZCU102 (UltraScale+ A53-0 EL2) ===\n"
"MMU on, caches on. Bringing up GIC-400 (GICv2)...\n";
#else
return "\n\n=== wolfIP ZCU102 (UltraScale+ A53-0 EL3) ===\n"
"MMU on, caches on. Bringing up GIC-400 (GICv2)...\n";
#endif
}

void board_irq_setup(void)
Expand Down
Loading
Loading