diff --git a/src/port/amd/README.md b/src/port/amd/README.md index e3114037..c2198d31 100644 --- a/src/port/amd/README.md +++ b/src/port/amd/README.md @@ -106,25 +106,26 @@ RX is the board's UART `~B/s` line (host -> board); TX is host-measured |------------------------------|-----------------|--------:|--------:| | VMK180 (Versal, A72 @ EL3) | DDR (JTAG) | ~300 | ~334 | | ZCU102 (ZynqMP, A53 @ EL3) | DDR (SD boot) | ~126 | ~194 | -| ZC702 (Zynq-7000, A9 @ SVC) | OCM (JTAG) | ~22 | ~19 | +| ZC702 (Zynq-7000, A9 @ SVC) | DDR (JTAG) | ~59 | ~53 | | ZCU102 (ZynqMP, A53 @ EL3) | OCM (JTAG) | ~10 | ~9 | +| ZC702 (Zynq-7000, A9 @ SVC) | OCM (JTAG) | ~22 | ~19 | The single dominant factor is the **memory layout**: the OCM layout runs *all* code (and the rings) from Normal non-cacheable OCM, so every instruction fetch and frame copy is uncached. The DDR layout keeps code+data in cacheable DDR and -maps only the GEM DMA region non-cacheable - ~13-30x faster, as the two ZCU102 -rows show directly (same SoC/core, OCM ~10/9 vs DDR ~126/194 Mbps). The faster -A72 (Versal) reaches ~300/334 on DDR. +maps only the GEM DMA region non-cacheable. The same-SoC OCM-vs-DDR rows show +the effect directly: ~13x on the ZCU102 A53 (10 -> 126 Mbps) and ~2.7x on the +slower ZC702 A9 (22 -> 59 Mbps). The faster A72 (Versal) reaches ~300/334 on DDR. How each DDR number was loaded: Versal's PLM trains DDR from a boot PDI, so the -DDR app loads cleanly over JTAG. On ZynqMP, JTAG writes into DDR after a bare -`psu_init` are unreliable (the load goes through the A53 with a cache flush and -either errors or lands corrupt - DDR itself is fine, a direct DAP memtest passes), -so the ZCU102 DDR figure is from an **SD boot**: `FSBL_ELF=.../zynqmp_fsbl.elf -make bootbin` produces a DDR-layout `BOOT.BIN` that the FSBL trains DDR for and -DMA-loads (no JTAG memory writes). Copy it to the SD card's FAT boot partition -and set SW6 = SD. 
The same applies to ZC702 (its OCM-only port has no DDR layout -yet; a DDR profile is future work). +DDR app loads cleanly over JTAG. On ZC702 the FSBL trains DDR and the JTAG loader +then `dow`s the app to its link address (`0x10000000`), also clean. On ZynqMP, +JTAG writes into DDR after a bare `psu_init` are unreliable (the load goes through +the A53 with a cache flush and either errors or lands corrupt - DDR itself is +fine, a direct DAP memtest passes), so the ZCU102 DDR figure is from an **SD +boot**: `FSBL_ELF=.../zynqmp_fsbl.elf make bootbin` produces a DDR-layout +`BOOT.BIN` that the FSBL trains DDR for and DMA-loads (no JTAG memory writes). +Copy it to the SD card's FAT boot partition and set SW6 = SD. What it took to get here: diff --git a/src/port/amd/arch/aarch64/mmu_aarch64.c b/src/port/amd/arch/aarch64/mmu_aarch64.c index b1939f7d..d36849de 100644 --- a/src/port/amd/arch/aarch64/mmu_aarch64.c +++ b/src/port/amd/arch/aarch64/mmu_aarch64.c @@ -177,6 +177,24 @@ static void mmu_build_tables(void) L1[i] = 0; } +/* EL-specific system-register names. Default EL3; the wolfBoot demo build + * passes -DWOLFIP_EL2 (the image is chain-loaded at EL2). TCR_EL2/MAIR_EL2 + * (E2H=0) share the single-range format of their EL3 counterparts, so the + * same TCR/MAIR values apply. 
*/ +#ifdef WOLFIP_EL2 +#define SR_MAIR "mair_el2" +#define SR_TCR "tcr_el2" +#define SR_TTBR0 "ttbr0_el2" +#define SR_SCTLR "sctlr_el2" +#define INS_TLBI "tlbi alle2" +#else +#define SR_MAIR "mair_el3" +#define SR_TCR "tcr_el3" +#define SR_TTBR0 "ttbr0_el3" +#define SR_SCTLR "sctlr_el3" +#define INS_TLBI "tlbi alle3" +#endif + void mmu_enable(void) { uint64_t mair; @@ -195,7 +213,7 @@ void mmu_enable(void) * ATTR1 = 0x00 (Device-nGnRnE) * ATTR2 = 0x44 (Normal Inner+Outer Non-Cacheable, for DMA buffers) */ mair = (0xFFULL << 0) | (0x00ULL << 8) | (0x44ULL << 16); - __asm__ volatile ("msr mair_el3, %0" :: "r"(mair)); + __asm__ volatile ("msr " SR_MAIR ", %0" :: "r"(mair)); /* TCR_EL3: 32-bit VA (T0SZ=32, start level L1), 4 KB granule, * IRGN0=WB-RA-WA, ORGN0=WB-RA-WA, SH0=Inner shareable, IPS=40 bit. @@ -210,15 +228,15 @@ void mmu_enable(void) | ((uint64_t)2 << 16) /* PS = 40 bit PA */ | ((uint64_t)1 << 23) /* RES1 */ | ((uint64_t)1 << 31); /* RES1 */ - __asm__ volatile ("msr tcr_el3, %0" :: "r"(tcr)); + __asm__ volatile ("msr " SR_TCR ", %0" :: "r"(tcr)); - /* TTBR0_EL3 = &L1. */ - __asm__ volatile ("msr ttbr0_el3, %0" :: "r"((uint64_t)(uintptr_t)L1)); + /* TTBR0_ELx = &L1. */ + __asm__ volatile ("msr " SR_TTBR0 ", %0" :: "r"((uint64_t)(uintptr_t)L1)); __asm__ volatile ("isb" ::: "memory"); /* Invalidate TLBs and I-cache before turning the MMU on. */ - __asm__ volatile ("tlbi alle3" ::: "memory"); + __asm__ volatile (INS_TLBI ::: "memory"); __asm__ volatile ("ic iallu" ::: "memory"); __asm__ volatile ("dsb sy" ::: "memory"); __asm__ volatile ("isb" ::: "memory"); @@ -231,12 +249,14 @@ void mmu_enable(void) * here). Newlib aarch64 memset uses DC ZVA for fast bulk zero * writes; without DZE=1 the instruction traps UNDEF and the * exception loop wedges the CPU. 
*/ - __asm__ volatile ("mrs %0, sctlr_el3" : "=r"(sctlr)); + __asm__ volatile ("mrs %0, " SR_SCTLR : "=r"(sctlr)); sctlr |= (1ULL << 0); /* M */ sctlr |= (1ULL << 2); /* C */ sctlr |= (1ULL << 12); /* I */ - sctlr |= (1ULL << 14); /* DZE - allow DC ZVA */ +#ifndef WOLFIP_EL2 + sctlr |= (1ULL << 14); /* DZE - allow DC ZVA (EL3; RES0 in SCTLR_EL2) */ +#endif sctlr &= ~(1ULL << 1); /* A off */ - __asm__ volatile ("msr sctlr_el3, %0" :: "r"(sctlr)); + __asm__ volatile ("msr " SR_SCTLR ", %0" :: "r"(sctlr)); __asm__ volatile ("isb" ::: "memory"); } diff --git a/src/port/amd/arch/aarch64/startup_aarch64.S b/src/port/amd/arch/aarch64/startup_aarch64.S index 920497de..ee486239 100644 --- a/src/port/amd/arch/aarch64/startup_aarch64.S +++ b/src/port/amd/arch/aarch64/startup_aarch64.S @@ -27,6 +27,31 @@ */ #ifndef UART_EARLY_TX_OFF #define UART_EARLY_TX_OFF 0x30 /* Cadence TX FIFO; PL011 boards pass 0x00 */ +#endif + +/* Exception level of the entry. The default (JTAG / FSBL / PLM->BL31) drops + * us at EL3. wolfBoot hands a chain-loaded image off at EL2, so the demo + * build passes -DWOLFIP_EL2 (to CFLAGS *and* ASFLAGS) to retarget the + * EL-specific system registers. SCR_ELx exists only at EL3; at EL2 IRQs are + * taken at the current EL once PSTATE.I is unmasked, so that block is + * skipped. The GIC (GICv2 mem-mapped / GICv3 ICC_*_EL1) and the generic + * timer (CNTPCT_EL0) work unchanged at EL2. */ +#ifdef WOLFIP_EL2 +#define SCTLR_ELx sctlr_el2 +#define CPTR_ELx cptr_el2 +#define VBAR_ELx vbar_el2 +#define SPSR_ELx spsr_el2 +#define ELR_ELx elr_el2 +#define ESR_ELx esr_el2 +#define FAR_ELx far_el2 +#else +#define SCTLR_ELx sctlr_el3 +#define CPTR_ELx cptr_el3 +#define VBAR_ELx vbar_el3 +#define SPSR_ELx spsr_el3 +#define ELR_ELx elr_el3 +#define ESR_ELx esr_el3 +#define FAR_ELx far_el3 #endif /* A loader (FSBL, wolfBoot, ...) 
that respects the ELF entry @@ -106,16 +131,17 @@ _start: and x0, x0, #0xff /* Aff0 */ cbnz x0, _park_secondary - /* Disable MMU + caches in case FSBL left them on. */ - mrs x0, sctlr_el3 + /* Disable MMU + caches in case the loader left them on. */ + mrs x0, SCTLR_ELx bic x0, x0, #(1 << 0) /* M - MMU off */ bic x0, x0, #(1 << 2) /* C - D-cache off */ bic x0, x0, #(1 << 12) /* I - I-cache off */ - msr sctlr_el3, x0 + msr SCTLR_ELx, x0 isb - /* Allow FP/SIMD at EL3 (FSBL does this too, but be explicit). */ - msr cptr_el3, xzr + /* Allow FP/SIMD (the loader usually does this; be explicit). Unused + * here (-mgeneral-regs-only) but harmless. */ + msr CPTR_ELx, xzr /* Force SPSel = 1 (use SP_ELx). The IRQ vector at offset 0x280 * (Current EL with SPx) is what we wired el3_irq_trampoline to. @@ -134,14 +160,16 @@ _start: * bit 2 FIQ = 1 (route FIQ to EL3) * bit 3 EA = 1 (route SError/abort to EL3) * bit 10 RW = 0 (no lower EL64; we never drop to lower EL) */ +#ifndef WOLFIP_EL2 mov x0, #((1 << 1) | (1 << 2) | (1 << 3)) msr scr_el3, x0 isb +#endif /* Vector base. */ adrp x0, _vectors add x0, x0, :lo12:_vectors - msr vbar_el3, x0 + msr VBAR_ELx, x0 /* Stack pointer. After 'msr spsel, #1' this writes SP_EL3. */ ldr x0, =_stack_top @@ -223,15 +251,15 @@ el3_irq_trampoline: str x30, [sp, #(15 * 16)] /* Snapshot exception return state in case irq_dispatch (or any * nested exception inside it) clobbers SPSR_EL3 / ELR_EL3. 
*/ - mrs x0, spsr_el3 - mrs x1, elr_el3 + mrs x0, SPSR_ELx + mrs x1, ELR_ELx stp x0, x1, [sp, #(16 * 16)] bl irq_dispatch ldp x0, x1, [sp, #(16 * 16)] - msr spsr_el3, x0 - msr elr_el3, x1 + msr SPSR_ELx, x0 + msr ELR_ELx, x1 ldp x0, x1, [sp, #(0 * 16)] ldp x2, x3, [sp, #(1 * 16)] ldp x4, x5, [sp, #(2 * 16)] @@ -271,19 +299,19 @@ irq_disable: * ------------------------------------------------------------------- */ .type el3_sync_trampoline, %function el3_sync_trampoline: - mrs x0, esr_el3 - mrs x1, elr_el3 - mrs x2, far_el3 - mrs x3, spsr_el3 + mrs x0, ESR_ELx + mrs x1, ELR_ELx + mrs x2, FAR_ELx + mrs x3, SPSR_ELx bl exception_report b _hang .type el3_serror_trampoline, %function el3_serror_trampoline: - mrs x0, esr_el3 - mrs x1, elr_el3 - mrs x2, far_el3 - mrs x3, spsr_el3 + mrs x0, ESR_ELx + mrs x1, ELR_ELx + mrs x2, FAR_ELx + mrs x3, SPSR_ELx mov x4, #1 /* indicate SError to C */ bl exception_report_serror b _hang diff --git a/src/port/amd/arch/armv7/mmu_armv7.c b/src/port/amd/arch/armv7/mmu_armv7.c index 0307636c..cc0e72f2 100644 --- a/src/port/amd/arch/armv7/mmu_armv7.c +++ b/src/port/amd/arch/armv7/mmu_armv7.c @@ -93,14 +93,26 @@ static void mmu_build_tables(void) { uint32_t i; uint32_t addr; + uint32_t dma_lo = (uint32_t)(uintptr_t)_dma_buffers_start; + uint32_t dma_hi = (uint32_t)(uintptr_t)_dma_buffers_end; for (i = 0; i < 4096; i++) L1[i] = SEC_INVALID; - /* DDR 0x00000000 - 0x3FFFFFFF (1 GB) as Normal WB. */ + /* DDR 0x00000000 - 0x3FFFFFFF (1 GB) Normal WB cacheable, except any + * 1 MB section overlapping the GEM DMA region, which is Normal-NC. In + * the OCM layout the DMA buffers live in OCM (mapped NC below), so no + * DDR section is carved and all of DDR stays cacheable. 
In the DDR + * layout (the wolfBoot / cached-code path) the rings sit in DDR and + * MUST be NC: the 8-byte GEM BDs share 32-byte cache lines, so a + * cacheable ring lets a cache-line clean write stale neighbour BDs back + * over MAC-set OWN bits and wedges RX under sustained load (HIGH-2). */ for (i = 0; i < 1024; i++) { addr = i * 0x100000u; - L1[i] = SEC_NORMAL_WB(addr); + if (addr + 0x100000u <= dma_lo || addr >= dma_hi) + L1[i] = SEC_NORMAL_WB(addr); + else + L1[i] = SEC_NORMAL_NC(addr); } /* PS peripherals at 0xE0000000 - 0xFEFFFFFF (Device). */ diff --git a/src/port/amd/boards/zcu102/Makefile b/src/port/amd/boards/zcu102/Makefile index 9e0953aa..c97e8d99 100644 --- a/src/port/amd/boards/zcu102/Makefile +++ b/src/port/amd/boards/zcu102/Makefile @@ -32,6 +32,17 @@ CFLAGS += $(CFLAGS_EXTRA) ASFLAGS := -mcpu=cortex-a53 -DUART_EARLY_TX_OFF=0x30 +# Entry exception level: 3 (default - JTAG / FSBL / PLM->BL31 drop us at EL3) +# or 2 (wolfBoot chain-loads the image at EL2). EL=2 retargets the +# EL-specific system registers in startup_aarch64.S + mmu_aarch64.c. +EL ?= 3 +ifeq ($(EL),2) + CFLAGS += -DWOLFIP_EL2 + ASFLAGS += -DWOLFIP_EL2 +else ifneq ($(EL),3) + $(error EL must be 2 or 3) +endif + # Layout selector. Default ocm keeps the OCM-only layout that the JTAG # iteration scripts depend on (everything in OCM @ 0xFFFC0000). Pass # LAYOUT=ddr to relink for DDR @ 0x10000000 -- this is the layout @@ -63,6 +74,61 @@ LOCAL_OBJS := $(LOCAL_C:.c=.o) $(LOCAL_S:.S=.o) WOLFIP_OBJ := wolfip.o OBJS := $(LOCAL_OBJS) $(WOLFIP_OBJ) +# Optional network-delivered update (OTA=1). The running, wolfBoot-verified +# wolfIP app fetches a newer signed image over TFTP and stages it to the SD +# OFP_B partition, then resets so wolfBoot boots the higher version. To avoid +# any runtime hand-off, the SAME wolfBoot SD/disk driver SOURCE is compiled +# straight into the app (sdhci.c/disk.c/gpt.c) and fed platform glue from +# sdhci_shim.c. 
The caller must set WOLFBOOT=/path/to/wolfBoot (no default). +OTA ?= 0 +ifeq ($(OTA),1) + # OTA needs the DDR layout: the multi-MB staging buffer + the compiled-in + # wolfBoot drivers do not fit the OCM layout. Fail fast with a clear message + # instead of a generic link-time region overflow. + ifeq ($(LAYOUT),ocm) + $(error OTA=1 requires LAYOUT=ddr (staging buffer + drivers do not fit OCM)) + endif + # WOLFBOOT must point at a wolfBoot tree - its src/{sdhci,disk,gpt}.c and + # include/ are compiled into the app. No developer-local default, so OTA + # builds are portable; the demo's build.sh passes WOLFBOOT explicitly. + ifeq ($(strip $(WOLFBOOT)),) + $(error OTA=1 requires WOLFBOOT=/path/to/wolfBoot) + endif + # OFP_B = MBR partition index 2 (matches wolfBoot zynqmp_sdcard.config). + CFLAGS += -DWOLFIP_OTA -DBOOT_PART_B=2 -I$(ROOT)/src/tftp + # App-side OTA glue + the wolfIP TFTP client. syscalls_stub.o satisfies + # the newlib reent/stdio hooks that snprintf (in the TFTP client) pulls in. + OTA_OBJS := ota.o sdhci_shim.o syscalls_stub.o wolftftp.o + # wolfBoot SD-host + disk drivers, compiled from source (no hand-off). + WB_DRV_OBJS := sdhci.o disk.o gpt.o + OTA_OBJS += $(WB_DRV_OBJS) + OBJS += $(OTA_OBJS) + vpath %.c $(ROOT)/src/tftp + vpath %.c $(WOLFBOOT)/src + # SDHCI_DEBUG=1 routes the driver's DEBUG_SDHCI tracing to the app UART + # (printf.h DEBUG_UART path -> uart_printf/uart_write in sdhci_shim.c). + # Default off keeps wolfBoot_printf a no-op (WOLFBOOT_NO_PRINTF). Note the + # per-block read/write traces are verbose during a full image transfer. + SDHCI_DEBUG ?= 0 + ifeq ($(SDHCI_DEBUG),1) + WB_PRINT := -DDEBUG_UART -DDEBUG_SDHCI + else + WB_PRINT := -DWOLFBOOT_NO_PRINTF + endif + # The wolfBoot driver objects need the wolfBoot include tree and its + # build defines, scoped to just those objects. They are vendored source, + # so relax -Werror (benign unused-var/function warnings) for them only. 
+ # SDHCI_FORCE_CARD_DETECT: the ZCU102 carrier does not wire the SD card- + # detect pin to the controller, so the SRS09 Card-Inserted/Card-State- + # Stable bits are unreliable (they only settle given enough delay). The + # card is unquestionably present here - we booted from it - so skip the + # present-state check rather than depend on it stabilizing in time. + $(WB_DRV_OBJS): CFLAGS += -I$(WOLFBOOT)/include -I$(WOLFBOOT)/src \ + -DDISK_SDCARD $(WB_PRINT) -DTARGET_zynq -DWOLFBOOT_UPDATE_DISK \ + -DARCH_AARCH64 -DBOOT_PART_A=1 -DBOOT_PART_B=2 -DSDHCI_FORCE_CARD_DETECT \ + -Wno-error -Wno-unused-function -Wno-unused-variable +endif + # Shared sources live outside this board dir; find them by vpath so the # .o files still land here (keeps clean + JTAG app.elf-in-place working). vpath %.c $(COMMON):$(ARCH):$(IP) diff --git a/src/port/amd/boards/zcu102/board.c b/src/port/amd/boards/zcu102/board.c index b1f50b16..0d74ecc0 100644 --- a/src/port/amd/boards/zcu102/board.c +++ b/src/port/amd/boards/zcu102/board.c @@ -11,8 +11,15 @@ const char *board_banner(void) { + /* The entry exception level depends on the build: EL3 for JTAG/FSBL + * boot, EL2 when wolfBoot chain-loads the image (EL=2 / -DWOLFIP_EL2). */ +#ifdef WOLFIP_EL2 + return "\n\n=== wolfIP ZCU102 (UltraScale+ A53-0 EL2) ===\n" + "MMU on, caches on. Bringing up GIC-400 (GICv2)...\n"; +#else return "\n\n=== wolfIP ZCU102 (UltraScale+ A53-0 EL3) ===\n" "MMU on, caches on. Bringing up GIC-400 (GICv2)...\n"; +#endif } void board_irq_setup(void) diff --git a/src/port/amd/boards/zcu102/ota.c b/src/port/amd/boards/zcu102/ota.c new file mode 100644 index 00000000..96591e42 --- /dev/null +++ b/src/port/amd/boards/zcu102/ota.c @@ -0,0 +1,378 @@ +/* ota.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. 
+ * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Network-delivered firmware update for the ZCU102 wolfBoot + wolfIP demo. + * + * Wires the wolfIP TFTP client (src/tftp/wolftftp.c) to wolfBoot's SD/disk + * drivers (src/sdhci.c, src/disk.c, src/gpt.c, compiled into this app via + * the OTA=1 Makefile path). A one-shot RRQ GET stages a newer *signed* + * image into RAM, writes it to the SD OFP_B partition with disk_part_write(), + * and resets. wolfBoot then verifies OFP_B's RSA-4096/SHA3 signature and, + * because the config selects the higher-version image (WOLFBOOT_NO_PARTITIONS), + * boots the update - with rollback to OFP_A if it ever fails to verify. + * + * The image is authenticated by wolfBoot on the next boot, so a tampered or + * unsigned download simply fails wolfBoot's check and the board falls back + * to the current image; this module does not itself verify the signature. + */ +#include <stdint.h> +#include <string.h> + +#include "wolfip.h" +#include "wolftftp.h" +#include "uart.h" +#include "ota.h" + +/* OFP_B (the update slot) - MBR partition index, matches wolfBoot's + * BOOT_PART_B in zynqmp_sdcard.config. */ +#ifndef BOOT_PART_B +#define BOOT_PART_B 2 +#endif + +/* SD drive number for the disk layer. */ +#define OTA_DRIVE 0 + +/* Largest image we will stage. 
The signed bare-metal app is well under a + * megabyte; this cap just bounds the static staging buffer (in DDR .bss). + * Override with -DOTA_IMG_MAX=... if a larger image is ever needed. */ +#ifndef OTA_IMG_MAX +#define OTA_IMG_MAX (8U * 1024U * 1024U) +#endif + +/* TFTP tunables. Conservative by default: 512-byte blocks and windowsize 1 + * (one block per ACK) are the most broadly compatible and keep large UDP + * bursts off the poll-driven GEM RX path. Larger values can be restored once + * the basic transfer is confirmed. */ +#define OTA_LOCAL_PORT 20100U +#define OTA_BLKSIZE 512U +#define OTA_WINDOWSIZE 1U +#define OTA_TIMEOUT_S 2U +#define OTA_MAX_RETRIES 5U +#define OTA_RX_BUF 1500U + +/* wolfBoot's disk API (from src/disk.h, compiled in via OTA=1). */ +extern int disk_init(int drv); +extern int disk_open(int drv); +extern int disk_part_write(int drv, int part, uint64_t off, uint64_t sz, + const uint8_t *buf); + +/* ---- Module state (single-shot, single transfer) ----------------- */ +struct ota_sink { + uint32_t bytes; /* bytes staged so far */ + uint8_t disk_ready; /* disk_init/open succeeded */ +}; + +static struct wolfIP *g_stack; +static struct wolftftp_client g_client; +static int g_sock = -1; +static int g_started; +static int g_done; /* terminal: reset pending */ +static struct ota_sink g_sink; +static uint8_t g_rx_buf[OTA_RX_BUF]; + +/* Image staging buffer. Plain DDR .bss (write-back cacheable); the SDHCI + * SDMA coherency is handled by sdhci_platform_dma_prepare/complete in + * sdhci_shim.c, so this need not be uncached. 64-byte aligned to keep the + * cache-maintenance ranges tidy. */ +static uint8_t g_image[OTA_IMG_MAX] __attribute__((aligned(64))); + +/* ---- TFTP io_ops: stage into RAM, then write OFP_B on verify ------ */ +/* Bring the SD card up. disk_init() issues a long sequence of SD commands + * (hundreds of ms). 
This MUST run before the TFTP transfer starts - doing it + * inside the open callback blocks the client from ACKing the server's first + * data block and the transfer times out (-1003). Returns 0 on success. */ +static int ota_prepare_disk(void) +{ + int rc; + + g_sink.disk_ready = 0; + uart_puts("OTA: init SD card...\n"); + rc = disk_init(OTA_DRIVE); + if (rc < 0) { + uart_puts("OTA: disk_init failed: "); uart_puthex((uint32_t)rc); + uart_puts("\n"); + return -1; + } + uart_puts("OTA: SD ready, reading MBR...\n"); + rc = disk_open(OTA_DRIVE); + if (rc < 0) { + uart_puts("OTA: disk_open (MBR) failed: "); uart_puthex((uint32_t)rc); + uart_puts("\n"); + return -1; + } + g_sink.disk_ready = 1; + return 0; +} + +static int sink_open(void *arg, const char *name, int is_write, + uint32_t *size_hint, void **handle) +{ + struct ota_sink *s = &g_sink; + + (void)arg; + (void)name; + (void)size_hint; + if (!is_write) + return WOLFTFTP_ERR_UNSUPPORTED; + /* SD was brought up in ota_trigger(), before the transfer, so this + * callback stays fast and the TFTP ACKs are not delayed. */ + if (!s->disk_ready) + return WOLFTFTP_ERR_IO; + + s->bytes = 0; + *handle = s; + uart_puts("OTA: staging update to RAM\n"); + return 0; +} + +static int sink_write(void *arg, void *handle, uint32_t offset, + const uint8_t *buf, uint16_t len) +{ + struct ota_sink *s = (struct ota_sink *)handle; + + (void)arg; + if (s == NULL || buf == NULL) + return WOLFTFTP_ERR_IO; + /* TFTP delivers blocks in order; use the running counter as the write + * offset and sanity-check it against the reported offset. */ + if (offset != s->bytes) + return WOLFTFTP_ERR_STATE; + if ((uint32_t)s->bytes + (uint32_t)len > OTA_IMG_MAX) + return WOLFTFTP_ERR_SIZE; + memcpy(&g_image[s->bytes], buf, len); + /* One dot per 16 KB so progress (or where it stalls) is visible. 
*/ + if (((s->bytes) >> 14) != ((s->bytes + len) >> 14)) + uart_putc('.'); + s->bytes += len; + return 0; +} + +static int sink_hash_update(void *arg, void *handle, + const uint8_t *buf, uint16_t len) +{ + (void)arg; + (void)handle; + (void)buf; + (void)len; + /* wolfBoot re-hashes and verifies the RSA-4096/SHA3 signature on the + * next boot, so no client-side hashing is needed here. */ + return 0; +} +/* Verify hook: reject an empty or tsize-mismatched image, else write OFP_B. */ +static int sink_verify(void *arg, void *handle, uint32_t total_size) +{ + struct ota_sink *s = (struct ota_sink *)handle; + int rc; + + (void)arg; + if (s == NULL || !s->disk_ready) + return WOLFTFTP_ERR_IO; + if (s->bytes == 0 || (total_size != 0 && total_size != s->bytes)) { + uart_puts("OTA: empty image or size mismatch vs tsize\n"); + return WOLFTFTP_ERR_VERIFY; + } + + uart_puts("OTA: writing "); uart_putdec(s->bytes); + uart_puts(" bytes to OFP_B (part "); uart_putdec(BOOT_PART_B); + uart_puts(")...\n"); + rc = disk_part_write(OTA_DRIVE, BOOT_PART_B, 0, + (uint64_t)s->bytes, g_image); + if (rc < 0) { + uart_puts("OTA: disk_part_write failed: "); uart_puthex((uint32_t)rc); + uart_puts("\n"); + return WOLFTFTP_ERR_IO; + } + uart_puts("OTA: update staged to OFP_B\n"); + return 0; +} + +static void sink_close(void *arg, void *handle, int status) +{ + (void)arg; + (void)handle; + (void)status; +} + +/* ---- Transport: send via wolfIP UDP socket ------------------------ */ +static int ota_udp_send(void *arg, uint16_t local_port, + const struct wolftftp_endpoint *remote, const uint8_t *buf, uint16_t len) +{ + struct wolfIP_sockaddr_in dst; + int ret; + + (void)arg; + (void)local_port; + memset(&dst, 0, sizeof(dst)); + dst.sin_family = AF_INET; + dst.sin_port = ee16(remote->port); + dst.sin_addr.s_addr = ee32(remote->ip); + ret = wolfIP_sock_sendto(g_stack, g_sock, buf, len, 0, + (struct wolfIP_sockaddr *)&dst, sizeof(dst)); + if (ret == (int)len) + return 0; + return ret < 0 ? 
ret : -1; +} + +/* ---- Public API --------------------------------------------------- */ +int ota_trigger(struct wolfIP *stack, uint32_t server_ip_be, + const char *filename) +{ + struct wolfIP_sockaddr_in bind_addr; + struct wolftftp_endpoint server_ep; + struct wolftftp_transport_ops tx; + struct wolftftp_io_ops io; + struct wolftftp_transfer_cfg cfg; + int ret; + + if (stack == NULL || filename == NULL) + return -1; + if (g_started) + return -1; + + g_stack = stack; + + /* Clear terminal state from any previous attempt. Without this, a retry + * after a WOLFTFTP_CLIENT_ERROR leaves g_done set, and ota_poll() would + * early-return on it and never apply a subsequently successful transfer. */ + g_done = 0; + g_sink.bytes = 0; + + /* Initialize the SD card before starting the transfer (see + * ota_prepare_disk). Done first so a card error costs no socket. */ + if (ota_prepare_disk() < 0) + return -1; + + g_sock = wolfIP_sock_socket(stack, AF_INET, IPSTACK_SOCK_DGRAM, 0); + if (g_sock < 0) { + uart_puts("OTA: socket() failed\n"); + return -1; + } + memset(&bind_addr, 0, sizeof(bind_addr)); + bind_addr.sin_family = AF_INET; + bind_addr.sin_port = ee16(OTA_LOCAL_PORT); + bind_addr.sin_addr.s_addr = 0; + ret = wolfIP_sock_bind(stack, g_sock, + (struct wolfIP_sockaddr *)&bind_addr, sizeof(bind_addr)); + if (ret < 0) { + uart_puts("OTA: bind() failed\n"); + wolfIP_sock_close(stack, g_sock); + g_sock = -1; + return -1; + } + + memset(&tx, 0, sizeof(tx)); + tx.send = ota_udp_send; + tx.arg = NULL; + + memset(&io, 0, sizeof(io)); + io.open = sink_open; + io.write = sink_write; + io.hash_update = sink_hash_update; + io.verify = sink_verify; + io.close = sink_close; + io.arg = NULL; + + memset(&cfg, 0, sizeof(cfg)); + cfg.local_port = OTA_LOCAL_PORT; + cfg.blksize = OTA_BLKSIZE; + cfg.timeout_s = OTA_TIMEOUT_S; + cfg.windowsize = OTA_WINDOWSIZE; + cfg.max_retries = OTA_MAX_RETRIES; + cfg.max_image_size = OTA_IMG_MAX; + + wolftftp_client_init(&g_client, &tx, &io, &cfg); + + 
/* wolftftp endpoints carry host-order IPv4; the trigger passes the + * peer address in network order (sin_addr.s_addr), so swap it. */ + server_ep.ip = ee32(server_ip_be); + server_ep.port = WOLFTFTP_PORT; + ret = wolftftp_client_start_rrq(&g_client, &server_ep, filename); + if (ret != 0) { + uart_puts("OTA: start_rrq failed: "); uart_puthex((uint32_t)ret); + uart_puts("\n"); + wolfIP_sock_close(stack, g_sock); + g_sock = -1; + return ret; + } + g_started = 1; + uart_puts("OTA: requesting '"); uart_puts(filename); + uart_puts("' from "); uart_putip4(ee32(server_ip_be)); uart_puts("\n"); + return 0; +} + +void ota_poll(struct wolfIP *stack, uint32_t now_ms) +{ + struct wolfIP_sockaddr_in remote; + uint32_t rlen; + int n; + + (void)stack; + if (!g_started || g_sock < 0) + return; + + for (;;) { + rlen = sizeof(remote); + n = wolfIP_sock_recvfrom(g_stack, g_sock, g_rx_buf, + sizeof(g_rx_buf), 0, (struct wolfIP_sockaddr *)&remote, &rlen); + if (n <= 0) + break; + { + struct wolftftp_endpoint rep; + rep.ip = ee32(remote.sin_addr.s_addr); + rep.port = ee16(remote.sin_port); + (void)wolftftp_client_receive(&g_client, + OTA_LOCAL_PORT, &rep, g_rx_buf, (uint16_t)n); + } + } + (void)wolftftp_client_poll(&g_client, now_ms); + + if (g_done) + return; + if (g_client.state == WOLFTFTP_CLIENT_COMPLETE) { + g_done = 1; + uart_puts("OTA: transfer complete - resetting to apply update\n\n"); + ota_system_reset(); + /* not reached */ + } else if (g_client.state == WOLFTFTP_CLIENT_ERROR) { + g_done = 1; + uart_puts("OTA: transfer failed: "); + uart_puthex((uint32_t)wolftftp_client_status(&g_client)); + /* Diagnostics: how far did it get + what was negotiated. */ + uart_puts("\n staged="); uart_putdec(g_sink.bytes); + uart_puts(" blk="); uart_putdec(g_client.neg.blksize); + uart_puts(" win="); uart_putdec(g_client.neg.windowsize); + uart_puts(" tsize="); + uart_putdec(g_client.neg.have_tsize ? 
g_client.neg.tsize : 0); + uart_puts("\n exp_blk="); uart_putdec(g_client.expected_block); + uart_puts(" ack_blk="); uart_putdec(g_client.last_acked_block); + uart_puts(" retries="); uart_putdec(g_client.retries); + uart_puts(" tid_lock="); uart_putdec(g_client.tid_locked); + uart_puts(" srv_port="); uart_putdec(g_client.server.port); + uart_puts("\n keeping current image\n"); + wolfIP_sock_close(g_stack, g_sock); + g_sock = -1; + g_started = 0; + } +} + +int ota_in_progress(void) +{ + return g_started && !g_done; +} diff --git a/src/port/amd/boards/zcu102/ota.h b/src/port/amd/boards/zcu102/ota.h new file mode 100644 index 00000000..a2280f9c --- /dev/null +++ b/src/port/amd/boards/zcu102/ota.h @@ -0,0 +1,50 @@ +/* ota.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Network-delivered firmware update for the ZCU102 wolfBoot + wolfIP demo. + * The running (signed, verified) wolfIP app fetches a newer signed image + * over TFTP and stages it to the SD OFP_B partition using wolfBoot's own + * SD/disk drivers compiled into the app; a system reset then lets wolfBoot + * verify and boot the higher-version image. 
+ */ +#ifndef AMD_ZCU102_OTA_H +#define AMD_ZCU102_OTA_H + +#include <stdint.h> +#include "wolfip.h" + +/* Begin a TFTP GET of `filename` from `server_ip_be` (network byte order, + * e.g. straight from sin_addr.s_addr) and stage it to OFP_B. One transfer + * at a time; returns 0 if started, <0 on error or if already running. */ +int ota_trigger(struct wolfIP *stack, uint32_t server_ip_be, + const char *filename); + +/* Drive the in-flight transfer. Call once per main-loop iteration with the + * same millisecond clock fed to wolfIP_poll(). On a successful transfer it + * writes OFP_B and resets the board (does not return). */ +void ota_poll(struct wolfIP *stack, uint32_t now_ms); + +/* Non-zero while a transfer is in progress. */ +int ota_in_progress(void); + +/* ZynqMP system soft reset (provided by sdhci_shim.c). */ +void ota_system_reset(void); + +#endif /* AMD_ZCU102_OTA_H */ diff --git a/src/port/amd/boards/zcu102/sdhci_shim.c b/src/port/amd/boards/zcu102/sdhci_shim.c new file mode 100644 index 00000000..2c17bca0 --- /dev/null +++ b/src/port/amd/boards/zcu102/sdhci_shim.c @@ -0,0 +1,425 @@ +/* sdhci_shim.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Platform glue that lets the ZCU102 wolfIP application consume wolfBoot's + * SD-host-controller and disk drivers (src/sdhci.c, src/disk.c, src/gpt.c) + * by compiling the SAME driver source into the app - no runtime hand-off. + * The drivers call out to a handful of platform hooks; this file provides + * them for the EL2 bare-metal app: + * + * - sdhci_reg_read/write MMIO at the ZynqMP SD1 controller + * - sdhci_platform_init controller bring-up (FSBL already clocked + * SD1 to boot us, so this is a no-op) + * - sdhci_platform_irq_init polled - no IRQ + * - sdhci_platform_set_bus_mode SD (not eMMC) - no-op + * - hal_get_timer_us bridged to the port generic-timer clock + * - sdhci_platform_dma_prepare/complete D-cache maintenance so the + * controller's SDMA sees coherent buffers + * + * The SDMA engine DMAs into the caller's buffer directly (sdhci.c writes + * its address into SRS22), so every buffer handed to the disk layer - the + * app's staging buffer AND disk.c's own MBR sector buffer - must be made + * coherent. The clean/invalidate hooks below do that for buffers in the + * normal write-back-cacheable DDR the app links into. + */ +#include +#include +#include +#include "timer.h" +#include "uart.h" + +/* ZynqMP SD1 (the ZCU102 carrier micro-SD slot, boot device for SW6=SD). + * Matches wolfBoot hal/zynq.c ZYNQMP_SDHCI_BASE = ZYNQMP_SD1_BASE. */ +#define ZYNQMP_SD1_BASE 0xFF170000UL + +/* IOU_SLCR + CRL_APB registers needed to bring up the SD1 host controller, + * mirrored from wolfBoot hal/zynq.h (IOU_SLCR_BASE 0xFF180000, CRL_APB_BASE + * 0xFF5E0000). 
The SD1 slot type must be set to "Embedded" so the controller + * reports the card as always present - the ZCU102 carrier does not wire the + * card-detect pin to the controller, so without this the card-init busy-wait + * never sees a card and hangs. */ +#define IOU_SLCR_SD_CONFIG_REG2 (*(volatile uint32_t *)0xFF180320UL) +#define SD_CONFIG_REG2_SD1_SLOT_SH 28 +#define SD_CONFIG_REG2_SD1_SLOT_MSK 0x30000000UL +#define RST_LPD_IOU2 (*(volatile uint32_t *)0xFF5E0238UL) +#define RST_LPD_IOU2_SDIO1 (1UL << 6) + +/* The generic wolfBoot sdhci.c driver addresses a Cadence SD4HC controller + * (HRS registers at 0x000-0x1FF, SRS registers at 0x200+), but ZynqMP has an + * Arasan SDHCI controller with the standard SDHCI byte-offset register map and + * width-specific access requirements. sdhci_reg_read/write below translate + * between the two - this logic is copied verbatim from wolfBoot hal/zynq.c so + * the app drives the controller identically to wolfBoot. Without it, e.g. the + * HRS00 soft-reset write lands on the wrong register and the reset-poll hangs. */ +#define CADENCE_SRS_OFFSET 0x200 + +#define STD_SDHCI_SDMA_ADDR 0x00 /* SDMA System Address (32-bit) */ +#define STD_SDHCI_HOST_CTRL1 0x28 /* Host Control 1 (8-bit) */ +#define STD_SDHCI_POWER_CTRL 0x29 /* Power Control (8-bit) */ +#define STD_SDHCI_BLKGAP_CTRL 0x2A /* Block Gap Control (8-bit) */ +#define STD_SDHCI_WAKEUP_CTRL 0x2B /* Wakeup Control (8-bit) */ +#define STD_SDHCI_CLK_CTRL 0x2C /* Clock Control (16-bit) */ +#define STD_SDHCI_TIMEOUT_CTRL 0x2E /* Timeout Control (8-bit) */ +#define STD_SDHCI_SW_RESET 0x2F /* Software Reset (8-bit) */ +#define STD_SDHCI_HOST_CTRL2 0x3C /* 32-bit word holding Host Control 2 in bits [31:16] (the 16-bit register itself sits at 0x3E; bits [15:0] are Auto CMD Error Status). Accessed as one 32-bit word, which is why the SRS15 masks below are bits 28/29. */ +#define STD_SDHCI_SRA 0x01 /* Software Reset for All */ + +#define SDHCI_SRS15_A64 (1U << 29) /* 64-bit addressing */ +#define SDHCI_SRS15_HV4E (1U << 28) /* Host version 4 enable */ +#define SDHCI_SRS16_A64S (1U << 28) /* 64-bit system bus support */ + +/* Cortex-A53 data-cache line size.
*/ +#define DCACHE_LINE 64UL + +/* ZynqMP system soft reset: CRL_APB.RESET_CTRL, SOFT_RESET (bit 4). Writing + * it resets the whole PS, so the BootROM re-runs FSBL -> ... -> wolfBoot, + * which then picks the higher-version image (our freshly staged update). */ +#define CRL_APB_RESET_CTRL (*(volatile uint32_t *)0xFF5E0218UL) +#define CRL_APB_RESET_CTRL_SOFT_RST (1UL << 4) + +/* ---- SDHCI register access (Cadence SD4HC -> Arasan translation) --- */ +/* Handle reads from Cadence HRS registers (0x000-0x1FF). */ +static uint32_t zynqmp_sdhci_hrs_read(uint32_t hrs_offset) +{ + volatile uint8_t *base = (volatile uint8_t *)ZYNQMP_SD1_BASE; + + switch (hrs_offset) { + case 0x000: { /* HRS00 - Software Reset: map SRA (0x2F bit0) to SWR bit0 */ + uint8_t val = *((volatile uint8_t *)(base + STD_SDHCI_SW_RESET)); + return (val & STD_SDHCI_SRA) ? 1U : 0U; + } + case 0x010: /* HRS04 - PHY access: return ACK so wait loops don't hang */ + return (1U << 26); /* SDHCI_HRS04_UIS_ACK */ + default: + return 0; + } +} + +/* Handle writes to Cadence HRS registers (0x000-0x1FF). 
*/ +static void zynqmp_sdhci_hrs_write(uint32_t hrs_offset, uint32_t val) +{ + volatile uint8_t *base = (volatile uint8_t *)ZYNQMP_SD1_BASE; + + switch (hrs_offset) { + case 0x000: /* HRS00 - Software Reset: issue SRA via 8-bit write at 0x2F */ + if (val & 1U) + *((volatile uint8_t *)(base + STD_SDHCI_SW_RESET)) = STD_SDHCI_SRA; + break; + default: + break; + } +} + +uint32_t sdhci_reg_read(uint32_t offset) +{ + volatile uint8_t *base = (volatile uint8_t *)ZYNQMP_SD1_BASE; + + if (offset >= CADENCE_SRS_OFFSET) { + uint32_t std_off = offset - CADENCE_SRS_OFFSET; + + if (std_off == 0x58) /* SRS22 -> SRS00 legacy SDMA address */ + return *((volatile uint32_t *)(base + STD_SDHCI_SDMA_ADDR)); + if (std_off == 0x5C) /* SRS23 -> 0 (no 64-bit addressing on v3.0) */ + return 0; + { + uint32_t val = *((volatile uint32_t *)(base + std_off)); + if (std_off == 0x40) /* SRS16 Capabilities: mask A64S */ + val &= ~SDHCI_SRS16_A64S; + return val; + } + } + return zynqmp_sdhci_hrs_read(offset); +} + +void sdhci_reg_write(uint32_t offset, uint32_t val) +{ + volatile uint8_t *base = (volatile uint8_t *)ZYNQMP_SD1_BASE; + + if (offset >= CADENCE_SRS_OFFSET) { + uint32_t std_off = offset - CADENCE_SRS_OFFSET; + + /* SRS10 (0x228) = standard 0x28-0x2B, byte-wide each. */ + if (std_off == 0x28) { + *((volatile uint8_t *)(base + STD_SDHCI_HOST_CTRL1)) = + (uint8_t)(val & 0xFF); + *((volatile uint8_t *)(base + STD_SDHCI_POWER_CTRL)) = + (uint8_t)((val >> 8) & 0xFF); + *((volatile uint8_t *)(base + STD_SDHCI_BLKGAP_CTRL)) = + (uint8_t)((val >> 16) & 0xFF); + *((volatile uint8_t *)(base + STD_SDHCI_WAKEUP_CTRL)) = + (uint8_t)((val >> 24) & 0xFF); + return; + } + /* SRS11 (0x22C) = standard 0x2C (16-bit clk), 0x2E (8-bit timeout), + * 0x2F (8-bit software reset). 
*/ + if (std_off == 0x2C) { + *((volatile uint16_t *)(base + STD_SDHCI_CLK_CTRL)) = + (uint16_t)(val & 0xFFFF); + *((volatile uint8_t *)(base + STD_SDHCI_TIMEOUT_CTRL)) = + (uint8_t)((val >> 16) & 0xFF); + *((volatile uint8_t *)(base + STD_SDHCI_SW_RESET)) = + (uint8_t)((val >> 24) & 0xFF); + return; + } + /* SRS22 (0x58) -> SRS00 legacy SDMA address (also restarts DMA). */ + if (std_off == 0x58) { + *((volatile uint32_t *)(base + STD_SDHCI_SDMA_ADDR)) = val; + return; + } + if (std_off == 0x5C) /* SRS23 -> no-op */ + return; + /* SRS15 (0x3C): Arasan v3.0 lacks HV4E/A64 - mask them off. */ + if (std_off == STD_SDHCI_HOST_CTRL2) + val &= ~(SDHCI_SRS15_HV4E | SDHCI_SRS15_A64); + + *((volatile uint32_t *)(base + std_off)) = val; + return; + } + zynqmp_sdhci_hrs_write(offset, val); +} + +/* ---- SDHCI platform bring-up -------------------------------------- */ +/* The FSBL already configured the SD1 MIO pinmux and reference clock to + * boot us, so the clock tree is live. But sdhci_init() soft-resets the + * controller, and on that reset the Capabilities register re-latches the + * IOU_SLCR slot type. We must (re)assert "Embedded slot" for SD1 and pulse + * the SDIO1 controller reset so the capability is picked up - otherwise the + * controller reports no card present (the CD pin is not wired on ZCU102) + * and sdcard_card_init() busy-waits forever. The slot-type write and reset + * pulse mirror wolfBoot's own hal/zynq.c sdhci_platform_init(); the timed + * settle delay at the end of this function is an addition made here. */ +void sdhci_platform_init(void) +{ + volatile int i; + uint32_t reg; + + reg = IOU_SLCR_SD_CONFIG_REG2; + reg &= ~SD_CONFIG_REG2_SD1_SLOT_MSK; + reg |= (1UL << SD_CONFIG_REG2_SD1_SLOT_SH); /* 01 = Embedded */ + IOU_SLCR_SD_CONFIG_REG2 = reg; + + RST_LPD_IOU2 |= RST_LPD_IOU2_SDIO1; /* assert SDIO1 reset */ + for (i = 0; i < 100; i++) {} + RST_LPD_IOU2 &= ~RST_LPD_IOU2_SDIO1; /* de-assert */ + for (i = 0; i < 1000; i++) {} + + /* Real settle time for the controller + slot-type/present-state to + * stabilize after the reset.
wolfBoot's busy-loop above is marginal when + * re-initializing a controller wolfBoot already used; without this the + * Card-State-Stable bit can read 0 on the fast (non-debug) path. */ + delay_us(10000); /* 10 ms */ +} + +void sdhci_platform_irq_init(void) +{ +} + +void sdhci_platform_set_bus_mode(int is_emmc) +{ + (void)is_emmc; +} + +/* ---- Timer bridge -------------------------------------------------- */ +/* wolfBoot's sdhci.c udelay() needs microseconds. timer_now() is a raw + * up-counter at timer_freq() Hz. The seconds/remainder split avoids the + * 64-bit overflow a plain (ticks * 1000000) would hit at long uptimes; it + * equals (ticks * 1000000) / freq for all inputs. */ +uint64_t hal_get_timer_us(void) +{ + uint64_t ticks = timer_now(); + uint64_t freq = timer_freq(); + + return (ticks / freq) * 1000000ULL + ((ticks % freq) * 1000000ULL) / freq; +} + +/* ---- SDMA cache maintenance --------------------------------------- */ +/* Clean+invalidate the data cache over [start, start+sz). dc civac writes + * back any dirty line before invalidating, so it is safe even when the + * buffer's first/last cache line is shared with unrelated data (a partial + * dc ivac there would drop a neighbour's dirty bytes). Used for both + * directions: before a transfer it pushes CPU writes to memory and drops + * stale lines; after a read it drops the lines the DMA wrote underneath. 
*/ +static void dcache_civac_range(uintptr_t start, uint32_t sz) +{ + uintptr_t addr = start & ~(DCACHE_LINE - 1UL); + uintptr_t end = start + sz; + + __asm__ volatile ("dsb sy" ::: "memory"); + while (addr < end) { + __asm__ volatile ("dc civac, %0" :: "r"(addr) : "memory"); + addr += DCACHE_LINE; + } + __asm__ volatile ("dsb sy" ::: "memory"); +} + +void sdhci_platform_dma_prepare(void *buf, uint32_t sz, int is_write) +{ + (void)is_write; + dcache_civac_range((uintptr_t)buf, sz); +} + +void sdhci_platform_dma_complete(void *buf, uint32_t sz, int is_write) +{ + /* After a card->memory read the DMA wrote memory directly; invalidate + * (clean+invalidate) so the CPU reloads the fresh data. Nothing to do + * after a memory->card write. */ + if (!is_write) + dcache_civac_range((uintptr_t)buf, sz); +} + +/* ---- System reset -------------------------------------------------- */ +void ota_system_reset(void) +{ + /* Let the UART FIFO drain before the PS reset, otherwise the last log + * lines ("...resetting to apply update") are truncated and the reset + * looks like a crash. 50 ms is ample for a few lines at 115200. */ + delay_us(50000); + __asm__ volatile ("dsb sy" ::: "memory"); + CRL_APB_RESET_CTRL = CRL_APB_RESET_CTRL_SOFT_RST; + for (;;) + __asm__ volatile ("wfi"); +} + +/* ---- Optional wolfBoot_printf backend (SDHCI_DEBUG=1) -------------- */ +/* When the wolfBoot driver objects are built with -DDEBUG_UART, printf.h + * routes wolfBoot_printf() to uart_printf()/uart_write(); supply minimal + * implementations on top of the port's uart_putc(). Covers the format + * specifiers the driver debug uses: %d %u %x %X (with 0-pad width) %p %llu + * %s %c %%. Unreferenced (and dropped by -gc-sections) in non-debug builds. 
*/
+/* Emit v in decimal through uart_putc(), most-significant digit first.
+ * 20 digits is enough for the largest uint64_t value. */
+static void uart_put_u64(uint64_t v)
+{
+    char buf[20];
+    int n = 0;
+
+    if (v == 0) {
+        uart_putc('0');
+        return;
+    }
+    while (v != 0 && n < (int)sizeof(buf)) {
+        buf[n++] = (char)('0' + (v % 10ULL));
+        v /= 10ULL;
+    }
+    while (n > 0)
+        uart_putc(buf[--n]);
+}
+
+/* Emit v in hexadecimal through uart_putc(). `upper` selects A-F over a-f.
+ * The output is left-padded to `width` characters with '0' when `zero` is
+ * non-zero, otherwise with ' '; a value wider than `width` is printed in
+ * full (never truncated). */
+static void uart_put_hex(uint64_t v, int upper, int width, int zero)
+{
+    const char *d = upper ? "0123456789ABCDEF" : "0123456789abcdef";
+    char buf[16];
+    int n = 0;
+
+    if (v == 0)
+        buf[n++] = '0';
+    while (v != 0 && n < (int)sizeof(buf)) {
+        buf[n++] = d[v & 0xFULL];
+        v >>= 4;
+    }
+    while (n < width) {
+        uart_putc(zero ? '0' : ' ');
+        width--;
+    }
+    while (n > 0)
+        uart_putc(buf[--n]);
+}
+
+/* Emit `sz` raw bytes from `buf` through uart_putc(). */
+void uart_write(const char *buf, unsigned int sz)
+{
+    unsigned int i;
+
+    for (i = 0; i < sz; i++)
+        uart_putc(buf[i]);
+}
+
+/* Minimal printf: %d %u %x %X %p %s %c %%, an optional '0' flag and decimal
+ * field width (honoured by %x/%X only), and the 'l' / 'll' length modifiers
+ * on %d %u %x %X. Each length modifier fetches the matching C type (int,
+ * long, long long and their unsigned forms), so 64-bit arguments are not
+ * truncated. An unknown conversion is echoed as '%' followed by the
+ * character. A format string that ends inside a conversion ("...%", "%08",
+ * "%l") prints a literal '%' and stops. */
+void uart_printf(const char *fmt, ...)
+{
+    va_list ap;
+    int width;
+    int zero;
+    int lng;
+    char c;
+
+    va_start(ap, fmt);
+    while (*fmt != '\0') {
+        if (*fmt != '%') {
+            uart_putc(*fmt++);
+            continue;
+        }
+        fmt++;
+        zero = 0;
+        width = 0;
+        lng = 0;
+        if (*fmt == '0') {
+            zero = 1;
+            fmt++;
+        }
+        while (*fmt >= '0' && *fmt <= '9') {
+            width = width * 10 + (*fmt - '0');
+            fmt++;
+        }
+        while (*fmt == 'l') {
+            lng++;
+            fmt++;
+        }
+        c = *fmt;
+        if (c == '\0') {
+            /* The format ended mid-conversion. Do not advance past the
+             * terminator: the outer loop would then read beyond the string. */
+            uart_putc('%');
+            break;
+        }
+        fmt++;
+        switch (c) {
+        case 'd': {
+            long long v;
+
+            if (lng >= 2)
+                v = va_arg(ap, long long);
+            else if (lng == 1)
+                v = va_arg(ap, long);
+            else
+                v = va_arg(ap, int);
+            if (v < 0) {
+                uart_putc('-');
+                /* Negate in unsigned arithmetic so LLONG_MIN is defined. */
+                uart_put_u64(0ULL - (uint64_t)v);
+            } else {
+                uart_put_u64((uint64_t)v);
+            }
+            break;
+        }
+        case 'u':
+        case 'x':
+        case 'X': {
+            uint64_t v;
+
+            if (lng >= 2)
+                v = va_arg(ap, unsigned long long);
+            else if (lng == 1)
+                v = va_arg(ap, unsigned long);
+            else
+                v = va_arg(ap, unsigned int);
+            if (c == 'u')
+                uart_put_u64(v);
+            else
+                uart_put_hex(v, c == 'X', width, zero);
+            break;
+        }
+        case 'p':
+            uart_putc('0');
+            uart_putc('x');
+            uart_put_hex((uint64_t)(uintptr_t)va_arg(ap, void *), 0, 0, 0);
+            break;
+        case 's': {
+            const char *s = va_arg(ap, const char *);
+            uart_puts(s != NULL ? s : "(null)");
+            break;
+        }
+        case 'c':
+            uart_putc((char)va_arg(ap, int));
+            break;
+        case '%':
+            uart_putc('%');
+            break;
+        default:
+            uart_putc('%');
+            uart_putc(c);
+            break;
+        }
+    }
+    va_end(ap);
+}
diff --git a/src/port/amd/boards/zcu102/syscalls_stub.c b/src/port/amd/boards/zcu102/syscalls_stub.c
new file mode 100644
index 00000000..2dc3002d
--- /dev/null
+++ b/src/port/amd/boards/zcu102/syscalls_stub.c
@@ -0,0 +1,123 @@
+/* syscalls_stub.c
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * wolfIP is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfIP is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ *
+ * Minimal newlib syscall stubs for the OTA build. The wolfIP TFTP client
+ * uses snprintf(), which pulls in newlib's reent/stdio machinery; that in
+ * turn references these POSIX hooks at link time even though the bounded
+ * integer/string formatting the client does never actually calls them. The
+ * lean (non-OTA) app avoids stdio entirely and so needs none of this.
+ *
+ * _sbrk hands out a small static heap so that any incidental allocation
+ * succeeds rather than corrupting memory; the rest fail cleanly with ENOSYS.
+ */
+#include
+#include
+#include
+#include
+
+/* Static heap for _sbrk. snprintf's non-float conversions don't allocate,
+ * so this is a safety margin, not a hot path.
Aligned to 16 bytes: newlib's + * malloc rounds its sbrk returns off the heap base, so a 16-byte-aligned base + * keeps any incidental allocation correctly aligned for AArch64. */ +#ifndef OTA_HEAP_SIZE +#define OTA_HEAP_SIZE (64U * 1024U) +#endif +static char ota_heap[OTA_HEAP_SIZE] __attribute__((aligned(16))); +static char *ota_brk = ota_heap; + +void *_sbrk(ptrdiff_t incr) +{ + char *prev = ota_brk; + + if (incr < 0 || (size_t)(incr) > (size_t)(&ota_heap[OTA_HEAP_SIZE] - ota_brk)) { + errno = ENOMEM; + return (void *)-1; + } + ota_brk += incr; + return prev; +} + +int _close(int fd) +{ + (void)fd; + errno = ENOSYS; + return -1; +} + +off_t _lseek(int fd, off_t off, int whence) +{ + (void)fd; + (void)off; + (void)whence; + errno = ENOSYS; + return (off_t)-1; +} + +ssize_t _read(int fd, void *buf, size_t len) +{ + (void)fd; + (void)buf; + (void)len; + errno = ENOSYS; + return -1; +} + +ssize_t _write(int fd, const void *buf, size_t len) +{ + (void)fd; + (void)buf; + (void)len; + errno = ENOSYS; + return -1; +} + +int _fstat(int fd, struct stat *st) +{ + (void)fd; + if (st != NULL) + st->st_mode = S_IFCHR; + return 0; +} + +int _isatty(int fd) +{ + (void)fd; + return 1; +} + +int _kill(int pid, int sig) +{ + (void)pid; + (void)sig; + errno = ENOSYS; + return -1; +} + +int _getpid(void) +{ + return 1; +} + +void _exit(int code) +{ + (void)code; + for (;;) + __asm__ volatile ("wfi"); +} diff --git a/src/port/amd/boards/zynq7000/Makefile b/src/port/amd/boards/zynq7000/Makefile index c3f577db..658222e3 100644 --- a/src/port/amd/boards/zynq7000/Makefile +++ b/src/port/amd/boards/zynq7000/Makefile @@ -29,7 +29,18 @@ CFLAGS += $(CFLAGS_EXTRA) ASFLAGS := -mcpu=cortex-a9 -marm -LDSCRIPT := target.ld +# Memory layout: ocm (default) runs uncached from OCM (JTAG / FSBL bring-up); +# ddr runs cached from DDR @ 0x10000000 (the wolfBoot / fast path). 
+LAYOUT ?= ocm +ifeq ($(LAYOUT),ddr) + LDSCRIPT := target_ddr.ld + CFLAGS += -DAMD_LAYOUT_DDR +else ifeq ($(LAYOUT),ocm) + LDSCRIPT := target.ld + CFLAGS += -DAMD_LAYOUT_OCM +else + $(error LAYOUT must be 'ocm' or 'ddr') +endif LDFLAGS := -nostdlib -nostartfiles -T $(LDSCRIPT) -Wl,-gc-sections # Override newlib's memset/memcpy with bytewise variants in main.c # (the same "fast memset uses an instruction the bare-metal setup @@ -45,6 +56,21 @@ LOCAL_OBJS := $(LOCAL_C:.c=.o) $(LOCAL_S:.S=.o) WOLFIP_OBJ := wolfip.o OBJS := $(LOCAL_OBJS) $(WOLFIP_OBJ) +# A change in LAYOUT must force a full rebuild: the objects link against a +# different load address and AMD_LAYOUT_* define. The stamp's mtime only +# bumps when LAYOUT actually changes. +LAYOUT_STAMP := .layout_stamp +$(LAYOUT_STAMP): FORCE + @if [ "`cat $@ 2>/dev/null`" != "$(LAYOUT)" ]; then \ + echo "LAYOUT -> $(LAYOUT) (was `cat $@ 2>/dev/null`); forcing rebuild"; \ + echo "$(LAYOUT)" > $@; \ + fi +$(OBJS): $(LAYOUT_STAMP) + +# The first explicit target rule above is $(LAYOUT_STAMP), which would otherwise +# become make's default goal; pin the default goal to 'all'. +.DEFAULT_GOAL := all + # Shared sources live outside this board dir; find them by vpath so the # .o files still land here (keeps clean + JTAG app.elf-in-place working). vpath %.c $(COMMON):$(ARCH):$(IP) @@ -68,13 +94,16 @@ $(WOLFIP_OBJ): $(ROOT)/src/wolfip.c $(CC) $(ASFLAGS) -c $< -o $@ clean: - rm -f $(OBJS) app.elf BOOT.BIN + rm -f $(OBJS) app.elf BOOT.BIN $(LAYOUT_STAMP) + +FORCE: -.PHONY: all clean help +.PHONY: all clean help FORCE help: @echo "Zynq-7000 wolfIP build (Cortex-A9):" - @echo " make - build app.elf" + @echo " make - build app.elf (OCM layout)" + @echo " make LAYOUT=ddr - build for DDR @ 0x10000000 (wolfBoot/fast)" @echo " make clean - remove artifacts" @echo "" @echo "Override CROSS_COMPILE if your toolchain prefix differs." 
diff --git a/src/port/amd/boards/zynq7000/target_ddr.ld b/src/port/amd/boards/zynq7000/target_ddr.ld new file mode 100644 index 00000000..f5864d1e --- /dev/null +++ b/src/port/amd/boards/zynq7000/target_ddr.ld @@ -0,0 +1,102 @@ +/* Zynq-7000 (Cortex-A9) Linker Script - DDR layout + * + * Used when the app runs from DDR (cached code, much faster than the + * uncached OCM layout) - e.g. loaded by wolfBoot or JTAG-dow'd after the + * FSBL has trained DDR. + * + * Memory map: + * DDR : 0x10000000 .. 0x10FFFFFF (16 MB window; matches wolfBoot's + * WOLFBOOT_LOAD_ADDRESS = 0x10000000) + * OCM : 0xFFFC0000 .. 0xFFFFFFFF (still mapped; unused by this layout) + * + * The first instruction of the image (the reset vector at 0x10000000) is + * `b _start` (.vectors leads the image), so a loader that jumps to the load + * address - and wolfBoot's image format - both reach the entry. _start + * reprograms VBAR to _vectors afterwards. + */ + +OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm") +OUTPUT_ARCH(arm) +ENTRY(_start) + +MEMORY +{ + DDR (rwx) : ORIGIN = 0x10000000, LENGTH = 0x01000000 /* 16 MB */ + OCM (rwx) : ORIGIN = 0xFFFC0000, LENGTH = 0x00040000 /* still mapped */ +} + +_stack_top = 0x10FFF000; + +PHDRS +{ + text PT_LOAD FLAGS(5); /* RX */ + data PT_LOAD FLAGS(6); /* RW */ +} + +SECTIONS +{ + /* .vectors leads the image so the word at 0x10000000 is `b _start` + * (the reset vector doubles as the wolfBoot entry header). */ + .vectors : + { + . = ALIGN(32); /* VBAR alignment requirement */ + KEEP(*(.vectors)) + } > DDR :text + + .text : + { + . = ALIGN(4); + *(.text*) + *(.rodata*) + . = ALIGN(4); + } > DDR :text + + .data : + { + . = ALIGN(4); + _sdata = .; + *(.data*) + . = ALIGN(4); + _edata = .; + } > DDR :text + + .bss (NOLOAD) : + { + . = ALIGN(4); + _sbss = .; + *(.bss*) + *(COMMON) + . = ALIGN(4); + _ebss = .; + } > DDR :text + + .page_tables (NOLOAD) : + { + . 
= ALIGN(16384); /* TTBR0 wants 16 KB alignment */ + _page_tables_start = .; + *(.page_tables) + . = ALIGN(16384); + _page_tables_end = .; + } > DDR :text + + /* DMA buffers get their own 1 MB-aligned block so the MMU can map just + * this range Normal-NC (the ARMv7 section granularity is 1 MB) without + * making the .text block non-cacheable. NC is required for the GEM + * rings - see mmu_armv7.c (HIGH-2). */ + .dma_buffers (NOLOAD) : + { + . = ALIGN(0x100000); + _dma_buffers_start = .; + *(.dma_buffers) + . = ALIGN(64); + _dma_buffers_end = .; + } > DDR :text + + /DISCARD/ : + { + *(.note.*) + *(.comment) + *(.ARM.attributes) + *(.eh_frame*) + } +} diff --git a/src/port/amd/common/app.c b/src/port/amd/common/app.c index cb8c0a97..da33bd52 100644 --- a/src/port/amd/common/app.c +++ b/src/port/amd/common/app.c @@ -32,6 +32,13 @@ #include "gem.h" #include "timer.h" #include "app.h" +#ifdef WOLFIP_OTA +#include "ota.h" +/* Filename the host TFTP server offers for the signed update image. */ +#ifndef OTA_FILENAME +#define OTA_FILENAME "wolfip_update.bin" +#endif +#endif #define ECHO_PORT 7 #define RX_BUF_SIZE 1500 @@ -140,6 +147,16 @@ static void udp_echo_cb(int fd, uint16_t event, void *arg) * (first octet in the high byte), so swap before printing. */ uart_puts(" bytes from "); uart_putip4(ee32(peer.sin_addr.s_addr)); uart_puts("\n"); +#ifdef WOLFIP_OTA + /* A datagram beginning "UPDATE" triggers a network-delivered + * firmware update: fetch OTA_FILENAME over TFTP from the sender + * and stage it to OFP_B. The downloaded image is authenticated by + * wolfBoot on the next boot, so the trigger itself need not be. 
*/ + if (n >= 6 && memcmp(udp_rx_buf, "UPDATE", 6) == 0 + && !ota_in_progress()) { + (void)ota_trigger(s, peer.sin_addr.s_addr, OTA_FILENAME); + } +#endif } } #else /* SPEED_TEST */ @@ -425,7 +442,11 @@ int main(void) } #else for (;;) { - (void)wolfIP_poll(IPStack, app_now_ms()); + uint64_t now = app_now_ms(); + (void)wolfIP_poll(IPStack, now); +#ifdef WOLFIP_OTA + ota_poll(IPStack, (uint32_t)now); +#endif } #endif