diff --git a/.gitignore b/.gitignore index 6a316c6..6585849 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,6 @@ test/samples/ # Internal design notes and AI planning artifacts - not for distribution funnelcake.md docs/superpowers/ +.claude/settings.local.json +*.profdata +*.profraw diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..ab6e737 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,71 @@ +# Changelog + +All notable changes to funnelcake are recorded here. Format follows +[Keep a Changelog](https://keepachangelog.com/en/1.1.0/); the project does +not yet follow a tagged release cadence, so changes accumulate under +**Unreleased** until a tag is cut. + +## [Unreleased] + +### Added +- `FUSED_ERR_OUT_OF_MEMORY` (-5) hard-error code returned by + `fused_scaler_init` and `fused_hdr_init` when allocation of the internal + state struct fails. Previously these paths returned the misleading + `FUSED_ERR_NO_STEPS`. +- `FUSED_LOG_INFO` (2) log level for low-frequency status / diagnostic + messages. Routed through the existing `log_warnings` config so callers + that install a `FUSED_LOG_CALLBACK` can filter info out by inspecting + the `level` argument while still receiving warnings. + +### Changed +- The "tone map LUTs generated" diagnostic in `fused_tonemap_generate_luts` + now logs at `FUSED_LOG_INFO` instead of `FUSED_LOG_WARN`. Callers that + previously suppressed it via `log_warnings = FUSED_LOG_SUPPRESS` keep + the same behavior; callback-based loggers can now keep warnings while + dropping info. +- The "no SIMD support detected" notice from both init functions now + routes through `fused_log(&ctx->log_warnings, FUSED_LOG_WARN, …)` + instead of writing to `stderr` directly, so users with a configured + log target see it where they expect. +- `fused_scaler_free` and `fused_hdr_free` now also reset + `effective_width` and `effective_height` on the context, matching the + reset of the other result fields. + +### Fixed +- **HDR init memory leak**: `fused_hdr_init` could leak `state->sdr_temp[i]` + buffers if SDR-only steps had been allocated successfully and a later + step (the 1:1 tonemap output, or the "no valid steps" check) failed. + The error paths called `fused_hdr_free(ctx)` while `ctx->_internal` was + still NULL, skipping the cleanup of `state`'s sdr_temp pointers, and + then `free(state)` released the struct without freeing those buffers. + Init now attaches `state` to `ctx->_internal` immediately after + allocation so any subsequent error path goes through `fused_hdr_free` + and releases everything. +- The misaligned-source warning emitted by `fused_scaler_run` and + `fused_hdr_run` used a process-wide `static int warned` flag, so the + first context to encounter a misaligned source silenced the warning + for every other context in the process (and the flag was not + thread-safe). Each context now owns its own `src_misaligned_warned` + flag inside its internal state. +- The misaligned-source warning was missing a trailing newline. + +### Documentation +- `docs/API.md`: documented `FUSED_LOG_INFO`, `FUSED_ERR_OUT_OF_MEMORY`, + and the relationship between log levels and the routing config. + +--- + +## Conventions + +- **Added** — new public API surface (functions, constants, struct fields). +- **Changed** — non-breaking behavioral changes to existing API. +- **Deprecated** — APIs scheduled for removal. +- **Removed** — APIs that have been deleted. +- **Fixed** — bug fixes that don't change documented behavior. +- **Security** — vulnerability fixes. +- **Performance** — measurable speed/memory wins, with a one-line summary + of the workload and the delta. +- **Documentation** — doc-only changes worth noting. + +Group breaking changes under their own **Breaking** subsection and call +out the impact on callers. diff --git a/INSTALL.md b/INSTALL.md index 0468dc4..3c86a80 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -125,3 +125,59 @@ Artifacts are written to `dist/` as per-architecture tarballs containing release archives are built with `CC=clang LTO=0` so the static library contains standard object files suitable for downstream linkers that do not understand Clang LTO bitcode. + +## macOS Release Artifacts + +To build a native macOS release archive: + + ./scripts/build-macos.sh + +The script must be run on macOS with Xcode command line tools installed. It +writes `dist/funnelcake-macos-.tar.gz` containing `libfunnelcake.a`, +`include/funnelcake.h`, the README, install notes, and `BUILD_INFO`. + +## Windows Release Artifacts + +To build Windows release archives: + + ./scripts/build-windows.sh # bash / MSYS2 / Git Bash + scripts\build-windows.ps1 # native PowerShell 5.1+ + +By default the script builds every Windows target whose toolchain is available. +MinGW-w64 artifacts contain `libfunnelcake.a`; MSVC artifacts contain +`funnelcake.lib`. Both package layouts include `include/funnelcake.h`, the +README, install notes, and `BUILD_INFO`. + +For MinGW-w64 only: + + ./scripts/build-windows.sh --mingw + scripts\build-windows.ps1 -Mingw + +For `x86_64` MinGW, install tools that provide `x86_64-w64-mingw32-gcc` and +`x86_64-w64-mingw32-ar`. For Windows on ARM64 MinGW, install tools that provide +`aarch64-w64-mingw32-gcc` and `aarch64-w64-mingw32-ar`. + +For MSVC only, run from a Visual Studio developer shell where `cl.exe` and +`lib.exe` are in `PATH`: + + ./scripts/build-windows.sh --msvc + scripts\build-windows.ps1 -Msvc + +Both MinGW and MSVC builds use the normal source selection (AVX2 on `x86_64`, +NEON on `aarch64`/ARM64). The NEON kernels guard on `__aarch64__ || _M_ARM64`, +so MSVC ARM64 picks up the same SIMD coverage as the MinGW cross-compile. + +### Windows ARM64 only + +For a Windows-on-ARM64 build without touching the x86_64 paths, use the +dedicated PowerShell driver: + + scripts\build-windows-arm64.ps1 # MinGW + MSVC, whichever is available + scripts\build-windows-arm64.ps1 -Mingw # cross-compile via aarch64-w64-mingw32-gcc + scripts\build-windows-arm64.ps1 -Msvc # native ARM64 MSVC + +The `-Msvc` path requires an "ARM64 Native Tools Command Prompt for VS" or +an equivalent Developer PowerShell with `VSCMD_ARG_TGT_ARCH=arm64`; the +script refuses to run if the shell is not configured for ARM64. Both +artifact layouts mirror `build-windows.ps1`: a per-toolchain `dist/` +package plus a `.zip.sha256`. diff --git a/Makefile b/Makefile index bd6899a..4ba0dac 100644 --- a/Makefile +++ b/Makefile @@ -15,6 +15,10 @@ TEST_OPT = -O2 UNAME_M := $(shell uname -m) UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Windows_NT) + CFLAGS_BASE += -D__USE_MINGW_ANSI_STDIO=1 +endif + # Normalize FreeBSD's "amd64" to "x86_64" so the SIMD-selection blocks # below match. FreeBSD/arm64 already reports "aarch64". ifeq ($(UNAME_S),FreeBSD) diff --git a/README.md b/README.md index e9971de..4272c82 100644 --- a/README.md +++ b/README.md @@ -620,7 +620,8 @@ benchmark comparison; without it the library and headers install but The scalar fallback is correct on all platforms but significantly slower. On hardware without AVX2, NEON, or RVV, the library logs a one-time notice -to stderr at first init. +through the configured `log_warnings` channel at first init (default: +stderr). ## HDR10 support diff --git a/docs/API.md b/docs/API.md index ddf2b6b..b6a2d98 100644 --- a/docs/API.md +++ b/docs/API.md @@ -221,9 +221,9 @@ struct means write to stderr. | Field | Type | Description | |-------|------|-------------| -| `target` | `int` | One of the `FUSED_LOG_*` constants. | +| `target` | `int` | One of the `FUSED_LOG_*` target constants. | | `file` | `FILE *` | Used when `target == FUSED_LOG_FILE`. Must be a valid open file. | -| `callback` | `void (*)(int level, const char *msg, void *ctx)` | Used when `target == FUSED_LOG_CALLBACK`. `level` is `FUSED_LOG_ERROR` or `FUSED_LOG_WARN`. | +| `callback` | `void (*)(int level, const char *msg, void *ctx)` | Used when `target == FUSED_LOG_CALLBACK`. `level` is one of `FUSED_LOG_ERROR`, `FUSED_LOG_WARN`, `FUSED_LOG_INFO`. | | `callback_ctx` | `void *` | Passed through opaquely as the `ctx` argument to `callback`. | Log target constants: @@ -236,6 +236,15 @@ Log target constants: | `FUSED_LOG_SUPPRESS` | 3 | Discard all messages | | `FUSED_LOG_CALLBACK` | 4 | Call `config.callback` | +Log level constants (passed to callbacks; stderr/stdout/file targets emit +every message regardless of level): + +| Constant | Value | Meaning | +|----------|-------|---------| +| `FUSED_LOG_ERROR` | 0 | Hard error — init failed, no resources allocated. Routed via `log_errors`. | +| `FUSED_LOG_WARN` | 1 | Partial success or fallback — request still produced output. Routed via `log_warnings`. | +| `FUSED_LOG_INFO` | 2 | Low-frequency status / diagnostic. Routed via `log_warnings`; filter on `level` in a callback to drop. | + ## Scale Step Flags @@ -515,6 +524,7 @@ are valid, and `fused_scaler_run` must not be called. | `FUSED_ERR_NO_STEPS` | -2 | No valid step flags remain after filtering (all were rejected or none were set). | | `FUSED_ERR_BAD_DIMENSIONS` | -3 | `src_width` or `src_height` is <= 0, or too small for the requested steps. | | `FUSED_ERR_BAD_ALIGNMENT` | -4 | `src_y_stride` or `src_uv_stride` is not 32-byte aligned. | +| `FUSED_ERR_OUT_OF_MEMORY` | -5 | Allocation of internal state failed; output buffers (if any were allocated earlier in init) have already been released. | ## Alignment Requirements @@ -678,9 +688,12 @@ scaler.log_warnings.callback = my_log; scaler.log_warnings.callback_ctx = my_logger_instance; ``` -The `level` argument to the callback is `FUSED_LOG_ERROR` (0) or -`FUSED_LOG_WARN` (1). The `msg` string is a complete formatted message; -do not call `fused_scaler_*` functions from within the callback. +The `level` argument to the callback is `FUSED_LOG_ERROR` (0), +`FUSED_LOG_WARN` (1), or `FUSED_LOG_INFO` (2). Info-level messages +(e.g. "tone map LUTs generated") share the `log_warnings` config — to +keep warnings but drop info, install a callback and filter on `level`. +The `msg` string is a complete formatted message; do not call +`fused_scaler_*` functions from within the callback. ## HDR10 API Reference diff --git a/funnelcake.pc b/funnelcake.pc new file mode 100644 index 0000000..f7dff74 --- /dev/null +++ b/funnelcake.pc @@ -0,0 +1,11 @@ +prefix=/usr/local +exec_prefix=${prefix} +libdir=/usr/local/lib +includedir=/usr/local/include + +Name: funnelcake +Description: SIMD YUV scaler with HDR/SDR tonemapping +Version: 0.1.0 +Cflags: -I${includedir} +Libs: -L${libdir} -lfunnelcake +Libs.private: -lm diff --git a/include/funnelcake.h b/include/funnelcake.h index 595eb36..0b99dc9 100644 --- a/include/funnelcake.h +++ b/include/funnelcake.h @@ -140,14 +140,22 @@ extern "C" { #define FUSED_ERR_NO_STEPS (-2) /* no valid step flags set after filtering */ #define FUSED_ERR_BAD_DIMENSIONS (-3) /* src_width/height <= 0 or too small */ #define FUSED_ERR_BAD_ALIGNMENT (-4) /* strides not 32-byte aligned */ +#define FUSED_ERR_OUT_OF_MEMORY (-5) /* allocation of internal state failed */ /* -------------------------------------------------------------------------- * Log levels + * + * Levels are passed to FUSED_LOG_CALLBACK callbacks (which can filter on + * them); stderr/stdout/file targets emit every message regardless of level. + * Routing of info-level diagnostics shares the warnings logger config, so + * callers that want to drop info but keep warnings should install a callback + * and filter by level. * -------------------------------------------------------------------------- */ #define FUSED_LOG_ERROR 0 #define FUSED_LOG_WARN 1 +#define FUSED_LOG_INFO 2 /* low-frequency diagnostic / status messages */ /* -------------------------------------------------------------------------- diff --git a/libfunnelcake.1.dylib b/libfunnelcake.1.dylib new file mode 100755 index 0000000..f6dfd48 Binary files /dev/null and b/libfunnelcake.1.dylib differ diff --git a/scripts/build-macos.sh b/scripts/build-macos.sh new file mode 100755 index 0000000..08b17ad --- /dev/null +++ b/scripts/build-macos.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" +DIST_DIR="${REPO_ROOT}/dist" +BUILD_DATE="$(date -u +%Y%m%dT%H%M%SZ)" +HOST_OS="$(uname -s)" +HOST_ARCH="$(uname -m)" + +usage() { + cat <<'EOF' +Usage: scripts/build-macos.sh + +Build a native macOS release archive. This script must run on macOS with +Xcode command line tools installed. +EOF +} + +if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then + usage + exit 0 +fi + +if [ "$#" -gt 0 ]; then + echo "error: unknown option: $1" >&2 + usage >&2 + exit 1 +fi + +require_tool() { + if ! command -v "$1" >/dev/null 2>&1; then + echo "error: required tool not found: $1" >&2 + exit 1 + fi +} + +if [ "${HOST_OS}" != "Darwin" ]; then + echo "error: macOS artifacts require a Darwin host" >&2 + exit 1 +fi + +require_tool clang +require_tool make +require_tool shasum +require_tool tar + +mkdir -p "${DIST_DIR}" + +package_dir="funnelcake-macos-${HOST_ARCH}" + +echo "==> Building macOS ${HOST_ARCH} artifact" +( + cd "${REPO_ROOT}" + make clean + make lib CC=clang LTO=0 UNAME_S=Darwin UNAME_M="${HOST_ARCH}" +) + +rm -rf "${DIST_DIR}/${package_dir}" +mkdir -p "${DIST_DIR}/${package_dir}/include" +cp "${REPO_ROOT}/libfunnelcake.a" "${DIST_DIR}/${package_dir}/" +cp "${REPO_ROOT}/include/funnelcake.h" "${DIST_DIR}/${package_dir}/include/" +cp "${REPO_ROOT}/README.md" "${REPO_ROOT}/INSTALL.md" "${DIST_DIR}/${package_dir}/" + +{ + printf '%s\n' "name=funnelcake" + printf '%s\n' "target_os=macos" + printf '%s\n' "target_arch=${HOST_ARCH}" + printf '%s\n' "compiler=clang" + printf '%s\n' "lto=0" + printf '%s\n' "build_date=${BUILD_DATE}" +} > "${DIST_DIR}/${package_dir}/BUILD_INFO" + +rm -f "${DIST_DIR}/${package_dir}.tar.gz" "${DIST_DIR}/${package_dir}.tar.gz.sha256" +( + cd "${DIST_DIR}" + tar -czf "${package_dir}.tar.gz" "${package_dir}" + shasum -a 256 "${package_dir}.tar.gz" > "${package_dir}.tar.gz.sha256" +) + +( + cd "${REPO_ROOT}" + make clean +) + +echo "" +echo "Artifacts written to ${DIST_DIR}:" +echo " ${package_dir}.tar.gz" +echo " ${package_dir}.tar.gz.sha256" diff --git a/scripts/build-windows-arm64.ps1 b/scripts/build-windows-arm64.ps1 new file mode 100644 index 0000000..3822b77 --- /dev/null +++ b/scripts/build-windows-arm64.ps1 @@ -0,0 +1,222 @@ +#Requires -Version 5.1 +[CmdletBinding()] +param( + [switch]$All, + [switch]$Mingw, + [switch]$Msvc, + [switch]$Help +) + +$ErrorActionPreference = 'Stop' + +$ScriptDir = $PSScriptRoot +$RepoRoot = (Resolve-Path (Join-Path $ScriptDir '..')).Path +$DistDir = Join-Path $RepoRoot 'dist' +$BuildDir = Join-Path $RepoRoot 'build\windows' +$BuildDate = (Get-Date).ToUniversalTime().ToString('yyyyMMddTHHmmssZ') +$Artifacts = New-Object System.Collections.Generic.List[string] + +function Show-Usage { +@' +Usage: scripts\build-windows-arm64.ps1 [-All] [-Mingw] [-Msvc] + +Build Windows ARM64 (aarch64) release archives: + -All build every supported ARM64 target available on this host (default) + -Mingw build a MinGW-w64 aarch64 static library package (cross compile) + -Msvc build an MSVC ARM64 static library package from a Visual Studio + ARM64 developer shell + +The MinGW target requires the aarch64-w64-mingw32-gcc cross compiler. It +produces a NEON-enabled libfunnelcake.a (the GCC __aarch64__ macro selects +the NEON kernel sources). + +The MSVC target requires cl.exe and lib.exe configured for ARM64, usually by +running from "ARM64 Native Tools Command Prompt for VS" or an equivalent +Developer PowerShell with VSCMD_ARG_TGT_ARCH=arm64. The MSVC build includes +the NEON kernels (gated on _M_ARM64 in addition to __aarch64__), matching +the MinGW build's instruction-set coverage. +'@ +} + +if ($Help) { Show-Usage; return } + +# default: build everything when no flag is given +if (-not $Mingw -and -not $Msvc -and -not $All) { $Mingw = $true; $Msvc = $true } +if ($All) { $Mingw = $true; $Msvc = $true } + +function Test-Tool { + param([string]$Name) + [bool](Get-Command $Name -ErrorAction SilentlyContinue) +} + +function Resolve-MakeTool { + if (Test-Tool 'make') { return 'make' } + if (Test-Tool 'mingw32-make') { return 'mingw32-make' } + return $null +} + +function Write-AsciiFile { + param([string]$Path, [string]$Content) + $enc = New-Object System.Text.UTF8Encoding($false) + [System.IO.File]::WriteAllText($Path, $Content, $enc) +} + +function Get-Sha256Line { + param([string]$Path) + $hash = (Get-FileHash -Algorithm SHA256 -Path $Path).Hash.ToLower() + $name = Split-Path -Leaf $Path + return "$hash $name`n" +} + +function New-PackageDir { + param( + [string]$PackageDir, + [string]$TargetArch, + [string]$Compiler, + [string]$LibraryPath, + [string]$LibraryName + ) + $dest = Join-Path $DistDir $PackageDir + if (Test-Path $dest) { Remove-Item -Recurse -Force $dest } + New-Item -ItemType Directory -Path (Join-Path $dest 'include') -Force | Out-Null + Copy-Item $LibraryPath (Join-Path $dest $LibraryName) + Copy-Item (Join-Path $RepoRoot 'include\funnelcake.h') (Join-Path $dest 'include') + Copy-Item (Join-Path $RepoRoot 'README.md') $dest + Copy-Item (Join-Path $RepoRoot 'INSTALL.md') $dest + + $info = @( + 'name=funnelcake' + 'target_os=windows' + "base_distribution=$Compiler" + "target_arch=$TargetArch" + "compiler=$Compiler" + 'lto=0' + "build_date=$BuildDate" + ) -join "`n" + Write-AsciiFile -Path (Join-Path $dest 'BUILD_INFO') -Content ($info + "`n") +} + +function New-Zip { + param([string]$PackageDir) + $zip = Join-Path $DistDir "$PackageDir.zip" + $sha = "$zip.sha256" + $source = Join-Path $DistDir $PackageDir + if (Test-Path $zip) { Remove-Item -Force $zip } + if (Test-Path $sha) { Remove-Item -Force $sha } + Compress-Archive -Path $source -DestinationPath $zip -Force + Write-AsciiFile -Path $sha -Content (Get-Sha256Line $zip) + $Artifacts.Add("$PackageDir.zip") | Out-Null + $Artifacts.Add("$PackageDir.zip.sha256") | Out-Null +} + +function Invoke-MingwArm64Build { + $arch = 'aarch64' + $cc = 'aarch64-w64-mingw32-gcc' + $arTool = 'aarch64-w64-mingw32-ar' + $package = "funnelcake-windows-mingw-$arch" + $makeTool = Resolve-MakeTool + + if (-not (Test-Tool $cc) -or -not (Test-Tool $arTool) -or -not $makeTool) { + Write-Host "==> Skipping MinGW ${arch}: $cc, $arTool, and make are required" + return + } + + Write-Host "==> Building Windows MinGW $arch artifact" + Push-Location $RepoRoot + try { + & $makeTool clean + if ($LASTEXITCODE -ne 0) { throw "$makeTool clean failed" } + & $makeTool lib "CC=$cc" "AR=$arTool" 'LTO=0' 'UNAME_S=Windows_NT' "UNAME_M=$arch" + if ($LASTEXITCODE -ne 0) { throw "$makeTool lib failed" } + } + finally { Pop-Location } + + New-PackageDir -PackageDir $package -TargetArch $arch -Compiler 'mingw-w64' ` + -LibraryPath (Join-Path $RepoRoot 'libfunnelcake.a') -LibraryName 'libfunnelcake.a' + New-Zip -PackageDir $package +} + +function Test-MsvcArm64Target { + $val = $env:VSCMD_ARG_TGT_ARCH + if (-not $val) { $val = $env:PROCESSOR_ARCHITECTURE } + return ($val -match '^(arm64|ARM64)$') +} + +function Invoke-MsvcArm64Build { + $commonSources = @( + 'src\funnelcake.c', + 'src\funnelcake_hdr.c', + 'src\log.c', + 'src\detect.c', + 'src\kernels_scalar.c', + 'src\kernels_hdr_scalar.c', + 'src\tonemap.c', + 'src\kernels_upscale_scalar.c', + 'src\kernels_neon.c', + 'src\kernels_hdr_neon.c', + 'src\kernels_upscale_neon.c' + ) + + if (-not (Test-Tool 'cl') -or -not (Test-Tool 'lib')) { + Write-Host '==> Skipping MSVC: cl.exe and lib.exe are required' + return + } + + if (-not (Test-MsvcArm64Target)) { + Write-Host '==> Skipping MSVC: this shell does not target ARM64 (set VSCMD_ARG_TGT_ARCH=arm64 or use the ARM64 Native Tools prompt)' + return + } + + $arch = 'arm64' + $package = "funnelcake-windows-msvc-$arch" + $objDir = Join-Path $BuildDir "msvc-$arch" + $libPath = Join-Path $objDir 'funnelcake.lib' + $includeDir = Join-Path $RepoRoot 'include' + $srcDir = Join-Path $RepoRoot 'src' + + Write-Host "==> Building Windows MSVC $arch artifact" + if (Test-Path $objDir) { Remove-Item -Recurse -Force $objDir } + New-Item -ItemType Directory -Path $objDir -Force | Out-Null + + $objs = @() + foreach ($rel in $commonSources) { + $srcPath = Join-Path $RepoRoot $rel + $objPath = Join-Path $objDir ([IO.Path]::GetFileNameWithoutExtension($rel) + '.obj') + & cl /nologo /std:c11 /O2 /W4 /WX ` + /D_CRT_SECURE_NO_WARNINGS /D_POSIX_C_SOURCE=200112L ` + "/I$includeDir" "/I$srcDir" ` + "/Fo$objPath" /c $srcPath + if ($LASTEXITCODE -ne 0) { throw "cl failed for $rel" } + $objs += $objPath + } + + & lib /nologo "/OUT:$libPath" $objs + if ($LASTEXITCODE -ne 0) { throw 'lib failed' } + + New-PackageDir -PackageDir $package -TargetArch $arch -Compiler 'msvc' ` + -LibraryPath $libPath -LibraryName 'funnelcake.lib' + New-Zip -PackageDir $package +} + +New-Item -ItemType Directory -Path $DistDir -Force | Out-Null +New-Item -ItemType Directory -Path $BuildDir -Force | Out-Null + +if ($Mingw) { Invoke-MingwArm64Build } +if ($Msvc) { Invoke-MsvcArm64Build } + +$finalMake = Resolve-MakeTool +if ($finalMake) { + Push-Location $RepoRoot + try { & $finalMake clean | Out-Null } finally { Pop-Location } +} + +if ($Artifacts.Count -eq 0) { + Write-Host '' + Write-Host 'No Windows ARM64 artifacts were built.' + Write-Host 'Install aarch64-w64-mingw32-gcc, or run -Msvc from an ARM64 Visual Studio developer shell.' + exit 1 +} + +Write-Host '' +Write-Host "Artifacts written to ${DistDir}:" +foreach ($a in $Artifacts) { Write-Host " $a" } diff --git a/scripts/build-windows.ps1 b/scripts/build-windows.ps1 new file mode 100644 index 0000000..99404c5 --- /dev/null +++ b/scripts/build-windows.ps1 @@ -0,0 +1,220 @@ +#Requires -Version 5.1 +[CmdletBinding()] +param( + [switch]$All, + [switch]$Mingw, + [switch]$Msvc, + [switch]$Help +) + +$ErrorActionPreference = 'Stop' + +$ScriptDir = $PSScriptRoot +$RepoRoot = (Resolve-Path (Join-Path $ScriptDir '..')).Path +$DistDir = Join-Path $RepoRoot 'dist' +$BuildDir = Join-Path $RepoRoot 'build\windows' +$BuildDate = (Get-Date).ToUniversalTime().ToString('yyyyMMddTHHmmssZ') +$Artifacts = New-Object System.Collections.Generic.List[string] + +function Show-Usage { +@' +Usage: scripts\build-windows.ps1 [-All] [-Mingw] [-Msvc] + +Build Windows release archives: + -All build every supported Windows target available on this host (default) + -Mingw build MinGW-w64 static library packages + -Msvc build an MSVC static library package from a Visual Studio shell + +MinGW targets require cross compilers such as x86_64-w64-mingw32-gcc. +The MSVC target requires cl.exe and lib.exe in PATH, usually by running from +"x64 Native Tools Command Prompt for VS" or an equivalent Developer PowerShell. +'@ +} + +if ($Help) { Show-Usage; return } + +# default: build everything when no flag is given +if (-not $Mingw -and -not $Msvc -and -not $All) { $Mingw = $true; $Msvc = $true } +if ($All) { $Mingw = $true; $Msvc = $true } + +function Test-Tool { + param([string]$Name) + [bool](Get-Command $Name -ErrorAction SilentlyContinue) +} + +function Resolve-MakeTool { + if (Test-Tool 'make') { return 'make' } + if (Test-Tool 'mingw32-make') { return 'mingw32-make' } + return $null +} + +function Write-AsciiFile { + param([string]$Path, [string]$Content) + $enc = New-Object System.Text.UTF8Encoding($false) + [System.IO.File]::WriteAllText($Path, $Content, $enc) +} + +function Get-Sha256Line { + param([string]$Path) + $hash = (Get-FileHash -Algorithm SHA256 -Path $Path).Hash.ToLower() + $name = Split-Path -Leaf $Path + return "$hash $name`n" +} + +function New-PackageDir { + param( + [string]$PackageDir, + [string]$TargetArch, + [string]$Compiler, + [string]$LibraryPath, + [string]$LibraryName + ) + $dest = Join-Path $DistDir $PackageDir + if (Test-Path $dest) { Remove-Item -Recurse -Force $dest } + New-Item -ItemType Directory -Path (Join-Path $dest 'include') -Force | Out-Null + Copy-Item $LibraryPath (Join-Path $dest $LibraryName) + Copy-Item (Join-Path $RepoRoot 'include\funnelcake.h') (Join-Path $dest 'include') + Copy-Item (Join-Path $RepoRoot 'README.md') $dest + Copy-Item (Join-Path $RepoRoot 'INSTALL.md') $dest + + $info = @( + 'name=funnelcake' + 'target_os=windows' + "base_distribution=$Compiler" + "target_arch=$TargetArch" + "compiler=$Compiler" + 'lto=0' + "build_date=$BuildDate" + ) -join "`n" + Write-AsciiFile -Path (Join-Path $dest 'BUILD_INFO') -Content ($info + "`n") +} + +function New-Zip { + param([string]$PackageDir) + $zip = Join-Path $DistDir "$PackageDir.zip" + $sha = "$zip.sha256" + $source = Join-Path $DistDir $PackageDir + if (Test-Path $zip) { Remove-Item -Force $zip } + if (Test-Path $sha) { Remove-Item -Force $sha } + Compress-Archive -Path $source -DestinationPath $zip -Force + Write-AsciiFile -Path $sha -Content (Get-Sha256Line $zip) + $Artifacts.Add("$PackageDir.zip") | Out-Null + $Artifacts.Add("$PackageDir.zip.sha256") | Out-Null +} + +function Invoke-MingwBuild { + param([string]$Arch, [string]$CC, [string]$ARTool) + $package = "funnelcake-windows-mingw-$Arch" + $makeTool = Resolve-MakeTool + + if (-not (Test-Tool $CC) -or -not (Test-Tool $ARTool) -or -not $makeTool) { + Write-Host "==> Skipping MinGW ${Arch}: $CC, $ARTool, and make are required" + return + } + + Write-Host "==> Building Windows MinGW $Arch artifact" + Push-Location $RepoRoot + try { + & $makeTool clean + if ($LASTEXITCODE -ne 0) { throw "$makeTool clean failed" } + & $makeTool lib "CC=$CC" "AR=$ARTool" 'LTO=0' 'UNAME_S=Windows_NT' "UNAME_M=$Arch" + if ($LASTEXITCODE -ne 0) { throw "$makeTool lib failed" } + } + finally { Pop-Location } + + New-PackageDir -PackageDir $package -TargetArch $Arch -Compiler 'mingw-w64' ` + -LibraryPath (Join-Path $RepoRoot 'libfunnelcake.a') -LibraryName 'libfunnelcake.a' + New-Zip -PackageDir $package +} + +function Get-MsvcArchName { + $val = $env:VSCMD_ARG_TGT_ARCH + if (-not $val) { $val = $env:PROCESSOR_ARCHITECTURE } + switch -Regex ($val) { + '^(x64|AMD64|amd64)$' { return 'x64' } + '^(arm64|ARM64)$' { return 'arm64' } + default { return 'unknown' } + } +} + +function Invoke-MsvcBuild { + $commonSources = @( + 'src\funnelcake.c', + 'src\funnelcake_hdr.c', + 'src\log.c', + 'src\detect.c', + 'src\kernels_scalar.c', + 'src\kernels_hdr_scalar.c', + 'src\tonemap.c', + 'src\kernels_upscale_scalar.c' + ) + + if (-not (Test-Tool 'cl') -or -not (Test-Tool 'lib')) { + Write-Host '==> Skipping MSVC: cl.exe and lib.exe are required' + return + } + + $arch = Get-MsvcArchName + if ($arch -eq 'unknown') { + Write-Host '==> Skipping MSVC: unable to determine Visual Studio target architecture' + return + } + + $package = "funnelcake-windows-msvc-$arch" + $objDir = Join-Path $BuildDir "msvc-$arch" + $libPath = Join-Path $objDir 'funnelcake.lib' + $includeDir = Join-Path $RepoRoot 'include' + $srcDir = Join-Path $RepoRoot 'src' + + Write-Host "==> Building Windows MSVC $arch artifact" + if (Test-Path $objDir) { Remove-Item -Recurse -Force $objDir } + New-Item -ItemType Directory -Path $objDir -Force | Out-Null + + $objs = @() + foreach ($rel in $commonSources) { + $srcPath = Join-Path $RepoRoot $rel + $objPath = Join-Path $objDir ([IO.Path]::GetFileNameWithoutExtension($rel) + '.obj') + & cl /nologo /std:c11 /O2 /W4 /WX ` + /D_CRT_SECURE_NO_WARNINGS /D_POSIX_C_SOURCE=200112L ` + "/I$includeDir" "/I$srcDir" ` + "/Fo$objPath" /c $srcPath + if ($LASTEXITCODE -ne 0) { throw "cl failed for $rel" } + $objs += $objPath + } + + & lib /nologo "/OUT:$libPath" $objs + if ($LASTEXITCODE -ne 0) { throw 'lib failed' } + + New-PackageDir -PackageDir $package -TargetArch $arch -Compiler 'msvc' ` + -LibraryPath $libPath -LibraryName 'funnelcake.lib' + New-Zip -PackageDir $package +} + +New-Item -ItemType Directory -Path $DistDir -Force | Out-Null +New-Item -ItemType Directory -Path $BuildDir -Force | Out-Null + +if ($Mingw) { + Invoke-MingwBuild -Arch 'x86_64' -CC 'x86_64-w64-mingw32-gcc' -ARTool 'x86_64-w64-mingw32-ar' + Invoke-MingwBuild -Arch 'aarch64' -CC 'aarch64-w64-mingw32-gcc' -ARTool 'aarch64-w64-mingw32-ar' +} + +if ($Msvc) { + Invoke-MsvcBuild +} + +$finalMake = Resolve-MakeTool +if ($finalMake) { + Push-Location $RepoRoot + try { & $finalMake clean | Out-Null } finally { Pop-Location } +} + +if ($Artifacts.Count -eq 0) { + Write-Host '' + Write-Host 'No Windows artifacts were built.' + Write-Host 'Install MinGW-w64, or run -Msvc from a Visual Studio developer shell.' + exit 1 +} + +Write-Host '' +Write-Host "Artifacts written to ${DistDir}:" +foreach ($a in $Artifacts) { Write-Host " $a" } diff --git a/scripts/build-windows.sh b/scripts/build-windows.sh new file mode 100755 index 0000000..7605b8d --- /dev/null +++ b/scripts/build-windows.sh @@ -0,0 +1,245 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" +DIST_DIR="${REPO_ROOT}/dist" +BUILD_DIR="${REPO_ROOT}/build/windows" +BUILD_DATE="$(date -u +%Y%m%dT%H%M%SZ)" +ARTIFACTS=() + +build_mingw=1 +build_msvc=1 + +usage() { + cat <<'EOF' +Usage: scripts/build-windows.sh [--all] [--mingw] [--msvc] + +Build Windows release archives: + --all build every supported Windows target available on this host (default) + --mingw build MinGW-w64 static library packages + --msvc build an MSVC static library package from a Visual Studio shell + +MinGW targets require cross compilers such as x86_64-w64-mingw32-gcc. +The MSVC target requires cl.exe and lib.exe in PATH, usually by running from +"x64 Native Tools Command Prompt for VS" or an equivalent Developer PowerShell. +EOF +} + +if [ "$#" -gt 0 ]; then + build_mingw=0 + build_msvc=0 +fi + +while [ "$#" -gt 0 ]; do + case "$1" in + --all) + build_mingw=1 + build_msvc=1 + ;; + --mingw) + build_mingw=1 + ;; + --msvc) + build_msvc=1 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "error: unknown option: $1" >&2 + usage >&2 + exit 1 + ;; + esac + shift +done + +have_tool() { + command -v "$1" >/dev/null 2>&1 +} + +hash_file() { + local file="$1" + + if have_tool shasum; then + shasum -a 256 "${file}" + elif have_tool sha256sum; then + sha256sum "${file}" + else + echo "error: shasum or sha256sum is required to write checksums" >&2 + exit 1 + fi +} + +native_path() { + if have_tool cygpath; then + cygpath -w "$1" + else + printf '%s\n' "$1" + fi +} + +make_package_dir() { + local package_dir="$1" + local target_arch="$2" + local compiler="$3" + local library_path="$4" + local library_name="$5" + + rm -rf "${DIST_DIR}/${package_dir}" + mkdir -p "${DIST_DIR}/${package_dir}/include" + cp "${library_path}" "${DIST_DIR}/${package_dir}/${library_name}" + cp "${REPO_ROOT}/include/funnelcake.h" "${DIST_DIR}/${package_dir}/include/" + cp "${REPO_ROOT}/README.md" "${REPO_ROOT}/INSTALL.md" "${DIST_DIR}/${package_dir}/" + + { + printf '%s\n' "name=funnelcake" + printf '%s\n' "target_os=windows" + printf '%s\n' "base_distribution=${compiler}" + printf '%s\n' "target_arch=${target_arch}" + printf '%s\n' "compiler=${compiler}" + printf '%s\n' "lto=0" + printf '%s\n' "build_date=${BUILD_DATE}" + } > "${DIST_DIR}/${package_dir}/BUILD_INFO" +} + +make_zip() { + local package_dir="$1" + + if ! have_tool zip; then + echo "error: zip is required for Windows release archives" >&2 + exit 1 + fi + + rm -f "${DIST_DIR}/${package_dir}.zip" "${DIST_DIR}/${package_dir}.zip.sha256" + ( + cd "${DIST_DIR}" + zip -qr "${package_dir}.zip" "${package_dir}" + hash_file "${package_dir}.zip" > "${package_dir}.zip.sha256" + ) + ARTIFACTS+=("${package_dir}.zip" "${package_dir}.zip.sha256") +} + +build_windows_mingw() { + local arch="$1" + local cc="$2" + local ar="$3" + local package_dir="funnelcake-windows-mingw-${arch}" + + if ! have_tool "${cc}" || ! have_tool "${ar}" || ! have_tool make; then + echo "==> Skipping MinGW ${arch}: ${cc}, ${ar}, and make are required" + return 0 + fi + + echo "==> Building Windows MinGW ${arch} artifact" + ( + cd "${REPO_ROOT}" + make clean + make lib CC="${cc}" AR="${ar}" LTO=0 UNAME_S=Windows_NT UNAME_M="${arch}" + ) + + make_package_dir "${package_dir}" "${arch}" "mingw-w64" \ + "${REPO_ROOT}/libfunnelcake.a" "libfunnelcake.a" + make_zip "${package_dir}" +} + +msvc_arch_name() { + case "${VSCMD_ARG_TGT_ARCH:-${PROCESSOR_ARCHITECTURE:-unknown}}" in + x64|AMD64|amd64) printf '%s\n' "x64" ;; + arm64|ARM64) printf '%s\n' "arm64" ;; + *) printf '%s\n' "unknown" ;; + esac +} + +build_windows_msvc() { + local arch + local obj_dir + local lib_path + local package_dir + local src + local obj + local objs=() + local include_dir + local src_dir + local src_path + local obj_path + local lib_path_native + local common_sources=( + src/funnelcake.c + src/funnelcake_hdr.c + src/log.c + src/detect.c + src/kernels_scalar.c + src/kernels_hdr_scalar.c + src/tonemap.c + src/kernels_upscale_scalar.c + ) + + if ! have_tool cl || ! have_tool lib; then + echo "==> Skipping MSVC: cl.exe and lib.exe are required" + return 0 + fi + + arch="$(msvc_arch_name)" + if [ "${arch}" = "unknown" ]; then + echo "==> Skipping MSVC: unable to determine Visual Studio target architecture" + return 0 + fi + + package_dir="funnelcake-windows-msvc-${arch}" + obj_dir="${BUILD_DIR}/msvc-${arch}" + lib_path="${obj_dir}/funnelcake.lib" + include_dir="$(native_path "${REPO_ROOT}/include")" + src_dir="$(native_path "${REPO_ROOT}/src")" + lib_path_native="$(native_path "${lib_path}")" + + echo "==> Building Windows MSVC ${arch} artifact" + rm -rf "${obj_dir}" + mkdir -p "${obj_dir}" + + for src in "${common_sources[@]}"; do + obj="${obj_dir}/$(basename "${src}" .c).obj" + src_path="$(native_path "${REPO_ROOT}/${src}")" + obj_path="$(native_path "${obj}")" + MSYS2_ARG_CONV_EXCL='*' cl /nologo /std:c11 /O2 /W4 /WX \ + /D_CRT_SECURE_NO_WARNINGS /D_POSIX_C_SOURCE=200112L \ + /I"${include_dir}" /I"${src_dir}" \ + /Fo"${obj_path}" /c "${src_path}" + objs+=("$(native_path "${obj}")") + done + + MSYS2_ARG_CONV_EXCL='*' lib /nologo /OUT:"${lib_path_native}" "${objs[@]}" + make_package_dir "${package_dir}" "${arch}" "msvc" "${lib_path}" "funnelcake.lib" + make_zip "${package_dir}" +} + +mkdir -p "${DIST_DIR}" "${BUILD_DIR}" + +if [ "${build_mingw}" -eq 1 ]; then + build_windows_mingw "x86_64" "x86_64-w64-mingw32-gcc" "x86_64-w64-mingw32-ar" + build_windows_mingw "aarch64" "aarch64-w64-mingw32-gcc" "aarch64-w64-mingw32-ar" +fi + +if [ "${build_msvc}" -eq 1 ]; then + build_windows_msvc +fi + +( + cd "${REPO_ROOT}" + make clean +) + +if [ "${#ARTIFACTS[@]}" -eq 0 ]; then + echo "" + echo "No Windows artifacts were built." + echo "Install MinGW-w64, or run --msvc from a Visual Studio developer shell." + exit 1 +fi + +echo "" +echo "Artifacts written to ${DIST_DIR}:" +for artifact in "${ARTIFACTS[@]}"; do + echo " ${artifact}" +done diff --git a/src/detect.c b/src/detect.c index ef58e2c..07cb042 100644 --- a/src/detect.c +++ b/src/detect.c @@ -90,13 +90,14 @@ static void detect_x86(void) #endif /* __x86_64__ */ -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) -#if defined(__APPLE__) +#if defined(__APPLE__) || defined(_WIN32) /* - * On aarch64 macOS (Apple Silicon), NEON is architecturally mandatory and - * always available. No runtime detection needed. + * On aarch64 Apple Silicon and Windows on ARM, NEON (Advanced SIMD) is + * architecturally mandatory and always available. No runtime detection + * needed - just assert it. */ static void detect_aarch64(void) { @@ -123,7 +124,7 @@ static void detect_aarch64(void) } } -#else /* aarch64 Linux (and other non-Apple aarch64) */ +#else /* aarch64 Linux (and other non-Apple, non-Windows aarch64) */ #include @@ -154,9 +155,9 @@ static void detect_aarch64(void) fclose(f); } -#endif /* __APPLE__ */ +#endif /* __APPLE__ || _WIN32 */ -#endif /* __aarch64__ */ +#endif /* __aarch64__ || _M_ARM64 */ #if defined(__riscv) && (__riscv_xlen == 64) @@ -306,7 +307,7 @@ const fused_cpu_caps_t *fused_detect_cpu(void) if (force_scalar == NULL || force_scalar[0] == '\0') { #if defined(__x86_64__) detect_x86(); -#elif defined(__aarch64__) +#elif defined(__aarch64__) || defined(_M_ARM64) detect_aarch64(); #elif defined(__riscv) && (__riscv_xlen == 64) detect_riscv(); diff --git a/src/funnelcake.c b/src/funnelcake.c index 55e4203..8f493dd 100644 --- a/src/funnelcake.c +++ b/src/funnelcake.c @@ -232,7 +232,7 @@ int fused_scaler_init(fused_scaler_ctx_t *ctx) fused_kernel_fn simd_thirds_up_fn = NULL; fused_kernel_fn simd_pow2_up_fn = NULL; -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) if (caps->has_neon) { has_simd = 1; simd_thirds_fn = fused_kernel_thirds_neon; @@ -264,16 +264,18 @@ int fused_scaler_init(fused_scaler_ctx_t *ctx) #endif if (!has_simd) { - /* One-time stderr notice. Suppressed when scalar was explicitly - * requested via FUNNELCAKE_FORCE_SCALAR (the parity test toggles - * this on and off; printing the warning on every flip would flood - * the test output and confuse readers into thinking SIMD is broken). */ + /* One-time notice routed through the configured warning logger so + * callers using FUSED_LOG_SUPPRESS / FUSED_LOG_CALLBACK can control + * it. Suppressed when scalar was explicitly requested via + * FUNNELCAKE_FORCE_SCALAR (the parity test toggles this on and off; + * printing the warning on every flip would flood the test output and + * confuse readers into thinking SIMD is broken). */ static int g_no_simd_warned = 0; const char *force_scalar_env = getenv("FUNNELCAKE_FORCE_SCALAR"); int forced_scalar = (force_scalar_env != NULL && force_scalar_env[0] != '\0'); if (!g_no_simd_warned && !forced_scalar) { g_no_simd_warned = 1; - fprintf(stderr, + fused_log(&ctx->log_warnings, FUSED_LOG_WARN, "funnelcake: no SIMD support detected; using scalar kernel\n"); } warn_bits |= FUSED_WARN_BIT_SCALAR; @@ -356,11 +358,11 @@ int fused_scaler_init(fused_scaler_ctx_t *ctx) int chroma_h = out_h / 2; void *py = NULL, *pu = NULL, *pv = NULL; - if (fused_alloc_aligned(&py, 32, (size_t)y_stride * (size_t)out_h) != 0 || - fused_alloc_aligned(&pu, 32, (size_t)uv_stride * (size_t)chroma_h) != 0 || - fused_alloc_aligned(&pv, 32, (size_t)uv_stride * (size_t)chroma_h) != 0) { + if (fused_aligned_alloc(&py, 32, (size_t)y_stride * (size_t)out_h) != 0 || + fused_aligned_alloc(&pu, 32, (size_t)uv_stride * (size_t)chroma_h) != 0 || + fused_aligned_alloc(&pv, 32, (size_t)uv_stride * (size_t)chroma_h) != 0) { /* Allocation failure - free what we got and reject this step */ - free(py); free(pu); free(pv); + fused_aligned_free(py); fused_aligned_free(pu); fused_aligned_free(pv); fused_log(&ctx->log_warnings, FUSED_LOG_WARN, "funnelcake: %s rejected: out-of-memory allocating output planes\n", sd->name); @@ -421,10 +423,10 @@ int fused_scaler_init(fused_scaler_ctx_t *ctx) int chroma_h = up_h / 2; void *py = NULL, *pu = NULL, *pv = NULL; - if (fused_alloc_aligned(&py, 32, (size_t)y_stride * (size_t)up_h) != 0 || - fused_alloc_aligned(&pu, 32, (size_t)uv_stride * (size_t)chroma_h) != 0 || - fused_alloc_aligned(&pv, 32, (size_t)uv_stride * (size_t)chroma_h) != 0) { - free(py); free(pu); free(pv); + if (fused_aligned_alloc(&py, 32, (size_t)y_stride * (size_t)up_h) != 0 || + fused_aligned_alloc(&pu, 32, (size_t)uv_stride * (size_t)chroma_h) != 0 || + fused_aligned_alloc(&pv, 32, (size_t)uv_stride * (size_t)chroma_h) != 0) { + fused_aligned_free(py); fused_aligned_free(pu); fused_aligned_free(pv); fused_log(&ctx->log_warnings, FUSED_LOG_WARN, "funnelcake: upscale level %dx rejected: out-of-memory\n", (1 << (k + 1))); @@ -493,10 +495,10 @@ int fused_scaler_init(fused_scaler_ctx_t *ctx) int chroma_h = tail_h / 2; void *py = NULL, *pu = NULL, *pv = NULL; - if (fused_alloc_aligned(&py, 32, (size_t)y_stride * (size_t)tail_h) != 0 || - fused_alloc_aligned(&pu, 32, (size_t)uv_stride * (size_t)chroma_h) != 0 || - fused_alloc_aligned(&pv, 32, (size_t)uv_stride * (size_t)chroma_h) != 0) { - free(py); free(pu); free(pv); + if (fused_aligned_alloc(&py, 32, (size_t)y_stride * (size_t)tail_h) != 0 || + fused_aligned_alloc(&pu, 32, (size_t)uv_stride * (size_t)chroma_h) != 0 || + fused_aligned_alloc(&pv, 32, (size_t)uv_stride * (size_t)chroma_h) != 0) { + fused_aligned_free(py); fused_aligned_free(pu); fused_aligned_free(pv); fused_log(&ctx->log_warnings, FUSED_LOG_WARN, "funnelcake: upscale 1.5x tail rejected: out-of-memory\n"); warn_bits |= FUSED_WARN_BIT_PARTIAL; @@ -548,7 +550,7 @@ int fused_scaler_init(fused_scaler_ctx_t *ctx) fused_scaler_free(ctx); fused_log(&ctx->log_errors, FUSED_LOG_ERROR, "funnelcake: out-of-memory allocating internal state\n"); - return FUSED_ERR_NO_STEPS; + return FUSED_ERR_OUT_OF_MEMORY; } fused_kernel_params_t *p = &state->params; @@ -626,7 +628,7 @@ int fused_scaler_init(fused_scaler_ctx_t *ctx) if (max_scratch_w > 0) { size_t bytes = (size_t)((max_scratch_w + 63) & ~63); void *sp = NULL; - if (posix_memalign(&sp, 64, bytes) == 0) { + if (fused_aligned_alloc(&sp, 64, bytes) == 0) { p->upscale_scratch = (uint8_t *)sp; } else { fused_log(&ctx->log_warnings, FUSED_LOG_WARN, @@ -670,13 +672,13 @@ int fused_scaler_init(fused_scaler_ctx_t *ctx) if (pool_bytes > 0) { void *sp = NULL; size_t aligned_bytes = (pool_bytes + 63) & ~(size_t)63; - if (posix_memalign(&sp, 64, aligned_bytes) == 0) { + if (fused_aligned_alloc(&sp, 64, aligned_bytes) == 0) { p->scratch_pool = (uint8_t *)sp; p->scratch_pool_size = aligned_bytes; } else { fused_log(&ctx->log_warnings, FUSED_LOG_WARN, "funnelcake: failed to allocate downscale scratch pool " - "(%zu bytes)\n", aligned_bytes); + "(%llu bytes)\n", (unsigned long long)aligned_bytes); } } } @@ -727,15 +729,14 @@ void fused_scaler_run(fused_scaler_ctx_t *ctx, /* Check source plane alignment */ if (((uintptr_t)src_y & 31) || ((uintptr_t)src_u & 31) || ((uintptr_t)src_v & 31)) { - /* Warn once about misaligned source planes */ - static int warned = 0; - if (!warned) { + /* Warn once per context about misaligned source planes */ + if (!state->src_misaligned_warned) { fused_log(&ctx->log_errors, FUSED_LOG_ERROR, "funnelcake: source planes are not 32-byte aligned " "(Y=%p U=%p V=%p). Falling back to scalar kernel. " - "Performance will be significantly reduced.", + "Performance will be significantly reduced.\n", (const void*)src_y, (const void*)src_u, (const void*)src_v); - warned = 1; + state->src_misaligned_warned = 1; } /* Fall back to scalar - pick the variant matching the configured * (want_down, want_up) combination. */ @@ -768,22 +769,22 @@ void fused_scaler_free(fused_scaler_ctx_t *ctx) { if (!ctx) return; for (int i = 0; i < 8; i++) { - free(ctx->outputs[i].plane_y); - free(ctx->outputs[i].plane_u); - free(ctx->outputs[i].plane_v); + fused_aligned_free(ctx->outputs[i].plane_y); + fused_aligned_free(ctx->outputs[i].plane_u); + fused_aligned_free(ctx->outputs[i].plane_v); memset(&ctx->outputs[i], 0, sizeof(fused_scale_output_t)); } for (int i = 0; i < FUSED_MAX_UPSCALE_STEPS; i++) { - free(ctx->upscale_outputs[i].plane_y); - free(ctx->upscale_outputs[i].plane_u); - free(ctx->upscale_outputs[i].plane_v); + fused_aligned_free(ctx->upscale_outputs[i].plane_y); + fused_aligned_free(ctx->upscale_outputs[i].plane_u); + fused_aligned_free(ctx->upscale_outputs[i].plane_v); memset(&ctx->upscale_outputs[i], 0, sizeof(fused_scale_output_t)); } if (ctx->_internal) { fused_internal_t *state = (fused_internal_t *)ctx->_internal; - free(state->params.upscale_scratch); + fused_aligned_free(state->params.upscale_scratch); state->params.upscale_scratch = NULL; - free(state->params.scratch_pool); + fused_aligned_free(state->params.scratch_pool); state->params.scratch_pool = NULL; state->params.scratch_pool_size = 0; } @@ -793,4 +794,6 @@ void fused_scaler_free(fused_scaler_ctx_t *ctx) ctx->rejected_flags = 0; ctx->achieved_upscale_flags = 0; ctx->achieved_upscale_tail = 0; + ctx->effective_width = 0; + ctx->effective_height = 0; } diff --git a/src/funnelcake_hdr.c b/src/funnelcake_hdr.c index 7ae97d8..93ffc4a 100644 --- a/src/funnelcake_hdr.c +++ b/src/funnelcake_hdr.c @@ -278,7 +278,7 @@ int fused_hdr_init(fused_hdr_ctx_t *ctx) fused_hdr_kernel_fn simd_thirds_up_fn = NULL; fused_hdr_kernel_fn simd_pow2_up_fn = NULL; -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) if (caps->has_neon) { has_simd = 1; simd_thirds_fn = fused_kernel_thirds_hdr_neon; @@ -310,14 +310,16 @@ int fused_hdr_init(fused_hdr_ctx_t *ctx) #endif if (!has_simd) { - /* See matching block in funnelcake.c for why FUNNELCAKE_FORCE_SCALAR - * suppresses this warning. */ + /* One-time notice routed through the configured warning logger so + * callers using FUSED_LOG_SUPPRESS / FUSED_LOG_CALLBACK can control + * it. See matching block in funnelcake.c for why + * FUNNELCAKE_FORCE_SCALAR suppresses this warning. */ static int g_no_simd_warned = 0; const char *force_scalar_env = getenv("FUNNELCAKE_FORCE_SCALAR"); int forced_scalar = (force_scalar_env != NULL && force_scalar_env[0] != '\0'); if (!g_no_simd_warned && !forced_scalar) { g_no_simd_warned = 1; - fprintf(stderr, + fused_log(&ctx->log_warnings, FUSED_LOG_WARN, "funnelcake-hdr: no SIMD support detected; using scalar kernel\n"); } warn_bits |= FUSED_WARN_BIT_SCALAR; @@ -335,13 +337,17 @@ int fused_hdr_init(fused_hdr_ctx_t *ctx) memset(ctx->hdr_outputs, 0, sizeof(ctx->hdr_outputs)); memset(ctx->sdr_outputs, 0, sizeof(ctx->sdr_outputs)); - /* Allocate internal state early so we can store sdr_temp pointers */ + /* Allocate internal state early so we can store sdr_temp pointers. + * Attach to ctx->_internal immediately so any subsequent error path + * can rely on fused_hdr_free for cleanup and avoid leaking sdr_temp[] + * buffers populated during the per-step loop below. */ fused_hdr_internal_t *state = calloc(1, sizeof(fused_hdr_internal_t)); if (!state) { fused_log(&ctx->log_errors, FUSED_LOG_ERROR, "funnelcake-hdr: out-of-memory allocating internal state\n"); - return FUSED_ERR_NO_STEPS; + return FUSED_ERR_OUT_OF_MEMORY; } + ctx->_internal = state; for (int i = 0; i < 8; i++) { const step_desc_t *sd = &k_steps[i]; @@ -411,10 +417,10 @@ int fused_hdr_init(fused_hdr_ctx_t *ctx) int uv_stride = stride_for_hdr(chroma_w); void *py = NULL, *pu = NULL, *pv = NULL; - if (fused_alloc_aligned(&py, 32, (size_t)y_stride * (size_t)out_h) != 0 || - fused_alloc_aligned(&pu, 32, (size_t)uv_stride * (size_t)chroma_h) != 0 || - fused_alloc_aligned(&pv, 32, (size_t)uv_stride * (size_t)chroma_h) != 0) { - free(py); free(pu); free(pv); + if (fused_aligned_alloc(&py, 32, (size_t)y_stride * (size_t)out_h) != 0 || + fused_aligned_alloc(&pu, 32, (size_t)uv_stride * (size_t)chroma_h) != 0 || + fused_aligned_alloc(&pv, 32, (size_t)uv_stride * (size_t)chroma_h) != 0) { + fused_aligned_free(py); fused_aligned_free(pu); fused_aligned_free(pv); fused_log(&ctx->log_warnings, FUSED_LOG_WARN, "funnelcake-hdr: %s rejected: out-of-memory allocating HDR output planes\n", sd->name); @@ -440,18 +446,18 @@ int fused_hdr_init(fused_hdr_ctx_t *ctx) int uv_stride = stride_for(chroma_w); void *py = NULL, *pu = NULL, *pv = NULL; - if (fused_alloc_aligned(&py, 32, (size_t)y_stride * (size_t)out_h) != 0 || - fused_alloc_aligned(&pu, 32, (size_t)uv_stride * (size_t)chroma_h) != 0 || - fused_alloc_aligned(&pv, 32, (size_t)uv_stride * (size_t)chroma_h) != 0) { - free(py); free(pu); free(pv); + if (fused_aligned_alloc(&py, 32, (size_t)y_stride * (size_t)out_h) != 0 || + fused_aligned_alloc(&pu, 32, (size_t)uv_stride * (size_t)chroma_h) != 0 || + fused_aligned_alloc(&pv, 32, (size_t)uv_stride * (size_t)chroma_h) != 0) { + fused_aligned_free(py); fused_aligned_free(pu); fused_aligned_free(pv); fused_log(&ctx->log_warnings, FUSED_LOG_WARN, "funnelcake-hdr: %s rejected: out-of-memory allocating SDR output planes\n", sd->name); /* If we already allocated HDR planes for this step, free them */ if (step_wants_hdr && (achieved_hdr & sd->flag)) { - free(ctx->hdr_outputs[i].plane_y); - free(ctx->hdr_outputs[i].plane_u); - free(ctx->hdr_outputs[i].plane_v); + fused_aligned_free(ctx->hdr_outputs[i].plane_y); + fused_aligned_free(ctx->hdr_outputs[i].plane_u); + fused_aligned_free(ctx->hdr_outputs[i].plane_v); memset(&ctx->hdr_outputs[i], 0, sizeof(fused_hdr_output_t)); achieved_hdr &= ~sd->flag; } @@ -477,14 +483,14 @@ int fused_hdr_init(fused_hdr_ctx_t *ctx) int hdr_uv_stride = stride_for_hdr(chroma_w); void *ty = NULL, *tu = NULL, *tv = NULL; - if (fused_alloc_aligned(&ty, 32, (size_t)hdr_y_stride * (size_t)out_h) != 0 || - fused_alloc_aligned(&tu, 32, (size_t)hdr_uv_stride * (size_t)chroma_h) != 0 || - fused_alloc_aligned(&tv, 32, (size_t)hdr_uv_stride * (size_t)chroma_h) != 0) { - free(ty); free(tu); free(tv); + if (fused_aligned_alloc(&ty, 32, (size_t)hdr_y_stride * (size_t)out_h) != 0 || + fused_aligned_alloc(&tu, 32, (size_t)hdr_uv_stride * (size_t)chroma_h) != 0 || + fused_aligned_alloc(&tv, 32, (size_t)hdr_uv_stride * (size_t)chroma_h) != 0) { + fused_aligned_free(ty); fused_aligned_free(tu); fused_aligned_free(tv); /* Roll back SDR allocation for this step */ - free(ctx->sdr_outputs[i].plane_y); - free(ctx->sdr_outputs[i].plane_u); - free(ctx->sdr_outputs[i].plane_v); + fused_aligned_free(ctx->sdr_outputs[i].plane_y); + fused_aligned_free(ctx->sdr_outputs[i].plane_u); + fused_aligned_free(ctx->sdr_outputs[i].plane_v); memset(&ctx->sdr_outputs[i], 0, sizeof(fused_scale_output_t)); achieved_sdr &= ~sd->flag; fused_log(&ctx->log_warnings, FUSED_LOG_WARN, @@ -537,10 +543,10 @@ int fused_hdr_init(fused_hdr_ctx_t *ctx) int chroma_h = up_h / 2; void *py = NULL, *pu = NULL, *pv = NULL; - if (fused_alloc_aligned(&py, 32, (size_t)y_stride * (size_t)up_h) != 0 || - fused_alloc_aligned(&pu, 32, (size_t)uv_stride * (size_t)chroma_h) != 0 || - fused_alloc_aligned(&pv, 32, (size_t)uv_stride * (size_t)chroma_h) != 0) { - free(py); free(pu); free(pv); + if (fused_aligned_alloc(&py, 32, (size_t)y_stride * (size_t)up_h) != 0 || + fused_aligned_alloc(&pu, 32, (size_t)uv_stride * (size_t)chroma_h) != 0 || + fused_aligned_alloc(&pv, 32, (size_t)uv_stride * (size_t)chroma_h) != 0) { + fused_aligned_free(py); fused_aligned_free(pu); fused_aligned_free(pv); fused_log(&ctx->log_warnings, FUSED_LOG_WARN, "funnelcake-hdr: upscale level %dx rejected: out-of-memory\n", (1 << (k + 1))); @@ -604,10 +610,10 @@ int fused_hdr_init(fused_hdr_ctx_t *ctx) int chroma_h = tail_h / 2; void *py = NULL, *pu = NULL, *pv = NULL; - if (fused_alloc_aligned(&py, 32, (size_t)y_stride * (size_t)tail_h) != 0 || - fused_alloc_aligned(&pu, 32, (size_t)uv_stride * (size_t)chroma_h) != 0 || - fused_alloc_aligned(&pv, 32, (size_t)uv_stride * (size_t)chroma_h) != 0) { - free(py); free(pu); free(pv); + if (fused_aligned_alloc(&py, 32, (size_t)y_stride * (size_t)tail_h) != 0 || + fused_aligned_alloc(&pu, 32, (size_t)uv_stride * (size_t)chroma_h) != 0 || + fused_aligned_alloc(&pv, 32, (size_t)uv_stride * (size_t)chroma_h) != 0) { + fused_aligned_free(py); fused_aligned_free(pu); fused_aligned_free(pv); fused_log(&ctx->log_warnings, FUSED_LOG_WARN, "funnelcake-hdr: upscale 1.5x tail rejected: out-of-memory\n"); warn_bits |= FUSED_WARN_BIT_PARTIAL; @@ -646,8 +652,6 @@ int fused_hdr_init(fused_hdr_ctx_t *ctx) if (achieved_any == 0 && !ctx->tonemap_1x && !hdr_want_up) { fused_hdr_free(ctx); - free(state); - ctx->_internal = NULL; fused_log(&ctx->log_errors, FUSED_LOG_ERROR, "funnelcake-hdr: no valid output steps after validation\n"); return FUSED_ERR_NO_STEPS; @@ -665,16 +669,14 @@ int fused_hdr_init(fused_hdr_ctx_t *ctx) int chroma_h = eff_h / 2; void *py = NULL, *pu = NULL, *pv = NULL; - if (fused_alloc_aligned(&py, 32, (size_t)y_stride * (size_t)eff_h) != 0 || - fused_alloc_aligned(&pu, 32, (size_t)uv_stride * (size_t)chroma_h) != 0 || - fused_alloc_aligned(&pv, 32, (size_t)uv_stride * (size_t)chroma_h) != 0) { - free(py); free(pu); free(pv); + if (fused_aligned_alloc(&py, 32, (size_t)y_stride * (size_t)eff_h) != 0 || + fused_aligned_alloc(&pu, 32, (size_t)uv_stride * (size_t)chroma_h) != 0 || + fused_aligned_alloc(&pv, 32, (size_t)uv_stride * (size_t)chroma_h) != 0) { + fused_aligned_free(py); fused_aligned_free(pu); fused_aligned_free(pv); fused_hdr_free(ctx); - free(state); - ctx->_internal = NULL; fused_log(&ctx->log_errors, FUSED_LOG_ERROR, "funnelcake-hdr: out-of-memory allocating 1:1 tonemap output\n"); - return FUSED_ERR_NO_STEPS; + return FUSED_ERR_OUT_OF_MEMORY; } ctx->output_1x.width = eff_w; @@ -743,10 +745,10 @@ int fused_hdr_init(fused_hdr_ctx_t *ctx) int tmp_stride = stride_for_hdr(chroma_w); /* 32-byte aligned */ size_t tmp_bytes = (size_t)tmp_stride * (size_t)chroma_h; - if (posix_memalign((void **)&p->p010_tmp_u, 32, tmp_bytes) != 0 || - posix_memalign((void **)&p->p010_tmp_v, 32, tmp_bytes) != 0) { - free(p->p010_tmp_u); - free(p->p010_tmp_v); + if (fused_aligned_alloc((void **)&p->p010_tmp_u, 32, tmp_bytes) != 0 || + fused_aligned_alloc((void **)&p->p010_tmp_v, 32, tmp_bytes) != 0) { + fused_aligned_free(p->p010_tmp_u); + fused_aligned_free(p->p010_tmp_v); p->p010_tmp_u = NULL; p->p010_tmp_v = NULL; fused_log(&ctx->log_warnings, FUSED_LOG_WARN, @@ -843,7 +845,7 @@ int fused_hdr_init(fused_hdr_ctx_t *ctx) if (max_scratch_w > 0) { size_t bytes = (size_t)((max_scratch_w + 63) & ~63) * sizeof(uint16_t); void *sp = NULL; - if (posix_memalign(&sp, 64, bytes) == 0) { + if (fused_aligned_alloc(&sp, 64, bytes) == 0) { p->upscale_scratch_hdr = (uint16_t *)sp; } else { fused_log(&ctx->log_warnings, FUSED_LOG_WARN, @@ -876,7 +878,7 @@ int fused_hdr_init(fused_hdr_ctx_t *ctx) if (pool_bytes > 0) { void *sp = NULL; size_t aligned_bytes = (pool_bytes + 63) & ~(size_t)63; - if (posix_memalign(&sp, 64, aligned_bytes) == 0) { + if (fused_aligned_alloc(&sp, 64, aligned_bytes) == 0) { p->scratch_pool = (uint8_t *)sp; p->scratch_pool_size = aligned_bytes; } else { @@ -913,8 +915,7 @@ int fused_hdr_init(fused_hdr_ctx_t *ctx) state->sdr_flags = achieved_sdr; state->tonemap_1x = ctx->tonemap_1x; state->is_custom_lut = (ctx->tonemap.curve == FUSED_TONEMAP_CUSTOM) ? 1 : 0; - - ctx->_internal = state; + /* ctx->_internal already set immediately after state allocation. */ return warn_bits; /* 0 == FUSED_OK if nothing was warned */ } @@ -940,14 +941,13 @@ void fused_hdr_run(fused_hdr_ctx_t *ctx, if (((uintptr_t)src_y & 31) || ((uintptr_t)src_u & 31) || (!is_p010 && src_v && ((uintptr_t)src_v & 31))) { - static int warned = 0; - if (!warned) { + if (!state->src_misaligned_warned) { fused_log(&ctx->log_errors, FUSED_LOG_ERROR, "funnelcake-hdr: source planes are not 32-byte aligned " "(Y=%p U=%p V=%p). Falling back to scalar kernel. " - "Performance will be significantly reduced.", + "Performance will be significantly reduced.\n", (const void *)src_y, (const void *)src_u, (const void *)src_v); - warned = 1; + state->src_misaligned_warned = 1; } const fused_hdr_kernel_params_t *p = &state->params; int hdr_want_down = (p->active_outputs != 0); @@ -1047,17 +1047,17 @@ void fused_hdr_free(fused_hdr_ctx_t *ctx) /* Free HDR output planes */ for (int i = 0; i < 8; i++) { - free(ctx->hdr_outputs[i].plane_y); - free(ctx->hdr_outputs[i].plane_u); - free(ctx->hdr_outputs[i].plane_v); + fused_aligned_free(ctx->hdr_outputs[i].plane_y); + fused_aligned_free(ctx->hdr_outputs[i].plane_u); + fused_aligned_free(ctx->hdr_outputs[i].plane_v); memset(&ctx->hdr_outputs[i], 0, sizeof(fused_hdr_output_t)); } /* Free SDR output planes */ for (int i = 0; i < 8; i++) { - free(ctx->sdr_outputs[i].plane_y); - free(ctx->sdr_outputs[i].plane_u); - free(ctx->sdr_outputs[i].plane_v); + fused_aligned_free(ctx->sdr_outputs[i].plane_y); + fused_aligned_free(ctx->sdr_outputs[i].plane_u); + fused_aligned_free(ctx->sdr_outputs[i].plane_v); memset(&ctx->sdr_outputs[i], 0, sizeof(fused_scale_output_t)); } @@ -1065,28 +1065,28 @@ void fused_hdr_free(fused_hdr_ctx_t *ctx) fused_hdr_internal_t *state = (fused_hdr_internal_t *)ctx->_internal; if (state) { for (int i = 0; i < 8; i++) { - free(state->sdr_temp[i].y); - free(state->sdr_temp[i].u); - free(state->sdr_temp[i].v); + fused_aligned_free(state->sdr_temp[i].y); + fused_aligned_free(state->sdr_temp[i].u); + fused_aligned_free(state->sdr_temp[i].v); } - free(state->params.p010_tmp_u); - free(state->params.p010_tmp_v); - free(state->params.upscale_scratch_hdr); - free(state->params.scratch_pool); + fused_aligned_free(state->params.p010_tmp_u); + fused_aligned_free(state->params.p010_tmp_v); + fused_aligned_free(state->params.upscale_scratch_hdr); + fused_aligned_free(state->params.scratch_pool); free(state); } /* Free 1:1 tonemap output planes */ - free(ctx->output_1x.plane_y); - free(ctx->output_1x.plane_u); - free(ctx->output_1x.plane_v); + fused_aligned_free(ctx->output_1x.plane_y); + fused_aligned_free(ctx->output_1x.plane_u); + fused_aligned_free(ctx->output_1x.plane_v); memset(&ctx->output_1x, 0, sizeof(fused_scale_output_t)); /* Free upscale HDR output planes */ for (int i = 0; i < FUSED_MAX_UPSCALE_STEPS; i++) { - free(ctx->upscale_hdr_outputs[i].plane_y); - free(ctx->upscale_hdr_outputs[i].plane_u); - free(ctx->upscale_hdr_outputs[i].plane_v); + fused_aligned_free(ctx->upscale_hdr_outputs[i].plane_y); + fused_aligned_free(ctx->upscale_hdr_outputs[i].plane_u); + fused_aligned_free(ctx->upscale_hdr_outputs[i].plane_v); memset(&ctx->upscale_hdr_outputs[i], 0, sizeof(fused_hdr_output_t)); } @@ -1096,4 +1096,6 @@ void fused_hdr_free(fused_hdr_ctx_t *ctx) ctx->rejected_flags = 0; ctx->achieved_upscale_flags = 0; ctx->achieved_upscale_tail = 0; + ctx->effective_width = 0; + ctx->effective_height = 0; } diff --git a/src/internal.h b/src/internal.h index c247171..9e4417c 100644 --- a/src/internal.h +++ b/src/internal.h @@ -13,10 +13,53 @@ #include #include /* posix_memalign */ +#if defined(_WIN32) +#include +#endif #if defined(__linux__) # include /* madvise, MADV_HUGEPAGE */ #endif +static inline int fused_aligned_alloc(void **ptr, size_t alignment, size_t size) +{ +#if defined(_WIN32) + void *p = _aligned_malloc(size ? size : alignment, alignment); + if (!p) return -1; + *ptr = p; + return 0; +#else + return posix_memalign(ptr, alignment, size); +#endif +} + +static inline void fused_aligned_free(void *ptr) +{ +#if defined(_WIN32) + _aligned_free(ptr); +#else + free(ptr); +#endif +} + +/* -------------------------------------------------------------------------- + * Portability macros + * + * GCC/Clang ship a few extensions the NEON kernels rely on; MSVC (used for + * the Windows ARM64 build) needs equivalents. + * -------------------------------------------------------------------------- */ +#if defined(__GNUC__) || defined(__clang__) +# define FUSED_HOT __attribute__((hot)) +# define FUSED_PREFETCH(p) __builtin_prefetch(p) +#else +# define FUSED_HOT +# if defined(_M_ARM64) || defined(_M_ARM64EC) +# include +# define FUSED_PREFETCH(p) __prefetch((const void *)(p)) +# else +# define FUSED_PREFETCH(p) ((void)0) +# endif +#endif + /* -------------------------------------------------------------------------- * Constants * -------------------------------------------------------------------------- */ @@ -193,6 +236,7 @@ typedef struct { fused_kernel_params_t params; fused_kernel_fn kernel_fn; int has_simd; /* 1 if a SIMD kernel was selected, 0 = scalar only */ + int src_misaligned_warned; /* per-context one-shot flag for run() */ } fused_internal_t; @@ -228,7 +272,7 @@ void fused_kernel_pow2_avx2(const fused_kernel_params_t *p, const uint8_t *src_v); #endif /* __x86_64__ */ -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) /* NEON (aarch64 only) */ void fused_kernel_thirds_neon(const fused_kernel_params_t *p, const uint8_t *src_y, @@ -298,7 +342,7 @@ void fused_kernel_pow2_up_avx2(const fused_kernel_params_t *p, const uint8_t *src_v); #endif /* __x86_64__ */ -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) void fused_kernel_upscale_neon(const fused_kernel_params_t *p, const uint8_t *src_y, const uint8_t *src_u, @@ -470,6 +514,7 @@ typedef struct { uint32_t sdr_flags; int tonemap_1x; int is_custom_lut; /* 1 if using FUSED_TONEMAP_CUSTOM (skip RGB chroma path) */ + int src_misaligned_warned; /* per-context one-shot flag for run() */ } fused_hdr_internal_t; @@ -501,7 +546,7 @@ void fused_kernel_pow2_hdr_avx2(const fused_hdr_kernel_params_t *p, const uint16_t *src_v); #endif /* __x86_64__ */ -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) /* NEON (aarch64 only) */ void fused_kernel_thirds_hdr_neon(const fused_hdr_kernel_params_t *p, const uint16_t *src_y, @@ -564,7 +609,7 @@ void fused_kernel_pow2_up_hdr_avx2(const fused_hdr_kernel_params_t *p, const uint16_t *src_v); #endif /* __x86_64__ */ -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) void fused_kernel_upscale_hdr_neon(const fused_hdr_kernel_params_t *p, const uint16_t *src_y, const uint16_t *src_u, diff --git a/src/kernels_hdr_neon.c b/src/kernels_hdr_neon.c index 3014790..aba8f70 100644 --- a/src/kernels_hdr_neon.c +++ b/src/kernels_hdr_neon.c @@ -32,10 +32,10 @@ * - Chunk sizes: 48 bytes = 24 uint16_t = 8 triplets (thirds family), * 16 bytes = 8 uint16_t elements (pow2 family). * - * Guarded by __aarch64__ so this file is a no-op on other platforms. + * Guarded by __aarch64__/_M_ARM64 so this file is a no-op on other platforms. */ -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) #include "internal.h" #include @@ -239,7 +239,7 @@ static void h_filter_halve_hdr(const uint16_t *restrict src, * uint16_t elements with 8 elements per Q register instead of 16. * ----------------------------------------------------------------------- */ -static void __attribute__((hot)) scale_plane_pow2_hdr_neon( +static void FUSED_HOT scale_plane_pow2_hdr_neon( const uint16_t *restrict src, int src_w, int src_h, int src_stride, uint32_t active_outputs, @@ -302,8 +302,8 @@ static void __attribute__((hot)) scale_plane_pow2_hdr_neon( * per-row stream tracker has to re-lock; the prefetches bridge that. */ if (g + 1 < num_groups) { const uint16_t *nxt = grp_base + (size_t)group_rows * (size_t)src_el_stride; - __builtin_prefetch(nxt); - __builtin_prefetch(nxt + src_el_stride); + FUSED_PREFETCH(nxt); + FUSED_PREFETCH(nxt + src_el_stride); } /* -- Vertical cascade (NEON) --------------------------------- */ @@ -502,7 +502,7 @@ static inline uint16x8x3_t deinterleave_chunk_hdr( } -static void __attribute__((hot)) scale_plane_thirds_hdr_neon( +static void FUSED_HOT scale_plane_thirds_hdr_neon( const uint16_t *restrict src, int src_w, int src_h, int src_stride, uint32_t active_outputs, @@ -572,7 +572,7 @@ static void __attribute__((hot)) scale_plane_thirds_hdr_neon( int tail_cols = src_w - tail_start; /* Deinterleave buffer (stack-allocated, 24 uint16_t = 48 bytes) */ - uint16_t __attribute__((aligned(16))) chunk_buf[24]; + _Alignas(16) uint16_t chunk_buf[24]; /* Output row cursors */ int out_row[4] = { 0, 0, 0, 0 }; @@ -592,12 +592,12 @@ static void __attribute__((hot)) scale_plane_thirds_hdr_neon( * row; this bridges the per-row stream restart at the group boundary. */ if (g6 + 1 < base6_groups) { const uint16_t *nxt = grp + (size_t)6 * (size_t)src_el_stride; - __builtin_prefetch(nxt); - __builtin_prefetch(nxt + src_el_stride); - __builtin_prefetch(nxt + 2 * src_el_stride); - __builtin_prefetch(nxt + 3 * src_el_stride); - __builtin_prefetch(nxt + 4 * src_el_stride); - __builtin_prefetch(nxt + 5 * src_el_stride); + FUSED_PREFETCH(nxt); + FUSED_PREFETCH(nxt + src_el_stride); + FUSED_PREFETCH(nxt + 2 * src_el_stride); + FUSED_PREFETCH(nxt + 3 * src_el_stride); + FUSED_PREFETCH(nxt + 4 * src_el_stride); + FUSED_PREFETCH(nxt + 5 * src_el_stride); } /* Compute output row base pointers (element pointers, not byte) */ @@ -907,7 +907,7 @@ static void __attribute__((hot)) scale_plane_thirds_hdr_neon( * into two 8-element vectors), then process U and V identically to I010. * ----------------------------------------------------------------------- */ -void __attribute__((hot)) fused_kernel_pow2_hdr_neon( +void FUSED_HOT fused_kernel_pow2_hdr_neon( const fused_hdr_kernel_params_t *p, const uint16_t *src_y, const uint16_t *src_u, @@ -1007,7 +1007,7 @@ void __attribute__((hot)) fused_kernel_pow2_hdr_neon( } -void __attribute__((hot)) fused_kernel_thirds_hdr_neon( +void FUSED_HOT fused_kernel_thirds_hdr_neon( const fused_hdr_kernel_params_t *p, const uint16_t *src_y, const uint16_t *src_u, diff --git a/src/kernels_neon.c b/src/kernels_neon.c index fc1b57b..b71863c 100644 --- a/src/kernels_neon.c +++ b/src/kernels_neon.c @@ -37,7 +37,7 @@ * as the AVX2 version. */ -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) #include "internal.h" #include @@ -237,7 +237,7 @@ static void h_filter_halve(const uint8_t *restrict src, * Horizontal: NEON vpaddlq_u8 + vrshrn_n_u16 cascade * ----------------------------------------------------------------------- */ -static void __attribute__((hot)) scale_plane_pow2_neon( +static void FUSED_HOT scale_plane_pow2_neon( const uint8_t *restrict src, int src_w, int src_h, int src_stride, uint32_t active_outputs, @@ -510,7 +510,7 @@ static inline uint8x16_t neon_blend_reg(uint8x16_t a, uint8x16_t b, } -static void __attribute__((hot)) scale_plane_thirds_neon( +static void FUSED_HOT scale_plane_thirds_neon( const uint8_t *restrict src, int src_w, int src_h, int src_stride, uint32_t active_outputs, @@ -580,7 +580,7 @@ static void __attribute__((hot)) scale_plane_thirds_neon( int tail_cols = src_w - tail_start; /* Deinterleave buffer (stack-allocated, 48 bytes aligned) */ - uint8_t __attribute__((aligned(16))) chunk_buf[48]; + _Alignas(16) uint8_t chunk_buf[48]; /* Output row cursors */ int out_row[4] = { 0, 0, 0, 0 }; @@ -909,7 +909,7 @@ static void __attribute__((hot)) scale_plane_thirds_neon( * and half height with the same kernel. * ----------------------------------------------------------------------- */ -void __attribute__((hot)) fused_kernel_pow2_neon(const fused_kernel_params_t *p, +void FUSED_HOT fused_kernel_pow2_neon(const fused_kernel_params_t *p, const uint8_t *src_y, const uint8_t *src_u, const uint8_t *src_v) @@ -963,7 +963,7 @@ void __attribute__((hot)) fused_kernel_pow2_neon(const fused_kernel_params_t *p, } -void __attribute__((hot)) fused_kernel_thirds_neon(const fused_kernel_params_t *p, +void FUSED_HOT fused_kernel_thirds_neon(const fused_kernel_params_t *p, const uint8_t *src_y, const uint8_t *src_u, const uint8_t *src_v) diff --git a/src/kernels_upscale_neon.c b/src/kernels_upscale_neon.c index bf46480..610aed4 100644 --- a/src/kernels_upscale_neon.c +++ b/src/kernels_upscale_neon.c @@ -45,7 +45,7 @@ * */ -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) #include "internal.h" #include "upscale_chunk.h" diff --git a/src/log.h b/src/log.h index d95b3c3..b014fc6 100644 --- a/src/log.h +++ b/src/log.h @@ -26,6 +26,9 @@ * call config->callback(level, buf, ctx) */ void fused_log(const fused_log_config_t *config, int level, const char *fmt, ...) - __attribute__((format(printf, 3, 4))); +#if defined(__GNUC__) || defined(__clang__) + __attribute__((format(printf, 3, 4))) +#endif + ; #endif /* FUNNELCAKE_LOG_H */ diff --git a/src/tonemap.c b/src/tonemap.c index 8a0db13..d83c770 100644 --- a/src/tonemap.c +++ b/src/tonemap.c @@ -18,10 +18,12 @@ #if defined(__x86_64__) #include #endif -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) #include #endif +/* FUSED_HOT and FUSED_PREFETCH are provided by internal.h. */ + /* -------------------------------------------------------------------------- * ST 2084 (PQ) EOTF constants @@ -356,7 +358,10 @@ void fused_tonemap_generate_luts(fused_hdr_internal_t *hdr, hdr->linear_to_sdr[i] = (uint8_t)clamp_i((int)(V * 255.0 + 0.5), 0, 255); } - fused_log(log_warn, FUSED_LOG_WARN, + /* One-time-per-init diagnostic: emitted at INFO level so callback-based + * loggers can filter it out without losing real warnings. Stderr/file + * targets will see it on every init. */ + fused_log(log_warn, FUSED_LOG_INFO, "funnelcake: tone map LUTs generated - transfer=%s curve=%d " "peak=%d target=%d\n", (src_transfer == FUSED_TRC_HLG) ? "HLG" : "PQ", @@ -444,7 +449,7 @@ static void tonemap_luma_avx2(const uint8_t *lut_y, #endif /* __x86_64__ */ -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) /* * tonemap_luma_neon - exact LUT tone mapping, 16 pixels per iteration. @@ -466,7 +471,7 @@ static void tonemap_luma_neon(const uint8_t *lut_y, int x = 0; if (y + 1 < height) - __builtin_prefetch(sy + src_y_pitch, 0, 2); + FUSED_PREFETCH(sy + src_y_pitch); for (; x < simd_w; x += 16) { uint16x8_t v0 = vandq_u16(vld1q_u16(sy + x), mask10); @@ -575,7 +580,7 @@ static inline void tonemap_pixel_rgb( } -__attribute__((hot)) +FUSED_HOT void fused_tonemap_apply( const fused_hdr_internal_t *state, const uint16_t *src_y, int src_y_stride, @@ -608,7 +613,7 @@ void fused_tonemap_apply( tonemap_luma_avx2(lut_y, src_y, src_y_stride, dst_y, dst_y_stride, width, height); } else -#elif defined(__aarch64__) +#elif defined(__aarch64__) || defined(_M_ARM64) if (fused_detect_cpu()->has_neon) { tonemap_luma_neon(lut_y, src_y, src_y_stride, dst_y, dst_y_stride, width, height); @@ -705,7 +710,7 @@ void fused_tonemap_apply( * fused_tonemap_apply_p010 - interleaved P010 chroma * -------------------------------------------------------------------------- */ -__attribute__((hot)) +FUSED_HOT void fused_tonemap_apply_p010( const fused_hdr_internal_t *state, const uint16_t *src_y, int src_y_stride, @@ -732,7 +737,7 @@ void fused_tonemap_apply_p010( tonemap_luma_avx2(lut_y, src_y, src_y_stride, dst_y, dst_y_stride, width, height); } else -#elif defined(__aarch64__) +#elif defined(__aarch64__) || defined(_M_ARM64) if (fused_detect_cpu()->has_neon) { tonemap_luma_neon(lut_y, src_y, src_y_stride, dst_y, dst_y_stride, width, height);