Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 173 additions & 39 deletions app/src/main/cpp/winlator/drawable.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
#include <android/bitmap.h>
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: Thick-line rendering is now angle-dependent because the two branches draw different pixel footprints.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At app/src/main/cpp/winlator/drawable.c, line 270:

<comment>Thick-line rendering is now angle-dependent because the two branches draw different pixel footprints.</comment>

<file context>
@@ -203,19 +246,37 @@ Java_com_winlator_xserver_Drawable_drawLine(JNIEnv *env, jclass obj, jshort x0,
     while (true) {
-        for (int16_t i = 0; i < lineWidth; i++) {
-            memcpy(dataAddr + (x0 + (i + y0) * stride) * 4, row, rowSize);
+        if (isHorizontal) {
+            // Horizontal-ish: write a full row of pixels at once
+            for (int16_t i = 0; i < lineWidth; i++) {
</file context>

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: Writing uint32_t pixels through a casted uint8_t stack buffer can be misaligned and violates aliasing/alignment guarantees; use a properly aligned uint32_t buffer instead.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At app/src/main/cpp/winlator/drawable.c, line 219:

<comment>Writing `uint32_t` pixels through a casted `uint8_t` stack buffer can be misaligned and violates aliasing/alignment guarantees; use a properly aligned `uint32_t` buffer instead.</comment>

<file context>
@@ -168,18 +203,26 @@ Java_com_winlator_xserver_Drawable_fillRect(JNIEnv *env, jclass obj, jshort x, j
 
-    for (int i = 0; i < rowSize; i += 4) memcpy(row + i, rgba, 4);
+    uint32_t color32 = ((uint32_t)rgba[3] << 24) | ((uint32_t)rgba[2] << 16) | ((uint32_t)rgba[1] << 8) | rgba[0];
+    uint32_t *row32 = (uint32_t *)(void *)row;  // (void*) cast silences strict-aliasing warning
+    for (int i = 0; i < width; i++) row32[i] = color32;
     for (int16_t i = 0; i < height; i++) {
</file context>

#include <android/log.h>

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif

#define WHITE 0xffffff
#define BLACK 0x000000
#define printf(...) __android_log_print(ANDROID_LOG_DEBUG, "System.out", __VA_ARGS__);
Expand Down Expand Up @@ -85,8 +89,24 @@ Java_com_winlator_xserver_Drawable_drawBitmap(JNIEnv *env, jclass obj,
}

int stride = getBitmapBytePad(width);
for (int16_t y = 0, x; y < height; y++) {
for (x = 0; x < width; x++) *dstDataAddr++ = getBit(srcDataAddr, x) ? WHITE : BLACK;
int fullBytes = width >> 3; // number of complete 8-pixel bytes per row
int remainder = width & 7; // leftover pixels in the last partial byte */

for (int16_t y = 0; y < height; y++) {
// Unpack all 8 pixels from full byte
for (int b = 0; b < fullBytes; b++) {
uint8_t byte = srcDataAddr[b];
for (int bit = 0; bit < 8; bit++) {
*dstDataAddr++ = (byte >> bit) & 1 ? WHITE : BLACK;
}
}
// Handle remainders and grab only first 4 bits (X11 pads out last 4)
if (remainder) {
uint8_t byte = srcDataAddr[fullBytes];
for (int bit = 0; bit < remainder; bit++) {
*dstDataAddr++ = (byte >> bit) & 1 ? WHITE : BLACK;
}
}
srcDataAddr += stride;
}
}
Expand Down Expand Up @@ -137,18 +157,33 @@ Java_com_winlator_xserver_Drawable_copyAreaOp(JNIEnv *env, jclass obj, jshort sr
return;
}

// Fast path: GCF_COPY is plain pixel blitting — copy only RGB bytes to match
if (gcFunction == GCF_COPY) {
for (int16_t y = 0; y < height; y++) {
// Hoist row base pointers: eliminates a multiply per pixel in the inner loop
uint8_t *s = srcDataAddr + (srcX + (y + srcY) * srcStride) * 4;
uint8_t *d = dstDataAddr + (dstX + (y + dstY) * dstStride) * 4;
for (int16_t x = 0; x < width; x++, s += 4, d += 4) {
d[0] = s[0]; d[1] = s[1]; d[2] = s[2];
/* byte 3 (alpha) intentionally not copied */
}
}
return;
}

for (int16_t y = 0; y < height; y++) {
for (int16_t x = 0; x < width; x++) {
int i = (x + srcX + (y + srcY) * srcStride) * 4;
int j = (x + dstX + (y + dstY) * dstStride) * 4;
int srcColor = (srcDataAddr[i+0] << 16) | (srcDataAddr[i+1] << 8) | srcDataAddr[i+2];
int dstColor = (dstDataAddr[j+0] << 16) | (dstDataAddr[j+1] << 8) | dstDataAddr[j+2];
// Hoist row base pointers: eliminates a multiply per pixel in the inner loop
uint8_t *s = srcDataAddr + (srcX + (y + srcY) * srcStride) * 4;
uint8_t *d = dstDataAddr + (dstX + (y + dstY) * dstStride) * 4;
for (int16_t x = 0; x < width; x++, s += 4, d += 4) {
int srcColor = ((int)s[0] << 16) | ((int)s[1] << 8) | s[2];
int dstColor = ((int)d[0] << 16) | ((int)d[1] << 8) | d[2];

dstColor = setPixelOp(srcColor, dstColor, gcFunction);

dstDataAddr[j+0] = (dstColor >> 16) & 0xff;
dstDataAddr[j+1] = (dstColor >> 8) & 0xff;
dstDataAddr[j+2] = dstColor & 0xff;
d[0] = (dstColor >> 16) & 0xff;
d[1] = (dstColor >> 8) & 0xff;
d[2] = dstColor & 0xff;
}
}
}
Expand All @@ -168,18 +203,26 @@ Java_com_winlator_xserver_Drawable_fillRect(JNIEnv *env, jclass obj, jshort x, j
unpackColor(color, rgba);

int rowSize = width * 4;
uint8_t *row = malloc(rowSize);
if (!row) {
printf("Error: Failed to allocate memory for row\n");
return;
uint8_t stackRow[4096 * 4];
uint8_t *row = stackRow;
bool heapRow = false;
if (width > 4096) {
row = malloc(rowSize);
if (!row) {
printf("Error: Failed to allocate memory for row\n");
return;
}
heapRow = true;
}

for (int i = 0; i < rowSize; i += 4) memcpy(row + i, rgba, 4);
uint32_t color32 = ((uint32_t)rgba[3] << 24) | ((uint32_t)rgba[2] << 16) | ((uint32_t)rgba[1] << 8) | rgba[0];
Comment thread
phobos665 marked this conversation as resolved.
uint32_t *row32 = (uint32_t *)(void *)row; // (void*) cast silences strict-aliasing warning
for (int i = 0; i < width; i++) row32[i] = color32;
Comment thread
phobos665 marked this conversation as resolved.
for (int16_t i = 0; i < height; i++) {
memcpy(dataAddr + (x + (i + y) * stride) * 4, row, rowSize);
}

free(row);
if (heapRow) free(row);
}

JNIEXPORT void JNICALL
Expand All @@ -192,7 +235,7 @@ Java_com_winlator_xserver_Drawable_drawLine(JNIEnv *env, jclass obj, jshort x0,
printf("Error: NULL buffer address in drawLine\n");
return;
}

int dx = abs(x1-x0);
int dy = -abs(y1-y0);
int8_t sx = x0 < x1 ? 1 : -1;
Expand All @@ -203,19 +246,37 @@ Java_com_winlator_xserver_Drawable_drawLine(JNIEnv *env, jclass obj, jshort x0,
unpackColor(color, rgba);

int rowSize = lineWidth * 4;
uint8_t *row = malloc(rowSize);
if (!row) {
printf("Error: Failed to allocate memory for row\n");
return;
uint8_t stackRow[4096 * 4];
uint8_t *row = stackRow;
bool heapRow = false;
if (lineWidth > 4096) {
row = malloc(rowSize);
if (!row) {
printf("Error: Failed to allocate memory for row\n");
return;
}
heapRow = true;
}

for (int i = 0; i < rowSize; i += 4) {
memcpy(row + i, rgba, 4);
}
uint32_t color32 = ((uint32_t)rgba[3] << 24) | ((uint32_t)rgba[2] << 16) | ((uint32_t)rgba[1] << 8) | rgba[0];
uint32_t *row32 = (uint32_t *)(void *)row;
for (int i = 0; i < lineWidth; i++) row32[i] = color32;

/* Determine dominant direction once before the loop — not per-step,
* since x0/y0 change each iteration and would flip the branch mid-line. */
bool isHorizontal = abs(x1 - x0) >= abs(y1 - y0);

while (true) {
for (int16_t i = 0; i < lineWidth; i++) {
memcpy(dataAddr + (x0 + (i + y0) * stride) * 4, row, rowSize);
if (isHorizontal) {
// Horizontal-ish: write a full row of pixels at once
for (int16_t i = 0; i < lineWidth; i++) {
memcpy(dataAddr + (x0 + (i + y0) * stride) * 4, row, rowSize);
}
} else {
// Vertical-ish: write individual pixels
for (int16_t i = 0; i < lineWidth; i++) {
((uint32_t *)dataAddr)[(x0 + i) + y0 * stride] = color32;
}
}
Comment on lines 269 to 280
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | 🏗️ Heavy lift

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Show prior drawLine implementation to confirm what brush shape was rendered before this PR.
git log --all --oneline -- app/src/main/cpp/winlator/drawable.c | head -n 20
git show HEAD~1:app/src/main/cpp/winlator/drawable.c 2>/dev/null | sed -n '/Java_com_winlator_xserver_Drawable_drawLine/,/^}/p'

Repository: utkarshdalal/GameNative

Length of output: 458


🏁 Script executed:

#!/bin/bash
# First, find and read the drawable.c file
find . -name "drawable.c" -type f

Repository: utkarshdalal/GameNative

Length of output: 106


🏁 Script executed:

#!/bin/bash
# Read the drawable.c file to see the full drawLine implementation around lines 269-280
wc -l app/src/main/cpp/winlator/drawable.c
sed -n '240,300p' app/src/main/cpp/winlator/drawable.c

Repository: utkarshdalal/GameNative

Length of output: 2059


🏁 Script executed:

#!/bin/bash
# Try to get the git history to find previous versions
cd app/src/main/cpp/winlator && git log --oneline drawable.c | head -20

Repository: utkarshdalal/GameNative

Length of output: 183


🏁 Script executed:

#!/bin/bash
# Show the merge commit details and parents
cd app/src/main/cpp/winlator && git show 874b6f53 --stat | head -40

Repository: utkarshdalal/GameNative

Length of output: 2520


🏁 Script executed:

#!/bin/bash
# Get the prior version before the merge
cd app/src/main/cpp/winlator && git show 874b6f53^:drawable.c 2>/dev/null | sed -n '240,300p'

Repository: utkarshdalal/GameNative

Length of output: 49


🏁 Script executed:

#!/bin/bash
# Also try the other parent
cd app/src/main/cpp/winlator && git show 874b6f53^2:drawable.c 2>/dev/null | sed -n '240,300p'

Repository: utkarshdalal/GameNative

Length of output: 49


🏁 Script executed:

#!/bin/bash
# Show git diff for this file in the merge to see what changed
cd app/src/main/cpp/winlator && git show 874b6f53 -- drawable.c | grep -A 50 -B 10 "isHorizontal" | head -100

Repository: utkarshdalal/GameNative

Length of output: 2678


🏁 Script executed:

#!/bin/bash
# Check what branches exist to understand the structure better
git branch -a | head -20

Repository: utkarshdalal/GameNative

Length of output: 171


🏁 Script executed:

#!/bin/bash
# Try to find commits that specifically changed drawLine
cd app/src/main/cpp/winlator && git log --all --oneline -S "isHorizontal" -- drawable.c

Repository: utkarshdalal/GameNative

Length of output: 183


🏁 Script executed:

#!/bin/bash
# Check if there's a prior version of the file on master branch
git show master:app/src/main/cpp/winlator/drawable.c 2>&1 | grep -A 20 "drawLine" | head -40

Repository: utkarshdalal/GameNative

Length of output: 913


🏁 Script executed:

#!/bin/bash
# Get full history of commits before the merge
git log --oneline --all | head -20

Repository: utkarshdalal/GameNative

Length of output: 1386


🏁 Script executed:

#!/bin/bash
# Try to see the actual state on master before this merge
git show master:app/src/main/cpp/winlator/drawable.c 2>&1 | wc -l

Repository: utkarshdalal/GameNative

Length of output: 71


🏁 Script executed:

#!/bin/bash
# Get the full drawLine function from master
git show master:app/src/main/cpp/winlator/drawable.c 2>&1 | sed -n '/Java_com_winlator_xserver_Drawable_drawLine/,/^}/p'

Repository: utkarshdalal/GameNative

Length of output: 1351


Inconsistent brush shape between horizontal and vertical branches when lineWidth > 1 is a regression.

The prior implementation painted a lineWidth × lineWidth square at every Bresenham step for all lines. The current code breaks this:

  • Horizontal-ish: Unchanged — still paints lineWidth × lineWidth square (memcpy loop over lineWidth iterations, each writing rowSize bytes).
  • Vertical-ish: New asymmetry — paints only lineWidth × 1 horizontal strip (single-pixel-height row at y0).

For lineWidth == 1 both are identical, but for thicker lines this creates asymmetric rendering: vertical-ish lines render with reduced thickness compared to horizontal-ish lines, breaking the prior consistent behavior.

Either restore uniform square brushes (apply the memcpy approach to both branches), or implement symmetric perpendicular strips (write a column of lineWidth pixels for horizontal, row for vertical). The current mixed approach is a regression.

Symmetric perpendicular brush fix
        if (isHorizontal) {
-            // Horizontal-ish: write a full row of pixels at once
+            // Horizontal-ish: thicken vertically — column of lineWidth pixels at x0.
            for (int16_t i = 0; i < lineWidth; i++) {
-                memcpy(dataAddr + (x0 + (i + y0) * stride) * 4, row, rowSize);
+                ((uint32_t *)dataAddr)[x0 + (y0 + i) * stride] = color32;
            }
        } else {
            // Vertical-ish: write individual pixels
            for (int16_t i = 0; i < lineWidth; i++) {
                ((uint32_t *)dataAddr)[(x0 + i) + y0 * stride] = color32;
            }
        }
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@app/src/main/cpp/winlator/drawable.c` around lines 269 - 280, The vertical
branch in the drawing loop (conditioned on isHorizontal) writes only a
lineWidth×1 strip while the horizontal branch paints a lineWidth×lineWidth
square, causing asymmetric brush shapes for lineWidth>1; update the vertical-ish
branch (the loop that uses ((uint32_t *)dataAddr)[(x0 + i) + y0 * stride] =
color32) to paint the same square brush as the horizontal branch (use the same
memcpy-based row writes or an equivalent nested loop writing rowSize bytes for
each of lineWidth rows at each Bresenham step using dataAddr, x0, y0, stride,
row, rowSize and color32) so both branches produce identical lineWidth×lineWidth
coverage per step.

if (x0 == x1 && y0 == y1) break;

Expand All @@ -230,7 +291,7 @@ Java_com_winlator_xserver_Drawable_drawLine(JNIEnv *env, jclass obj, jshort x0,
}
}

free(row);
if (heapRow) free(row);
}

JNIEXPORT void JNICALL
Expand All @@ -240,22 +301,47 @@ Java_com_winlator_xserver_Drawable_drawAlphaMaskedBitmap(JNIEnv *env, jclass obj
jbyte backGreen, jbyte backBlue,
jobject srcData, jobject maskData,
jobject dstData) {
int *srcDataAddr = (*env)->GetDirectBufferAddress(env, srcData);
int *maskDataAddr = (*env)->GetDirectBufferAddress(env, maskData);
int *dstDataAddr = (*env)->GetDirectBufferAddress(env, dstData);
uint32_t *srcDataAddr = (*env)->GetDirectBufferAddress(env, srcData);
uint32_t *maskDataAddr = (*env)->GetDirectBufferAddress(env, maskData);
uint32_t *dstDataAddr = (*env)->GetDirectBufferAddress(env, dstData);

if (!srcDataAddr || !maskDataAddr || !dstDataAddr) {
printf("Error: NULL buffer address in drawAlphaMaskedBitmap\n");
return;
}

int foreColor = packColor(foreRed, foreGreen, foreBlue);
int backColor = packColor(backRed, backGreen, backBlue);
uint32_t foreColor = (uint32_t)packColor(foreRed, foreGreen, foreBlue) | 0xff000000u;
uint32_t backColor = (uint32_t)packColor(backRed, backGreen, backBlue) | 0xff000000u;

jlong dstLength = (*env)->GetDirectBufferCapacity(env, dstData) / 4;
for (int i = 0; i < dstLength; i++) {
dstDataAddr[i] = maskDataAddr[i] == WHITE ? (srcDataAddr[i] == WHITE ? foreColor : backColor) | 0xff000000 : 0x00000000;
const uint32_t whiteMask = (uint32_t)WHITE;
#ifdef __ARM_NEON
uint32x4_t vFore = vdupq_n_u32(foreColor);
uint32x4_t vBack = vdupq_n_u32(backColor);
uint32x4_t vWhite = vdupq_n_u32(whiteMask);
uint32x4_t vZero = vdupq_n_u32(0u);
jlong i = 0;
for (; i + 3 < dstLength; i += 4) {
uint32x4_t vMask = vld1q_u32(maskDataAddr + i);
uint32x4_t vSrc = vld1q_u32(srcDataAddr + i);
uint32x4_t maskIsWhite = vceqq_u32(vMask, vWhite);
uint32x4_t srcIsWhite = vceqq_u32(vSrc, vWhite);
uint32x4_t color = vbslq_u32(srcIsWhite, vFore, vBack);
uint32x4_t result = vbslq_u32(maskIsWhite, color, vZero);
vst1q_u32(dstDataAddr + i, result);
}
for (; i < dstLength; i++) {
dstDataAddr[i] = maskDataAddr[i] == whiteMask
? (srcDataAddr[i] == whiteMask ? foreColor : backColor)
: 0u;
}
#else
for (jlong i = 0; i < dstLength; i++) {
dstDataAddr[i] = maskDataAddr[i] == whiteMask
? (srcDataAddr[i] == whiteMask ? foreColor : backColor)
: 0u;
}
#endif
}

/* replace the whole JNI body */
Expand Down Expand Up @@ -302,12 +388,60 @@ Java_com_winlator_xserver_Pixmap_toBitmap(JNIEnv *env, jclass obj, jobject color
return;
}

for (int i = 0, size = info.width * info.height * 4; i < size; i += 4) {
pixels[i+2] = colorDataAddr[i+0];
pixels[i+1] = colorDataAddr[i+1];
pixels[i+0] = colorDataAddr[i+2];
pixels[i+3] = maskDataAddr ? maskDataAddr[i+0] : colorDataAddr[i+3];
int size = info.width * info.height;
uint8_t *src = (uint8_t *)colorDataAddr;

// Byte-Swapping using ARM NEON in order to rely on a shuffle table
// to reduce operations since they're deterministic.
#ifdef __ARM_NEON
if (!maskDataAddr) {
/* Fast path: no mask — swap R and B channels across 4 pixels at a time.
* src layout per pixel: [R, G, B, A]
* dst layout per pixel: [B, G, R, A]
* vrev32q_u8 reverses the 4 bytes within each 32-bit pixel: RGBA → ABGR,
* which maps R→B and B→R with G and A landing in wrong positions.
* Instead we use vtbl (byte table lookup) to do an exact per-byte shuffle. */
static const uint8_t shuffle[16] = {
2, 1, 0, 3, /* pixel 0: swap bytes 0 and 2 (R↔B), keep 1 (G) and 3 (A) */
6, 5, 4, 7, /* pixel 1 */
10, 9, 8, 11, /* pixel 2 */
14, 13, 12, 15 /* pixel 3 */
};
uint8x16_t vShuffle = vld1q_u8(shuffle);
int i = 0;
for (; i + 3 < size; i += 4) {
uint8x16_t vSrc = vld1q_u8(src + i * 4);
uint8x16_t vDst = vqtbl1q_u8(vSrc, vShuffle);
vst1q_u8(pixels + i * 4, vDst);
}
Comment on lines +396 to +416
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical | ⚡ Quick win

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Confirm whether the project targets armeabi-v7a (in which case __ARM_NEON without __aarch64__ is reachable).
fd -t f -e gradle -e kts | xargs rg -n -C2 'abiFilters|ndk\s*\{|armeabi-v7a|arm64-v8a' 2>/dev/null
fd -i 'application\.mk|android\.mk|cmakelists\.txt' | xargs rg -n -C2 'APP_ABI|ANDROID_ABI|armeabi|arm64' 2>/dev/null

Repository: utkarshdalal/GameNative

Length of output: 315


vqtbl1q_u8 is AArch64-only — guard must include __aarch64__.

The project explicitly targets both arm64-v8a and armeabi-v7a (as seen in app/build.gradle.kts:76). The code uses #ifdef __ARM_NEON, which is defined for both ARMv7-A and AArch64, but vqtbl1q_u8 is an A64-only intrinsic and does not exist on ARMv7-A NEON. This will cause a compilation failure on 32-bit ARM builds.

Change the guard to:

-#ifdef __ARM_NEON
+#if defined(__ARM_NEON) && defined(__aarch64__)

The scalar #else branch already handles both cases, so 32-bit ARM will fall back to the portable path unchanged. Alternatively, if you need NEON vectorization on ARMv7-A, use vtbl1_u8 on uint8x8_t halves instead.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
#ifdef __ARM_NEON
if (!maskDataAddr) {
/* Fast path: no maskswap R and B channels across 4 pixels at a time.
* src layout per pixel: [R, G, B, A]
* dst layout per pixel: [B, G, R, A]
* vrev32q_u8 reverses the 4 bytes within each 32-bit pixel: RGBAABGR,
* which maps RB and BR with G and A landing in wrong positions.
* Instead we use vtbl (byte table lookup) to do an exact per-byte shuffle. */
static const uint8_t shuffle[16] = {
2, 1, 0, 3, /* pixel 0: swap bytes 0 and 2 (R↔B), keep 1 (G) and 3 (A) */
6, 5, 4, 7, /* pixel 1 */
10, 9, 8, 11, /* pixel 2 */
14, 13, 12, 15 /* pixel 3 */
};
uint8x16_t vShuffle = vld1q_u8(shuffle);
int i = 0;
for (; i + 3 < size; i += 4) {
uint8x16_t vSrc = vld1q_u8(src + i * 4);
uint8x16_t vDst = vqtbl1q_u8(vSrc, vShuffle);
vst1q_u8(pixels + i * 4, vDst);
}
`#if` defined(__ARM_NEON) && defined(__aarch64__)
if (!maskDataAddr) {
/* Fast path: no maskswap R and B channels across 4 pixels at a time.
* src layout per pixel: [R, G, B, A]
* dst layout per pixel: [B, G, R, A]
* vrev32q_u8 reverses the 4 bytes within each 32-bit pixel: RGBAABGR,
* which maps RB and BR with G and A landing in wrong positions.
* Instead we use vtbl (byte table lookup) to do an exact per-byte shuffle. */
static const uint8_t shuffle[16] = {
2, 1, 0, 3, /* pixel 0: swap bytes 0 and 2 (R↔B), keep 1 (G) and 3 (A) */
6, 5, 4, 7, /* pixel 1 */
10, 9, 8, 11, /* pixel 2 */
14, 13, 12, 15 /* pixel 3 */
};
uint8x16_t vShuffle = vld1q_u8(shuffle);
int i = 0;
for (; i + 3 < size; i += 4) {
uint8x16_t vSrc = vld1q_u8(src + i * 4);
uint8x16_t vDst = vqtbl1q_u8(vSrc, vShuffle);
vst1q_u8(pixels + i * 4, vDst);
}
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@app/src/main/cpp/winlator/drawable.c` around lines 396 - 416, The NEON block
uses the AArch64-only intrinsic vqtbl1q_u8 under the existing `#ifdef` __ARM_NEON,
which breaks 32-bit ARM builds; update the preprocessor guard to require AArch64
(e.g., check __aarch64__ in addition to __ARM_NEON) around the vqtbl1q_u8 path
(the static shuffle, vShuffle, vld1q_u8/vqtbl1q_u8/vst1q_u8 loop) so armeabi-v7a
falls back to the existing scalar/#else branch, or replace with an
ARMv7-compatible implementation using vtbl1_u8 on uint8x8_t halves if you want
NEON on 32-bit.

/* Scalar cleanup for remaining 0-3 pixels */
for (; i < size; i++) {
int j = i * 4;
pixels[j+0] = src[j+2];
pixels[j+1] = src[j+1];
pixels[j+2] = src[j+0];
pixels[j+3] = src[j+3];
}
} else {
/* Mask path — scalar, same as before */
uint8_t *mask = (uint8_t *)maskDataAddr;
for (int i = 0; i < size; i++) {
int j = i * 4;
pixels[j+0] = src[j+2];
pixels[j+1] = src[j+1];
pixels[j+2] = src[j+0];
pixels[j+3] = mask[j];
}
}
#else
for (int i = 0; i < size; i++) {
int j = i * 4;
pixels[j+0] = src[j+2];
pixels[j+1] = src[j+1];
pixels[j+2] = src[j+0];
pixels[j+3] = maskDataAddr ? ((uint8_t *)maskDataAddr)[j] : src[j+3];
}
#endif

AndroidBitmap_unlockPixels(env, bitmap);
}
Loading