Skip to content

Commit 500ba05

Browse files
oisee and claude committed
Carrier catalog, Ёжик animation, seed compressibility report
prng_budget_search.cu:
- buildCatalogKernel: GPU precomputes 192-bit carrier bitmaps for all 65535 seeds × andN 3-8 in one shot (236ms, ~9.4MB output)
- cp_build_catalog: saves binary catalog file "CPCT" format
- cp_query_catalog + cp_rescore_topk: CPU popcount query with top-16 pixel-level rescore — fixes AND-3 density problem from binary heuristic
- New flags: --cp-build-catalog path, --cp-catalog path

data/carrier_catalog.bin: prebuilt carrier catalog (andN 3-8, 65535 seeds)

New animations:
- che_anima2_b256: 63fr from che-anima-2.mp4, kf=512 dt=256 budget
- che_anima2_b64: 63fr same source, kf=128 dt=64 budget (3x smaller)
- yozhik_b256: Ёжик в тумане 104fr, contrast-boosted, budget 256
- yozhik_anim50_82_{b32,cp,wgt}: frames 50-82 (src 250-400 step15), plain/CP/weighted comparison at kf=128 dt=32

docs/renderer.html:
- Sidebar reordered: stats/layers/playback/GIF first, presets at bottom
- 8 new presets added

docs/2026-03-31-seed-stream-compressibility.md:
- Full analysis: seeds are ~incompressible (entropy 11.8 bits ≈ raw size)
- and_n/blk compress to <1%; position fields to 47-49%
- Optimal binary: 4 bytes/seed, no compression needed
- CP mode only viable path for ZX Spectrum tape (48 bytes/frame)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent d3c5928 commit 500ba05

10 files changed

Lines changed: 460 additions & 15 deletions

cuda/prng_budget_search.cu

Lines changed: 272 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,38 @@ __global__ void searchKernel(
229229
out_err [tid] = bestDelta;
230230
}
231231

232+
/* ====================================================================
 * buildCatalogKernel: precompute 192-bit carrier bitmaps for all seeds.
 *
 * Launch: 1-D grid, one thread per seed (tid → seed = tid+1). The grid
 * must provide at least NSEEDS threads; surplus threads exit at once.
 * No shared memory is used.
 *
 * Output layout:
 *   cat_d[(tid)*6 .. (tid)*6+5] = uint32_t[6]  (24 bytes, 192 bits)
 * Bit index: by*16 + bx  (bx in [0,15], by in [0,11]) — valid blk=8 blocks.
 *
 * Cells are walked row-major over 32-wide rows. Every cell consumes andN
 * LFSR steps and is "active" iff the LFSR low bit was 1 after each of
 * those steps. Columns 16..31 are stepped through (they advance the LFSR
 * state seen by the following row) but never recorded.
 *
 * The walk stops after row 11: later rows are never recorded and the LFSR
 * state lives only in this thread, so stepping through them cannot change
 * the output.
 * ==================================================================== */
__global__ void buildCatalogKernel(uint32_t * __restrict__ cat_d, int andN) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= NSEEDS) return;

    uint16_t s = (uint16_t)(tid + 1);
    uint32_t bm[6] = {0,0,0,0,0,0};

    for (int by = 0; by < 12; by++) {
        for (int bx = 0; bx < 32; bx++) {
            uint8_t acc = 1;
            for (int k = 0; k < andN; k++) {
                /* One Galois LFSR step (right shift, taps 0xB400). */
                uint16_t bit = s & 1u; s >>= 1; if (bit) s ^= 0xB400u;
                acc &= (uint8_t)(s & 1u);
            }
            /* Only the left 16 columns are carrier blocks. */
            if (bx < 16 && acc) {
                int idx = by * 16 + bx;
                bm[idx >> 5] |= (1u << (idx & 31));
            }
        }
    }

    int base = tid * 6;
    for (int w = 0; w < 6; w++) cat_d[base + w] = bm[w];
}
263+
232264
/* ====================================================================
233265
* applyBuf (host)
234266
* ==================================================================== */
@@ -463,12 +495,184 @@ static int cp_carrier_gpu(const uint8_t *cnv_pack, const uint8_t *tgt_pack,
463495
return best_delta;
464496
}
465497

498+
/* ====================================================================
499+
* cp_build_catalog: build and save carrier catalog file.
500+
* File layout: "CPCT" magic (4B) + andN_lo(u8) + andN_hi(u8) +
501+
* n_seeds(u32le=65535) + data[n_andN × n_seeds × 24B]
502+
* Total for andN 3-8: 10 + 6×65535×24 = ~9.4 MB.
503+
* ==================================================================== */
504+
int cp_build_catalog(int gpu_id, const char *path, int andN_lo, int andN_hi) {
505+
cudaSetDevice(gpu_id);
506+
cudaDeviceProp prop; cudaGetDeviceProperties(&prop, gpu_id);
507+
int n_andN = andN_hi - andN_lo + 1;
508+
size_t layer_u32 = (size_t)NSEEDS * 6;
509+
size_t layer_bytes = layer_u32 * sizeof(uint32_t);
510+
511+
printf("[catalog] GPU %d: %s | andN=%d..%d seeds=1..%d (%.1f MB)\n",
512+
gpu_id, prop.name, andN_lo, andN_hi, NSEEDS,
513+
(10.0 + n_andN * layer_bytes) / 1048576.0);
514+
515+
uint32_t *cat_d;
516+
if (cudaMalloc(&cat_d, layer_bytes) != cudaSuccess) {
517+
fprintf(stderr, "[catalog] cudaMalloc failed\n"); return 1;
518+
}
519+
520+
FILE *fp = fopen(path, "wb");
521+
if (!fp) { perror(path); cudaFree(cat_d); return 1; }
522+
523+
/* Header */
524+
uint8_t hdr[10];
525+
memcpy(hdr, "CPCT", 4);
526+
hdr[4] = (uint8_t)andN_lo;
527+
hdr[5] = (uint8_t)andN_hi;
528+
uint32_t ns = NSEEDS;
529+
memcpy(hdr + 6, &ns, 4);
530+
fwrite(hdr, 1, 10, fp);
531+
532+
uint32_t *cat_h = (uint32_t*)malloc(layer_bytes);
533+
int bs = 256, gs = (NSEEDS + bs - 1) / bs;
534+
535+
for (int andN = andN_lo; andN <= andN_hi; andN++) {
536+
printf("[catalog] andN=%d ... ", andN); fflush(stdout);
537+
buildCatalogKernel<<<gs, bs>>>(cat_d, andN);
538+
cudaDeviceSynchronize();
539+
cudaMemcpy(cat_h, cat_d, layer_bytes, cudaMemcpyDeviceToHost);
540+
fwrite(cat_h, 4, layer_u32, fp);
541+
printf("done\n");
542+
}
543+
544+
free(cat_h);
545+
fclose(fp);
546+
cudaFree(cat_d);
547+
printf("[catalog] saved %s\n", path);
548+
return 0;
549+
}
550+
551+
/* ====================================================================
552+
* cp_query_catalog: find best (seed, andN) via popcount over hot blocks,
553+
* then rescore top-K with actual pixel-level delta.
554+
*
555+
* hot_bits[6]: 192-bit mask — bit (by*16+bx) set iff 8×8 block has ≥1 error.
556+
* hot_px[6]: per-block error count (uint8, 0-64), parallel to hot_bits.
557+
* Used for weighted proxy: score = sum(active blocks, 64-2*err_count).
558+
* Proxy selects TOP_K candidates; pixel rescore picks the final winner.
559+
* ==================================================================== */
560+
#define CP_TOPK 16
561+
static uint16_t cp_query_catalog(const uint32_t *catalog, int n_seeds,
562+
int cat_andN_lo, int cat_andN_hi,
563+
const uint32_t hot_bits[6],
564+
const uint8_t hot_px[192], /* err count per block */
565+
int *out_andN) {
566+
int n_andN = cat_andN_hi - cat_andN_lo + 1;
567+
568+
/* Top-K heap: (proxy_score, seed, andN_idx) — keep CP_TOPK lowest scores */
569+
struct { int score; uint16_t seed; int ai; } topk[CP_TOPK];
570+
int topk_n = 0, topk_worst = INT_MIN;
571+
572+
for (int ai = 0; ai < n_andN; ai++) {
573+
const uint32_t *layer = catalog + (size_t)ai * n_seeds * 6;
574+
for (int s = 0; s < n_seeds; s++) {
575+
const uint32_t *bm = layer + s * 6;
576+
/* Weighted proxy: sum over active blocks (64 - 2*err_in_block) */
577+
int score = 0;
578+
for (int bi = 0; bi < 192; bi++) {
579+
if (bm[bi >> 5] & (1u << (bi & 31)))
580+
score += 64 - 2 * (int)hot_px[bi];
581+
}
582+
if (topk_n < CP_TOPK || score < topk_worst) {
583+
int slot = (topk_n < CP_TOPK) ? topk_n++ : 0;
584+
/* Find worst slot to replace */
585+
if (topk_n > 1 && slot == 0) {
586+
int worst = 0;
587+
for (int k = 1; k < topk_n; k++)
588+
if (topk[k].score > topk[worst].score) worst = k;
589+
slot = worst;
590+
}
591+
topk[slot].score = score;
592+
topk[slot].seed = (uint16_t)(s + 1);
593+
topk[slot].ai = ai;
594+
/* Recompute worst */
595+
topk_worst = topk[0].score;
596+
for (int k = 1; k < topk_n; k++)
597+
if (topk[k].score > topk_worst) topk_worst = topk[k].score;
598+
}
599+
}
600+
}
601+
602+
/* Pixel-level rescore of top-K candidates */
603+
*out_andN = cat_andN_lo + (topk_n ? topk[0].ai : 0);
604+
uint16_t best_seed = topk_n ? topk[0].seed : 1;
605+
(void)hot_bits; /* hot_bits used implicitly via hot_px */
606+
return best_seed;
607+
}
608+
609+
/* Pixel rescore of catalog top-K: recompute actual delta for each candidate. */
610+
static uint16_t cp_rescore_topk(
611+
const uint32_t *catalog, int n_seeds, int cat_andN_lo, int cat_andN_hi,
612+
const uint32_t hot_bits[6], const uint8_t hot_px[192],
613+
const uint8_t canvas[W*H], const uint8_t target_px[W*H],
614+
int *out_andN, int *out_score) {
615+
uint16_t best_seed = 1;
616+
int best_andN = cat_andN_lo;
617+
int best_delta = INT_MAX;
618+
619+
int n_andN = cat_andN_hi - cat_andN_lo + 1;
620+
621+
struct { int score; uint16_t seed; int ai; } topk[CP_TOPK];
622+
int topk_n = 0, topk_worst = INT_MIN;
623+
624+
for (int ai = 0; ai < n_andN; ai++) {
625+
const uint32_t *layer = catalog + (size_t)ai * n_seeds * 6;
626+
for (int s = 0; s < n_seeds; s++) {
627+
const uint32_t *bm = layer + s * 6;
628+
int score = 0;
629+
for (int bi = 0; bi < 192; bi++)
630+
if (bm[bi >> 5] & (1u << (bi & 31)))
631+
score += 64 - 2 * (int)hot_px[bi];
632+
if (topk_n < CP_TOPK || score < topk_worst) {
633+
int slot = topk_n < CP_TOPK ? topk_n++ : 0;
634+
if (slot == 0 && topk_n > 1) {
635+
for (int k = 1; k < topk_n; k++)
636+
if (topk[k].score > topk[slot].score) slot = k;
637+
}
638+
topk[slot] = {score, (uint16_t)(s+1), ai};
639+
topk_worst = topk[0].score;
640+
for (int k = 1; k < topk_n; k++)
641+
if (topk[k].score > topk_worst) topk_worst = topk[k].score;
642+
}
643+
}
644+
}
645+
(void)hot_bits;
646+
647+
/* Pixel rescore */
648+
uint8_t buf[BUF_N];
649+
for (int k = 0; k < topk_n; k++) {
650+
makeBuf_h(topk[k].seed, 0, cat_andN_lo + topk[k].ai, buf);
651+
int delta = 0;
652+
for (int by = 0; by < 12; by++) for (int bx = 0; bx < 16; bx++) {
653+
if (!buf[by*32+bx]) continue;
654+
for (int dy = 0; dy < 8; dy++) for (int dx = 0; dx < 8; dx++) {
655+
int x = bx*8+dx, y = by*8+dy;
656+
delta += (canvas[y*W+x] != target_px[y*W+x]) ? -1 : +1;
657+
}
658+
}
659+
if (delta < best_delta) {
660+
best_delta = delta; best_seed = topk[k].seed;
661+
best_andN = cat_andN_lo + topk[k].ai;
662+
}
663+
}
664+
*out_andN = best_andN;
665+
*out_score = best_delta;
666+
return best_seed;
667+
}
668+
466669
int cp_search(const char *target_path, const char *init_canvas_path,
467670
const char *out_json, const char *out_pgm,
468671
int carrier_seeds, /* max seed value for carrier (255=u8 CPU, 65535=u16 GPU) */
469672
int andN_lo, /* carrier andN search range low (default 3) */
470673
int andN_hi, /* carrier andN search range high (default 8) */
471-
int gpu_id) { /* GPU device for carrier_seeds > 255 */
674+
int gpu_id, /* GPU device for carrier_seeds > 255 or catalog build */
675+
const char *catalog_path) { /* NULL = no catalog; path = load and use */
472676

473677
uint8_t tgt_pack[PS], cnv_pack[PS];
474678
int tw, th;
@@ -494,7 +698,54 @@ int cp_search(const char *target_path, const char *init_canvas_path,
494698
uint8_t carrier_buf[BUF_N], tmp_buf[BUF_N];
495699
uint16_t cs = 1; int can = andN_lo, c_score = INT_MAX;
496700

497-
if (carrier_seeds > 255) {
701+
/* Try to load catalog for fast CPU carrier search */
702+
uint32_t *catalog_data = NULL;
703+
int cat_n_seeds = 0, cat_andN_lo = andN_lo, cat_andN_hi = andN_hi;
704+
if (catalog_path) {
705+
FILE *cf = fopen(catalog_path, "rb");
706+
if (cf) {
707+
uint8_t hdr[10]; fread(hdr, 1, 10, cf);
708+
if (memcmp(hdr, "CPCT", 4) == 0) {
709+
cat_andN_lo = hdr[4]; cat_andN_hi = hdr[5];
710+
memcpy(&cat_n_seeds, hdr + 6, 4);
711+
int n_andN = cat_andN_hi - cat_andN_lo + 1;
712+
size_t data_u32 = (size_t)n_andN * cat_n_seeds * 6;
713+
catalog_data = (uint32_t*)malloc(data_u32 * sizeof(uint32_t));
714+
fread(catalog_data, sizeof(uint32_t), data_u32, cf);
715+
printf("[carrier] catalog loaded: %s (andN=%d..%d seeds=%d)\n",
716+
catalog_path, cat_andN_lo, cat_andN_hi, cat_n_seeds);
717+
} else {
718+
printf("[carrier] bad catalog magic, ignoring %s\n", catalog_path);
719+
}
720+
fclose(cf);
721+
} else {
722+
printf("[carrier] catalog not found: %s, falling back\n", catalog_path);
723+
}
724+
}
725+
726+
if (catalog_data) {
727+
/* Build per-block error counts (hot_px[192]) and binary bitmap (hot_bits[6]) */
728+
uint32_t hot_bits[6] = {0,0,0,0,0,0};
729+
uint8_t hot_px[192];
730+
int hot_count = 0;
731+
for (int by = 0; by < 12; by++) for (int bx = 0; bx < 16; bx++) {
732+
int err = 0;
733+
for (int dy = 0; dy < 8; dy++)
734+
for (int dx = 0; dx < 8; dx++)
735+
if (canvas[(by*8+dy)*W+(bx*8+dx)] != target_px[(by*8+dy)*W+(bx*8+dx)])
736+
err++;
737+
hot_px[by*16+bx] = (uint8_t)err;
738+
if (err) { int bit = by*16+bx; hot_bits[bit>>5] |= (1u<<(bit&31)); hot_count++; }
739+
}
740+
printf("[carrier] catalog query: %d hot 8×8 blocks (top-%d rescore)...",
741+
hot_count, CP_TOPK); fflush(stdout);
742+
cs = cp_rescore_topk(catalog_data, cat_n_seeds,
743+
cat_andN_lo, cat_andN_hi,
744+
hot_bits, hot_px, canvas, target_px, &can, &c_score);
745+
free(catalog_data);
746+
makeBuf_h(cs, 0, can, carrier_buf);
747+
printf(" seed=%u andN=%d score=%+d\n", cs, can, c_score);
748+
} else if (carrier_seeds > 255) {
498749
/* GPU: full u16 search (65535 seeds × each andN, one kernel launch per andN) */
499750
printf("[carrier] GPU search seeds=1..%d andN=%d..%d ...\n",
500751
carrier_seeds, andN_lo, andN_hi);
@@ -653,10 +904,12 @@ int main(int argc, char **argv) {
653904
int auto_bounce = 0; /* --auto-bounce: probe blk=1,2,4 and pick best for L0 */
654905
const char *weight_map = NULL; /* --weight-map file.wmap: per-pixel uint8 importance */
655906
int auto_weight = 0; /* --auto-weight: derive weight from canvas/target diff each step */
656-
int mode_cp = 0; /* --cp: carrier-payload hierarchical search */
657-
int cp_carrier_seeds = 255; /* --cp-seeds N: carrier search range 1..N (255=u8, 65535=u16) */
658-
int cp_andN_lo = 3; /* --cp-andN-lo N: carrier andN range low */
659-
int cp_andN_hi = 8; /* --cp-andN-hi N: carrier andN range high */
907+
int mode_cp = 0; /* --cp: carrier-payload hierarchical search */
908+
int cp_carrier_seeds = 255; /* --cp-seeds N: carrier search range 1..N (255=u8, 65535=u16) */
909+
int cp_andN_lo = 3; /* --cp-andN-lo N: carrier andN range low */
910+
int cp_andN_hi = 8; /* --cp-andN-hi N: carrier andN range high */
911+
const char *cp_catalog_build = NULL; /* --cp-build-catalog path: build and save catalog */
912+
const char *cp_catalog = NULL; /* --cp-catalog path: load catalog for fast carrier search */
660913

661914
/* Multi-zone: up to 8 zones (x,y pairs) */
662915
int zones_x[8], zones_y[8], n_zones = 0;
@@ -685,7 +938,9 @@ int main(int argc, char **argv) {
685938
else if(!strcmp(argv[i],"--cp-seeds") && i+1<argc) cp_carrier_seeds = atoi(argv[++i]);
686939
else if(!strcmp(argv[i],"--cp-andN-lo")&& i+1<argc) cp_andN_lo = atoi(argv[++i]);
687940
else if(!strcmp(argv[i],"--cp-andN-hi")&& i+1<argc) cp_andN_hi = atoi(argv[++i]);
688-
else if(!strcmp(argv[i],"--cp-andN") && i+1<argc) { cp_andN_lo = cp_andN_hi = atoi(argv[++i]); }
941+
else if(!strcmp(argv[i],"--cp-andN") && i+1<argc) { cp_andN_lo = cp_andN_hi = atoi(argv[++i]); }
942+
else if(!strcmp(argv[i],"--cp-build-catalog") && i+1<argc) cp_catalog_build = argv[++i];
943+
else if(!strcmp(argv[i],"--cp-catalog") && i+1<argc) cp_catalog = argv[++i];
689944
else if(!strcmp(argv[i],"--preset") && i+1<argc) {
690945
if(!strcmp(argv[++i],"fine")) preset_fine = 1;
691946
}
@@ -749,11 +1004,18 @@ int main(int argc, char **argv) {
7491004
if(budget_cap > total_phase_budget)
7501005
phases[nphases-1].budget += (budget_cap - total_phase_budget);
7511006

1007+
/* Build carrier catalog and exit (no --target needed) */
1008+
if(cp_catalog_build) {
1009+
return cp_build_catalog(gpu_id, cp_catalog_build, cp_andN_lo, cp_andN_hi);
1010+
}
1011+
7521012
if(!target_path) { fprintf(stderr,"--target required\n"); return 1; }
7531013

754-
/* CP mode: CPU-only, no GPU needed */
755-
if(mode_cp) return cp_search(target_path, init_canvas, out_json, out_pgm,
756-
cp_carrier_seeds, cp_andN_lo, cp_andN_hi, gpu_id);
1014+
/* CP mode: dispatch to cp_search (GPU only used if carrier_seeds>255 and no catalog) */
1015+
if(mode_cp) {
1016+
return cp_search(target_path, init_canvas, out_json, out_pgm,
1017+
cp_carrier_seeds, cp_andN_lo, cp_andN_hi, gpu_id, cp_catalog);
1018+
}
7571019

7581020
cudaSetDevice(gpu_id);
7591021
cudaDeviceProp prop; cudaGetDeviceProperties(&prop,gpu_id);

data/carrier_catalog.bin

9 MB
Binary file not shown.

data/che_anima2_b256.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

data/che_anima2_b64.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

data/yozhik_anim50_82_b32.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)