@@ -229,6 +229,38 @@ __global__ void searchKernel(
229229 out_err [tid] = bestDelta;
230230}
231231
/* ====================================================================
 * buildCatalogKernel: precompute 192-bit carrier bitmaps for all seeds.
 *
 * Launch:  1-D grid of any block size with at least NSEEDS threads in
 *          total; threads with tid >= NSEEDS exit immediately.
 * Mapping: one thread per seed (tid -> seed = tid + 1).
 * Output:  cat_d[tid*6 .. tid*6+5] = uint32_t[6] (24 bytes, 192 bits),
 *          so cat_d must hold NSEEDS * 6 uint32_t.
 * Bit index: by*16 + bx (bx in [0,15], by in [0,11]) - valid blk=8 blocks.
 * andN:    number of LFSR steps AND-ed together per cell; a cell is set
 *          only if every one of those steps leaves bit 0 of the state set.
 *          With andN <= 0 the inner loop does not run and every cell is set.
 * ==================================================================== */
__global__ void buildCatalogKernel(uint32_t * __restrict__ cat_d, int andN) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= NSEEDS) return;

    uint16_t s = (uint16_t)(tid + 1);
    uint32_t bm[6] = {0, 0, 0, 0, 0, 0};

    /* Each row consumes 32 cells' worth of LFSR output although only the
     * first 16 columns are recorded, so the bx loop must still run to 32 to
     * keep the stream position right for the next row.  Rows 12 and beyond
     * are never recorded and the LFSR state is private to this thread, so
     * generating them would be pure wasted work: stop after row 11. */
    for (int by = 0; by < 12; by++) {
        for (int bx = 0; bx < 32; bx++) {
            uint8_t acc = 1;
            for (int k = 0; k < andN; k++) {
                /* 16-bit LFSR step: shift right, XOR 0xB400 when the
                 * shifted-out bit was 1. */
                uint16_t bit = s & 1u;
                s >>= 1;
                if (bit) s ^= 0xB400u;
                acc &= (uint8_t)(s & 1u);
            }
            /* Only the 16 leftmost columns are valid carrier blocks. */
            if (bx < 16 && acc) {
                int idx = by * 16 + bx;
                bm[idx >> 5] |= (1u << (idx & 31));
            }
        }
    }

    int base = tid * 6;
    for (int w = 0; w < 6; w++) cat_d[base + w] = bm[w];
}
263+
232264/* ====================================================================
233265 * applyBuf (host)
234266 * ==================================================================== */
@@ -463,12 +495,184 @@ static int cp_carrier_gpu(const uint8_t *cnv_pack, const uint8_t *tgt_pack,
463495 return best_delta;
464496}
465497
/* ====================================================================
 * cp_build_catalog: build the carrier catalog on the GPU and save it.
 *
 * File layout: "CPCT" magic (4B) + andN_lo(u8) + andN_hi(u8) +
 *              n_seeds(u32, host byte order, =NSEEDS) +
 *              data[n_andN x n_seeds x 24B], one layer per andN value in
 *              ascending andN order, each layer as produced by
 *              buildCatalogKernel.
 * Total for andN 3-8: 10 + 6x65535x24 = ~9.4 MB.
 *
 * Parameters:
 *   gpu_id   CUDA device used to run buildCatalogKernel.
 *   path     output file; created or truncated.
 *   andN_lo, andN_hi
 *            inclusive andN range; must satisfy 0 <= lo <= hi <= 255
 *            because the header stores each bound in a single byte.
 *
 * Returns 0 on success, 1 on any failure (bad arguments, CUDA error,
 * allocation failure, or file I/O error).  On failure after the file was
 * opened the partial file is removed, so a truncated catalog is never left
 * behind for a later load.
 * ==================================================================== */
int cp_build_catalog(int gpu_id, const char *path, int andN_lo, int andN_hi) {
    if (!path) {
        fprintf(stderr, " [catalog] no output path given\n");
        return 1;
    }
    if (andN_lo < 0 || andN_hi > 255 || andN_lo > andN_hi) {
        fprintf(stderr, " [catalog] invalid andN range %d..%d "
                        "(need 0 <= lo <= hi <= 255)\n", andN_lo, andN_hi);
        return 1;
    }

    cudaDeviceProp prop;
    cudaError_t cerr = cudaSetDevice(gpu_id);
    if (cerr == cudaSuccess) cerr = cudaGetDeviceProperties(&prop, gpu_id);
    if (cerr != cudaSuccess) {
        fprintf(stderr, " [catalog] GPU %d unavailable: %s\n",
                gpu_id, cudaGetErrorString(cerr));
        return 1;
    }

    /* All locals are declared before the first `goto done` so no jump
     * crosses an initialization. */
    const int    n_andN      = andN_hi - andN_lo + 1;
    const size_t layer_u32   = (size_t)NSEEDS * 6;
    const size_t layer_bytes = layer_u32 * sizeof(uint32_t);
    const int    bs          = 256;
    const int    gs          = (NSEEDS + bs - 1) / bs;   /* ceil-div */
    uint32_t    *cat_d       = NULL;
    uint32_t    *cat_h       = NULL;
    FILE        *fp          = NULL;
    int          opened      = 0;
    int          rc          = 1;
    uint32_t     ns          = NSEEDS;
    uint8_t      hdr[10];

    printf(" [catalog] GPU %d: %s | andN=%d..%d seeds=1..%d (%.1f MB)\n ",
           gpu_id, prop.name, andN_lo, andN_hi, NSEEDS,
           (10.0 + n_andN * layer_bytes) / 1048576.0);

    if (cudaMalloc(&cat_d, layer_bytes) != cudaSuccess) {
        cat_d = NULL;
        fprintf(stderr, " [catalog] cudaMalloc failed\n ");
        goto done;
    }

    cat_h = (uint32_t *)malloc(layer_bytes);
    if (!cat_h) {
        fprintf(stderr, " [catalog] host malloc of %zu bytes failed\n", layer_bytes);
        goto done;
    }

    /* The mode string must begin with 'w' (no leading whitespace) to be a
     * valid fopen mode; 'b' keeps the stream binary on every platform. */
    fp = fopen(path, "wb");
    if (!fp) { perror(path); goto done; }
    opened = 1;

    /* Header.
     * NOTE(review): the magic literal is reproduced exactly as found; only
     * its first 4 bytes are written, and the loader in cp_search compares
     * against the same literal.  Confirm both agree with the documented
     * "CPCT" magic. */
    memcpy(hdr, " CPCT", 4);
    hdr[4] = (uint8_t)andN_lo;
    hdr[5] = (uint8_t)andN_hi;
    memcpy(hdr + 6, &ns, 4);
    if (fwrite(hdr, 1, sizeof hdr, fp) != sizeof hdr) { perror(path); goto done; }

    for (int andN = andN_lo; andN <= andN_hi; andN++) {
        printf(" [catalog] andN=%d ... ", andN); fflush(stdout);

        buildCatalogKernel<<<gs, bs>>>(cat_d, andN);
        cerr = cudaGetLastError();                       /* launch-config errors */
        if (cerr == cudaSuccess) cerr = cudaDeviceSynchronize(); /* execution errors */
        if (cerr == cudaSuccess)
            cerr = cudaMemcpy(cat_h, cat_d, layer_bytes, cudaMemcpyDeviceToHost);
        if (cerr != cudaSuccess) {
            fprintf(stderr, "\n [catalog] GPU error at andN=%d: %s\n",
                    andN, cudaGetErrorString(cerr));
            goto done;
        }

        if (fwrite(cat_h, sizeof(uint32_t), layer_u32, fp) != layer_u32) {
            perror(path);
            goto done;
        }
        printf(" done\n ");
    }
    rc = 0;

done:
    free(cat_h);
    /* fclose flushes buffered data, so its failure means the file is bad. */
    if (opened && fclose(fp) != 0) { perror(path); rc = 1; }
    if (cat_d) cudaFree(cat_d);

    if (rc == 0)
        printf(" [catalog] saved %s\n ", path);
    else if (opened)
        remove(path);   /* never leave a truncated catalog on disk */
    return rc;
}
550+
/* ====================================================================
 * cp_query_catalog: find the best (seed, andN) in a loaded catalog using
 * the per-block proxy score only (no pixel-level rescore is done here).
 *
 * catalog:    n_andN layers of n_seeds entries, 6 uint32_t (192 bits) per
 *             entry; bit (by*16+bx) set means the carrier covers that block.
 *             Entry s of a layer belongs to seed s+1.
 * hot_bits[6]: 192-bit mask - bit (by*16+bx) set iff 8x8 block has >=1 error.
 *             Accepted for interface symmetry; not read, because hot_px
 *             already carries that information.
 * hot_px[192]: per-block error count (uint8, 0-64), indexed by by*16+bx.
 * out_andN:   receives the andN value (cat_andN_lo + layer index) of the
 *             winning entry.
 *
 * Proxy score of an entry = sum over its set bits of (64 - 2*hot_px[bit]).
 * Lower is better.  The entry with the lowest score wins; ties go to the
 * first one met in (andN ascending, seed ascending) order.
 *
 * Returns the winning seed.  For an empty catalog (n_seeds <= 0 or an empty
 * andN range) returns seed 1 with *out_andN = cat_andN_lo.
 * ==================================================================== */
#define CP_TOPK 16
static uint16_t cp_query_catalog(const uint32_t *catalog, int n_seeds,
                                 int cat_andN_lo, int cat_andN_hi,
                                 const uint32_t hot_bits[6],
                                 const uint8_t hot_px[192],   /* err count per block */
                                 int *out_andN) {
    const int n_andN     = cat_andN_hi - cat_andN_lo + 1;
    int       best_score = INT_MAX;   /* any real score is at most 192*64 */
    int       best_ai    = 0;
    uint16_t  best_seed  = 1;

    (void)hot_bits;   /* information is already contained in hot_px */

    for (int ai = 0; ai < n_andN; ai++) {
        const uint32_t *layer = catalog + (size_t)ai * n_seeds * 6;
        for (int s = 0; s < n_seeds; s++) {
            const uint32_t *bm = layer + (size_t)s * 6;

            /* Weighted proxy: sum over active blocks (64 - 2*err_in_block).
             * Walk word by word and stop as soon as a word has no bits left,
             * so sparse bitmaps cost far fewer than 192 iterations. */
            int score = 0;
            for (int w = 0; w < 6; w++) {
                uint32_t word = bm[w];
                for (int b = 0; word != 0; b++, word >>= 1) {
                    if (word & 1u)
                        score += 64 - 2 * (int)hot_px[w * 32 + b];
                }
            }

            /* Only the single best entry is returned, so it is tracked
             * directly; this guarantees the returned candidate really is
             * the lowest-scoring one. */
            if (score < best_score) {
                best_score = score;
                best_seed  = (uint16_t)(s + 1);
                best_ai    = ai;
            }
        }
    }

    *out_andN = cat_andN_lo + best_ai;
    return best_seed;
}
608+
/* cp_rescore_topk: catalog carrier search with pixel-level rescoring.
 *
 * Stage 1 scans every (andN layer, seed) entry of the catalog, scores it
 * with the block proxy (sum over set bits of 64 - 2*hot_px[bit]) and keeps
 * up to CP_TOPK entries with the lowest scores.
 * Stage 2 regenerates each kept carrier with makeBuf_h and recomputes its
 * delta straight from the pixels: over every covered 8x8 block in the
 * 16x12 grid, +1 per matching pixel and -1 per mismatching pixel.
 * The candidate with the lowest delta wins (first one on ties).
 *
 * hot_bits is accepted but not read.
 * Outputs: *out_andN = winning andN, *out_score = its pixel delta.
 * Returns the winning seed.  With nothing to scan the result is seed 1,
 * *out_andN = cat_andN_lo and *out_score = INT_MAX. */
static uint16_t cp_rescore_topk(
        const uint32_t *catalog, int n_seeds, int cat_andN_lo, int cat_andN_hi,
        const uint32_t hot_bits[6], const uint8_t hot_px[192],
        const uint8_t canvas[W*H], const uint8_t target_px[W*H],
        int *out_andN, int *out_score) {
    struct Cand { int score; uint16_t seed; int ai; };

    Cand shortlist[CP_TOPK];
    int  n_kept = 0;
    int  cutoff = INT_MIN;                 /* highest score currently kept */
    const int n_layers = cat_andN_hi - cat_andN_lo + 1;

    (void)hot_bits;

    /* ---- Stage 1: proxy scan, keep the CP_TOPK lowest scores ---- */
    for (int li = 0; li < n_layers; ++li) {
        const uint32_t *layer = catalog + (size_t)li * n_seeds * 6;

        for (int si = 0; si < n_seeds; ++si) {
            const uint32_t *bits = layer + si * 6;

            int proxy = 0;
            for (int blk = 0; blk < 192; ++blk) {
                const uint32_t mask = 1u << (blk % 32);
                if ((bits[blk / 32] & mask) != 0)
                    proxy += 64 - 2 * (int)hot_px[blk];
            }

            const bool full = (n_kept >= CP_TOPK);
            if (full && proxy >= cutoff)
                continue;                  /* not better than anything kept */

            int dst;
            if (!full) {
                dst = n_kept;              /* still filling: append */
                ++n_kept;
            } else {
                dst = 0;                   /* evict the first highest-scoring entry */
                for (int k = 1; k < n_kept; ++k) {
                    if (shortlist[k].score > shortlist[dst].score)
                        dst = k;
                }
            }
            shortlist[dst].score = proxy;
            shortlist[dst].seed  = (uint16_t)(si + 1);
            shortlist[dst].ai    = li;

            /* Refresh the cutoff to the largest score now in the list. */
            cutoff = shortlist[0].score;
            for (int k = 1; k < n_kept; ++k) {
                if (shortlist[k].score > cutoff)
                    cutoff = shortlist[k].score;
            }
        }
    }

    /* ---- Stage 2: pixel rescore of the shortlist ---- */
    uint16_t win_seed  = 1;
    int      win_andN  = cat_andN_lo;
    int      win_delta = INT_MAX;
    uint8_t  buf[BUF_N];

    for (int k = 0; k < n_kept; ++k) {
        const int cand_andN = cat_andN_lo + shortlist[k].ai;
        makeBuf_h(shortlist[k].seed, 0, cand_andN, buf);

        int delta = 0;
        for (int blk = 0; blk < 192; ++blk) {
            const int by = blk / 16;
            const int bx = blk % 16;
            if (buf[by * 32 + bx] == 0)
                continue;                  /* block not covered by this carrier */

            int wrong = 0;
            for (int dy = 0; dy < 8; ++dy) {
                const int row = (by * 8 + dy) * W + bx * 8;
                for (int dx = 0; dx < 8; ++dx) {
                    if (canvas[row + dx] != target_px[row + dx])
                        ++wrong;
                }
            }
            /* 64 pixels: -1 for each wrong one, +1 for each right one. */
            delta += 64 - 2 * wrong;
        }

        if (delta < win_delta) {
            win_delta = delta;
            win_seed  = shortlist[k].seed;
            win_andN  = cand_andN;
        }
    }

    *out_andN  = win_andN;
    *out_score = win_delta;
    return win_seed;
}
668+
466669int cp_search (const char *target_path, const char *init_canvas_path,
467670 const char *out_json, const char *out_pgm,
468671 int carrier_seeds, /* max seed value for carrier (255=u8 CPU, 65535=u16 GPU) */
469672 int andN_lo, /* carrier andN search range low (default 3) */
470673 int andN_hi, /* carrier andN search range high (default 8) */
471- int gpu_id) { /* GPU device for carrier_seeds > 255 */
674+ int gpu_id, /* GPU device for carrier_seeds > 255 or catalog build */
675+ const char *catalog_path) { /* NULL = no catalog; path = load and use */
472676
473677 uint8_t tgt_pack[PS], cnv_pack[PS];
474678 int tw, th;
@@ -494,7 +698,54 @@ int cp_search(const char *target_path, const char *init_canvas_path,
494698 uint8_t carrier_buf[BUF_N], tmp_buf[BUF_N];
495699 uint16_t cs = 1 ; int can = andN_lo, c_score = INT_MAX;
496700
497- if (carrier_seeds > 255 ) {
701+ /* Try to load catalog for fast CPU carrier search */
702+ uint32_t *catalog_data = NULL ;
703+ int cat_n_seeds = 0 , cat_andN_lo = andN_lo, cat_andN_hi = andN_hi;
704+ if (catalog_path) {
705+ FILE *cf = fopen (catalog_path, " rb" );
706+ if (cf) {
707+ uint8_t hdr[10 ]; fread (hdr, 1 , 10 , cf);
708+ if (memcmp (hdr, " CPCT" , 4 ) == 0 ) {
709+ cat_andN_lo = hdr[4 ]; cat_andN_hi = hdr[5 ];
710+ memcpy (&cat_n_seeds, hdr + 6 , 4 );
711+ int n_andN = cat_andN_hi - cat_andN_lo + 1 ;
712+ size_t data_u32 = (size_t )n_andN * cat_n_seeds * 6 ;
713+ catalog_data = (uint32_t *)malloc (data_u32 * sizeof (uint32_t ));
714+ fread (catalog_data, sizeof (uint32_t ), data_u32, cf);
715+ printf (" [carrier] catalog loaded: %s (andN=%d..%d seeds=%d)\n " ,
716+ catalog_path, cat_andN_lo, cat_andN_hi, cat_n_seeds);
717+ } else {
718+ printf (" [carrier] bad catalog magic, ignoring %s\n " , catalog_path);
719+ }
720+ fclose (cf);
721+ } else {
722+ printf (" [carrier] catalog not found: %s, falling back\n " , catalog_path);
723+ }
724+ }
725+
726+ if (catalog_data) {
727+ /* Build per-block error counts (hot_px[192]) and binary bitmap (hot_bits[6]) */
728+ uint32_t hot_bits[6 ] = {0 ,0 ,0 ,0 ,0 ,0 };
729+ uint8_t hot_px[192 ];
730+ int hot_count = 0 ;
731+ for (int by = 0 ; by < 12 ; by++) for (int bx = 0 ; bx < 16 ; bx++) {
732+ int err = 0 ;
733+ for (int dy = 0 ; dy < 8 ; dy++)
734+ for (int dx = 0 ; dx < 8 ; dx++)
735+ if (canvas[(by*8 +dy)*W+(bx*8 +dx)] != target_px[(by*8 +dy)*W+(bx*8 +dx)])
736+ err++;
737+ hot_px[by*16 +bx] = (uint8_t )err;
738+ if (err) { int bit = by*16 +bx; hot_bits[bit>>5 ] |= (1u <<(bit&31 )); hot_count++; }
739+ }
740+ printf (" [carrier] catalog query: %d hot 8×8 blocks (top-%d rescore)..." ,
741+ hot_count, CP_TOPK); fflush (stdout);
742+ cs = cp_rescore_topk (catalog_data, cat_n_seeds,
743+ cat_andN_lo, cat_andN_hi,
744+ hot_bits, hot_px, canvas, target_px, &can, &c_score);
745+ free (catalog_data);
746+ makeBuf_h (cs, 0 , can, carrier_buf);
747+ printf (" seed=%u andN=%d score=%+d\n " , cs, can, c_score);
748+ } else if (carrier_seeds > 255 ) {
498749 /* GPU: full u16 search (65535 seeds × each andN, one kernel launch per andN) */
499750 printf (" [carrier] GPU search seeds=1..%d andN=%d..%d ...\n " ,
500751 carrier_seeds, andN_lo, andN_hi);
@@ -653,10 +904,12 @@ int main(int argc, char **argv) {
653904 int auto_bounce = 0 ; /* --auto-bounce: probe blk=1,2,4 and pick best for L0 */
654905 const char *weight_map = NULL ; /* --weight-map file.wmap: per-pixel uint8 importance */
655906 int auto_weight = 0 ; /* --auto-weight: derive weight from canvas/target diff each step */
656- int mode_cp = 0 ; /* --cp: carrier-payload hierarchical search */
657- int cp_carrier_seeds = 255 ; /* --cp-seeds N: carrier search range 1..N (255=u8, 65535=u16) */
658- int cp_andN_lo = 3 ; /* --cp-andN-lo N: carrier andN range low */
659- int cp_andN_hi = 8 ; /* --cp-andN-hi N: carrier andN range high */
907+ int mode_cp = 0 ; /* --cp: carrier-payload hierarchical search */
908+ int cp_carrier_seeds = 255 ; /* --cp-seeds N: carrier search range 1..N (255=u8, 65535=u16) */
909+ int cp_andN_lo = 3 ; /* --cp-andN-lo N: carrier andN range low */
910+ int cp_andN_hi = 8 ; /* --cp-andN-hi N: carrier andN range high */
911+ const char *cp_catalog_build = NULL ; /* --cp-build-catalog path: build and save catalog */
912+ const char *cp_catalog = NULL ; /* --cp-catalog path: load catalog for fast carrier search */
660913
661914 /* Multi-zone: up to 8 zones (x,y pairs) */
662915 int zones_x[8 ], zones_y[8 ], n_zones = 0 ;
@@ -685,7 +938,9 @@ int main(int argc, char **argv) {
685938 else if (!strcmp (argv[i]," --cp-seeds" ) && i+1 <argc) cp_carrier_seeds = atoi (argv[++i]);
686939 else if (!strcmp (argv[i]," --cp-andN-lo" )&& i+1 <argc) cp_andN_lo = atoi (argv[++i]);
687940 else if (!strcmp (argv[i]," --cp-andN-hi" )&& i+1 <argc) cp_andN_hi = atoi (argv[++i]);
688- else if (!strcmp (argv[i]," --cp-andN" ) && i+1 <argc) { cp_andN_lo = cp_andN_hi = atoi (argv[++i]); }
941+ else if (!strcmp (argv[i]," --cp-andN" ) && i+1 <argc) { cp_andN_lo = cp_andN_hi = atoi (argv[++i]); }
942+ else if (!strcmp (argv[i]," --cp-build-catalog" ) && i+1 <argc) cp_catalog_build = argv[++i];
943+ else if (!strcmp (argv[i]," --cp-catalog" ) && i+1 <argc) cp_catalog = argv[++i];
689944 else if (!strcmp (argv[i]," --preset" ) && i+1 <argc) {
690945 if (!strcmp (argv[++i]," fine" )) preset_fine = 1 ;
691946 }
@@ -749,11 +1004,18 @@ int main(int argc, char **argv) {
7491004 if (budget_cap > total_phase_budget)
7501005 phases[nphases-1 ].budget += (budget_cap - total_phase_budget);
7511006
1007+ /* Build carrier catalog and exit (no --target needed) */
1008+ if (cp_catalog_build) {
1009+ return cp_build_catalog (gpu_id, cp_catalog_build, cp_andN_lo, cp_andN_hi);
1010+ }
1011+
7521012 if (!target_path) { fprintf (stderr," --target required\n " ); return 1 ; }
7531013
754- /* CP mode: CPU-only, no GPU needed */
755- if (mode_cp) return cp_search (target_path, init_canvas, out_json, out_pgm,
756- cp_carrier_seeds, cp_andN_lo, cp_andN_hi, gpu_id);
1014+ /* CP mode: dispatch to cp_search (GPU only used if carrier_seeds>255 and no catalog) */
1015+ if (mode_cp) {
1016+ return cp_search (target_path, init_canvas, out_json, out_pgm,
1017+ cp_carrier_seeds, cp_andN_lo, cp_andN_hi, gpu_id, cp_catalog);
1018+ }
7571019
7581020 cudaSetDevice (gpu_id);
7591021 cudaDeviceProp prop; cudaGetDeviceProperties (&prop,gpu_id);
0 commit comments