diff --git a/docs/paper/reductions.typ b/docs/paper/reductions.typ index d47f6909..00bb1bf3 100644 --- a/docs/paper/reductions.typ +++ b/docs/paper/reductions.typ @@ -196,6 +196,7 @@ "StaffScheduling": [Staff Scheduling], "SteinerTree": [Steiner Tree], "SteinerTreeInGraphs": [Steiner Tree in Graphs], + "MinimumExternalMacroDataCompression": [Minimum External Macro Data Compression], "StringToStringCorrection": [String-to-String Correction], "StrongConnectivityAugmentation": [Strong Connectivity Augmentation], "SubgraphIsomorphism": [Subgraph Isomorphism], @@ -4983,6 +4984,88 @@ A classical NP-complete problem from Garey and Johnson @garey1979[Ch.~3, p.~76], ] } +#{ + let x = load-model-example("MinimumExternalMacroDataCompression") + let alpha-size = x.instance.alphabet_size + let s = x.instance.string + let n = s.len() + let h = x.instance.pointer_cost + let alpha-map = range(alpha-size).map(i => str.from-unicode(97 + i)) + let s-str = s.map(c => alpha-map.at(c)).join("") + let opt-val = metric-value(x.optimal_value) + [ + #problem-def("MinimumExternalMacroDataCompression")[ + Given a finite alphabet $Sigma$ of size $k$, a string $s in Sigma^*$ of length $n$, and a pointer cost $h in ZZ^+$, find a dictionary string $D in Sigma^*$ and a compressed string $C in (Sigma union {p_1, dots, p_n})^*$, where each $p_i$ is a pointer referencing a contiguous substring of $D$, such that $s$ can be obtained from $C$ by replacing every pointer with its referenced substring, minimizing the total cost $|D| + |C| + (h - 1) times$ (number of pointer occurrences in $C$). + ][ + A classical NP-hard data compression problem, listed as SR22 in Garey and Johnson @garey1979. The macro model of data compression was introduced by #cite(<storer1977>, form: "prose"), who proved NP-completeness via transformation from Vertex Cover. 
#cite(<storer1982>, form: "prose") provided a comprehensive analysis of the macro compression framework, showing that NP-completeness persists even when $h$ is any fixed integer $gt.eq 2$, when the alphabet has $gt.eq 3$ symbols, and when $D$ contains no pointers (the "external" variant). The LZ-family of practical compression algorithms (LZ77, LZSS, LZ78) are restricted forms of this general macro model. The related Smallest Grammar Problem is APX-hard @charikar2005.#footnote[No algorithm improving on brute-force enumeration is known for optimal external macro compression.] + + *Example.* Let $Sigma = {#alpha-map.join(", ")}$ and $s = #s-str$ (length #n) with pointer cost $h = #h$. + + #pred-commands( + "pred create --example MinimumExternalMacroDataCompression -o min-emdc.json", + "pred solve min-emdc.json", + "pred evaluate min-emdc.json --config " + x.optimal_config.map(str).join(","), + ) + + #figure({ + let blue = graph-colors.at(0) + let green = graph-colors.at(1) + let cell(ch, highlight: false, ptr: false) = { + let fill = if ptr { green.transparentize(70%) } else if highlight { blue.transparentize(70%) } else { white } + box(width: 0.5cm, height: 0.55cm, fill: fill, stroke: 0.5pt + luma(120), + align(center + horizon, text(8pt, weight: "bold", ch))) + } + let ptr-cell(label) = { + box(width: 1.5cm, height: 0.55cm, fill: green.transparentize(70%), stroke: 0.5pt + luma(120), + align(center + horizon, text(7pt, weight: "bold", label))) + } + // D = first 6 symbols of s (one copy of the pattern) + let d-len = alpha-size + let d-syms = s.slice(0, d-len) + // C = 3 pointers, each referencing D[0..6] + let num-ptrs = calc.div-euclid(n, d-len) + align(center, stack(dir: ttb, spacing: 0.5cm, + // Source string + stack(dir: ltr, spacing: 0pt, + box(width: 1.5cm, height: 0.5cm, align(right + horizon, text(8pt)[$s: quad$])), + ..s.map(c => cell(alpha-map.at(c))), + ), + // Dictionary D + stack(dir: ltr, spacing: 0pt, + box(width: 1.5cm, height: 0.5cm, align(right + horizon, 
text(8pt)[$D: quad$])), + ..d-syms.map(c => cell(alpha-map.at(c), highlight: true)), + ), + // Compressed string C = 3 pointers + stack(dir: ltr, spacing: 0pt, + box(width: 1.5cm, height: 0.5cm, align(right + horizon, text(8pt)[$C: quad$])), + ..range(num-ptrs).map(_ => ptr-cell[$arrow.r D[0..#d-len]$]), + ), + )) + }, + caption: [Minimum External Macro Data Compression: with $s = #s-str$ (length #n) and pointer cost $h = #h$, the optimal compression stores $D = #s-str.slice(0, alpha-size)$ (#alpha-size symbols) and uses #calc.div-euclid(n, alpha-size) pointers in $C$, achieving cost $#alpha-size + #calc.div-euclid(n, alpha-size) + (#h - 1) times #calc.div-euclid(n, alpha-size) = #opt-val$ vs.~uncompressed cost #n.], + ) + + This instance has a repeating pattern of length #alpha-size, allowing the dictionary $D$ to store one copy and the compressed string $C$ to reference it via pointers. Each pointer costs $h = #h$ (the pointer symbol itself plus $h - 1 = #(h - 1)$ extra), so the total cost is $|D| + |C| + (h - 1) times |"pointers"| = #alpha-size + #calc.div-euclid(n, alpha-size) + #(h - 1) times #calc.div-euclid(n, alpha-size) = #opt-val$, saving $#(n - int(opt-val))$ over the uncompressed cost of #n. + ] + ] +} + +#reduction-rule("MinimumExternalMacroDataCompression", "ILP")[ + The compression problem decomposes into a dictionary selection (which symbols appear at which positions in $D$) and a string partitioning (which segments of $s$ are literals vs.~pointers). Both are naturally expressed with binary variables and linear constraints. The partition structure is modeled as a flow on a DAG whose nodes are string positions and whose arcs are candidate segments. +][ + _Construction._ For alphabet $Sigma$ of size $k$, string $s$ of length $n$, and pointer cost $h$: + + _Variables:_ (1) Binary $d_(j,c) in {0,1}$ for each dictionary position $j in {0, dots, n-1}$ and symbol $c in Sigma$: $d_(j,c) = 1$ iff $D[j] = c$. 
(2) Binary $u_j in {0,1}$: $u_j = 1$ iff dictionary position $j$ is used. (3) Binary $ell_i in {0,1}$ for each string position $i$: $ell_i = 1$ iff position $i$ is covered by a literal. (4) Binary $p_(i,lambda,delta) in {0,1}$ for each valid triple $(i, lambda, delta)$ with $i + lambda <= n$ and $delta + lambda <= n$: $p_(i,lambda,delta) = 1$ iff positions $[i, i + lambda)$ are covered by a pointer referencing $D[delta .. delta + lambda)$. + + _Constraints:_ (1) Dictionary one-hot: $sum_(c in Sigma) d_(j,c) <= 1$ for all $j$. (2) Linking: $d_(j,c) <= u_j$ for all $j, c$. (3) Contiguity: $u_(j+1) <= u_j$ for all $j < n - 1$. (4) Partition flow: the segments form a partition of ${0, dots, n-1}$ via flow conservation on nodes $0, dots, n$. (5) Pointer matching: $p_(i,lambda,delta) <= d_(delta+r, s[i+r])$ for all offsets $r in {0, dots, lambda - 1}$. + + _Objective:_ Minimize $sum_j u_j + sum_i ell_i + h sum_(i,lambda,delta) p_(i,lambda,delta)$. + + _Correctness._ ($arrow.r.double$) An optimal $(D, C)$ pair determines a feasible ILP assignment: set $d_(j,c) = 1$ for each symbol in $D$, $u_j = 1$ for used positions, and activate the corresponding literal or pointer variables for each $C$-slot. The partition flow is satisfied by construction. ($arrow.l.double$) Any feasible ILP solution defines a valid dictionary (one-hot + contiguity) and a valid partition of $s$ into literal and pointer segments (flow conservation + matching), with cost equal to the objective. + + _Solution extraction._ Read $D$ from the $d_(j,c)$ indicators. Walk through the active segments (via $ell_i$ and $p_(i,lambda,delta)$) to reconstruct $C$. 
+] + #{ let x = load-model-example("MinimumFeedbackArcSet") let nv = x.instance.graph.num_vertices diff --git a/docs/paper/references.bib b/docs/paper/references.bib index 8a21f928..f7d6f456 100644 --- a/docs/paper/references.bib +++ b/docs/paper/references.bib @@ -1476,6 +1476,16 @@ @article{edmondsjohnson1973 year = {1973} } +@article{charikar2005, + author = {Moses Charikar and Eric Lehman and Ding Liu and Rina Panigrahy and Manoj Prabhakaran and Amit Sahai and Abhi Shelat}, + title = {The Smallest Grammar Problem}, + journal = {IEEE Transactions on Information Theory}, + volume = {51}, + number = {7}, + pages = {2554--2576}, + year = {2005} +} + @article{lenstra1977, author = {J. K. Lenstra and A. H. G. Rinnooy Kan and P. Brucker}, title = {Complexity of Machine Scheduling Problems}, year = {1977} } @@ -1511,6 +1521,24 @@ @techreport{plaisted1976 year = {1976} } +@techreport{storer1977, + author = {James A. Storer}, + title = {NP-Completeness Results Concerning Data Compression}, + institution = {Princeton University, Department of Electrical Engineering and Computer Science}, + number = {234}, + year = {1977} +} + +@article{storer1982, + author = {James A. Storer and Thomas G. 
Szymanski}, + title = {Data Compression via Textual Substitution}, + journal = {Journal of the ACM}, + volume = {29}, + number = {4}, + pages = {928--951}, + year = {1982} +} + @article{haase2016, author = {Haase, Christoph and Kiefer, Stefan}, title = {The Complexity of the {K}th Largest Subset Problem and Related Problems}, diff --git a/problemreductions-cli/src/cli.rs b/problemreductions-cli/src/cli.rs index 62015a45..8a134445 100644 --- a/problemreductions-cli/src/cli.rs +++ b/problemreductions-cli/src/cli.rs @@ -314,6 +314,7 @@ Flags by problem type: SequencingToMinimizeMaximumCumulativeCost --costs [--precedence-pairs] SequencingToMinimizeWeightedCompletionTime --lengths, --weights [--precedence-pairs] SequencingToMinimizeWeightedTardiness --sizes, --weights, --deadlines, --bound + MinimumExternalMacroDataCompression --string, --pointer-cost [--alphabet-size] SCS --strings [--alphabet-size] StringToStringCorrection --source-string, --target-string, --bound [--alphabet-size] D2CIF --arcs, --capacities, --source-1, --sink-1, --source-2, --sink-2, --requirement-1, --requirement-2 @@ -760,6 +761,9 @@ pub struct CreateArgs { /// Target string for StringToStringCorrection (comma-separated symbol indices, e.g., "0,1,3,2") #[arg(long)] pub target_string: Option, + /// Pointer cost for MinimumExternalMacroDataCompression (positive integer) + #[arg(long)] + pub pointer_cost: Option, /// Expression tree for IntegerExpressionMembership (JSON, e.g., '{"Sum":[{"Atom":1},{"Atom":2}]}') #[arg(long)] pub expression: Option, diff --git a/problemreductions-cli/src/commands/create.rs b/problemreductions-cli/src/commands/create.rs index 129e8f13..f766e542 100644 --- a/problemreductions-cli/src/commands/create.rs +++ b/problemreductions-cli/src/commands/create.rs @@ -25,9 +25,10 @@ use problemreductions::models::misc::{ ConjunctiveBooleanQuery, ConsistencyOfDatabaseFrequencyTables, EnsembleComputation, ExpectedRetrievalCost, FlowShopScheduling, FrequencyTable, GroupingBySwapping, 
IntExpr, IntegerExpressionMembership, JobShopScheduling, KnownValue, KthLargestMTuple, - LongestCommonSubsequence, MinimumTardinessSequencing, MultiprocessorScheduling, PaintShop, - PartiallyOrderedKnapsack, ProductionPlanning, QueryArg, RectilinearPictureCompression, - RegisterSufficiency, ResourceConstrainedScheduling, SchedulingToMinimizeWeightedCompletionTime, + LongestCommonSubsequence, MinimumExternalMacroDataCompression, MinimumTardinessSequencing, + MultiprocessorScheduling, PaintShop, PartiallyOrderedKnapsack, ProductionPlanning, QueryArg, + RectilinearPictureCompression, RegisterSufficiency, ResourceConstrainedScheduling, + SchedulingToMinimizeWeightedCompletionTime, SchedulingWithIndividualDeadlines, SequencingToMinimizeMaximumCumulativeCost, SequencingToMinimizeWeightedCompletionTime, SequencingToMinimizeWeightedTardiness, SequencingWithReleaseTimesAndDeadlines, SequencingWithinIntervals, ShortestCommonSupersequence, @@ -173,6 +174,7 @@ fn all_data_flags_empty(args: &CreateArgs) -> bool { && args.num_attributes.is_none() && args.source_string.is_none() && args.target_string.is_none() + && args.pointer_cost.is_none() && args.capacities.is_none() && args.source_1.is_none() && args.sink_1.is_none() @@ -757,6 +759,9 @@ fn example_for(canonical: &str, graph_type: Option<&str>) -> &'static str { "--strings \"010110;100101;001011\" --alphabet-size 2" } "GroupingBySwapping" => "--string \"0,1,2,0,1,2\" --bound 5", + "MinimumExternalMacroDataCompression" => { + "--string \"0,1,0,1\" --pointer-cost 2 --alphabet-size 2" + } "MinimumCardinalityKey" => { "--num-attributes 6 --dependencies \"0,1>2;0,2>3;1,3>4;2,4>5\"" } @@ -892,6 +897,8 @@ fn help_flag_hint( "raw strings: \"ABAC;BACA\" or symbol lists: \"0,1,0;1,0,1\"" } ("GroupingBySwapping", "string") => "symbol list: \"0,1,2,0,1,2\"", + ("MinimumExternalMacroDataCompression", "string") => "symbol list: \"0,1,0,1\"", + ("MinimumExternalMacroDataCompression", "pointer_cost") => "positive integer: 2", 
("ShortestCommonSupersequence", "strings") => "symbol lists: \"0,1,2;1,2,0\"", ("MultipleChoiceBranching", "partition") => "semicolon-separated groups: \"0,1;2,3\"", ("IntegralFlowHomologousArcs", "homologous_pairs") => { @@ -3157,6 +3164,57 @@ pub fn create(args: &CreateArgs, out: &OutputConfig) -> Result<()> { ) } + // MinimumExternalMacroDataCompression + "MinimumExternalMacroDataCompression" => { + let usage = "Usage: pred create MinimumExternalMacroDataCompression --string \"0,1,0,1\" --pointer-cost 2 [--alphabet-size 2]"; + let string_str = args.string.as_deref().ok_or_else(|| { + anyhow::anyhow!("MinimumExternalMacroDataCompression requires --string\n\n{usage}") + })?; + let pointer_cost = args.pointer_cost.ok_or_else(|| { + anyhow::anyhow!( + "MinimumExternalMacroDataCompression requires --pointer-cost\n\n{usage}" + ) + })?; + anyhow::ensure!( + pointer_cost > 0, + "--pointer-cost must be a positive integer\n\n{usage}" + ); + + let string: Vec = if string_str.trim().is_empty() { + Vec::new() + } else { + string_str + .split(',') + .map(|value| { + value + .trim() + .parse::() + .context("invalid symbol index") + }) + .collect::>>()? 
+ }; + let inferred = string.iter().copied().max().map_or(0, |value| value + 1); + let alphabet_size = args.alphabet_size.unwrap_or(inferred); + anyhow::ensure!( + alphabet_size >= inferred, + "--alphabet-size {} is smaller than max symbol + 1 ({}) in the input string", + alphabet_size, + inferred + ); + anyhow::ensure!( + alphabet_size > 0 || string.is_empty(), + "MinimumExternalMacroDataCompression requires a positive alphabet for non-empty strings.\n\n{usage}" + ); + ( + ser(MinimumExternalMacroDataCompression::new( + alphabet_size, + string, + pointer_cost, + ))?, + resolved_variant.clone(), + ) + } + // ClosestVectorProblem "ClosestVectorProblem" => { let basis_str = args.basis.as_deref().ok_or_else(|| { @@ -7867,6 +7925,7 @@ mod tests { storage: None, quantifiers: None, homologous_pairs: None, + pointer_cost: None, expression: None, coeff_a: None, coeff_b: None, diff --git a/src/models/misc/minimum_external_macro_data_compression.rs b/src/models/misc/minimum_external_macro_data_compression.rs new file mode 100644 index 00000000..dd6fbbd0 --- /dev/null +++ b/src/models/misc/minimum_external_macro_data_compression.rs @@ -0,0 +1,280 @@ +//! Minimum External Macro Data Compression problem implementation. +//! +//! Given an alphabet Sigma, a string s in Sigma*, and a pointer cost h, +//! find a dictionary string D and compressed string C minimizing the total +//! cost |D| + |C| + (h-1) * (number of pointer occurrences in D and C), +//! such that s can be reconstructed from C by replacing pointers with their +//! referenced substrings of D. +//! +//! The configuration uses 2*|s| slots: |s| slots for D (dictionary) and |s| +//! slots for C (compressed string). D-slots use alphabet symbols or empty. +//! C-slots use alphabet symbols, pointers into D (start, len), or empty. +//! D is restricted to be pointer-free (pure alphabet string). +//! +//! This problem is NP-hard (Storer, 1977; Storer & Szymanski, 1978). +//! Reference: Garey & Johnson A4 SR22. 
+ +use crate::registry::{FieldInfo, ProblemSchemaEntry}; +use crate::traits::Problem; +use crate::types::Min; +use serde::{Deserialize, Serialize}; + +inventory::submit! { + ProblemSchemaEntry { + name: "MinimumExternalMacroDataCompression", + display_name: "Minimum External Macro Data Compression", + aliases: &[], + dimensions: &[], + module_path: module_path!(), + description: "Find minimum-cost compression using an external dictionary and compressed string with pointers", + fields: &[ + FieldInfo { name: "alphabet_size", type_name: "usize", description: "Size of the alphabet (symbols indexed 0..alphabet_size)" }, + FieldInfo { name: "string", type_name: "Vec", description: "Source string as symbol indices" }, + FieldInfo { name: "pointer_cost", type_name: "usize", description: "Pointer cost h (each pointer contributes h to the cost)" }, + ], + } +} + +/// Minimum External Macro Data Compression problem. +/// +/// Given an alphabet of size `k`, a string `s` over `{0, ..., k-1}`, and +/// a pointer cost `h`, find dictionary string D and compressed string C +/// that minimize cost = |D| + |C| + (h-1) * (pointer count in C). +/// +/// # Representation +/// +/// The configuration is a vector of `2 * string_length` entries: +/// - First `string_length` entries are D-slots: each is a symbol index +/// in `{0, ..., alphabet_size-1}` or `alphabet_size` (empty/unused). +/// - Next `string_length` entries are C-slots: each is: +/// - A symbol index in `{0, ..., alphabet_size-1}` (literal) +/// - `alphabet_size` (empty/unused) +/// - A value in `{alphabet_size+1, ..., alphabet_size + |s|*(|s|+1)/2}` +/// encoding a pointer (start, len) into D. +/// +/// D is the prefix of non-empty D-slots. C is the prefix of non-empty C-slots. +/// The cost is |D| + |C| + (h-1) * (number of pointer symbols in C). 
+/// +/// # Example +/// +/// ``` +/// use problemreductions::models::misc::MinimumExternalMacroDataCompression; +/// use problemreductions::{Problem, Solver, BruteForce}; +/// +/// // Alphabet {a, b}, string "abab", pointer cost h=2 +/// let problem = MinimumExternalMacroDataCompression::new(2, vec![0, 1, 0, 1], 2); +/// let solver = BruteForce::new(); +/// let solution = solver.find_witness(&problem); +/// assert!(solution.is_some()); +/// ``` +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MinimumExternalMacroDataCompression { + alphabet_size: usize, + string: Vec, + pointer_cost: usize, +} + +impl MinimumExternalMacroDataCompression { + /// Create a new MinimumExternalMacroDataCompression instance. + /// + /// # Panics + /// + /// Panics if `alphabet_size` is 0 and the string is non-empty, or if + /// any symbol in the string is >= `alphabet_size`, or if `pointer_cost` is 0. + pub fn new(alphabet_size: usize, string: Vec, pointer_cost: usize) -> Self { + assert!( + alphabet_size > 0 || string.is_empty(), + "alphabet_size must be > 0 when the string is non-empty" + ); + assert!( + string + .iter() + .all(|&s| s < alphabet_size || alphabet_size == 0), + "all symbols must be less than alphabet_size" + ); + assert!(pointer_cost > 0, "pointer_cost must be positive"); + Self { + alphabet_size, + string, + pointer_cost, + } + } + + /// Returns the length of the source string. + pub fn string_length(&self) -> usize { + self.string.len() + } + + /// Returns the alphabet size. + pub fn alphabet_size(&self) -> usize { + self.alphabet_size + } + + /// Returns the pointer cost h. + pub fn pointer_cost(&self) -> usize { + self.pointer_cost + } + + /// Returns the source string. + pub fn string(&self) -> &[usize] { + &self.string + } + + /// Returns the number of valid pointers into D (|s|*(|s|+1)/2). 
+ fn num_pointers(&self) -> usize { + let n = self.string.len(); + n * (n + 1) / 2 + } + + /// Returns the C-slot domain size: alphabet_size + 1 (empty) + num_pointers. + fn c_domain_size(&self) -> usize { + self.alphabet_size + 1 + self.num_pointers() + } + + /// Decode a pointer index (offset from alphabet_size+1) into (start, len) + /// in the dictionary. Pointers are enumerated as: + /// index 0 -> (0, 1), 1 -> (0, 2), ..., n-1 -> (0, n), + /// n -> (1, 1), n+1 -> (1, 2), ..., etc. + fn decode_pointer(&self, ptr_idx: usize) -> Option<(usize, usize)> { + let n = self.string.len(); + // Enumerate (start, len) pairs where 0 <= start < n, 1 <= len <= n - start + let mut idx = 0; + for start in 0..n { + let max_len = n - start; + if ptr_idx < idx + max_len { + let len = ptr_idx - idx + 1; + return Some((start, len)); + } + idx += max_len; + } + None + } +} + +impl Problem for MinimumExternalMacroDataCompression { + const NAME: &'static str = "MinimumExternalMacroDataCompression"; + type Value = Min; + + fn variant() -> Vec<(&'static str, &'static str)> { + crate::variant_params![] + } + + fn dims(&self) -> Vec { + let n = self.string.len(); + let d_domain = self.alphabet_size + 1; // symbols + empty + let c_domain = self.c_domain_size(); // symbols + empty + pointers + let mut dims = vec![d_domain; n]; // D-slots + dims.extend(vec![c_domain; n]); // C-slots + dims + } + + fn evaluate(&self, config: &[usize]) -> Min { + let n = self.string.len(); + if config.len() != 2 * n { + return Min(None); + } + + // Handle empty string case + if n == 0 { + return Min(Some(0)); + } + + let empty_d = self.alphabet_size; // empty marker for D-slots + let empty_c = self.alphabet_size; // empty marker for C-slots + + // Decode D: prefix of non-empty D-slots + let d_slots = &config[..n]; + let d_len = d_slots.iter().position(|&v| v == empty_d).unwrap_or(n); + + // Verify contiguous: all after first empty must be empty + for &v in &d_slots[d_len..] 
{ + if v != empty_d { + return Min(None); + } + } + + // Verify D symbols are valid alphabet symbols + let d_str: Vec = d_slots[..d_len].to_vec(); + if d_str.iter().any(|&v| v >= self.alphabet_size) { + return Min(None); + } + + // Decode C: prefix of non-empty C-slots + let c_slots = &config[n..]; + let c_len = c_slots.iter().position(|&v| v == empty_c).unwrap_or(n); + + // Verify contiguous: all after first empty must be empty + for &v in &c_slots[c_len..] { + if v != empty_c { + return Min(None); + } + } + + // Decode C into a sequence of symbols, counting pointers + let mut decoded = Vec::new(); + let mut pointer_count: usize = 0; + + for &v in &c_slots[..c_len] { + if v < self.alphabet_size { + // Literal symbol + decoded.push(v); + } else if v > self.alphabet_size { + // Pointer into D + let ptr_idx = v - (self.alphabet_size + 1); + if let Some((start, len)) = self.decode_pointer(ptr_idx) { + // Pointer must reference valid portion of D + if start + len > d_len { + return Min(None); + } + decoded.extend_from_slice(&d_str[start..start + len]); + pointer_count += 1; + } else { + return Min(None); + } + } else { + // v == empty_c, but we already filtered those out + return Min(None); + } + } + + // Check decoded string matches the source string + if decoded != self.string { + return Min(None); + } + + // Compute cost: |D| + |C| + (h-1) * pointer_count + let cost = d_len + c_len + (self.pointer_cost - 1) * pointer_count; + Min(Some(cost)) + } +} + +crate::declare_variants! { + default MinimumExternalMacroDataCompression => "(alphabet_size + 1) ^ string_length * (alphabet_size + 1 + string_length * (string_length + 1) / 2) ^ string_length", +} + +#[cfg(feature = "example-db")] +pub(crate) fn canonical_model_example_specs() -> Vec { + // Issue #441 example: alphabet {a,b,c,d,e,f} (6), s="abcdefabcdefabcdef" (18), h=2. + // Optimal: D="abcdef"(6), C = ptr(0,6) ptr(0,6) ptr(0,6), cost = 6+3+(2-1)*3 = 12. 
+ // Solved via ILP reduction (brute force infeasible at this size). + // + // Config encoding (2*18 = 36 slots): + // D-slots: [0,1,2,3,4,5, 6,6,...,6] (6 symbols + 12 empty, empty=alphabet_size=6) + // C-slots: [ptr(0,6), ptr(0,6), ptr(0,6), 6,6,...,6] (3 pointers + 15 empty) + // ptr(0,6) index: start=0, len=6 → index 5 → encoded as 6+1+5 = 12 + let s: Vec = (0..6).cycle().take(18).collect(); + let mut optimal_config = vec![0, 1, 2, 3, 4, 5]; + optimal_config.extend(vec![6; 12]); // empty D-slots + optimal_config.extend(vec![12, 12, 12]); // 3 pointers to D[0..6] + optimal_config.extend(vec![6; 15]); // empty C-slots + vec![crate::example_db::specs::ModelExampleSpec { + id: "minimum_external_macro_data_compression", + instance: Box::new(MinimumExternalMacroDataCompression::new(6, s, 2)), + optimal_config, + optimal_value: serde_json::json!(12), + }] +} + +#[cfg(test)] +#[path = "../../unit_tests/models/misc/minimum_external_macro_data_compression.rs"] +mod tests; diff --git a/src/models/misc/mod.rs b/src/models/misc/mod.rs index cfbaf213..1b8c1f05 100644 --- a/src/models/misc/mod.rs +++ b/src/models/misc/mod.rs @@ -16,6 +16,7 @@ //! - [`Knapsack`]: 0-1 Knapsack (maximize value subject to weight capacity) //! - [`MultiprocessorScheduling`]: Schedule tasks on processors to meet a deadline //! - [`LongestCommonSubsequence`]: Longest Common Subsequence +//! - [`MinimumExternalMacroDataCompression`]: Minimize compression cost using external dictionary //! - [`MinimumTardinessSequencing`]: Minimize tardy tasks in single-machine scheduling //! - [`PaintShop`]: Minimize color switches in paint shop scheduling //! 
- [`CosineProductIntegration`]: Balanced sign assignment for integer frequencies @@ -82,6 +83,7 @@ mod job_shop_scheduling; mod knapsack; mod kth_largest_m_tuple; mod longest_common_subsequence; +mod minimum_external_macro_data_compression; mod minimum_tardiness_sequencing; mod multiprocessor_scheduling; pub(crate) mod paintshop; @@ -128,6 +130,7 @@ pub use job_shop_scheduling::JobShopScheduling; pub use knapsack::Knapsack; pub use kth_largest_m_tuple::KthLargestMTuple; pub use longest_common_subsequence::LongestCommonSubsequence; +pub use minimum_external_macro_data_compression::MinimumExternalMacroDataCompression; pub use minimum_tardiness_sequencing::MinimumTardinessSequencing; pub use multiprocessor_scheduling::MultiprocessorScheduling; pub use paintshop::PaintShop; @@ -198,6 +201,7 @@ pub(crate) fn canonical_model_example_specs() -> Vec, + /// Total number of variables. + total_vars: usize, +} + +impl VarLayout { + fn new(n: usize, k: usize) -> Self { + let d_offset = 0; + let d_used_offset = d_offset + n * k; + let lit_offset = d_used_offset + n; + let ptr_offset = lit_offset + n; + + // Enumerate all valid (i, l, d_start) triples + let mut ptr_triples = Vec::new(); + for i in 0..n { + for l in 1..=(n - i) { + for d_start in 0..=(n - l) { + ptr_triples.push((i, l, d_start)); + } + } + } + + let total_vars = ptr_offset + ptr_triples.len(); + Self { + n, + k, + d_offset, + d_used_offset, + lit_offset, + ptr_offset, + ptr_triples, + total_vars, + } + } + + fn d_var(&self, j: usize, c: usize) -> usize { + self.d_offset + j * self.k + c + } + + fn d_used_var(&self, j: usize) -> usize { + self.d_used_offset + j + } + + fn lit_var(&self, i: usize) -> usize { + self.lit_offset + i + } + + fn ptr_var(&self, i: usize, l: usize, d_start: usize) -> usize { + // Find the index of (i, l, d_start) in ptr_triples + let idx = self + .ptr_triples + .iter() + .position(|&(pi, pl, pd)| pi == i && pl == l && pd == d_start) + .expect("invalid ptr triple"); + self.ptr_offset + idx 
+ } + + /// Get all ptr variable indices for segments starting at position i with length l. + fn ptr_vars_for_segment(&self, i: usize, l: usize) -> Vec { + self.ptr_triples + .iter() + .enumerate() + .filter(|(_, &(pi, pl, _))| pi == i && pl == l) + .map(|(idx, _)| self.ptr_offset + idx) + .collect() + } +} + +/// Result of reducing MinimumExternalMacroDataCompression to ILP. +#[derive(Debug, Clone)] +pub struct ReductionEMDCToILP { + target: ILP, + /// Variable layout for solution extraction. + layout: VarLayout, + /// The source string (needed for extract_solution). + source_string: Vec, + /// Alphabet size. + alphabet_size: usize, +} + +impl ReductionResult for ReductionEMDCToILP { + type Source = MinimumExternalMacroDataCompression; + type Target = ILP; + + fn target_problem(&self) -> &ILP { + &self.target + } + + fn extract_solution(&self, target_solution: &[usize]) -> Vec { + let n = self.layout.n; + let k = self.alphabet_size; + let empty = k; // empty marker + + // Build D-slots + let mut d_slots = vec![empty; n]; + for j in 0..n { + if target_solution[self.layout.d_used_var(j)] == 1 { + for c in 0..k { + if target_solution[self.layout.d_var(j, c)] == 1 { + d_slots[j] = c; + break; + } + } + } + } + + // Walk through active segments to build C-slots + let mut c_slots = vec![empty; n]; + let mut c_pos = 0; + let mut pos = 0; + while pos < n { + // Check if lit[pos] = 1 + if target_solution[self.layout.lit_var(pos)] == 1 { + // Literal at position pos + c_slots[c_pos] = self.source_string[pos]; + c_pos += 1; + pos += 1; + continue; + } + // Check for an active pointer starting at pos + let mut found = false; + for l in 1..=(n - pos) { + for d_start in 0..=(n - l) { + let var_idx = self.layout.ptr_var(pos, l, d_start); + if target_solution[var_idx] == 1 { + // Encode pointer (d_start, l) as EMDC pointer index + let ptr_idx = encode_pointer(n, d_start, l); + c_slots[c_pos] = k + 1 + ptr_idx; + c_pos += 1; + pos += l; + found = true; + break; + } + } + if found 
{ + break; + } + } + if !found { + // Should not happen with a valid ILP solution + pos += 1; + } + } + + // Combine D-slots and C-slots + let mut config = d_slots; + config.extend(c_slots); + config + } +} + +/// Encode a pointer (start, len) into the EMDC pointer index. +/// Pointers enumerate: (0,1),(0,2),...,(0,n), (1,1),(1,2),...,(1,n-1), ... +fn encode_pointer(n: usize, start: usize, len: usize) -> usize { + let mut idx = 0; + for s in 0..start { + idx += n - s; + } + idx + len - 1 +} + +#[reduction( + overhead = { + num_vars = "string_length * alphabet_size + 2 * string_length + string_length ^ 3", + num_constraints = "string_length + string_length * alphabet_size + string_length + string_length + 1 + string_length ^ 3 * string_length", + } +)] +impl ReduceTo> for MinimumExternalMacroDataCompression { + type Result = ReductionEMDCToILP; + + fn reduce_to(&self) -> Self::Result { + let n = self.string_length(); + let k = self.alphabet_size(); + let h = self.pointer_cost(); + let s = self.string(); + + // Handle empty string + if n == 0 { + let layout = VarLayout::new(0, k); + let target = ILP::new(0, vec![], vec![], ObjectiveSense::Minimize); + return ReductionEMDCToILP { + target, + layout, + source_string: vec![], + alphabet_size: k, + }; + } + + let layout = VarLayout::new(n, k); + let num_vars = layout.total_vars; + let mut constraints = Vec::new(); + + // 1. Dictionary one-hot: for each j, sum_c d[j][c] <= 1 + for j in 0..n { + let terms: Vec<(usize, f64)> = (0..k).map(|c| (layout.d_var(j, c), 1.0)).collect(); + constraints.push(LinearConstraint::le(terms, 1.0)); + } + + // 2. Dictionary linking: d[j][c] <= d_used[j] for all j, c + for j in 0..n { + for c in 0..k { + constraints.push(LinearConstraint::le( + vec![(layout.d_var(j, c), 1.0), (layout.d_used_var(j), -1.0)], + 0.0, + )); + } + } + + // 3. 
Dictionary contiguous: d_used[j+1] <= d_used[j] for j=0..n-2 + for j in 0..n.saturating_sub(1) { + constraints.push(LinearConstraint::le( + vec![ + (layout.d_used_var(j + 1), 1.0), + (layout.d_used_var(j), -1.0), + ], + 0.0, + )); + } + + // 4. Flow conservation on DAG: positions 0..n are nodes. + // A segment (i, l) contributes to outgoing flow at node i and incoming flow at node i+l. + // For segment (i, l): + // - if l == 1: flow = lit[i] + sum_{d_start} ptr[i][1][d_start] + // - if l >= 2: flow = sum_{d_start} ptr[i][l][d_start] + // + // Flow constraints: + // At node 0: sum of outgoing segments = 1 + // At node i (1..n-1): sum of incoming = sum of outgoing + // At node n: sum of incoming = 1 + + // Helper: get all terms for "segment flow" at (i, l) + // Returns the variable indices with coefficient 1.0 + let segment_terms = |i: usize, l: usize| -> Vec<(usize, f64)> { + let mut terms = Vec::new(); + if l == 1 { + terms.push((layout.lit_var(i), 1.0)); + } + for &var in &layout.ptr_vars_for_segment(i, l) { + terms.push((var, 1.0)); + } + terms + }; + + // For each node, compute outgoing and incoming segment terms + for node in 0..=n { + let mut all_terms: Vec<(usize, f64)> = Vec::new(); + + if node == 0 { + // sum of outgoing(0, l) = 1 + for l in 1..=n { + all_terms.extend(segment_terms(0, l)); + } + constraints.push(LinearConstraint::eq(all_terms, 1.0)); + } else if node == n { + // sum of incoming(n) = 1 + // incoming at node n: segments (j, l) where j + l = n + for j in 0..n { + let l = n - j; + all_terms.extend(segment_terms(j, l)); + } + constraints.push(LinearConstraint::eq(all_terms, 1.0)); + } else { + // node 1..n-1: incoming = outgoing + // incoming: segments (j, l) where j + l = node + let mut incoming = Vec::new(); + for j in 0..node { + let l = node - j; + incoming.extend(segment_terms(j, l)); + } + // outgoing: segments (node, l) for valid l + let mut outgoing = Vec::new(); + for l in 1..=(n - node) { + outgoing.extend(segment_terms(node, l)); + } 
+ // incoming - outgoing = 0 + for (var, coef) in incoming { + all_terms.push((var, coef)); + } + for (var, coef) in outgoing { + all_terms.push((var, -coef)); + } + constraints.push(LinearConstraint::eq(all_terms, 0.0)); + } + } + + // 5. Pointer matching: ptr[i][l][d_start] <= d[d_start+offset][s[i+offset]] + // for all offset=0..l-1 + for (idx, &(i, l, d_start)) in layout.ptr_triples.iter().enumerate() { + let ptr_idx = layout.ptr_offset + idx; + for offset in 0..l { + let symbol = s[i + offset]; + // ptr[i][l][d_start] <= d[d_start + offset][symbol] + constraints.push(LinearConstraint::le( + vec![ + (ptr_idx, 1.0), + (layout.d_var(d_start + offset, symbol), -1.0), + ], + 0.0, + )); + } + } + + // 6. Literal matching: lit[i] can only be active if position i exists + // (this is always true for i < n, so no constraint needed). + // But we do need: if lit[i] = 1, the literal is s[i], which is automatic + // in the extract_solution. No additional constraint needed because the + // objective already penalizes literals. 
+ + // Objective: minimize sum d_used[j] + sum lit[i] + h * sum ptr[i][l][d_start] + let mut objective: Vec<(usize, f64)> = Vec::new(); + for j in 0..n { + objective.push((layout.d_used_var(j), 1.0)); + } + for i in 0..n { + objective.push((layout.lit_var(i), 1.0)); + } + for (idx, _) in layout.ptr_triples.iter().enumerate() { + objective.push((layout.ptr_offset + idx, h as f64)); + } + + let target = ILP::new(num_vars, constraints, objective, ObjectiveSense::Minimize); + + ReductionEMDCToILP { + target, + layout, + source_string: s.to_vec(), + alphabet_size: k, + } + } +} + +#[cfg(feature = "example-db")] +pub(crate) fn canonical_rule_example_specs() -> Vec { + use crate::export::SolutionPair; + + // s = "ab" (len 2), alphabet {a,b} (size 2), h=2 + // Optimal: uncompressed, D="" C="ab", cost = 0+2+0 = 2 + // Config: D-slots=[2,2], C-slots=[0,1] + // ILP target_config: all d and d_used = 0, lit[0]=1, lit[1]=1, all ptr = 0 + vec![crate::example_db::specs::RuleExampleSpec { + id: "minimumexternalmacrodatacompression_to_ilp", + build: || { + let source = MinimumExternalMacroDataCompression::new(2, vec![0, 1], 2); + let reduction = ReduceTo::>::reduce_to(&source); + let layout = &reduction.layout; + let n = 2; + let k = 2; + + // Build target config: all zeros, then set lit[0]=1, lit[1]=1 + let mut target_config = vec![0usize; layout.total_vars]; + target_config[layout.lit_var(0)] = 1; + target_config[layout.lit_var(1)] = 1; + + // Verify this is correct + let source_config = reduction.extract_solution(&target_config); + debug_assert_eq!(source_config[..n], [k, k]); // D empty + debug_assert_eq!(source_config[n..], [0, 1]); // C = "ab" + + crate::example_db::specs::rule_example_with_witness::<_, ILP>( + source, + SolutionPair { + source_config, + target_config, + }, + ) + }, + }] +} + +#[cfg(test)] +#[path = "../unit_tests/rules/minimumexternalmacrodatacompression_ilp.rs"] +mod tests; diff --git a/src/rules/mod.rs b/src/rules/mod.rs index b7afb06e..abb9e1fb 100644 --- 
a/src/rules/mod.rs +++ b/src/rules/mod.rs @@ -154,6 +154,8 @@ pub(crate) mod minimumcutintoboundedsets_ilp; #[cfg(feature = "ilp-solver")] pub(crate) mod minimumdominatingset_ilp; #[cfg(feature = "ilp-solver")] +pub(crate) mod minimumexternalmacrodatacompression_ilp; +#[cfg(feature = "ilp-solver")] pub(crate) mod minimumfeedbackarcset_ilp; #[cfg(feature = "ilp-solver")] pub(crate) mod minimumfeedbackvertexset_ilp; @@ -346,6 +348,7 @@ pub(crate) fn canonical_rule_example_specs() -> Vec::NAME, + "MinimumExternalMacroDataCompression" + ); + assert_eq!( + ::variant(), + vec![] + ); + // dims: 6 D-slots (domain 4) + 6 C-slots (domain 4 + 6*7/2 = 25) + let dims = problem.dims(); + assert_eq!(dims.len(), 12); + assert_eq!(dims[0], 4); // alphabet_size + 1 + assert_eq!(dims[6], 25); // alphabet_size + 1 + 6*7/2 +} + +#[test] +fn test_minimum_external_macro_data_compression_evaluate_uncompressed() { + // alphabet {a, b}, s = "ab", h = 2 + let problem = MinimumExternalMacroDataCompression::new(2, vec![0, 1], 2); + // Uncompressed: D = "" (empty, empty), C = "ab" + // D-slots: [2, 2] (both empty) + // C-slots: [0, 1] (literal a, literal b) + // Cost = 0 + 2 + 0 = 2 + assert_eq!(problem.evaluate(&[2, 2, 0, 1]), Min(Some(2))); +} + +#[test] +fn test_minimum_external_macro_data_compression_evaluate_with_pointer() { + // alphabet {a, b}, s = "abab", h = 2 + let problem = MinimumExternalMacroDataCompression::new(2, vec![0, 1, 0, 1], 2); + // D = "ab" (len 2), C = "ptr(0,2) ptr(0,2)" + // D-slots: [0, 1, 2, 2] (a, b, empty, empty) + // C-slots: pointer (0,2) = index 1 in pointer enumeration: + // ptr(0,1)=0, ptr(0,2)=1, ptr(0,3)=2, ptr(0,4)=3, ptr(1,1)=4, ... + // So ptr(0,2) has index 1, encoded as alphabet_size+1+1 = 2+1+1 = 4 + // C-slots: [4, 4, 2, 2] (two pointers, two empty) + // This decodes: D[0..2] = "ab", D[0..2] = "ab" => "abab" = s. Valid! 
+ // Cost = 2 + 2 + (2-1)*2 = 6 + assert_eq!(problem.evaluate(&[0, 1, 2, 2, 4, 4, 2, 2]), Min(Some(6))); +} + +#[test] +fn test_minimum_external_macro_data_compression_evaluate_invalid_decode() { + // alphabet {a, b}, s = "ab", h = 2 + let problem = MinimumExternalMacroDataCompression::new(2, vec![0, 1], 2); + // C = "ba" doesn't match s = "ab" + assert_eq!(problem.evaluate(&[2, 2, 1, 0]), Min(None)); +} + +#[test] +fn test_minimum_external_macro_data_compression_evaluate_wrong_length() { + let problem = MinimumExternalMacroDataCompression::new(2, vec![0, 1], 2); + assert_eq!(problem.evaluate(&[0, 1, 0]), Min(None)); + assert_eq!(problem.evaluate(&[0, 1, 0, 1, 0]), Min(None)); +} + +#[test] +fn test_minimum_external_macro_data_compression_evaluate_interleaved_empty() { + // D-slots have interleaved empty + let problem = MinimumExternalMacroDataCompression::new(2, vec![0, 1], 2); + // D-slots: [2, 0] (empty then non-empty -> invalid) + assert_eq!(problem.evaluate(&[2, 0, 0, 1]), Min(None)); +} + +#[test] +fn test_minimum_external_macro_data_compression_evaluate_pointer_out_of_range() { + // alphabet {a, b}, s = "ab", h = 2 + let problem = MinimumExternalMacroDataCompression::new(2, vec![0, 1], 2); + // D = "a" (len 1), C = "ptr(0,2)" which references D[0..2] but D only has 1 element + // ptr(0,2) index = 1, encoded as 2+1+1 = 4 + assert_eq!(problem.evaluate(&[0, 2, 4, 2]), Min(None)); +} + +#[test] +fn test_minimum_external_macro_data_compression_empty_string() { + let problem = MinimumExternalMacroDataCompression::new(2, vec![], 2); + assert_eq!(problem.dims(), Vec::::new()); + assert_eq!(problem.evaluate(&[]), Min(Some(0))); +} + +#[test] +fn test_minimum_external_macro_data_compression_brute_force() { + // alphabet {a, b}, s = "ab", h = 2 + // Search space: 3^2 * 6^2 = 324 (feasible for brute force) + let problem = MinimumExternalMacroDataCompression::new(2, vec![0, 1], 2); + let solver = BruteForce::new(); + let witness = solver + .find_witness(&problem) + 
.expect("should find a solution"); + let val = problem.evaluate(&witness); + assert!(val.0.is_some()); + // Optimal is uncompressed: cost = 2 + assert_eq!(val.0.unwrap(), 2); +} + +#[test] +fn test_minimum_external_macro_data_compression_solve_aggregate() { + use crate::solvers::Solver; + let problem = MinimumExternalMacroDataCompression::new(2, vec![0, 1], 2); + let solver = BruteForce::new(); + let val = solver.solve(&problem); + assert_eq!(val, Min(Some(2))); +} + +#[test] +fn test_minimum_external_macro_data_compression_serialization() { + let problem = MinimumExternalMacroDataCompression::new(3, vec![0, 1, 2], 2); + let json = serde_json::to_value(&problem).unwrap(); + let restored: MinimumExternalMacroDataCompression = serde_json::from_value(json).unwrap(); + assert_eq!(restored.alphabet_size(), problem.alphabet_size()); + assert_eq!(restored.string(), problem.string()); + assert_eq!(restored.pointer_cost(), problem.pointer_cost()); +} + +#[test] +fn test_minimum_external_macro_data_compression_paper_example() { + // Paper example: Sigma = {a,b,c,d,e,f} (size 6), s = "abcdefabcdefabcdef" (len 18), h = 2 + // Optimal: D = "abcdef" (6), C = "ptr(0,6) ptr(0,6) ptr(0,6)" (3, 3 ptrs) + // Cost = 6 + 3 + 1*3 = 12 + // + // However, this has 2*18 = 36 variables with large domains, way too big for brute force. + // Instead, verify the optimal config evaluates correctly on a smaller instance. + // + // Use s = "abab" (len 4), alphabet {a,b} (size 2), h = 1 + // D = "ab" (2), C = "ptr(0,2) ptr(0,2)" (2, 2 ptrs) + // Cost = 2 + 2 + 0*2 = 4. Uncompressed = 4. Equal! + // + // Better: verify the issue example manually on the actual type. 
+ // Construct s = [0,1,2,3,4,5, 0,1,2,3,4,5, 0,1,2,3,4,5], alphabet_size = 6, h = 2 + let problem = MinimumExternalMacroDataCompression::new( + 6, + vec![0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5], + 2, + ); + assert_eq!(problem.string_length(), 18); + + // Construct the optimal config manually: + // D-slots: [0,1,2,3,4,5, empty*12] = [0,1,2,3,4,5, 6,6,6,6,6,6,6,6,6,6,6,6] + let mut config = vec![0, 1, 2, 3, 4, 5]; + config.extend(vec![6; 12]); // 12 empty D-slots + + // C-slots: ptr(0,6) ptr(0,6) ptr(0,6) empty*15 + // ptr(0,6): start=0, len=6. In pointer enumeration: + // start=0: (0,1)->0, (0,2)->1, (0,3)->2, (0,4)->3, (0,5)->4, (0,6)->5, ... + // (0,7)->6, ..., (0,18)->17 + // So ptr(0,6) has index 5, encoded as alphabet_size+1+5 = 6+1+5 = 12 + config.extend(vec![12, 12, 12]); // 3 pointers + config.extend(vec![6; 15]); // 15 empty C-slots + + let val = problem.evaluate(&config); + assert_eq!(val, Min(Some(12))); // 6 + 3 + 1*3 = 12 +} + +#[test] +fn test_minimum_external_macro_data_compression_find_all_witnesses() { + // alphabet {a}, s = "a", h = 2 + // 2*1 = 2 variables. D-domain = 2, C-domain = 2 + 1 = 3. 
Total = 2*3 = 6 + let problem = MinimumExternalMacroDataCompression::new(1, vec![0], 2); + let solver = BruteForce::new(); + let solutions = solver.find_all_witnesses(&problem); + // There should be at least one witness: uncompressed [1, 0] (D=empty, C=a) + assert!(solutions.contains(&vec![1, 0])); + for sol in &solutions { + let val = problem.evaluate(sol); + assert!(val.0.is_some()); + } +} diff --git a/src/unit_tests/rules/minimumexternalmacrodatacompression_ilp.rs b/src/unit_tests/rules/minimumexternalmacrodatacompression_ilp.rs new file mode 100644 index 00000000..48720b06 --- /dev/null +++ b/src/unit_tests/rules/minimumexternalmacrodatacompression_ilp.rs @@ -0,0 +1,146 @@ +use super::*; +use crate::models::algebraic::{ObjectiveSense, ILP}; +use crate::solvers::ILPSolver; +use crate::traits::Problem; +use crate::types::Min; + +#[test] +fn test_emdc_to_ilp_closed_loop() { + // s = "ab" (len 2), alphabet {a,b}, h=2 + // Optimal: uncompressed, cost = 2 + let problem = MinimumExternalMacroDataCompression::new(2, vec![0, 1], 2); + let reduction = ReduceTo::>::reduce_to(&problem); + + let ilp_solution = ILPSolver::new() + .solve(reduction.target_problem()) + .expect("ILP should be solvable"); + let extracted = reduction.extract_solution(&ilp_solution); + let value = problem.evaluate(&extracted); + assert!(value.is_valid(), "Extracted solution should be valid"); + assert_eq!(value, Min(Some(2))); +} + +#[test] +fn test_emdc_to_ilp_compression_wins() { + // alphabet {a,b,c,d,e,f} (6), s="abcdefabcdefabcdef" (18), h=2 + // Optimal: D="abcdef"(6), C=3 pointers of (0,6), cost=6+3+1*3=12 + // (pointer cost h=2, so (h-1)*3 = 3, total = 6+3+3 = 12) + // Uncompressed: 18 + let s: Vec = (0..6).cycle().take(18).collect(); + let problem = MinimumExternalMacroDataCompression::new(6, s, 2); + let reduction = ReduceTo::>::reduce_to(&problem); + + let ilp_solution = ILPSolver::new() + .solve(reduction.target_problem()) + .expect("ILP should be solvable"); + let extracted = 
reduction.extract_solution(&ilp_solution); + let value = problem.evaluate(&extracted); + assert!(value.is_valid(), "Extracted solution should be valid"); + assert_eq!(value, Min(Some(12))); +} + +#[test] +fn test_emdc_to_ilp_structure() { + // s = "ab" (len 2), alphabet {a,b} (k=2), h=2 + let problem = MinimumExternalMacroDataCompression::new(2, vec![0, 1], 2); + let reduction = ReduceTo::>::reduce_to(&problem); + let ilp = reduction.target_problem(); + + let _n = 2; + let _k = 2; + // d[j][c]: 2*2 = 4 + // d_used[j]: 2 + // lit[i]: 2 + // ptr triples: (0,1,0),(0,1,1),(0,2,0),(1,1,0),(1,1,1) = 5 + // Total = 4 + 2 + 2 + 5 = 13 + assert_eq!(ilp.num_vars, 13); + assert_eq!(ilp.sense, ObjectiveSense::Minimize); + + // Constraints: + // one-hot: n = 2 + // linking: n*k = 4 + // contiguous: n-1 = 1 + // flow: n+1 = 3 + // ptr matching: each ptr triple's matching constraints + // (0,1,0): 1, (0,1,1): 1, (0,2,0): 2, (1,1,0): 1, (1,1,1): 1 = 6 + // Total = 2 + 4 + 1 + 3 + 6 = 16 + assert_eq!(ilp.constraints.len(), 16); +} + +#[test] +fn test_emdc_to_ilp_empty() { + // Empty string: cost should be 0 + let problem = MinimumExternalMacroDataCompression::new(2, vec![], 1); + let reduction = ReduceTo::>::reduce_to(&problem); + let ilp = reduction.target_problem(); + + assert_eq!(ilp.num_vars, 0); + assert!(ilp.constraints.is_empty()); + + // For empty ILP, the solution is empty + let extracted = reduction.extract_solution(&[]); + let value = problem.evaluate(&extracted); + assert_eq!(value, Min(Some(0))); +} + +#[test] +fn test_emdc_to_ilp_bf_vs_ilp() { + // Small instance: s="ab", alphabet {a,b}, h=2 + let problem = MinimumExternalMacroDataCompression::new(2, vec![0, 1], 2); + let reduction = ReduceTo::>::reduce_to(&problem); + crate::rules::test_helpers::assert_bf_vs_ilp(&problem, &reduction); +} + +#[test] +fn test_emdc_to_ilp_single_char() { + // s = "a" (len 1), alphabet {a} (k=1), h=1 + // Uncompressed: cost = 0+1+0 = 1. 
With D="a"(1), C=ptr(0,1)(1, 1 ptr): cost = 1+1+0 = 2. + // So uncompressed is optimal. + let problem = MinimumExternalMacroDataCompression::new(1, vec![0], 1); + let reduction = ReduceTo::>::reduce_to(&problem); + + let ilp_solution = ILPSolver::new() + .solve(reduction.target_problem()) + .expect("ILP should be solvable"); + let extracted = reduction.extract_solution(&ilp_solution); + let value = problem.evaluate(&extracted); + assert!(value.is_valid()); + assert_eq!(value, Min(Some(1))); +} + +#[test] +fn test_emdc_to_ilp_repeated_string() { + // s = "aaa" (len 3), alphabet {a} (k=1), h=1 + // Uncompressed: cost = 3. D="a"(1), C=ptr(0,1)*3: cost = 1+3+0=4. + // D="aaa"(3), C=ptr(0,3): cost = 3+1+0 = 4. + // D="aa"(2), C=ptr(0,1) ptr(0,2): cost = 2+2+0 = 4. + // Uncompressed is best at 3. + let problem = MinimumExternalMacroDataCompression::new(1, vec![0, 0, 0], 1); + let reduction = ReduceTo::>::reduce_to(&problem); + + let ilp_solution = ILPSolver::new() + .solve(reduction.target_problem()) + .expect("ILP should be solvable"); + let extracted = reduction.extract_solution(&ilp_solution); + let value = problem.evaluate(&extracted); + assert!(value.is_valid()); + assert_eq!(value, Min(Some(3))); +} + +#[cfg(feature = "example-db")] +#[test] +fn test_emdc_to_ilp_canonical_example_spec() { + let spec = canonical_rule_example_specs() + .into_iter() + .find(|spec| spec.id == "minimumexternalmacrodatacompression_to_ilp") + .expect("missing canonical EMDC -> ILP example spec"); + let example = (spec.build)(); + + assert_eq!( + example.source.problem, + "MinimumExternalMacroDataCompression" + ); + assert_eq!(example.target.problem, "ILP"); + assert_eq!(example.source.instance["alphabet_size"], 2); + assert_eq!(example.source.instance["string"], serde_json::json!([0, 1])); +}