Skip to content

Commit a70b132

Browse files
isPANN and claude authored
Fix #441: Add MinimumExternalMacroDataCompression model (#819)
* feat: add MinimumExternalMacroDataCompression model (#441) Implement the Minimum External Macro Data Compression problem (Garey & Johnson SR22). Given an alphabet, string, and pointer cost, find a dictionary D and compressed string C minimizing total compression cost. - Model with Min<usize> value type, pointer-free D restriction - CLI create support with --string, --pointer-cost flags - 13 unit tests covering creation, evaluation, brute force, serialization - Canonical example in example-db - Paper entry with problem definition and visualization - Bibliography entries for Storer (1977), Storer & Szymanski (1982), Charikar et al. (2005) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Add MinimumExternalMacroDataCompression → ILP reduction and update canonical example - Implement ILP reduction using segment-flow formulation with dictionary one-hot, contiguity, flow conservation, and pointer matching constraints - Update canonical example to issue #441's compression-demonstrating instance (s="abcdefabcdefabcdef", h=2, optimal=12 via dictionary+pointers) - Update paper figure to show compression with D="abcdef" and 3 pointers - 7 ILP reduction tests + 13 model tests all pass Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Add paper reduction-rule entry for EMDC → ILP Segment-flow ILP formulation with dictionary one-hot, contiguity, flow conservation, and pointer matching constraints. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 81d92e5 commit a70b132

File tree

11 files changed

+1201
-7
lines changed

11 files changed

+1201
-7
lines changed

docs/paper/reductions.typ

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@
196196
"StaffScheduling": [Staff Scheduling],
197197
"SteinerTree": [Steiner Tree],
198198
"SteinerTreeInGraphs": [Steiner Tree in Graphs],
199+
"MinimumExternalMacroDataCompression": [Minimum External Macro Data Compression],
199200
"StringToStringCorrection": [String-to-String Correction],
200201
"StrongConnectivityAugmentation": [Strong Connectivity Augmentation],
201202
"SubgraphIsomorphism": [Subgraph Isomorphism],
@@ -4983,6 +4984,88 @@ A classical NP-complete problem from Garey and Johnson @garey1979[Ch.~3, p.~76],
49834984
]
49844985
}
49854986

4987+
#{
4988+
let x = load-model-example("MinimumExternalMacroDataCompression")
4989+
let alpha-size = x.instance.alphabet_size
4990+
let s = x.instance.string
4991+
let n = s.len()
4992+
let h = x.instance.pointer_cost
4993+
let alpha-map = range(alpha-size).map(i => str.from-unicode(97 + i))
4994+
let s-str = s.map(c => alpha-map.at(c)).join("")
4995+
let opt-val = metric-value(x.optimal_value)
4996+
[
4997+
#problem-def("MinimumExternalMacroDataCompression")[
4998+
Given a finite alphabet $Sigma$ of size $k$, a string $s in Sigma^*$ of length $n$, and a pointer cost $h in ZZ^+$, find a dictionary string $D in Sigma^*$ and a compressed string $C in (Sigma union {p_1, dots, p_n})^*$, where each $p_i$ is a pointer referencing a contiguous substring of $D$, such that $s$ can be obtained from $C$ by replacing every pointer with its referenced substring, minimizing the total cost $|D| + |C| + (h - 1) times$ (number of pointer occurrences in $C$).
4999+
][
5000+
A classical NP-hard data compression problem, listed as SR22 in Garey and Johnson @garey1979. The macro model of data compression was introduced by #cite(<storer1977>, form: "prose"), who proved NP-completeness via transformation from Vertex Cover. #cite(<storer1982>, form: "prose") provided a comprehensive analysis of the macro compression framework, showing that NP-completeness persists even when $h$ is any fixed integer $gt.eq 2$, when the alphabet has $gt.eq 3$ symbols, and when $D$ contains no pointers (the "external" variant). The LZ-family of practical compression algorithms (LZ77, LZSS, LZ78) are restricted forms of this general macro model. The related Smallest Grammar Problem is APX-hard @charikar2005.#footnote[No algorithm improving on brute-force enumeration is known for optimal external macro compression.]
5001+
5002+
*Example.* Let $Sigma = {#alpha-map.join(", ")}$ and $s = #s-str$ (length #n) with pointer cost $h = #h$.
5003+
5004+
#pred-commands(
5005+
"pred create --example MinimumExternalMacroDataCompression -o min-emdc.json",
5006+
"pred solve min-emdc.json",
5007+
"pred evaluate min-emdc.json --config " + x.optimal_config.map(str).join(","),
5008+
)
5009+
5010+
#figure({
5011+
let blue = graph-colors.at(0)
5012+
let green = graph-colors.at(1)
5013+
let cell(ch, highlight: false, ptr: false) = {
5014+
let fill = if ptr { green.transparentize(70%) } else if highlight { blue.transparentize(70%) } else { white }
5015+
box(width: 0.5cm, height: 0.55cm, fill: fill, stroke: 0.5pt + luma(120),
5016+
align(center + horizon, text(8pt, weight: "bold", ch)))
5017+
}
5018+
let ptr-cell(label) = {
5019+
box(width: 1.5cm, height: 0.55cm, fill: green.transparentize(70%), stroke: 0.5pt + luma(120),
5020+
align(center + horizon, text(7pt, weight: "bold", label)))
5021+
}
5022+
// D = first 6 symbols of s (one copy of the pattern)
5023+
let d-len = alpha-size
5024+
let d-syms = s.slice(0, d-len)
5025+
// C = 3 pointers, each referencing D[0..6]
5026+
let num-ptrs = calc.div-euclid(n, d-len)
5027+
align(center, stack(dir: ttb, spacing: 0.5cm,
5028+
// Source string
5029+
stack(dir: ltr, spacing: 0pt,
5030+
box(width: 1.5cm, height: 0.5cm, align(right + horizon, text(8pt)[$s: quad$])),
5031+
..s.map(c => cell(alpha-map.at(c))),
5032+
),
5033+
// Dictionary D
5034+
stack(dir: ltr, spacing: 0pt,
5035+
box(width: 1.5cm, height: 0.5cm, align(right + horizon, text(8pt)[$D: quad$])),
5036+
..d-syms.map(c => cell(alpha-map.at(c), highlight: true)),
5037+
),
5038+
// Compressed string C = 3 pointers
5039+
stack(dir: ltr, spacing: 0pt,
5040+
box(width: 1.5cm, height: 0.5cm, align(right + horizon, text(8pt)[$C: quad$])),
5041+
..range(num-ptrs).map(_ => ptr-cell[$arrow.r D[0..#d-len]$]),
5042+
),
5043+
))
5044+
},
5045+
caption: [Minimum External Macro Data Compression: with $s = #s-str$ (length #n) and pointer cost $h = #h$, the optimal compression stores $D = #s-str.slice(0, alpha-size)$ (#alpha-size symbols) and uses #calc.div-euclid(n, alpha-size) pointers in $C$, achieving cost $#alpha-size + #calc.div-euclid(n, alpha-size) + (#h - 1) times #calc.div-euclid(n, alpha-size) = #opt-val$ vs.~uncompressed cost #n.],
5046+
) <fig:emdc>
5047+
5048+
This instance has a repeating pattern of length #alpha-size, allowing the dictionary $D$ to store one copy and the compressed string $C$ to reference it via pointers. Each pointer costs $h = #h$ (the pointer symbol itself plus $h - 1 = #(h - 1)$ extra), so the total cost is $|D| + |C| + (h - 1) times |"pointers"| = #alpha-size + #calc.div-euclid(n, alpha-size) + #(h - 1) times #calc.div-euclid(n, alpha-size) = #opt-val$, saving $#(n - int(opt-val))$ over the uncompressed cost of #n.
5049+
]
5050+
]
5051+
}
5052+
5053+
#reduction-rule("MinimumExternalMacroDataCompression", "ILP")[
5054+
The compression problem decomposes into a dictionary selection (which symbols appear at which positions in $D$) and a string partitioning (which segments of $s$ are literals vs.~pointers). Both are naturally expressed with binary variables and linear constraints. The partition structure is modeled as a flow on a DAG whose nodes are string positions and whose arcs are candidate segments.
5055+
][
5056+
_Construction._ For alphabet $Sigma$ of size $k$, string $s$ of length $n$, and pointer cost $h$:
5057+
5058+
_Variables:_ (1) Binary $d_(j,c) in {0,1}$ for each dictionary position $j in {0, dots, n-1}$ and symbol $c in Sigma$: $d_(j,c) = 1$ iff $D[j] = c$. (2) Binary $u_j in {0,1}$: $u_j = 1$ iff dictionary position $j$ is used. (3) Binary $ell_i in {0,1}$ for each string position $i$: $ell_i = 1$ iff position $i$ is covered by a literal. (4) Binary $p_(i,lambda,delta) in {0,1}$ for each valid triple $(i, lambda, delta)$ with $i + lambda <= n$ and $delta + lambda <= n$: $p_(i,lambda,delta) = 1$ iff positions $[i, i + lambda)$ are covered by a pointer referencing $D[delta .. delta + lambda)$.
5059+
5060+
_Constraints:_ (1) Dictionary one-hot: $sum_(c in Sigma) d_(j,c) <= 1$ for all $j$. (2) Linking: $d_(j,c) <= u_j$ for all $j, c$. (3) Contiguity: $u_(j+1) <= u_j$ for all $j < n - 1$. (4) Partition flow: the segments form a partition of ${0, dots, n-1}$ via flow conservation on nodes $0, dots, n$. (5) Pointer matching: $p_(i,lambda,delta) <= d_(delta+r, s[i+r])$ for all offsets $r in {0, dots, lambda - 1}$.
5061+
5062+
_Objective:_ Minimize $sum_j u_j + sum_i ell_i + h sum_(i,lambda,delta) p_(i,lambda,delta)$.
5063+
5064+
_Correctness._ ($arrow.r.double$) An optimal $(D, C)$ pair determines a feasible ILP assignment: set $d_(j,c) = 1$ for each symbol in $D$, $u_j = 1$ for used positions, and activate the corresponding literal or pointer variables for each $C$-slot. The partition flow is satisfied by construction. ($arrow.l.double$) Any feasible ILP solution defines a valid dictionary (one-hot + contiguity) and a valid partition of $s$ into literal and pointer segments (flow conservation + matching), with cost equal to the objective.
5065+
5066+
_Solution extraction._ Read $D$ from the $d_(j,c)$ indicators. Walk through the active segments (via $ell_i$ and $p_(i,lambda,delta)$) to reconstruct $C$.
5067+
]
5068+
49865069
#{
49875070
let x = load-model-example("MinimumFeedbackArcSet")
49885071
let nv = x.instance.graph.num_vertices

docs/paper/references.bib

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1476,6 +1476,16 @@ @article{edmondsjohnson1973
14761476
year = {1973}
14771477
}
14781478

1479+
@article{charikar2005,
1480+
author = {Moses Charikar and Eric Lehman and Ding Liu and Rina Panigrahy and Manoj Prabhakaran and Amit Sahai and Abhi Shelat},
1481+
title = {The Smallest Grammar Problem},
1482+
journal = {IEEE Transactions on Information Theory},
1483+
volume = {51},
1484+
number = {7},
1485+
pages = {2554--2576},
1486+
year = {2005}
1487+
}
1488+
14791489
@article{lenstra1977,
14801490
author = {J. K. Lenstra and A. H. G. Rinnooy Kan and P. Brucker},
14811491
title = {Complexity of Machine Scheduling Problems},
@@ -1511,6 +1521,24 @@ @techreport{plaisted1976
15111521
year = {1976}
15121522
}
15131523

1524+
@techreport{storer1977,
1525+
author = {James A. Storer},
1526+
title = {NP-Completeness Results Concerning Data Compression},
1527+
institution = {Princeton University, Department of Electrical Engineering and Computer Science},
1528+
number = {234},
1529+
year = {1977}
1530+
}
1531+
1532+
@article{storer1982,
1533+
author = {James A. Storer and Thomas G. Szymanski},
1534+
title = {Data Compression via Textual Substitution},
1535+
journal = {Journal of the ACM},
1536+
volume = {29},
1537+
number = {4},
1538+
pages = {928--951},
1539+
year = {1982}
1540+
}
1541+
15141542
@article{haase2016,
15151543
author = {Haase, Christoph and Kiefer, Stefan},
15161544
title = {The Complexity of the {K}th Largest Subset Problem and Related Problems},

problemreductions-cli/src/cli.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,7 @@ Flags by problem type:
314314
SequencingToMinimizeMaximumCumulativeCost --costs [--precedence-pairs]
315315
SequencingToMinimizeWeightedCompletionTime --lengths, --weights [--precedence-pairs]
316316
SequencingToMinimizeWeightedTardiness --sizes, --weights, --deadlines, --bound
317+
MinimumExternalMacroDataCompression --string, --pointer-cost [--alphabet-size]
317318
SCS --strings [--alphabet-size]
318319
StringToStringCorrection --source-string, --target-string, --bound [--alphabet-size]
319320
D2CIF --arcs, --capacities, --source-1, --sink-1, --source-2, --sink-2, --requirement-1, --requirement-2
@@ -760,6 +761,9 @@ pub struct CreateArgs {
760761
/// Target string for StringToStringCorrection (comma-separated symbol indices, e.g., "0,1,3,2")
761762
#[arg(long)]
762763
pub target_string: Option<String>,
764+
/// Pointer cost for MinimumExternalMacroDataCompression (positive integer)
765+
#[arg(long)]
766+
pub pointer_cost: Option<usize>,
763767
/// Expression tree for IntegerExpressionMembership (JSON, e.g., '{"Sum":[{"Atom":1},{"Atom":2}]}')
764768
#[arg(long)]
765769
pub expression: Option<String>,

problemreductions-cli/src/commands/create.rs

Lines changed: 62 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,10 @@ use problemreductions::models::misc::{
2525
ConjunctiveBooleanQuery, ConsistencyOfDatabaseFrequencyTables, EnsembleComputation,
2626
ExpectedRetrievalCost, FlowShopScheduling, FrequencyTable, GroupingBySwapping, IntExpr,
2727
IntegerExpressionMembership, JobShopScheduling, KnownValue, KthLargestMTuple,
28-
LongestCommonSubsequence, MinimumTardinessSequencing, MultiprocessorScheduling, PaintShop,
29-
PartiallyOrderedKnapsack, ProductionPlanning, QueryArg, RectilinearPictureCompression,
30-
RegisterSufficiency, ResourceConstrainedScheduling, SchedulingToMinimizeWeightedCompletionTime,
28+
LongestCommonSubsequence, MinimumExternalMacroDataCompression, MinimumTardinessSequencing,
29+
MultiprocessorScheduling, PaintShop, PartiallyOrderedKnapsack, ProductionPlanning, QueryArg,
30+
RectilinearPictureCompression, RegisterSufficiency, ResourceConstrainedScheduling,
31+
SchedulingToMinimizeWeightedCompletionTime,
3132
SchedulingWithIndividualDeadlines, SequencingToMinimizeMaximumCumulativeCost,
3233
SequencingToMinimizeWeightedCompletionTime, SequencingToMinimizeWeightedTardiness,
3334
SequencingWithReleaseTimesAndDeadlines, SequencingWithinIntervals, ShortestCommonSupersequence,
@@ -173,6 +174,7 @@ fn all_data_flags_empty(args: &CreateArgs) -> bool {
173174
&& args.num_attributes.is_none()
174175
&& args.source_string.is_none()
175176
&& args.target_string.is_none()
177+
&& args.pointer_cost.is_none()
176178
&& args.capacities.is_none()
177179
&& args.source_1.is_none()
178180
&& args.sink_1.is_none()
@@ -757,6 +759,9 @@ fn example_for(canonical: &str, graph_type: Option<&str>) -> &'static str {
757759
"--strings \"010110;100101;001011\" --alphabet-size 2"
758760
}
759761
"GroupingBySwapping" => "--string \"0,1,2,0,1,2\" --bound 5",
762+
"MinimumExternalMacroDataCompression" => {
763+
"--string \"0,1,0,1\" --pointer-cost 2 --alphabet-size 2"
764+
}
760765
"MinimumCardinalityKey" => {
761766
"--num-attributes 6 --dependencies \"0,1>2;0,2>3;1,3>4;2,4>5\""
762767
}
@@ -892,6 +897,8 @@ fn help_flag_hint(
892897
"raw strings: \"ABAC;BACA\" or symbol lists: \"0,1,0;1,0,1\""
893898
}
894899
("GroupingBySwapping", "string") => "symbol list: \"0,1,2,0,1,2\"",
900+
("MinimumExternalMacroDataCompression", "string") => "symbol list: \"0,1,0,1\"",
901+
("MinimumExternalMacroDataCompression", "pointer_cost") => "positive integer: 2",
895902
("ShortestCommonSupersequence", "strings") => "symbol lists: \"0,1,2;1,2,0\"",
896903
("MultipleChoiceBranching", "partition") => "semicolon-separated groups: \"0,1;2,3\"",
897904
("IntegralFlowHomologousArcs", "homologous_pairs") => {
@@ -3157,6 +3164,57 @@ pub fn create(args: &CreateArgs, out: &OutputConfig) -> Result<()> {
31573164
)
31583165
}
31593166

3167+
// MinimumExternalMacroDataCompression
3168+
"MinimumExternalMacroDataCompression" => {
3169+
let usage = "Usage: pred create MinimumExternalMacroDataCompression --string \"0,1,0,1\" --pointer-cost 2 [--alphabet-size 2]";
3170+
let string_str = args.string.as_deref().ok_or_else(|| {
3171+
anyhow::anyhow!("MinimumExternalMacroDataCompression requires --string\n\n{usage}")
3172+
})?;
3173+
let pointer_cost = args.pointer_cost.ok_or_else(|| {
3174+
anyhow::anyhow!(
3175+
"MinimumExternalMacroDataCompression requires --pointer-cost\n\n{usage}"
3176+
)
3177+
})?;
3178+
anyhow::ensure!(
3179+
pointer_cost > 0,
3180+
"--pointer-cost must be a positive integer\n\n{usage}"
3181+
);
3182+
3183+
let string: Vec<usize> = if string_str.trim().is_empty() {
3184+
Vec::new()
3185+
} else {
3186+
string_str
3187+
.split(',')
3188+
.map(|value| {
3189+
value
3190+
.trim()
3191+
.parse::<usize>()
3192+
.context("invalid symbol index")
3193+
})
3194+
.collect::<Result<Vec<_>>>()?
3195+
};
3196+
let inferred = string.iter().copied().max().map_or(0, |value| value + 1);
3197+
let alphabet_size = args.alphabet_size.unwrap_or(inferred);
3198+
anyhow::ensure!(
3199+
alphabet_size >= inferred,
3200+
"--alphabet-size {} is smaller than max symbol + 1 ({}) in the input string",
3201+
alphabet_size,
3202+
inferred
3203+
);
3204+
anyhow::ensure!(
3205+
alphabet_size > 0 || string.is_empty(),
3206+
"MinimumExternalMacroDataCompression requires a positive alphabet for non-empty strings.\n\n{usage}"
3207+
);
3208+
(
3209+
ser(MinimumExternalMacroDataCompression::new(
3210+
alphabet_size,
3211+
string,
3212+
pointer_cost,
3213+
))?,
3214+
resolved_variant.clone(),
3215+
)
3216+
}
3217+
31603218
// ClosestVectorProblem
31613219
"ClosestVectorProblem" => {
31623220
let basis_str = args.basis.as_deref().ok_or_else(|| {
@@ -7867,6 +7925,7 @@ mod tests {
78677925
storage: None,
78687926
quantifiers: None,
78697927
homologous_pairs: None,
7928+
pointer_cost: None,
78707929
expression: None,
78717930
coeff_a: None,
78727931
coeff_b: None,

0 commit comments

Comments
 (0)