Skip to content

Commit 769a918

Browse files
authored
Merge branch 'master' into master
2 parents c43ddfd + 2da0465 commit 769a918

35 files changed

+4474
-132
lines changed

pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,14 +112,14 @@
112112
<dependency>
113113
<groupId>com.puppycrawl.tools</groupId>
114114
<artifactId>checkstyle</artifactId>
115-
<version>12.0.1</version>
115+
<version>12.1.0</version>
116116
</dependency>
117117
</dependencies>
118118
</plugin>
119119
<plugin>
120120
<groupId>com.github.spotbugs</groupId>
121121
<artifactId>spotbugs-maven-plugin</artifactId>
122-
<version>4.9.7.0</version>
122+
<version>4.9.8.1</version>
123123
<configuration>
124124
<excludeFilterFile>spotbugs-exclude.xml</excludeFilterFile>
125125
<includeTests>true</includeTests>
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
package com.thealgorithms.compression;
2+
3+
import java.math.BigDecimal;
4+
import java.math.MathContext;
5+
import java.util.ArrayList;
6+
import java.util.Collections;
7+
import java.util.HashMap;
8+
import java.util.List;
9+
import java.util.Map;
10+
11+
/**
 * An implementation of the Arithmetic Coding algorithm.
 *
 * <p>
 * Arithmetic coding is a form of entropy encoding used in lossless data
 * compression. It encodes an entire message into a single number, a fraction n
 * where {@code 0.0 <= n < 1.0}. Unlike Huffman coding, which assigns a specific
 * bit sequence to each symbol, arithmetic coding represents the message as a
 * sub-interval of the [0, 1) interval.
 * </p>
 *
 * <p>
 * This implementation uses BigDecimal for precision to handle the shrinking
 * intervals, making it suitable for educational purposes to demonstrate the
 * core logic. Interval arithmetic is exact (no rounding is applied while
 * narrowing), so the number of digits carried grows with the message length.
 * </p>
 *
 * <p>
 * Cost, counted in BigDecimal operations, for an input of length n with m
 * distinct symbols:
 * </p>
 * <ul>
 * <li>Building the probability table: O(n) counting plus O(m log m) sorting.</li>
 * <li>Compression: O(n) interval updates.</li>
 * <li>Decompression: O(n*m), because every position scans the symbol table.</li>
 * </ul>
 *
 * <p>
 * References:
 * </p>
 * <ul>
 * <li><a href="https://en.wikipedia.org/wiki/Arithmetic_coding">Wikipedia:
 * Arithmetic coding</a></li>
 * </ul>
 */
public final class ArithmeticCoding {

    private ArithmeticCoding() {
    }

    /**
     * Compresses a string using the Arithmetic Coding algorithm.
     *
     * <p>
     * The probability table is derived from the input itself via
     * {@link #calculateProbabilities(String)}; the same table (and the original
     * length) must be supplied to {@link #decompress} to recover the text.
     * </p>
     *
     * @param uncompressed The string to be compressed.
     * @return The compressed representation as a BigDecimal number (the lower
     *         bound of the final interval).
     * @throws IllegalArgumentException if the input string is null or empty.
     */
    public static BigDecimal compress(String uncompressed) {
        if (uncompressed == null || uncompressed.isEmpty()) {
            throw new IllegalArgumentException("Input string cannot be null or empty.");
        }

        Map<Character, Symbol> probabilityTable = calculateProbabilities(uncompressed);

        BigDecimal low = BigDecimal.ZERO;
        BigDecimal high = BigDecimal.ONE;

        for (int i = 0; i < uncompressed.length(); i++) {
            Symbol sym = probabilityTable.get(uncompressed.charAt(i));
            BigDecimal range = high.subtract(low);

            // 'high' must be computed first: both bounds are derived from the old 'low'.
            high = low.add(range.multiply(sym.high()));
            low = low.add(range.multiply(sym.low()));
        }

        return low; // Return the lower bound of the final interval
    }

    /**
     * Decompresses a BigDecimal number back into the original string.
     *
     * <p>
     * Symbols are tested in ascending character order. If, at some position, the
     * value lies outside every symbol's interval (for example because the table
     * does not belong to the value, or the table is empty), decoding stops and
     * the characters decoded so far are returned; the result is then shorter
     * than {@code length}. A {@code length} of zero or less yields an empty
     * string.
     * </p>
     *
     * @param compressed       The compressed BigDecimal number.
     * @param length           The length of the original uncompressed string.
     * @param probabilityTable The probability table used during compression.
     * @return The original, uncompressed string (or the decodable prefix, see
     *         above).
     * @throws NullPointerException if {@code compressed} or
     *                              {@code probabilityTable} is null.
     */
    public static String decompress(BigDecimal compressed, int length, Map<Character, Symbol> probabilityTable) {
        // Fail fast with a named argument instead of an anonymous NPE deep in the loop.
        java.util.Objects.requireNonNull(compressed, "compressed");
        java.util.Objects.requireNonNull(probabilityTable, "probabilityTable");

        StringBuilder decompressed = new StringBuilder();

        // Create a sorted list of symbols for deterministic decompression, matching the
        // order used in calculateProbabilities
        List<Map.Entry<Character, Symbol>> sortedSymbols = new ArrayList<>(probabilityTable.entrySet());
        sortedSymbols.sort(Map.Entry.comparingByKey());

        BigDecimal low = BigDecimal.ZERO;
        BigDecimal high = BigDecimal.ONE;

        for (int i = 0; i < length; i++) {
            BigDecimal range = high.subtract(low);
            boolean matched = false;

            // Find which symbol the compressed value falls into
            for (Map.Entry<Character, Symbol> entry : sortedSymbols) {
                Symbol sym = entry.getValue();

                // Calculate the actual range for this symbol in the current interval
                BigDecimal symLow = low.add(range.multiply(sym.low()));
                BigDecimal symHigh = low.add(range.multiply(sym.high()));

                // Check if the compressed value falls within this symbol's range
                if (compressed.compareTo(symLow) >= 0 && compressed.compareTo(symHigh) < 0) {
                    decompressed.append(entry.getKey());

                    // Update the interval for the next iteration
                    low = symLow;
                    high = symHigh;
                    matched = true;
                    break;
                }
            }

            if (!matched) {
                // The interval did not change, so no later position can match either;
                // stop instead of rescanning the table for every remaining position.
                break;
            }
        }

        return decompressed.toString();
    }

    /**
     * Calculates the frequency and probability range for each character in the
     * input string in a deterministic order.
     *
     * <p>
     * Characters are laid out in ascending order; each receives the half-open
     * range {@code [low, high)} whose width is its relative frequency, rounded
     * to {@link MathContext#DECIMAL128}. Because each width is rounded
     * independently, the upper bound of the last symbol may differ from 1 in the
     * last digits when a frequency has no finite decimal expansion. An empty
     * input produces an empty table.
     * </p>
     *
     * @param text The input string.
     * @return A map from each character to a Symbol object containing its
     *         probability range.
     * @throws NullPointerException if {@code text} is null.
     */
    public static Map<Character, Symbol> calculateProbabilities(String text) {
        Map<Character, Integer> frequencies = new HashMap<>();
        for (int i = 0; i < text.length(); i++) {
            frequencies.merge(text.charAt(i), 1, Integer::sum);
        }

        // Sort the characters to ensure a deterministic order for the probability table
        List<Character> sortedKeys = new ArrayList<>(frequencies.keySet());
        Collections.sort(sortedKeys);

        Map<Character, Symbol> probabilityTable = new HashMap<>();
        BigDecimal total = BigDecimal.valueOf(text.length());
        BigDecimal currentLow = BigDecimal.ZERO;

        for (Character symbol : sortedKeys) {
            BigDecimal probability = BigDecimal.valueOf(frequencies.get(symbol)).divide(total, MathContext.DECIMAL128);
            BigDecimal high = currentLow.add(probability);
            probabilityTable.put(symbol, new Symbol(currentLow, high));
            currentLow = high; // the next symbol starts where this one ends
        }

        return probabilityTable;
    }

    /**
     * Helper class to store the probability range [low, high) for a symbol.
     *
     * @param low  inclusive lower bound of the symbol's range
     * @param high exclusive upper bound of the symbol's range
     */
    public record Symbol(BigDecimal low, BigDecimal high) {
    }
}
Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
package com.thealgorithms.compression;
2+
3+
import java.util.Arrays;
4+
import java.util.HashMap;
5+
import java.util.Map;
6+
7+
/**
 * Implementation of the Burrows-Wheeler Transform (BWT) and its inverse.
 * <p>
 * BWT is a reversible data transformation algorithm that rearranges a string into runs of
 * similar characters. While not a compression algorithm itself, it significantly improves
 * the compressibility of data for subsequent algorithms like Move-to-Front encoding and
 * Run-Length Encoding.
 * </p>
 *
 * <p>The transform works by:</p>
 * <ol>
 * <li>Generating all rotations of the input string</li>
 * <li>Sorting these rotations lexicographically</li>
 * <li>Taking the last column of the sorted matrix as output</li>
 * <li>Recording the index of the original string in the sorted matrix</li>
 * </ol>
 *
 * <p><b>Important:</b> The input string should end with a unique end-of-string marker
 * (typically '$') that:</p>
 * <ul>
 * <li>Does not appear anywhere else in the text</li>
 * <li>Is lexicographically smaller than all other characters</li>
 * <li>Ensures unique rotations and enables correct inverse transformation</li>
 * </ul>
 * <p>Without this marker, the inverse transform may not correctly reconstruct the original string.
 * </p>
 *
 * <p><b>Time Complexity:</b></p>
 * <ul>
 * <li>Forward transform: O(n² log n) where n is the string length; all n rotations are
 * materialised, so it also uses O(n²) memory</li>
 * <li>Inverse transform: one sort of the n characters to build the first column, followed by
 * O(n) work to build and walk the LF-mapping</li>
 * </ul>
 *
 * <p><b>Example:</b></p>
 * <pre>
 * Input: "banana$"
 * Output: BWTResult("annb$aa", 4)
 * - "annb$aa" is the transformed string (groups similar characters)
 * - 4 is the index of the original string in the sorted rotations
 * </pre>
 *
 * @see <a href="https://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform">Burrows–Wheeler transform (Wikipedia)</a>
 */
public final class BurrowsWheelerTransform {

    private BurrowsWheelerTransform() {
    }

    /**
     * A container for the result of the forward BWT.
     * <p>
     * Contains the transformed string and the index of the original string
     * in the sorted rotations matrix, both of which are required for the
     * inverse transformation.
     * </p>
     */
    public static class BWTResult {
        /** The transformed string (last column of the sorted rotation matrix) */
        public final String transformed;

        /** The index of the original string in the sorted rotations matrix */
        public final int originalIndex;

        /**
         * Constructs a BWTResult with the transformed string and original index.
         *
         * @param transformed the transformed string (L-column)
         * @param originalIndex the index of the original string in sorted rotations
         */
        public BWTResult(String transformed, int originalIndex) {
            this.transformed = transformed;
            this.originalIndex = originalIndex;
        }

        /**
         * Two results are equal when they are of the same class and carry the same
         * index and the same transformed string (a {@code null} string equals only
         * another {@code null} string).
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            BWTResult other = (BWTResult) obj;
            // Null-safe: the constructor does not forbid a null 'transformed'.
            return originalIndex == other.originalIndex && java.util.Objects.equals(transformed, other.transformed);
        }

        @Override
        public int hashCode() {
            // Objects.hashCode yields 0 for null and String.hashCode() otherwise.
            return 31 * java.util.Objects.hashCode(transformed) + originalIndex;
        }

        @Override
        public String toString() {
            return "BWTResult[transformed=" + transformed + ", originalIndex=" + originalIndex + "]";
        }
    }

    /**
     * Performs the forward Burrows-Wheeler Transform on the input string.
     * <p>
     * The algorithm generates all cyclic rotations of the input, sorts them
     * lexicographically, and returns the last column of this sorted matrix
     * along with the position of the original string.
     * </p>
     *
     * <p><b>Note:</b> It is strongly recommended that the input string ends with
     * a unique end-of-string marker (e.g., '$') that is lexicographically smaller
     * than any other character in the string. This ensures correct inversion.</p>
     *
     * @param text the input string to transform
     * @return a {@link BWTResult} object containing the transformed string (L-column)
     *         and the index of the original string in the sorted rotations matrix;
     *         returns {@code BWTResult("", -1)} for {@code null} or empty input
     */
    public static BWTResult transform(String text) {
        if (text == null || text.isEmpty()) {
            return new BWTResult("", -1);
        }

        int n = text.length();

        // Every rotation is a length-n window of text+text.
        String doubled = text + text;
        String[] rotations = new String[n];
        for (int i = 0; i < n; i++) {
            rotations[i] = doubled.substring(i, i + n);
        }

        // Sort rotations lexicographically
        Arrays.sort(rotations);

        // The original text is always one of the rotations, so the search cannot miss.
        int originalIndex = Arrays.binarySearch(rotations, text);

        StringBuilder lastColumn = new StringBuilder(n);
        for (String rotation : rotations) {
            lastColumn.append(rotation.charAt(n - 1));
        }

        return new BWTResult(lastColumn.toString(), originalIndex);
    }

    /**
     * Performs the inverse Burrows-Wheeler Transform using the LF-mapping technique.
     * <p>
     * The LF-mapping (Last-First mapping) is an efficient method to reconstruct
     * the original string from the BWT output without explicitly reconstructing
     * the entire sorted rotations matrix.
     * </p>
     *
     * <p>The algorithm works by:</p>
     * <ol>
     * <li>Creating the first column by sorting the BWT string</li>
     * <li>Building a mapping from first column indices to last column indices</li>
     * <li>Following this mapping starting from the original index to reconstruct the string</li>
     * </ol>
     *
     * @param bwtString the transformed string (L-column) from the forward transform
     * @param originalIndex the index of the original string row from the forward transform;
     *        use -1 for empty strings
     * @return the original, untransformed string; returns empty string if {@code bwtString} is
     *         {@code null} or empty, or {@code originalIndex} is -1
     * @throws IllegalArgumentException if {@code originalIndex} is out of valid range (except -1)
     */
    public static String inverseTransform(String bwtString, int originalIndex) {
        if (bwtString == null || bwtString.isEmpty() || originalIndex == -1) {
            return "";
        }

        int n = bwtString.length();
        if (originalIndex < 0 || originalIndex >= n) {
            throw new IllegalArgumentException("Original index must be between 0 and " + (n - 1) + ", got: " + originalIndex);
        }

        char[] lastColumn = bwtString.toCharArray();
        char[] firstColumn = bwtString.toCharArray();
        Arrays.sort(firstColumn);

        // Store the first occurrence index of each character in the first column
        Map<Character, Integer> firstOccurrence = new HashMap<>();
        for (int i = 0; i < n; i++) {
            firstOccurrence.putIfAbsent(firstColumn[i], i);
        }

        // Create the "next" array for LF-mapping.
        // next[i] stores the row index in the last column that corresponds to firstColumn[i]
        int[] next = new int[n];

        // Track the count of each character seen so far in the last column
        Map<Character, Integer> countMap = new HashMap<>();

        // Build the LF-mapping: the k-th occurrence of a character in the last column
        // pairs with the k-th occurrence of that character in the first column.
        for (int i = 0; i < n; i++) {
            char c = lastColumn[i];
            int seenIncludingThis = countMap.merge(c, 1, Integer::sum);
            next[firstOccurrence.get(c) + seenIncludingThis - 1] = i;
        }

        // Reconstruct the original string by following the LF-mapping
        StringBuilder originalString = new StringBuilder(n);
        int currentRow = originalIndex;
        for (int i = 0; i < n; i++) {
            originalString.append(firstColumn[currentRow]);
            currentRow = next[currentRow];
        }

        return originalString.toString();
    }
}

0 commit comments

Comments
 (0)