-
Notifications
You must be signed in to change notification settings - Fork 263
Add Compression Utils #3102
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
SethSmucker
wants to merge
8
commits into
integration
Choose a base branch
from
task/compression-utils
base: integration
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Add Compression Utils #3102
Changes from 7 commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
109a959
Init with compression sketch
SethSmucker 3096695
wip
SethSmucker 74b9172
Starter tests
SethSmucker 98a63f4
Refactor + add 7ZIP utils to mvn
SethSmucker 92e36dc
Add Efficiency Test
SethSmucker d95f162
Fixes
SethSmucker 6d9a11f
formatting
SethSmucker bda8282
Fixes
SethSmucker File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
236 changes: 236 additions & 0 deletions
236
warehouse/core/src/main/java/datawave/util/compression/OptionCompressor.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,236 @@ | ||
package datawave.util.compression; | ||
|
||
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UncheckedIOException;
import java.nio.charset.Charset;
import java.util.Objects;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import org.apache.commons.compress.compressors.lzma.LZMACompressorInputStream;
import org.apache.commons.compress.compressors.lzma.LZMACompressorOutputStream;
|
||
// Bzip2, Gz, Lz4, Lzo, NoCompression, Snappy, ZStandard | ||
|
||
/** | ||
* Provides a base contract for compression and decompression methods. | ||
* | ||
* <p> | ||
* General process: | ||
* </p> | ||
* <ol> | ||
* <li><b>Compression</b>: Original Data → Compress with Algorithm (selected via {@link CompressionMethod}) → Encode in Base64 (except {@code NONE}) → Return as | ||
* String</li> | ||
* <li><b>Decompression</b>: Input String (Base64 if compressed; plain text if {@code NONE}) → Decode Base64 (if applicable) → Decompress with Algorithm → | ||
* Return Original Data</li> | ||
* </ol> | ||
* | ||
* <p> | ||
* Base64 encoding ensures that compressed data can be safely stored or transferred as text. For {@link CompressionMethod#NONE} the data is returned as-is and | ||
* is not Base64-encoded. | ||
* </p> | ||
*/ | ||
public class OptionCompressor { | ||
|
||
// possible data class for holding metadata for the compression method. Not sure if I'll need to actually include this. | ||
// see Gz.java or Bzip2.java from Accumulo. | ||
// public CompressionAlgorithmConfiguration configuration; | ||
|
||
// additionally, see https://commons.apache.org/proper/commons-compress/ | ||
// it's probably best to source all the compression from the same place if possible. | ||
|
||
public enum CompressionMethod { | ||
NONE, GZIP, BZIP2, SEVEN_ZIP | ||
} | ||
|
||
/** | ||
* Compresses the given string using the specified {@link CompressionMethod}. | ||
* | ||
* <p> | ||
* Order of operations (by method): | ||
* </p> | ||
* <ul> | ||
* <li>{@code NONE}: Return {@code data} unchanged; no Base64 applied.</li> | ||
* <li>{@code GZIP}: Convert {@code data} to bytes with {@code charset} → GZIP-compress → Base64-encode → return String.</li> | ||
* <li>{@code BZIP2}: Convert {@code data} to bytes with {@code charset} → BZIP2-compress → Base64-encode → return String.</li> | ||
* <li>{@code SEVEN_ZIP}: Convert {@code data} to bytes with {@code charset} → LZMA-compress (7z algorithm stream) → Base64-encode → return String.</li> | ||
* </ul> | ||
*/ | ||
public String compress(final String data, final CompressionMethod method, final Charset charset) throws IOException { | ||
switch (method) { | ||
case NONE: | ||
return data; | ||
case GZIP: | ||
return compressGZIP(data, charset); | ||
case BZIP2: | ||
return compressBZIP2(data, charset); | ||
case SEVEN_ZIP: | ||
return compress7ZIP(data, charset); | ||
default: | ||
throw new IllegalArgumentException("unrecognized compression option: " + method); | ||
} | ||
} | ||
|
||
/** | ||
* GZIP implementation reference: 1) String → bytes via charset 2) Write to GZIPOutputStream (must close to finalize) 3) Base64-encode compressed bytes | ||
*/ | ||
private String compressGZIP(final String data, final Charset charset) { | ||
final byte[] input = data.getBytes(charset); | ||
|
||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); GZIPOutputStream gzip = new GZIPOutputStream(baos)) { | ||
|
||
gzip.write(input); | ||
gzip.close(); // must close to flush all compressed data into baos | ||
apmoriarty marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
return Base64.encodeBase64String(baos.toByteArray()); | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
/** | ||
* BZIP2 implementation (mirrors GZIP flow): 1) String → bytes via charset 2) Write to BZip2CompressorOutputStream (close to finalize) 3) Base64-encode | ||
* compressed bytes | ||
*/ | ||
private String compressBZIP2(final String data, final Charset charset) { | ||
final byte[] input = data.getBytes(charset); | ||
|
||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); BZip2CompressorOutputStream bzos = new BZip2CompressorOutputStream(baos)) { | ||
|
||
bzos.write(input); | ||
bzos.close(); // finalize BZIP2 stream | ||
|
||
return Base64.encodeBase64String(baos.toByteArray()); | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
/** | ||
* "7-Zip" via LZMA stream implementation: 1) String → bytes via charset 2) Write to LZMACompressorOutputStream (close to finalize) 3) Base64-encode | ||
* compressed bytes | ||
* | ||
* Note: This uses the LZMA compressor stream from Apache Commons Compress, which is the algorithm used by 7-Zip; it is not a .7z archive container. | ||
*/ | ||
private String compress7ZIP(final String data, final Charset charset) { | ||
final byte[] input = data.getBytes(charset); | ||
|
||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); LZMACompressorOutputStream lzma = new LZMACompressorOutputStream(baos)) { | ||
|
||
lzma.write(input); | ||
lzma.close(); // finalize LZMA stream | ||
|
||
return Base64.encodeBase64String(baos.toByteArray()); | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
/** | ||
* GZIP implementation reference for decompression: 1) Base64-decode to compressed bytes 2) Read via GZIPInputStream into buffer 3) Collect into | ||
* ByteArrayOutputStream → String via charset | ||
*/ | ||
private String decompressGZIP(final String dataBase64, final Charset charset) { | ||
final byte[] compressed = Base64.decodeBase64(dataBase64); | ||
|
||
try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed); | ||
GZIPInputStream gzip = new GZIPInputStream(bais); | ||
ByteArrayOutputStream baos = new ByteArrayOutputStream()) { | ||
|
||
byte[] buf = new byte[4096]; | ||
int n; | ||
while ((n = gzip.read(buf)) != -1) { | ||
baos.write(buf, 0, n); | ||
} | ||
|
||
// NOTE: Keeping the same pattern as provided code. | ||
return baos.toString(charset); | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
/** | ||
* BZIP2 decompression (mirrors GZIP flow): 1) Base64-decode to compressed bytes 2) Read via BZip2CompressorInputStream into buffer 3) Collect into | ||
* ByteArrayOutputStream → String via charset | ||
*/ | ||
private String decompressBZIP2(final String dataBase64, final Charset charset) { | ||
final byte[] compressed = Base64.decodeBase64(dataBase64); | ||
|
||
try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed); | ||
BZip2CompressorInputStream bzis = new BZip2CompressorInputStream(bais); | ||
ByteArrayOutputStream baos = new ByteArrayOutputStream()) { | ||
|
||
byte[] buf = new byte[4096]; | ||
int n; | ||
while ((n = bzis.read(buf)) != -1) { | ||
baos.write(buf, 0, n); | ||
} | ||
|
||
return baos.toString(charset); | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
/** | ||
* "7-Zip" via LZMA stream decompression (mirrors GZIP/BZIP2 flow): 1) Base64-decode to compressed bytes 2) Read via LZMACompressorInputStream into buffer | ||
* 3) Collect into ByteArrayOutputStream → String via charset | ||
* | ||
* Note: This expects LZMA stream data (not a .7z archive container). | ||
*/ | ||
private String decompress7ZIP(final String dataBase64, final Charset charset) { | ||
final byte[] compressed = Base64.decodeBase64(dataBase64); | ||
|
||
try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed); | ||
LZMACompressorInputStream lzma = new LZMACompressorInputStream(bais); | ||
ByteArrayOutputStream baos = new ByteArrayOutputStream()) { | ||
|
||
byte[] buf = new byte[4096]; | ||
int n; | ||
while ((n = lzma.read(buf)) != -1) { | ||
baos.write(buf, 0, n); | ||
} | ||
|
||
return baos.toString(charset); | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
/** | ||
* Decompresses a string using the specified {@link CompressionMethod}. | ||
* | ||
* <p> | ||
* Order of operations (by method): | ||
* </p> | ||
* <ul> | ||
* <li>{@code NONE}: Return input unchanged (input is expected to be the original plain text, not Base64).</li> | ||
* <li>{@code GZIP}: Base64-decode → GZIP-decompress → decode to String using {@code charset}.</li> | ||
* <li>{@code BZIP2}: Base64-decode → BZIP2-decompress → decode to String using {@code charset}.</li> | ||
* <li>{@code SEVEN_ZIP}: Base64-decode → LZMA-decompress → decode to String using {@code charset}.</li> | ||
* </ul> | ||
* | ||
* <p> | ||
* This is the reverse of {@link #compress(String, CompressionMethod, Charset)}. For all non-{@code NONE} methods, the input should be the Base64 text | ||
* returned by the corresponding {@code compress(...)} call. | ||
* </p> | ||
*/ | ||
public String decompress(final String input, final CompressionMethod method, final Charset charset) throws IOException { | ||
switch (method) { | ||
case NONE: | ||
return input; | ||
case GZIP: | ||
return decompressGZIP(input, charset); | ||
case BZIP2: | ||
return decompressBZIP2(input, charset); | ||
case SEVEN_ZIP: | ||
return decompress7ZIP(input, charset); | ||
default: | ||
throw new IllegalArgumentException("unrecognized decompression option: " + method); | ||
} | ||
} | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.