Skip to content

Commit ffd9c81

Browse files
committed
Improve compression logic
1 parent 6a300c6 commit ffd9c81

File tree

2 files changed

+36
-33
lines changed

2 files changed

+36
-33
lines changed

src/lib.rs

Lines changed: 14 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -21,14 +21,10 @@ pub const G_BITS: u8 = 0b11;
2121
///
2222
/// A vector of bytes containing the compressed DNA sequence.
2323
pub fn compress_sequence(sequence: &str) -> Vec<u8> {
24-
let mut compressed = Vec::new();
24+
let mut compressed = Vec::with_capacity(sequence.len() / 4 + 1);
2525
let mut current_byte = 0u8;
2626
let mut bit_count = 0;
2727

28-
// Prepend the length of the original DNA sequence as a 4-byte (u32) integer
29-
let length: u32 = sequence.len() as u32;
30-
compressed.extend_from_slice(&length.to_be_bytes());
31-
3228
for base in sequence.chars() {
3329
let bits = match base {
3430
'A' => A_BITS,
@@ -53,7 +49,8 @@ pub fn compress_sequence(sequence: &str) -> Vec<u8> {
5349
}
5450

5551
if bit_count > 0 {
56-
compressed.push(current_byte << (8 - bit_count));
52+
current_byte <<= 8 - bit_count;
53+
compressed.push(current_byte);
5754
}
5855

5956
compressed
@@ -72,29 +69,24 @@ pub fn compress_sequence(sequence: &str) -> Vec<u8> {
7269
/// # Returns
7370
///
7471
/// A string containing the decompressed DNA sequence.
75-
pub fn decompress_sequence(compressed: &[u8]) -> String {
76-
// Extract the length of the original DNA sequence from the first 4 bytes
77-
let length =
78-
u32::from_be_bytes([compressed[0], compressed[1], compressed[2], compressed[3]]) as usize;
72+
pub fn decompress_sequence(compressed: &[u8], sequence_length: usize) -> String {
7973
let mut sequence = String::new();
80-
let mut bits = 0;
81-
let mut bit_count = 0;
82-
83-
for &byte in &compressed[4..] {
84-
bits = (bits << 8) | byte as usize;
85-
bit_count += 8;
8674

87-
while bit_count >= 2 && sequence.len() < length {
88-
let base_bits = (bits >> (bit_count - 2)) & 0b11;
89-
let base = match base_bits as u8 {
75+
for &byte in compressed {
76+
let mut current_byte = byte;
77+
for _ in 0..4 {
78+
if sequence.len() >= sequence_length {
79+
break;
80+
}
81+
let nucleotide = match (current_byte >> 6) & 0b11 {
9082
A_BITS => 'A',
9183
C_BITS => 'C',
9284
T_BITS => 'T',
9385
G_BITS => 'G',
94-
_ => continue,
86+
_ => unreachable!(),
9587
};
96-
sequence.push(base);
97-
bit_count -= 2;
88+
sequence.push(nucleotide);
89+
current_byte <<= 2;
9890
}
9991
}
10092

tests/compression.rs

Lines changed: 22 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,8 @@ mod tests {
88
fn test_compress_decompress() {
99
let dna_sequence = "ACGTACGTACGT";
1010
let compressed = compress_sequence(dna_sequence);
11-
let decompressed = decompress_sequence(&compressed);
11+
let sequence_length = dna_sequence.len();
12+
let decompressed = decompress_sequence(&compressed, sequence_length);
1213

1314
assert_eq!(dna_sequence, decompressed);
1415
}
@@ -17,71 +18,80 @@ mod tests {
1718
fn test_empty_sequence() {
1819
let dna_sequence = "";
1920
let compressed = compress_sequence(dna_sequence);
20-
let decompressed = decompress_sequence(&compressed);
21+
let sequence_length = dna_sequence.len();
22+
let decompressed = decompress_sequence(&compressed, sequence_length);
2123
assert_eq!(dna_sequence, decompressed);
2224
}
2325

2426
#[test]
2527
fn test_single_character_a() {
2628
let dna_sequence = "A";
2729
let compressed = compress_sequence(dna_sequence);
28-
let decompressed = decompress_sequence(&compressed);
30+
let sequence_length = dna_sequence.len();
31+
let decompressed = decompress_sequence(&compressed, sequence_length);
2932
assert_eq!(dna_sequence, decompressed);
3033
}
3134

3235
#[test]
3336
fn test_single_character_c() {
3437
let dna_sequence = "C";
3538
let compressed = compress_sequence(dna_sequence);
36-
let decompressed = decompress_sequence(&compressed);
39+
let sequence_length = dna_sequence.len();
40+
let decompressed = decompress_sequence(&compressed, sequence_length);
3741
assert_eq!(dna_sequence, decompressed);
3842
}
3943

4044
#[test]
4145
fn test_single_character_t() {
4246
let dna_sequence = "T";
4347
let compressed = compress_sequence(dna_sequence);
44-
let decompressed = decompress_sequence(&compressed);
48+
let sequence_length = dna_sequence.len();
49+
let decompressed = decompress_sequence(&compressed, sequence_length);
4550
assert_eq!(dna_sequence, decompressed);
4651
}
4752

4853
#[test]
4954
fn test_single_character_g() {
5055
let dna_sequence = "G";
5156
let compressed = compress_sequence(dna_sequence);
52-
let decompressed = decompress_sequence(&compressed);
57+
let sequence_length = dna_sequence.len();
58+
let decompressed = decompress_sequence(&compressed, sequence_length);
5359
assert_eq!(dna_sequence, decompressed);
5460
}
5561

5662
#[test]
5763
fn test_non_multiple_of_four_length() {
5864
let dna_sequence = "ACGTACGTA";
5965
let compressed = compress_sequence(dna_sequence);
60-
let decompressed = decompress_sequence(&compressed);
66+
let sequence_length = dna_sequence.len();
67+
let decompressed = decompress_sequence(&compressed, sequence_length);
6168
assert_eq!(dna_sequence, decompressed);
6269
}
6370

6471
#[test]
6572
fn test_non_multiple_of_four_length_2() {
6673
let dna_sequence = "ACGTACGTAC";
6774
let compressed = compress_sequence(dna_sequence);
68-
let decompressed = decompress_sequence(&compressed);
75+
let sequence_length = dna_sequence.len();
76+
let decompressed = decompress_sequence(&compressed, sequence_length);
6977
assert_eq!(dna_sequence, decompressed);
7078
}
7179

7280
#[test]
7381
fn test_non_multiple_of_four_length_3() {
7482
let dna_sequence = "ACGTACGTACG";
7583
let compressed = compress_sequence(dna_sequence);
76-
let decompressed = decompress_sequence(&compressed);
84+
let sequence_length = dna_sequence.len();
85+
let decompressed = decompress_sequence(&compressed, sequence_length);
7786
assert_eq!(dna_sequence, decompressed);
7887
}
7988

8089
#[test]
8190
fn test_lower_case() {
8291
let dna_sequence = "acgt";
8392
let compressed = compress_sequence(dna_sequence);
84-
let decompressed = decompress_sequence(&compressed);
93+
let sequence_length = dna_sequence.len();
94+
let decompressed = decompress_sequence(&compressed, sequence_length);
8595

8696
assert_eq!(dna_sequence.to_uppercase(), decompressed);
8797
}
@@ -90,7 +100,8 @@ mod tests {
90100
fn test_invalid_sequence() {
91101
let dna_sequence = "ACXGT";
92102
let compressed = compress_sequence(dna_sequence);
93-
let decompressed = decompress_sequence(&compressed);
103+
let sequence_length = dna_sequence.len();
104+
let decompressed = decompress_sequence(&compressed, sequence_length);
94105

95106
assert_eq!("ACGT", decompressed);
96107
}

0 commit comments

Comments
 (0)