Skip to content

Commit 28956f2

Browse files
committed
Add logics for compressing fasta file
1 parent 97cd73f commit 28956f2

File tree

4 files changed

+1340128
-0
lines changed

4 files changed

+1340128
-0
lines changed

src/lib.rs

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,102 @@ pub fn decompress_sequence(compressed: &[u8], sequence_length: usize) -> io::Res
105105

106106
Ok(sequence)
107107
}
108+
109+
/// Compresses a FASTA file content into a vector of bytes.
110+
///
111+
/// The FASTA file content is expected to have a header line followed by
112+
/// the DNA sequence. The DNA sequence is compressed by representing each
113+
/// base (A, C, T, G) with 2 bits. The compressed data starts with a 4-byte
114+
/// (u32) integer representing the length of the original sequence.
115+
///
116+
/// # Arguments
117+
///
118+
/// * `content` - A string slice that holds the FASTA file content.
119+
///
120+
/// # Returns
121+
///
122+
/// A vector of bytes containing the compressed FASTA file content.
123+
pub fn compress_fasta(content: &str) -> Vec<u8> {
124+
let mut lines = content.lines();
125+
let header = lines.next().unwrap_or("").to_string();
126+
let sequence: String = lines.map(|line| line.trim()).collect();
127+
128+
let sequence_length = sequence.len() as u32;
129+
let compressed_data = compress_sequence(&sequence);
130+
131+
let mut output = Vec::new();
132+
133+
// Write header length (4 bytes)
134+
output.extend_from_slice(&(header.len() as u32).to_le_bytes());
135+
136+
// Write header
137+
output.extend_from_slice(header.as_bytes());
138+
139+
// Write sequence length (4 bytes)
140+
output.extend_from_slice(&sequence_length.to_le_bytes());
141+
142+
// Write compressed data length (4 bytes)
143+
output.extend_from_slice(&(compressed_data.len() as u32).to_le_bytes());
144+
145+
// Write compressed data
146+
output.extend_from_slice(&compressed_data);
147+
148+
output
149+
}
150+
151+
/// Decompresses a vector of bytes into a FASTA file content.
152+
///
153+
/// The compressed data starts with a 4-byte (u32) integer representing
154+
/// the length of the header, followed by the header, the sequence length,
155+
/// and the compressed sequence data. Each base (A, C, T, G) is represented
156+
/// by 2 bits.
157+
///
158+
/// # Arguments
159+
///
160+
/// * `data` - A slice of bytes containing the compressed FASTA file content.
161+
///
162+
/// # Returns
163+
///
164+
/// A string containing the decompressed FASTA file content.
165+
///
166+
/// # Errors
167+
///
168+
/// Returns an error if the file is too short or if the file is missing
169+
pub fn decompress_fasta(data: &[u8]) -> Result<String, String> {
170+
if data.len() < 12 {
171+
return Err("File is too short".to_string());
172+
}
173+
174+
let header_len = u32::from_le_bytes(data[0..4].try_into().unwrap()) as usize;
175+
176+
if data.len() < 12 + header_len {
177+
return Err("File is too short for header".to_string());
178+
}
179+
180+
let header = String::from_utf8(data[4..4 + header_len].to_vec()).map_err(|e| e.to_string())?;
181+
182+
let sequence_length =
183+
u32::from_le_bytes(data[4 + header_len..8 + header_len].try_into().unwrap()) as usize;
184+
185+
let compressed_len =
186+
u32::from_le_bytes(data[8 + header_len..12 + header_len].try_into().unwrap()) as usize;
187+
188+
if data.len() < 12 + header_len + compressed_len {
189+
return Err("File is too short for compressed data".to_string());
190+
}
191+
192+
let compressed_data = &data[12 + header_len..12 + header_len + compressed_len];
193+
let decompressed = decompress_sequence(compressed_data, sequence_length).unwrap_or_default();
194+
195+
let mut result =
196+
String::with_capacity(header.len() + decompressed.len() + (decompressed.len() / 60) * 2);
197+
result.push_str(&header);
198+
result.push('\n');
199+
200+
for chunk in decompressed.as_bytes().chunks(60) {
201+
result.extend(chunk.iter().map(|&b| b as char));
202+
result.push('\n');
203+
}
204+
205+
Ok(result)
206+
}

tests/compression.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@ use base_sequence_compression::{compress_sequence, decompress_sequence};
22

33
#[cfg(test)]
44
mod tests {
5+
use std::path::Path;
6+
7+
use base_sequence_compression::{compress_fasta, decompress_fasta};
8+
59
use super::*;
610

711
#[test]
@@ -105,4 +109,26 @@ mod tests {
105109

106110
assert_eq!("ACGT", decompressed);
107111
}
112+
113+
#[test]
fn test_fasta_file() {
    // Round trip: the fixture must come back unchanged after
    // compression followed by decompression.
    let original = std::fs::read_to_string(Path::new("tests/input/test.fasta")).unwrap();

    let packed = compress_fasta(&original);
    let restored = decompress_fasta(&packed).unwrap();

    assert_eq!(original, restored);
}
123+
124+
#[test]
fn test_large_fasta_file() {
    // Same round-trip check as the small fixture, on the large input file.
    let original = std::fs::read_to_string(Path::new("tests/input/large.fasta")).unwrap();

    let packed = compress_fasta(&original);
    let restored = decompress_fasta(&packed).unwrap();

    assert_eq!(original, restored);
}
108134
}

0 commit comments

Comments
 (0)