Skip to content

Commit f5e8372

Browse files
Merge pull request #19 from theseus-rs/correct-absolute-eof-logic
fix: update internal signature to require all byte sequences match
2 parents 8ebc9ca + ca38123 commit f5e8372

File tree

8 files changed

+72
-28
lines changed

8 files changed

+72
-28
lines changed

export/apache_httpd/src/main.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
#![deny(clippy::unwrap_used)]
55

66
use anyhow::Result;
7-
use file_type::pronom::{DocumentIdentifier, ExternalSignature, FileFormat};
7+
use file_type::pronom::{DocumentIdentifier, ExternalSignature, FileFormat, SignatureType};
88
use file_type::FileType;
99
use jiff::civil::Date;
1010
use jiff::tz::TimeZone;
@@ -128,7 +128,7 @@ fn process_mime_types(mime_types: HashMap<String, Vec<String>>) -> Result<Vec<Fi
128128
.iter()
129129
.enumerate()
130130
.map(|(index, extension)| {
131-
ExternalSignature::new(index, extension.as_str(), "File extension")
131+
ExternalSignature::new(index, extension.as_str(), SignatureType::FileExtension)
132132
})
133133
.collect::<Vec<ExternalSignature>>();
134134

export/linguist/src/main.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
#![deny(clippy::unwrap_used)]
55

66
use anyhow::Result;
7-
use file_type::pronom::{DocumentIdentifier, ExternalSignature, FileFormat};
7+
use file_type::pronom::{DocumentIdentifier, ExternalSignature, FileFormat, SignatureType};
88
use file_type::FileType;
99
use jiff::civil::Date;
1010
use jiff::tz::TimeZone;
@@ -149,7 +149,7 @@ fn process_languages(languages: Vec<Language>) -> Vec<FileFormat> {
149149
.iter()
150150
.enumerate()
151151
.map(|(index, extension)| {
152-
ExternalSignature::new(index, extension.as_str(), "File extension")
152+
ExternalSignature::new(index, extension.as_str(), SignatureType::FileExtension)
153153
})
154154
.collect::<Vec<ExternalSignature>>();
155155

file_type/src/pronom/byte_sequence.rs

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -149,7 +149,10 @@ impl ByteSequence {
149149
}
150150
PositionType::AbsoluteFromEOF => {
151151
let offset = self.offset.unwrap_or_default();
152-
let offset = usize::checked_sub(bytes.len(), offset).unwrap_or(0);
152+
let value = self.regex.to_string();
153+
let regex_len = value.len() / 2;
154+
let offset = usize::checked_add(regex_len, offset).unwrap_or(regex_len);
155+
let offset = usize::checked_sub(bytes.len(), offset).unwrap_or(bytes.len());
153156
self.regex.is_match_at(bytes, offset)
154157
}
155158
PositionType::Variable => {

file_type/src/pronom/external_signature.rs

Lines changed: 21 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,13 @@
11
use serde::{Deserialize, Serialize};
22

3+
/// The type of signature.
4+
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
5+
pub enum SignatureType {
6+
#[default]
7+
#[serde(rename = "File extension")]
8+
FileExtension,
9+
}
10+
311
/// An external signature.
412
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
513
#[serde(default, rename_all = "PascalCase")]
@@ -8,16 +16,16 @@ pub struct ExternalSignature {
816
id: usize,
917
signature: String,
1018
#[serde(rename = "SignatureType")]
11-
r#type: String,
19+
r#type: SignatureType,
1220
}
1321

1422
impl ExternalSignature {
1523
/// Create a new external signature.
16-
pub fn new<S: AsRef<str>>(id: usize, signature: S, r#type: S) -> Self {
24+
pub fn new<S: AsRef<str>>(id: usize, signature: S, r#type: SignatureType) -> Self {
1725
Self {
1826
id,
1927
signature: signature.as_ref().to_string(),
20-
r#type: r#type.as_ref().to_string(),
28+
r#type,
2129
}
2230
}
2331

@@ -35,7 +43,7 @@ impl ExternalSignature {
3543

3644
/// Get the type of the signature.
3745
#[must_use]
38-
pub fn r#type(&self) -> &str {
46+
pub fn r#type(&self) -> &SignatureType {
3947
&self.r#type
4048
}
4149
}
@@ -65,16 +73,22 @@ mod test {
6573
let external_signature: ExternalSignature = from_str(xml.as_str())?;
6674

6775
assert_eq!(external_signature.id(), 2421);
76+
assert!(matches!(
77+
external_signature.r#type(),
78+
SignatureType::FileExtension
79+
));
6880
assert_eq!(external_signature.signature(), "json");
69-
assert_eq!(external_signature.r#type(), "File extension");
7081
Ok(())
7182
}
7283

7384
#[test]
7485
fn test_new() {
75-
let external_signature = ExternalSignature::new(2421, "json", "File extension");
86+
let external_signature = ExternalSignature::new(2421, "json", SignatureType::FileExtension);
7687
assert_eq!(external_signature.id(), 2421);
88+
assert!(matches!(
89+
external_signature.r#type(),
90+
SignatureType::FileExtension
91+
));
7792
assert_eq!(external_signature.signature(), "json");
78-
assert_eq!(external_signature.r#type(), "File extension");
7993
}
8094
}

file_type/src/pronom/file_format.rs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -360,6 +360,7 @@ impl FileFormat {
360360
#[cfg(test)]
361361
mod tests {
362362
use super::*;
363+
use crate::pronom::external_signature::SignatureType;
363364
use crate::pronom::{Author, ByteSequence, Endianness, PositionType};
364365
use indoc::indoc;
365366
use quick_xml::de::from_str;
@@ -538,7 +539,7 @@ mod tests {
538539
],
539540
vec![],
540541
vec![
541-
ExternalSignature::new(761, "png", "File extension"),
542+
ExternalSignature::new(761, "png", SignatureType::FileExtension),
542543
],
543544
vec![],
544545
vec![],

file_type/src/pronom/internal_signature.rs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -59,9 +59,10 @@ impl InternalSignature {
5959
/// Check if this internal signature is a match for the given bytes
6060
#[must_use]
6161
pub fn is_match(&self, bytes: &[u8]) -> bool {
62+
// All byte sequences must match in order for the internal signature to match
6263
self.byte_sequences
6364
.iter()
64-
.any(|byte_sequence| byte_sequence.is_match(bytes))
65+
.all(|byte_sequence| byte_sequence.is_match(bytes))
6566
}
6667
}
6768

file_type/src/pronom/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ pub use byte_sequence::{ByteSequence, Endianness, PositionType};
1616
pub use compression_type::{CompressionType, Lossiness};
1717
pub use document::Document;
1818
pub use document_identifier::DocumentIdentifier;
19-
pub use external_signature::ExternalSignature;
19+
pub use external_signature::{ExternalSignature, SignatureType};
2020
pub use file_format::FileFormat;
2121
pub use internal_signature::InternalSignature;
2222
pub use publisher::Publisher;

file_type/tests/classify_pronom_files.rs

Lines changed: 38 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -12,9 +12,32 @@ fn data_dir() -> PathBuf {
1212
.join("pronom")
1313
}
1414

15+
async fn test_file(file_name: &str) -> Result<(String, &FileType)> {
16+
let data_dir = data_dir();
17+
let path = data_dir.join(file_name);
18+
let file_name = path
19+
.file_name()
20+
.expect("file name")
21+
.to_string_lossy()
22+
.to_string();
23+
let file_name = file_name.split('.').next().expect("split").to_string();
24+
let id = if file_name.starts_with("x-fmt-") {
25+
let parts: Vec<&str> = file_name.split('-').collect();
26+
format!("{}-{}/{}", parts[0], parts[1], parts[2])
27+
} else {
28+
let parts: Vec<&str> = file_name.split('-').collect();
29+
format!("{}/{}", parts[0], parts[1])
30+
};
31+
32+
let file_type = FileType::try_from_file(path).await?;
33+
Ok((id, file_type))
34+
}
35+
1536
#[tokio::test]
1637
async fn test_file_classification() -> Result<()> {
1738
let data_dir = data_dir();
39+
let mut passed_tests = 0;
40+
let mut failed_tests = 0;
1841

1942
for entry in WalkDir::new(data_dir) {
2043
let entry = entry?;
@@ -23,32 +46,34 @@ async fn test_file_classification() -> Result<()> {
2346
continue;
2447
}
2548

26-
let full_filename = path
49+
let file_name = path
2750
.file_name()
2851
.expect("file name")
2952
.to_string_lossy()
3053
.to_string();
31-
let filename = full_filename.split('.').next().expect("split").to_string();
32-
let id = if filename.starts_with("x-fmt-") {
33-
let parts: Vec<&str> = filename.split('-').collect();
34-
format!("{}-{}/{}", parts[0], parts[1], parts[2])
35-
} else {
36-
let parts: Vec<&str> = filename.split('-').collect();
37-
format!("{}/{}", parts[0], parts[1])
38-
};
54+
let (id, file_type) = test_file(&file_name).await?;
3955

40-
let file_type = FileType::try_from_file(path).await?;
4156
if file_type.id() == id {
42-
println!("file_type.id()={}, id={}", file_type.id(), id);
57+
assert_eq!(file_type.id(), id);
58+
passed_tests += 1;
4359
} else {
4460
eprintln!(
45-
"[ERROR] file_type.id()={}, id={}: {full_filename}",
61+
"[ERROR] file_type.id()={}, id={}: {file_name}",
4662
file_type.id(),
4763
id
4864
);
65+
failed_tests += 1;
4966
}
50-
// assert_eq!(file_type.id(), id);
5167
}
5268

69+
println!("Passed: {passed_tests}");
70+
println!("Failed: {failed_tests}");
5371
Ok(())
5472
}
73+
74+
// #[tokio::test]
75+
// async fn test_single_file_classification() -> Result<()> {
76+
// let (id, file_type) = test_file("fmt-708-signature-id-831.wav").await?;
77+
// assert_eq!(file_type.id(), id);
78+
// Ok(())
79+
// }

0 commit comments

Comments
 (0)