Skip to content

Commit 03d037a

Browse files
committed
add initial fuzz test for partial_ratio
1 parent c6a3ac8 commit 03d037a

File tree

5 files changed

+163
-0
lines changed

5 files changed

+163
-0
lines changed

fuzzing/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,5 @@ create_fuzzer(osa_distance)
2020
create_fuzzer(damerau_levenshtein_distance)
2121

2222
create_fuzzer(jaro_similarity)
23+
24+
create_fuzzer(partial_ratio)

fuzzing/fuzz_partial_ratio.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/* SPDX-License-Identifier: MIT */
2+
/* Copyright © 2021 Max Bachmann */
3+
4+
#include "../rapidfuzz_reference/fuzz.hpp"
5+
#include "fuzzing.hpp"
6+
#include <cstdint>
7+
#include <rapidfuzz/fuzz.hpp>
8+
#include <stdexcept>
9+
#include <string>
10+
11+
bool is_close(double a, double b, double epsilon)
12+
{
13+
return fabs(a - b) <= ((fabs(a) < fabs(b) ? fabs(b) : fabs(a)) * epsilon);
14+
}
15+
16+
void validate_distance(const std::vector<uint8_t>& s1, const std::vector<uint8_t>& s2)
17+
{
18+
auto sim = rapidfuzz::fuzz::partial_ratio(s1, s2);
19+
auto reference_sim = rapidfuzz_reference::partial_ratio(s1, s2);
20+
if (!is_close(sim, reference_sim, 0.0001)) {
21+
print_seq("s1: ", s1);
22+
print_seq("s2: ", s2);
23+
throw std::logic_error(std::string("partial_ratio failed (reference_score = ") +
24+
std::to_string(reference_sim) + std::string(", score = ") +
25+
std::to_string(sim) + ")");
26+
}
27+
}
28+
29+
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
30+
{
31+
std::vector<uint8_t> s1, s2;
32+
if (!extract_strings(data, size, s1, s2)) return 0;
33+
34+
validate_distance(s1, s2);
35+
validate_distance(s2, s1);
36+
37+
/* test long sequences */
38+
for (unsigned int i = 2; i < 9; ++i) {
39+
std::vector<uint8_t> s1_ = vec_multiply(s1, pow<size_t>(2, i));
40+
std::vector<uint8_t> s2_ = vec_multiply(s2, pow<size_t>(2, i));
41+
42+
if (s1_.size() > 10000 || s2_.size() > 10000) break;
43+
44+
validate_distance(s1_, s2_);
45+
validate_distance(s2_, s1_);
46+
validate_distance(s1, s2_);
47+
validate_distance(s2_, s1);
48+
validate_distance(s1_, s2);
49+
validate_distance(s2, s1_);
50+
}
51+
52+
return 0;
53+
}

rapidfuzz_reference/Indel.hpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,17 @@ size_t indel_distance(const Sentence1& s1, const Sentence2& s2,
2222
return levenshtein_distance(s1, s2, {1, 1, 2}, score_cutoff);
2323
}
2424

25+
template <typename InputIt1, typename InputIt2>
26+
double indel_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
27+
double score_cutoff = 0.0)
28+
{
29+
return levenshtein_similarity(first1, last1, first2, last2, {1, 1, 2}, score_cutoff);
30+
}
31+
32+
template <typename Sentence1, typename Sentence2>
33+
double indel_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
34+
{
35+
return levenshtein_similarity(s1, s2, {1, 1, 2}, score_cutoff);
36+
}
37+
2538
} // namespace rapidfuzz_reference

rapidfuzz_reference/Levenshtein.hpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,18 @@ struct LevenshteinWeightTable {
1616
size_t replace_cost;
1717
};
1818

19+
static inline size_t levenshtein_maximum(size_t len1, size_t len2, LevenshteinWeightTable weights)
20+
{
21+
size_t max_dist = len1 * weights.delete_cost + len2 * weights.insert_cost;
22+
23+
if (len1 >= len2)
24+
max_dist = std::min(max_dist, len2 * weights.replace_cost + (len1 - len2) * weights.delete_cost);
25+
else
26+
max_dist = std::min(max_dist, len1 * weights.replace_cost + (len2 - len1) * weights.insert_cost);
27+
28+
return max_dist;
29+
}
30+
1931
template <typename InputIt1, typename InputIt2>
2032
Matrix<size_t> levenshtein_matrix(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
2133
LevenshteinWeightTable weights = {1, 1, 1})
@@ -69,4 +81,24 @@ size_t levenshtein_distance(const Sentence1& s1, const Sentence2& s2,
6981
score_cutoff);
7082
}
7183

84+
template <typename InputIt1, typename InputIt2>
85+
double levenshtein_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
86+
LevenshteinWeightTable weights = {1, 1, 1}, double score_cutoff = 0.0)
87+
{
88+
size_t len1 = static_cast<size_t>(std::distance(first1, last1));
89+
size_t len2 = static_cast<size_t>(std::distance(first2, last2));
90+
size_t dist = levenshtein_distance(first1, last1, first2, last2, weights);
91+
size_t max = levenshtein_maximum(len1, len2, weights);
92+
double sim = 1.0 - (double)dist / max;
93+
return (sim >= score_cutoff) ? sim : 0.0;
94+
}
95+
96+
template <typename Sentence1, typename Sentence2>
97+
double levenshtein_similarity(const Sentence1& s1, const Sentence2& s2,
98+
LevenshteinWeightTable weights = {1, 1, 1}, double score_cutoff = 0.0)
99+
{
100+
return levenshtein_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), weights,
101+
score_cutoff);
102+
}
103+
72104
} // namespace rapidfuzz_reference

rapidfuzz_reference/fuzz.hpp

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
/* SPDX-License-Identifier: MIT */
2+
/* Copyright © 2022-present Max Bachmann */
3+
4+
#pragma once
5+
6+
#include "Indel.hpp"
7+
8+
namespace rapidfuzz_reference {
9+
10+
template <typename InputIt1, typename InputIt2>
11+
double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0)
12+
{
13+
return indel_similarity(first1, last1, first2, last2, score_cutoff / 100.0) * 100;
14+
}
15+
16+
template <typename Sentence1, typename Sentence2>
17+
double ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
18+
{
19+
return indel_similarity(s1, s2, score_cutoff / 100.0) * 100;
20+
}
21+
22+
template <typename InputIt1, typename InputIt2>
23+
double partial_ratio_impl(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
24+
double score_cutoff = 0.0)
25+
{
26+
size_t len1 = static_cast<size_t>(std::distance(first1, last1));
27+
size_t len2 = static_cast<size_t>(std::distance(first2, last2));
28+
if (len1 == 0 && len2 == 0) return 100.0;
29+
30+
if (len1 == 0 || len2 == 0) return 0.0;
31+
32+
if (len1 > len2) return partial_ratio_impl(first2, last2, first1, last1, score_cutoff);
33+
34+
double res = 0.0;
35+
for (ptrdiff_t i = -1 * (ptrdiff_t)len1; i < (ptrdiff_t)len2; i++) {
36+
ptrdiff_t start = std::max(ptrdiff_t(0), i);
37+
ptrdiff_t end = std::min(ptrdiff_t(len2), i + ptrdiff_t(len1));
38+
InputIt2 first2_ = first2 + start;
39+
InputIt2 last2_ = first2 + end;
40+
res = std::max(res, ratio(first1, last1, first2_, last2_, score_cutoff));
41+
}
42+
return res;
43+
}
44+
45+
template <typename InputIt1, typename InputIt2>
46+
double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
47+
double score_cutoff = 0.0)
48+
{
49+
size_t len1 = static_cast<size_t>(std::distance(first1, last1));
50+
size_t len2 = static_cast<size_t>(std::distance(first2, last2));
51+
if (len1 != len2) return partial_ratio_impl(first1, last1, first2, last2, score_cutoff);
52+
53+
return std::max(partial_ratio_impl(first1, last1, first2, last2, score_cutoff),
54+
partial_ratio_impl(first2, last2, first1, last1, score_cutoff));
55+
}
56+
57+
template <typename Sentence1, typename Sentence2>
58+
double partial_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
59+
{
60+
return partial_ratio(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff);
61+
}
62+
63+
} // namespace rapidfuzz_reference

0 commit comments

Comments
 (0)