Skip to content

Commit b9bd09b

Browse files
committed
Small workaround to prevent breaking jsons when anonymizing text
1 parent 6d43d4e commit b9bd09b

File tree

2 files changed: 33 additions and 13 deletions

evaluators/presidio/langevals_presidio/pii_detection.py

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -109,6 +109,15 @@ def preload(cls):
109109

110110
def evaluate(self, entry: PresidioPIIDetectionEntry) -> SingleEvaluationResult:
111111
content = "\n\n".join([entry.input or "", entry.output or ""]).strip()
112+
try:
113+
json.loads(content)
114+
content = (
115+
content.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", "\r")
116+
)
117+
is_valid_json = True
118+
except:
119+
is_valid_json = False
120+
112121
if not content:
113122
return EvaluationResultSkipped(details="Input and output are both empty")
114123

@@ -137,7 +146,14 @@ def evaluate(self, entry: PresidioPIIDetectionEntry) -> SingleEvaluationResult:
137146
anonymized_text = anonymizer.anonymize(
138147
text=content,
139148
analyzer_results=results, # type: ignore
140-
)
149+
).text
150+
151+
if is_valid_json:
152+
anonymized_text = (
153+
anonymized_text.replace("\n", "\\n")
154+
.replace("\t", "\\t")
155+
.replace("\r", "\\r")
156+
)
141157

142158
serialized_results = [result.to_dict() for result in results]
143159

@@ -149,6 +165,6 @@ def evaluate(self, entry: PresidioPIIDetectionEntry) -> SingleEvaluationResult:
149165
),
150166
raw_response={
151167
"results": serialized_results,
152-
"anonymized": anonymized_text.text,
168+
"anonymized": anonymized_text,
153169
},
154170
)

evaluators/presidio/tests/test_pii_detection.py

Lines changed: 15 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -12,25 +12,29 @@
1212

1313
def test_pii_detection():
1414
entry = PresidioPIIDetectionEntry(input="hey there, my email is foo@bar.com")
15-
evaluator = PresidioPIIDetectionEvaluator(
16-
settings=PresidioPIIDetectionSettings()
17-
)
15+
evaluator = PresidioPIIDetectionEvaluator(settings=PresidioPIIDetectionSettings())
1816
result = evaluator.evaluate(entry)
1917

2018
assert result.status == "processed"
21-
assert result.score == 2
19+
assert result.score == 1
2220
assert result.passed is False
23-
assert (
24-
result.details
25-
== "PII detected: EMAIL_ADDRESS (likelihood: 1.0), URL (likelihood: 0.5)"
26-
)
21+
assert result.details == "PII detected: EMAIL_ADDRESS (likelihood: 1.0)"
2722

2823

2924
def test_pii_detection_long_context():
3025
entry = PresidioPIIDetectionEntry(input="lorem ipsum dolor " * 100000)
31-
evaluator = PresidioPIIDetectionEvaluator(
32-
settings=PresidioPIIDetectionSettings()
33-
)
26+
evaluator = PresidioPIIDetectionEvaluator(settings=PresidioPIIDetectionSettings())
3427

3528
with pytest.raises(Exception):
3629
evaluator.evaluate(entry)
30+
31+
32+
def test_keep_jsons_valid():
33+
entry = PresidioPIIDetectionEntry(input='{"foo": "bar\\nfoo@bar.com"}')
34+
evaluator = PresidioPIIDetectionEvaluator(settings=PresidioPIIDetectionSettings())
35+
result = evaluator.evaluate(entry)
36+
37+
assert result.status == "processed"
38+
assert result.passed is False
39+
assert result.details == "PII detected: EMAIL_ADDRESS (likelihood: 1.0)"
40+
assert result.raw_response["anonymized"] == '{"foo": "bar\\n<EMAIL_ADDRESS>"}' # type: ignore

0 commit comments

Comments
 (0)