Skip to content

Commit e4c8899

Browse files
committed
Add presidio PII detection
1 parent ec2e6cc commit e4c8899

File tree

11 files changed

+5681
-45
lines changed

11 files changed

+5681
-45
lines changed
Lines changed: 161 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,161 @@
1+
import json
2+
from typing import Any, Literal, Optional
3+
from langevals_core.base_evaluator import (
4+
BaseEvaluator,
5+
EvaluatorEntry,
6+
EvaluatorSettings,
7+
SingleEvaluationResult,
8+
EvaluationResult,
9+
EvaluationResultSkipped,
10+
)
11+
from pydantic import BaseModel, Field
12+
import spacy
13+
import spacy.cli
14+
from presidio_analyzer import AnalyzerEngine
15+
from presidio_anonymizer import AnonymizerEngine
16+
from presidio_analyzer.nlp_engine import SpacyNlpEngine
17+
18+
19+
class PresidioPIIDetectionEntry(EvaluatorEntry):
20+
input: Optional[str] = None
21+
output: Optional[str] = None
22+
23+
24+
class PresidioEntities(BaseModel):
25+
credit_card: bool = True
26+
crypto: bool = True
27+
date_time: bool = True
28+
email_address: bool = True
29+
iban_code: bool = True
30+
ip_address: bool = True
31+
nrp: bool = True
32+
location: bool = True
33+
person: bool = True
34+
phone_number: bool = True
35+
medical_license: bool = True
36+
url: bool = True
37+
us_bank_number: bool = False
38+
us_driver_license: bool = False
39+
us_itin: bool = False
40+
us_passport: bool = False
41+
us_ssn: bool = False
42+
uk_nhs: bool = False
43+
es_nif: bool = False
44+
es_nie: bool = False
45+
it_fiscal_code: bool = False
46+
it_driver_license: bool = False
47+
it_vat_code: bool = False
48+
it_passport: bool = False
49+
it_identity_card: bool = False
50+
pl_pesel: bool = False
51+
sg_nric_fin: bool = False
52+
sg_uen: bool = False
53+
au_abn: bool = False
54+
au_acn: bool = False
55+
au_tfn: bool = False
56+
au_medicare: bool = False
57+
in_pan: bool = False
58+
in_aadhaar: bool = False
59+
in_vehicle_registration: bool = False
60+
in_voter: bool = False
61+
in_passport: bool = False
62+
fi_personal_identity_code: bool = False
63+
64+
65+
class PresidioPIIDetectionSettings(EvaluatorSettings):
66+
entities: PresidioEntities = Field(
67+
default=PresidioEntities(),
68+
description="The types of PII to check for in the input.",
69+
)
70+
min_threshold: int = Field(
71+
default=0.5,
72+
description="The minimum confidence required for failing the evaluation on a PII match.",
73+
)
74+
75+
76+
class PresidioPIIDetectionResult(EvaluationResult):
77+
score: float = Field(description="Amount of PII detected, 0 means no PII detected")
78+
passed: Optional[bool] = Field(
79+
description="If true then no PII was detected, if false then at least one PII was detected",
80+
default=None,
81+
)
82+
raw_response: dict[str, Any]
83+
84+
85+
class PresidioPIIDetectionEvaluator(
86+
BaseEvaluator[
87+
PresidioPIIDetectionEntry,
88+
PresidioPIIDetectionSettings,
89+
PresidioPIIDetectionResult,
90+
]
91+
):
92+
"""
93+
Detects personally identifiable information in text, including phone numbers, email addresses, and
94+
social security numbers. It allows customization of the detection threshold and the specific types of PII to check.
95+
"""
96+
97+
name = "Presidio PII Detection"
98+
category = "safety"
99+
env_vars = []
100+
default_settings = PresidioPIIDetectionSettings()
101+
docs_url = "https://microsoft.github.io/presidio"
102+
is_guardrail = True
103+
104+
@classmethod
105+
def preload(cls):
106+
try:
107+
spacy.load("en_core_web_lg")
108+
except Exception:
109+
spacy.cli.download("en_core_web_lg") # type: ignore
110+
spacy.load("en_core_web_lg")
111+
cls.analyzer = AnalyzerEngine(
112+
nlp_engine=SpacyNlpEngine(
113+
models=[{"lang_code": "en", "model_name": "en_core_web_lg"}]
114+
)
115+
)
116+
117+
super().preload()
118+
119+
def evaluate(self, entry: PresidioPIIDetectionEntry) -> SingleEvaluationResult:
120+
content = "\n\n".join([entry.input or "", entry.output or ""]).strip()
121+
if not content:
122+
return EvaluationResultSkipped(details="Input and output are both empty")
123+
124+
settings_entities = self.settings.entities.model_dump()
125+
entities = [
126+
info_type.upper()
127+
for info_type in settings_entities.keys()
128+
if settings_entities[info_type]
129+
]
130+
131+
if len(content) > 524288:
132+
raise ValueError(
133+
"Content exceeds the maximum length of 524288 bytes allowed by PII Detection"
134+
)
135+
136+
results = self.analyzer.analyze(text=content, entities=entities, language="en")
137+
results = [
138+
result for result in results if result.score >= self.settings.min_threshold
139+
]
140+
141+
findings = [
142+
f"{result.entity_type} (likelihood: {result.score})" for result in results
143+
]
144+
145+
anonymizer = AnonymizerEngine()
146+
anonymized_text = anonymizer.anonymize(
147+
text=content,
148+
analyzer_results=results, # type: ignore
149+
)
150+
151+
return PresidioPIIDetectionResult(
152+
score=len(results),
153+
passed=len(results) == 0,
154+
details=(
155+
None if len(results) == 0 else f"PII detected: {', '.join(findings)}"
156+
),
157+
raw_response={
158+
"results": results,
159+
"anonymized": anonymized_text.text,
160+
},
161+
)

0 commit comments

Comments
 (0)