Skip to content

Commit f884b80

Browse files
committed
Download the spaCy model to a specific folder to fix Docker startup not finding it on the Lambda and trying to download it all over again
1 parent 895bc2e commit f884b80

File tree

3 files changed

+11
-7
lines changed

3 files changed

+11
-7
lines changed

.dockerignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ langevals.code-workspace
55
.env
66
__pycache__
77
dist
8-
openapi.json
8+
openapi.json
9+
*/**/.cache

evaluators/presidio/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
.cache

evaluators/presidio/langevals_presidio/pii_detection.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import json
2+
import os
23
from typing import Any, Literal, Optional
34
from langevals_core.base_evaluator import (
45
BaseEvaluator,
@@ -100,11 +101,14 @@ class PresidioPIIDetectionEvaluator(
100101

101102
@classmethod
102103
def preload(cls):
104+
cache_dir = os.path.join(os.path.dirname(__file__), ".cache")
105+
os.makedirs(cache_dir, exist_ok=True)
103106
try:
104-
spacy.load("en_core_web_lg")
107+
spacy.load(os.path.join(cache_dir, "en_core_web_lg"))
105108
except Exception:
106-
spacy.cli.download("en_core_web_lg") # type: ignore
107-
spacy.load("en_core_web_lg")
109+
spacy.cli.download("en_core_web_lg", False, False, "--no-cache-dir") # type: ignore
110+
nlp = spacy.load("en_core_web_lg")
111+
nlp.to_disk(os.path.join(cache_dir, "en_core_web_lg"))
108112
cls.analyzer = AnalyzerEngine(
109113
nlp_engine=SpacyNlpEngine(
110114
models=[{"lang_code": "en", "model_name": "en_core_web_lg"}]
@@ -130,9 +134,7 @@ def evaluate(self, entry: PresidioPIIDetectionEntry) -> SingleEvaluationResult:
130134
"Content exceeds the maximum length of 524288 bytes allowed by PII Detection"
131135
)
132136

133-
results = self.analyzer.analyze(
134-
text=content, entities=entities, language="en"
135-
)
137+
results = self.analyzer.analyze(text=content, entities=entities, language="en")
136138
results = [
137139
result for result in results if result.score >= self.settings.min_threshold
138140
]

0 commit comments

Comments
 (0)