
Commit e38daf1

Disable nested progress bar also for haystack and everyone else
1 parent: 8fa1acf

11 files changed: +84 −89 lines


evaluators/haystack/langevals_haystack/faithfulness.py

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ def evaluate(self, entry: HaystackFaithfulnessEntry) -> SingleEvaluationResult:
         questions = [entry.input]
         contexts = [entry.contexts]
         predicted_answers = [entry.output]
-        evaluator = FaithfulnessEvaluator()
+        evaluator = FaithfulnessEvaluator(progress_bar=False)

         total_tokens = calculate_total_tokens(self.settings.model, entry)
         max_tokens = min(self.settings.max_tokens, 16384)
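The change above passes progress_bar=False so Haystack's evaluator does not draw its own tqdm bar nested inside the batch-level bar that langevals already renders. A minimal standalone sketch of that call path (assuming Haystack 2.x's FaithfulnessEvaluator component and an OPENAI_API_KEY in the environment; the question/context strings are illustrative):

from haystack.components.evaluators import FaithfulnessEvaluator

# Keep the component's own progress bar off so only the outer batch bar shows.
evaluator = FaithfulnessEvaluator(progress_bar=False)

result = evaluator.run(
    questions=["What is the capital of France?"],     # maps to [entry.input]
    contexts=[["Paris is the capital of France."]],   # maps to [entry.contexts]
    predicted_answers=["Paris is the capital."],      # maps to [entry.output]
)
print(result)  # dict with per-sample and aggregate faithfulness scores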

evaluators/ragas/langevals_ragas/answer_correctness.py

Lines changed: 5 additions & 3 deletions
@@ -4,7 +4,7 @@
     SingleEvaluationResult,
     EvaluationResultSkipped,
 )
-from .lib.common import RagasEvaluator, env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult


 class RagasAnswerCorrectnessEntry(EvaluatorEntry):
@@ -14,7 +14,7 @@ class RagasAnswerCorrectnessEntry(EvaluatorEntry):


 class RagasAnswerCorrectnessEvaluator(
-    RagasEvaluator[RagasAnswerCorrectnessEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasAnswerCorrectnessEntry, RagasSettings, RagasResult]
 ):
     """
     This evaluator focuses on assessing how pertinent the generated answer is to the given prompt. Higher scores indicate better Correctness.
@@ -24,7 +24,9 @@ class RagasAnswerCorrectnessEvaluator(
     category = "rag"
     env_vars = env_vars
     default_settings = RagasSettings()
-    docs_url = "https://docs.ragas.io/en/latest/concepts/metrics/answer_correctness.html"
+    docs_url = (
+        "https://docs.ragas.io/en/latest/concepts/metrics/answer_correctness.html"
+    )
     is_guardrail = False

     def evaluate(self, entry: RagasAnswerCorrectnessEntry) -> SingleEvaluationResult:

evaluators/ragas/langevals_ragas/answer_relevancy.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import RagasEvaluator, env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult


 class RagasAnswerRelevancyEntry(EvaluatorEntry):
@@ -12,7 +12,7 @@ class RagasAnswerRelevancyEntry(EvaluatorEntry):


 class RagasAnswerRelevancyEvaluator(
-    RagasEvaluator[RagasAnswerRelevancyEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasAnswerRelevancyEntry, RagasSettings, RagasResult]
 ):
     """
     This evaluator focuses on assessing how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy.

evaluators/ragas/langevals_ragas/context_precision.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import RagasEvaluator, env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult


 class RagasContextPrecisionEntry(EvaluatorEntry):
@@ -13,7 +13,7 @@ class RagasContextPrecisionEntry(EvaluatorEntry):


 class RagasContextPrecisionEvaluator(
-    RagasEvaluator[RagasContextPrecisionEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasContextPrecisionEntry, RagasSettings, RagasResult]
 ):
     """
     This metric evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Higher scores indicate better precision.

evaluators/ragas/langevals_ragas/context_recall.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import RagasEvaluator, env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult


 class RagasContextRecallEntry(EvaluatorEntry):
@@ -12,7 +12,7 @@ class RagasContextRecallEntry(EvaluatorEntry):


 class RagasContextRecallEvaluator(
-    RagasEvaluator[RagasContextRecallEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasContextRecallEntry, RagasSettings, RagasResult]
 ):
     """
     This evaluator measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. Higher values indicate better performance.

evaluators/ragas/langevals_ragas/context_relevancy.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import RagasEvaluator, env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult


 class RagasContextRelevancyEntry(EvaluatorEntry):
@@ -12,7 +12,7 @@ class RagasContextRelevancyEntry(EvaluatorEntry):


 class RagasContextRelevancyEvaluator(
-    RagasEvaluator[RagasContextRelevancyEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasContextRelevancyEntry, RagasSettings, RagasResult]
 ):
     """
     This metric gauges the relevancy of the retrieved context, calculated based on both the question and contexts. The values fall within the range of (0, 1), with higher values indicating better relevancy.

evaluators/ragas/langevals_ragas/context_utilization.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import RagasEvaluator, env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult


 class RagasContextUtilizationEntry(EvaluatorEntry):
@@ -13,7 +13,7 @@ class RagasContextUtilizationEntry(EvaluatorEntry):


 class RagasContextUtilizationEvaluator(
-    RagasEvaluator[RagasContextUtilizationEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasContextUtilizationEntry, RagasSettings, RagasResult]
 ):
     """
     This metric evaluates whether all of the output relevant items present in the contexts are ranked higher or not. Higher scores indicate better utilization.

evaluators/ragas/langevals_ragas/faithfulness.py

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import RagasEvaluator, env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import BaseEvaluator, env_vars, evaluate_ragas, RagasSettings, RagasResult


 class RagasFaithfulnessEntry(EvaluatorEntry):
@@ -13,7 +13,7 @@ class RagasFaithfulnessEntry(EvaluatorEntry):


 class RagasFaithfulnessEvaluator(
-    RagasEvaluator[RagasFaithfulnessEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasFaithfulnessEntry, RagasSettings, RagasResult]
 ):
     """
     This evaluator assesses the extent to which the generated answer is consistent with the provided context. Higher scores indicate better faithfulness to the context, useful for detecting hallucinations.

evaluators/ragas/langevals_ragas/lib/common.py

Lines changed: 0 additions & 47 deletions
@@ -1,19 +1,13 @@
-from contextlib import contextmanager
 import math
 import os
-import time
 from typing import List, Literal, Optional
-import warnings
 from langevals_core.base_evaluator import (
     BaseEvaluator,
     EvaluationResult,
     EvaluatorSettings,
     Money,
     EvaluationResultSkipped,
     EvaluatorEntry,
-    TEntry,
-    TResult,
-    TSettings,
 )
 from pydantic import Field
 from ragas import evaluate
@@ -30,18 +24,12 @@
 )
 from langchain_community.callbacks import get_openai_callback
 from datasets import Dataset
-from tqdm import tqdm

 from langevals_ragas.lib.model_to_langchain import (
     embeddings_model_to_langchain,
     model_to_langchain,
 )

-with warnings.catch_warnings():
-    warnings.simplefilter("ignore")
-    from tqdm.notebook import tqdm as tqdm_notebook
-from functools import partialmethod
-
 from typing import List, Optional
 from datasets import Dataset
 from ragas import evaluate
@@ -89,18 +77,6 @@ class _GenericEvaluatorEntry(EvaluatorEntry):
     contexts: Optional[List[str]]


-class RagasEvaluator(BaseEvaluator[TEntry, TSettings, TResult]):
-    def _evaluate_entry(self, *args, **kwargs):
-        disable_tqdm()
-        return super()._evaluate_entry(*args, **kwargs)
-
-    def evaluate_batch(self, *args, **kwargs):
-        restore_tqdm()
-        results = super().evaluate_batch(*args, **kwargs)
-        restore_tqdm()
-        return results
-
-
 def evaluate_ragas(
     evaluator: BaseEvaluator,
     metric: str,
@@ -182,26 +158,3 @@ def evaluate_ragas(
         score=score,
         cost=Money(amount=cb.total_cost, currency="USD"),
     )
-
-
-_original_tqdm_init = tqdm.__init__
-_original_tqdm_notebook_init = tqdm_notebook.__init__
-_tqdm_disabled_once = False
-
-
-# Hack to disable tqdm output from Ragas and use the one from langevals instead
-def disable_tqdm():
-    global _tqdm_disabled_once
-    if not _tqdm_disabled_once:
-        time.sleep(0.1)
-    _tqdm_disabled_once = True
-    tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)  # type: ignore
-    tqdm_notebook.__init__ = partialmethod(tqdm_notebook.__init__, disable=True)  # type: ignore
-
-
-def restore_tqdm():
-    global _tqdm_disabled_once
-    _tqdm_disabled_once = False
-
-    tqdm.__init__ = _original_tqdm_init
-    tqdm_notebook.__init__ = _original_tqdm_notebook_init
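For context, the deleted block worked by monkey-patching tqdm.__init__ with functools.partialmethod so every bar Ragas spawned was created already disabled, and restoring the original initializer afterwards. A self-contained sketch of that (now removed) mechanism, with illustrative helper names:

import time
from functools import partialmethod
from tqdm import tqdm

_original_tqdm_init = tqdm.__init__

def silence_tqdm():
    # Every tqdm(...) constructed after this point gets disable=True injected.
    tqdm.__init__ = partialmethod(_original_tqdm_init, disable=True)  # type: ignore

def restore_tqdm():
    tqdm.__init__ = _original_tqdm_init

silence_tqdm()
for _ in tqdm(range(3)):  # iterates normally but renders no bar
    time.sleep(0.01)
restore_tqdm()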

langevals/evaluation.py

Lines changed: 21 additions & 16 deletions
@@ -47,26 +47,29 @@ def to_pandas(self):

         for i, evaluator in enumerate(self.evaluators):
             evaluator_definitions = get_evaluator_definitions(evaluator)
-            for result in self.results[i]:
+            present_keys = set()
+            for j, result in enumerate(self.results[i]):
                 result_dict = result.model_dump()
-                passed = result_dict.get("passed", None)
-                score = result_dict.get("score", None)
                 status = result_dict.get("status", None)
-                details = result_dict.get("details", result_dict.get("message", None))
-
-                if evaluator_definitions.evaluator_name not in records:
-                    records[evaluator_definitions.evaluator_name] = []
-                records[evaluator_definitions.evaluator_name].append(
-                    status
-                    if status != "processed"
-                    else passed if passed is not None else score
-                )
+                for key in ["passed", "score", "label"]:
+                    if result_dict.get(key, None) is not None:
+                        present_keys.add(key)
+
+                for key in present_keys:
+                    value = result_dict.get(key, None)
+                    key_column = f"{evaluator_definitions.evaluator_name}_{key}"
+                    if key_column not in records:
+                        records[key_column] = []
+                    records[key_column].append(
+                        status
+                        if status != "processed"
+                        else value
+                    )

+                details = result_dict.get("details", result_dict.get("message", None))
                 details_column = f"{evaluator_definitions.evaluator_name}_details"
                 if details is not None and details_column not in records:
-                    records[details_column] = [None] * (
-                        len(records[evaluator_definitions.evaluator_name]) - 1
-                    )
+                    records[details_column] = [None] * j
                 if details_column in records:
                     records[details_column].append(details)

@@ -106,7 +109,9 @@ def set_child_executor(executor: ThreadPoolExecutor):
     not_done = list(future_to_index.keys())
     try:
         while not_done:
-            done, not_done = wait(not_done, timeout=0.1, return_when=FIRST_COMPLETED)
+            done, not_done = wait(
+                not_done, timeout=0.1, return_when=FIRST_COMPLETED
+            )
             for future in done:
                 idx = future_to_index[future]
                 result_set[idx] = future.result()
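With the to_pandas rework, each populated result field gets its own <evaluator_name>_<key> column (for passed, score, and label) instead of one column named after the evaluator; non-processed rows carry the status string, and the <evaluator_name>_details column is backfilled with None up to index j when details first appear. A rough sketch of the resulting frame, using a hypothetical evaluator name and values:

import pandas as pd

# Hypothetical output shape after this commit: one column per populated key,
# status strings (e.g. "skipped") in place of values for non-processed rows.
df = pd.DataFrame(
    {
        "example_evaluator_passed": [True, False, "skipped"],
        "example_evaluator_score": [0.92, 0.41, "skipped"],
        "example_evaluator_details": [None, "low overlap with context", None],
    }
)
print(df)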
