
Commit e38daf1

Disable nested progress bar also for haystack and everyone else
1 parent: 8fa1acf

11 files changed: +84 −89 lines


evaluators/haystack/langevals_haystack/faithfulness.py

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ def evaluate(self, entry: HaystackFaithfulnessEntry) -> SingleEvaluationResult:
         questions = [entry.input]
         contexts = [entry.contexts]
         predicted_answers = [entry.output]
-        evaluator = FaithfulnessEvaluator()
+        evaluator = FaithfulnessEvaluator(progress_bar=False)

         total_tokens = calculate_total_tokens(self.settings.model, entry)
         max_tokens = min(self.settings.max_tokens, 16384)
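The change above passes progress_bar=False so Haystack's evaluator does not draw its own tqdm bar nested inside the batch-level bar that langevals already renders. A minimal standalone sketch of that call path (assuming Haystack 2.x's FaithfulnessEvaluator component and an OPENAI_API_KEY in the environment; the question/context strings are illustrative):

from haystack.components.evaluators import FaithfulnessEvaluator

# Keep the component's own progress bar off so only the outer batch bar shows.
evaluator = FaithfulnessEvaluator(progress_bar=False)

result = evaluator.run(
    questions=["What is the capital of France?"],     # maps to [entry.input]
    contexts=[["Paris is the capital of France."]],   # maps to [entry.contexts]
    predicted_answers=["Paris is the capital."],      # maps to [entry.output]
)
print(result)  # dict with per-sample and aggregate faithfulness scores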

evaluators/ragas/langevals_ragas/answer_correctness.py

Lines changed: 5 additions & 3 deletions
@@ -4,7 +4,7 @@
     SingleEvaluationResult,
     EvaluationResultSkipped,
 )
-from .lib.common import RagasEvaluator, env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult


 class RagasAnswerCorrectnessEntry(EvaluatorEntry):
@@ -14,7 +14,7 @@ class RagasAnswerCorrectnessEntry(EvaluatorEntry):


 class RagasAnswerCorrectnessEvaluator(
-    RagasEvaluator[RagasAnswerCorrectnessEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasAnswerCorrectnessEntry, RagasSettings, RagasResult]
 ):
     """
     This evaluator focuses on assessing how pertinent the generated answer is to the given prompt. Higher scores indicate better Correctness.
@@ -24,7 +24,9 @@ class RagasAnswerCorrectnessEvaluator(
     category = "rag"
     env_vars = env_vars
     default_settings = RagasSettings()
-    docs_url = "https://docs.ragas.io/en/latest/concepts/metrics/answer_correctness.html"
+    docs_url = (
+        "https://docs.ragas.io/en/latest/concepts/metrics/answer_correctness.html"
+    )
     is_guardrail = False

     def evaluate(self, entry: RagasAnswerCorrectnessEntry) -> SingleEvaluationResult:

evaluators/ragas/langevals_ragas/answer_relevancy.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import RagasEvaluator, env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult


 class RagasAnswerRelevancyEntry(EvaluatorEntry):
@@ -12,7 +12,7 @@ class RagasAnswerRelevancyEntry(EvaluatorEntry):


 class RagasAnswerRelevancyEvaluator(
-    RagasEvaluator[RagasAnswerRelevancyEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasAnswerRelevancyEntry, RagasSettings, RagasResult]
 ):
     """
     This evaluator focuses on assessing how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy.

evaluators/ragas/langevals_ragas/context_precision.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import RagasEvaluator, env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult


 class RagasContextPrecisionEntry(EvaluatorEntry):
@@ -13,7 +13,7 @@ class RagasContextPrecisionEntry(EvaluatorEntry):


 class RagasContextPrecisionEvaluator(
-    RagasEvaluator[RagasContextPrecisionEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasContextPrecisionEntry, RagasSettings, RagasResult]
 ):
     """
     This metric evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Higher scores indicate better precision.

evaluators/ragas/langevals_ragas/context_recall.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import RagasEvaluator, env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult


 class RagasContextRecallEntry(EvaluatorEntry):
@@ -12,7 +12,7 @@ class RagasContextRecallEntry(EvaluatorEntry):


 class RagasContextRecallEvaluator(
-    RagasEvaluator[RagasContextRecallEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasContextRecallEntry, RagasSettings, RagasResult]
 ):
     """
     This evaluator measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. Higher values indicate better performance.

evaluators/ragas/langevals_ragas/context_relevancy.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import RagasEvaluator, env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult


 class RagasContextRelevancyEntry(EvaluatorEntry):
@@ -12,7 +12,7 @@ class RagasContextRelevancyEntry(EvaluatorEntry):


 class RagasContextRelevancyEvaluator(
-    RagasEvaluator[RagasContextRelevancyEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasContextRelevancyEntry, RagasSettings, RagasResult]
 ):
     """
     This metric gauges the relevancy of the retrieved context, calculated based on both the question and contexts. The values fall within the range of (0, 1), with higher values indicating better relevancy.

evaluators/ragas/langevals_ragas/context_utilization.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import RagasEvaluator, env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult


 class RagasContextUtilizationEntry(EvaluatorEntry):
@@ -13,7 +13,7 @@ class RagasContextUtilizationEntry(EvaluatorEntry):


 class RagasContextUtilizationEvaluator(
-    RagasEvaluator[RagasContextUtilizationEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasContextUtilizationEntry, RagasSettings, RagasResult]
 ):
     """
     This metric evaluates whether all of the output relevant items present in the contexts are ranked higher or not. Higher scores indicate better utilization.

evaluators/ragas/langevals_ragas/faithfulness.py

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import RagasEvaluator, env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import BaseEvaluator, env_vars, evaluate_ragas, RagasSettings, RagasResult


 class RagasFaithfulnessEntry(EvaluatorEntry):
@@ -13,7 +13,7 @@ class RagasFaithfulnessEntry(EvaluatorEntry):


 class RagasFaithfulnessEvaluator(
-    RagasEvaluator[RagasFaithfulnessEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasFaithfulnessEntry, RagasSettings, RagasResult]
 ):
     """
     This evaluator assesses the extent to which the generated answer is consistent with the provided context. Higher scores indicate better faithfulness to the context, useful for detecting hallucinations.

evaluators/ragas/langevals_ragas/lib/common.py

Lines changed: 0 additions & 47 deletions
@@ -1,19 +1,13 @@
-from contextlib import contextmanager
 import math
 import os
-import time
 from typing import List, Literal, Optional
-import warnings
 from langevals_core.base_evaluator import (
     BaseEvaluator,
     EvaluationResult,
     EvaluatorSettings,
     Money,
     EvaluationResultSkipped,
     EvaluatorEntry,
-    TEntry,
-    TResult,
-    TSettings,
 )
 from pydantic import Field
 from ragas import evaluate
@@ -30,18 +24,12 @@
 )
 from langchain_community.callbacks import get_openai_callback
 from datasets import Dataset
-from tqdm import tqdm

 from langevals_ragas.lib.model_to_langchain import (
     embeddings_model_to_langchain,
     model_to_langchain,
 )

-with warnings.catch_warnings():
-    warnings.simplefilter("ignore")
-    from tqdm.notebook import tqdm as tqdm_notebook
-from functools import partialmethod
-
 from typing import List, Optional
 from datasets import Dataset
 from ragas import evaluate
@@ -89,18 +77,6 @@ class _GenericEvaluatorEntry(EvaluatorEntry):
     contexts: Optional[List[str]]


-class RagasEvaluator(BaseEvaluator[TEntry, TSettings, TResult]):
-    def _evaluate_entry(self, *args, **kwargs):
-        disable_tqdm()
-        return super()._evaluate_entry(*args, **kwargs)
-
-    def evaluate_batch(self, *args, **kwargs):
-        restore_tqdm()
-        results = super().evaluate_batch(*args, **kwargs)
-        restore_tqdm()
-        return results
-
-
 def evaluate_ragas(
     evaluator: BaseEvaluator,
     metric: str,
@@ -182,26 +158,3 @@ def evaluate_ragas(
         score=score,
         cost=Money(amount=cb.total_cost, currency="USD"),
     )
-
-
-_original_tqdm_init = tqdm.__init__
-_original_tqdm_notebook_init = tqdm_notebook.__init__
-_tqdm_disabled_once = False
-
-
-# Hack to disable tqdm output from Ragas and use the one from langevals instead
-def disable_tqdm():
-    global _tqdm_disabled_once
-    if not _tqdm_disabled_once:
-        time.sleep(0.1)
-    _tqdm_disabled_once = True
-    tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)  # type: ignore
-    tqdm_notebook.__init__ = partialmethod(tqdm_notebook.__init__, disable=True)  # type: ignore
-
-
-def restore_tqdm():
-    global _tqdm_disabled_once
-    _tqdm_disabled_once = False
-
-    tqdm.__init__ = _original_tqdm_init
-    tqdm_notebook.__init__ = _original_tqdm_notebook_init
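For context, the deleted block worked by monkey-patching tqdm.__init__ with functools.partialmethod so every bar Ragas spawned was created already disabled, and restoring the original initializer afterwards. A self-contained sketch of that (now removed) mechanism, with illustrative helper names:

import time
from functools import partialmethod
from tqdm import tqdm

_original_tqdm_init = tqdm.__init__

def silence_tqdm():
    # Every tqdm(...) constructed after this point gets disable=True injected.
    tqdm.__init__ = partialmethod(_original_tqdm_init, disable=True)  # type: ignore

def restore_tqdm():
    tqdm.__init__ = _original_tqdm_init

silence_tqdm()
for _ in tqdm(range(3)):  # iterates normally but renders no bar
    time.sleep(0.01)
restore_tqdm()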

langevals/evaluation.py

Lines changed: 21 additions & 16 deletions
@@ -47,26 +47,29 @@ def to_pandas(self):

         for i, evaluator in enumerate(self.evaluators):
             evaluator_definitions = get_evaluator_definitions(evaluator)
-            for result in self.results[i]:
+            present_keys = set()
+            for j, result in enumerate(self.results[i]):
                 result_dict = result.model_dump()
-                passed = result_dict.get("passed", None)
-                score = result_dict.get("score", None)
                 status = result_dict.get("status", None)
-                details = result_dict.get("details", result_dict.get("message", None))
-
-                if evaluator_definitions.evaluator_name not in records:
-                    records[evaluator_definitions.evaluator_name] = []
-                records[evaluator_definitions.evaluator_name].append(
-                    status
-                    if status != "processed"
-                    else passed if passed is not None else score
-                )
+                for key in ["passed", "score", "label"]:
+                    if result_dict.get(key, None) is not None:
+                        present_keys.add(key)
+
+                for key in present_keys:
+                    value = result_dict.get(key, None)
+                    key_column = f"{evaluator_definitions.evaluator_name}_{key}"
+                    if key_column not in records:
+                        records[key_column] = []
+                    records[key_column].append(
+                        status
+                        if status != "processed"
+                        else value
+                    )

+                details = result_dict.get("details", result_dict.get("message", None))
                 details_column = f"{evaluator_definitions.evaluator_name}_details"
                 if details is not None and details_column not in records:
-                    records[details_column] = [None] * (
-                        len(records[evaluator_definitions.evaluator_name]) - 1
-                    )
+                    records[details_column] = [None] * j
                 if details_column in records:
                     records[details_column].append(details)

@@ -106,7 +109,9 @@ def set_child_executor(executor: ThreadPoolExecutor):
     not_done = list(future_to_index.keys())
     try:
         while not_done:
-            done, not_done = wait(not_done, timeout=0.1, return_when=FIRST_COMPLETED)
+            done, not_done = wait(
+                not_done, timeout=0.1, return_when=FIRST_COMPLETED
+            )
             for future in done:
                 idx = future_to_index[future]
                 result_set[idx] = future.result()
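With the to_pandas rework, each populated result field gets its own <evaluator_name>_<key> column (for passed, score, and label) instead of one column named after the evaluator; non-processed rows carry the status string, and the <evaluator_name>_details column is backfilled with None up to index j when details first appear. A rough sketch of the resulting frame, using a hypothetical evaluator name and values:

import pandas as pd

# Hypothetical output shape after this commit: one column per populated key,
# status strings (e.g. "skipped") in place of values for non-processed rows.
df = pd.DataFrame(
    {
        "example_evaluator_passed": [True, False, "skipped"],
        "example_evaluator_score": [0.92, 0.41, "skipped"],
        "example_evaluator_details": [None, "low overlap with context", None],
    }
)
print(df)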
