Commit 9530cac

MeluXina user authored and committed
contextplus opt-in, mockOpenAI improvements, improved retry logic
1 parent 9198ffb commit 9530cac

3 files changed: 41 additions & 17 deletions


mallm/models/Chat.py

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@
 from langchain_core.language_models.llms import LLM
 from langchain_core.outputs import LLMResult
 from langchain_core.prompt_values import PromptValue
-from openai import APIError, RateLimitError, OpenAI
+from openai import APIError, APIConnectionError, RateLimitError, OpenAI

 class Chat(LLM):  # type: ignore
     """A custom chat model that queries the chat API of HuggingFace Text Generation Inference
@@ -132,7 +132,7 @@ def _call(  # type: ignore
                 collected_messages.append(message_str)
                 log_prob_sum = log_prob_sum / len(collected_messages)
                 break
-            except APIError as e:
+            except (APIError, APIConnectionError, RateLimitError) as e:
                 # Handle API error here, e.g. retry or log
                 retries += 1
                 if retries < 5:
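Catching APIConnectionError and RateLimitError explicitly makes the retry loop robust to transient connection drops and rate limits even on client versions where those are not APIError subclasses. Below is a minimal sketch of the retry pattern this hunk touches, assuming an OpenAI-compatible endpoint; MAX_RETRIES, the backoff schedule, and the endpoint URL are illustrative and not taken from the repository:

import time

from openai import OpenAI, APIError, APIConnectionError, RateLimitError

MAX_RETRIES = 5  # illustrative; the hunk above only shows "if retries < 5"
client = OpenAI(base_url="http://localhost:8080/v1", api_key="-")  # assumed TGI-style endpoint

def query_with_retries(messages: list[dict[str, str]]) -> str:
    retries = 0
    while True:
        try:
            response = client.chat.completions.create(model="tgi", messages=messages)
            return response.choices[0].message.content or ""
        except (APIError, APIConnectionError, RateLimitError):
            # Connection drops and rate limits are retried instead of crashing the run.
            retries += 1
            if retries >= MAX_RETRIES:
                raise
            time.sleep(2 ** retries)  # simple exponential backoff (illustrative)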

mallm/models/MockOpenAI.py

Lines changed: 24 additions & 7 deletions
@@ -64,13 +64,30 @@ def _extract_final_solution_from_messages(messages: list[dict[str, str]]) -> str

     # Extraction prompts
     if "extract the final solution" in last.lower():
-        # Try to extract from "Your previous response:" if present
-        m = re.search(r"previous response:\s*(.*)$", last, re.IGNORECASE | re.DOTALL)
-        if m:
-            # Return the previous response as-is (test-friendly)
-            return m.group(1).strip()
-        # Fallback
-        return "Final Solution"
+        # Prefer extracting from the message that actually contains "Your previous response:"
+        src = ""
+        for msg in reversed(messages):
+            content = msg.get("content", "") or ""
+            if re.search(r"\bprevious response:\b", content, re.IGNORECASE):
+                src = content
+                break
+        if not src:
+            src = last
+
+        m = re.search(r"previous response:\s*(.*)$", src, re.IGNORECASE | re.DOTALL)
+        extracted = (m.group(1) if m else src).strip()
+
+        # If the extraction instruction was merged into the same user message, strip it off.
+        cut = re.search(r"\n\s*extract the final solution\b", extracted, re.IGNORECASE)
+        if cut:
+            extracted = extracted[: cut.start()].strip()
+
+        # If the previous response contains a "Final Solution:" marker, return only the solution part.
+        m2 = re.search(r"final solution:\s*(.*)$", extracted, re.IGNORECASE | re.DOTALL)
+        if m2:
+            return m2.group(1).strip()
+
+        return extracted or "Final Solution"

     # Task-specific simple heuristics
     if "capital of france" in joined.lower():

mallm/scheduler.py

Lines changed: 15 additions & 8 deletions
@@ -22,8 +22,6 @@
 import langchain
 import langchain_core
 import openai
-
-from contextplus import context
 from mallm.models.MockOpenAI import MockOpenAI

 try:
@@ -373,8 +371,9 @@ def manage_discussions(self, client: httpx.Client) -> None:
         all_model = None
         if not str(self.config.endpoint_url).startswith("mock://") and SentenceTransformer is not None:
             try:
-                paraphrase_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
-                all_model = SentenceTransformer("all-MiniLM-L6-v2")
+                # Force CPU to avoid contention with the main LLM GPU server
+                paraphrase_model = SentenceTransformer("paraphrase-MiniLM-L6-v2", device="cpu")
+                all_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
             except Exception:
                 paraphrase_model = None
                 all_model = None
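Pinning the two MiniLM sentence-transformer models to the CPU keeps their weights out of GPU memory, which would otherwise be contended with the server hosting the main LLM. The following sketch only illustrates the effect of device="cpu"; the similarity computation is invented for illustration and does not reflect how the scheduler consumes the embeddings:

from sentence_transformers import SentenceTransformer, util

# Same two models as in the hunk above, kept on CPU so they do not claim GPU memory.
paraphrase_model = SentenceTransformer("paraphrase-MiniLM-L6-v2", device="cpu")
all_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

embeddings = all_model.encode(
    ["The agents reached a consensus.", "The agents agreed on an answer."],
    convert_to_tensor=True,  # tensors stay on the CPU device chosen above
)
print(util.cos_sim(embeddings[0], embeddings[1]).item())  # similarity score in [-1, 1]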
@@ -390,11 +389,19 @@ def worker_paraphrase_function(
             return [[1.0, 0.0] for _ in input_data]

         def worker_context_function(input_data: str) -> str:
-            # Acquire the lock before using the model
-            text: str
+            # Optionally disable heavy context retrieval to avoid GPU memory conflicts.
+            # Default: disabled, unless explicitly enabled via MALLM_ENABLE_CONTEXT=1.
+            if os.environ.get("MALLM_DISABLE_CONTEXT", "0") == "1":
+                return ""
+            if os.environ.get("MALLM_ENABLE_CONTEXT", "0") != "1":
+                return ""
+            # Acquire the lock before using the model; import lazily to avoid loading on module import.
             with context_lock:
-                text = context(input_data)
-                return text
+                try:
+                    from contextplus import context as cp_context  # type: ignore
+                    return cp_context(input_data)
+                except Exception:
+                    return ""

         def worker_persona_diversity_function(
             input_data: list[str],
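Context retrieval through contextplus is now opt-in: it runs only when MALLM_ENABLE_CONTEXT=1 and MALLM_DISABLE_CONTEXT is not set to 1, and the contextplus import is deferred into the worker so the package is not needed unless the feature is actually used. A minimal sketch of the resulting gating, assuming the same reading of the two environment variables as in worker_context_function above:

import os

def context_enabled() -> bool:
    # Mirrors the gating in worker_context_function: an explicit kill switch,
    # plus an explicit opt-in before contextplus is ever imported.
    if os.environ.get("MALLM_DISABLE_CONTEXT", "0") == "1":
        return False
    return os.environ.get("MALLM_ENABLE_CONTEXT", "0") == "1"

print(context_enabled())  # False by default: no contextplus import, no extra model loaded
os.environ["MALLM_ENABLE_CONTEXT"] = "1"
print(context_enabled())  # True: the worker will lazily import contextplus inside context_lock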
