@@ -73,7 +73,7 @@ def _ensure_connection(self):
     def _run_coroutine(self, coro, timeout: Optional[float] = None):
         """Run a coroutine in the event loop thread with timeout and error handling."""
         self._ensure_connection()
-
+
         try:
             future = asyncio.run_coroutine_threadsafe(coro, self._loop)
             return future.result(timeout=timeout)
@@ -123,10 +123,10 @@ def close(self):
             # Clean up
             self._loop = None
             self._thread = None
-
+
             # Shutdown executor
             self._executor.shutdown(wait=False)
-
+
         except Exception as e:
             logger.error(f"Error during client cleanup: {str(e)}")
         finally:
@@ -139,25 +139,40 @@ def generate(
         stream: bool = False,
         max_length: Optional[int] = None,
         temperature: float = 0.7,
-        top_p: float = 0.9
+        top_p: float = 0.9,
+        repetition_penalty: float = 1.15,  # Increased repetition penalty for better quality
+        top_k: int = 80  # Added top_k parameter for better quality
     ) -> Union[str, Generator[str, None, None]]:
         """
-        Generate text using the model.
+        Generate text using the model with improved quality settings.
 
         Args:
             prompt: The prompt to generate text from
             model_id: Optional model ID to use
             stream: Whether to stream the response
-            max_length: Maximum length of the generated text
+            max_length: Maximum length of the generated text (defaults to 4096 if None)
             temperature: Temperature for sampling
             top_p: Top-p for nucleus sampling
+            repetition_penalty: Penalty for repetition (higher values = less repetition)
 
         Returns:
             If stream=False, returns the generated text as a string.
             If stream=True, returns a generator that yields chunks of text.
         """
+        # Use a higher max_length by default to ensure complete responses
+        if max_length is None:
+            max_length = 4096  # Default to 4096 tokens for more complete responses
+
         if stream:
-            return self.stream_generate(prompt, model_id, max_length, temperature, top_p)
+            return self.stream_generate(
+                prompt=prompt,
+                model_id=model_id,
+                max_length=max_length,
+                temperature=temperature,
+                top_p=top_p,
+                repetition_penalty=repetition_penalty,
+                top_k=top_k
+            )
 
         return self._run_coroutine(
             self._async_client.generate(
@@ -166,7 +181,10 @@ def generate(
                 stream=False,
                 max_length=max_length,
                 temperature=temperature,
-                top_p=top_p
+                top_p=top_p,
+                repetition_penalty=repetition_penalty,
+                top_k=top_k,
+                timeout=180.0  # Increased timeout for more complete responses (3 minutes)
             )
         )
 
@@ -177,22 +195,29 @@ def stream_generate(
         max_length: Optional[int] = None,
         temperature: float = 0.7,
         top_p: float = 0.9,
-        timeout: float = 60.0
+        timeout: float = 300.0,  # Increased timeout for more complete responses (5 minutes)
+        repetition_penalty: float = 1.15,  # Increased repetition penalty for better quality
+        top_k: int = 80  # Added top_k parameter for better quality
     ) -> Generator[str, None, None]:
         """
-        Stream text generation.
+        Stream text generation with improved quality and reliability.
 
         Args:
             prompt: The prompt to generate text from
             model_id: Optional model ID to use
-            max_length: Maximum length of the generated text
+            max_length: Maximum length of the generated text (defaults to 4096 if None)
             temperature: Temperature for sampling
             top_p: Top-p for nucleus sampling
             timeout: Request timeout in seconds
+            repetition_penalty: Penalty for repetition (higher values = less repetition)
 
         Returns:
             A generator that yields chunks of text as they are generated.
         """
+        # Use a higher max_length by default to ensure complete responses
+        if max_length is None:
+            max_length = 4096  # Default to 4096 tokens for more complete responses
+
         # Create a queue to pass data between the async and sync worlds
         queue = asyncio.Queue()
         stop_event = threading.Event()
@@ -206,7 +231,10 @@ async def producer():
                     max_length=max_length,
                     temperature=temperature,
                     top_p=top_p,
-                    timeout=timeout
+                    timeout=timeout,
+                    retry_count=3,  # Increased retry count for better reliability
+                    repetition_penalty=repetition_penalty,  # Pass the repetition penalty parameter
+                    top_k=top_k  # Pass the top_k parameter
                 ):
                     await queue.put(chunk)
 
@@ -250,25 +278,41 @@ def chat(
         stream: bool = False,
         max_length: Optional[int] = None,
         temperature: float = 0.7,
-        top_p: float = 0.9
+        top_p: float = 0.9,
+        repetition_penalty: float = 1.15,  # Increased repetition penalty for better quality
+        top_k: int = 80  # Added top_k parameter for better quality
     ) -> Union[Dict[str, Any], Generator[Dict[str, Any], None, None]]:
         """
-        Chat completion.
+        Chat completion with improved quality settings.
 
         Args:
             messages: List of message dictionaries with 'role' and 'content' keys
             model_id: Optional model ID to use
             stream: Whether to stream the response
-            max_length: Maximum length of the generated text
+            max_length: Maximum length of the generated text (defaults to 4096 if None)
             temperature: Temperature for sampling
             top_p: Top-p for nucleus sampling
+            repetition_penalty: Penalty for repetition (higher values = less repetition)
 
         Returns:
             If stream=False, returns the chat completion response.
             If stream=True, returns a generator that yields chunks of the response.
         """
+        # Use a higher max_length by default to ensure complete responses
+        if max_length is None:
+            max_length = 4096  # Default to 4096 tokens for more complete responses
+
         if stream:
-            return self.stream_chat(messages, model_id, max_length, temperature, top_p)
+            return self.stream_chat(
+                messages=messages,
+                model_id=model_id,
+                max_length=max_length,
+                temperature=temperature,
+                top_p=top_p,
+                timeout=300.0,  # Increased timeout for more complete responses (5 minutes)
+                repetition_penalty=repetition_penalty,
+                top_k=top_k
+            )
 
         return self._run_coroutine(
             self._async_client.chat(
@@ -277,7 +321,10 @@ def chat(
                 stream=False,
                 max_length=max_length,
                 temperature=temperature,
-                top_p=top_p
+                top_p=top_p,
+                timeout=180.0,  # Increased timeout for more complete responses (3 minutes)
+                repetition_penalty=repetition_penalty,
+                top_k=top_k
             )
         )
 
@@ -287,21 +334,29 @@ def stream_chat(
         model_id: Optional[str] = None,
         max_length: Optional[int] = None,
         temperature: float = 0.7,
-        top_p: float = 0.9
+        top_p: float = 0.9,
+        timeout: float = 300.0,  # Increased timeout for more complete responses (5 minutes)
+        repetition_penalty: float = 1.15,  # Added repetition penalty for better quality
+        top_k: int = 80  # Added top_k parameter for better quality
     ) -> Generator[Dict[str, Any], None, None]:
         """
-        Stream chat completion.
+        Stream chat completion with improved quality and reliability.
 
         Args:
             messages: List of message dictionaries with 'role' and 'content' keys
             model_id: Optional model ID to use
-            max_length: Maximum length of the generated text
+            max_length: Maximum length of the generated text (defaults to 4096 if None)
             temperature: Temperature for sampling
             top_p: Top-p for nucleus sampling
+            timeout: Request timeout in seconds
 
         Returns:
             A generator that yields chunks of the chat completion response.
         """
+        # Use a higher max_length by default to ensure complete responses
+        if max_length is None:
+            max_length = 4096  # Default to 4096 tokens for more complete responses
+
         # Create a queue to pass data between the async and sync worlds
         queue = asyncio.Queue()
         stop_event = threading.Event()
@@ -314,7 +369,11 @@ async def producer():
                     model_id=model_id,
                     max_length=max_length,
                     temperature=temperature,
-                    top_p=top_p
+                    top_p=top_p,
+                    timeout=timeout,
+                    retry_count=3,  # Increased retry count for better reliability
+                    repetition_penalty=repetition_penalty,
+                    top_k=top_k
                 ):
                     await queue.put(chunk)
 
@@ -357,28 +416,39 @@ def batch_generate(
         model_id: Optional[str] = None,
         max_length: Optional[int] = None,
         temperature: float = 0.7,
-        top_p: float = 0.9
+        top_p: float = 0.9,
+        repetition_penalty: float = 1.15,  # Increased repetition penalty for better quality
+        top_k: int = 80,  # Added top_k parameter for better quality
+        timeout: float = 300.0  # Added timeout parameter (5 minutes)
     ) -> Dict[str, List[str]]:
         """
-        Generate text for multiple prompts in parallel.
+        Generate text for multiple prompts in parallel with improved quality settings.
 
         Args:
             prompts: List of prompts to generate text from
             model_id: Optional model ID to use
-            max_length: Maximum length of the generated text
+            max_length: Maximum length of the generated text (defaults to 4096 if None)
             temperature: Temperature for sampling
             top_p: Top-p for nucleus sampling
+            repetition_penalty: Penalty for repetition (higher values = less repetition)
 
         Returns:
             Dictionary with the generated responses.
         """
+        # Use a higher max_length by default to ensure complete responses
+        if max_length is None:
+            max_length = 4096  # Default to 4096 tokens for more complete responses
+
         return self._run_coroutine(
             self._async_client.batch_generate(
                 prompts=prompts,
                 model_id=model_id,
                 max_length=max_length,
                 temperature=temperature,
-                top_p=top_p
+                top_p=top_p,
+                repetition_penalty=repetition_penalty,
+                top_k=top_k,
+                timeout=timeout  # Use the provided timeout parameter
             )
         )
 
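For reference, a minimal usage sketch of the updated synchronous API. The wrapper class name `LLMClient` and its no-argument constructor are assumptions for illustration; only the method bodies appear in this diff, and the example prompt values are placeholders.

```python
# Hypothetical usage of the updated wrapper (class name LLMClient is assumed).
client = LLMClient()
try:
    # Non-streaming generation: max_length now defaults to 4096 when left as None,
    # and repetition_penalty / top_k are forwarded to the async client.
    text = client.generate(
        "Summarize the event loop pattern in one paragraph.",
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.15,
        top_k=80,
    )
    print(text)

    # Streaming generation: chunks are yielded as they arrive from the background loop.
    for chunk in client.stream_generate("Write a haiku about queues."):
        print(chunk, end="", flush=True)
finally:
    client.close()
```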