Commit 06ca79d

Added CLI for Response Quality
1 parent 2f754f9 commit 06ca79d

8 files changed (+178 additions, -61 deletions)


CHANGELOG.md

Lines changed: 33 additions & 0 deletions
@@ -2,6 +2,39 @@
 
 All notable changes to LocalLab will be documented in this file.
 
+## [0.5.8] - 2024-05-02
+
+### Added
+
+- Added optional Response Quality Settings section to the CLI configuration
+- Added detailed parameter descriptions for all response quality settings
+- Increased default max_length from 4096 to 8192 tokens for more complete responses
+- Increased default top_k from 50 to 80 for better quality responses
+- Added max_time parameter (default: 120 seconds) to control generation time
+- Improved token-level streaming with larger token batches (4 tokens at a time)
+- Enhanced stop sequence detection to only check for definitive end markers
+- Improved repetition detection to only stop for extreme repetition
+- Added better error recovery for out-of-memory situations
+
+### Changed
+
+- Made Response Quality Settings section optional in CLI (default: skip)
+- Increased client timeouts from 180 to 300 seconds (5 minutes) for more complete responses
+- Increased client default max_length from 1024 to 8192 tokens to match the server's default
+- Increased repetition_penalty from 1.1 to 1.15 for better quality
+- Updated all API routes to accept top_k and repetition_penalty parameters
+- Enhanced memory management to prevent OOM errors
+- Improved error handling in streaming responses
+
+### Client Package Changes (v1.0.7)
+
+- Increased default timeouts for all operations
+- Added repetition_penalty parameter to all generation methods
+- Improved error handling and recovery in streaming
+- Added better buffering for token-level streaming
+- Increased retry counts for better reliability
+- Added top_k parameter to all generation methods
+
 ## [0.5.7] - 2024-05-01
 
 ### Improved
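For reference, a minimal sketch of what a generation request looks like under the new 0.5.8 defaults. The /generate endpoint path and the use of requests are assumptions for illustration; only the parameter names and values come from this changelog and the diffs below:

import requests  # hypothetical illustration; not part of this commit

payload = {
    "prompt": "Summarize the 0.5.8 changes.",
    "max_length": 8192,          # raised from 4096
    "temperature": 0.7,
    "top_p": 0.9,
    "top_k": 80,                 # raised from 50
    "repetition_penalty": 1.15,  # raised from 1.1
    "max_time": 120.0,           # new parameter: cap generation time in seconds
}
# The 300-second timeout mirrors the new client default.
resp = requests.post("http://localhost:8000/generate", json=payload, timeout=300)
print(resp.json())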

client/python_client/locallab_client/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@
 )
 from .sync_client import SyncLocalLabClient
 
-__version__ = "1.0.6"
+__version__ = "1.0.7"
 __author__ = "Utkarsh"
 __email__ = "utkarshweb2023@gmail.com"

client/python_client/locallab_client/client.py

Lines changed: 2 additions & 2 deletions
@@ -301,7 +301,7 @@ async def stream_generate(
 
         # Use a higher max_length by default to ensure complete responses
         if max_length is None:
-            max_length = 4096  # Default to 4096 tokens for more complete responses
+            max_length = 8192  # Default to 8192 tokens to match server's default
 
         payload = {
             "prompt": prompt,
@@ -311,7 +311,7 @@ async def stream_generate(
             "temperature": temperature,
             "top_p": top_p,
             # Add repetition_penalty for better quality
-            "repetition_penalty": 1.1
+            "repetition_penalty": 1.15
         }
 
         # Create a timeout for this specific request
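A hedged consumption sketch of the updated streaming defaults: the async client class name and the iteration protocol are assumptions (this diff shows only the stream_generate body), but the fallback values are from this commit:

import asyncio

from locallab_client import LocalLabClient  # class name assumed for illustration

async def main() -> None:
    client = LocalLabClient("http://localhost:8000")  # constructor assumed

    # max_length is left unset, so it now resolves to 8192 tokens, and the
    # request payload carries repetition_penalty=1.15 rather than 1.1.
    async for chunk in client.stream_generate("Write a haiku about local LLMs."):
        print(chunk, end="", flush=True)

asyncio.run(main())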

client/python_client/locallab_client/sync_client.py

Lines changed: 2 additions & 2 deletions
@@ -427,7 +427,7 @@ def batch_generate(
         Args:
             prompts: List of prompts to generate text from
             model_id: Optional model ID to use
-            max_length: Maximum length of the generated text (defaults to 1024 if None)
+            max_length: Maximum length of the generated text (defaults to 8192 if None)
             temperature: Temperature for sampling
             top_p: Top-p for nucleus sampling
             repetition_penalty: Penalty for repetition (higher values = less repetition)
@@ -437,7 +437,7 @@ def batch_generate(
         """
         # Use a higher max_length by default to ensure complete responses
         if max_length is None:
-            max_length = 4096  # Default to 4096 tokens for more complete responses
+            max_length = 8192  # Default to 8192 tokens to match server's default
 
         return self._run_coroutine(
             self._async_client.batch_generate(
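The batch_generate docstring above gives enough of the signature for a small usage sketch; the keyword names come from the docstring, while the constructor and return shape are assumptions:

from locallab_client import SyncLocalLabClient

client = SyncLocalLabClient("http://localhost:8000")  # constructor assumed

# Passing max_length=None now falls back to 8192 tokens (previously 4096),
# matching the server default documented in the changelog.
responses = client.batch_generate(
    prompts=["What does top_k control?", "Why raise repetition_penalty?"],
    model_id=None,
    max_length=None,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.15,
)
print(responses)  # return shape is not shown in this diff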

client/python_client/setup.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 setup(
     name="locallab-client",
-    version="1.0.6",
+    version="1.0.7",
     author="Utkarsh",
     author_email="utkarshweb2023@gmail.com",
     description="Python client for LocalLab - A local LLM server",

locallab/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 LocalLab - A lightweight AI inference server for running LLMs locally
 """
 
-__version__ = "0.5.5"  # Updated to match setup.py
+__version__ = "0.5.8"  # Updated to match setup.py
 
 # Only import what's necessary initially, lazy-load the rest
 from .logger import get_logger

locallab/cli/interactive.py

Lines changed: 137 additions & 53 deletions
@@ -107,37 +107,72 @@ def prompt_for_config(use_ngrok: bool = None, port: int = None, ngrok_auth_token
     click.echo("\n⚡ Model Optimization Settings")
     click.echo("─────────────────────────────")
 
-    config["enable_quantization"] = click.confirm(
-        "Enable model quantization?",
-        default=config.get("enable_quantization", ENABLE_QUANTIZATION)
+    # Show current values for reference
+    click.echo("\nCurrent optimization settings:")
+    click.echo(f"  Quantization: {'Enabled' if config.get('enable_quantization', ENABLE_QUANTIZATION) else 'Disabled'}")
+    if config.get('enable_quantization', ENABLE_QUANTIZATION):
+        click.echo(f"  Quantization Type: {config.get('quantization_type', QUANTIZATION_TYPE)}")
+    click.echo(f"  CPU Offloading: {'Enabled' if config.get('enable_cpu_offloading', ENABLE_CPU_OFFLOADING) else 'Disabled'}")
+    click.echo(f"  Attention Slicing: {'Enabled' if config.get('enable_attention_slicing', ENABLE_ATTENTION_SLICING) else 'Disabled'}")
+    click.echo(f"  Flash Attention: {'Enabled' if config.get('enable_flash_attention', ENABLE_FLASH_ATTENTION) else 'Disabled'}")
+    click.echo(f"  Better Transformer: {'Enabled' if config.get('enable_bettertransformer', ENABLE_BETTERTRANSFORMER) else 'Disabled'}")
+
+    # Ask if user wants to configure optimization settings
+    configure_optimization = click.confirm(
+        "\nWould you like to configure model optimization settings?",
+        default=True  # Default to Yes for optimization settings
     )
 
-    if config["enable_quantization"]:
-        config["quantization_type"] = click.prompt(
-            "Quantization type (fp16/int8/int4)",
-            default=config.get("quantization_type", QUANTIZATION_TYPE),
-            type=click.Choice(["fp16", "int8", "int4"])
+    if configure_optimization:
+        config["enable_quantization"] = click.confirm(
+            "Enable model quantization?",
+            default=config.get("enable_quantization", ENABLE_QUANTIZATION)
         )
 
-        config["enable_cpu_offloading"] = click.confirm(
-            "Enable CPU offloading?",
-            default=config.get("enable_cpu_offloading", ENABLE_CPU_OFFLOADING)
-        )
+        if config["enable_quantization"]:
+            config["quantization_type"] = click.prompt(
+                "Quantization type (fp16/int8/int4)",
+                default=config.get("quantization_type", QUANTIZATION_TYPE),
+                type=click.Choice(["fp16", "int8", "int4"])
+            )
 
-        config["enable_attention_slicing"] = click.confirm(
-            "Enable attention slicing?",
-            default=config.get("enable_attention_slicing", ENABLE_ATTENTION_SLICING)
-        )
+        config["enable_cpu_offloading"] = click.confirm(
+            "Enable CPU offloading?",
+            default=config.get("enable_cpu_offloading", ENABLE_CPU_OFFLOADING)
+        )
 
-        config["enable_flash_attention"] = click.confirm(
-            "Enable flash attention?",
-            default=config.get("enable_flash_attention", ENABLE_FLASH_ATTENTION)
-        )
+        config["enable_attention_slicing"] = click.confirm(
+            "Enable attention slicing?",
+            default=config.get("enable_attention_slicing", ENABLE_ATTENTION_SLICING)
+        )
 
-        config["enable_better_transformer"] = click.confirm(
-            "Enable better transformer?",
-            default=config.get("enable_bettertransformer", ENABLE_BETTERTRANSFORMER)
-        )
+        config["enable_flash_attention"] = click.confirm(
+            "Enable flash attention?",
+            default=config.get("enable_flash_attention", ENABLE_FLASH_ATTENTION)
+        )
+
+        config["enable_better_transformer"] = click.confirm(
+            "Enable better transformer?",
+            default=config.get("enable_bettertransformer", ENABLE_BETTERTRANSFORMER)
+        )
+
+        click.echo("\n✅ Optimization settings updated!")
+    else:
+        # If user doesn't want to configure, use the current values or defaults
+        if 'enable_quantization' not in config:
+            config["enable_quantization"] = ENABLE_QUANTIZATION
+        if config["enable_quantization"] and 'quantization_type' not in config:
+            config["quantization_type"] = QUANTIZATION_TYPE
+        if 'enable_cpu_offloading' not in config:
+            config["enable_cpu_offloading"] = ENABLE_CPU_OFFLOADING
+        if 'enable_attention_slicing' not in config:
+            config["enable_attention_slicing"] = ENABLE_ATTENTION_SLICING
+        if 'enable_flash_attention' not in config:
+            config["enable_flash_attention"] = ENABLE_FLASH_ATTENTION
+        if 'enable_bettertransformer' not in config:
+            config["enable_bettertransformer"] = ENABLE_BETTERTRANSFORMER
+
+        click.echo("\nUsing current optimization settings.")
 
     # Advanced Settings
     # ----------------
@@ -150,40 +185,89 @@ def prompt_for_config(use_ngrok: bool = None, port: int = None, ngrok_auth_token
         type=int
     )
 
-    # Generation Parameters
-    # -------------------
-    click.echo("\n🔄 Generation Parameters")
-    click.echo("─────────────────────")
-
-    config["max_length"] = click.prompt(
-        "Maximum generation length (tokens)",
-        default=config.get("max_length", 8192),
-        type=int
-    )
+    # Response Quality Settings
+    # -----------------------
+    click.echo("\n🎯 Response Quality Settings")
+    click.echo("───────────────────────────")
+
+    # Show current values for reference with descriptions
+    click.echo("\nCurrent response quality settings:")
+    click.echo(f"  Max Length: {config.get('max_length', 8192)} tokens - Maximum number of tokens in the generated response")
+    click.echo(f"  Temperature: {config.get('temperature', 0.7)} - Controls randomness (higher = more creative, lower = more focused)")
+    click.echo(f"  Top-p: {config.get('top_p', 0.9)} - Nucleus sampling parameter (higher = more diverse responses)")
+    click.echo(f"  Top-k: {config.get('top_k', 80)} - Limits vocabulary to top K tokens (higher = more diverse vocabulary)")
+    click.echo(f"  Repetition Penalty: {config.get('repetition_penalty', 1.15)} - Penalizes repetition (higher = less repetition)")
+    click.echo(f"  Max Time: {config.get('max_time', 120.0)} seconds - Maximum time allowed for generation")
+
+    # Ask if user wants to configure response quality settings
+    configure_response_quality = click.confirm(
+        "\nWould you like to configure response quality settings?",
+        default=False  # Default to No
+    )
 
-    config["temperature"] = click.prompt(
-        "Temperature (0.1-1.0)",
-        default=config.get("temperature", 0.7),
-        type=float
-    )
+    if configure_response_quality:
+        # If user wants to configure, show the prompts with descriptions
+        config["max_length"] = click.prompt(
+            "Maximum generation length in tokens (higher = longer responses, but slower)",
+            default=config.get("max_length", 8192),
+            type=int
+        )
 
-    config["top_p"] = click.prompt(
-        "Top-p (0.1-1.0)",
-        default=config.get("top_p", 0.9),
-        type=float
-    )
+        config["temperature"] = click.prompt(
+            "Temperature (0.1-1.0, higher = more creative, lower = more focused)",
+            default=config.get("temperature", 0.7),
+            type=float
+        )
 
-    config["top_k"] = click.prompt(
-        "Top-k (1-100)",
-        default=config.get("top_k", 80),
-        type=int
-    )
+        config["top_p"] = click.prompt(
+            "Top-p (0.1-1.0, higher = more diverse responses)",
+            default=config.get("top_p", 0.9),
+            type=float
+        )
 
-    config["repetition_penalty"] = click.prompt(
-        "Repetition penalty (1.0-2.0)",
-        default=config.get("repetition_penalty", 1.15),
-        type=float
-    )
+        config["top_k"] = click.prompt(
+            "Top-k (1-100, higher = more diverse vocabulary)",
+            default=config.get("top_k", 80),
+            type=int
+        )
+
+        config["repetition_penalty"] = click.prompt(
+            "Repetition penalty (1.0-2.0, higher = less repetition)",
+            default=config.get("repetition_penalty", 1.15),
+            type=float
+        )
+
+        config["max_time"] = click.prompt(
+            "Maximum generation time in seconds (higher = more complete responses, but slower)",
+            default=config.get("max_time", 120.0),
+            type=float
+        )
+
+        click.echo("\n✅ Response quality settings updated!")
+    else:
+        # If user doesn't want to configure, use the current values or defaults
+        if 'max_length' not in config:
+            config["max_length"] = 8192
+        if 'temperature' not in config:
+            config["temperature"] = 0.7
+        if 'top_p' not in config:
+            config["top_p"] = 0.9
+        if 'top_k' not in config:
+            config["top_k"] = 80
+        if 'repetition_penalty' not in config:
+            config["repetition_penalty"] = 1.15
+        if 'max_time' not in config:
+            config["max_time"] = 120.0
+
+        click.echo("\nUsing default response quality settings.")
+
+    # Set environment variables for these settings
+    os.environ["DEFAULT_MAX_LENGTH"] = str(config["max_length"])
+    os.environ["DEFAULT_TEMPERATURE"] = str(config["temperature"])
+    os.environ["DEFAULT_TOP_P"] = str(config["top_p"])
+    os.environ["DEFAULT_TOP_K"] = str(config["top_k"])
+    os.environ["DEFAULT_REPETITION_PENALTY"] = str(config["repetition_penalty"])
+    os.environ["DEFAULT_MAX_TIME"] = str(config["max_time"])
 
     # Cache Settings
     # -------------
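Since the CLI now exports these defaults as environment variables, the server side can read them back with plain os.environ lookups. A minimal sketch, assuming the server's config loader works this way (the loader itself is not part of this diff):

import os

def _env_float(name: str, fallback: float) -> float:
    # Fall back if the variable is unset or not parseable as a float.
    try:
        return float(os.environ.get(name, fallback))
    except (TypeError, ValueError):
        return fallback

DEFAULT_MAX_LENGTH = int(_env_float("DEFAULT_MAX_LENGTH", 8192))
DEFAULT_TEMPERATURE = _env_float("DEFAULT_TEMPERATURE", 0.7)
DEFAULT_TOP_P = _env_float("DEFAULT_TOP_P", 0.9)
DEFAULT_TOP_K = int(_env_float("DEFAULT_TOP_K", 80))
DEFAULT_REPETITION_PENALTY = _env_float("DEFAULT_REPETITION_PENALTY", 1.15)
DEFAULT_MAX_TIME = _env_float("DEFAULT_MAX_TIME", 120.0)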

setup.py

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@
 
 setup(
     name="locallab",
-    version="0.5.5",
+    version="0.5.8",
     packages=find_packages(include=["locallab", "locallab.*"]),
     install_requires=install_requires,
     extras_require={
