This guide provides comprehensive examples for integrating Fluid Server with various programming languages and frameworks. Fluid Server provides OpenAI-compatible APIs, making it a drop-in replacement for OpenAI services in your applications.
- Base URL: http://localhost:8080/v1
- Health Check: http://localhost:8080/health
- API Documentation: http://localhost:8080/docs
- POST /v1/chat/completions - Chat completions with streaming support
- POST /v1/audio/transcriptions - Audio transcription
- POST /v1/embeddings - Text embeddings generation
- GET /v1/models - List available models
curl http://localhost:8080/health

curl -X POST http://localhost:8080/v1/chat/completions `
-H "Content-Type: application/json" `
-d '{
"model": "qwen3-8b-int8-ov",
"messages": [{"role": "user", "content": "Hello!"}],
"max_tokens": 100
}'

curl -X POST http://localhost:8080/v1/chat/completions `
-H "Content-Type: application/json" `
-d '{
"model": "qwen3-8b-int8-ov",
"messages": [{"role": "user", "content": "Tell me a story"}],
"stream": true,
"max_tokens": 200
}'

# Using QNN model (Snapdragon)
curl -X POST http://localhost:8080/v1/audio/transcriptions `
-F "file=@audio.wav" `
-F "model=whisper-large-v3-turbo-qnn" `
-F "response_format=json"
# Using OpenVINO model (Intel)
curl -X POST http://localhost:8080/v1/audio/transcriptions `
-F "file=@audio.wav" `
-F "model=whisper-large-v3-turbo-ov-npu" `
-F "response_format=verbose_json"

curl -X POST http://localhost:8080/v1/embeddings `
-H "Content-Type: application/json" `
-d '{
"input": ["Hello world", "Vector database"],
"model": "sentence-transformers/all-MiniLM-L6-v2"
}'

curl http://localhost:8080/v1/models

Install the OpenAI Python SDK:

pip install openai

from openai import OpenAI
# Point to local Fluid Server
client = OpenAI(
base_url="http://localhost:8080/v1",
api_key="local" # Can be anything for local server
)

# Non-streaming completion
response = client.chat.completions.create(
model="qwen3-8b-int8-ov",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Explain quantum computing in simple terms."}
],
max_tokens=150,
temperature=0.7
)
print(response.choices[0].message.content)

# Streaming completion
response = client.chat.completions.create(
model="qwen3-8b-int8-ov",
messages=[{"role": "user", "content": "Write a short poem about AI"}],
stream=True,
max_tokens=100
)
print("AI Response:")
for chunk in response:
if chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="", flush=True)
print()  # New line at end

# Transcribe audio file
with open("audio.wav", "rb") as audio_file:
transcript = client.audio.transcriptions.create(
model="whisper-large-v3-turbo-qnn", # or whisper-large-v3-turbo-ov-npu
file=audio_file,
response_format="verbose_json"
)
print(f"Transcribed text: {transcript.text}")
print(f"Language: {transcript.language}")
print(f"Duration: {transcript.duration}s")

# Generate embeddings
embeddings = client.embeddings.create(
model="sentence-transformers/all-MiniLM-L6-v2",
input=["Text to embed", "Another piece of text", "Vector search query"]
)
for i, embedding in enumerate(embeddings.data):
print(f"Text {i+1} embedding dimensions: {len(embedding.embedding)}")
print(f"First 5 values: {embedding.embedding[:5]}")

from openai import OpenAI, APIError, APIConnectionError
client = OpenAI(base_url="http://localhost:8080/v1", api_key="local")
try:
response = client.chat.completions.create(
model="qwen3-8b-int8-ov",
messages=[{"role": "user", "content": "Hello!"}]
)
print(response.choices[0].message.content)
except APIConnectionError:
print("Failed to connect to Fluid Server. Is it running?")
except APIError as e:
print(f"API Error: {e}")
except Exception as e:
print(f"Unexpected error: {e}")

Install the Azure OpenAI SDK for .NET:

<PackageReference Include="Azure.AI.OpenAI" Version="1.0.0-beta.17" />

using Azure.AI.OpenAI;
using Azure;
var client = new OpenAIClient(
new Uri("http://localhost:8080/v1"),
new AzureKeyCredential("local") // Can be anything for local server
);

var chatOptions = new ChatCompletionsOptions()
{
DeploymentName = "qwen3-8b-int8-ov",
Messages = {
new ChatRequestSystemMessage("You are a helpful assistant."),
new ChatRequestUserMessage("Explain machine learning briefly.")
},
MaxTokens = 150,
Temperature = 0.7f
};
var response = await client.GetChatCompletionsAsync(chatOptions);
Console.WriteLine(response.Value.Choices[0].Message.Content);

var chatOptions = new ChatCompletionsOptions()
{
DeploymentName = "qwen3-8b-int8-ov",
Messages = { new ChatRequestUserMessage("Write a haiku about programming") },
MaxTokens = 100
};
await foreach (var choice in client.GetChatCompletionsStreaming(chatOptions))
{
if (choice.ContentUpdate != null)
{
Console.Write(choice.ContentUpdate);
}
}
Console.WriteLine();

using var audioStream = File.OpenRead("audio.wav");
var transcriptionOptions = new AudioTranscriptionOptions()
{
DeploymentName = "whisper-large-v3-turbo-qnn",
AudioData = BinaryData.FromStream(audioStream),
ResponseFormat = AudioTranscriptionFormat.VerboseJson
};
var transcription = await client.GetAudioTranscriptionAsync(transcriptionOptions);
Console.WriteLine($"Transcribed: {transcription.Value.Text}");
Console.WriteLine($"Language: {transcription.Value.Language}");

Install the OpenAI Node.js SDK:

npm install openai

import OpenAI from 'openai';
const openai = new OpenAI({
baseURL: 'http://localhost:8080/v1',
apiKey: 'local', // Can be anything for local server
});

async function chatCompletion() {
try {
const completion = await openai.chat.completions.create({
model: 'qwen3-8b-int8-ov',
messages: [
{ role: 'system', content: 'You are a helpful assistant.' },
{ role: 'user', content: 'Explain async/await in JavaScript' }
],
max_tokens: 200,
temperature: 0.7
});
console.log(completion.choices[0].message.content);
} catch (error) {
console.error('Chat completion error:', error);
}
}

async function streamingChat() {
try {
const stream = await openai.chat.completions.create({
model: 'qwen3-8b-int8-ov',
messages: [{ role: 'user', content: 'Tell me about Node.js' }],
stream: true,
max_tokens: 150
});
for await (const chunk of stream) {
const content = chunk.choices[0]?.delta?.content;
if (content) {
process.stdout.write(content);
}
}
console.log(); // New line
} catch (error) {
console.error('Streaming error:', error);
}
}

import fs from 'fs';
async function transcribeAudio() {
try {
const transcription = await openai.audio.transcriptions.create({
file: fs.createReadStream('audio.wav'),
model: 'whisper-large-v3-turbo-qnn',
response_format: 'verbose_json'
});
console.log('Transcription:', transcription.text);
console.log('Language:', transcription.language);
console.log('Duration:', transcription.duration);
} catch (error) {
console.error('Transcription error:', error);
}
}

- Use connection pooling for high-throughput applications
- Implement proper retry logic with exponential backoff
- Monitor connection health and implement graceful degradation
- Use streaming for long responses to improve perceived performance
- Batch multiple requests when possible
- Consider model warm-up time for the first request
- Implement comprehensive error handling for network issues
- Handle model loading delays during server startup
- Provide fallback mechanisms for service unavailability
- Run Fluid Server on localhost for development
- Use proper network security for production deployments
- Validate all inputs before sending to the API
- Choose appropriate models based on your hardware capabilities
- Consider the trade-off between model size and performance
- Test different models to find the best fit for your use case