-
Notifications
You must be signed in to change notification settings - Fork 107
Update inference API specification to include new Llama Service #5020
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
8b89902
659c6ca
a68e740
23dd73f
6b1c6d4
73fc8af
c527e7d
16e0155
207638f
0694fd2
4a49a4a
c16b9d0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
{ | ||
"inference.put_llama": { | ||
"documentation": { | ||
"url": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-llama.html", | ||
"description": "Configure a Llama inference endpoint" | ||
}, | ||
"stability": "stable", | ||
"visibility": "public", | ||
"headers": { | ||
"accept": ["application/json"], | ||
"content_type": ["application/json"] | ||
}, | ||
"url": { | ||
"paths": [ | ||
{ | ||
"path": "/_inference/{task_type}/{llama_inference_id}", | ||
"methods": ["PUT"], | ||
"parts": { | ||
"task_type": { | ||
"type": "string", | ||
"description": "The task type" | ||
}, | ||
"llama_inference_id": { | ||
"type": "string", | ||
"description": "The inference ID" | ||
} | ||
} | ||
} | ||
] | ||
}, | ||
"body": { | ||
"description": "The inference endpoint's task and service settings" | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1377,6 +1377,68 @@ export enum JinaAITextEmbeddingTask { | |
search | ||
} | ||
|
||
export class LlamaServiceSettings {
  /**
   * The URL endpoint of the Llama stack endpoint.
   * URL must contain:
   * * For `text_embedding` task - `/v1/inference/embeddings`.
   * * For `completion` and `chat_completion` tasks - `/v1/openai/v1/chat/completions`.
   */
  url: string
  /**
   * The name of the model to use for the inference task.
   * Refer to the Llama downloading models documentation for different ways of getting a list of available models and downloading them.
   * The service has been tested and confirmed to be working with the following models:
   * * For `text_embedding` task - `all-MiniLM-L6-v2`.
   * * For `completion` and `chat_completion` tasks - `llama3.2:3b`.
   * @ext_doc_id llama-api-models
   */
  model_id: string
  /**
   * A valid API key for accessing the Llama stack endpoint that is sent as part of the Bearer authentication header.
   * This field is optional because the Llama stack does not provide authentication by default.
   *
   * IMPORTANT: You need to provide the API key only once, during the inference model creation.
   * The get inference endpoint API does not retrieve your API key.
   * After creating the inference model, you cannot change the associated API key.
   * If you want to use a different API key, delete the inference model and recreate it with the same name and the updated API key.
   */
  api_key?: string
  /**
   * For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.
   */
  max_input_tokens?: integer
  /**
   * For a `text_embedding` task, the number of dimensions the resulting output embeddings should have.
   */
  dimensions?: integer
  /**
   * For a `text_embedding` task, the similarity measure. One of `cosine`, `dot_product`, `l2_norm`.
   */
  similarity?: LlamaSimilarityType
  /**
   * This setting helps to minimize the number of rate limit errors returned from the Llama API.
   * By default, the `llama` service sets the number of requests allowed per minute to 3000.
   */
  rate_limit?: RateLimitSetting
}
|
||
/**
 * The inference task types supported by the `llama` service.
 */
export enum LlamaTaskType {
  text_embedding,
  completion,
  chat_completion
}
|
||
/**
 * The service identifier accepted in the `service` field for Llama inference endpoints.
 */
export enum LlamaServiceType {
  llama
}
|
||
/**
 * The similarity measures accepted for `text_embedding` results produced by the `llama` service.
 */
export enum LlamaSimilarityType {
  cosine,
  dot_product,
  l2_norm
}
|
||
export class MistralServiceSettings { | ||
/** | ||
* A valid API key of your Mistral account. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
/* | ||
* Licensed to Elasticsearch B.V. under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch B.V. licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
import { RequestBase } from '@_types/Base' | ||
import { Id } from '@_types/common' | ||
import { Duration } from '@_types/Time' | ||
import { | ||
LlamaServiceSettings, | ||
LlamaServiceType, | ||
LlamaTaskType | ||
} from '@inference/_types/CommonTypes' | ||
import { InferenceChunkingSettings } from '@inference/_types/Services' | ||
|
||
/**
 * Create a Llama inference endpoint.
 *
 * Create an inference endpoint to perform an inference task with the `llama` service.
 * @rest_spec_name inference.put_llama
 * @availability stack since=9.2.0 stability=stable visibility=public
 * @availability serverless stability=stable visibility=public
 * @cluster_privileges manage_inference
 * @doc_id inference-api-put-llama
 */
export interface Request extends RequestBase {
  urls: [
    {
      path: '/_inference/{task_type}/{llama_inference_id}'
      methods: ['PUT']
    }
  ]
  path_parts: {
    /**
     * The type of the inference task that the model will perform.
     */
    task_type: LlamaTaskType
    /**
     * The unique identifier of the inference endpoint.
     */
    llama_inference_id: Id
  }
  query_parameters: {
    /**
     * Specifies the amount of time to wait for the inference endpoint to be created.
     * @server_default 30s
     */
    timeout?: Duration
  }
  body: {
    /**
     * The chunking configuration object.
     * @ext_doc_id inference-chunking
     */
    chunking_settings?: InferenceChunkingSettings
    /**
     * The type of service supported for the specified task type. In this case, `llama`.
     */
    service: LlamaServiceType
    /**
     * Settings used to install the inference model. These settings are specific to the `llama` service.
     */
    service_settings: LlamaServiceSettings
  }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
/* | ||
* Licensed to Elasticsearch B.V. under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch B.V. licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
import { InferenceEndpointInfoLlama } from '@inference/_types/Services' | ||
|
||
export class Response {
  /**
   * The configuration information of the newly created Llama inference endpoint.
   * @codegen_name endpoint_info
   */
  body: InferenceEndpointInfoLlama
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# summary:
# Example request for creating a `text_embedding` Llama inference endpoint.
# Fix: the JSON body was missing the comma after the "url" member, making the example invalid JSON.
description: Run `PUT _inference/text_embedding/llama-text-embedding` to create a Llama inference endpoint that performs a `text_embedding` task.
method_request: 'PUT _inference/text_embedding/llama-text-embedding'
# type: "request"
value: |-
  {
    "service": "llama",
    "service_settings": {
      "url": "http://localhost:8321/v1/inference/embeddings",
      "api_key": "llama-api-key",
      "model_id": "all-MiniLM-L6-v2"
    }
  }
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# summary:
# Example request for creating a `completion` Llama inference endpoint.
# Fix: the JSON body was missing the comma after the "url" member, making the example invalid JSON.
description: Run `PUT _inference/completion/llama-completion` to create a Llama inference endpoint that performs a `completion` task.
method_request: 'PUT _inference/completion/llama-completion'
# type: "request"
value: |-
  {
    "service": "llama",
    "service_settings": {
      "url": "http://localhost:8321/v1/openai/v1/chat/completions",
      "api_key": "llama-api-key",
      "model_id": "llama3.2:3b"
    }
  }
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# summary:
# Example request for creating a `chat_completion` Llama inference endpoint.
# Fix: the JSON body was missing the comma after the "url" member, making the example invalid JSON.
description: Run `PUT _inference/chat-completion/llama-chat-completion` to create a Llama inference endpoint that performs a `chat_completion` task.
method_request: 'PUT _inference/chat-completion/llama-chat-completion'
# type: "request"
value: |-
  {
    "service": "llama",
    "service_settings": {
      "url": "http://localhost:8321/v1/openai/v1/chat/completions",
      "api_key": "llama-api-key",
      "model_id": "llama3.2:3b"
    }
  }
Uh oh!
There was an error while loading. Please reload this page.