Skip to content

Commit 7a86f19

Browse files
authored
[Vertex AI] Add responseModalities to GenerationConfig (#14658)
1 parent 5113cfd commit 7a86f19

File tree

6 files changed

+134
-2
lines changed

6 files changed

+134
-2
lines changed

FirebaseVertexAI/CHANGELOG.md

+9
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
# Unreleased
2+
- [added] **Public Preview**: Added support for specifying response modalities
3+
in `GenerationConfig`. This includes **public experimental** support for image
4+
generation using Gemini 2.0 Flash (`gemini-2.0-flash-exp`). (#14658)
5+
<br /><br />
6+
Note: This feature is in Public Preview and relies on experimental models,
7+
which means that it is not subject to any SLA or deprecation policy and could
8+
change in backwards-incompatible ways.
9+
110
# 11.11.0
211
- [added] Emits a warning when attempting to use an incompatible model with
312
`GenerativeModel` or `ImagenModel`. (#14610)

FirebaseVertexAI/Sources/GenerationConfig.swift

+15-1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ public struct GenerationConfig: Sendable {
4848
/// Output schema of the generated candidate text.
4949
let responseSchema: Schema?
5050

51+
/// The data types (modalities) that may be returned in model responses.
52+
let responseModalities: [ResponseModality]?
53+
5154
/// Creates a new `GenerationConfig` value.
5255
///
5356
/// See the
@@ -140,11 +143,20 @@ public struct GenerationConfig: Sendable {
140143
/// [Generate structured
141144
/// output](https://firebase.google.com/docs/vertex-ai/structured-output?platform=ios) guide
142145
/// for more details.
146+
/// - responseModalities: The data types (modalities) that may be returned in model responses.
147+
///
148+
/// See the [multimodal
149+
/// responses](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation)
150+
/// documentation for more details.
151+
///
152+
/// > Warning: Specifying response modalities is a **Public Preview** feature, which means
153+
/// > that it is not subject to any SLA or deprecation policy and could change in
154+
/// > backwards-incompatible ways.
143155
public init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil,
144156
candidateCount: Int? = nil, maxOutputTokens: Int? = nil,
145157
presencePenalty: Float? = nil, frequencyPenalty: Float? = nil,
146158
stopSequences: [String]? = nil, responseMIMEType: String? = nil,
147-
responseSchema: Schema? = nil) {
159+
responseSchema: Schema? = nil, responseModalities: [ResponseModality]? = nil) {
148160
// Explicit init because otherwise if we re-arrange the above variables it changes the API
149161
// surface.
150162
self.temperature = temperature
@@ -157,6 +169,7 @@ public struct GenerationConfig: Sendable {
157169
self.stopSequences = stopSequences
158170
self.responseMIMEType = responseMIMEType
159171
self.responseSchema = responseSchema
172+
self.responseModalities = responseModalities
160173
}
161174
}
162175

@@ -175,5 +188,6 @@ extension GenerationConfig: Encodable {
175188
case stopSequences
176189
case responseMIMEType = "responseMimeType"
177190
case responseSchema
191+
case responseModalities
178192
}
179193
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
import Foundation
16+
17+
/// A type, or modality, of data that a model can produce as output.
///
/// Pass the desired output modalities as the `responseModalities` parameter when initializing a
/// ``GenerationConfig`` to configure them for model requests. See the [multimodal
/// responses](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation)
/// documentation for more details.
///
/// > Important: Support for each response modality, or combination of modalities, depends on the
/// > model.
@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
public struct ResponseModality: EncodableProtoEnum, Sendable {
  /// The known modalities, keyed by the raw string stored in `rawValue`.
  enum Kind: String {
    case text = "TEXT"
    case image = "IMAGE"
  }

  /// The raw string value of this modality (for example `"TEXT"` or `"IMAGE"`).
  let rawValue: String

  /// Specifies that the model should generate textual content.
  ///
  /// Choose this modality when the model should produce written language, such as answers to
  /// questions, summaries, creative writing, code snippets, or structured data formats like JSON.
  public static let text: ResponseModality = Self(kind: .text)

  /// **Public Experimental**: Specifies that the model should generate image data.
  ///
  /// Choose this modality when the model should create visual content based on the provided input
  /// or prompts. The response might contain one or more generated images. See the [image
  /// generation](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation#image-generation)
  /// documentation for more details.
  ///
  /// > Warning: Image generation using Gemini 2.0 Flash is a **Public Experimental** feature, which
  /// > means that it is not subject to any SLA or deprecation policy and could change in
  /// > backwards-incompatible ways.
  public static let image: ResponseModality = Self(kind: .image)
}

FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift

+1
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,5 @@ public enum FirebaseAppNames {
2323
/// Model name strings shared by the test app.
public enum ModelNames {
  /// Name of the Gemini 2.0 Flash model (version `001`).
  public static let gemini2Flash: String = "gemini-2.0-flash-001"

  /// Name of the Gemini 2.0 Flash-Lite model (version `001`).
  public static let gemini2FlashLite: String = "gemini-2.0-flash-lite-001"

  /// Name of the experimental Gemini 2.0 Flash model.
  public static let gemini2FlashExperimental: String = "gemini-2.0-flash-exp"
}

FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift

+51
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,12 @@ import FirebaseVertexAI
1919
import Testing
2020
import VertexAITestApp
2121

22+
#if canImport(UIKit)
23+
import UIKit
24+
#endif // canImport(UIKit)
25+
26+
@testable import struct FirebaseVertexAI.BackendError
27+
2228
@Suite(.serialized)
2329
struct GenerateContentIntegrationTests {
2430
// Set temperature, topP and topK to lowest allowed values to make responses more deterministic.
@@ -119,6 +125,51 @@ struct GenerateContentIntegrationTests {
119125
#expect(candidatesTokensDetails.tokenCount == usageMetadata.candidatesTokenCount)
120126
}
121127

128+
/// Requests text and image output from the experimental Gemini 2.0 Flash model and checks that
/// the first candidate contains a non-empty PNG inline data part.
///
/// - Parameter config: The backend instance configuration to run the request against.
@Test(arguments: [
  InstanceConfig.vertexV1Beta,
  InstanceConfig.developerV1Beta,
])
func generateImage(_ config: InstanceConfig) async throws {
  let model = VertexAI.componentInstance(config).generativeModel(
    modelName: ModelNames.gemini2FlashExperimental,
    generationConfig: GenerationConfig(
      temperature: 0.0,
      topP: 0.0,
      topK: 1,
      responseModalities: [.text, .image]
    ),
    safetySettings: safetySettings
  )
  let prompt = "Generate an image of a cute cartoon kitten playing with a ball of yarn."

  // A 503 from the backend is recorded as an intermittent known issue; in that case no response
  // is captured and the remaining checks are skipped.
  var response: GenerateContentResponse?
  try await withKnownIssue(
    "Backend may fail with a 503 - Service Unavailable error when overloaded",
    isIntermittent: true
  ) {
    response = try await model.generateContent(prompt)
  } matching: { issue in
    (issue.error as? BackendError)?.httpResponseCode == 503
  }
  guard let response else { return }

  let candidate = try #require(response.candidates.first)
  // Take the first part of the candidate that carries inline data.
  let inlineParts = candidate.content.parts.compactMap { $0 as? InlineDataPart }
  let inlineDataPart = try #require(inlineParts.first)
  #expect(inlineDataPart.mimeType == "image/png")
  #expect(!inlineDataPart.data.isEmpty)
  #if canImport(UIKit)
    let decodedImage = try #require(UIImage(data: inlineDataPart.data))
    let imageSize = decodedImage.size
    // Gemini 2.0 Flash Experimental returns images sized to fit within a 1024x1024 pixel box but
    // dimensions may vary depending on the aspect ratio.
    #expect(imageSize.width <= 1024)
    #expect(imageSize.width >= 500)
    #expect(imageSize.height <= 1024)
    #expect(imageSize.height >= 500)
  #endif // canImport(UIKit)
}
172+
122173
// MARK: Streaming Tests
123174

124175
@Test(arguments: InstanceConfig.allConfigs)

FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift

+6-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ final class GenerationConfigTests: XCTestCase {
6161
frequencyPenalty: frequencyPenalty,
6262
stopSequences: stopSequences,
6363
responseMIMEType: responseMIMEType,
64-
responseSchema: .array(items: .string())
64+
responseSchema: .array(items: .string()),
65+
responseModalities: [.text, .image]
6566
)
6667

6768
let jsonData = try encoder.encode(generationConfig)
@@ -74,6 +75,10 @@ final class GenerationConfigTests: XCTestCase {
7475
"maxOutputTokens" : \(maxOutputTokens),
7576
"presencePenalty" : \(presencePenalty),
7677
"responseMimeType" : "\(responseMIMEType)",
78+
"responseModalities" : [
79+
"TEXT",
80+
"IMAGE"
81+
],
7782
"responseSchema" : {
7883
"items" : {
7984
"nullable" : false,

0 commit comments

Comments
 (0)