[Vertex AI] Add responseModalities to GenerationConfig (#14658)

andrewheard · web-flow · commit 7a86f19a62f2 · 2025-04-08T20:40:52.000-04:00
diff --git a/FirebaseVertexAI/CHANGELOG.md b/FirebaseVertexAI/CHANGELOG.md
@@ -1,3 +1,12 @@
+# Unreleased
+- [added] **Public Preview**: Added support for specifying response modalities
+  in `GenerationConfig`. This includes **public experimental** support for image
+  generation using Gemini 2.0 Flash (`gemini-2.0-flash-exp`). (#14658)
+  <br /><br />
+  Note: This feature is in Public Preview and relies on experimental models,
+  which means that it is not subject to any SLA or deprecation policy and could
+  change in backwards-incompatible ways.
+
 # 11.11.0
 - [added] Emits a warning when attempting to use an incompatible model with
   `GenerativeModel` or `ImagenModel`. (#14610)
diff --git a/FirebaseVertexAI/Sources/GenerationConfig.swift b/FirebaseVertexAI/Sources/GenerationConfig.swift
@@ -48,6 +48,9 @@ public struct GenerationConfig: Sendable {
   /// Output schema of the generated candidate text.
   let responseSchema: Schema?
 
+  /// The data types (modalities) that may be returned in model responses.
+  let responseModalities: [ResponseModality]?
+
   /// Creates a new `GenerationConfig` value.
   ///
   /// See the
@@ -140,11 +143,20 @@ public struct GenerationConfig: Sendable {
   ///     [Generate structured
   ///     output](https://firebase.google.com/docs/vertex-ai/structured-output?platform=ios) guide
   ///     for more details.
+  ///   - responseModalities: The data types (modalities) that may be returned in model responses.
+  ///
+  ///     See the [multimodal
+  ///     responses](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation)
+  ///     documentation for more details.
+  ///
+  ///     > Warning: Specifying response modalities is a **Public Preview** feature, which means
+  ///     > that it is not subject to any SLA or deprecation policy and could change in
+  ///     > backwards-incompatible ways.
   public init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil,
               candidateCount: Int? = nil, maxOutputTokens: Int? = nil,
               presencePenalty: Float? = nil, frequencyPenalty: Float? = nil,
               stopSequences: [String]? = nil, responseMIMEType: String? = nil,
-              responseSchema: Schema? = nil) {
+              responseSchema: Schema? = nil, responseModalities: [ResponseModality]? = nil) {
     // Explicit init because otherwise if we re-arrange the above variables it changes the API
     // surface.
     self.temperature = temperature
@@ -157,6 +169,7 @@ public struct GenerationConfig: Sendable {
     self.stopSequences = stopSequences
     self.responseMIMEType = responseMIMEType
     self.responseSchema = responseSchema
+    self.responseModalities = responseModalities
   }
 }
 
@@ -175,5 +188,6 @@ extension GenerationConfig: Encodable {
     case stopSequences
     case responseMIMEType = "responseMimeType"
     case responseSchema
+    case responseModalities
   }
 }
diff --git a/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift b/FirebaseVertexAI/Sources/Types/Public/ResponseModality.swift
@@ -0,0 +1,52 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// Represents the different types, or modalities, of data that a model can produce as output.
+///
+/// To configure the desired output modalities for model requests, set the `responseModalities`
+/// parameter when initializing a ``GenerationConfig``. See the [multimodal
+/// responses](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation)
+/// documentation for more details.
+///
+/// > Important: Support for each response modality, or combination of modalities, depends on the
+/// > model.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+public struct ResponseModality: EncodableProtoEnum, Sendable {
+  enum Kind: String { // Raw values are the strings emitted in the encoded JSON.
+    case text = "TEXT"
+    case image = "IMAGE"
+  }
+
+  /// Specifies that the model should generate textual content.
+  ///
+  /// Use this modality when you need the model to produce written language, such as answers to
+  /// questions, summaries, creative writing, code snippets, or structured data formats like JSON.
+  public static let text = ResponseModality(kind: .text)
+
+  /// **Public Experimental**: Specifies that the model should generate image data.
+  ///
+  /// Use this modality when you want the model to create visual content based on the provided input
+  /// or prompts. The response might contain one or more generated images. See the [image
+  /// generation](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation#image-generation)
+  /// documentation for more details.
+  ///
+  /// > Warning: Image generation using Gemini 2.0 Flash is a **Public Experimental** feature, which
+  /// > means that it is not subject to any SLA or deprecation policy and could change in
+  /// > backwards-incompatible ways.
+  public static let image = ResponseModality(kind: .image)
+
+  let rawValue: String // presumably set from `Kind` by `EncodableProtoEnum` — TODO confirm
+}
diff --git a/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift b/FirebaseVertexAI/Tests/TestApp/Sources/Constants.swift
@@ -23,4 +23,5 @@ public enum FirebaseAppNames {
 public enum ModelNames {
   public static let gemini2Flash = "gemini-2.0-flash-001"
   public static let gemini2FlashLite = "gemini-2.0-flash-lite-001"
+  public static let gemini2FlashExperimental = "gemini-2.0-flash-exp" // Image generation tests
 }
diff --git a/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift b/FirebaseVertexAI/Tests/TestApp/Tests/Integration/GenerateContentIntegrationTests.swift
@@ -19,6 +19,12 @@ import FirebaseVertexAI
 import Testing
 import VertexAITestApp
 
+#if canImport(UIKit)
+  import UIKit
+#endif // canImport(UIKit)
+
+@testable import struct FirebaseVertexAI.BackendError
+
 @Suite(.serialized)
 struct GenerateContentIntegrationTests {
   // Set temperature, topP and topK to lowest allowed values to make responses more deterministic.
@@ -119,6 +125,51 @@ struct GenerateContentIntegrationTests {
     #expect(candidatesTokensDetails.tokenCount == usageMetadata.candidatesTokenCount)
   }
 
+  /// Checks that requesting text and image modalities returns an inline PNG image part.
+  ///
+  /// A backend 503 is recorded as a known intermittent issue; later checks are then skipped.
+  @Test(arguments: [InstanceConfig.vertexV1Beta, InstanceConfig.developerV1Beta])
+  func generateImage(_ config: InstanceConfig) async throws {
+    let generationConfig = GenerationConfig(
+      temperature: 0.0,
+      topP: 0.0,
+      topK: 1,
+      responseModalities: [.text, .image]
+    )
+    let model = VertexAI.componentInstance(config).generativeModel(
+      modelName: ModelNames.gemini2FlashExperimental,
+      generationConfig: generationConfig,
+      safetySettings: safetySettings
+    )
+    let prompt = "Generate an image of a cute cartoon kitten playing with a ball of yarn."
+
+    var response: GenerateContentResponse?
+    try await withKnownIssue(
+      "Backend may fail with a 503 - Service Unavailable error when overloaded",
+      isIntermittent: true
+    ) {
+      response = try await model.generateContent(prompt)
+    } matching: { issue in
+      (issue.error as? BackendError)?.httpResponseCode == 503
+    }
+
+    guard let response else { return }
+    let candidate = try #require(response.candidates.first)
+    let inlineDataPart = try #require(candidate.content.parts
+      .first { $0 is InlineDataPart } as? InlineDataPart)
+    #expect(inlineDataPart.mimeType == "image/png")
+    #expect(!inlineDataPart.data.isEmpty)
+    #if canImport(UIKit)
+      let uiImage = try #require(UIImage(data: inlineDataPart.data))
+      // Gemini 2.0 Flash Experimental returns images sized to fit within a 1024x1024 pixel box but
+      // dimensions may vary depending on the aspect ratio.
+      #expect(uiImage.size.width <= 1024)
+      #expect(uiImage.size.width >= 500)
+      #expect(uiImage.size.height <= 1024)
+      #expect(uiImage.size.height >= 500)
+    #endif // canImport(UIKit)
+  }
+
   // MARK: Streaming Tests
 
   @Test(arguments: InstanceConfig.allConfigs)
diff --git a/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift b/FirebaseVertexAI/Tests/Unit/GenerationConfigTests.swift
@@ -61,7 +61,8 @@ final class GenerationConfigTests: XCTestCase {
       frequencyPenalty: frequencyPenalty,
       stopSequences: stopSequences,
       responseMIMEType: responseMIMEType,
-      responseSchema: .array(items: .string())
+      responseSchema: .array(items: .string()),
+      responseModalities: [.text, .image]
     )
 
     let jsonData = try encoder.encode(generationConfig)
@@ -74,6 +75,10 @@ final class GenerationConfigTests: XCTestCase {
       "maxOutputTokens" : \(maxOutputTokens),
       "presencePenalty" : \(presencePenalty),
       "responseMimeType" : "\(responseMIMEType)",
+      "responseModalities" : [
+        "TEXT",
+        "IMAGE"
+      ],
       "responseSchema" : {
         "items" : {
           "nullable" : false,

Original file line number	Diff line number	Diff line change
`@@ -23,4 +23,5 @@ public enum FirebaseAppNames {`
`23`	`23`	`public enum ModelNames {`
`24`	`24`	`public static let gemini2Flash = "gemini-2.0-flash-001"`
`25`	`25`	`public static let gemini2FlashLite = "gemini-2.0-flash-lite-001"`
	`26`	`+ public static let gemini2FlashExperimental = "gemini-2.0-flash-exp"`
`26`	`27`	`}`