
Commit 02aef9d

Add ability to Hide Title in Built-in UI + llama 4 cartesia tweaks (#299)
* merge title
* Fix
1 parent 745701c commit 02aef9d

6 files changed: 131 additions, 97 deletions

backend/fastrtc/__init__.py

Lines changed: 6 additions & 1 deletion

@@ -17,7 +17,11 @@
 from .reply_on_stopwords import ReplyOnStopWords
 from .speech_to_text import MoonshineSTT, get_stt_model
 from .stream import Stream, UIArgs
-from .text_to_speech import KokoroTTSOptions, get_tts_model
+from .text_to_speech import (
+    CartesiaTTSOptions,
+    KokoroTTSOptions,
+    get_tts_model,
+)
 from .tracks import (
     AsyncAudioVideoStreamHandler,
     AsyncStreamHandler,
@@ -87,4 +91,5 @@
     "VideoStreamHandler",
     "CloseStream",
     "get_current_context",
+    "CartesiaTTSOptions",
 ]
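
With this re-export, the Cartesia options class is importable from the package root, the same way the talk_to_llama4 demo further down does it. A minimal sketch of the intended usage (assumes fastrtc at this commit and a CARTESIA_API_KEY in the environment; not part of the diff):

import os

from fastrtc import CartesiaTTSOptions, get_tts_model

# Select the Cartesia backend, mirroring the demo below.
tts_model = get_tts_model(model="cartesia", cartesia_api_key=os.getenv("CARTESIA_API_KEY"))
options = CartesiaTTSOptions(sample_rate=24_000)  # override the 22_050 default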

backend/fastrtc/stream.py

Lines changed: 86 additions & 78 deletions

@@ -59,6 +59,8 @@ class UIArgs(TypedDict):
     If "submit", the input will be sent when the submit event is triggered by the user.
     If "change", the input will be sent whenever the user changes the input value.
     """
+    hide_title: NotRequired[bool]
+    """If True, the title and subtitle will not be displayed."""


 class Stream(WebRTCConnectionMixin):
@@ -339,21 +341,22 @@ def _generate_default_ui(
             same_components.append(component)
         if self.modality == "video" and self.mode == "receive":
             with gr.Blocks() as demo:
-                gr.HTML(
-                    f"""
-                    <h1 style='text-align: center'>
-                    {ui_args.get("title", "Video Streaming (Powered by FastRTC ⚡️)")}
-                    </h1>
-                    """
-                )
-                if ui_args.get("subtitle"):
-                    gr.Markdown(
+                if not ui_args.get("hide_title"):
+                    gr.HTML(
                         f"""
-                        <div style='text-align: center'>
-                        {ui_args.get("subtitle")}
-                        </div>
-                        """
+                        <h1 style='text-align: center'>
+                        {ui_args.get("title", "Video Streaming (Powered by FastRTC ⚡️)")}
+                        </h1>
+                        """
                     )
+                    if ui_args.get("subtitle"):
+                        gr.Markdown(
+                            f"""
+                            <div style='text-align: center'>
+                            {ui_args.get("subtitle")}
+                            </div>
+                            """
+                        )
                 with gr.Row():
                     with gr.Column():
                         if additional_input_components:
@@ -391,21 +394,22 @@ def _generate_default_ui(
                 )
         elif self.modality == "video" and self.mode == "send":
             with gr.Blocks() as demo:
-                gr.HTML(
-                    f"""
-                    <h1 style='text-align: center'>
-                    {ui_args.get("title", "Video Streaming (Powered by FastRTC ⚡️)")}
-                    </h1>
-                    """
-                )
-                if ui_args.get("subtitle"):
-                    gr.Markdown(
+                if not ui_args.get("hide_title"):
+                    gr.HTML(
                         f"""
-                        <div style='text-align: center'>
-                        {ui_args.get("subtitle")}
-                        </div>
-                        """
+                        <h1 style='text-align: center'>
+                        {ui_args.get("title", "Video Streaming (Powered by FastRTC ⚡️)")}
+                        </h1>
+                        """
                     )
+                    if ui_args.get("subtitle"):
+                        gr.Markdown(
+                            f"""
+                            <div style='text-align: center'>
+                            {ui_args.get("subtitle")}
+                            </div>
+                            """
+                        )
                 with gr.Row():
                     if additional_input_components:
                         with gr.Column():
@@ -494,21 +498,22 @@ def _generate_default_ui(
                 )
         elif self.modality == "audio" and self.mode == "receive":
             with gr.Blocks() as demo:
-                gr.HTML(
-                    f"""
-                    <h1 style='text-align: center'>
-                    {ui_args.get("title", "Audio Streaming (Powered by FastRTC ⚡️)")}
-                    </h1>
-                    """
-                )
-                if ui_args.get("subtitle"):
-                    gr.Markdown(
+                if not ui_args.get("hide_title"):
+                    gr.HTML(
                         f"""
-                        <div style='text-align: center'>
-                        {ui_args.get("subtitle")}
-                        </div>
-                        """
+                        <h1 style='text-align: center'>
+                        {ui_args.get("title", "Audio Streaming (Powered by FastRTC ⚡️)")}
+                        </h1>
+                        """
                     )
+                    if ui_args.get("subtitle"):
+                        gr.Markdown(
+                            f"""
+                            <div style='text-align: center'>
+                            {ui_args.get("subtitle")}
+                            </div>
+                            """
+                        )
                 with gr.Row():
                     with gr.Column():
                         for component in additional_input_components:
@@ -549,21 +554,22 @@ def _generate_default_ui(
                 )
         elif self.modality == "audio" and self.mode == "send":
            with gr.Blocks() as demo:
-                gr.HTML(
-                    f"""
-                    <h1 style='text-align: center'>
-                    {ui_args.get("title", "Audio Streaming (Powered by FastRTC ⚡️)")}
-                    </h1>
-                    """
-                )
-                if ui_args.get("subtitle"):
-                    gr.Markdown(
+                if not ui_args.get("hide_title"):
+                    gr.HTML(
                         f"""
-                        <div style='text-align: center'>
-                        {ui_args.get("subtitle")}
-                        </div>
-                        """
+                        <h1 style='text-align: center'>
+                        {ui_args.get("title", "Audio Streaming (Powered by FastRTC ⚡️)")}
+                        </h1>
+                        """
                     )
+                    if ui_args.get("subtitle"):
+                        gr.Markdown(
+                            f"""
+                            <div style='text-align: center'>
+                            {ui_args.get("subtitle")}
+                            </div>
+                            """
+                        )
                 with gr.Row():
                     with gr.Column():
                         with gr.Group():
@@ -604,21 +610,22 @@ def _generate_default_ui(
                 )
         elif self.modality == "audio" and self.mode == "send-receive":
             with gr.Blocks() as demo:
-                gr.HTML(
-                    f"""
-                    <h1 style='text-align: center'>
-                    {ui_args.get("title", "Audio Streaming (Powered by FastRTC ⚡️)")}
-                    </h1>
-                    """
-                )
-                if ui_args.get("subtitle"):
-                    gr.Markdown(
+                if not ui_args.get("hide_title"):
+                    gr.HTML(
                         f"""
-                        <div style='text-align: center'>
-                        {ui_args.get("subtitle")}
-                        </div>
-                        """
+                        <h1 style='text-align: center'>
+                        {ui_args.get("title", "Audio Streaming (Powered by FastRTC ⚡️)")}
+                        </h1>
+                        """
                     )
+                    if ui_args.get("subtitle"):
+                        gr.Markdown(
+                            f"""
+                            <div style='text-align: center'>
+                            {ui_args.get("subtitle")}
+                            </div>
+                            """
+                        )
                 with gr.Row():
                     with gr.Column():
                         with gr.Group():
@@ -662,21 +669,22 @@ def _generate_default_ui(
             css = """.my-group {max-width: 600px !important; max-height: 600 !important;}
                      .my-column {display: flex !important; justify-content: center !important; align-items: center !important};"""
             with gr.Blocks(css=css) as demo:
-                gr.HTML(
-                    f"""
-                    <h1 style='text-align: center'>
-                    {ui_args.get("title", "Audio Video Streaming (Powered by FastRTC ⚡️)")}
-                    </h1>
-                    """
-                )
-                if ui_args.get("subtitle"):
-                    gr.Markdown(
+                if not ui_args.get("hide_title"):
+                    gr.HTML(
                         f"""
-                        <div style='text-align: center'>
-                        {ui_args.get("subtitle")}
-                        </div>
-                        """
+                        <h1 style='text-align: center'>
+                        {ui_args.get("title", "Audio Video Streaming (Powered by FastRTC ⚡️)")}
+                        </h1>
+                        """
                     )
+                    if ui_args.get("subtitle"):
+                        gr.Markdown(
+                            f"""
+                            <div style='text-align: center'>
+                            {ui_args.get("subtitle")}
+                            </div>
+                            """
+                        )
                 with gr.Row():
                     with gr.Column(elem_classes=["my-column"]):
                         with gr.Group(elem_classes=["my-group"]):
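
For reference, a minimal sketch of how the new hide_title flag is meant to be used together with stream.ui.render(), mirroring the talk_to_llama4 demo in this commit: suppress the built-in heading, then wrap the generated UI in your own gr.Blocks. The echo handler is a hypothetical placeholder, not part of the diff.

import gradio as gr
import numpy as np
from fastrtc import ReplyOnPause, Stream


def echo(audio: tuple[int, np.ndarray]):
    # Hypothetical handler: play the caller's audio straight back.
    yield audio


stream = Stream(
    ReplyOnPause(echo),
    modality="audio",
    mode="send-receive",
    ui_args={"hide_title": True},  # skip the built-in <h1> title and subtitle
)

with gr.Blocks() as demo:
    gr.HTML("<h1 style='text-align: center'>My Custom Header</h1>")
    stream.ui.render()  # embed the auto-generated FastRTC UI under the custom header

stream.ui = demo  # hand the wrapped Blocks back to the Stream

if __name__ == "__main__":
    stream.ui.launch(server_port=7860)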

backend/fastrtc/text_to_speech/__init__.py

Lines changed: 6 additions & 2 deletions

@@ -1,3 +1,7 @@
-from .tts import KokoroTTSOptions, get_tts_model
+from .tts import (
+    CartesiaTTSOptions,
+    KokoroTTSOptions,
+    get_tts_model,
+)

-__all__ = ["get_tts_model", "KokoroTTSOptions"]
+__all__ = ["get_tts_model", "KokoroTTSOptions", "CartesiaTTSOptions"]

backend/fastrtc/text_to_speech/tts.py

Lines changed: 3 additions & 2 deletions

@@ -2,7 +2,7 @@
 import importlib.util
 import re
 from collections.abc import AsyncGenerator, Generator
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from functools import lru_cache
 from typing import Literal, Protocol, TypeVar

@@ -153,10 +153,11 @@ def stream_tts_sync(
             break


+@dataclass
 class CartesiaTTSOptions(TTSOptions):
     voice: str = "71a7ad14-091c-4e8e-a314-022ece01c121"
     language: str = "en"
-    emotion: list[str] = []
+    emotion: list[str] = field(default_factory=list)
     cartesia_version: str = "2024-06-10"
     model: str = "sonic-2"
     sample_rate: int = 22_050
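
A side note on this fix: once @dataclass is applied, a mutable class-level default such as emotion: list[str] = [] raises "ValueError: mutable default <class 'list'> for field emotion is not allowed" when the class is defined, and without the decorator the single list would be shared across every instance. field(default_factory=list) gives each options object its own list. A standalone sketch of the behaviour (not FastRTC code):

from dataclasses import dataclass, field


@dataclass
class Options:
    # emotion: list[str] = []  # would raise ValueError at class-creation time
    emotion: list[str] = field(default_factory=list)  # fresh list per instance


a, b = Options(), Options()
a.emotion.append("positivity")
assert b.emotion == []  # b keeps its own empty list; no shared mutable state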

demo/talk_to_llama4/AV_Huggy.png

45.8 KB

demo/talk_to_llama4/app.py

Lines changed: 30 additions & 14 deletions

@@ -9,11 +9,12 @@
 from fastapi.responses import HTMLResponse, StreamingResponse
 from fastrtc import (
     AdditionalOutputs,
+    CartesiaTTSOptions,
     ReplyOnPause,
     Stream,
-    audio_to_bytes,
     get_cloudflare_turn_credentials_async,
     get_current_context,
+    get_stt_model,
     get_tts_model,
 )
 from groq import Groq
@@ -22,9 +23,11 @@
 curr_dir = Path(__file__).parent
 load_dotenv()

-tts_model = get_tts_model()
+tts_model = get_tts_model(
+    model="cartesia", cartesia_api_key=os.getenv("CARTESIA_API_KEY")
+)
 groq = Groq(api_key=os.getenv("GROQ_API_KEY"))
-
+stt_model = get_stt_model()

 conversations: dict[str, list[dict[str, str]]] = {}

@@ -43,14 +46,8 @@ def response(user_audio: tuple[int, NDArray[np.int16]]):
         ]
     messages = conversations[context.webrtc_id]

-    transcription = groq.audio.transcriptions.create(
-        file=("audio.wav", audio_to_bytes(user_audio)),
-        model="distil-whisper-large-v3-en",
-        response_format="verbose_json",
-    )
-    print(transcription.text)
-
-    messages.append({"role": "user", "content": transcription.text})
+    transcription = stt_model.stt(user_audio)
+    messages.append({"role": "user", "content": transcription})

     completion = groq.chat.completions.create(  # type: ignore
         model="meta-llama/llama-4-scout-17b-16e-instruct",
@@ -68,7 +65,9 @@ def response(user_audio: tuple[int, NDArray[np.int16]]):
     long_response = response["long"]
     messages.append({"role": "assistant", "content": long_response})
     conversations[context.webrtc_id] = messages
-    yield from tts_model.stream_tts_sync(short_response)
+    yield from tts_model.stream_tts_sync(
+        short_response, options=CartesiaTTSOptions(sample_rate=24_000)
+    )
     yield AdditionalOutputs(messages)


@@ -78,9 +77,22 @@ def response(user_audio: tuple[int, NDArray[np.int16]]):
     mode="send-receive",
     additional_outputs=[gr.Chatbot(type="messages")],
     additional_outputs_handler=lambda old, new: new,
-    rtc_configuration=get_cloudflare_turn_credentials_async,
+    rtc_configuration=None,
+    ui_args={"hide_title": True},
 )

+with gr.Blocks() as demo:
+    gr.HTML(
+        f"""
+        <h1 style='text-align: center; display: flex; align-items: center; justify-content: center;'>
+        <img src="/gradio_api/file={str((Path(__file__).parent / "AV_Huggy.png").resolve())}" alt="AV Huggy" style="height: 100px; margin-right: 10px"> FastRTC + Cartesia TTS = Blazing Fast LLM Audio
+        </h1>
+        """
+    )
+    stream.ui.render()
+
+stream.ui = demo
+
 app = FastAPI()
 stream.mount(app)

@@ -109,9 +121,13 @@ async def output_stream():

 if __name__ == "__main__":
     import os
+    from pathlib import Path

     if (mode := os.getenv("MODE")) == "UI":
-        stream.ui.launch(server_port=7860)
+        stream.ui.launch(
+            server_port=7860,
+            allowed_paths=[str((Path(__file__).parent / "AV_Huggy.png").resolve())],
+        )
     elif mode == "PHONE":
         raise ValueError("Phone mode not supported")
     else:
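
One detail worth noting in the demo changes above: Gradio generally refuses to serve arbitrary local files through its /gradio_api/file= route unless their paths are whitelisted via launch(allowed_paths=[...]), which is presumably why the resolved path to AV_Huggy.png appears both in the <img src> and in the launch call. A standalone sketch of that pattern (logo.png is a placeholder asset; an assumption, not part of the diff):

from pathlib import Path

import gradio as gr

# Hypothetical asset path; replace with a real image next to the script.
logo = (Path(__file__).parent / "logo.png").resolve()

with gr.Blocks() as page:
    gr.HTML(f'<img src="/gradio_api/file={logo}" alt="logo" style="height: 100px">')

# Without allowed_paths, Gradio would block serving this file from disk.
page.launch(allowed_paths=[str(logo)])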
