Skip to content

Commit 67f91bb

Browse files
authored
frontend: fix transcript glitch (#22)
1 parent eb58434 commit 67f91bb

File tree

4 files changed

+186
-23
lines changed

4 files changed

+186
-23
lines changed

frontend_vapi/lib/oto-websocket.ts

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import { TranscriptBeautifyData, TranscriptSegment } from "./transcript";
2+
3+
/**
 * Envelope for a message received over the Oto WebSocket.
 *
 * - `type` selects which handler processes the message.
 * - `data` is the untyped payload; the handlers in this module read
 *   `audioStart`, `audioEnd`, `transcript`, `finalized` and `segments` from it.
 * - `message` is an optional string; presumably human-readable text that
 *   accompanies `"error"` messages (TODO confirm against the server protocol).
 */
export interface WebSocketMessage {
4+
type: "transcribe" | "transcript-beautify" | "detect-action" | "error";
5+
data?: any;
6+
message?: string;
7+
}
8+
9+
/**
 * Converts a raw `transcribe` WebSocket message into a `TranscriptSegment`.
 *
 * Missing or falsy timing values default to `0`, a missing transcript defaults
 * to an empty string, and `finalized` is coerced to a real boolean so that the
 * stored flag always agrees with the `id` that is chosen from it.
 *
 * @param message - The WebSocket envelope; `message.data` may be absent.
 * @returns A non-beautified segment. Finalized segments get the id
 *   `"<audioStart>-<audioEnd>"`; partial ones get `"partial-current"`.
 */
export const handleOtoWsTranscribe = (
  message: WebSocketMessage
): TranscriptSegment => {
  // `data` is optional on the envelope: fall back to an empty payload so a
  // malformed message produces an empty partial segment instead of a TypeError.
  const payload = message.data ?? {};

  // `||` (not `??`) is deliberate: it also replaces NaN, null and "" with 0,
  // exactly as the timing defaults always have.
  const audioStart = payload.audioStart || 0;
  const audioEnd = payload.audioEnd || 0;
  const finalized = Boolean(payload.finalized);

  return {
    audioStart,
    audioEnd,
    transcript: payload.transcript ?? "",
    finalized,
    beautified: false,
    // Partial transcripts share one fixed id.
    id: finalized ? `${audioStart}-${audioEnd}` : "partial-current",
  };
};
23+
24+
/**
 * Converts a raw `transcript-beautify` WebSocket message into
 * `TranscriptBeautifyData`.
 *
 * Well-formed payloads are passed through unchanged. Missing fields are
 * replaced with safe defaults:
 * - range bounds default to `0`, because `handleTranscriptBeautify` compares
 *   segment times against them and comparisons against `undefined` are always
 *   false, which would drop every existing segment;
 * - `segments` defaults to `[]`, because it is iterated downstream;
 * - `transcript` defaults to `""`.
 *
 * @param message - The WebSocket envelope; `message.data` may be absent.
 * @returns The normalized beautify payload.
 */
export const handleOtoWsTranscriptBeautify = (
  message: WebSocketMessage
): TranscriptBeautifyData => {
  // `data` is optional on the envelope; never dereference it unguarded.
  const payload = message.data ?? {};

  return {
    audioStart: payload.audioStart || 0,
    audioEnd: payload.audioEnd || 0,
    transcript: payload.transcript ?? "",
    segments: Array.isArray(payload.segments) ? payload.segments : [],
  };
};

frontend_vapi/lib/time.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
/**
 * Formats a duration in milliseconds as `HH:MM:SS`.
 *
 * Fractions of a second are truncated. Hours are not wrapped, so values of
 * 100 hours or more simply produce more than two hour digits. Negative,
 * `NaN` and infinite inputs are treated as `0` so the result is always a
 * well-formed timestamp rather than `"-1:-1:-1"` or `"NaN:NaN:NaN"`.
 *
 * @param milliseconds - Duration in milliseconds.
 * @returns The zero-padded `HH:MM:SS` string.
 */
export const formatTimestamp = (milliseconds: number): string => {
  const safeMilliseconds =
    Number.isFinite(milliseconds) && milliseconds > 0 ? milliseconds : 0;

  const totalSeconds = Math.floor(safeMilliseconds / 1000);
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor((totalSeconds % 3600) / 60);
  const seconds = totalSeconds % 60;

  const pad = (value: number): string => value.toString().padStart(2, "0");

  return `${pad(hours)}:${pad(minutes)}:${pad(seconds)}`;
};

frontend_vapi/lib/transcript.ts

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import { formatTimestamp } from "./time";
2+
3+
/**
 * One piece of transcript text together with its position in the audio.
 *
 * - `audioStart` / `audioEnd` are passed to `formatTimestamp`, which takes
 *   milliseconds. `handleTranscriptSegment` overwrites both with a large
 *   sentinel value for non-finalized segments.
 * - `finalized` is `false` for the live (partial/interim) transcript.
 * - `beautified` is set to `true` by `handleTranscriptBeautify` on the
 *   segments it inserts.
 */
export interface TranscriptSegment {
4+
audioStart: number;
5+
audioEnd: number;
6+
transcript: string;
7+
finalized: boolean;
8+
beautified: boolean;
9+
id?: string; // "<audioStart>-<audioEnd>" for finalized segments, "partial-current" for the live partial
10+
}
11+
12+
/**
 * Payload of a `transcript-beautify` message.
 *
 * - `audioStart` / `audioEnd` bound the range that `handleTranscriptBeautify`
 *   clears before inserting `segments`.
 * - `segments` are the replacement segments for that range.
 * - `transcript` is carried along but is not read by
 *   `handleTranscriptBeautify`; presumably the beautified text for the whole
 *   range (TODO confirm against the server protocol).
 */
export interface TranscriptBeautifyData {
13+
audioStart: number;
14+
audioEnd: number;
15+
transcript: string;
16+
segments: TranscriptSegment[];
17+
}
18+
19+
/**
 * Folds an incoming transcription segment into the current segment list.
 *
 * At most one partial (non-finalized) segment is kept. Any previous partial is
 * always dropped, then the incoming segment is appended:
 * - a finalized segment is appended with the id `"<audioStart>-<audioEnd>"`;
 * - a partial segment is appended with the id `"partial-current"` and its
 *   timing replaced by a large sentinel value.
 *
 * Neither `prev` nor `segment` is mutated.
 *
 * @param prev - The current segment list.
 * @param segment - The segment that just arrived.
 * @returns A new list: all finalized segments of `prev` followed by `segment`.
 */
export const handleTranscriptSegment = (
  prev: TranscriptSegment[],
  segment: TranscriptSegment
): TranscriptSegment[] => {
  // Timing assigned to the live partial. Presumably chosen so that an
  // ascending sort on `audioStart` keeps the partial after every real segment.
  const PARTIAL_SORT_KEY = 999999999;

  // Both cases start from the same base: everything already finalized.
  const finalizedSoFar = prev.filter((s) => s.finalized);

  const incoming: TranscriptSegment = segment.finalized
    ? { ...segment, id: `${segment.audioStart}-${segment.audioEnd}` }
    : {
        ...segment,
        id: "partial-current",
        audioStart: PARTIAL_SORT_KEY,
        audioEnd: PARTIAL_SORT_KEY,
      };

  return [...finalizedSoFar, incoming];
};
44+
45+
/**
 * Replaces the segments covered by a beautify message with its beautified
 * segments.
 *
 * Only segments lying entirely outside `[audioStart, audioEnd)` are kept;
 * anything overlapping the range is dropped. The beautified segments are then
 * inserted and the result is sorted by `audioStart`.
 *
 * If the message carries no replacement segments (missing or empty
 * `segments`), nothing is removed: clearing the range with nothing to put in
 * its place would silently erase transcript text.
 *
 * Neither `prev` nor `beautifyData` is mutated.
 *
 * @param prev - The current segment list.
 * @param beautifyData - The beautified range and its replacement segments.
 * @returns A new list sorted by ascending `audioStart`.
 */
export const handleTranscriptBeautify = (
  prev: TranscriptSegment[],
  beautifyData: TranscriptBeautifyData
): TranscriptSegment[] => {
  const byAudioStart = (a: TranscriptSegment, b: TranscriptSegment): number =>
    a.audioStart - b.audioStart;

  // `segments` comes from an untyped WebSocket payload, so guard against it
  // being absent at runtime even though the type says it is required.
  const replacements: TranscriptSegment[] = (beautifyData.segments ?? []).map(
    (segment) => ({
      audioStart: segment.audioStart,
      audioEnd: segment.audioEnd,
      transcript: segment.transcript,
      finalized: true,
      beautified: true,
      // Same id scheme handleTranscriptSegment uses for finalized segments.
      id: `${segment.audioStart}-${segment.audioEnd}`,
    })
  );

  if (replacements.length === 0) {
    return [...prev].sort(byAudioStart);
  }

  const { audioStart: rangeStart, audioEnd: rangeEnd } = beautifyData;

  // Keep only segments that end before the range starts or start after it ends.
  const outsideRange = prev.filter(
    (segment) =>
      segment.audioEnd <= rangeStart || segment.audioStart >= rangeEnd
  );

  return [...outsideRange, ...replacements].sort(byAudioStart);
};
68+
69+
/**
 * Renders the segment list as display text.
 *
 * Segments are ordered by ascending `audioStart`. Each one becomes a header
 * line followed by its transcript, and segments are separated by a blank line.
 *
 * Header format:
 * - finalized segments show `[HH:MM:SS-HH:MM:SS]`; partial ones show `[Live]`,
 *   since their timing may not be meaningful;
 * - a status marker follows: `✓` beautified, `*` finalized but not
 *   beautified, `~` partial/interim.
 *
 * The input array is sorted on a copy. `Array.prototype.sort` sorts in place,
 * and this function is called with React state during render, so sorting the
 * argument directly would mutate state.
 *
 * @param transcriptSegments - The segments to render; not modified.
 * @returns The formatted transcript, or `""` for an empty list.
 */
export const handleFormatTranscript = (
  transcriptSegments: TranscriptSegment[]
): string => {
  return [...transcriptSegments]
    .sort((a, b) => a.audioStart - b.audioStart)
    .map((segment) => {
      const header = segment.finalized
        ? `[${formatTimestamp(segment.audioStart)}-${formatTimestamp(
            segment.audioEnd
          )}]`
        : "[Live]";

      const statusMarker = segment.beautified
        ? "✓"
        : segment.finalized
        ? "*"
        : "~";

      return `${header} ${statusMarker}\n${segment.transcript}`;
    })
    .join("\n\n");
};

frontend_vapi/pages/record.tsx

Lines changed: 46 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ import {
1313
sendRealtimeAudioData,
1414
validateUUID,
1515
} from "../lib/oto-api";
16+
import { handleFormatTranscript, handleTranscriptBeautify, handleTranscriptSegment, TranscriptSegment } from "../lib/transcript";
17+
import { handleOtoWsTranscribe, handleOtoWsTranscriptBeautify } from "../lib/oto-websocket";
1618

1719
/**
1820
* 日常会話録音画面 - リアルタイム音声ストリーミング版
@@ -30,6 +32,7 @@ export default function RecordPage() {
3032
const [isStreaming, setIsStreaming] = useState(false);
3133
const [hasPermission, setHasPermission] = useState(false);
3234
const [volume, setVolume] = useState(0);
35+
const [lastVolumeSetDateTime, setLastVolumeSetDateTime] = useState<Date | null>(null);
3336

3437
// Audio streaming references
3538
const streamRef = useRef<MediaStream | null>(null);
@@ -38,6 +41,15 @@ export default function RecordPage() {
3841
const audioContextRef = useRef<AudioContext | null>(null);
3942
const animationFrameRef = useRef<number | null>(null);
4043

44+
// transcript segments
45+
const [transcriptSegments, setTranscriptSegments] = useState<TranscriptSegment[]>([]);
46+
const transcriptContentRef = useRef<HTMLDivElement>(null);
47+
useEffect(() => {
48+
if (transcriptContentRef.current && transcriptSegments.length > 0) {
49+
transcriptContentRef.current.scrollTop = transcriptContentRef.current.scrollHeight;
50+
}
51+
}, [transcriptSegments]);
52+
4153
// Audio streaming statistics
4254
const [audioStats, setAudioStats] = useState({
4355
totalChunks: 0,
@@ -91,7 +103,9 @@ export default function RecordPage() {
91103
}
92104
const rms = Math.sqrt(sum / bufferLength);
93105
const volumeLevel = Math.round((rms / 255) * 100);
106+
94107
setVolume(volumeLevel);
108+
setLastVolumeSetDateTime(new Date());
95109

96110
// Continue monitoring
97111
animationFrameRef.current = requestAnimationFrame(monitorVolume);
@@ -261,6 +275,7 @@ export default function RecordPage() {
261275
}, 10000);
262276

263277
let authTimeout: NodeJS.Timeout | null = null;
278+
let transcriptSegments: TranscriptSegment[] = [];
264279

265280
ws.onopen = () => {
266281
clearTimeout(connectionTimeout);
@@ -422,14 +437,18 @@ export default function RecordPage() {
422437
break;
423438
case "transcribe":
424439
console.log("📝 Transcription:", message.data?.transcript);
425-
if (message.data?.transcript) {
426-
setTranscript((prev) => prev + message.data.transcript);
440+
if (message.data) {
441+
const segment = handleOtoWsTranscribe(message);
442+
transcriptSegments = handleTranscriptSegment(transcriptSegments, segment);
443+
setTranscriptSegments(transcriptSegments);
427444
}
428445
break;
429446
case "transcript-beautify":
430447
console.log("✨ Beautified transcript:", message.data?.transcript);
431-
if (message.data?.transcript) {
432-
setTranscript(message.data.transcript);
448+
if (message.data) {
449+
const beautifyData = handleOtoWsTranscriptBeautify(message);
450+
transcriptSegments = handleTranscriptBeautify(transcriptSegments, beautifyData);
451+
setTranscriptSegments(transcriptSegments);
433452
}
434453
break;
435454
case "detect-action":
@@ -706,7 +725,7 @@ export default function RecordPage() {
706725
const wsState = websocketRef.current?.readyState;
707726
if (wsState === WebSocket.OPEN) {
708727
try {
709-
console.log(`🎤 Sending audio chunk (${event.data.size} bytes) - WebSocket state: ${wsState}`);
728+
//console.log(`🎤 Sending audio chunk (${event.data.size} bytes) - WebSocket state: ${wsState}`);
710729
// Send audio data in JSON format (not binary) for server compatibility
711730
sendRealtimeAudioData(websocketRef.current, event.data, false);
712731
} catch (error) {
@@ -942,11 +961,26 @@ export default function RecordPage() {
942961
{/* Recording Controls */}
943962
<div className="text-center mb-8">
944963
{/* Large Microphone Button - Click to Start/Stop */}
945-
<div className="relative">
964+
<div className="relative mb-4">
965+
{/* Volume Ring Indicator */}
966+
{isStreaming && volume > 0 && (
967+
<div className="absolute w-full h-full flex items-center justify-center top-0 left-0">
968+
<div
969+
className="rounded-full border-4 border-red-200 bg-red-200"
970+
style={{
971+
opacity: Math.min(volume / 50, 1),
972+
//animationDuration: `${Math.max(0.5, 2 - volume / 50)}s`,
973+
width: `auto`,
974+
height: `calc(100% + ${volume*0.7}px)`,
975+
aspectRatio: "1/1",
976+
}}
977+
/>
978+
</div>
979+
)}
946980
<button
947981
onClick={isStreaming ? stopRecording : startRecording}
948982
disabled={connectionStatus === "connecting"}
949-
className={`inline-flex items-center justify-center w-32 h-32 rounded-full mb-4 transition-all duration-300 transform hover:scale-105 active:scale-95 focus:outline-none focus:ring-4 focus:ring-opacity-50 ${
983+
className={`inline-flex items-center justify-center w-32 h-32 rounded-full transition-all duration-300 transform hover:scale-105 active:scale-95 focus:outline-none focus:ring-4 focus:ring-opacity-50 ${
950984
isStreaming
951985
? "bg-red-500 hover:bg-red-600 animate-pulse focus:ring-red-300"
952986
: connectionStatus === "authenticated"
@@ -963,17 +997,6 @@ export default function RecordPage() {
963997
<Mic size={40} className="text-white" />
964998
)}
965999
</button>
966-
967-
{/* Volume Ring Indicator */}
968-
{isStreaming && volume > 0 && (
969-
<div
970-
className="absolute inset-0 rounded-full border-4 border-green-400 animate-ping"
971-
style={{
972-
opacity: Math.min(volume / 50, 1),
973-
animationDuration: `${Math.max(0.5, 2 - volume / 50)}s`,
974-
}}
975-
/>
976-
)}
9771000
</div>
9781001

9791002
{/* Status Text */}
@@ -1002,7 +1025,7 @@ export default function RecordPage() {
10021025

10031026
<div className="text-sm text-gray-500">
10041027
{isStreaming
1005-
? "Click the microphone to stop streaming"
1028+
? "Click the button to stop streaming"
10061029
: connectionStatus === "authenticated"
10071030
? "Click the microphone to start streaming"
10081031
: connectionStatus === "connecting"
@@ -1015,7 +1038,7 @@ export default function RecordPage() {
10151038
{!isStreaming ? (
10161039
<Button
10171040
onClick={startRecording}
1018-
className="px-6 py-2 text-sm"
1041+
className="px-6 py-2 text-sm hidden"
10191042
size="sm"
10201043
disabled={connectionStatus === "connecting"}
10211044
variant="outline"
@@ -1026,7 +1049,7 @@ export default function RecordPage() {
10261049
<Button
10271050
onClick={stopRecording}
10281051
variant="destructive"
1029-
className="px-6 py-2 text-sm"
1052+
className="px-6 py-2 text-sm hidden"
10301053
size="sm"
10311054
>
10321055
<Square size={16} className="mr-1" />
@@ -1043,8 +1066,8 @@ export default function RecordPage() {
10431066
<h3 className="text-lg font-semibold text-gray-900 mb-3">
10441067
Real-time Transcription
10451068
</h3>
1046-
<div className="text-gray-700 leading-relaxed whitespace-pre-wrap">
1047-
{transcript}
1069+
<div className="text-gray-700 leading-relaxed whitespace-pre-wrap overflow-y-auto max-h-[320px] scroll-smooth" ref={transcriptContentRef}>
1070+
{handleFormatTranscript(transcriptSegments)}
10481071
</div>
10491072
</div>
10501073
)}

0 commit comments

Comments
 (0)