33 changes: 33 additions & 0 deletions frontend_vapi/lib/oto-websocket.ts
@@ -0,0 +1,33 @@
import { TranscriptBeautifyData, TranscriptSegment } from "./transcript";

export interface WebSocketMessage {
  type: "transcribe" | "transcript-beautify" | "detect-action" | "error";
  data?: any;
  message?: string;
}

export const handleOtoWsTranscribe = (message: WebSocketMessage) => {
  const segment: TranscriptSegment = {
    audioStart: message.data.audioStart || 0,
    audioEnd: message.data.audioEnd || 0,
    transcript: message.data.transcript,
    finalized: message.data.finalized,
    beautified: false,
    id: message.data.finalized
      ? `${message.data.audioStart || 0}-${message.data.audioEnd || 0}`
      : "partial-current", // Use simple ID for partial transcripts
  };

  return segment;
};

export const handleOtoWsTranscriptBeautify = (message: WebSocketMessage) => {
  const beautifyData: TranscriptBeautifyData = {
    audioStart: message.data.audioStart,
    audioEnd: message.data.audioEnd,
    transcript: message.data.transcript,
    segments: message.data.segments,
  };

  return beautifyData;
};
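For reference, a minimal sketch of how these helpers are expected to be wired into a socket handler. `attachTranscriptHandlers` and its local segment list are placeholders for illustration; the real wiring in this PR lives in `record.tsx` below and uses React state.

```ts
import { WebSocketMessage, handleOtoWsTranscribe, handleOtoWsTranscriptBeautify } from "./oto-websocket";
import { TranscriptSegment, handleTranscriptSegment, handleTranscriptBeautify } from "./transcript";

// Hypothetical wiring: keep a local segment list and update it per message type.
export const attachTranscriptHandlers = (ws: WebSocket) => {
  let segments: TranscriptSegment[] = [];

  ws.onmessage = (event) => {
    // Assumes the server sends JSON text frames.
    const message: WebSocketMessage = JSON.parse(event.data);
    if (!message.data) return;

    switch (message.type) {
      case "transcribe":
        segments = handleTranscriptSegment(segments, handleOtoWsTranscribe(message));
        break;
      case "transcript-beautify":
        segments = handleTranscriptBeautify(segments, handleOtoWsTranscriptBeautify(message));
        break;
    }
  };
};
```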
10 changes: 10 additions & 0 deletions frontend_vapi/lib/time.ts
@@ -0,0 +1,10 @@
export const formatTimestamp = (milliseconds: number): string => {
  const totalSeconds = Math.floor(milliseconds / 1000);
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor((totalSeconds % 3600) / 60);
  const seconds = totalSeconds % 60;

  return `${hours.toString().padStart(2, "0")}:${minutes
    .toString()
    .padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
};
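Quick sanity check of the expected output (illustrative values only):

```ts
formatTimestamp(0);         // "00:00:00"
formatTimestamp(83_000);    // "00:01:23"
formatTimestamp(3_661_500); // "01:01:01" (fractional seconds are floored)
```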
97 changes: 97 additions & 0 deletions frontend_vapi/lib/transcript.ts
@@ -0,0 +1,97 @@
import { formatTimestamp } from "./time";

export interface TranscriptSegment {
  audioStart: number;
  audioEnd: number;
  transcript: string;
  finalized: boolean;
  beautified: boolean;
  id?: string; // Unique identifier for tracking partial updates
}

export interface TranscriptBeautifyData {
  audioStart: number;
  audioEnd: number;
  transcript: string;
  segments: TranscriptSegment[];
}

export const handleTranscriptSegment = (
  prev: TranscriptSegment[],
  segment: TranscriptSegment
): TranscriptSegment[] => {
  if (segment.finalized) {
    // For finalized segments, remove any existing partial transcript and add the finalized one
    const filteredSegments = prev.filter((s) => s.finalized); // Remove partial transcripts
    return [
      ...filteredSegments,
      { ...segment, id: `${segment.audioStart}-${segment.audioEnd}` },
    ];
  } else {
    // For partial segments, replace any existing partial transcript
    const finalizedSegments = prev.filter((s) => s.finalized); // Keep only finalized segments
    return [
      ...finalizedSegments,
      {
        ...segment,
        id: "partial-current",
        // Sentinel timestamps keep the partial segment last when sorting by audioStart
        audioStart: 999999999,
        audioEnd: 999999999,
      },
    ]; // Add current partial
  }
};

export const handleTranscriptBeautify = (
  prev: TranscriptSegment[],
  beautifyData: TranscriptBeautifyData
): TranscriptSegment[] => {
  // Filter out segments that fall within the beautified range
  const filteredSegments = prev.filter(
    (segment) =>
      segment.audioEnd <= beautifyData.audioStart ||
      segment.audioStart >= beautifyData.audioEnd
  );

  const beautifiedSegments = beautifyData.segments.map((segment) => ({
    audioStart: segment.audioStart,
    audioEnd: segment.audioEnd,
    transcript: segment.transcript,
    finalized: true,
    beautified: true,
  }));

  // Insert in chronological order
  const newSegments = [...filteredSegments, ...beautifiedSegments];
  return newSegments.sort((a, b) => a.audioStart - b.audioStart);
};

export const handleFormatTranscript = (transcriptSegments: TranscriptSegment[]): string => {
  // Sort a copy so the caller's array (typically React state) is not mutated
  return [...transcriptSegments]
    .sort((a, b) => a.audioStart - b.audioStart)
    .map((segment) => {
      let ret = '';

      // For partial transcripts, don't show timing (might not be meaningful)
      if (segment.finalized) {
        const startTime = formatTimestamp(segment.audioStart);
        const endTime = formatTimestamp(segment.audioEnd);
        ret = `[${startTime}-${endTime}]`;
      } else {
        ret = '[Live]'; // Show "Live" for partial transcripts
      }

      // Add status indicators
      if (segment.beautified) {
        ret += ' ✓'; // Beautified
      } else if (segment.finalized) {
        ret += ' *'; // Finalized but not beautified
      } else {
        ret += ' ~'; // Partial/interim transcript
      }

      ret += `\n${segment.transcript}`;

      return ret;
    })
    .join('\n\n');
};
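An illustrative walk-through of the intended segment lifecycle. The values below are made up; the real data comes from the WebSocket messages handled in `oto-websocket.ts`.

```ts
import { TranscriptSegment, handleTranscriptSegment, handleTranscriptBeautify, handleFormatTranscript } from "./transcript";

let segments: TranscriptSegment[] = [];

// 1. A partial (non-finalized) segment replaces any previous partial one.
segments = handleTranscriptSegment(segments, {
  audioStart: 0, audioEnd: 0, transcript: "hello wor", finalized: false, beautified: false,
});

// 2. The finalized version removes the partial and gets a timestamp-based ID.
segments = handleTranscriptSegment(segments, {
  audioStart: 0, audioEnd: 2100, transcript: "hello world", finalized: true, beautified: false,
});

// 3. A beautify result replaces every segment inside its audio range.
segments = handleTranscriptBeautify(segments, {
  audioStart: 0, audioEnd: 2100, transcript: "Hello, world.",
  segments: [{ audioStart: 0, audioEnd: 2100, transcript: "Hello, world.", finalized: true, beautified: true }],
});

// Renders something like: "[00:00:00-00:00:02] ✓\nHello, world."
console.log(handleFormatTranscript(segments));
```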
69 changes: 46 additions & 23 deletions frontend_vapi/pages/record.tsx
@@ -13,6 +13,8 @@ import {
sendRealtimeAudioData,
validateUUID,
} from "../lib/oto-api";
import { handleFormatTranscript, handleTranscriptBeautify, handleTranscriptSegment, TranscriptSegment } from "../lib/transcript";
import { handleOtoWsTranscribe, handleOtoWsTranscriptBeautify } from "../lib/oto-websocket";

/**
* 日常会話録音画面 - リアルタイム音声ストリーミング版
@@ -30,6 +32,7 @@ export default function RecordPage() {
const [isStreaming, setIsStreaming] = useState(false);
const [hasPermission, setHasPermission] = useState(false);
const [volume, setVolume] = useState(0);
const [lastVolumeSetDateTime, setLastVolumeSetDateTime] = useState<Date | null>(null);

// Audio streaming references
const streamRef = useRef<MediaStream | null>(null);
Expand All @@ -38,6 +41,15 @@ export default function RecordPage() {
const audioContextRef = useRef<AudioContext | null>(null);
const animationFrameRef = useRef<number | null>(null);

// transcript segments
const [transcriptSegments, setTranscriptSegments] = useState<TranscriptSegment[]>([]);
const transcriptContentRef = useRef<HTMLDivElement>(null);
useEffect(() => {
if (transcriptContentRef.current && transcriptSegments.length > 0) {
transcriptContentRef.current.scrollTop = transcriptContentRef.current.scrollHeight;
}
}, [transcriptSegments]);

// Audio streaming statistics
const [audioStats, setAudioStats] = useState({
totalChunks: 0,
@@ -91,7 +103,9 @@ export default function RecordPage() {
}
const rms = Math.sqrt(sum / bufferLength);
const volumeLevel = Math.round((rms / 255) * 100);

setVolume(volumeLevel);
setLastVolumeSetDateTime(new Date());

// Continue monitoring
animationFrameRef.current = requestAnimationFrame(monitorVolume);
@@ -261,6 +275,7 @@ export default function RecordPage() {
}, 10000);

let authTimeout: NodeJS.Timeout | null = null;
let transcriptSegments: TranscriptSegment[] = [];

ws.onopen = () => {
clearTimeout(connectionTimeout);
@@ -422,14 +437,18 @@
break;
case "transcribe":
console.log("📝 Transcription:", message.data?.transcript);
if (message.data?.transcript) {
setTranscript((prev) => prev + message.data.transcript);
if (message.data) {
const segment = handleOtoWsTranscribe(message);
transcriptSegments = handleTranscriptSegment(transcriptSegments, segment);
setTranscriptSegments(transcriptSegments);
}
break;
case "transcript-beautify":
console.log("✨ Beautified transcript:", message.data?.transcript);
if (message.data?.transcript) {
setTranscript(message.data.transcript);
if (message.data) {
const beautifyData = handleOtoWsTranscriptBeautify(message);
transcriptSegments = handleTranscriptBeautify(transcriptSegments, beautifyData);
setTranscriptSegments(transcriptSegments);
}
break;
case "detect-action":
@@ -706,7 +725,7 @@ export default function RecordPage() {
const wsState = websocketRef.current?.readyState;
if (wsState === WebSocket.OPEN) {
try {
console.log(`🎤 Sending audio chunk (${event.data.size} bytes) - WebSocket state: ${wsState}`);
//console.log(`🎤 Sending audio chunk (${event.data.size} bytes) - WebSocket state: ${wsState}`);
// Send audio data in JSON format (not binary) for server compatibility
sendRealtimeAudioData(websocketRef.current, event.data, false);
} catch (error) {
@@ -942,11 +961,26 @@
{/* Recording Controls */}
<div className="text-center mb-8">
{/* Large Microphone Button - Click to Start/Stop */}
<div className="relative">
<div className="relative mb-4">
{/* Volume Ring Indicator */}
{isStreaming && volume > 0 && (
<div className="absolute w-full h-full flex items-center justify-center top-0 left-0">
<div
className="rounded-full border-4 border-red-200 bg-red-200"
style={{
opacity: Math.min(volume / 50, 1),
//animationDuration: `${Math.max(0.5, 2 - volume / 50)}s`,
width: `auto`,
height: `calc(100% + ${volume*0.7}px)`,
aspectRatio: "1/1",
}}
/>
</div>
)}
<button
onClick={isStreaming ? stopRecording : startRecording}
disabled={connectionStatus === "connecting"}
className={`inline-flex items-center justify-center w-32 h-32 rounded-full mb-4 transition-all duration-300 transform hover:scale-105 active:scale-95 focus:outline-none focus:ring-4 focus:ring-opacity-50 ${
className={`inline-flex items-center justify-center w-32 h-32 rounded-full transition-all duration-300 transform hover:scale-105 active:scale-95 focus:outline-none focus:ring-4 focus:ring-opacity-50 ${
isStreaming
? "bg-red-500 hover:bg-red-600 animate-pulse focus:ring-red-300"
: connectionStatus === "authenticated"
@@ -963,17 +997,6 @@
<Mic size={40} className="text-white" />
)}
</button>

{/* Volume Ring Indicator */}
{isStreaming && volume > 0 && (
<div
className="absolute inset-0 rounded-full border-4 border-green-400 animate-ping"
style={{
opacity: Math.min(volume / 50, 1),
animationDuration: `${Math.max(0.5, 2 - volume / 50)}s`,
}}
/>
)}
</div>

{/* Status Text */}
@@ -1002,7 +1025,7 @@

<div className="text-sm text-gray-500">
{isStreaming
? "Click the microphone to stop streaming"
? "Click the button to stop streaming"
: connectionStatus === "authenticated"
? "Click the microphone to start streaming"
: connectionStatus === "connecting"
@@ -1015,7 +1038,7 @@
{!isStreaming ? (
<Button
onClick={startRecording}
className="px-6 py-2 text-sm"
className="px-6 py-2 text-sm hidden"
size="sm"
disabled={connectionStatus === "connecting"}
variant="outline"
@@ -1026,7 +1049,7 @@
<Button
onClick={stopRecording}
variant="destructive"
className="px-6 py-2 text-sm"
className="px-6 py-2 text-sm hidden"
size="sm"
>
<Square size={16} className="mr-1" />
@@ -1043,8 +1066,8 @@
<h3 className="text-lg font-semibold text-gray-900 mb-3">
Real-time Transcription
</h3>
<div className="text-gray-700 leading-relaxed whitespace-pre-wrap">
{transcript}
<div className="text-gray-700 leading-relaxed whitespace-pre-wrap overflow-y-auto max-h-[320px] scroll-smooth" ref={transcriptContentRef}>
{handleFormatTranscript(transcriptSegments)}
</div>
</div>
)}
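One note on the `record.tsx` wiring: `ws.onmessage` maintains a local `transcriptSegments` array that shadows the React state of the same name and is pushed into state after each message. A functional state update would achieve the same stale-closure avoidance without the shadowed variable. A minimal sketch of that alternative (not what the PR does, same helpers, hypothetical `applyOtoMessage` wrapper):

```ts
import type { Dispatch, SetStateAction } from "react";
import { WebSocketMessage, handleOtoWsTranscribe, handleOtoWsTranscriptBeautify } from "../lib/oto-websocket";
import { TranscriptSegment, handleTranscriptSegment, handleTranscriptBeautify } from "../lib/transcript";

// Alternative sketch: funnel each socket message through React functional updates,
// so the handler never reads a stale segments array.
const applyOtoMessage = (
  message: WebSocketMessage,
  setTranscriptSegments: Dispatch<SetStateAction<TranscriptSegment[]>>
) => {
  if (!message.data) return;
  if (message.type === "transcribe") {
    const segment = handleOtoWsTranscribe(message);
    setTranscriptSegments((prev) => handleTranscriptSegment(prev, segment));
  } else if (message.type === "transcript-beautify") {
    const beautifyData = handleOtoWsTranscriptBeautify(message);
    setTranscriptSegments((prev) => handleTranscriptBeautify(prev, beautifyData));
  }
};
```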