Skip to content

Commit bb67e70

Browse files
committed
feat(audio-chat): add support for optional remote_audio_id which we can use in the history instead of the transcripts
Signed-off-by: Julien Veyssier <julien-nc@posteo.net>
1 parent 403400b commit bb67e70

File tree

2 files changed

+41
-11
lines changed

2 files changed

+41
-11
lines changed

lib/Controller/ChattyLLMController.php

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -500,15 +500,6 @@ public function generateForSession(int $sessionId, int $agencyConfirm = 0): JSON
500500
do {
501501
$lastUserMessage = array_pop($history);
502502
} while ($lastUserMessage->getRole() !== 'human');
503-
// history is a list of JSON strings
504-
// we ignore audio attachments here because they are supposed to have been transcribed, the content is the transcription
505-
// this makes the history smaller
506-
$history = array_map(static function (Message $message) {
507-
return json_encode([
508-
'role' => $message->getRole(),
509-
'content' => $message->getContent(),
510-
]);
511-
}, $history);
512503

513504
$lastAttachments = $lastUserMessage->jsonSerialize()['attachments'];
514505
$audioAttachment = $lastAttachments[0] ?? null;
@@ -522,13 +513,22 @@ public function generateForSession(int $sessionId, int $agencyConfirm = 0): JSON
522513
&& class_exists('OCP\\TaskProcessing\\TaskTypes\\AudioToAudioChat')
523514
&& isset($this->taskProcessingManager->getAvailableTaskTypes()[\OCP\TaskProcessing\TaskTypes\AudioToAudioChat::ID])
524515
) {
516+
// for an audio chat task, let's try to get the remote audio IDs for all the previous audio messages
517+
$history = $this->getAudioHistory($history);
525518
$fileId = $audioAttachment['file_id'];
526519
try {
527520
$taskId = $this->scheduleAudioChatTask($fileId, $systemPrompt, $history, $sessionId, $lastUserMessage->getId());
528521
} catch (\Exception $e) {
529522
return new JSONResponse(['error' => $e->getMessage()], Http::STATUS_BAD_REQUEST);
530523
}
531524
} else {
525+
// for a text chat task, let's only use text in the history
526+
$history = array_map(static function (Message $message) {
527+
return json_encode([
528+
'role' => $message->getRole(),
529+
'content' => $message->getContent(),
530+
]);
531+
}, $history);
532532
try {
533533
$taskId = $this->scheduleLLMChatTask($lastUserMessage->getContent(), $systemPrompt, $history, $sessionId);
534534
} catch (\Exception $e) {
@@ -540,6 +540,27 @@ public function generateForSession(int $sessionId, int $agencyConfirm = 0): JSON
540540
return new JSONResponse(['taskId' => $taskId]);
541541
}
542542

543+
/**
 * Build the conversation history that is passed to an audio chat task.
 *
 * Every message becomes one JSON string that always contains its "role".
 * An assistant message whose first attachment has the type "Audio" and carries
 * a "remote_audio_id" is referenced as "audio": {"id": <remote audio ID>}.
 * Any other message falls back to its text "content" (for audio messages this
 * is presumably the transcription - TODO confirm against the task listener).
 *
 * @param Message[] $history messages preceding the one being answered
 * @return array one json_encode() result per message, array keys are preserved
 */
private function getAudioHistory(array $history): array {
	return array_map(static function (Message $message) {
		$entry = [
			'role' => $message->getRole(),
		];
		// the attachments can be missing, empty or malformed: only use the first one if it is a real attachment array
		// (calling count() on a non-array would throw a TypeError with PHP 8)
		$firstAttachment = $message->jsonSerialize()['attachments'][0] ?? null;
		if ($message->getRole() === 'assistant'
			&& is_array($firstAttachment)
			&& ($firstAttachment['type'] ?? null) === 'Audio'
			&& isset($firstAttachment['remote_audio_id'])
		) {
			// the remote service already knows this audio, so reference it by ID instead of sending text
			$entry['audio'] = ['id' => $firstAttachment['remote_audio_id']];
		} else {
			$entry['content'] = $message->getContent();
		}
		return json_encode($entry);
	}, $history);
}
563+
543564
/**
544565
* Regenerate response for a message
545566
*

lib/Listener/ChattyLLMTaskListener.php

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,11 @@ public function handle(Event $event): void {
7979
$message->setContent($outputTranscript);
8080
// agency might not return any output but just ask for confirmation
8181
if ($outputTranscript !== '') {
82-
$message->setAttachments('[{"type":"Audio","file_id":' . $task->getOutput()['output'] . '}]');
82+
$attachment = ['type' => 'Audio', 'file_id' => $task->getOutput()['output']];
83+
if (isset($task->getOutput()['audio_id'])) {
84+
$attachment['remote_audio_id'] = $task->getOutput()['audio_id'];
85+
}
86+
$message->setAttachments(json_encode([$attachment]));
8387
}
8488
// now we have the transcription of the user audio input
8589
if (preg_match('/^chatty-llm:\d+:(\d+)$/', $customId, $matches)) {
@@ -152,6 +156,11 @@ private function runTtsTask(Message $message, ?string $userId): void {
152156
$speechFileId = $ttsTaskOutput['speech'];
153157
// we need to set "ocp_task_id" here because the file is not an output of the task that produced the message
154158
// and we need the task ID + the file ID to load the audio file in the frontend
155-
$message->setAttachments('[{"type":"Audio","file_id":' . $speechFileId . ',"ocp_task_id":' . $task->getId() . '}]');
159+
$attachment = [
160+
'type' => 'Audio',
161+
'file_id' => $speechFileId,
162+
'ocp_task_id' => $task->getId(),
163+
];
164+
$message->setAttachments(json_encode([$attachment]));
156165
}
157166
}

0 commit comments

Comments
 (0)