Skip to content

Commit 11191b8

Browse files
Implement basic Whisper transcription
1 parent fafa981 commit 11191b8

File tree

3 files changed

+219
-26
lines changed

3 files changed

+219
-26
lines changed
Lines changed: 177 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,43 @@
11
package org.myrobotlab.service;
22

3-
import org.myrobotlab.framework.Service;
4-
import org.myrobotlab.service.config.ServiceConfig;
3+
import io.github.givimad.whisperjni.WhisperContext;
4+
import io.github.givimad.whisperjni.WhisperFullParams;
5+
import io.github.givimad.whisperjni.WhisperJNI;
6+
import org.myrobotlab.framework.Platform;
7+
import org.myrobotlab.service.abstracts.AbstractSpeechRecognizer;
8+
import org.myrobotlab.service.config.LlamaConfig;
9+
import org.myrobotlab.service.config.WhisperConfig;
10+
import org.myrobotlab.service.data.Locale;
11+
12+
import javax.sound.sampled.AudioFormat;
13+
import javax.sound.sampled.AudioSystem;
14+
import javax.sound.sampled.Line;
15+
import javax.sound.sampled.LineUnavailableException;
16+
import javax.sound.sampled.Mixer;
17+
import javax.sound.sampled.TargetDataLine;
18+
import java.io.File;
19+
import java.io.FileOutputStream;
20+
import java.io.IOException;
21+
import java.net.URL;
22+
import java.nio.ByteBuffer;
23+
import java.nio.ByteOrder;
24+
import java.nio.ShortBuffer;
25+
import java.nio.channels.Channels;
26+
import java.nio.channels.FileChannel;
27+
import java.nio.channels.ReadableByteChannel;
28+
import java.nio.file.Path;
29+
import java.util.Map;
30+
31+
public class Whisper extends AbstractSpeechRecognizer<WhisperConfig> {
32+
private transient WhisperJNI whisper;
33+
34+
private transient WhisperContext ctx;
35+
36+
private transient WhisperFullParams params;
37+
38+
private transient Thread listeningThread = new Thread();
39+
540

6-
public class Whisper extends Service<ServiceConfig> {
741
/**
842
* Constructor of service; reservedKey is typically the service's name and inId
943
* will be its process id
@@ -14,4 +48,144 @@ public class Whisper extends Service<ServiceConfig> {
1448
public Whisper(String reservedKey, String inId) {
1549
super(reservedKey, inId);
1650
}
51+
52+
public void loadModel(String modelPath) {
53+
try {
54+
whisper = new WhisperJNI();
55+
WhisperJNI.loadLibrary();
56+
ctx = whisper.init(Path.of(modelPath));
57+
} catch (IOException e) {
58+
throw new RuntimeException(e);
59+
}
60+
61+
params = new WhisperFullParams();
62+
params.nThreads = Platform.getLocalInstance().getNumPhysicalProcessors();
63+
params.printRealtime = true;
64+
params.printProgress = true;
65+
66+
}
67+
68+
public String findModelPath(String modelName) {
69+
// First, we loop over all user-defined
70+
// model directories
71+
for (String dir : config.modelPaths) {
72+
File path = new File(dir + fs + modelName);
73+
if (path.exists()) {
74+
return path.getAbsolutePath();
75+
}
76+
}
77+
78+
// Now, we check our data directory for any downloaded models
79+
File path = new File(getDataDir() + fs + modelName);
80+
if (path.exists()) {
81+
return path.getAbsolutePath();
82+
} else if (config.modelUrls.containsKey(modelName)) {
83+
// Model was not in data but we do have a URL for it
84+
try (FileOutputStream fileOutputStream = new FileOutputStream(path)) {
85+
ReadableByteChannel readableByteChannel = Channels.newChannel(new URL(config.modelUrls.get(modelName)).openStream());
86+
FileChannel fileChannel = fileOutputStream.getChannel();
87+
info("Downloading model %s to path %s from URL %s", modelName, path, config.modelUrls.get(modelName));
88+
fileChannel.transferFrom(readableByteChannel, 0, Long.MAX_VALUE);
89+
} catch (IOException e) {
90+
throw new RuntimeException(e);
91+
}
92+
return path.getAbsolutePath();
93+
}
94+
// Cannot find the model anywhere
95+
error("Could not locate model {}, add its URL to download it or add a directory where it is located", modelName);
96+
return null;
97+
}
98+
99+
@Override
100+
public void startListening() {
101+
102+
listeningThread = new Thread(() -> {
103+
AudioFormat format = new AudioFormat(16000.0f, 16, 1, true, false);
104+
TargetDataLine microphone = null;
105+
106+
Mixer.Info[] mixerInfos = AudioSystem.getMixerInfo();
107+
for (Mixer.Info info: mixerInfos){
108+
Mixer m = AudioSystem.getMixer(info);
109+
Line.Info[] lineInfos = m.getTargetLineInfo();
110+
for (Line.Info lineInfo:lineInfos){
111+
System.out.println (info.getName()+"---"+lineInfo);
112+
// Hard-code for my mic right now
113+
if (info.getName().contains("U0x46d0x825")) {
114+
try {
115+
microphone = (TargetDataLine) m.getLine(lineInfo);
116+
microphone.open(format);
117+
System.out.println("Sample rate: " + format.getSampleRate());
118+
} catch (LineUnavailableException e) {
119+
throw new RuntimeException(e);
120+
}
121+
}
122+
123+
}
124+
125+
}
126+
127+
int numBytesRead;
128+
129+
microphone.start();
130+
while(config.listening) {
131+
int CHUNK_SIZE = (int)((format.getFrameSize() * format.getFrameRate())) * 5;
132+
ByteBuffer captureBuffer = ByteBuffer.allocate(CHUNK_SIZE);
133+
captureBuffer.order(ByteOrder.LITTLE_ENDIAN);
134+
numBytesRead = microphone.read(captureBuffer.array(), 0, CHUNK_SIZE);
135+
System.out.println("Num bytes read=" + numBytesRead);
136+
ShortBuffer shortBuffer = captureBuffer.asShortBuffer();
137+
// transform the samples to f32 samples
138+
float[] samples = new float[captureBuffer.capacity() / 2];
139+
int index = 0;
140+
shortBuffer.position(0);
141+
while (shortBuffer.hasRemaining()) {
142+
samples[index++] = Float.max(-1f, Float.min(((float) shortBuffer.get()) / (float) Short.MAX_VALUE, 1f));
143+
}
144+
int result = whisper.full(ctx, params, samples, samples.length);
145+
if(result != 0) {
146+
throw new RuntimeException("Transcription failed with code " + result);
147+
}
148+
int numSegments = whisper.fullNSegments(ctx);
149+
System.out.println("Inference done, numSegments=" + numSegments);
150+
for (int i = 0; i < numSegments; i++) {
151+
System.out.println(whisper.fullGetSegmentText(ctx, i));
152+
invoke("publishRecognized", whisper.fullGetSegmentText(ctx, i));
153+
}
154+
155+
}
156+
microphone.close();
157+
});
158+
super.startListening();
159+
160+
listeningThread.start();
161+
}
162+
163+
@Override
164+
public WhisperConfig apply(WhisperConfig c) {
165+
super.apply(c);
166+
167+
if (config.selectedModel != null && !config.selectedModel.isEmpty()) {
168+
String modelPath = findModelPath(config.selectedModel);
169+
if (modelPath != null) {
170+
loadModel(modelPath);
171+
} else {
172+
error("Could not find selected model {}", config.selectedModel);
173+
}
174+
}
175+
176+
return config;
177+
}
178+
179+
/**
180+
* locales this service supports - implementation can simply get
181+
* runtime.getLocales() if acceptable or create their own locales
182+
*
183+
* @return map of string to locale
184+
*/
185+
@Override
186+
public Map<String, Locale> getLocales() {
187+
return null;
188+
}
189+
190+
17191
}

src/main/java/org/myrobotlab/service/abstracts/AbstractSpeechRecognizer.java

Lines changed: 19 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ public void clearLock() {
168168
*/
169169
@Override
170170
public String getWakeWord() {
171-
SpeechRecognizerConfig c = (SpeechRecognizerConfig)config;
171+
SpeechRecognizerConfig c = config;
172172
return c.wakeWord;
173173
}
174174

@@ -177,17 +177,16 @@ public String getWakeWord() {
177177
*/
178178
@Override
179179
public boolean isListening() {
180-
SpeechRecognizerConfig c = (SpeechRecognizerConfig)config;
180+
SpeechRecognizerConfig c = config;
181181
return c.listening;
182182
}
183183

184184
@Override
185185
@Deprecated /* use publishListening(boolean event) */
186186
public void listeningEvent(Boolean event) {
187-
SpeechRecognizerConfig c = (SpeechRecognizerConfig)config;
187+
SpeechRecognizerConfig c = config;
188188
c.listening = event;
189189
broadcastState();
190-
return;
191190
}
192191

193192
@Override
@@ -213,12 +212,12 @@ public void onEndSpeaking(String utterance) {
213212
// affect "recognizing"
214213
// FIXME - add a delta time after ...
215214

216-
SpeechRecognizerConfig c = (SpeechRecognizerConfig)config;
215+
SpeechRecognizerConfig c = config;
217216

218217
if (c.afterSpeakingPauseMs > 0) {
219218
// remove previous one shot - because we are "sliding" the window of
220219
// stopping the publishing of recognized words
221-
addTaskOneShot(c.afterSpeakingPauseMs, "setSpeaking", new Object[] { false });
220+
addTaskOneShot(c.afterSpeakingPauseMs, "setSpeaking", false);
222221
log.warn("isSpeaking = false will occur in {} ms", c.afterSpeakingPauseMs);
223222
} else {
224223
setSpeaking(false, null);
@@ -233,17 +232,16 @@ public void onAudioStart(AudioData data) {
233232
purgeTask("setSpeaking");
234233
// isSpeaking = true;
235234
setSpeaking(true, data.getFileName());
236-
return;
237235
}
238236

239237
@Override
240238
public void onAudioEnd(AudioData data) {
241239
log.info("sound stopped {}", data);
242-
SpeechRecognizerConfig c = (SpeechRecognizerConfig)config;
240+
SpeechRecognizerConfig c = config;
243241
if (c.afterSpeakingPauseMs > 0) {
244242
// remove previous one shot - because we are "sliding" the window of
245243
// stopping the publishing of recognized words
246-
addTaskOneShot(c.afterSpeakingPauseMs, "setSpeaking", new Object[] { false });
244+
addTaskOneShot(c.afterSpeakingPauseMs, "setSpeaking", false);
247245
log.warn("isSpeaking = false will occur in {} ms", c.afterSpeakingPauseMs);
248246
} else {
249247
setSpeaking(false, null);
@@ -264,7 +262,7 @@ public boolean setSpeaking(boolean b, String utterance) {
264262

265263
ListeningEvent event = new ListeningEvent();
266264

267-
SpeechRecognizerConfig c = (SpeechRecognizerConfig)config;
265+
SpeechRecognizerConfig c = config;
268266
event.isRecording = c.recording;
269267
event.isListening = c.listening;
270268
event.isAwake = isAwake;
@@ -289,7 +287,6 @@ public void onStartSpeaking(String utterance) {
289287
purgeTask("setSpeaking");
290288
// isSpeaking = true;
291289
setSpeaking(true, utterance);
292-
return;
293290
}
294291

295292
@Override
@@ -304,11 +301,10 @@ public void pauseListening() {
304301
public ListeningEvent[] processResults(ListeningEvent[] results) {
305302
// at the moment its simply invoking other methods, but if a new speech
306303
// recognizer is created - it might need more processing
307-
SpeechRecognizerConfig c = (SpeechRecognizerConfig)config;
304+
SpeechRecognizerConfig c = config;
308305

309306

310-
for (int i = 0; i < results.length; ++i) {
311-
ListeningEvent event = results[i];
307+
for (ListeningEvent event : results) {
312308
event.isRecording = c.recording;
313309
event.isListening = c.listening;
314310
event.isAwake = isAwake;
@@ -366,7 +362,7 @@ public void setAwake(boolean b) {
366362
}
367363

368364
public void setAwake(boolean b, String text) {
369-
SpeechRecognizerConfig c = (SpeechRecognizerConfig)config;
365+
SpeechRecognizerConfig c = config;
370366

371367
if (!b && isSpeaking) {
372368
log.info("bot is speaking - bot doesn't get tired when talking about self sliding idle timeout");
@@ -463,7 +459,7 @@ public void setLowerCase(boolean b) {
463459
*/
464460
@Override
465461
public void setWakeWord(String word) {
466-
SpeechRecognizerConfig c = (SpeechRecognizerConfig)config;
462+
SpeechRecognizerConfig c = config;
467463

468464
if (word == null || word.trim().length() == 0) {
469465
word = null;
@@ -487,7 +483,7 @@ public void setWakeWord(String word) {
487483
*
488484
*/
489485
public void setWakeWordTimeout(Integer wakeWordTimeoutSeconds) {
490-
SpeechRecognizerConfig c = (SpeechRecognizerConfig)config;
486+
SpeechRecognizerConfig c = config;
491487
c.wakeWordIdleTimeoutSeconds = wakeWordTimeoutSeconds;
492488
broadcastState();
493489
}
@@ -496,7 +492,7 @@ public void setWakeWordTimeout(Integer wakeWordTimeoutSeconds) {
496492
@Override
497493
public void startListening() {
498494
log.debug("Start listening event seen.");
499-
SpeechRecognizerConfig c = (SpeechRecognizerConfig)config;
495+
SpeechRecognizerConfig c = config;
500496
c.listening = true;
501497
c.recording = true;
502498
broadcastState();
@@ -518,7 +514,7 @@ public void setAutoListen(Boolean value) {
518514
*/
519515
@Override
520516
public void startRecording() {
521-
SpeechRecognizerConfig c = (SpeechRecognizerConfig)config;
517+
SpeechRecognizerConfig c = config;
522518
c.recording = true;
523519
broadcastState();
524520
}
@@ -531,7 +527,7 @@ public void startRecording() {
531527
@Override
532528
public void stopListening() {
533529
log.debug("stopListening()");
534-
SpeechRecognizerConfig c = (SpeechRecognizerConfig)config;
530+
SpeechRecognizerConfig c = config;
535531
c.listening = false;
536532
broadcastState();
537533
}
@@ -542,7 +538,7 @@ public void stopListening() {
542538

543539
@Override
544540
public void stopRecording() {
545-
SpeechRecognizerConfig c = (SpeechRecognizerConfig)config;
541+
SpeechRecognizerConfig c = config;
546542
c.recording = false;
547543
broadcastState();
548544
}
@@ -555,13 +551,13 @@ public void stopService() {
555551
}
556552

557553
public long setAfterSpeakingPause(long ms) {
558-
SpeechRecognizerConfig c = (SpeechRecognizerConfig)config;
554+
SpeechRecognizerConfig c = config;
559555
c.afterSpeakingPauseMs = ms;
560556
return c.afterSpeakingPauseMs;
561557
}
562558

563559
public long getAfterSpeakingPause() {
564-
SpeechRecognizerConfig c = (SpeechRecognizerConfig)config;
560+
SpeechRecognizerConfig c = config;
565561
return c.afterSpeakingPauseMs;
566562
}
567563

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package org.myrobotlab.service.config;
2+
3+
import java.util.ArrayList;
4+
import java.util.HashMap;
5+
import java.util.List;
6+
import java.util.Map;
7+
8+
public class WhisperConfig extends SpeechRecognizerConfig {
9+
public String selectedModel = "ggml-tiny.en.bin";
10+
11+
public List<String> modelPaths = new ArrayList<>(List.of(
12+
13+
));
14+
15+
public Map<String, String> modelUrls = new HashMap<>(Map.of(
16+
"ggml-tiny.bin", "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin",
17+
"ggml-small.bin", "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin",
18+
"ggml-tiny.en.bin", "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin",
19+
"ggml-small.en.bin", "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin",
20+
"ggml-medium-q5_0.bin", "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium-q5_0.bin",
21+
"ggml-medium.en-q5_0.bin", "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en-q5_0.bin"
22+
));
23+
}

0 commit comments

Comments
 (0)