Skip to content

Commit 635bff5

Browse files
authored
Support longform voxtral processing (#1375)
1 parent beb1d17 commit 635bff5

File tree

1 file changed

+37
-3
lines changed

1 file changed

+37
-3
lines changed

src/models/voxtral/processing_voxtral.js

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,20 @@ const AUDIO_TOKEN = "[AUDIO]";
77
const BEGIN_AUDIO_TOKEN = "[BEGIN_AUDIO]";
88
const NUM_AUDIO_TOKENS = 375;
99

10+
/**
 * Helper function to split audio into non-overlapping chunks of n_samples.
 *
 * Each chunk is a view (`subarray`) onto the input's underlying buffer, not a copy.
 * Every chunk has exactly `n_samples` elements except possibly the last one, which
 * holds the remainder. An empty input produces an empty array.
 *
 * @param {Float32Array} audio The audio samples to split.
 * @param {number} n_samples Maximum number of samples per chunk. Must be a positive number.
 * @returns {Float32Array[]} The chunks, in their original order.
 * @throws {Error} If `n_samples` is not a positive number.
 */
function chunk(audio, n_samples) {
    // Guard the loop step: 0 or a negative value would never terminate, and
    // NaN/undefined would silently produce a single empty chunk.
    if (typeof n_samples !== "number" || !(n_samples > 0)) {
        throw new Error(`n_samples must be a positive number, but got ${n_samples}.`);
    }

    const chunks = [];
    for (let i = 0; i < audio.length; i += n_samples) {
        // Clamp the end index so the final chunk stops at the end of the audio.
        chunks.push(audio.subarray(i, Math.min(i + n_samples, audio.length)));
    }
    return chunks;
}
23+
1024
/**
1125
* Represents a VoxtralProcessor that extracts features from an audio input.
1226
*/
@@ -32,16 +46,36 @@ export class VoxtralProcessor extends Processor {
3246
if (!Array.isArray(audio)) {
3347
audio = [audio];
3448
}
35-
const num_audio_tokens = text.split(AUDIO_TOKEN).length - 1;
49+
const text_parts = text.split(AUDIO_TOKEN);
50+
const num_audio_tokens = text_parts.length - 1;
3651
if (num_audio_tokens !== audio.length) {
3752
throw new Error(`The number of audio inputs (${audio.length}) does not match the number of audio tokens in the text (${num_audio_tokens}).`);
3853
}
54+
55+
const n_samples = this.feature_extractor.config.n_samples;
56+
57+
// Split each audio input into chunks and keep track of chunk counts
58+
const audio_chunks = audio.map(a => chunk(a, n_samples));
59+
const chunk_counts = audio_chunks.map(chunks => chunks.length);
60+
61+
// Flatten all chunks for feature extraction
62+
const all_chunks = audio_chunks.flat();
3963
const features = (await Promise.all(
40-
audio.map((audio_input) => this.feature_extractor(audio_input, kwargs))
64+
all_chunks.map((audio_input) => this.feature_extractor(audio_input, kwargs))
4165
)).map(x => x.input_features);
66+
4267
audio_inputs["audio_values"] = features.length > 1 ? cat(features, 0) : features[0];
4368

44-
text = text.replaceAll(AUDIO_TOKEN, BEGIN_AUDIO_TOKEN + AUDIO_TOKEN.repeat(NUM_AUDIO_TOKENS));
69+
// Replace text tokens for each audio input, expanding for chunk count
70+
let new_text = text_parts[0];
71+
for (let i = 0; i < chunk_counts.length; ++i) {
72+
new_text += BEGIN_AUDIO_TOKEN;
73+
for (let j = 0; j < chunk_counts[i]; ++j) {
74+
new_text += AUDIO_TOKEN.repeat(NUM_AUDIO_TOKENS);
75+
}
76+
new_text += text_parts[i + 1];
77+
}
78+
text = new_text;
4579
}
4680

4781
const text_inputs = this.tokenizer(text, {

0 commit comments

Comments
 (0)