@@ -7,6 +7,20 @@ const AUDIO_TOKEN = "[AUDIO]";
7
7
// Marker the processor writes once ahead of each audio input's run of AUDIO_TOKEN placeholders.
const BEGIN_AUDIO_TOKEN = "[BEGIN_AUDIO]" ;
8
8
// Number of AUDIO_TOKEN placeholders the processor writes for each audio chunk.
const NUM_AUDIO_TOKENS = 375 ;
9
9
10
/**
 * Split audio into consecutive, non-overlapping chunks of at most `n_samples` samples.
 *
 * Chunks are created with `subarray`, so each one is a view onto the same
 * underlying buffer as `audio` (no samples are copied). Every chunk holds
 * exactly `n_samples` samples except possibly the last one, which holds the
 * remainder. An empty `audio` yields an empty array.
 *
 * @param {Float32Array} audio The samples to split.
 * @param {number} n_samples Maximum number of samples per chunk. Must be a positive integer.
 * @returns {Float32Array[]} The chunks, in their original order.
 * @throws {RangeError} If `n_samples` is not a positive integer.
 */
function chunk(audio, n_samples) {
    // A zero or negative step would never advance the loop below (unbounded
    // memory growth), and NaN/fractional steps would silently yield wrong chunks.
    if (!Number.isInteger(n_samples) || n_samples <= 0) {
        throw new RangeError(`n_samples must be a positive integer, got ${n_samples}.`);
    }

    const chunks = [];
    for (let start = 0; start < audio.length; start += n_samples) {
        // `subarray` clamps `end` to the array length, so the final chunk is simply shorter.
        chunks.push(audio.subarray(start, start + n_samples));
    }
    return chunks;
}
23
+
10
24
/**
11
25
* Represents a VoxtralProcessor that extracts features from an audio input.
12
26
*/
@@ -32,16 +46,36 @@ export class VoxtralProcessor extends Processor {
32
46
if ( ! Array . isArray ( audio ) ) {
33
47
audio = [ audio ] ;
34
48
}
35
- const num_audio_tokens = text . split ( AUDIO_TOKEN ) . length - 1 ;
49
+ const text_parts = text . split ( AUDIO_TOKEN ) ;
50
+ const num_audio_tokens = text_parts . length - 1 ;
36
51
if ( num_audio_tokens !== audio . length ) {
37
52
throw new Error ( `The number of audio inputs (${ audio . length } ) does not match the number of audio tokens in the text (${ num_audio_tokens } ).` ) ;
38
53
}
54
+
55
+ const n_samples = this . feature_extractor . config . n_samples ;
56
+
57
+ // Split each audio input into chunks and keep track of chunk counts
58
+ const audio_chunks = audio . map ( a => chunk ( a , n_samples ) ) ;
59
+ const chunk_counts = audio_chunks . map ( chunks => chunks . length ) ;
60
+
61
+ // Flatten all chunks for feature extraction
62
+ const all_chunks = audio_chunks . flat ( ) ;
39
63
const features = ( await Promise . all (
40
- audio . map ( ( audio_input ) => this . feature_extractor ( audio_input , kwargs ) )
64
+ all_chunks . map ( ( audio_input ) => this . feature_extractor ( audio_input , kwargs ) )
41
65
) ) . map ( x => x . input_features ) ;
66
+
42
67
audio_inputs [ "audio_values" ] = features . length > 1 ? cat ( features , 0 ) : features [ 0 ] ;
43
68
44
- text = text . replaceAll ( AUDIO_TOKEN , BEGIN_AUDIO_TOKEN + AUDIO_TOKEN . repeat ( NUM_AUDIO_TOKENS ) ) ;
69
+ // Replace text tokens for each audio input, expanding for chunk count
70
+ let new_text = text_parts [ 0 ] ;
71
+ for ( let i = 0 ; i < chunk_counts . length ; ++ i ) {
72
+ new_text += BEGIN_AUDIO_TOKEN ;
73
+ for ( let j = 0 ; j < chunk_counts [ i ] ; ++ j ) {
74
+ new_text += AUDIO_TOKEN . repeat ( NUM_AUDIO_TOKENS ) ;
75
+ }
76
+ new_text += text_parts [ i + 1 ] ;
77
+ }
78
+ text = new_text ;
45
79
}
46
80
47
81
const text_inputs = this . tokenizer ( text , {
0 commit comments