
Commit 489e31a

Merge pull request #57 from diffusionstudio/konstantin/fix/silence-detection
Konstantin/fix/silence detection
2 parents 4294a9f + d6d2c18 commit 489e31a

12 files changed (+170 / -291 lines)

package.json

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
   "name": "@diffusionstudio/core",
   "private": false,
-  "version": "1.4.3",
+  "version": "1.5.1",
   "type": "module",
   "description": "Build bleeding edge video processing applications",
   "files": [

playground/main.ts

Lines changed: 1 addition & 4 deletions
@@ -47,10 +47,7 @@ const audioSource = await core.AudioSource.from('/harvard.MP3');
 await audioTrack.add(
   await new core.AudioClip(audioSource)
 );
-await audioTrack.removeSilences({
-  minDuration: 300,
-  windowSize: 1,
-});
+await audioTrack.removeSilences();

 image.animate()
   .rotation(-16).to(14, 5).to(-7, 10).to(24, 7).to(-3, 9).to(19, 7).to(-14, 12).to(5, 9).to(-30, 13)

src/clips/media/index.ts

Lines changed: 1 addition & 0 deletions
@@ -7,3 +7,4 @@

 export * from './media';
 export * from './media.interfaces';
+export * from './media.types';

src/clips/media/media.ts

Lines changed: 66 additions & 0 deletions
@@ -16,6 +16,7 @@ import { Clip } from '../clip';
 import type { CaptionPresetStrategy, CaptionTrack } from '../../tracks';
 import type { float, frame } from '../../types';
 import type { MediaClipProps } from './media.interfaces';
+import type { SilenceRemoveOptions } from './media.types';

 export class MediaClip<Props extends MediaClipProps = MediaClipProps> extends Clip<MediaClipProps> {
   public source = new AudioSource();
@@ -313,4 +314,69 @@ export class MediaClip<Props extends MediaClipProps = MediaClipProps> extends Cl
   ) {
     return this.addCaptions(strategy);
   }
+
+  /**
+   * Remove silences from the clip
+   *
+   * @param options - Options for silence detection
+   */
+  public async removeSilences(options: SilenceRemoveOptions = {}): Promise<MediaClip<Props>[]> {
+    if (!['READY', 'ATTACHED'].includes(this.state)) {
+      await this.init();
+    }
+
+    const silences = (await this.source.silences(options))
+      .filter((silence) => inRange(silence, this.range))
+      .sort((a, b) => a.start.millis - b.start.millis);
+
+    if (silences.length == 0) {
+      return [this];
+    }
+
+    // default padding between clips
+    const padding = options.padding ?? 500;
+    const result: MediaClip<Props>[] = [this];
+
+    for (const silence of silences) {
+      const item = result.at(-1);
+
+      if (!item) break;
+      if (!inRange(silence, item.range)) continue;
+
+      // start with padding
+      const start = new Timestamp(
+        Math.min(silence.start.millis + padding, silence.stop.millis)
+      );
+
+      if (silence.start.millis > item.range[0].millis && silence.stop.millis < item.range[1].millis) {
+        const copy = item.copy();
+
+        item.range[1] = start;
+        copy.range[0] = silence.stop;
+
+        result.push(copy);
+      } else if (silence.start.millis <= item.range[0].millis) {
+        item.range[0] = silence.stop;
+      } else if (silence.stop.millis >= item.range[1].millis) {
+        item.range[1] = start;
+      }
+    }
+
+    return result;
+  }
+}
+
+function inRange(
+  silence: {
+    start: Timestamp;
+    stop: Timestamp;
+  },
+  range: [Timestamp, Timestamp],
+): boolean {
+  return (
+    (silence.start.millis >= range[0].millis &&
+      silence.start.millis <= range[1].millis) ||
+    (silence.stop.millis <= range[1].millis &&
+      silence.stop.millis >= range[0].millis)
+  )
 }
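Note on the splitting logic above: `removeSilences` always works on the last clip in `result`, trimming or splitting it around each silence, and the `Math.min(silence.start.millis + padding, silence.stop.millis)` clamp keeps the padded cut from running past the end of the silence. A minimal sketch of that arithmetic with plain millisecond numbers (illustration only, not library code; the values are made up):

```ts
// Plain numbers standing in for Timestamp.millis values.
type Slice = { start: number; stop: number };

// A clip spanning 0–10000 ms with one silence strictly inside it.
const range: [number, number] = [0, 10_000];
const silence: Slice = { start: 4_000, stop: 6_000 };
const padding = 500; // default used by removeSilences

// Padded cut point, clamped so it never passes the end of the silence.
const cut = Math.min(silence.start + padding, silence.stop); // 4500

// The silence lies strictly inside the range, so the clip splits in two:
const first: [number, number] = [range[0], cut];           // [0, 4500]
const second: [number, number] = [silence.stop, range[1]]; // [6000, 10000]

console.log(first, second);
```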

src/clips/media/media.types.ts

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+import { SilenceDetectionOptions } from "../../sources";
+
+export type SilenceRemoveOptions = {
+  /**
+   * Adds padding in milliseconds after each detected non-silent segment.
+   * This helps prevent cutting off audio too abruptly.
+   * @default 500
+   */
+  padding?: number;
+} & SilenceDetectionOptions;
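Because `SilenceRemoveOptions` is an intersection with `SilenceDetectionOptions`, one options object can carry both the padding and the detection parameters. A hedged usage sketch (the clip setup mirrors the playground snippet above; the option values are illustrative, not recommended defaults):

```ts
import * as core from '@diffusionstudio/core';

const source = await core.AudioSource.from('/harvard.MP3');
const clip = new core.AudioClip(source);

// padding comes from SilenceRemoveOptions; threshold/minDuration from SilenceDetectionOptions.
const clips = await clip.removeSilences({
  padding: 250,     // ms kept after each non-silent segment
  threshold: 0.01,  // RMS below this counts as silent
  minDuration: 400, // ignore silences shorter than 400 ms
});

console.log(`clip was split into ${clips.length} segment(s)`);
```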

src/sources/audio.fixtures.ts

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+export const MIN_SAMPLE_RATE = 3000;

src/sources/audio.spec.ts

Lines changed: 1 addition & 63 deletions
@@ -6,9 +6,7 @@
  */

 import { describe, it, vi, beforeEach, expect } from 'vitest';
-import { AudioSource } from './audio'; // Import the AudioSource class
-import { findSilences } from './audio.utils';
-import { Timestamp } from '../models';
+import { AudioSource } from './audio';

 // Mocking the OfflineAudioContext class
 class MockOfflineAudioContext {
@@ -27,30 +25,6 @@ class MockOfflineAudioContext {

 vi.stubGlobal('OfflineAudioContext', MockOfflineAudioContext); // Stub the global OfflineAudioContext

-describe('AudioUtils', () => {
-  it('all silent', () => {
-    const silences = findSilences(new Float32Array(100).fill(0), -50, 100, 100);
-    expect(silences).toEqual([{
-      start: new Timestamp(0),
-      stop: new Timestamp(100),
-    }]);
-  });
-
-  it('no silences', () => {
-    const silences = findSilences(new Float32Array(100).fill(1), -50, 100, 100);
-    expect(silences).toEqual([]);
-  });
-
-  it('find silences correctly', () => {
-    const samples = Array.from({ length: 500 }, (_, index) => index > 300 ? (index < 400 ? 0 : 1) : -1);
-    const silences = findSilences(new Float32Array(samples), -50, 100, 5000);
-    expect(silences).toEqual([{
-      start: new Timestamp(3010),
-      stop: new Timestamp(4000),
-    }]);
-  });
-});
-
 describe('AudioSource', () => {
   let audioSource: AudioSource;

@@ -59,42 +33,6 @@ describe('AudioSource', () => {
     audioSource.file = new File([], 'audio.mp3', { type: 'audio/mp3' });
   });

-  it('find silences correctly', async () => {
-    const audioBuffer = {
-      duration: 16,
-      sampleRate: 1000,
-      length: 16000,
-      getChannelData: () => new Float32Array(16000).fill(0), // Return a dummy Float32Array
-    } as any as AudioBuffer;
-    audioSource.audioBuffer = audioBuffer;
-    const silences = await audioSource.silences({});
-    expect(silences).toEqual([{
-      start: new Timestamp(0),
-      stop: new Timestamp(16000),
-    }]);
-  });
-
-  it('find silences correctly with too high minDuration', async () => {
-    const audioBuffer = {
-      duration: 16,
-      sampleRate: 1000,
-      length: 16000,
-      getChannelData: () => new Float32Array(16000).fill(0), // Return a dummy Float32Array
-    } as any as AudioBuffer;
-    audioSource.audioBuffer = audioBuffer;
-    const silences = await audioSource.silences({minDuration: 1e10});
-    expect(silences).toEqual([{
-      start: new Timestamp(0),
-      stop: new Timestamp(16000),
-    }]);
-  });
-
-  it('find silences correctly after caching', async () => {
-    const silences = await audioSource.silences({});
-    const cachedSilences = await audioSource.silences({threshold: 0, minDuration: 1e10, windowSize: 1e10});
-    expect(silences).toEqual(cachedSilences);
-  });
-
   it('should decode an audio buffer correctly', async () => {
     const buffer = await audioSource.decode(2, 44100, true);
     expect(buffer.duration).toBe(5); // Mock duration

src/sources/audio.ts

Lines changed: 15 additions & 18 deletions
@@ -6,14 +6,13 @@
  */

 import { Source } from './source';
+import { detectSilences } from './audio.utils';
+import { MIN_SAMPLE_RATE } from './audio.fixtures';

 import type { ClipType } from '../clips';
 import type { ArgumentTypes } from '../types';
-import type { FastSamplerOptions, SilenceOptions } from './audio.types';
+import type { AudioSlice, FastSamplerOptions, SilenceDetectionOptions } from './audio.types';
 import type { Timestamp, Transcript } from '../models';
-import { findSilences } from './audio.utils';
-
-const DEFAULT_SAMPLE_RATE = 3000;

 export class AudioSource<T extends Object = {}> extends Source<T> {
   public readonly type: ClipType = 'audio';
@@ -89,12 +88,12 @@ export class AudioSource<T extends Object = {}> extends Source<T> {
     if (typeof start === 'object') start = start.millis;
     if (typeof stop === 'object') stop = stop.millis;

-    const audioBuffer = this.audioBuffer ?? (await this.decode(1, DEFAULT_SAMPLE_RATE, true));
+    const audioBuffer = this.audioBuffer ?? (await this.decode(1, MIN_SAMPLE_RATE, true));
     const channelData = audioBuffer.getChannelData(0);

-    const firstSample = Math.floor(Math.max((start * DEFAULT_SAMPLE_RATE) / 1000, 0));
+    const firstSample = Math.floor(Math.max((start * MIN_SAMPLE_RATE) / 1000, 0));
     const lastSample = stop
-      ? Math.floor(Math.min((stop * DEFAULT_SAMPLE_RATE) / 1000, audioBuffer.length))
+      ? Math.floor(Math.min((stop * MIN_SAMPLE_RATE) / 1000, audioBuffer.length))
       : audioBuffer.length;

     const windowSize = Math.floor((lastSample - firstSample) / length);
@@ -137,20 +136,18 @@ export class AudioSource<T extends Object = {}> extends Source<T> {
   * @param options - Silences options.
   * @returns An array of the silences (in ms) in the clip.
   */
-  public async silences({
-    threshold = -50,
-    minDuration = 100,
-    windowSize = 50,
-  }: SilenceOptions = {}): Promise<{ start: Timestamp; stop: Timestamp }[]> {
+  public async silences(options: SilenceDetectionOptions = {}): Promise<AudioSlice[]> {
     if (this._silences) return this._silences;

-    const audioBuffer = this.audioBuffer ?? (await this.decode(1, DEFAULT_SAMPLE_RATE, true));
-    const length = Math.floor(audioBuffer.length / windowSize);
-    const samples = await this.fastsampler({ length, logarithmic: false });
+    const buffer = await this.arrayBuffer();
+
+    const ctx = new AudioContext();
+
+    const audioBuffer = await ctx.decodeAudioData(buffer);
+    this._silences = detectSilences(audioBuffer, options);

-    const silences = findSilences(samples, threshold, minDuration, this.duration.millis);
-    this._silences = silences;
+    ctx.close();

-    return silences;
+    return this._silences;
   }
 }
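The new `detectSilences` helper from `audio.utils` is not part of this diff. Going by the `SilenceDetectionOptions` documentation (RMS threshold, `hopSize`, `minDuration`), it presumably walks the decoded buffer in hops, flags hops whose RMS falls below the threshold, and keeps only silent runs longer than `minDuration`. A rough sketch under those assumptions, returning plain millisecond values rather than the real `AudioSlice` of `Timestamp`s (not the committed implementation):

```ts
// Assumption-based sketch; the actual detectSilences in audio.utils may differ.
type Options = { threshold?: number; hopSize?: number; minDuration?: number };

function sketchDetectSilences(buffer: AudioBuffer, options: Options = {}) {
  const { threshold = 0.02, hopSize = 1024, minDuration = 500 } = options;
  const data = buffer.getChannelData(0);
  const msPerSample = 1000 / buffer.sampleRate;
  const silences: { start: number; stop: number }[] = [];

  let silentSince: number | null = null;

  for (let i = 0; i < data.length; i += hopSize) {
    // Root-mean-square energy of the current hop.
    let sum = 0;
    const end = Math.min(i + hopSize, data.length);
    for (let j = i; j < end; j++) sum += data[j] * data[j];
    const rms = Math.sqrt(sum / (end - i));

    if (rms < threshold) {
      silentSince ??= i;
    } else if (silentSince !== null) {
      pushIfLongEnough(silentSince, i);
      silentSince = null;
    }
  }
  if (silentSince !== null) pushIfLongEnough(silentSince, data.length);

  return silences;

  function pushIfLongEnough(startSample: number, stopSample: number) {
    const start = startSample * msPerSample;
    const stop = stopSample * msPerSample;
    // Only keep runs that last at least minDuration milliseconds.
    if (stop - start >= minDuration) silences.push({ start, stop });
  }
}
```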

src/sources/audio.types.ts

Lines changed: 15 additions & 7 deletions
@@ -22,17 +22,25 @@ export type FastSamplerOptions = {
   logarithmic?: boolean;
 };

-export type SilenceOptions = {
+export type SilenceDetectionOptions = {
   /**
-   * The threshold to use for the silence detection in db.
-   */
+   * If the RMS is below the threshold, the frame is considered silent.
+   * @default 0.02
+   */
   threshold?: number;
   /**
-   * The minimum duration of a silence to be considered a silence in milliseconds.
+   * This parameter affects how accurately the algorithm captures short silences.
+   * @default 1024
    */
-  minDuration?: number;
+  hopSize?: number;
   /**
-   * The window size to use for the silence detection.
+   * Setting a minimum duration in **milliseconds** for a silence period helps avoid detecting brief gaps between sounds as silences.
+   * @default 500
    */
-  windowSize?: number;
+  minDuration?: number;
+};
+
+export type AudioSlice = {
+  start: Timestamp;
+  stop: Timestamp;
 };
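One way to read the new defaults: with a `hopSize` of 1024 samples, the detector's time resolution depends on the decoded sample rate, while `minDuration` filters in milliseconds independently of it. A quick back-of-the-envelope check (48 kHz is an assumed sample rate, used only for the arithmetic):

```ts
const sampleRate = 48_000; // assumed; the decoded buffer's actual rate applies
const hopSize = 1024;      // default from SilenceDetectionOptions
const minDuration = 500;   // ms, default

const msPerHop = (hopSize / sampleRate) * 1000;             // ≈ 21.3 ms per hop
const hopsForSilence = Math.ceil(minDuration / msPerHop);   // ≈ 24 consecutive silent hops

console.log({ msPerHop, hopsForSilence });
```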
