
Commit 489e31a

Merge pull request #57 from diffusionstudio/konstantin/fix/silence-detection
Konstantin/fix/silence detection
2 parents 4294a9f + d6d2c18 commit 489e31a

12 files changed (+170 / -291 lines)

package.json

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
   "name": "@diffusionstudio/core",
   "private": false,
-  "version": "1.4.3",
+  "version": "1.5.1",
   "type": "module",
   "description": "Build bleeding edge video processing applications",
   "files": [

playground/main.ts

Lines changed: 1 addition & 4 deletions
@@ -47,10 +47,7 @@ const audioSource = await core.AudioSource.from('/harvard.MP3');
 await audioTrack.add(
   await new core.AudioClip(audioSource)
 );
-await audioTrack.removeSilences({
-  minDuration: 300,
-  windowSize: 1,
-});
+await audioTrack.removeSilences();

 image.animate()
   .rotation(-16).to(14, 5).to(-7, 10).to(24, 7).to(-3, 9).to(19, 7).to(-14, 12).to(5, 9).to(-30, 13)

src/clips/media/index.ts

Lines changed: 1 addition & 0 deletions
@@ -7,3 +7,4 @@

 export * from './media';
 export * from './media.interfaces';
+export * from './media.types';

src/clips/media/media.ts

Lines changed: 66 additions & 0 deletions
@@ -16,6 +16,7 @@ import { Clip } from '../clip';
 import type { CaptionPresetStrategy, CaptionTrack } from '../../tracks';
 import type { float, frame } from '../../types';
 import type { MediaClipProps } from './media.interfaces';
+import type { SilenceRemoveOptions } from './media.types';

 export class MediaClip<Props extends MediaClipProps = MediaClipProps> extends Clip<MediaClipProps> {
   public source = new AudioSource();
@@ -313,4 +314,69 @@ export class MediaClip<Props extends MediaClipProps = MediaClipProps> extends Cl
   ) {
     return this.addCaptions(strategy);
   }
+
+  /**
+   * Remove silences from the clip
+   *
+   * @param options - Options for silence detection
+   */
+  public async removeSilences(options: SilenceRemoveOptions = {}): Promise<MediaClip<Props>[]> {
+    if (!['READY', 'ATTACHED'].includes(this.state)) {
+      await this.init();
+    }
+
+    const silences = (await this.source.silences(options))
+      .filter((silence) => inRange(silence, this.range))
+      .sort((a, b) => a.start.millis - b.start.millis);
+
+    if (silences.length == 0) {
+      return [this];
+    }
+
+    // default padding between clips
+    const padding = options.padding ?? 500;
+    const result: MediaClip<Props>[] = [this];
+
+    for (const silence of silences) {
+      const item = result.at(-1);
+
+      if (!item) break;
+      if (!inRange(silence, item.range)) continue;
+
+      // start with padding
+      const start = new Timestamp(
+        Math.min(silence.start.millis + padding, silence.stop.millis)
+      );
+
+      if (silence.start.millis > item.range[0].millis && silence.stop.millis < item.range[1].millis) {
+        const copy = item.copy();
+
+        item.range[1] = start;
+        copy.range[0] = silence.stop;
+
+        result.push(copy);
+      } else if (silence.start.millis <= item.range[0].millis) {
+        item.range[0] = silence.stop;
+      } else if (silence.stop.millis >= item.range[1].millis) {
+        item.range[1] = start;
+      }
+    }
+
+    return result;
+  }
+}
+
+function inRange(
+  silence: {
+    start: Timestamp;
+    stop: Timestamp;
+  },
+  range: [Timestamp, Timestamp],
+): boolean {
+  return (
+    (silence.start.millis >= range[0].millis &&
+      silence.start.millis <= range[1].millis) ||
+    (silence.stop.millis <= range[1].millis &&
+      silence.stop.millis >= range[0].millis)
+  )
 }
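Note on the splitting logic above: `removeSilences` always works on the last clip in `result`, trimming or splitting it around each silence, and the `Math.min(silence.start.millis + padding, silence.stop.millis)` clamp keeps the padded cut from running past the end of the silence. A minimal sketch of that arithmetic with plain millisecond numbers (illustration only, not library code; the values are made up):

```ts
// Plain numbers standing in for Timestamp.millis values.
type Slice = { start: number; stop: number };

// A clip spanning 0–10000 ms with one silence strictly inside it.
const range: [number, number] = [0, 10_000];
const silence: Slice = { start: 4_000, stop: 6_000 };
const padding = 500; // default used by removeSilences

// Padded cut point, clamped so it never passes the end of the silence.
const cut = Math.min(silence.start + padding, silence.stop); // 4500

// The silence lies strictly inside the range, so the clip splits in two:
const first: [number, number] = [range[0], cut];           // [0, 4500]
const second: [number, number] = [silence.stop, range[1]]; // [6000, 10000]

console.log(first, second);
```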

src/clips/media/media.types.ts

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+import { SilenceDetectionOptions } from "../../sources";
+
+export type SilenceRemoveOptions = {
+  /**
+   * Adds padding in milliseconds after each detected non-silent segment.
+   * This helps prevent cutting off audio too abruptly.
+   * @default 500
+   */
+  padding?: number;
+} & SilenceDetectionOptions;
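Because `SilenceRemoveOptions` is an intersection with `SilenceDetectionOptions`, one options object can carry both the padding and the detection parameters. A hedged usage sketch (the clip setup mirrors the playground snippet above; the option values are illustrative, not recommended defaults):

```ts
import * as core from '@diffusionstudio/core';

const source = await core.AudioSource.from('/harvard.MP3');
const clip = new core.AudioClip(source);

// padding comes from SilenceRemoveOptions; threshold/minDuration from SilenceDetectionOptions.
const clips = await clip.removeSilences({
  padding: 250,     // ms kept after each non-silent segment
  threshold: 0.01,  // RMS below this counts as silent
  minDuration: 400, // ignore silences shorter than 400 ms
});

console.log(`clip was split into ${clips.length} segment(s)`);
```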

src/sources/audio.fixtures.ts

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+export const MIN_SAMPLE_RATE = 3000;

src/sources/audio.spec.ts

Lines changed: 1 addition & 63 deletions
@@ -6,9 +6,7 @@
  */

 import { describe, it, vi, beforeEach, expect } from 'vitest';
-import { AudioSource } from './audio'; // Import the AudioSource class
-import { findSilences } from './audio.utils';
-import { Timestamp } from '../models';
+import { AudioSource } from './audio';

 // Mocking the OfflineAudioContext class
 class MockOfflineAudioContext {
@@ -27,30 +25,6 @@ class MockOfflineAudioContext {

 vi.stubGlobal('OfflineAudioContext', MockOfflineAudioContext); // Stub the global OfflineAudioContext

-describe('AudioUtils', () => {
-  it('all silent', () => {
-    const silences = findSilences(new Float32Array(100).fill(0), -50, 100, 100);
-    expect(silences).toEqual([{
-      start: new Timestamp(0),
-      stop: new Timestamp(100),
-    }]);
-  });
-
-  it('no silences', () => {
-    const silences = findSilences(new Float32Array(100).fill(1), -50, 100, 100);
-    expect(silences).toEqual([]);
-  });
-
-  it('find silences correctly', () => {
-    const samples = Array.from({ length: 500 }, (_, index) => index > 300 ? (index < 400 ? 0 : 1) : -1);
-    const silences = findSilences(new Float32Array(samples), -50, 100, 5000);
-    expect(silences).toEqual([{
-      start: new Timestamp(3010),
-      stop: new Timestamp(4000),
-    }]);
-  });
-});
-
 describe('AudioSource', () => {
   let audioSource: AudioSource;

@@ -59,42 +33,6 @@ describe('AudioSource', () => {
     audioSource.file = new File([], 'audio.mp3', { type: 'audio/mp3' });
   });

-  it('find silences correctly', async () => {
-    const audioBuffer = {
-      duration: 16,
-      sampleRate: 1000,
-      length: 16000,
-      getChannelData: () => new Float32Array(16000).fill(0), // Return a dummy Float32Array
-    } as any as AudioBuffer;
-    audioSource.audioBuffer = audioBuffer;
-    const silences = await audioSource.silences({});
-    expect(silences).toEqual([{
-      start: new Timestamp(0),
-      stop: new Timestamp(16000),
-    }]);
-  });
-
-  it('find silences correctly with too high minDuration', async () => {
-    const audioBuffer = {
-      duration: 16,
-      sampleRate: 1000,
-      length: 16000,
-      getChannelData: () => new Float32Array(16000).fill(0), // Return a dummy Float32Array
-    } as any as AudioBuffer;
-    audioSource.audioBuffer = audioBuffer;
-    const silences = await audioSource.silences({minDuration: 1e10});
-    expect(silences).toEqual([{
-      start: new Timestamp(0),
-      stop: new Timestamp(16000),
-    }]);
-  });
-
-  it('find silences correctly after caching', async () => {
-    const silences = await audioSource.silences({});
-    const cachedSilences = await audioSource.silences({threshold: 0, minDuration: 1e10, windowSize: 1e10});
-    expect(silences).toEqual(cachedSilences);
-  });
-
   it('should decode an audio buffer correctly', async () => {
     const buffer = await audioSource.decode(2, 44100, true);
     expect(buffer.duration).toBe(5); // Mock duration

src/sources/audio.ts

Lines changed: 15 additions & 18 deletions
@@ -6,14 +6,13 @@
  */

 import { Source } from './source';
+import { detectSilences } from './audio.utils';
+import { MIN_SAMPLE_RATE } from './audio.fixtures';

 import type { ClipType } from '../clips';
 import type { ArgumentTypes } from '../types';
-import type { FastSamplerOptions, SilenceOptions } from './audio.types';
+import type { AudioSlice, FastSamplerOptions, SilenceDetectionOptions } from './audio.types';
 import type { Timestamp, Transcript } from '../models';
-import { findSilences } from './audio.utils';
-
-const DEFAULT_SAMPLE_RATE = 3000;

 export class AudioSource<T extends Object = {}> extends Source<T> {
   public readonly type: ClipType = 'audio';
@@ -89,12 +88,12 @@ export class AudioSource<T extends Object = {}> extends Source<T> {
     if (typeof start === 'object') start = start.millis;
     if (typeof stop === 'object') stop = stop.millis;

-    const audioBuffer = this.audioBuffer ?? (await this.decode(1, DEFAULT_SAMPLE_RATE, true));
+    const audioBuffer = this.audioBuffer ?? (await this.decode(1, MIN_SAMPLE_RATE, true));
     const channelData = audioBuffer.getChannelData(0);

-    const firstSample = Math.floor(Math.max((start * DEFAULT_SAMPLE_RATE) / 1000, 0));
+    const firstSample = Math.floor(Math.max((start * MIN_SAMPLE_RATE) / 1000, 0));
     const lastSample = stop
-      ? Math.floor(Math.min((stop * DEFAULT_SAMPLE_RATE) / 1000, audioBuffer.length))
+      ? Math.floor(Math.min((stop * MIN_SAMPLE_RATE) / 1000, audioBuffer.length))
       : audioBuffer.length;

     const windowSize = Math.floor((lastSample - firstSample) / length);
@@ -137,20 +136,18 @@ export class AudioSource<T extends Object = {}> extends Source<T> {
   * @param options - Silences options.
   * @returns An array of the silences (in ms) in the clip.
   */
-  public async silences({
-    threshold = -50,
-    minDuration = 100,
-    windowSize = 50,
-  }: SilenceOptions = {}): Promise<{ start: Timestamp; stop: Timestamp }[]> {
+  public async silences(options: SilenceDetectionOptions = {}): Promise<AudioSlice[]> {
     if (this._silences) return this._silences;

-    const audioBuffer = this.audioBuffer ?? (await this.decode(1, DEFAULT_SAMPLE_RATE, true));
-    const length = Math.floor(audioBuffer.length / windowSize);
-    const samples = await this.fastsampler({ length, logarithmic: false });
+    const buffer = await this.arrayBuffer();
+
+    const ctx = new AudioContext();
+
+    const audioBuffer = await ctx.decodeAudioData(buffer);
+    this._silences = detectSilences(audioBuffer, options);

-    const silences = findSilences(samples, threshold, minDuration, this.duration.millis);
-    this._silences = silences;
+    ctx.close();

-    return silences;
+    return this._silences;
   }
 }
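The new `detectSilences` helper from `audio.utils` is not part of this diff. Going by the `SilenceDetectionOptions` documentation (RMS threshold, `hopSize`, `minDuration`), it presumably walks the decoded buffer in hops, flags hops whose RMS falls below the threshold, and keeps only silent runs longer than `minDuration`. A rough sketch under those assumptions, returning plain millisecond values rather than the real `AudioSlice` of `Timestamp`s (not the committed implementation):

```ts
// Assumption-based sketch; the actual detectSilences in audio.utils may differ.
type Options = { threshold?: number; hopSize?: number; minDuration?: number };

function sketchDetectSilences(buffer: AudioBuffer, options: Options = {}) {
  const { threshold = 0.02, hopSize = 1024, minDuration = 500 } = options;
  const data = buffer.getChannelData(0);
  const msPerSample = 1000 / buffer.sampleRate;
  const silences: { start: number; stop: number }[] = [];

  let silentSince: number | null = null;

  for (let i = 0; i < data.length; i += hopSize) {
    // Root-mean-square energy of the current hop.
    let sum = 0;
    const end = Math.min(i + hopSize, data.length);
    for (let j = i; j < end; j++) sum += data[j] * data[j];
    const rms = Math.sqrt(sum / (end - i));

    if (rms < threshold) {
      silentSince ??= i;
    } else if (silentSince !== null) {
      pushIfLongEnough(silentSince, i);
      silentSince = null;
    }
  }
  if (silentSince !== null) pushIfLongEnough(silentSince, data.length);

  return silences;

  function pushIfLongEnough(startSample: number, stopSample: number) {
    const start = startSample * msPerSample;
    const stop = stopSample * msPerSample;
    // Only keep runs that last at least minDuration milliseconds.
    if (stop - start >= minDuration) silences.push({ start, stop });
  }
}
```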

src/sources/audio.types.ts

Lines changed: 15 additions & 7 deletions
@@ -22,17 +22,25 @@ export type FastSamplerOptions = {
   logarithmic?: boolean;
 };

-export type SilenceOptions = {
+export type SilenceDetectionOptions = {
   /**
-   * The threshold to use for the silence detection in db.
-   */
+   * If the RMS is below the threshold, the frame is considered silent.
+   * @default 0.02
+   */
   threshold?: number;
   /**
-   * The minimum duration of a silence to be considered a silence in milliseconds.
+   * This parameter affects how accurately the algorithm captures short silences.
+   * @default 1024
    */
-  minDuration?: number;
+  hopSize?: number;
   /**
-   * The window size to use for the silence detection.
+   * Setting a minimum duration in **milliseconds** for a silence period helps avoid detecting brief gaps between sounds as silences.
+   * @default 500
    */
-  windowSize?: number;
+  minDuration?: number;
+};
+
+export type AudioSlice = {
+  start: Timestamp;
+  stop: Timestamp;
 };
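One way to read the new defaults: with a `hopSize` of 1024 samples, the detector's time resolution depends on the decoded sample rate, while `minDuration` filters in milliseconds independently of it. A quick back-of-the-envelope check (48 kHz is an assumed sample rate, used only for the arithmetic):

```ts
const sampleRate = 48_000; // assumed; the decoded buffer's actual rate applies
const hopSize = 1024;      // default from SilenceDetectionOptions
const minDuration = 500;   // ms, default

const msPerHop = (hopSize / sampleRate) * 1000;             // ≈ 21.3 ms per hop
const hopsForSilence = Math.ceil(minDuration / msPerHop);   // ≈ 24 consecutive silent hops

console.log({ msPerHop, hopsForSilence });
```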
