Skip to content

Commit 7b7dc3a

Browse files
ronw authored and copybara-github committed
Add TED-LIUM (release 1) speech recognition dataset.
PiperOrigin-RevId: 303146158
1 parent a27e334 commit 7b7dc3a

File tree

12 files changed

+207
-0
lines changed

12 files changed

+207
-0
lines changed

tensorflow_datasets/audio/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,4 @@
2525
from tensorflow_datasets.audio.nsynth import Nsynth
2626
from tensorflow_datasets.audio.savee import Savee
2727
from tensorflow_datasets.audio.speech_commands import SpeechCommands
28+
from tensorflow_datasets.audio.tedlium import Tedlium

tensorflow_datasets/audio/tedlium.py

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Lint as: python3
17+
"""TED-LIUM speech recognition dataset."""
18+
19+
from __future__ import absolute_import
20+
from __future__ import division
21+
from __future__ import print_function
22+
23+
import os
24+
import re
25+
import numpy as np
26+
27+
import tensorflow.compat.v2 as tf
28+
29+
import tensorflow_datasets.public_api as tfds
30+
31+
_DESCRIPTION = """\
32+
The TED-LIUM corpus is English-language TED talks, with transcriptions, sampled
33+
at 16kHz. It contains about 118 hours of speech.
34+
35+
This is the TED-LIUM corpus release 1,
36+
licensed under Creative Commons BY-NC-ND 3.0
37+
(http://creativecommons.org/licenses/by-nc-nd/3.0/deed.en).
38+
"""
39+
40+
_CITATION = """\
41+
@inproceedings{rousseau2012tedlium,
42+
title={TED-LIUM: an Automatic Speech Recognition dedicated corpus.},
43+
author={Rousseau, Anthony and Del{\\'e}glise, Paul and Est{\\`e}ve, Yannick},
44+
booktitle={Conference on Language Resources and Evaluation (LREC)},
45+
pages={125--129},
46+
year={2012}
47+
}
48+
"""
49+
50+
_URL = "https://www.openslr.org/7/"
51+
_DL_URL = "http://www.openslr.org/resources/7/TEDLIUM_release1.tar.gz"
52+
53+
54+
class Tedlium(tfds.core.BeamBasedBuilder):
  """TED-LIUM dataset release 1.

  Examples are produced with Apache Beam: every `stm` transcript file found in
  a split directory is expanded into one example per transcript line.
  """

  VERSION = tfds.core.Version("1.0.0")

  def _info(self):
    """Returns the `DatasetInfo` (features, supervised keys, metadata)."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            "speech": tfds.features.Audio(),
            "text": tfds.features.Text(),
            "speaker_id": tf.string,
            "gender":
                tfds.features.ClassLabel(names=["unknown", "female", "male"]),
            "id": tf.string,
        }),
        supervised_keys=("speech", "text"),
        # Use the module-level constant instead of repeating the URL literal.
        homepage=_URL,
        citation=_CITATION,
        metadata=tfds.core.MetadataDict(sample_rate=16000,),
    )

  def _split_generators(self, dl_manager):
    """Downloads and extracts the archive, returning one generator per split.

    Args:
      dl_manager: download manager used to fetch and extract `_DL_URL`.

    Returns:
      A list of `SplitGenerator`s for the train, validation and test splits.
      Each one passes the split's directory to `_build_pcollection` through
      the `directory` keyword argument.
    """
    extracted_dir = dl_manager.download_and_extract(_DL_URL)
    base_dir = os.path.join(extracted_dir, "TEDLIUM_release1")
    # The archive names its validation directory "dev".
    split_dirs = [
        (tfds.Split.TRAIN, "train"),
        (tfds.Split.VALIDATION, "dev"),
        (tfds.Split.TEST, "test"),
    ]
    return [
        tfds.core.SplitGenerator(
            name=split,
            gen_kwargs={"directory": os.path.join(base_dir, dir_name)})
        for split, dir_name in split_dirs
    ]

  def _build_pcollection(self, pipeline, directory):
    """Builds the Beam PCollection of `(key, example)` pairs for one split.

    Args:
      pipeline: Beam pipeline the transforms are attached to.
      directory: split directory; its `stm` subdirectory is globbed for
        transcript files.

    Returns:
      The PCollection obtained by flat-mapping
      `_generate_examples_from_stm_file` over the transcript file paths.
    """
    beam = tfds.core.lazy_imports.apache_beam
    stm_files = tf.io.gfile.glob(os.path.join(directory, "stm", "*stm"))
    return (pipeline
            | beam.Create(stm_files)
            | beam.FlatMap(_generate_examples_from_stm_file))
98+
99+
100+
def _generate_examples_from_stm_file(stm_path):
  """Generate examples from a TED-LIUM stm file.

  Every transcript line holds seven space-separated fields: file name,
  channel, speaker, start time, end time, label and transcript (the transcript
  itself may contain spaces). The audio is read from the `sph` directory that
  sits next to the directory containing the stm file.

  Args:
    stm_path: path to the stm transcript file.

  Yields:
    `(key, example)` tuples, one per transcript line. `key` joins the speaker,
    start time, end time and label with "-" and is also stored as the
    example's "id".
  """
  stm_dir = os.path.dirname(stm_path)
  sph_dir = os.path.join(os.path.dirname(stm_dir), "sph")
  with tf.io.gfile.GFile(stm_path) as f:
    for line in f:
      line = line.strip()
      # Blank lines and STM comment lines (";;") carry no segment; skip them
      # rather than failing on the seven-way unpack below.
      if not line or line.startswith(";;"):
        continue
      fn, channel, speaker, start, end, label, transcript = line.split(" ", 6)
      transcript = _maybe_trim_suffix(transcript)

      audio_file = "%s.sph" % fn
      samples = _extract_audio_segment(
          os.path.join(sph_dir, audio_file), int(channel), float(start),
          float(end))

      # start/end are kept as the original strings so the key matches the
      # transcript file verbatim.
      key = "-".join([speaker, start, end, label])
      example = {
          "speech": samples,
          "text": transcript,
          "speaker_id": speaker,
          "gender": _parse_gender(label),
          "id": key,
      }
      yield key, example
124+
125+
126+
def _maybe_trim_suffix(transcript):
127+
# stm files for the train split contain a key (enclosed in parens) at the end.
128+
splits = transcript.rsplit(" ", 1)
129+
transcript = splits[0]
130+
if len(splits) > 1:
131+
suffix = splits[-1]
132+
if not suffix.startswith("("):
133+
transcript += " " + suffix
134+
return transcript
135+
136+
137+
def _parse_gender(label_str):
138+
gender = re.split(",|_", label_str)[-1][:-1]
139+
# Fix inconsistencies in the data.
140+
if not gender:
141+
gender = -1 # Missing label.
142+
elif gender == "F":
143+
gender = "female"
144+
elif gender == "M":
145+
gender = "male"
146+
return gender
147+
148+
149+
def _extract_audio_segment(sph_path, channel, start_sec, end_sec):
  """Extracts segment of audio samples (as an ndarray) from the given path.

  Args:
    sph_path: path to the audio file, decoded with format "nistsphere".
    channel: channel number from the transcript; must be 1.
    start_sec: segment start, in seconds.
    end_sec: segment end, in seconds.

  Returns:
    ndarray holding the samples between the start and end offsets.
  """
  with tf.io.gfile.GFile(sph_path, "rb") as sph_file:
    audio = tfds.core.lazy_imports.pydub.AudioSegment.from_file(
        sph_file, format="nistsphere")
  # The dataset only contains mono audio.
  assert audio.channels == 1
  assert channel == 1
  # The segment is sliced using integer millisecond offsets.
  clip = audio[int(start_sec * 1000):int(end_sec * 1000)]
  return np.array(clip.get_array_of_samples())
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Lint as: python3
17+
"""Tests for tedlium dataset module."""
18+
19+
from __future__ import absolute_import
20+
from __future__ import division
21+
from __future__ import print_function
22+
23+
from tensorflow_datasets import testing
24+
from tensorflow_datasets.audio import tedlium
25+
import tensorflow_datasets.public_api as tfds
26+
27+
28+
class TedliumTest(testing.DatasetBuilderTestCase):
  """Builder test case for `tedlium.Tedlium`."""

  DATASET_CLASS = tedlium.Tedlium
  # Expected number of examples in each split.
  SPLITS = {
      tfds.Split.TRAIN: 4,
      tfds.Split.VALIDATION: 1,
      tfds.Split.TEST: 1,
  }


if __name__ == "__main__":
  testing.test_main()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Hulk_2000 1 Hulk_2000 0.10 0.7 <o,f0,male> go do you hear
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Groot_2001 1 Groot_2001 0.00 2.30 <o,f0,unknown> he began a confused complaint against the wizard who had vanished behind the curtain on the left
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Denny_2003 1 Denny_2003 0.01 1.00 <o,f0,male> go do you hear (suffix)
2+
Denny_2003 1 Denny_2003 1.05 1.75 <o,f0,male> {noise} maybe (suffix2)
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Sean_2004 1 Sean_2004 0.1 3.34 <F0_M> forgotten too the name of Gillian the lovely captive (suffix)
2+
Sean_2004 1 Sean_2004 3.40 4.01 <F0_M> some transcript text (another_suffix)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
http://www.openslr.org/resources/7/TEDLIUM_release1.tar.gz 21285615015 30301975fd8c5cac4040c261c0852f57cfa8adbbad2ce78e77e4986957445f27

0 commit comments

Comments
 (0)