Add TED-LIUM (release 1) speech recognition dataset.

ronw · copybara-github · commit 7b7dc3af474d · 2020-03-26T10:50:10.000-07:00
PiperOrigin-RevId: 303146158
diff --git a/tensorflow_datasets/audio/__init__.py b/tensorflow_datasets/audio/__init__.py
@@ -25,3 +25,4 @@
 from tensorflow_datasets.audio.nsynth import Nsynth
 from tensorflow_datasets.audio.savee import Savee
 from tensorflow_datasets.audio.speech_commands import SpeechCommands
+from tensorflow_datasets.audio.tedlium import Tedlium
diff --git a/tensorflow_datasets/audio/tedlium.py b/tensorflow_datasets/audio/tedlium.py
@@ -0,0 +1,161 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""TED-LIUM speech recognition dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import re
+import numpy as np
+
+import tensorflow.compat.v2 as tf
+
+import tensorflow_datasets.public_api as tfds
+
+_DESCRIPTION = """\
+The TED-LIUM corpus is English-language TED talks, with transcriptions, sampled
+at 16kHz. It contains about 118 hours of speech.
+
+This is the TED-LIUM corpus release 1,
+licensed under Creative Commons BY-NC-ND 3.0
+(http://creativecommons.org/licenses/by-nc-nd/3.0/deed.en).
+"""
+
+_CITATION = """\
+@inproceedings{rousseau2012tedlium,
+  title={TED-LIUM: an Automatic Speech Recognition dedicated corpus.},
+  author={Rousseau, Anthony and Del{\\'e}glise, Paul and Est{\\`e}ve, Yannick},
+  booktitle={Conference on Language Resources and Evaluation (LREC)},
+  pages={125--129},
+  year={2012}
+}
+"""
+
+_URL = "https://www.openslr.org/7/"  # Dataset homepage on OpenSLR.
+_DL_URL = "http://www.openslr.org/resources/7/TEDLIUM_release1.tar.gz"
+
+
+class Tedlium(tfds.core.BeamBasedBuilder):
+  """Builder for release 1 of the TED-LIUM speech recognition corpus."""
+
+  VERSION = tfds.core.Version("1.0.0")
+
+  def _info(self):
+    """Returns the `DatasetInfo`: feature spec, homepage and citation."""
+    features = tfds.features.FeaturesDict({
+        "speech": tfds.features.Audio(),
+        "text": tfds.features.Text(),
+        "speaker_id": tf.string,
+        "gender": tfds.features.ClassLabel(names=["unknown", "female", "male"]),
+        "id": tf.string,
+    })
+    return tfds.core.DatasetInfo(
+        builder=self,
+        description=_DESCRIPTION,
+        features=features,
+        supervised_keys=("speech", "text"),
+        homepage=_URL,
+        citation=_CITATION,
+        metadata=tfds.core.MetadataDict(sample_rate=16000,),
+    )
+
+  def _split_generators(self, dl_manager):
+    """Downloads the archive; maps train/dev/test dirs to the three splits."""
+    extracted_dir = dl_manager.download_and_extract(_DL_URL)
+    base_dir = os.path.join(extracted_dir, "TEDLIUM_release1")
+    split_dirs = [(tfds.Split.TRAIN, "train"), (tfds.Split.VALIDATION, "dev"),
+                  (tfds.Split.TEST, "test")]
+    return [
+        tfds.core.SplitGenerator(
+            name=split, gen_kwargs={"directory": os.path.join(base_dir, sub)})
+        for split, sub in split_dirs
+    ]
+
+  def _build_pcollection(self, pipeline, directory):
+    """Returns a PCollection of examples for the stm files of `directory`."""
+    beam = tfds.core.lazy_imports.apache_beam
+    # Files are listed eagerly here; each one is parsed inside the FlatMap.
+    stm_files = tf.io.gfile.glob(os.path.join(directory, "stm", "*stm"))
+    return (pipeline
+            | beam.Create(stm_files)
+            | beam.FlatMap(_generate_examples_from_stm_file))
+
+
+def _generate_examples_from_stm_file(stm_path):
+  """Yields a (key, example) pair for every utterance line of an stm file.
+
+  Lines read `<file> <channel> <speaker> <start> <end> <label> <text>`; audio
+  comes from `<file>.sph` in the `sph` dir beside the stm dir; blanks skipped.
+  """
+  sph_dir = os.path.join(os.path.dirname(os.path.dirname(stm_path)), "sph")
+  with tf.io.gfile.GFile(stm_path) as f:
+    for line in f:
+      line = line.strip()
+      if not line:  # E.g. a trailing blank line; unpacking it would raise.
+        continue
+      fn, channel, speaker, start, end, label, transcript = line.split(" ", 6)
+      samples = _extract_audio_segment(
+          os.path.join(sph_dir, "%s.sph" % fn), int(channel), float(start),
+          float(end))
+      key = "-".join([speaker, start, end, label])
+      yield key, {
+          "speech": samples,
+          "text": _maybe_trim_suffix(transcript),
+          "speaker_id": speaker,
+          "gender": _parse_gender(label),
+          "id": key,
+      }
+
+
+def _maybe_trim_suffix(transcript):
+  """Drops a final space-separated token that starts with "(", if present.
+
+  stm files for the train split end each transcript with such a key. A
+  transcript that contains no space at all is returned unchanged.
+  """
+  head, sep, tail = transcript.rpartition(" ")
+  trailing_key = bool(sep) and tail.startswith("(")
+  return head if trailing_key else transcript
+
+
+def _parse_gender(label_str):
+  """Maps an stm label such as "<o,f0,male>" or "<F0_M>" to a gender value.
+
+  Returns the last ","/"_"-separated field minus its final character, with
+  "F"/"M" spelled out; -1 (missing label) if that field is empty.
+  """
+  tag = re.split(",|_", label_str)[-1][:-1]
+  if not tag:
+    return -1  # Missing label.
+  return {"F": "female", "M": "male"}.get(tag, tag)
+
+
+def _extract_audio_segment(sph_path, channel, start_sec, end_sec):
+  """Returns samples of `sph_path` from `start_sec` to `end_sec` as an ndarray.
+
+  Raises ValueError (rather than asserting, which `python -O` strips) if the
+  file is not mono or `channel` is not 1; the dataset only has mono audio.
+  """
+  with tf.io.gfile.GFile(sph_path, "rb") as f:
+    segment = tfds.core.lazy_imports.pydub.AudioSegment.from_file(
+        f, format="nistsphere")
+  if segment.channels != 1 or channel != 1:
+    raise ValueError("%s: expected mono audio, channel 1." % sph_path)
+  clip = segment[int(start_sec * 1000):int(end_sec * 1000)]  # Milliseconds.
+  return np.array(clip.get_array_of_samples())
diff --git a/tensorflow_datasets/audio/tedlium_test.py b/tensorflow_datasets/audio/tedlium_test.py
@@ -0,0 +1,38 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Tests for tedlium dataset module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow_datasets import testing
+from tensorflow_datasets.audio import tedlium
+import tensorflow_datasets.public_api as tfds
+
+
+class TedliumTest(testing.DatasetBuilderTestCase):
+  """DatasetBuilderTestCase checks for Tedlium, run on the fake examples.
+
+  Train expects 4 examples (two stm files x two lines); dev and test 1 each.
+  """
+  DATASET_CLASS = tedlium.Tedlium
+  SPLITS = {tfds.Split.TRAIN: 4, tfds.Split.TEST: 1, tfds.Split.VALIDATION: 1}
+
+
+if __name__ == "__main__":
+  testing.test_main()
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/tedlium/TEDLIUM_release1/dev/sph/Hulk_2000.sph b/tensorflow_datasets/testing/test_data/fake_examples/tedlium/TEDLIUM_release1/dev/sph/Hulk_2000.sph
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/tedlium/TEDLIUM_release1/dev/stm/Hulk_2000.stm b/tensorflow_datasets/testing/test_data/fake_examples/tedlium/TEDLIUM_release1/dev/stm/Hulk_2000.stm
@@ -0,0 +1 @@
+Hulk_2000 1 Hulk_2000 0.10 0.7 <o,f0,male> go do you hear
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/tedlium/TEDLIUM_release1/test/sph/Groot_2001.sph b/tensorflow_datasets/testing/test_data/fake_examples/tedlium/TEDLIUM_release1/test/sph/Groot_2001.sph
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/tedlium/TEDLIUM_release1/test/stm/Groot_2001.stm b/tensorflow_datasets/testing/test_data/fake_examples/tedlium/TEDLIUM_release1/test/stm/Groot_2001.stm
@@ -0,0 +1 @@
+Groot_2001 1 Groot_2001 0.00 2.30 <o,f0,unknown> he began a confused complaint against the wizard who had vanished behind the curtain on the left
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/tedlium/TEDLIUM_release1/train/sph/Denny_2003.sph b/tensorflow_datasets/testing/test_data/fake_examples/tedlium/TEDLIUM_release1/train/sph/Denny_2003.sph
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/tedlium/TEDLIUM_release1/train/sph/Sean_2004.sph b/tensorflow_datasets/testing/test_data/fake_examples/tedlium/TEDLIUM_release1/train/sph/Sean_2004.sph
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/tedlium/TEDLIUM_release1/train/stm/Denny_2003.stm b/tensorflow_datasets/testing/test_data/fake_examples/tedlium/TEDLIUM_release1/train/stm/Denny_2003.stm
@@ -0,0 +1,2 @@
+Denny_2003 1 Denny_2003 0.01 1.00 <o,f0,male> go do you hear (suffix)
+Denny_2003 1 Denny_2003 1.05 1.75 <o,f0,male> {noise} maybe (suffix2)
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/tedlium/TEDLIUM_release1/train/stm/Sean_2004.stm b/tensorflow_datasets/testing/test_data/fake_examples/tedlium/TEDLIUM_release1/train/stm/Sean_2004.stm
@@ -0,0 +1,2 @@
+Sean_2004 1 Sean_2004 0.1 3.34 <F0_M> forgotten too the name of Gillian the lovely captive (suffix)
+Sean_2004 1 Sean_2004 3.40 4.01 <F0_M> some transcript text (another_suffix)
diff --git a/tensorflow_datasets/url_checksums/tedlium.txt b/tensorflow_datasets/url_checksums/tedlium.txt
@@ -0,0 +1 @@
+http://www.openslr.org/resources/7/TEDLIUM_release1.tar.gz 21285615015 30301975fd8c5cac4040c261c0852f57cfa8adbbad2ce78e77e4986957445f27

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Hulk_2000 1 Hulk_2000 0.10 0.7 <o,f0,male> go do you hear`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Groot_2001 1 Groot_2001 0.00 2.30 <o,f0,unknown> he began a confused complaint against the wizard who had vanished behind the curtain on the left`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+Denny_2003 1 Denny_2003 0.01 1.00 <o,f0,male> go do you hear (suffix)`
	`2`	`+Denny_2003 1 Denny_2003 1.05 1.75 <o,f0,male> {noise} maybe (suffix2)`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+Sean_2004 1 Sean_2004 0.1 3.34 <F0_M> forgotten too the name of Gillian the lovely captive (suffix)`
	`2`	`+Sean_2004 1 Sean_2004 3.40 4.01 <F0_M> some transcript text (another_suffix)`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+http://www.openslr.org/resources/7/TEDLIUM_release1.tar.gz 21285615015 30301975fd8c5cac4040c261c0852f57cfa8adbbad2ce78e77e4986957445f27`