Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 554973f

Browse files
alexyku and Ryan Sepassi
authored and committed
Adding a minimum viable DNA data encoder.
PiperOrigin-RevId: 164201984
1 parent 95ee9e5 commit 554973f

File tree

4 files changed

+183
-66
lines changed

4 files changed

+183
-66
lines changed
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
# coding=utf-8
2+
# Copyright 2017 The Tensor2Tensor Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Encoders for DNA data.
17+
18+
* DNAEncoder: ACTG strings to ints and back
19+
* DelimitedDNAEncoder: for delimited subsequences
20+
"""
21+
22+
from __future__ import absolute_import
23+
from __future__ import division
24+
from __future__ import print_function
25+
26+
import itertools
27+
# Dependency imports
28+
29+
from six.moves import xrange # pylint: disable=redefined-builtin
30+
from tensor2tensor.data_generators import text_encoder
31+
32+
33+
class DNAEncoder(text_encoder.TextEncoder):
  """ACTG strings to ints and back. Optionally chunks bases into single ids.

  To use a different character set, subclass and set BASES to the char set. UNK
  and PAD must not appear in the char set, but can also be reset.

  Uses 'N' as an unknown base.
  """
  BASES = list("ACTG")
  UNK = "N"
  PAD = "0"

  def __init__(self,
               chunk_size=1,
               num_reserved_ids=text_encoder.NUM_RESERVED_TOKENS):
    """Initializes the encoder.

    Args:
      chunk_size: int, number of bases folded into a single vocabulary id.
      num_reserved_ids: int, ids in [0, num_reserved_ids) are reserved and are
        never assigned to base chunks.
    """
    super(DNAEncoder, self).__init__(num_reserved_ids=num_reserved_ids)
    # Build a vocabulary of all chunks up to chunk_size, right-padded with PAD.
    self._chunk_size = chunk_size
    tokens = self._tokens()
    # Sort with an explicit key so subclasses may mix token types: e.g.
    # DelimitedDNAEncoder adds a delimiter token next to the base tuples, and
    # comparing str to tuple directly raises TypeError on Python 3. For the
    # all-tuple base vocabulary, key=tuple is the identity and the ordering is
    # unchanged.
    tokens.sort(key=tuple)
    ids = range(self._num_reserved_ids, len(tokens) + self._num_reserved_ids)
    self._ids_to_tokens = dict(zip(ids, tokens))
    self._tokens_to_ids = dict(zip(tokens, ids))

  def _tokens(self):
    """Returns the vocabulary: tuples of bases right-padded to chunk_size."""
    chunks = []
    for size in range(1, self._chunk_size + 1):
      c = itertools.product(self.BASES + [self.UNK], repeat=size)
      num_pad = self._chunk_size - size
      padding = (self.PAD,) * num_pad
      c = [el + padding for el in c]
      chunks.extend(c)
    return chunks

  @property
  def vocab_size(self):
    """Total number of ids, including the reserved range."""
    return len(self._ids_to_tokens) + self._num_reserved_ids

  def encode(self, s):
    """Encodes a base string into a list of chunk ids.

    Args:
      s: str of bases drawn from BASES + UNK, e.g. "ACTGN".

    Returns:
      list of int ids, one per chunk. If len(s) is not a multiple of
      chunk_size, the final chunk is right-padded with PAD.

    Raises:
      ValueError: if s contains a character outside the vocabulary.
    """
    bases = list(s)
    extra = len(bases) % self._chunk_size
    if extra > 0:
      # Right-pad the final partial chunk so it maps to a vocabulary token.
      pad = [self.PAD] * (self._chunk_size - extra)
      bases.extend(pad)
    assert (len(bases) % self._chunk_size) == 0
    num_chunks = len(bases) // self._chunk_size
    ids = []
    for chunk_idx in xrange(num_chunks):
      start_idx = chunk_idx * self._chunk_size
      end_idx = start_idx + self._chunk_size
      chunk = tuple(bases[start_idx:end_idx])
      if chunk not in self._tokens_to_ids:
        raise ValueError("Unrecognized token %s" % chunk)
      ids.append(self._tokens_to_ids[chunk])
    return ids

  def decode(self, ids):
    """Inverts encode: turns a list of ids back into a base string.

    PAD characters introduced by encode are stripped, so
    decode(encode(s)) == s for any s over the vocabulary.
    """
    bases = []
    for idx in ids:
      if idx >= self._num_reserved_ids:
        chunk = self._ids_to_tokens[idx]
        if self.PAD in chunk:
          # Drop the right-padding that encode added to a partial final chunk.
          chunk = chunk[:chunk.index(self.PAD)]
      else:
        # Reserved ids decode to their reserved token text.
        chunk = [text_encoder.RESERVED_TOKENS[idx]]
      bases.extend(chunk)
    return "".join(bases)
100+
101+
102+
class DelimitedDNAEncoder(DNAEncoder):
  """DNAEncoder for delimiter separated subsequences.

  Uses ',' as default delimiter.
  """

  def __init__(self, delimiter=",", **kwargs):
    """Creates the encoder.

    Args:
      delimiter: str separating subsequences. Must not contain characters
        from BASES, UNK, or PAD.
      **kwargs: forwarded to DNAEncoder (chunk_size, num_reserved_ids).
    """
    self._delimiter = delimiter
    # Store the delimiter token as a tuple of characters so that every entry
    # in the vocabulary has the same type. Mixing a plain str in with the base
    # tuples makes tokens.sort() in DNAEncoder.__init__ raise TypeError on
    # Python 3 (str and tuple are unorderable against each other).
    self._delimiter_token = tuple(delimiter)
    super(DelimitedDNAEncoder, self).__init__(**kwargs)

  @property
  def delimiter(self):
    return self._delimiter

  def _tokens(self):
    return super(DelimitedDNAEncoder, self)._tokens() + [self._delimiter_token]

  def encode(self, delimited_string):
    """Encodes delimiter-separated subsequences, emitting delimiter ids.

    Each subsequence is encoded with DNAEncoder.encode and the subsequences
    are joined by the delimiter's own id, so decode reproduces the original
    delimited string.
    """
    ids = []
    for s in delimited_string.split(self.delimiter):
      ids.extend(super(DelimitedDNAEncoder, self).encode(s))
      ids.append(self._tokens_to_ids[self._delimiter_token])
    # The loop appends one trailing delimiter id too many; drop it.
    return ids[:-1]
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# coding=utf-8
2+
# Copyright 2017 The Tensor2Tensor Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Tests for tensor2tensor.data_generators.dna_encoder."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
# Dependency imports
23+
24+
from tensor2tensor.data_generators import dna_encoder
25+
import tensorflow as tf
26+
27+
28+
class DnaEncoderTest(tf.test.TestCase):
  """Round-trip tests for the DNA encoders."""

  def test_encode_decode(self):
    sequence = 'TTCGCGGNNNAACCCAACGCCATCTATGTANNTTGAGTTGTTGAGTTAAA'

    # Encoding should be reversible for any reasonable chunk size.
    for size in [1, 2, 4, 6, 8]:
      enc = dna_encoder.DNAEncoder(chunk_size=size)
      round_trip = enc.decode(enc.encode(sequence))
      self.assertEqual(sequence, round_trip)

  def test_delimited_dna_encoder(self):
    sequence = 'TTCGCGGNNN,AACCCAACGC,CATCTATGTA,NNTTGAGTTG,TTGAGTTAAA'

    # Encoding should be reversible for any reasonable chunk size.
    for size in [1, 2, 4, 6, 8]:
      enc = dna_encoder.DelimitedDNAEncoder(chunk_size=size)
      round_trip = enc.decode(enc.encode(sequence))
      self.assertEqual(sequence, round_trip)
49+
50+
51+
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
  tf.test.main()

tensor2tensor/data_generators/gene_expression.py

Lines changed: 4 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@
3535
from __future__ import division
3636
from __future__ import print_function
3737

38-
import itertools
3938
import math
4039
import multiprocessing as mp
4140
import os
@@ -47,6 +46,7 @@
4746

4847
from six.moves import xrange # pylint: disable=redefined-builtin
4948

49+
from tensor2tensor.data_generators import dna_encoder
5050
from tensor2tensor.data_generators import generator_utils
5151
from tensor2tensor.data_generators import problem
5252
from tensor2tensor.data_generators import text_encoder
@@ -56,7 +56,6 @@
5656
import tensorflow as tf
5757

5858
MAX_CONCURRENT_PROCESSES = 10
59-
_bases = list("ACTG")
6059

6160

6261
class GeneExpressionProblem(problem.Problem):
@@ -82,7 +81,7 @@ def chunk_size(self):
8281
def feature_encoders(self, data_dir):
8382
del data_dir
8483
return {
85-
"inputs": DNAEncoder(chunk_size=self.chunk_size),
84+
"inputs": dna_encoder.DNAEncoder(chunk_size=self.chunk_size),
8685
# TODO(rsepassi): RealEncoder?
8786
"targets": text_encoder.TextEncoder()
8887
}
@@ -244,7 +243,7 @@ def dataset_generator(filepath,
244243
chunk_size=1,
245244
start_idx=None,
246245
end_idx=None):
247-
encoder = DNAEncoder(chunk_size=chunk_size)
246+
encoder = dna_encoder.DNAEncoder(chunk_size=chunk_size)
248247
with h5py.File(filepath, "r") as h5_file:
249248
# Get input keys from h5_file
250249
src_keys = [s % dataset for s in ["%s_in", "%s_na", "%s_out"]]
@@ -278,7 +277,7 @@ def to_example_dict(encoder, inputs, mask, outputs):
278277
while idx != last_idx + 1:
279278
bases.append(encoder.UNK)
280279
last_idx += 1
281-
bases.append(_bases[base_id])
280+
bases.append(encoder.BASES[base_id])
282281
last_idx = idx
283282
assert len(inputs) == len(bases)
284283

@@ -297,62 +296,3 @@ def to_example_dict(encoder, inputs, mask, outputs):
297296
ex_dict = dict(
298297
zip(example_keys, [input_ids, targets_mask, targets, targets_shape]))
299298
return ex_dict
300-
301-
302-
class DNAEncoder(text_encoder.TextEncoder):
303-
"""ACTG strings to ints and back. Optionally chunks bases into single ids.
304-
305-
Uses 'X' as an unknown base.
306-
"""
307-
UNK = "X"
308-
PAD = "0"
309-
310-
def __init__(self,
311-
chunk_size=1,
312-
num_reserved_ids=text_encoder.NUM_RESERVED_TOKENS):
313-
super(DNAEncoder, self).__init__(num_reserved_ids=num_reserved_ids)
314-
# Build a vocabulary of chunks of size chunk_size
315-
self._chunk_size = chunk_size
316-
chunks = []
317-
for size in range(1, chunk_size + 1):
318-
c = itertools.product(_bases + [DNAEncoder.UNK], repeat=size)
319-
num_pad = chunk_size - size
320-
padding = (DNAEncoder.PAD,) * num_pad
321-
c = [el + padding for el in c]
322-
chunks.extend(c)
323-
chunks.sort()
324-
ids = range(self._num_reserved_ids, len(chunks) + self._num_reserved_ids)
325-
self._ids_to_chunk = dict(zip(ids, chunks))
326-
self._chunks_to_ids = dict(zip(chunks, ids))
327-
328-
@property
329-
def vocab_size(self):
330-
return len(self._ids_to_chunk) + self._num_reserved_ids
331-
332-
def encode(self, s):
333-
bases = list(s)
334-
pad = [DNAEncoder.PAD] * (len(bases) % self._chunk_size)
335-
bases.extend(pad)
336-
assert (len(bases) % self._chunk_size) == 0
337-
num_chunks = len(bases) // self._chunk_size
338-
ids = []
339-
for chunk_idx in xrange(num_chunks):
340-
start_idx = chunk_idx * self._chunk_size
341-
end_idx = start_idx + self._chunk_size
342-
chunk = tuple(bases[start_idx:end_idx])
343-
if chunk not in self._chunks_to_ids:
344-
raise ValueError("Unrecognized chunk %s" % chunk)
345-
ids.append(self._chunks_to_ids[chunk])
346-
return ids
347-
348-
def decode(self, ids):
349-
bases = []
350-
for idx in ids:
351-
if idx >= self._num_reserved_ids:
352-
chunk = self._ids_to_chunk[idx]
353-
if DNAEncoder.PAD in chunk:
354-
chunk = chunk[:chunk.index(DNAEncoder.PAD)]
355-
else:
356-
chunk = [text_encoder.RESERVED_TOKENS[idx]]
357-
bases.extend(chunk)
358-
return "".join(bases)

tensor2tensor/data_generators/gene_expression_test.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
import numpy as np
2424

25+
from tensor2tensor.data_generators import dna_encoder
2526
from tensor2tensor.data_generators import gene_expression
2627

2728
import tensorflow as tf
@@ -40,8 +41,8 @@ def _oneHotBases(self, bases):
4041
return np.array(one_hots)
4142

4243
def testRecordToExample(self):
43-
encoder = gene_expression.DNAEncoder(chunk_size=2)
44-
raw_inputs = ["A", "C", "G", "X", "C", "T"]
44+
encoder = dna_encoder.DNAEncoder(chunk_size=2)
45+
raw_inputs = ["A", "C", "G", "N", "C", "T"]
4546

4647
# Put in numpy arrays in the same format as in the h5 file
4748
inputs = self._oneHotBases(raw_inputs)

0 commit comments

Comments
 (0)