BERT #30
Draft: mrityunjay-tripathi wants to merge 11 commits into mlpack:master from mrityunjay-tripathi:bert.
Commits (11, all by mrityunjay-tripathi; the changes shown below are from 7 of them):
- 33a982f adding bert model
- 50497e8 correction
- d67d5cc style fix
- 9363fe5 add tokenize class
- 929dba6 add CMakeLists.txt
- 278dfc7 some fixes
- 39da2c8 add function to load vocabulary
- 2be79a0 style fix
- 12156fd some corrections and style fixes
- d731575 add utility function to remove whitespaces
- 8df3c3e Merge branch 'master' of https://github.com/mlpack/models into bert
.gitignore:

```diff
@@ -16,3 +16,4 @@ data/*
 *.bin
 .travis/configs.hpp
 Testing/*
+.vscode/*
```
models/bert/CMakeLists.txt (new file):

```cmake
cmake_minimum_required(VERSION 3.1.0 FATAL_ERROR)
project(bert)

set(DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../../")

set(SOURCES
  bert.hpp
  bert_impl.hpp
  bert_tokenizer.hpp
  bert_tokenizer_impl.hpp
)

foreach(file ${SOURCES})
  set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file})
endforeach()

set(DIRS ${DIRS} ${DIR_SRCS} PARENT_SCOPE)
```
models/bert/bert.hpp (new file):

```cpp
/**
 * @file models/bert/bert.hpp
 * @author Mrityunjay Tripathi
 *
 * Definition of the BERT (Bidirectional Encoder Representations from
 * Transformers) model.
 *
 * mlpack is free software; you may redistribute it and/or modify it under the
 * terms of the 3-clause BSD license. You should have received a copy of the
 * 3-clause BSD license along with mlpack. If not, see
 * http://www.opensource.org/licenses/BSD-3-Clause for more information.
 */

#ifndef MODELS_BERT_BERT_HPP
#define MODELS_BERT_BERT_HPP

#include <mlpack/prereqs.hpp>
#include <mlpack/methods/ann/ffn.hpp>
#include <mlpack/methods/ann/layer/layer_types.hpp>
#include <mlpack/methods/ann/init_rules/glorot_init.hpp>

namespace mlpack {
namespace ann /** Artificial Neural Network. */ {

/**
 * @tparam OutputLayerType Type of the last layer to be added to the BERT
 *     model.
 * @tparam InitializationRuleType Initialization rule used to initialize the
 *     parameters.
 */
template <
  typename OutputLayerType = NegativeLogLikelihood<>,
  typename InitializationRuleType = XavierInitialization
>
class BERT
{
 public:
  BERT();

  /**
   * Create the BERT object using the specified parameters.
   *
   * @param srcVocabSize The size of the vocabulary.
   * @param srcSeqLen The source sequence length.
   * @param numEncoderLayers The number of Transformer Encoder layers.
   * @param dModel The dimensionality of the model.
   * @param numHeads The number of attention heads.
   * @param dropout The dropout rate.
   * @param attentionMask The attention mask used to black out future tokens.
   * @param keyPaddingMask The key padding mask used to black out specific
   *     tokens.
   */
  BERT(const size_t srcVocabSize,
       const size_t srcSeqLen,
       const size_t numEncoderLayers = 12,
       const size_t dModel = 512,
       const size_t numHeads = 8,
       const double dropout = 0.1,
       const arma::mat& attentionMask = arma::mat(),
       const arma::mat& keyPaddingMask = arma::mat());

  /**
   * Load the network from a local directory.
   *
   * @param filepath The location of the stored model.
   */
  void LoadModel(const std::string& filepath);

  /**
   * Save the network locally.
   *
   * @param filepath The location where the model is to be saved.
   */
  void SaveModel(const std::string& filepath);

 private:
  //! Locally-stored size of the vocabulary.
  size_t srcVocabSize;

  //! Locally-stored source sequence length.
  size_t srcSeqLen;

  //! Locally-stored number of Transformer Encoder blocks.
  size_t numEncoderLayers;

  //! Locally-stored dimensionality of the model.
  size_t dModel;

  //! Locally-stored number of attention heads.
  size_t numHeads;

  //! Locally-stored number of hidden units in the feed-forward network.
  size_t dimFFN;

  //! Locally-stored dropout rate.
  double dropout;

  //! Locally-stored attention mask.
  arma::mat attentionMask;

  //! Locally-stored key padding mask.
  arma::mat keyPaddingMask;

  //! Locally-stored complete BERT network.
  FFN<OutputLayerType, InitializationRuleType> bert;
}; // class BERT

} // namespace ann
} // namespace mlpack

// Include implementation.
#include "bert_impl.hpp"

#endif
```
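For context, a minimal construction sketch (not part of the diff, and assuming the implementation in bert_impl.hpp below): the vocabulary size and sequence length are hypothetical placeholders, and the default template arguments NegativeLogLikelihood<> and XavierInitialization are used.

```cpp
#include "bert.hpp"

using namespace mlpack::ann;

int main()
{
  // Hypothetical hyperparameters; any WordPiece vocabulary size and maximum
  // sequence length supported by the data pipeline could be used instead.
  const size_t vocabSize = 30522;
  const size_t seqLen = 128;

  // Default template arguments: NegativeLogLikelihood<> output layer and
  // XavierInitialization rule.
  BERT<> bert(vocabSize, seqLen,
              /* numEncoderLayers */ 12,
              /* dModel */ 512,
              /* numHeads */ 8,
              /* dropout */ 0.1);

  // The attention mask and key padding mask default to empty matrices.
  return 0;
}
```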
models/bert/bert_impl.hpp (new file):

```cpp
/**
 * @file models/bert/bert_impl.hpp
 * @author Mrityunjay Tripathi
 *
 * Implementation of the BERT (Bidirectional Encoder Representations from
 * Transformers) model.
 *
 * mlpack is free software; you may redistribute it and/or modify it under the
 * terms of the 3-clause BSD license. You should have received a copy of the
 * 3-clause BSD license along with mlpack. If not, see
 * http://www.opensource.org/licenses/BSD-3-Clause for more information.
 */

#ifndef MODELS_BERT_BERT_IMPL_HPP
#define MODELS_BERT_BERT_IMPL_HPP

#include "bert.hpp"

namespace mlpack {
namespace ann /** Artificial Neural Network. */ {

template<typename OutputLayerType, typename InitializationRuleType>
BERT<OutputLayerType, InitializationRuleType>::BERT() :
    srcVocabSize(0),
    srcSeqLen(0),
    numEncoderLayers(0),
    dModel(0),
    numHeads(0),
    dimFFN(4 * dModel),
    dropout(0.0)
{
  // Nothing to do here.
}

template<typename OutputLayerType, typename InitializationRuleType>
BERT<OutputLayerType, InitializationRuleType>::BERT(
    const size_t srcVocabSize,
    const size_t srcSeqLen,
    const size_t numEncoderLayers,
    const size_t dModel,
    const size_t numHeads,
    const double dropout,
    const arma::mat& attentionMask,
    const arma::mat& keyPaddingMask) :
    srcVocabSize(srcVocabSize),
    srcSeqLen(srcSeqLen),
    numEncoderLayers(numEncoderLayers),
    dModel(dModel),
    numHeads(numHeads),
    dimFFN(4 * dModel),
    dropout(dropout),
    attentionMask(attentionMask),
    keyPaddingMask(keyPaddingMask)
{
  // Embedding: elementwise sum (AddMerge) of a vocabulary Lookup and a
  // 3-entry Lookup (segment / token-type embedding).
  AddMerge<>* embedding = new AddMerge<>();
  embedding->Add<Lookup<>>(srcVocabSize, dModel);
  embedding->Add<Lookup<>>(3, dModel);

  bert.Add(embedding);
  bert.Add<PositionalEncoding<>>(dModel, srcSeqLen);
  bert.Add<Dropout<>>(dropout);

  // Stack numEncoderLayers Transformer Encoder blocks, each holding a single
  // encoder layer.
  for (size_t i = 0; i < numEncoderLayers; ++i)
  {
    mlpack::ann::TransformerEncoder<> encoder(
        1, // Number of encoder layers in this block.
        srcSeqLen,
        dModel,
        numHeads,
        dimFFN,
        dropout,
        attentionMask,
        keyPaddingMask);

    bert.Add(encoder.Model());
  }
}

template<typename OutputLayerType, typename InitializationRuleType>
void BERT<OutputLayerType, InitializationRuleType>::LoadModel(
    const std::string& filepath)
{
  data::Load(filepath, "BERT", bert);
  std::cout << "Loaded model" << std::endl;
}

template<typename OutputLayerType, typename InitializationRuleType>
void BERT<OutputLayerType, InitializationRuleType>::SaveModel(
    const std::string& filepath)
{
  std::cout << "Saving model" << std::endl;
  data::Save(filepath, "BERT", bert);
  std::cout << "Model saved in " << filepath << std::endl;
}

} // namespace ann
} // namespace mlpack

#endif
```
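As a usage note (a sketch, not part of the diff): SaveModel() and LoadModel() wrap mlpack's data::Save() and data::Load() serialization of the underlying FFN, so a save/load round trip looks like the following; the model sizes and file name are hypothetical.

```cpp
#include "bert.hpp"

using namespace mlpack::ann;

int main()
{
  // Build a small model (hypothetical sizes), write it to disk, and read it
  // back into a second, default-constructed object.
  BERT<> model(/* srcVocabSize */ 1000, /* srcSeqLen */ 32,
               /* numEncoderLayers */ 2, /* dModel */ 64,
               /* numHeads */ 4, /* dropout */ 0.1);
  model.SaveModel("bert_small.bin");

  BERT<> restored;
  restored.LoadModel("bert_small.bin");
  return 0;
}
```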
models/bert/bert_tokenizer.hpp (new file):

```cpp
/**
 * @file models/bert/bert_tokenizer.hpp
 * @author Mrityunjay Tripathi
 *
 * Definition of the BERT Tokenizer.
 *
 * @code
 * @article{Wolf2019HuggingFacesTS,
 *   title   = {HuggingFace's Transformers: State-of-the-art Natural Language
 *              Processing},
 *   author  = {Thomas Wolf and Lysandre Debut and Victor Sanh and
 *              Julien Chaumond and Clement Delangue and Anthony Moi and
 *              Pierric Cistac and Tim Rault and R'emi Louf and
 *              Morgan Funtowicz and Jamie Brew},
 *   journal = {ArXiv},
 *   year    = {2019},
 *   volume  = {abs/1910.03771}
 * }
 * @endcode
 *
 * mlpack is free software; you may redistribute it and/or modify it under the
 * terms of the 3-clause BSD license. You should have received a copy of the
 * 3-clause BSD license along with mlpack. If not, see
 * http://www.opensource.org/licenses/BSD-3-Clause for more information.
 */

#ifndef MODELS_BERT_BERT_TOKENIZER_HPP
#define MODELS_BERT_BERT_TOKENIZER_HPP

#include <mlpack/prereqs.hpp>

namespace mlpack {
namespace ann /** Artificial Neural Network. */ {

/**
 * @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
 *     arma::sp_mat or arma::cube).
 * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
 *     arma::sp_mat or arma::cube).
 */
template <
  typename InputDataType = arma::mat,
  typename OutputDataType = arma::mat
>
class BertTokenizer
{
 public:
  /**
   * Create a BertTokenizer object.
   */
  BertTokenizer();

  /**
   * Create the BertTokenizer object using the specified parameters.
   *
   * @param vocabFile Location of the file containing the vocabulary.
   * @param lowerCase Whether to convert each token to lower case.
   * @param basicTokenize Whether to do basic tokenization before WordPiece.
   * @param neverSplit Tokens which will never be split during tokenization.
   *     Only has an effect when basicTokenize = true.
   * @param unkToken The unknown token. A token that is not in the vocabulary
   *     cannot be converted to an ID and is set to be this token instead.
   * @param sepToken The separator token. It is used when building a sequence
   *     from multiple sequences, e.g. two sequences for sequence
   *     classification or a text and a question for question answering.
   *     It is also used as the last token of a sequence built with special
   *     tokens.
   * @param padToken The token used for padding, for example when batching
   *     sequences of different lengths.
   * @param clsToken The classifier token which is used when doing sequence
   *     classification (classification of the whole sequence instead of
   *     per-token classification). It is the first token of the sequence
   *     when built with special tokens.
   * @param maskToken The token used for masking values. This is the token
   *     used when training this model with masked language modeling, and the
   *     token which the model will try to predict.
   */
  BertTokenizer(const std::string vocabFile,
                const bool lowerCase = true,
                const bool basicTokenize = true,
                const std::vector<std::string> neverSplit =
                    std::vector<std::string>(),
                const std::string unkToken = "[UNK]",
                const std::string sepToken = "[SEP]",
                const std::string padToken = "[PAD]",
                const std::string clsToken = "[CLS]",
                const std::string maskToken = "[MASK]");

 private:
  //! Location of the vocabulary file.
  std::string vocabFile;

  //! Locally-stored vocabulary.
  std::vector<std::string> vocabulary;

  //! Whether to convert each token to lower case.
  bool lowerCase;

  //! Whether to do basic tokenization before WordPiece.
  bool basicTokenize;

  //! Tokens which will never be split during tokenization. Only has an effect
  //! when basicTokenize = true.
  std::vector<std::string> neverSplit;

  //! The unknown token. A token that is not in the vocabulary cannot be
  //! converted to an ID and is set to be this token instead.
  std::string unkToken;

  //! The separator token. It is used when building a sequence from multiple
  //! sequences, e.g. two sequences for sequence classification or a text and
  //! a question for question answering. It is also used as the last token of
  //! a sequence built with special tokens.
  std::string sepToken;

  //! The token used for padding, for example when batching sequences of
  //! different lengths.
  std::string padToken;

  //! The classifier token which is used when doing sequence classification
  //! (classification of the whole sequence instead of per-token
  //! classification). It is the first token of the sequence when built with
  //! special tokens.
  std::string clsToken;

  //! The token used for masking values. This is the token used when training
  //! this model with masked language modeling, and the token which the model
  //! will try to predict.
  std::string maskToken;
}; // class BertTokenizer

} // namespace ann
} // namespace mlpack

// Include implementation.
#include "bert_tokenizer_impl.hpp"

#endif
```
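For context, a minimal construction sketch (not part of the diff, and assuming the constructors declared above are defined in bert_tokenizer_impl.hpp): the vocabulary path "vocab.txt" and the "[unused0]" entry in neverSplit are hypothetical placeholders.

```cpp
#include "bert_tokenizer.hpp"

using namespace mlpack::ann;

int main()
{
  // Lower-casing and basic tokenization (before WordPiece) are enabled by
  // default, and the usual BERT special tokens ([UNK], [SEP], [PAD], [CLS],
  // [MASK]) are used unless overridden.
  BertTokenizer<> tokenizer("vocab.txt");

  // Tokens listed in neverSplit are kept intact during basic tokenization.
  BertTokenizer<> customTokenizer("vocab.txt", true, true,
      std::vector<std::string>{ "[unused0]" });
  return 0;
}
```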