1 change: 1 addition & 0 deletions .gitignore
@@ -16,3 +16,4 @@ data/*
*.bin
.travis/configs.hpp
Testing/*
.vscode/*
10 changes: 9 additions & 1 deletion models/CMakeLists.txt
@@ -1,7 +1,15 @@
cmake_minimum_required(VERSION 3.1.0 FATAL_ERROR)
project(models)

add_subdirectory(darknet)
# Recurse into each model mlpack provides.
set(DIRS
darknet
bert
)

foreach(dir ${DIRS})
add_subdirectory(${dir})
endforeach()

# Add directory name to sources.
set(DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/)
18 changes: 18 additions & 0 deletions models/bert/CMakeLists.txt
@@ -0,0 +1,18 @@
cmake_minimum_required(VERSION 3.1.0 FATAL_ERROR)
project(bert)

set(DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../../")

set(SOURCES
bert.hpp
bert_impl.hpp
bert_tokenizer.hpp
bert_tokenizer_impl.hpp
)

foreach(file ${SOURCES})
set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file})
endforeach()

set(DIRS ${DIRS} ${DIR_SRCS} PARENT_SCOPE)
111 changes: 111 additions & 0 deletions models/bert/bert.hpp
@@ -0,0 +1,111 @@
/**
* @file models/bert/bert.hpp
* @author Mrityunjay Tripathi
*
* Definition of the BERT (Bidirectional Encoder Representations from
* Transformers) model.
*
* mlpack is free software; you may redistribute it and/or modify it under the
* terms of the 3-clause BSD license. You should have received a copy of the
* 3-clause BSD license along with mlpack. If not, see
* http://www.opensource.org/licenses/BSD-3-Clause for more information.
*/

#ifndef MODELS_BERT_BERT_HPP
#define MODELS_BERT_BERT_HPP

#include <mlpack/prereqs.hpp>
#include <mlpack/methods/ann/ffn.hpp>
#include <mlpack/methods/ann/layer/layer_types.hpp>
#include <mlpack/methods/ann/init_rules/glorot_init.hpp>

namespace mlpack {
namespace ann /** Artificial Neural Network. */ {

/**
* @tparam OutputLayerType Type of the last layer to be added to BERT model.
* @tparam InitializationRuleType Initialization rule used to initialize
* parameters.
*/
template <
typename OutputLayerType = NegativeLogLikelihood<>,
typename InitializationRuleType = XavierInitialization
>
class BERT
{
public:
/**
 * Create an empty BERT object; the network can then be restored with
 * LoadModel().
 */
BERT();

/**
* Create the BERT object using the specified parameters.
*
* @param srcVocabSize The size of the vocabulary.
* @param srcSeqLen The source sequence length.
* @param numEncoderLayers The number of Transformer Encoder layers.
* @param dModel The dimensionality of the model.
* @param numHeads The number of attention heads.
* @param dropout The dropout rate.
* @param attentionMask The attention mask used to black out future sequences.
* @param keyPaddingMask Blacks out specific tokens.
*/
BERT(const size_t srcVocabSize,
const size_t srcSeqLen,
const size_t numEncoderLayers = 12,
const size_t dModel = 512,
const size_t numHeads = 8,
const double dropout = 0.1,
const arma::mat& attentionMask = arma::mat(),
const arma::mat& keyPaddingMask = arma::mat());

/**
* Load the network from a local directory.
*
* @param filepath The location of the stored model.
*/
void LoadModel(const std::string& filepath);

/**
* Save the network locally.
*
* @param filepath The location where the model is to be saved.
*/
void SaveModel(const std::string& filepath);

private:
//! Locally-stored size of the vocabulary.
size_t srcVocabSize;

//! Locally-stored source sequence length.
size_t srcSeqLen;

//! Locally-stored number of Transformer Encoder blocks.
size_t numEncoderLayers;

//! Locally-stored dimensionality of the model.
size_t dModel;

//! Locally-stored number of attention heads.
size_t numHeads;

//! Locally-stored number of hidden units in FFN.
size_t dimFFN;

//! Locally-stored dropout rate.
double dropout;

//! Locally-stored attention mask.
arma::mat attentionMask;

//! Locally-stored key padding mask.
arma::mat keyPaddingMask;

//! Locally-stored complete BERT encoder network.
FFN<OutputLayerType, InitializationRuleType> bert;
}; // class BERT

} // namespace ann
} // namespace mlpack

// Include implementation.
#include "bert_impl.hpp"

#endif
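For orientation, here is a minimal sketch of constructing the model declared above. The hyperparameters are illustrative, not prescriptive: 30522 is the WordPiece vocabulary size of the original BERT-Base release, and the remaining values simply restate this header's defaults.

#include "models/bert/bert.hpp"

using namespace mlpack::ann;

// Only srcVocabSize and srcSeqLen are required; the rest have defaults.
BERT<> bert(
    30522, // srcVocabSize: WordPiece vocabulary size.
    128,   // srcSeqLen: maximum input sequence length.
    12,    // numEncoderLayers.
    512,   // dModel.
    8,     // numHeads.
    0.1);  // dropout.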
99 changes: 99 additions & 0 deletions models/bert/bert_impl.hpp
@@ -0,0 +1,99 @@
/**
* @file models/bert/bert_impl.hpp
* @author Mrityunjay Tripathi
*
* Implementation of the BERT (Bidirectional Encoder Representation for
* Transformers).
*
* mlpack is free software; you may redistribute it and/or modify it under the
* terms of the 3-clause BSD license. You should have received a copy of the
* 3-clause BSD license along with mlpack. If not, see
* http://www.opensource.org/licenses/BSD-3-Clause for more information.
*/

#ifndef MODELS_BERT_BERT_IMPL_HPP
#define MODELS_BERT_BERT_IMPL_HPP

#include "bert.hpp"

namespace mlpack {
namespace ann /** Artificial Neural Network. */ {

template<typename OutputLayerType, typename InitializationRuleType>
BERT<OutputLayerType, InitializationRuleType>::BERT() :
srcVocabSize(0),
srcSeqLen(0),
numEncoderLayers(0),
dModel(0),
numHeads(0),
dimFFN(4 * dModel),
dropout(0.0)
{
// Nothing to do here.
}

template<typename OutputLayerType, typename InitializationRuleType>
BERT<OutputLayerType, InitializationRuleType>::BERT(
const size_t srcVocabSize,
const size_t srcSeqLen,
const size_t numEncoderLayers,
const size_t dModel,
const size_t numHeads,
const double dropout,
const arma::mat& attentionMask,
const arma::mat& keyPaddingMask) :
srcVocabSize(srcVocabSize),
srcSeqLen(srcSeqLen),
numEncoderLayers(numEncoderLayers),
dModel(dModel),
numHeads(numHeads),
dimFFN(4 * dModel),
dropout(dropout),
attentionMask(attentionMask),
keyPaddingMask(keyPaddingMask)
{
// The input embedding is the sum of a token embedding over the vocabulary
// and a segment (token-type) embedding over the three possible segment IDs.
AddMerge<>* embedding = new AddMerge<>();
embedding->Add<Lookup<>>(srcVocabSize, dModel);
embedding->Add<Lookup<>>(3, dModel);

bert.Add(embedding);
bert.Add<PositionalEncoding<>>(dModel, srcSeqLen);
bert.Add<Dropout<>>(dropout);

// TransformerEncoder stacks numEncoderLayers encoder blocks itself, so it
// is constructed once rather than once per layer.
mlpack::ann::TransformerEncoder<> encoder(
numEncoderLayers,
srcSeqLen,
dModel,
numHeads,
dimFFN,
dropout,
attentionMask,
keyPaddingMask);

bert.Add(encoder.Model());
}

template<typename OutputLayerType, typename InitializationRuleType>
void BERT<OutputLayerType, InitializationRuleType>::LoadModel(
const std::string& filepath)
{
data::Load(filepath, "BERT", bert);
std::cout << "Loaded model" << std::endl;
}

template<typename OutputLayerType, typename InitializationRuleType>
void BERT<OutputLayerType, InitializationRuleType>::SaveModel(
const std::string& filepath)
{
std::cout << "Saving model" << std::endl;
data::Save(filepath, "BERT", bert);
std::cout << "Model saved in " << filepath << std::endl;
}

} // namespace ann
} // namespace mlpack

#endif
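A note on the LoadModel()/SaveModel() pair above: mlpack's data::Load() and data::Save() pick the serialization format from the file extension, so the same member functions cover binary, XML, and plain-text archives. A short sketch with hypothetical file names:

BERT<> model(30522, 128);
model.SaveModel("bert.bin");  // Binary archive: compact and fast.
model.SaveModel("bert.xml");  // XML archive: human-readable, much larger.
model.LoadModel("bert.bin");  // Restores the parameters saved above.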
135 changes: 135 additions & 0 deletions models/bert/bert_tokenizer.hpp
@@ -0,0 +1,135 @@
/**
* @file models/bert/bert_tokenizer.hpp
* @author Mrityunjay Tripathi
*
* Definition of the BERT Tokenizer.
*
* @code
* @article{Wolf2019HuggingFacesTS,
* title = {HuggingFace's Transformers: State-of-the-art Natural Language
* Processing},
* author = {Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond
* and Clement Delangue and Anthony Moi and Pierric Cistac and
* Tim Rault and Rémi Louf and Morgan Funtowicz and Jamie Brew},
* journal = {ArXiv},
* year = {2019},
* volume = {abs/1910.03771}
* }
* @endcode
*
* mlpack is free software; you may redistribute it and/or modify it under the
* terms of the 3-clause BSD license. You should have received a copy of the
* 3-clause BSD license along with mlpack. If not, see
* http://www.opensource.org/licenses/BSD-3-Clause for more information.
*/

#ifndef MODELS_BERT_BERT_TOKENIZER_HPP
#define MODELS_BERT_BERT_TOKENIZER_HPP

#include <mlpack/prereqs.hpp>

namespace mlpack {
namespace ann /** Artificial Neural Network. */ {

/**
* @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
* arma::sp_mat or arma::cube).
* @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
* arma::sp_mat or arma::cube).
*/
template <
typename InputDataType = arma::mat,
typename OutputDataType = arma::mat
>
class BertTokenizer
{
public:
/**
* Create a BertTokenizer object.
*/
BertTokenizer();

/**
* Create the BertTokenizer object using the specified parameters.
*
* @param vocabFile Location of file containing the vocabulary.
* @param lowerCase Whether to turn each token to lower case.
* @param basicTokenize Whether to do basic tokenization before WordPiece.
* @param neverSplit Tokens which will never be split during tokenization.
* Only has an effect when basicTokenize = true.
* @param unkToken The unknown token. A token that is not in the vocabulary
* cannot be converted to an ID and is set to be this token instead.
* @param sepToken The separator token. It is used when building a sequence
* from multiple sequences, e.g. two sequences for sequence
* classification or for a text and a question for question answering.
* It is also used as the last token of a sequence built with special
* tokens.
* @param padToken The token used for padding, for example when batching
* sequences of different lengths.
* @param clsToken The classifier token which is used when doing sequence
* classification (classification of the whole sequence instead of
* per-token classification). It is the first token of the sequence
* when built with special tokens.
* @param maskToken The token used for masking values. This is the token used
* when training this model with masked language modeling. This is the
* token which the model will try to predict.
*/
BertTokenizer(const std::string vocabFile,
const bool lowerCase = true,
const bool basicTokenize = true,
const std::vector<std::string> neverSplit = std::vector<std::string>(),
const std::string unkToken = "[UNK]",
const std::string sepToken = "[SEP]",
const std::string padToken = "[PAD]",
const std::string clsToken = "[CLS]",
const std::string maskToken = "[MASK]");

private:
//! Location of vocabulary.
std::string vocabFile;

//! Locally-stored vocabulary.
std::vector<std::string> vocabulary;

//! Whether to turn each token to lower case.
bool lowerCase;

//! Whether to do basic tokenization before WordPiece.
bool basicTokenize;

//! Tokens which will never be split during tokenization. Only has an effect
//! when basicTokenize = true.
std::vector<std::string> neverSplit;

//! The unknown token. A token that is not in the vocabulary cannot be
//! converted to an ID and is set to be this token instead.
std::string unkToken;

//! The separator token. It is used when building a sequence from multiple
//! sequences, e.g. two sequences for sequence classification or for a text
//! and a question for question answering. It is also used as the last token
//! of a sequence built with special tokens.
std::string sepToken;

//! The token used for padding, for example when batching sequences of
//! different lengths.
std::string padToken;

//! The classifier token which is used when doing sequence classification
//! (classification of the whole sequence instead of per-token
//! classification). It is the first token of the sequence when built with
//! special tokens.
std::string clsToken;

//! The token used for masking values. This is the token used when training
//! this model with masked language modeling. This is the token which the
//! model will try to predict.
std::string maskToken;
}; // class BertTokenizer

} // namespace ann
} // namespace mlpack

// Include implementation.
#include "bert_tokenizer_impl.hpp"

#endif
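A minimal sketch of constructing the tokenizer declared above. The path "vocab.txt" is hypothetical; the assumption is a standard BERT vocabulary file with one WordPiece token per line, and the defaults already match the standard special tokens ([UNK], [SEP], [PAD], [CLS], [MASK]).

#include "models/bert/bert_tokenizer.hpp"

using namespace mlpack::ann;

// Hypothetical vocabulary path; lowerCase and basicTokenize mirror the
// uncased BERT setup.
BertTokenizer<> tokenizer("vocab.txt", true /* lowerCase */,
    true /* basicTokenize */);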