Train #13

Open · wants to merge 3 commits into master
39 changes: 37 additions & 2 deletions GPT2/config.py
@@ -3,7 +3,20 @@
Original Paper and repository here : https://github.com/openai/gpt-2
GPT2 Pytorch Model : https://github.com/huggingface/pytorch-pretrained-BERT
'''
class GPT2Config(object):


def get_config(model_name='117M'):
config = None

if model_name == '117M':
config = GPT2_117M_Config()
elif model_name == '345M':
config = GPT2_345M_Config()

return config


class GPT2_117M_Config(object):
def __init__(
self,
vocab_size_or_config_json_file=50257,
@@ -22,4 +35,26 @@ def __init__(
self.n_layer = n_layer
self.n_head = n_head
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range


class GPT2_345M_Config(object):
def __init__(
self,
vocab_size_or_config_json_file=50257,
n_positions=1024,
n_ctx=1024,
n_embd=1024,
n_layer=24,
n_head=16,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
):
self.vocab_size = vocab_size_or_config_json_file
self.n_ctx = n_ctx
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
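
For context, a minimal sketch of how the new helper might be used (the `GPT2.config` import path is assumed from the repository layout):

```python
# Minimal usage sketch for the config helper added in this PR.
from GPT2.config import get_config

config = get_config('345M')                           # GPT2_345M_Config; '117M' gives GPT2_117M_Config
print(config.n_embd, config.n_layer, config.n_head)   # 1024 24 16
assert get_config('1558M') is None                    # unrecognised names fall through to None
```
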
99 changes: 99 additions & 0 deletions GPT2/data.py
@@ -0,0 +1,99 @@
"""
Module to deal with loading text data and sampling from it.
"""
import glob
import numpy as np
import os
import tqdm


def load_dataset(enc, path, combine):
paths = []

# Simple file
if os.path.isfile(path):
paths.append(path)

# Directory
elif os.path.isdir(path):
for (dirpath, _, fnames) in os.walk(path):
for fname in fnames:
paths.append(os.path.join(dirpath, fname))

# Assume glob
else:
paths = glob.glob(path)

# filter paths
paths = [p for p in paths if '.DS_Store' not in p]

token_chunks = []
raw_text = ''
for path in tqdm.tqdm(paths):

if path.endswith('.npz'):

# Pre-encoded
with np.load(path) as npz:
for item in npz.files:
token_chunks.append(npz[item])
else:

# Plain text
with open(path, mode='r', encoding='utf-8') as fp:
raw_text += fp.read()

if len(raw_text) >= combine:
tokens = np.stack(enc.encode(raw_text))
token_chunks.append(tokens)
raw_text = ''
else:
raw_text += '<|endoftext|>'

if raw_text:
tokens = np.stack(enc.encode(raw_text))
token_chunks.append(tokens)

return token_chunks


def binary_search(f, lo, hi):
if f(lo) or not f(hi):
return None
while hi > lo + 1:
mid = (lo + hi) // 2
if f(mid):
hi = mid
else:
lo = mid
return hi


class Sampler(object):
"""Fairly samples a slice from a set of variable sized chunks.

'Fairly' means that the distribution is the same as sampling from one concatenated chunk,
but without crossing chunk boundaries."""

def __init__(self, chunks, seed=None):
self.chunks = chunks
self.total_size = sum(chunk.shape[0] for chunk in chunks)
self.boundaries = [0]
for i in range(len(chunks)):
self.boundaries.append(self.boundaries[-1] + chunks[i].shape[0])
self.rs = np.random.RandomState(seed=seed)

def sample(self, length):
assert length < self.total_size // len(self.chunks), "Dataset files are too small to sample {} tokens at a time".format(length)

while True:
index = self.rs.randint(0, self.total_size - length - 1)

# i = boundary that index is in
i = binary_search(lambda j: self.boundaries[j] > index, 0, len(self.boundaries) - 1) - 1

# sample length fits within the chunk at the starting index in that chunk
if self.boundaries[i + 1] > index + length:
# finding start of boundary from index
within_chunk = index - self.boundaries[i]
return self.chunks[i][within_chunk: within_chunk + length]
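
A short sketch of how `load_dataset` and `Sampler` might be used together, assuming the encoder from `GPT2/encoder.py`; the dataset path below is illustrative:

```python
# Sketch: load a corpus and draw fixed-length, fairly sampled training slices.
from GPT2.encoder import get_encoder
from GPT2.data import load_dataset, Sampler

enc = get_encoder()
# Plain-text files are accumulated (joined with <|endoftext|>) until at least
# `combine` characters are collected, then encoded as one token chunk.
chunks = load_dataset(enc, 'data/webtext_train_10', combine=50000)

sampler = Sampler(chunks, seed=42)
samples = [sampler.sample(1024) for _ in range(8)]  # eight contiguous 1024-token slices
```
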
19 changes: 9 additions & 10 deletions GPT2/encoder.py
@@ -1,10 +1,9 @@
"""Byte pair encoding utilities"""

import os
import json
import regex as re
from functools import lru_cache


@lru_cache()
def bytes_to_unicode():
"""
@@ -27,6 +26,7 @@ def bytes_to_unicode():
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))


def get_pairs(word):
"""Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
@@ -38,13 +38,14 @@ def get_pairs(word):
prev_char = char
return pairs


class Encoder:
def __init__(self, encoder, bpe_merges, errors='replace'):
self.encoder = encoder
self.decoder = {v:k for k,v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}

@@ -61,7 +62,7 @@ def bpe(self, token):
return token

while True:
bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
@@ -104,13 +105,11 @@ def decode(self, tokens):
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
return text


def get_encoder():
with open('./GPT2/encoder.json', 'r') as f:
encoder = json.load(f)
with open('./GPT2/vocab.bpe', 'r', encoding="utf-8") as f:
bpe_data = f.read()
bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
return Encoder(
encoder=encoder,
bpe_merges=bpe_merges,
)
return Encoder(encoder=encoder, bpe_merges=bpe_merges)
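
The encoder changes above are purely cosmetic (PEP 8 spacing and a condensed `return`); behaviour is unchanged. For reference, a small round-trip check, run from the repository root because `get_encoder()` hard-codes `./GPT2/encoder.json` and `./GPT2/vocab.bpe`:

```python
# BPE encoder round trip: text -> token ids -> text.
from GPT2.encoder import get_encoder

enc = get_encoder()
ids = enc.encode("Hello world")          # list of BPE token ids
assert enc.decode(ids) == "Hello world"  # decoding restores the original text
```
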
13 changes: 11 additions & 2 deletions GPT2/model.py
@@ -9,9 +9,11 @@
import torch.nn as nn
from torch.nn.parameter import Parameter


def gelu(x):
return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


class LayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-12):
"""Construct a layernorm module in the TF style (epsilon inside the square root).
@@ -27,6 +29,7 @@ def forward(self, x):
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias


class Conv1D(nn.Module):
def __init__(self, nf, nx):
super(Conv1D, self).__init__()
@@ -42,6 +45,7 @@ def forward(self, x):
x = x.view(*size_out)
return x


class Attention(nn.Module):
def __init__(self, nx, n_ctx, config, scale=False):
super(Attention, self).__init__()
@@ -94,6 +98,7 @@ def forward(self, x, layer_past=None):
a = self.c_proj(a)
return a, present


class MLP(nn.Module):
def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd)
super(MLP, self).__init__()
@@ -107,6 +112,7 @@ def forward(self, x):
h2 = self.c_proj(h)
return h2


class Block(nn.Module):
def __init__(self, n_ctx, config, scale=False):
super(Block, self).__init__()
@@ -123,6 +129,7 @@ def forward(self, x, layer_past=None):
x = x + m
return x, present


class GPT2Model(nn.Module):
def __init__(self, config):
super(GPT2Model, self).__init__()
@@ -172,6 +179,7 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None):
output_shape = input_shape + (hidden_states.size(-1),)
return hidden_states.view(*output_shape), presents


class GPT2LMHead(nn.Module):
def __init__(self, model_embeddings_weights, config):
super(GPT2LMHead, self).__init__()
@@ -189,6 +197,7 @@ def forward(self, hidden_state):
lm_logits = self.decoder(hidden_state)
return lm_logits


class GPT2LMHeadModel(nn.Module):
def __init__(self, config):
super(GPT2LMHeadModel, self).__init__()
@@ -205,6 +214,6 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=N
lm_logits = self.lm_head(hidden_states)
if lm_labels is not None:
loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.contiguous().view(-1))
return loss
return lm_logits, presents
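
Besides the added blank lines, the one functional change in `model.py` is the `.contiguous()` call on `lm_labels`. The snippet below illustrates why it is needed when labels are built by slicing, e.g. shifting the inputs by one position (the actual labelling scheme used by the training script is not shown in this diff, so treat the shift as an assumption):

```python
# Labels produced by slicing along the sequence dimension are non-contiguous views,
# and .view(-1) refuses to flatten them; .contiguous() copies them first.
import torch

input_ids = torch.randint(0, 50257, (2, 128))
lm_labels = input_ids[:, 1:]                 # non-contiguous view
# lm_labels.view(-1)                         # RuntimeError: view size is not compatible ...
flat = lm_labels.contiguous().view(-1)       # works: copy, then flatten
print(flat.shape)                            # torch.Size([254])
```
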
14 changes: 12 additions & 2 deletions GPT2/sample.py
@@ -7,32 +7,42 @@
import torch.nn.functional as F
from tqdm import trange


def top_k_logits(logits, k):

if k == 0:
return logits

values, _ = torch.topk(logits, k)
min_values = values[:, -1]
return torch.where(logits < min_values, torch.ones_like(logits, dtype=logits.dtype) * -1e10, logits)

def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, device='cuda', sample=True):

def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0,
device='cuda', sample=True):

if start_token is None:
assert context is not None, 'Specify exactly one of start_token and context!'
context = torch.tensor(context, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1)
else:
assert context is None, 'Specify exactly one of start_token and context!'
context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long)

prev = context
output = context
past = None

with torch.no_grad():
for i in trange(length):
logits, past = model(prev, past=past)
logits = logits[:, -1, :] / temperature
logits = top_k_logits(logits, k=top_k)
log_probs = F.softmax(logits, dim=-1)

if sample:
prev = torch.multinomial(log_probs, num_samples=1)
else:
_, prev = torch.topk(log_probs, k=1, dim=-1)

output = torch.cat((output, prev), dim=1)
return output
return output
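
A sketch of how `sample_sequence` might be called end to end, reusing the other modules touched by this PR; the weights path matches the new README instructions, and everything runs on CPU here for illustration:

```python
# Sketch: generate text with top-k sampling (117M weights, CPU).
import torch
from GPT2.config import get_config
from GPT2.encoder import get_encoder
from GPT2.model import GPT2LMHeadModel
from GPT2.sample import sample_sequence
from GPT2.utils import load_weight

enc = get_encoder()
model = GPT2LMHeadModel(get_config('117M'))
state_dict = torch.load('pretrained_models/117M/model.bin', map_location='cpu')
model = load_weight(model, state_dict)
model.eval()

context = enc.encode("It was a bright cold day in April")
out = sample_sequence(model, length=40, context=context, batch_size=1,
                      temperature=0.7, top_k=40, device='cpu')
print(enc.decode(out[0].tolist()))
```
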
3 changes: 2 additions & 1 deletion GPT2/utils.py
@@ -7,6 +7,7 @@

logger = logging.getLogger(__name__)


def load_weight(model, state_dict):
old_keys = []
new_keys = []
@@ -49,4 +50,4 @@ def load(module, prefix=""):

# Make sure we are still sharing the output and input embeddings after loading weights
model.set_tied()
return model
return model
12 changes: 10 additions & 2 deletions README.md
@@ -24,7 +24,8 @@ This repository is simple implementation GPT-2 about **text-generator** in **Pytorch**
```shell
$ git clone https://github.com/graykode/gpt-2-Pytorch && cd gpt-2-Pytorch
# download huggingface's pytorch model
$ curl --output gpt2-pytorch_model.bin https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin
$ mkdir -p pretrained_models/117M
$ curl --output pretrained_models/117M/model.bin https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin
# setup requirements
$ pip install -r requirements.txt
```
@@ -54,12 +55,19 @@ $ python main.py --text "It was a bright cold day in April, and the clocks were

See more detail option about `temperature` and `top_k` in [here](https://github.com/openai/gpt-2#gpt-2-samples)

## Training

You can also fine-tune a pre-trained GPT-2 model on your own custom dataset:

```shell
$ python train.py --dataset data/webtext_train_10 --model_name 117M --sample_every 100 --save_every 1000
```
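
`train.py` itself is not part of the hunks shown above, so the following is only a rough sketch of how the pieces added in this PR (`get_config`, `load_dataset`, `Sampler`, and the loss returned by `GPT2LMHeadModel`) could be wired together; the dataset path, batch size, and learning rate are illustrative assumptions, not the actual script:

```python
# Illustrative training-loop sketch -- not the actual train.py from this PR.
import torch
from GPT2.config import get_config
from GPT2.encoder import get_encoder
from GPT2.model import GPT2LMHeadModel
from GPT2.data import load_dataset, Sampler

enc = get_encoder()
model = GPT2LMHeadModel(get_config('117M'))
chunks = load_dataset(enc, 'data/webtext_train_10', combine=50000)
sampler = Sampler(chunks)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

model.train()
for step in range(1000):
    batch = torch.stack([torch.from_numpy(sampler.sample(513)) for _ in range(4)]).long()
    inputs, labels = batch[:, :-1], batch[:, 1:]   # predict the next token
    loss = model(inputs, lm_labels=labels)         # forward returns the LM loss when labels are given
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```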

## Dependencies

- Pytorch 0.41+
- Pytorch 1.0.0
- regex 2017.4.5
- tqdm 4.31.1


