Train #13

Open · wants to merge 3 commits into master
39 changes: 37 additions & 2 deletions GPT2/config.py
@@ -3,7 +3,20 @@
Original Paper and repository here : https://github.com/openai/gpt-2
GPT2 Pytorch Model : https://github.com/huggingface/pytorch-pretrained-BERT
'''
class GPT2Config(object):


def get_config(model_name='117M'):
config = None

if model_name == '117M':
config = GPT2_117M_Config()
elif model_name == '345M':
config = GPT2_345M_Config()

return config


class GPT2_117M_Config(object):
def __init__(
self,
vocab_size_or_config_json_file=50257,
@@ -22,4 +35,26 @@ def __init__(
self.n_layer = n_layer
self.n_head = n_head
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range


class GPT2_345M_Config(object):
def __init__(
self,
vocab_size_or_config_json_file=50257,
n_positions=1024,
n_ctx=1024,
n_embd=1024,
n_layer=24,
n_head=16,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
):
self.vocab_size = vocab_size_or_config_json_file
self.n_ctx = n_ctx
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
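
For context, a minimal sketch of how the new helper might be used (the `GPT2.config` import path is assumed from the repository layout):

```python
# Minimal usage sketch for the config helper added in this PR.
from GPT2.config import get_config

config = get_config('345M')                           # GPT2_345M_Config; '117M' gives GPT2_117M_Config
print(config.n_embd, config.n_layer, config.n_head)   # 1024 24 16
assert get_config('1558M') is None                    # unrecognised names fall through to None
```
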
99 changes: 99 additions & 0 deletions GPT2/data.py
@@ -0,0 +1,99 @@
"""
Module to deal with loading text data and sampling from it.
"""
import glob
import numpy as np
import os
import tqdm


def load_dataset(enc, path, combine):
paths = []

# Simple file
if os.path.isfile(path):
paths.append(path)

# Directory
elif os.path.isdir(path):
for (dirpath, _, fnames) in os.walk(path):
for fname in fnames:
paths.append(os.path.join(dirpath, fname))

# Assume glob
else:
paths = glob.glob(path)

# filter paths
paths = [p for p in paths if '.DS_Store' not in p]

token_chunks = []
raw_text = ''
for path in tqdm.tqdm(paths):

if path.endswith('.npz'):

# Pre-encoded
with np.load(path) as npz:
for item in npz.files:
token_chunks.append(npz[item])
else:

# Plain text
with open(path, mode='r', encoding='utf-8') as fp:
raw_text += fp.read()

if len(raw_text) >= combine:
tokens = np.stack(enc.encode(raw_text))
token_chunks.append(tokens)
raw_text = ''
else:
raw_text += '<|endoftext|>'

if raw_text:
tokens = np.stack(enc.encode(raw_text))
token_chunks.append(tokens)

return token_chunks


def binary_search(f, lo, hi):
if f(lo) or not f(hi):
return None
while hi > lo + 1:
mid = (lo + hi) // 2
if f(mid):
hi = mid
else:
lo = mid
return hi


class Sampler(object):
"""Fairly samples a slice from a set of variable sized chunks.

'Fairly' means that the distribution is the same as sampling from one concatenated chunk,
but without crossing chunk boundaries."""

def __init__(self, chunks, seed=None):
self.chunks = chunks
self.total_size = sum(chunk.shape[0] for chunk in chunks)
self.boundaries = [0]
for i in range(len(chunks)):
self.boundaries.append(self.boundaries[-1] + chunks[i].shape[0])
self.rs = np.random.RandomState(seed=seed)

def sample(self, length):
assert length < self.total_size // len(self.chunks), "Dataset files are too small to sample {} tokens at a time".format(length)

while True:
index = self.rs.randint(0, self.total_size - length - 1)

# i = boundary that index is in
i = binary_search(lambda j: self.boundaries[j] > index, 0, len(self.boundaries) - 1) - 1

# sample length fits within the chunk at the starting index in that chunk
if self.boundaries[i + 1] > index + length:
# finding start of boundary from index
within_chunk = index - self.boundaries[i]
return self.chunks[i][within_chunk: within_chunk + length]
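
A short sketch of how `load_dataset` and `Sampler` might be used together, assuming the encoder from `GPT2/encoder.py`; the dataset path below is illustrative:

```python
# Sketch: load a corpus and draw fixed-length, fairly sampled training slices.
from GPT2.encoder import get_encoder
from GPT2.data import load_dataset, Sampler

enc = get_encoder()
# Plain-text files are accumulated (joined with <|endoftext|>) until at least
# `combine` characters are collected, then encoded as one token chunk.
chunks = load_dataset(enc, 'data/webtext_train_10', combine=50000)

sampler = Sampler(chunks, seed=42)
samples = [sampler.sample(1024) for _ in range(8)]  # eight contiguous 1024-token slices
```
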
19 changes: 9 additions & 10 deletions GPT2/encoder.py
@@ -1,10 +1,9 @@
"""Byte pair encoding utilities"""

import os
import json
import regex as re
from functools import lru_cache


@lru_cache()
def bytes_to_unicode():
"""
@@ -27,6 +26,7 @@ def bytes_to_unicode():
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))


def get_pairs(word):
"""Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
@@ -38,13 +38,14 @@ def get_pairs(word):
prev_char = char
return pairs


class Encoder:
def __init__(self, encoder, bpe_merges, errors='replace'):
self.encoder = encoder
self.decoder = {v:k for k,v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}

@@ -61,7 +62,7 @@ def bpe(self, token):
return token

while True:
bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
@@ -104,13 +105,11 @@ def decode(self, tokens):
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
return text


def get_encoder():
with open('./GPT2/encoder.json', 'r') as f:
encoder = json.load(f)
with open('./GPT2/vocab.bpe', 'r', encoding="utf-8") as f:
bpe_data = f.read()
bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
return Encoder(
encoder=encoder,
bpe_merges=bpe_merges,
)
return Encoder(encoder=encoder, bpe_merges=bpe_merges)
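
The encoder changes above are purely cosmetic (PEP 8 spacing and a condensed `return`); behaviour is unchanged. For reference, a small round-trip check, run from the repository root because `get_encoder()` hard-codes `./GPT2/encoder.json` and `./GPT2/vocab.bpe`:

```python
# BPE encoder round trip: text -> token ids -> text.
from GPT2.encoder import get_encoder

enc = get_encoder()
ids = enc.encode("Hello world")          # list of BPE token ids
assert enc.decode(ids) == "Hello world"  # decoding restores the original text
```
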
13 changes: 11 additions & 2 deletions GPT2/model.py
@@ -9,9 +9,11 @@
import torch.nn as nn
from torch.nn.parameter import Parameter


def gelu(x):
return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


class LayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-12):
"""Construct a layernorm module in the TF style (epsilon inside the square root).
@@ -27,6 +29,7 @@ def forward(self, x):
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias


class Conv1D(nn.Module):
def __init__(self, nf, nx):
super(Conv1D, self).__init__()
@@ -42,6 +45,7 @@ def forward(self, x):
x = x.view(*size_out)
return x


class Attention(nn.Module):
def __init__(self, nx, n_ctx, config, scale=False):
super(Attention, self).__init__()
@@ -94,6 +98,7 @@ def forward(self, x, layer_past=None):
a = self.c_proj(a)
return a, present


class MLP(nn.Module):
def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd)
super(MLP, self).__init__()
@@ -107,6 +112,7 @@ def forward(self, x):
h2 = self.c_proj(h)
return h2


class Block(nn.Module):
def __init__(self, n_ctx, config, scale=False):
super(Block, self).__init__()
@@ -123,6 +129,7 @@ def forward(self, x, layer_past=None):
x = x + m
return x, present


class GPT2Model(nn.Module):
def __init__(self, config):
super(GPT2Model, self).__init__()
@@ -172,6 +179,7 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None):
output_shape = input_shape + (hidden_states.size(-1),)
return hidden_states.view(*output_shape), presents


class GPT2LMHead(nn.Module):
def __init__(self, model_embeddings_weights, config):
super(GPT2LMHead, self).__init__()
@@ -189,6 +197,7 @@ def forward(self, hidden_state):
lm_logits = self.decoder(hidden_state)
return lm_logits


class GPT2LMHeadModel(nn.Module):
def __init__(self, config):
super(GPT2LMHeadModel, self).__init__()
@@ -205,6 +214,6 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=N
lm_logits = self.lm_head(hidden_states)
if lm_labels is not None:
loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.contiguous().view(-1))
return loss
return lm_logits, presents
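
Besides the added blank lines, the one functional change in `model.py` is the `.contiguous()` call on `lm_labels`. The snippet below illustrates why it is needed when labels are built by slicing, e.g. shifting the inputs by one position (the actual labelling scheme used by the training script is not shown in this diff, so treat the shift as an assumption):

```python
# Labels produced by slicing along the sequence dimension are non-contiguous views,
# and .view(-1) refuses to flatten them; .contiguous() copies them first.
import torch

input_ids = torch.randint(0, 50257, (2, 128))
lm_labels = input_ids[:, 1:]                 # non-contiguous view
# lm_labels.view(-1)                         # RuntimeError: view size is not compatible ...
flat = lm_labels.contiguous().view(-1)       # works: copy, then flatten
print(flat.shape)                            # torch.Size([254])
```
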
14 changes: 12 additions & 2 deletions GPT2/sample.py
@@ -7,32 +7,42 @@
import torch.nn.functional as F
from tqdm import trange


def top_k_logits(logits, k):

if k == 0:
return logits

values, _ = torch.topk(logits, k)
min_values = values[:, -1]
return torch.where(logits < min_values, torch.ones_like(logits, dtype=logits.dtype) * -1e10, logits)

def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, device='cuda', sample=True):

def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0,
device='cuda', sample=True):

if start_token is None:
assert context is not None, 'Specify exactly one of start_token and context!'
context = torch.tensor(context, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1)
else:
assert context is None, 'Specify exactly one of start_token and context!'
context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long)

prev = context
output = context
past = None

with torch.no_grad():
for i in trange(length):
logits, past = model(prev, past=past)
logits = logits[:, -1, :] / temperature
logits = top_k_logits(logits, k=top_k)
log_probs = F.softmax(logits, dim=-1)

if sample:
prev = torch.multinomial(log_probs, num_samples=1)
else:
_, prev = torch.topk(log_probs, k=1, dim=-1)

output = torch.cat((output, prev), dim=1)
return output
return output
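
A sketch of how `sample_sequence` might be called end to end, reusing the other modules touched by this PR; the weights path matches the new README instructions, and everything runs on CPU here for illustration:

```python
# Sketch: generate text with top-k sampling (117M weights, CPU).
import torch
from GPT2.config import get_config
from GPT2.encoder import get_encoder
from GPT2.model import GPT2LMHeadModel
from GPT2.sample import sample_sequence
from GPT2.utils import load_weight

enc = get_encoder()
model = GPT2LMHeadModel(get_config('117M'))
state_dict = torch.load('pretrained_models/117M/model.bin', map_location='cpu')
model = load_weight(model, state_dict)
model.eval()

context = enc.encode("It was a bright cold day in April")
out = sample_sequence(model, length=40, context=context, batch_size=1,
                      temperature=0.7, top_k=40, device='cpu')
print(enc.decode(out[0].tolist()))
```
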
3 changes: 2 additions & 1 deletion GPT2/utils.py
@@ -7,6 +7,7 @@

logger = logging.getLogger(__name__)


def load_weight(model, state_dict):
old_keys = []
new_keys = []
@@ -49,4 +50,4 @@ def load(module, prefix=""):

# Make sure we are still sharing the output and input embeddings after loading weights
model.set_tied()
return model
return model
12 changes: 10 additions & 2 deletions README.md
@@ -24,7 +24,8 @@ This repository is simple implementation GPT-2 about **text-generator** in **Pytorch**
```shell
$ git clone https://github.com/graykode/gpt-2-Pytorch && cd gpt-2-Pytorch
# download huggingface's pytorch model
$ curl --output gpt2-pytorch_model.bin https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin
$ mkdir -p pretrained_models/117M
$ curl --output pretrained_models/117M/model.bin https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin
# setup requirements
$ pip install -r requirements.txt
```
@@ -54,12 +55,19 @@ $ python main.py --text "It was a bright cold day in April, and the clocks were

See more detail option about `temperature` and `top_k` in [here](https://github.com/openai/gpt-2#gpt-2-samples)

## Training

You can also fine-tune a pre-trained GPT-2 model on your own custom dataset:

```shell
$ python train.py --dataset data/webtext_train_10 --model_name 117M --sample_every 100 --save_every 1000
```
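
`train.py` itself is not part of the hunks shown above, so the following is only a rough sketch of how the pieces added in this PR (`get_config`, `load_dataset`, `Sampler`, and the loss returned by `GPT2LMHeadModel`) could be wired together; the dataset path, batch size, and learning rate are illustrative assumptions, not the actual script:

```python
# Illustrative training-loop sketch -- not the actual train.py from this PR.
import torch
from GPT2.config import get_config
from GPT2.encoder import get_encoder
from GPT2.model import GPT2LMHeadModel
from GPT2.data import load_dataset, Sampler

enc = get_encoder()
model = GPT2LMHeadModel(get_config('117M'))
chunks = load_dataset(enc, 'data/webtext_train_10', combine=50000)
sampler = Sampler(chunks)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

model.train()
for step in range(1000):
    batch = torch.stack([torch.from_numpy(sampler.sample(513)) for _ in range(4)]).long()
    inputs, labels = batch[:, :-1], batch[:, 1:]   # predict the next token
    loss = model(inputs, lm_labels=labels)         # forward returns the LM loss when labels are given
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```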

## Dependencies

- Pytorch 0.41+
- Pytorch 1.0.0
- regex 2017.4.5
- tqdm 4.31.1


