From 4f1e29944802b4491e70f9a87694167ce98bc058 Mon Sep 17 00:00:00 2001 From: jbrophy Date: Sat, 18 May 2019 18:06:23 -0700 Subject: [PATCH 1/2] added finetuning --- GPT2/config.py | 39 ++++- GPT2/data.py | 99 ++++++++++++ GPT2/encoder.py | 19 ++- GPT2/model.py | 13 +- GPT2/sample.py | 14 +- GPT2/utils.py | 3 +- README.md | 12 +- data/webtext_train_10/0.txt | 23 +++ data/webtext_train_10/1.txt | 5 + data/webtext_train_10/2.txt | 7 + data/webtext_train_10/3.txt | 43 +++++ data/webtext_train_10/4.txt | 27 ++++ data/webtext_train_10/5.txt | 7 + data/webtext_train_10/6.txt | 25 +++ data/webtext_train_10/7.txt | 19 +++ data/webtext_train_10/8.txt | 17 ++ data/webtext_train_10/9.txt | 45 ++++++ main.py | 18 ++- pretrained_models/117M/README.md | 2 + requirements.txt | 4 +- train.py | 264 +++++++++++++++++++++++++++++++ 21 files changed, 678 insertions(+), 27 deletions(-) create mode 100644 GPT2/data.py create mode 100644 data/webtext_train_10/0.txt create mode 100644 data/webtext_train_10/1.txt create mode 100644 data/webtext_train_10/2.txt create mode 100644 data/webtext_train_10/3.txt create mode 100644 data/webtext_train_10/4.txt create mode 100644 data/webtext_train_10/5.txt create mode 100644 data/webtext_train_10/6.txt create mode 100644 data/webtext_train_10/7.txt create mode 100644 data/webtext_train_10/8.txt create mode 100644 data/webtext_train_10/9.txt create mode 100644 pretrained_models/117M/README.md create mode 100644 train.py diff --git a/GPT2/config.py b/GPT2/config.py index b5efe35..c7fb51f 100644 --- a/GPT2/config.py +++ b/GPT2/config.py @@ -3,7 +3,20 @@ Original Paper and repository here : https://github.com/openai/gpt-2 GPT2 Pytorch Model : https://github.com/huggingface/pytorch-pretrained-BERT ''' -class GPT2Config(object): + + +def get_config(model_name='117M'): + config = None + + if model_name == '117M': + config = GPT2_117M_Config() + elif model_name == '345M': + config = GPT2_345M_Config() + + return config + + +class GPT2_117M_Config(object): def __init__( self, vocab_size_or_config_json_file=50257, @@ -22,4 +35,26 @@ def __init__( self.n_layer = n_layer self.n_head = n_head self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range \ No newline at end of file + self.initializer_range = initializer_range + + +class GPT2_345M_Config(object): + def __init__( + self, + vocab_size_or_config_json_file=50257, + n_positions=1024, + n_ctx=1024, + n_embd=1024, + n_layer=24, + n_head=16, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + ): + self.vocab_size = vocab_size_or_config_json_file + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range diff --git a/GPT2/data.py b/GPT2/data.py new file mode 100644 index 0000000..b02cbc2 --- /dev/null +++ b/GPT2/data.py @@ -0,0 +1,99 @@ +""" +Module to deal with loading text data and sampling from it. 
+""" +import glob +import numpy as np +import os +import tqdm + + +def load_dataset(enc, path, combine): + paths = [] + + # Simple file + if os.path.isfile(path): + paths.append(path) + + # Directory + elif os.path.isdir(path): + for (dirpath, _, fnames) in os.walk(path): + for fname in fnames: + paths.append(os.path.join(dirpath, fname)) + + # Assume glob + else: + paths = glob.glob(path) + + # filter paths + paths = [p for p in paths if '.DS_Store' not in p] + + token_chunks = [] + raw_text = '' + for path in tqdm.tqdm(paths): + + if path.endswith('.npz'): + + # Pre-encoded + with np.load(path) as npz: + for item in npz.files: + token_chunks.append(npz[item]) + else: + + # Plain text + with open(path, 'r') as fp: + raw_text += fp.read() + + if len(raw_text) >= combine: + tokens = np.stack(enc.encode(raw_text)) + token_chunks.append(tokens) + raw_text = '' + else: + raw_text += '<|endoftext|>' + + if raw_text: + tokens = np.stack(enc.encode(raw_text)) + token_chunks.append(tokens) + + return token_chunks + + +def binary_search(f, lo, hi): + if f(lo) or not f(hi): + return None + while hi > lo + 1: + mid = (lo + hi) // 2 + if f(mid): + hi = mid + else: + lo = mid + return hi + + +class Sampler(object): + """Fairly samples a slice from a set of variable sized chunks. + + 'Fairly' means that the distribution is the same as sampling from one concatenated chunk, + but without crossing chunk boundaries.""" + + def __init__(self, chunks, seed=None): + self.chunks = chunks + self.total_size = sum(chunk.shape[0] for chunk in chunks) + self.boundaries = [0] + for i in range(len(chunks)): + self.boundaries.append(self.boundaries[-1] + chunks[i].shape[0]) + self.rs = np.random.RandomState(seed=seed) + + def sample(self, length): + assert length < self.total_size // len(self.chunks), "Dataset files are too small to sample {} tokens at a time".format(length) + + while True: + index = self.rs.randint(0, self.total_size - length - 1) + + # i = boundary that index is in + i = binary_search(lambda j: self.boundaries[j] > index, 0, len(self.boundaries) - 1) - 1 + + # sample length fits within the chunk at the starting index in that chunk + if self.boundaries[i + 1] > index + length: + # finding start of boundary from index + within_chunk = index - self.boundaries[i] + return self.chunks[i][within_chunk: within_chunk + length] diff --git a/GPT2/encoder.py b/GPT2/encoder.py index f6508e8..2ca5326 100644 --- a/GPT2/encoder.py +++ b/GPT2/encoder.py @@ -1,10 +1,9 @@ """Byte pair encoding utilities""" - -import os import json import regex as re from functools import lru_cache + @lru_cache() def bytes_to_unicode(): """ @@ -27,6 +26,7 @@ def bytes_to_unicode(): cs = [chr(n) for n in cs] return dict(zip(bs, cs)) + def get_pairs(word): """Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length strings). 
@@ -38,13 +38,14 @@ def get_pairs(word): prev_char = char return pairs + class Encoder: def __init__(self, encoder, bpe_merges, errors='replace'): self.encoder = encoder - self.decoder = {v:k for k,v in self.encoder.items()} - self.errors = errors # how to handle errors in decoding + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} @@ -61,7 +62,7 @@ def bpe(self, token): return token while True: - bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) if bigram not in self.bpe_ranks: break first, second = bigram @@ -104,13 +105,11 @@ def decode(self, tokens): text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) return text + def get_encoder(): with open('./GPT2/encoder.json', 'r') as f: encoder = json.load(f) with open('./GPT2/vocab.bpe', 'r', encoding="utf-8") as f: bpe_data = f.read() bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]] - return Encoder( - encoder=encoder, - bpe_merges=bpe_merges, - ) \ No newline at end of file + return Encoder(encoder=encoder, bpe_merges=bpe_merges) diff --git a/GPT2/model.py b/GPT2/model.py index f18113e..98e1fac 100644 --- a/GPT2/model.py +++ b/GPT2/model.py @@ -9,9 +9,11 @@ import torch.nn as nn from torch.nn.parameter import Parameter + def gelu(x): return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + class LayerNorm(nn.Module): def __init__(self, hidden_size, eps=1e-12): """Construct a layernorm module in the TF style (epsilon inside the square root). 
@@ -27,6 +29,7 @@ def forward(self, x): x = (x - u) / torch.sqrt(s + self.variance_epsilon) return self.weight * x + self.bias + class Conv1D(nn.Module): def __init__(self, nf, nx): super(Conv1D, self).__init__() @@ -42,6 +45,7 @@ def forward(self, x): x = x.view(*size_out) return x + class Attention(nn.Module): def __init__(self, nx, n_ctx, config, scale=False): super(Attention, self).__init__() @@ -94,6 +98,7 @@ def forward(self, x, layer_past=None): a = self.c_proj(a) return a, present + class MLP(nn.Module): def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd) super(MLP, self).__init__() @@ -107,6 +112,7 @@ def forward(self, x): h2 = self.c_proj(h) return h2 + class Block(nn.Module): def __init__(self, n_ctx, config, scale=False): super(Block, self).__init__() @@ -123,6 +129,7 @@ def forward(self, x, layer_past=None): x = x + m return x, present + class GPT2Model(nn.Module): def __init__(self, config): super(GPT2Model, self).__init__() @@ -172,6 +179,7 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None): output_shape = input_shape + (hidden_states.size(-1),) return hidden_states.view(*output_shape), presents + class GPT2LMHead(nn.Module): def __init__(self, model_embeddings_weights, config): super(GPT2LMHead, self).__init__() @@ -189,6 +197,7 @@ def forward(self, hidden_state): lm_logits = self.decoder(hidden_state) return lm_logits + class GPT2LMHeadModel(nn.Module): def __init__(self, config): super(GPT2LMHeadModel, self).__init__() @@ -205,6 +214,6 @@ def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=N lm_logits = self.lm_head(hidden_states) if lm_labels is not None: loss_fct = nn.CrossEntropyLoss(ignore_index=-1) - loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)) + loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.contiguous().view(-1)) return loss - return lm_logits, presents \ No newline at end of file + return lm_logits, presents diff --git a/GPT2/sample.py b/GPT2/sample.py index 1065cce..252d612 100644 --- a/GPT2/sample.py +++ b/GPT2/sample.py @@ -7,32 +7,42 @@ import torch.nn.functional as F from tqdm import trange + def top_k_logits(logits, k): + if k == 0: return logits + values, _ = torch.topk(logits, k) min_values = values[:, -1] return torch.where(logits < min_values, torch.ones_like(logits, dtype=logits.dtype) * -1e10, logits) -def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, device='cuda', sample=True): + +def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, + device='cuda', sample=True): + if start_token is None: assert context is not None, 'Specify exactly one of start_token and context!' context = torch.tensor(context, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1) else: assert context is None, 'Specify exactly one of start_token and context!' 
context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long) + prev = context output = context past = None + with torch.no_grad(): for i in trange(length): logits, past = model(prev, past=past) logits = logits[:, -1, :] / temperature logits = top_k_logits(logits, k=top_k) log_probs = F.softmax(logits, dim=-1) + if sample: prev = torch.multinomial(log_probs, num_samples=1) else: _, prev = torch.topk(log_probs, k=1, dim=-1) + output = torch.cat((output, prev), dim=1) - return output \ No newline at end of file + return output diff --git a/GPT2/utils.py b/GPT2/utils.py index 115a397..741ddc1 100644 --- a/GPT2/utils.py +++ b/GPT2/utils.py @@ -7,6 +7,7 @@ logger = logging.getLogger(__name__) + def load_weight(model, state_dict): old_keys = [] new_keys = [] @@ -49,4 +50,4 @@ def load(module, prefix=""): # Make sure we are still sharing the output and input embeddings after loading weights model.set_tied() - return model \ No newline at end of file + return model diff --git a/README.md b/README.md index 0db8581..df1f924 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,8 @@ This repository is simple implementation GPT-2 about **text-generator** in **Pyt ```shell $ git clone https://github.com/graykode/gpt-2-Pytorch && cd gpt-2-Pytorch # download huggingface's pytorch model -$ curl --output gpt2-pytorch_model.bin https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin +$ mkdir -p pretrained_models/117M +$ curl --output pretrained_models/117M/model.bin https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin # setup requirements $ pip install -r requirements.txt ``` @@ -54,12 +55,19 @@ $ python main.py --text "It was a bright cold day in April, and the clocks were See more detail option about `temperature` and `top_k` in [here](https://github.com/openai/gpt-2#gpt-2-samples) +## Training +You can also fine-tune a pre-trained gpt-2 model on your own custom dataset: + +``` +$ python train.py --dataset data/webtext_train_10 --model_name 117M --sample_every 100 --save_every 1000 +``` ## Dependencies -- Pytorch 0.41+ +- Pytorch 1.0.0 - regex 2017.4.5 +- tqdm 4.31.1 diff --git a/data/webtext_train_10/0.txt b/data/webtext_train_10/0.txt new file mode 100644 index 0000000..17a55d6 --- /dev/null +++ b/data/webtext_train_10/0.txt @@ -0,0 +1,23 @@ +These girlfriends deserves a special mention for going that extra mile, hopefully doesn't set too many guys off on the path towards outrageous demands. + +1. She knows the severity of man-flu + +2. All fun and games is all good + +3. A voucher that says 'I love you' + +4. When arguments don't drag on forever. + +5. Providing everything he needs. + +6. Very understanding + +7. As awesome a gesture as this is, we are worried about this man's cooking skills. + +8. Nice cake + +8. Fair bargaining + +9. Excellent gift choice + +10. Very thoughtful \ No newline at end of file diff --git a/data/webtext_train_10/1.txt b/data/webtext_train_10/1.txt new file mode 100644 index 0000000..404bc15 --- /dev/null +++ b/data/webtext_train_10/1.txt @@ -0,0 +1,5 @@ +LeSean McCoy going through warmups with first team offense. To my eye, does not look close to 100 percent when cutting and exploding. + +ABOUT COOKIES + +To help make this website better, to improve and personalize your experience and for advertising purposes, are you happy to accept cookies and other technologies? 
\ No newline at end of file diff --git a/data/webtext_train_10/2.txt b/data/webtext_train_10/2.txt new file mode 100644 index 0000000..78f5062 --- /dev/null +++ b/data/webtext_train_10/2.txt @@ -0,0 +1,7 @@ +Tom Curran has been called up to England's Ashes squad. The 22-year-old Surrey all-rounder will fly out to Australia in the next 24 hours as a replacement for Steven Finn, after the Middlesex fast bowler was ruled out of the rest of the tour with a torn left knee cartilage. + +Curran is yet to play a Test match for England. However, he broke into the white-ball side during the 2017 season, making his debut in Twenty20 and one-day international cricket, and impressed the England management with his attitude. + +Coach Trevor Bayliss is known to be a fan, and so Curran has been preferred to the likes of Liam Plunkett, Tom Helm, George Garton and Mark Wood as England seek to bolster their ailing pace-bowling reserves ahead of the toughest series of them all. + +Shape Created with Sketch. England Ashes squad Show all 17 left Created with Sketch. right Created with Sketch. Shape Created with Sketch. England Ashes squad 1/17 Captain: Joe Root England's Mr Dependable will lead his side into an Ashes series for the first time, and while he has the experience of the series wins in 2013 and 2015, he also has the scars of the last trip Down Under. Getty 2/17 Batsman: Alastair Cook The former captain will be crucial to England's hopes, with the Essex opener needing to find the same resilient form that he displayed in Australia in the 2010/11 series. Getty 3/17 Batsman: Mark Stoneman Cook's likely opening partner will be Mark Stoneman after selectors decided to stick with him despite a nervous series against the West Indies. Getty 4/17 Batsman/spinner: Dawid Malan Malan showed glimpses of promise this summer and can also offer an option with the ball, but he is untested on the hard pitches of Australia and could be found out. Getty 5/17 Batsman: Gary Ballance Ballance is handed yet another chance to salvage his England career as the selectors hope he will eventually come good for their unyielding faith. Getty 6/17 Batsman: James Vince Vince is the surprise inclusion in the squad, having done little of note in county cricket since being dropped in 2016. Getty 7/17 Batsman/spinner: Moeen Ali Moeen Ali could easily go on to be man of the series given his ability to deliver fireworks with bat and ball. He may disagree, but he is undoubtedly England's front line spinner. Getty 8/17 Batsman/spinner: Mason Crane Crane is yet to make his full debut, though took a wonderful catch against the West Indies as a substitute fielder and will head to Australia as a back-up leg-break spiner bowler. Getty 9/17 Wicketkeeper: Ben Foakes Foakes will head to Australia as a deputy for first-choice wicketkeeper Jonny Bairstow. Getty 10/17 Wicketkeeper: Jonny Bairstow Another man who will need to produce runs to give England a chance of victory, with his ability in the mid-order giving the tourists a bite throughout the line-up. Getty 11/17 All-rounder: Ben Stokes Stokes is named in the side despite falling under a huge cloud after his arrest on a late night out in Bristol. His future as vice-captain looks very much in doubt. Getty 12/17 All-rounder: Chris Woakes Woakes will provide rest for the front-line bowlers and will also prove handy with the bat. 
Getty 13/17 Bowler: Stuart Broad Broad has long set his sights on this Ashes tour as he hopes to make up for the 2013/14 humiliation, and his opening partnership with James Anderson will set the tone for how England will cope out in Australia. Getty 14/17 Bowler: James Anderson England's leading Test wicket-taker will be wrapped in cotton wool until the first Test, though he will have to deliver the goods in a country where swing can be hard to find. Getty 15/17 Bowler: Jake Ball Ball could prove to be England's joke in the pack given his extra pace and bounce. Think Chris Tremlett a la 2010/11. It's just a case of keeping him fit. Getty 16/17 Bowler: Craig Overton The third uncapped member of the squad, Overton has been rewarded for a solid season with Somerset. Getty 17/17 Bowler: Tom Curran Called up by England to replace Steven Finn, who had previously been called up due to Ben Stokes' uncertainty. Getty 1/17 Captain: Joe Root England's Mr Dependable will lead his side into an Ashes series for the first time, and while he has the experience of the series wins in 2013 and 2015, he also has the scars of the last trip Down Under. Getty 2/17 Batsman: Alastair Cook The former captain will be crucial to England's hopes, with the \ No newline at end of file diff --git a/data/webtext_train_10/3.txt b/data/webtext_train_10/3.txt new file mode 100644 index 0000000..e06be98 --- /dev/null +++ b/data/webtext_train_10/3.txt @@ -0,0 +1,43 @@ +We'll have turkey on the table Thursday but, as yet, it looks like no turkey in the Presidential on-deck circle. And both Gov. Howard Dean and Congressman Bernie Sanders think credit should go where credit is due. That means, thank you, Ralph Nader. Thanks a whole frickin' lot! + +Shortly after the votes were counted and the stalemate began, Dean said he hoped the Nader voters in the Sunshine State — all 93,000 — were "happy." Said he hoped they would be comfortable with the Supreme Court justices that Dubya will be appointing if he proves to be the eventual winner. Dean supported Ralph Nader's right to run, but sarcastically expressed the hope that Nader's supporters will live at peace with themselves under the looming Bush administration. + +And Sanders, an old friend of the Green Party candidate, was even more upset. + +"It seemed to me," said Ol' Bernardo, "that in the last month or two, Nader really made a deliberate effort to defeat Al Gore. He went to those states where the races were closest and was pretty clear about his goal. I don't understand that and I think that was wrong." + +Hindsight's always 20-20. + +Thanks, Ralph. + +Civil-Unions Update — The 11-member commission established by Act 91 met last week at the Statehouse to check up on how Vermont's landmark legalization of love for all couples was shaking out. The news was quite good. + +Steve Patterson, deputy commerce secretary, reported no negative repercussions whatsoever to date. While there's plenty of anecdotal information about the positive effect the more than 1200 civil unions have had on Vermont's hospitality sector, Patterson said his agency had compiled no data to quantify that. + +Town Clerk Linda Spence of Manchester told the commission over the speakerphone that implementation of the civil-unions law has been remarkably smooth. Vance is also a justice of the peace and the president of the Association of Town Clerks and Treasurers. She was familiar with just one case of rude treatment given an out-of-state lesbian couple by a town official in Weston. 
+ +"The sun still rises and sets in Vermont," said Spence. "I myself am a heterosexual, but I have to say my experiences both as town clerk and justice of the peace have been nothing but positive with this law in place. It has proved to me to be one of the most moving and emotional pieces of legislation I have seen, and I don't see where it does any harm to anybody." + +We're No.1??? — University of Vermont men's ice hockey Coach Mike Gilligan told Seven Days Tuesday morning he's hanging in there. After all, the guy's a veteran of the game. A senior statesman. An institution. + +When Gilligan first hit Burlington, Bernie Sanders was mayor, Madeleine Kunin just got elected governor, Phish was a typo. Nobody ever heard of Bill Clinton. The lakefront bikepath did not exist. + +Things change. + +That's why we had to check Gilligan's pulse this week. At no time in his 16 years of whistling Vermont line changes has Mike Gilligan been here before. It's uncharted territory. The numbers don't lie. As the old Green & Gold prepared for Tuesday night's game against non-league opponent UMass-Amherst, Vermont is 4-0 and flying solo in first place in the ECAC. + +Yeah, yeah, yeah, I know. It's much too early to suggest that Cinderella is spending the winter in Burlington, Vermont. Way too early. Dream on, right? + +But they say the darkest hour comes right before the dawn. And everybody remembers the black night that swallowed UVM last season. However, what these guys have been doing on the ice speaks volumes. + +"They learned quite a bit last season," Gilligan told Seven Days. "They learned how precious one game is. How precious a night on the ice is." + +It shows. + +And make no mistake, this is a disciplined team in more ways than one. Gilligan told us forward Graham Mink, a junior from Stowe, Vermont, sat out the first four games as punishment for breaking an undisclosed team rule. When he finally got to play in the Yale game, Big Mink played like a gorilla on ice skates. + +A couple of Minnesota schools are coming in this weekend for a Saturday-Sunday tournament at the Gut with Gilligan's Gorillas and UNH. Duluth and Mankato, in the giant Minnesota state university system, will hit Burlap with some rock-'em-sock-'em "western-style" hockey. Welcome to Vermont, boys! + +Media Notes — Is there Mardi Gras coverage in Sera Congi's future? + +Congi is the talented co-anchor of "Vermont's Own" Ch. 3 \ No newline at end of file diff --git a/data/webtext_train_10/4.txt b/data/webtext_train_10/4.txt new file mode 100644 index 0000000..eacc254 --- /dev/null +++ b/data/webtext_train_10/4.txt @@ -0,0 +1,27 @@ +The 1945 Sinkings of the Cap Arcona and the Thielbek + +Allied Attacks Killed Thousands of Concentration Camp Inmates + +By Mark Weber + +All prisoners of German wartime concentration camps who perished while in German custody are routinely regarded as "victims of Nazism" -- even if they lost their lives as direct or indirect result of Allied policy. Similarly, all Jews who died in German captivity during World War II -- no matter what the cause of death -- are counted as "victims of the Holocaust." + +This view is very misleading, if not deceitful. In fact, many tens of thousands of camp inmates and Jews lost their lives as direct and indirect victims of Allied action, or of the horrors of the Second World War. 
For example, the many thousands of Jews who perished in the notorious Bergen-Belsen camp during and after the final months of the war in Europe, including Anne Frank, were primarily victims not of German policy, but rather of the turmoil and chaos of war. + +Among the German concentration camp prisoners who perished at Allied hands were some 7,000 inmates who were killed during the war's final week as they were being evacuated in three large German ships that were attacked by British war planes. This little-known tragedy is one of history's greatest maritime disasters. + +The Cap Arcona, launched in May 1927, was a handsome passenger ship of the "Hamburg-South America" line. At 27,000 gross registered tons, it was the fourth-largest ship in the German merchant marine. For twelve years -- until the outbreak of war in 1939 -- she had sailed regularly between Hamburg and Rio de Janeiro. In the war's final months she was pressed into service by the German navy to rescue refugees fleeing from areas in the east threatened by the Red Army. This was part of a vast rescue operation organized by the German navy under the supervision of Grand Admiral Karl Dönitz. All but unknown in the United States today, this great undertaking saved countless lives. The Thielbek, a much smaller ship of 2,800 gross registered tons, was also used to transport refugees as part of the rescue operation. + +In April 1945, Karl Kaufmann, Gauleiter of Hamburg and Reich Commissioner for merchant shipping, transferred the Cap Arcona and the Thielbek from naval command, and ordered them to Neustadt Bay in the Baltic Sea near the north German city of Lübeck. + +Some 5,000 prisoners hastily evacuated from the Neuengamme concentration camp (a few miles southeast of Hamburg) were brought on board the Cap Arcona between April 18 and 26, along with some 400 SS guards, a naval gunnery detail of 500, and a crew of 76. Similarly the Thielbek took on some 2,800 Neuengamme prisoners. Under the terrible conditions that prevailed in what remained of unoccupied Germany during those final weeks, conditions for the prisoners on board the two vessels were dreadful. Many of the tightly packed inmates were ill, and both food and water were in very short supply. + +On the afternoon of May 3, 1945, British "Typhoon" fighter-bombers, striking in several attack waves, bombarded and fired on the Cap Arcona and then the Thielbek. The two ships, which had no military function or mission, were flying many large white flags. "The hoisting of white flags proved useless," notes the Encyclopedia of the Third Reich. The attacks were thus violations of international law, for which -- if Britain and not Germany had been the vanquished power -- British pilots and their commanders could have been punished and even executed as "war criminals." + +The Thielbek, struck by rockets, bombs and machine gun fire, sank in just 15-20 minutes. British planes then fired on terror-stricken survivors who were struggling in rescue boats or thrashing in the cold sea. Nearly everyone on board the Thielbek perished quickly, including nearly all the SS guards, ship's officers and crew members. Only about 50 of the prisoners survived. + +The burning Cap Arcona took longer to go under. Many inmates burned to death. Most of those who were able to leap overboard drowned in the cold sea, and only some 350-500 could be rescued. During the next several days hundreds of corpses washed up on nearby shores, and were buried in mass graves. 
Having sunk in shallow water, the wreck of the capsized Cap Arcona remained partially above water as a grim reminder of the catastrophe. + +A German reference work, Verheimlichte Dokumente, sums up: + +A particularly barbaric Allied war crime was the bombing on May 3, 1945, by British Royal Air Force planes of the passenger ships Cap Arcona and Thielbek in the Lübeck bay, packed with concentration camp inmates. Among the many 'nameless' victims were many prominent political figures, a fact that is hushed up today because the fact that concentration camp inmates, many of them \ No newline at end of file diff --git a/data/webtext_train_10/5.txt b/data/webtext_train_10/5.txt new file mode 100644 index 0000000..890bc90 --- /dev/null +++ b/data/webtext_train_10/5.txt @@ -0,0 +1,7 @@ +Kim Kardashian is jumping on the hype wave and releasing a fidget spinner. The spinner is a gold money symbol and it says "daddy," which is apt since it's called the Daddy Money Fidget Spinner. It can be yours — with a seven-day shipping delay because these things take time — for the low price of $15, plus $4 for shipping + +Fidget spinners are apparently still a thing. But since I could not give less of a shit about them, I asked my colleague and noted fidget spinner enthusiast, Ashley Carman, what she thought about Kim's latest business endeavor. + +"I would never spend $15 on a Kim K spinner," says Carman. + +Well there you have it. Have a nice weekend. \ No newline at end of file diff --git a/data/webtext_train_10/6.txt b/data/webtext_train_10/6.txt new file mode 100644 index 0000000..94b361f --- /dev/null +++ b/data/webtext_train_10/6.txt @@ -0,0 +1,25 @@ +10 of London's greatest Victorian projects – 4. The Palace of Westminster… February 6, 2013 + +Commonly thought to be older than it actually is due to its Gothic stylings (although, to be fair, parts of it do date from medieval times), the Palace of Westminster – or, as it's more commonly known, the Houses of Parliament – didn't actually take on much of its current appearance until the latter half of the 19th century. + +The need for a new building for parliament arose after 1834 when a fire, caused by the overheating of two underfloor stoves used to incinerate the Exchequer's obsolete tally sticks, tore through the former complex, leaving only some structures from the old palace intact. They included the 11th century Westminster Hall (the largest in Europe when it was built), 14th century Jewel Tower and a chapterhouse, crypt and cloisters, all of which was once attached to the now gone St Stephen's Chapel. + +While King William IV offered the use of Buckingham Palace for Parliament, the idea – along with a host of other options – was rejected as unsuitable. Instead, a competition was held for a new design and after almost 100 entries were considered, architect Charles Barry and his design for a new palace in the perpendicular Gothic style was chosen. Interestingly, while Barry was a classical architect, under the terms of the competition, designs were required to be in a Gothic style, thought to embody conservative values . + +Incorporating some of the remains of the old palace – including Westminster Hall but not the Jewel Tower which to this day stands alone – the design was based around a series of internal courtyards with the House of Commons and House of Lords located on either side of a central lobby (first known as Octagonal Hall). The design involved reclaiming some land from the Thames so the building's main river-facing facade could be completed. 
+ +Towers stand at either end of the complex – the Victoria Tower over the Sovereign's Entrance at the southern end of the complex (for many years the tallest square stone tower in the world) and the narrower tower formerly known as the Clock Tower which houses the bell Big Ben, at the northern end – and there is a central Octagonal Tower which stands directly over the Central Lobby. The Clock Tower, incidentally, was renamed the Elizabeth Tower last year in honour of Queen Elizabeth II's Diamond Jubilee (for more on it and Big Ben, see our earlier entries here and here). + +Other towers include the Speaker's Tower (located at the northern end of the building on the waterfront, this contains a residence for the Speaker), the Chancellor's Tower (located at the southern end, it too contained a residence originally used by the Lord Chancellor) and St Stephen's Tower – located in the middle of the building's west front, it contains the public entrance to the building. Significant other rooms in the palace complex include the Robing Room – where the Queen puts on her ceremonial robes and crown before the State Opening of Parliament – and the Royal Gallery, used for state occasions. + +The foundation stone (the building was constructed out of sand-coloured limestone from Yorkshire) was laid in 1840 and construction of the monumental building – which features more than 1,100 rooms and two miles of passageways – wasn't completely finished until the 1870s although most of the work had been completed by 1860 (the year Barry died). The House of Lords first sat in their new chamber in 1847 and the House of Commons in 1852 (it was at this point that Barry was knighted for his work). + +The cost, meanwhile, originally estimated at less than £750,000, ended up coming in at more than £2 million. + +Much of the interior decoration owes its appearance to the Gothic revivalist Augustus Pugin who designed everything from wallpapers, to floor tiles and furnishings. Pugin also helped Barry with the external appearance but like Barry died before the project was completely finished (in 1852). + +The palace was bombed numerous times in World War II – in one raid, the Commons Chamber was destroyed as firefighters opted to save the much older Westminster Hall instead. It was later rebuilt under the direction of Sir Giles Gilbert Scott and completed by 1950. Other aspects of the building have also been restored. + +A Grade I-listed building classified as a World Heritage Site, Barry's Houses of Parliament remain one of London's most iconic structures. We'll be looking in more detail at some of the building's features in future posts. + +WHERE: Houses of Parliament (nearest Tube stations are Westminster, St James's Park and Embankment); WHEN: Tours (75 minutes) are run from 9.15am to 4.30pm on Saturdays (also six days a week during summer opening); COST: £15 adults/£10 concessions/£6 children five to 15 years (children under five are free). Prices go up after 1st April – check website for details and to purchase tickets ( \ No newline at end of file diff --git a/data/webtext_train_10/7.txt b/data/webtext_train_10/7.txt new file mode 100644 index 0000000..656a87a --- /dev/null +++ b/data/webtext_train_10/7.txt @@ -0,0 +1,19 @@ +: This week the Chilean government's promise to protect roughly 10 million acres of land became official, boosting the nation's parklands by 38.5 percent, according to a statement . 
Read our original story about the move below: + +Last week, the government of Chile signed an agreement taking possession of a 1-million-acres of private park land put together by a pair of American philanthropists. It also announced it would protect an additional 9 million acres of wildlands as national parks, reports Jonathan Franklin at The Guardian. + +Kris McDivitt Tompkins, former CEO of the clothing company Patagonia and her husband, Doug Tompkins, co-founder of the North Face and Esprit clothing lines, began buying hundreds of thousands of acres in the wild Patagonia region of Chile in the early 1990s, The Guardian's John Vidal reported last year. Their goal, Vidal writes, was to "buy and restore as much land as they could, improve and protect it, and then return it to people as public, national parks." + +After over two decades of work, they acquired 2.2 million acres of land, including the gifted land, Parque Pumalín and Patagonia, which together span roughly 1 million acres and represent the largest land donation from a private entity to a country. + +But Chile was not always receptive to the couple. In the beginning of the project, they were accused of being CIA spies, of trying to hobble Chile's economic development and called a national security threat. At one point the government threatened to take their land. + +"We were opposed for four years. We were 'the couple who cut Chile in half,'" McDivitt Tompkins tells Vidal. "They said we were setting up a nuclear-waste dump or a new Jewish state." + +But in recent years, the Chilean government has warmed up to the conservation projects, and president Michelle Bachelet was on hand at the border of Pumalin Park to sign the documents authorizing the handover. As Elizabeth Royte at National Geographic reports, Chile hopes to include the new parks in a 1,500-mile tourism route they want to call the Ruta de los Parques, which would link together 17 national parks and offer everything from rainforest hikes and mountaineering to sea kayaking. By some estimates the new parks will bring $270 million into the area and employ 43,000 people. + +The new parks make Chile one of Central and South America's most eco-conscious nations. "That puts Chile right up there with Costa Rica in terms of the percentage of protected lands," Yvon Chouinard, founder of the Patagonia clothing company tells Franklin. "No other human has ever created this many acres of protected wildlands…These are tourist-ready parks with trails and cabins and infrastructure." + +However, Doug Tompkins, who died in 2015 in a kayaking accident, will never see the fruits of their labor. "I wish my husband Doug, whose vision inspired today's historic pledge, were here on this memorable day. Our team and I feel his absence deeply," McDivitt Tompkins says in a press release. "But I know that if Doug were here today, he would speak of national parks being one of the greatest expressions of democracy that a country can realize, preserving the masterpieces of a nation for all of its citizenry." + +The handover of the Tompkins property will take place incrementally over the next two years. \ No newline at end of file diff --git a/data/webtext_train_10/8.txt b/data/webtext_train_10/8.txt new file mode 100644 index 0000000..1435d84 --- /dev/null +++ b/data/webtext_train_10/8.txt @@ -0,0 +1,17 @@ +Household fast food names could soon be a thing of the past thanks to the funding of plant-based alternatives from some of the world's biggest investors. 
The plant-based food scene is changing rapidly, no longer are vegans thought to eat nothing but lettuce and lentils, the plant-based burger revolution has arrived. What's more, big investors are excited about it. + +Recently, Beyond Meat's burger, otherwise known as 'the burger that bleeds' accomplished something huge when they struck a deal with stores owned by Kroger, that means their burger will be stocked in 605 stores across the US. Beyond have seen investments from some big names including Bill Gates, General Mills and even meat producers Tyson Foods. Tyson's CEO said earlier this year that the future of protein may lie in the meatless market. + +A similar burger, made by Impossible Foods, who Bill Gates has also invested in, managed to accumulate $75 million worth of funding last month. Alongside Bill Gates, Facebook co-founder Dustin Moskovitz and Asia's second richest man Li Ka-shing were amongst the contributors. + +Impossible are currently waiting on confirmation from the FDA about their ingredient 'heme', which they were granted a patent for this month. Heme is the key to their 'meat like' burgers and is derived from soy. As yet the FDA are undecided as to whether this product will be considered an allergen, which is currently stopping Impossible from distributing further. However, it seems that they will have a lot of support behind them when they do. + +Taking it one step further, from individual products to a whole chain of restaurants, By Chloe has taken the US by storm since its first store opened its doors in 2015. The fully plant-based chain which has been described as 'Shake Shack without the meat', is already venturing across the Atlantic to London, where it will open its first European store in Covent Garden later this year. Since opening, the chain has sold over 600,000 vegan burgers, which aren't the only item on their menu. These high figures may account for the whopping $13 million dollars invested in the chain recently to allow them to expand both in the US and overseas. + +Start-ups like these are seeing an increase in popularity, and it's not surprising. Plant-based foods are much more sustainable than their animal product alternative, something Impossible Foods boast about on their site. The increased number of options allow people who are still keen on the taste of meat, as well as those who want to celebrate vegetables, plenty of things to try without comprising their favourite foods. + +With backing from big investors these companies could go from just starting out to fast food giants pretty quickly, and with everything they have to brag about it could mean a sharp decline in some of the world's biggest fast food outlets. + +Unless, of course, McDonalds want to jump on the plant-based bandwagon? + +Image credit: Impossible Foods | Beyond Meat | Grub Street | by Chloe \ No newline at end of file diff --git a/data/webtext_train_10/9.txt b/data/webtext_train_10/9.txt new file mode 100644 index 0000000..d922b52 --- /dev/null +++ b/data/webtext_train_10/9.txt @@ -0,0 +1,45 @@ +by David E. Petzal - Thursday, June 29, 2017 + +According to a poll by the Media Insight Project, only 6 percent of Americans have faith in the news media. Among readers of American Rifleman, that figure is likely even lower. We've learned that, in all probability, anything we read or hear about guns will be either biased or factually wrong, or both. How come? + +Reason One: + +Lack of accountability. 
When I broke into the magazine business in 1964, I had some basic rules drummed into my crew-cut head. If you edited a piece, you were responsible for everything in it: spelling, punctuation, facts, everything—especially facts. + +If you were wrong, God help you. The least you would get was a serious public tongue-lashing. As a result of this process I can tell you to this day that it's the Smithsonian Institution, not Institute, and that "prairie" is not spelled "prarie." A cousin of mine, who worked at one of the big New York City daily newspapers, had exactly the same experience. + +This seems to have gone by the wayside. A small example: On Jan. 5, in The New York Times, there was an article on "ballistics vests." What's wrong with this? It's "ballistic vests." "Ballistics" is the science of projectile behavior and is a noun, while "ballistic" means relating to ballistics, and is an adjective. No one at The Times appears to know the difference. If I had made that mistake, or my cousin had, we would have been screamed at for 10 minutes with no regard for our "safe space." + +Reason Two: + +People in the same profession tend to think alike. Among the news media, the collective wisdom regarding guns runs as follows: + +1. They believe guns are inherently evil. + +2. They think gun owners are, at the least, disturbed and, at the worst, dangerous. + +3. They believe the National Rifle Association has, since its founding in 1871, done nothing worthwhile, has never been right about anything and is nothing more than a shill for the firearm industry. + +4. They believe the Second Amendment has no relevance in today's United States, and if only we could institute reasonable gun controls there would be no more gun violence. + +5. It is their opinion that guns in the home lead only to tragedy, and self-protection is a myth. + +Now, let's assume that you're a student in journalism school, and the subject of guns comes up, and you venture the fact that you own guns, and use them, and that your family has always had guns in the home, and that there's never been an accident, and that you don't even know of any accidents. How do you think that statement would be received? + +Most likely, you would shortly receive a summons from the dean of students and probably the college psychiatrist to come in and chat, not that there's anything wrong, you understand, but … . + +Or let's say that you work in a newsroom and the subject of guns used in self-defense comes up and you point out that, every month, you can cite half a dozen or more cases in "The Armed Citizen" where someone who had a gun prevented a crime or saved themselves or someone else and, in many instances, did it without firing a shot. Imagine the look you're going to get from the news director. It will not be the kind of look that says your career will prosper here. + +Reason Three: + +Good old-fashioned ignorance. I got my first exposure to this in the mid-1960s when the armed forces began issuing M16s. The 5.56 mm bullets, we were told by reporters, created terrible wounds because "they tumbled through the air and hit people like little buzz saws." + +"Golly gee," I said, (or words to that effect), "that's just not possible. Anything that tumbles through the air isn't going to go where it's aimed. Just look at a football that doesn't spiral." + +But there it was, and there was more to come. + +When the first Glocks were imported they were denounced as "all plastic," and therefore undetectable at airports. 
The media, knowing nothing of how guns work, and not understanding such concepts as mass and resistance, printed the nonsense verbatim. + +When the great "cop-killer-bullet" brouhaha erupted, the media also neglected to check the facts. If they had, they would have learned that Teflon-coated ammunition was very expensive, made in limited amounts, sold to police agencies only and had never been used to kill a cop. + +Journalists are ignorant about guns because being knowledgeable about guns requires at least a smattering of \ No newline at end of file diff --git a/main.py b/main.py index c2b0649..60a45ed 100644 --- a/main.py +++ b/main.py @@ -11,11 +11,12 @@ import numpy as np from GPT2.model import (GPT2LMHeadModel) from GPT2.utils import load_weight -from GPT2.config import GPT2Config +from GPT2.config import get_config from GPT2.sample import sample_sequence from GPT2.encoder import get_encoder -def text_generator(state_dict): + +def text_generator(state_dict, config): parser = argparse.ArgumentParser() parser.add_argument("--text", type=str, required=True) parser.add_argument("--quiet", type=bool, default=False) @@ -42,7 +43,6 @@ def text_generator(state_dict): # Load Model enc = get_encoder() - config = GPT2Config() model = GPT2LMHeadModel(config) model = load_weight(model, state_dict) model.to(device) @@ -60,7 +60,7 @@ def text_generator(state_dict): for _ in range(args.nsamples // args.batch_size): out = sample_sequence( model=model, length=args.length, - context=context_tokens if not args.unconditional else None, + context=context_tokens if not args.unconditional else None, start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None, batch_size=args.batch_size, temperature=args.temperature, top_k=args.top_k, device=device @@ -74,9 +74,13 @@ def text_generator(state_dict): print(text) if __name__ == '__main__': - if os.path.exists('gpt2-pytorch_model.bin'): - state_dict = torch.load('gpt2-pytorch_model.bin', map_location='cpu' if not torch.cuda.is_available() else None) - text_generator(state_dict) + model_name = '117M' + model_path = os.path.join('pretrained_models', model_name, 'model.bin') + + if os.path.exists(model_path): + state_dict = torch.load(model_path, map_location='cpu' if not torch.cuda.is_available() else None) + config = get_config(model_name) + text_generator(state_dict, config) else: print('Please download gpt2-pytorch_model.bin') sys.exit() diff --git a/pretrained_models/117M/README.md b/pretrained_models/117M/README.md new file mode 100644 index 0000000..ae359ea --- /dev/null +++ b/pretrained_models/117M/README.md @@ -0,0 +1,2 @@ +Download 117M pretrained model from huggingface/pytorch-pretrained-BERT: +```$ curl --output model.bin https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin``` \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 6ec436b..b783b6f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,3 @@ -regex==2017.4.5 \ No newline at end of file +regex==2017.4.5 +tqdm==4.31.1 +torch==1.0.0 diff --git a/train.py b/train.py new file mode 100644 index 0000000..dca8288 --- /dev/null +++ b/train.py @@ -0,0 +1,264 @@ +""" +Finetune a pretrained gpt2 model on a custom dataset. 
+ Original Paper and repository here: https://github.com/openai/gpt-2 + Adapted from code by nshepperd: https://github.com/nshepperd/gpt-2/blob/finetuning/train.py +""" +import os +import tqdm +import time +import argparse +import torch + +from GPT2.model import GPT2LMHeadModel +from GPT2.utils import load_weight +from GPT2.config import get_config +from GPT2.sample import sample_sequence +from GPT2.encoder import get_encoder +from GPT2.data import load_dataset, Sampler + +FINETUNED_DIR = 'finetuned_models' +CHECKPOINT_DIR = os.path.join(FINETUNED_DIR, 'checkpoint') +SAMPLE_DIR = os.path.join(FINETUNED_DIR, 'samples') + + +def get_state_dict(model_name='117M'): + model_path = os.path.join('pretrained_models', model_name, 'model.bin') + + if os.path.exists(model_path): + state_dict = torch.load(model_path, map_location='cpu' if not torch.cuda.is_available() else None) + else: + print('model does not exist at: {}'.format(model_path)) + exit(0) + return state_dict + + +def load_model(model, state_dict, device): + model = load_weight(model, state_dict) + model.to(device) + model.eval() + return model + + +def get_latest_ckpt(ckpt_run_dir): + + if not os.path.isdir(ckpt_run_dir): + return None + + ckpts = [ckpt for ckpt in os.listdir(ckpt_run_dir) if ckpt.endswith('.tar')] + + if len(ckpts) == 0: + return None + + ckpts = [(ckpt, int(ckpt.split('.')[0].split('-')[1])) for ckpt in ckpts] + ckpt, counter = max(ckpts, key=lambda tup: tup[1]) + ckpt_path = os.path.join(ckpt_run_dir, ckpt) + + return ckpt_path + + +def maketree(path): + os.makedirs(path, exist_ok=True) + + +def main(): + parser = argparse.ArgumentParser(description='Fine-tune GPT-2 on your custom dataset.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument('--dataset', metavar='PATH', type=str, required=True, + help='Input file, directory, or glob pattern (utf-8 text, or preencoded .npz files).') + parser.add_argument('--model_name', metavar='MODEL', type=str, default='117M', help='Pretrained model name') + parser.add_argument('--combine', metavar='CHARS', type=int, default=50000, + help='Concatenate input files with <|endoftext|> separator into chunks of this minimum size') + + parser.add_argument('--batch_size', metavar='SIZE', type=int, default=1, help='Batch size') + parser.add_argument('--learning_rate', metavar='LR', type=float, default=0.00002, help='Learning rate for Adam') + parser.add_argument('--accumulate_gradients', metavar='N', type=int, default=1, + help='Accumulate gradients across N minibatches.') + parser.add_argument('--only_train_transformer_layers', default=False, action='store_true', + help='Restrict training to the transformer blocks.') + parser.add_argument('--optimizer', type=str, default='adam', help='Optimizer. .') + parser.add_argument('--noise', type=float, default=0.0, + help='Add noise to input training data to regularize against typos.') + + parser.add_argument('--top_k', type=int, default=40, help='K for top-k sampling.') + parser.add_argument('--top_p', type=float, default=0.0, help='P for top-p sampling. Overrides top_k if set > 0.') + + parser.add_argument('--restore_from', type=str, default='latest', + help='Either "latest", "fresh", or a path to a checkpoint file') + parser.add_argument('--run_name', type=str, default='run1', + help='Run id. 
Name of subdirectory in finetuned_models/') + parser.add_argument('--sample_every', metavar='N', type=int, default=100, help='Generate samples every N steps') + parser.add_argument('--sample_length', metavar='TOKENS', type=int, default=1023, help='Sample this many tokens') + parser.add_argument('--sample_num', metavar='N', type=int, default=1, help='Generate this many samples') + parser.add_argument('--save_every', metavar='N', type=int, default=1000, help='Write a checkpoint every N steps') + + parser.add_argument('--val_dataset', metavar='PATH', type=str, default=None, + help='Dataset for validation loss, defaults to --dataset.') + parser.add_argument('--val_batch_size', metavar='SIZE', type=int, default=2, help='Batch size for validation.') + parser.add_argument('--val_batch_count', metavar='N', type=int, default=40, + help='Number of batches for validation.') + parser.add_argument('--val_every', metavar='STEPS', type=int, default=0, + help='Calculate validation loss every STEPS steps.') + + # settings + args = parser.parse_args() + print(args) + + enc = get_encoder() + config = get_config(args.model_name) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = GPT2LMHeadModel(config) + + # error checking + if args.sample_length > config.n_ctx: + raise ValueError("Can't get samples longer than window size: %s" % config.n_ctx) + + if args.model_name == '345M': + args.memory_saving_gradients = True + if args.optimizer == 'adam': + args.only_train_transformer_layers = True + + # select variables to update while training + all_vars = [tensor for tensor in model.parameters()] + transformer_vars = [tensor for name, tensor in model.named_parameters() if 'transformer.h.' in name] + train_vars = transformer_vars if args.only_train_transformer_layers else all_vars + + # create optimizer + if args.optimizer == 'adam': + optimizer = torch.optim.Adam(train_vars, lr=args.learning_rate) + elif args.optimizer == 'sgd': + optimizer = torch.optim.SGD(train_vars, lr=args.learning_rate) + else: + exit('Bad optimizer:', args.optimizer) + + # load model + if args.restore_from == 'latest': + ckpt_path = get_latest_ckpt(os.path.join(CHECKPOINT_DIR, args.run_name)) + + if ckpt_path is None: + state_dict = get_state_dict(args.model_name) + model = load_model(model, state_dict, device) + counter = 1 + + else: + ckpt = torch.load(ckpt_path) + model = load_model(model, ckpt['model_state_dict'], device) + optimizer.load_state_dict(ckpt['optimizer_state_dict']) + counter = ckpt['counter'] + + elif args.restore_from == 'fresh': + state_dict = get_state_dict(args.model_name) + model = load_model(model, state_dict, device) + counter = 1 + + else: # path to a checkpoint tar file + ckpt = torch.load(args.restore_from) + model = load_model(model, ckpt['model_state_dict'], device) + optimizer.load_state_dict(ckpt['optimizer_state_dict']) + counter = ckpt['counter'] + + # load datasets + print('load training dataset...') + chunks = load_dataset(enc, args.dataset, args.combine) + data_sampler = Sampler(chunks) + print('dataset has {} tokens'.format(data_sampler.total_size)) + + if args.val_every > 0: + # Sample from validation set once with fixed seed to make + # it deterministic during training as well as across runs. 
+ print('load validation dataset...') + val_chunks = load_dataset(enc, args.val_dataset, args.combine) if args.val_dataset else chunks + val_data_sampler = Sampler(val_chunks, seed=1) + val_batches = torch.tensor([[val_data_sampler.sample(1024) for _ in range(args.val_batch_size)] + for _ in range(args.val_batch_count)]) + + def save(): + maketree(os.path.join(CHECKPOINT_DIR, args.run_name)) + save_path = os.path.join(CHECKPOINT_DIR, args.run_name, 'ckpt-{}.tar'.format(counter)) + torch.save({ + 'counter': counter, + 'model_state_dict': model.state_dict(), + 'optimizer_state_dict': optimizer.state_dict() + }, save_path) + + def generate_samples(): + """Generate unconditional samples.""" + print('Generating samples...') + + generated = 0 + all_text = [] + + for _ in range(args.sample_num): + out = sample_sequence( + model=model, length=args.sample_length, context=None, + start_token=enc.encoder['<|endoftext|>'], batch_size=1, + temperature=1.0, top_k=args.top_k, device=device + ) + + out = out[:, :].tolist()[0] + generated += 1 + text = enc.decode(out) + print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) + print(text) + all_text.append(text) + + maketree(os.path.join(SAMPLE_DIR, args.run_name)) + with open(os.path.join(SAMPLE_DIR, args.run_name, 'samples-{}.txt'.format(counter)), 'w') as fp: + fp.write('\n'.join(all_text)) + + def validation(): + print('Calculating validation loss...') + losses = [] + for batch in tqdm.tqdm(val_batches): + loss = model(batch[:, :-1].to(device), lm_labels=batch[:, 1:].to(device)) + losses.append(loss) + v_val_loss = torch.mean(torch.tensor(losses)) + print('[{counter} | {time:2.2f}] validation loss = {loss:2.2f}' + .format(counter=counter, time=time.time() - start_time, loss=v_val_loss)) + + def sample_batch(): + return torch.tensor([data_sampler.sample(1024) for _ in range(args.batch_size)]) + + avg_loss = (0.0, 0.0) + start_time = time.time() + + # training + try: + while True: + if counter % args.save_every == 0: + save() + if counter % args.sample_every == 0: + generate_samples() + if args.val_every > 0 and (counter % args.val_every == 0 or counter == 1): + validation() + + if args.accumulate_gradients > 1: + optimizer.zero_grad() + + for _ in range(args.accumulate_gradients): + batch = sample_batch() + loss = model(batch[:, :-1].to(device), lm_labels=batch[:, 1:].to(device)) + loss.backward() + optimizer.step() + + else: + optimizer.zero_grad() + batch = sample_batch() + loss = model(batch[:, :-1].to(device), lm_labels=batch[:, 1:].to(device)) + loss.backward() + optimizer.step() + + avg_loss = (avg_loss[0] * 0.99 + loss, avg_loss[1] * 0.99 + 1.0) + + print('[{counter} | {time:2.2f}] loss={loss:2.2f} avg={avg:2.2f}' + .format(counter=counter, time=time.time() - start_time, + loss=loss, avg=avg_loss[0] / avg_loss[1])) + + counter += 1 + + except KeyboardInterrupt: + print('interrupt') + save() + +if __name__ == '__main__': + main() From 64e58e7cce7896249476762c8ac484387e43f6a0 Mon Sep 17 00:00:00 2001 From: jbrophy Date: Mon, 20 May 2019 12:48:45 -0700 Subject: [PATCH 2/2] fixing issues --- GPT2/data.py | 2 +- train.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/GPT2/data.py b/GPT2/data.py index b02cbc2..24863b7 100644 --- a/GPT2/data.py +++ b/GPT2/data.py @@ -40,7 +40,7 @@ def load_dataset(enc, path, combine): else: # Plain text - with open(path, 'r') as fp: + with open(path, mode='r', encoding='utf-8') as fp: raw_text += fp.read() if len(raw_text) >= combine: diff --git a/train.py b/train.py index dca8288..9244b10 
100644
--- a/train.py
+++ b/train.py
@@ -35,7 +35,6 @@ def get_state_dict(model_name='117M'):
 def load_model(model, state_dict, device):
     model = load_weight(model, state_dict)
     model.to(device)
-    model.eval()
     return model
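
For readers skimming the patch, here is a minimal sketch (not part of the patch itself) of how the new data pipeline in `GPT2/data.py` feeds the fine-tuning loop in `train.py`. It assumes the script runs from the repository root, since `get_encoder()` reads `./GPT2/encoder.json` and `./GPT2/vocab.bpe` relative to the working directory; the batch size of 2 and seed of 0 are illustrative values only.

```python
# Hedged sketch: mirrors what train.py does with load_dataset/Sampler,
# but is not itself part of the patch.
import torch

from GPT2.data import load_dataset, Sampler
from GPT2.encoder import get_encoder

enc = get_encoder()

# Text files are concatenated with '<|endoftext|>' separators and encoded
# into chunks of at least `combine` characters; .npz files are used as-is.
chunks = load_dataset(enc, 'data/webtext_train_10', combine=50000)

sampler = Sampler(chunks, seed=0)
print('dataset has {} tokens'.format(sampler.total_size))

# One batch of 1024-token windows, as in train.py's sample_batch() helper,
# shifted by one position to form the language-modelling labels.
batch = torch.tensor([sampler.sample(1024) for _ in range(2)])
inputs, labels = batch[:, :-1], batch[:, 1:]
```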
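Similarly, a hedged sketch of reloading a checkpoint written by `train.py`'s `save()` (which stores `counter`, `model_state_dict`, and `optimizer_state_dict` under `finetuned_models/checkpoint/<run_name>/ckpt-<counter>.tar`) and drawing one unconditional sample. The checkpoint path below is illustrative; loading goes through `load_weight()`, exactly as `train.py`'s `load_model()` helper does.

```python
# Hedged sketch: reload a fine-tuned checkpoint and generate one sample.
# The ckpt path is an example; substitute your own run name and step count.
import torch

from GPT2.config import get_config
from GPT2.encoder import get_encoder
from GPT2.model import GPT2LMHeadModel
from GPT2.sample import sample_sequence
from GPT2.utils import load_weight

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

ckpt = torch.load('finetuned_models/checkpoint/run1/ckpt-1000.tar',
                  map_location='cpu' if not torch.cuda.is_available() else None)
model = GPT2LMHeadModel(get_config('117M'))
model = load_weight(model, ckpt['model_state_dict'])  # also re-ties input/output embeddings
model.to(device)
model.eval()

enc = get_encoder()
out = sample_sequence(model=model, length=200, batch_size=1,
                      start_token=enc.encoder['<|endoftext|>'],
                      temperature=1.0, top_k=40, device=device)
print(enc.decode(out[0].tolist()))
```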