Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Virtual Environment
venv/
env/
ENV/
.venv/

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
*.manifest
*.spec

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Model files (optional - uncomment if you don't want to track model files)
# *.h5
# *.pkl
# *.pickle

# Data files (optional - uncomment if you don't want to track large data files)
# *.json
# *.csv

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

# Logs
*.log

# Ignore model and history files
models/
51 changes: 41 additions & 10 deletions Readme.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,45 @@
# Next Word Prediction Model
# Next Word Prediction with LSTM

This notebook shows how to train an English next-word prediction model using an LSTM (with Keras and NLTK).
A neural network model for predicting the next word in a sequence using LSTM, trained on the BSD (Business Scene Dialogue) corpus.

### Required files
To run the notebook (`next_word_prediction_en_bsd.ipynb`) in Google Colab, you need to download the following file and adjust the paths inside the notebook:
* training file (`train.json`) from [this repo](https://github.com/tsuruoka-lab/BSD).
## Installation

### Load a trained model
If you want, you can skip the training step and load the trained model below to make predictions.
* [trained model](https://drive.google.com/file/d/1kV8nDT2UGfcLm2klgSr_K-7kMrbupGU3/view?usp=sharing) (`next_word_model.h5`)
1. **Create and activate virtual environment:**
```bash
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```

### Notes
Detailed explanations can be found inside the notebook.
2. **Install dependencies:**
```bash
pip install -r requirements.txt
```

3. **Download NLTK data:**
```bash
python -c "import nltk; nltk.download('punkt')"
```

## Usage

### Train a new model:
```bash
python next_word_prediction.py
```

### Run test predictions:
```bash
python test_next_word_prediction.py
```

## Data Format

Training data should be a JSON file from the [BSD corpus](https://github.com/tsuruoka-lab/BSD) with business dialogue conversations. Place it in the `downloads/` directory as `train.json`.

## Notes

- Requires `downloads/train.json` file for training and vocabulary recreation
- Model and history are saved to the `models/` directory by default
- Model uses a 5-word context window (input to prediction must be exactly 5 words)
- Outputs top-N most likely next words
- You can plot training history using `plot_training_history`
258 changes: 258 additions & 0 deletions next_word_prediction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""
Next Word Prediction with LSTM - BSD Corpus (English only)
A neural network model for predicting the next word in a sequence using LSTM.
"""

import heapq
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.models import Sequential, load_model
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.optimizers import RMSprop
from tqdm import tqdm


class NextWordPredictor:
    """LSTM-based next-word predictor over a fixed-size word context window.

    Training flow: ``load_data`` -> ``preprocess_text`` ->
    ``create_sequences`` -> ``vectorize_sequences`` -> ``build_model`` ->
    ``train_model`` -> ``save_model``. Inference goes through
    ``predict_next_word`` and needs both a model and the vocabulary built by
    ``preprocess_text``.
    """

    def __init__(self, word_length=5):
        """Create an untrained predictor.

        Args:
            word_length: Number of context words fed to the model per sample.
        """
        self.word_length = word_length
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.unique_words = None
        self.word2id = None
        self.id2word = None
        self.model = None
        # History dict of the most recent train_model() call; None until then.
        self.history = None

    def load_data(self, json_file_path):
        """Load the English sentences from a BSD corpus JSON file.

        The file is read with ``pandas.read_json``; each record must provide
        a ``tag`` field and a ``conversation`` list whose entries carry an
        ``en_sentence`` field.

        Args:
            json_file_path: Path to the JSON training file.

        Returns:
            A list with every ``en_sentence`` value, in file order.
        """
        print("Loading data...")
        df = pd.read_json(json_file_path)
        print(f"Loaded {len(df)} conversations with tags: {df['tag'].unique()}")

        # Flatten the per-conversation utterance lists into a single frame.
        df_convs = pd.concat(
            [pd.json_normalize(conversation) for conversation in df['conversation']],
            ignore_index=True,
        )

        train_en = df_convs["en_sentence"].values.tolist()
        print(f"Total English sentences: {len(train_en)}")

        return train_en

    def preprocess_text(self, sentences, max_sentences=10000):
        """Tokenize the corpus and build the vocabulary mappings.

        Sets ``unique_words``, ``word2id`` and ``id2word`` on the instance.

        Args:
            sentences: List of sentence strings.
            max_sentences: Only the first ``max_sentences`` sentences are
                used, to bound memory use.

        Returns:
            The list of word tokens of the (truncated) corpus.
        """
        print("Preprocessing text...")

        # Limit sentences due to memory constraints
        text = ' '.join(sentences[:max_sentences])
        print(f'Corpus length: {len(text)} characters')

        words = self.tokenizer.tokenize(text)
        print(f'Total words: {len(words)}')

        self.unique_words = np.unique(words)
        print(f'Unique words: {len(self.unique_words)}')

        # Store plain ``str`` objects rather than ``np.str_`` so predictions
        # print as ordinary words (NumPy 2 shows np.str_ as "np.str_('x')").
        # Lookups are unaffected: both types hash and compare identically.
        self.word2id = {str(word): i for i, word in enumerate(self.unique_words)}
        self.id2word = {i: word for word, i in self.word2id.items()}

        return words

    def create_sequences(self, words):
        """Slide a ``word_length`` window over ``words`` to build samples.

        Args:
            words: List of word tokens.

        Returns:
            ``(prev_words, next_words)``: the context windows and the word
            following each window. Both lists are empty when ``words`` has
            no more than ``word_length`` tokens.
        """
        print("Creating training sequences...")
        prev_words = []
        next_words = []
        for i in tqdm(range(len(words) - self.word_length), desc="Sequences"):
            prev_words.append(words[i:i + self.word_length])
            next_words.append(words[i + self.word_length])
        print(f"Created {len(prev_words)} training sequences")
        # A corpus shorter than the window yields no samples; do not index [0].
        if prev_words:
            print(f"Example: {prev_words[0]} -> {next_words[0]}")
        return prev_words, next_words

    def vectorize_sequences(self, prev_words, next_words):
        """Convert word sequences to one-hot encoded boolean arrays.

        Args:
            prev_words: List of context windows (lists of words).
            next_words: List of target words, parallel to ``prev_words``.

        Returns:
            ``(X, Y)`` where ``X`` has shape
            ``(len(prev_words), word_length, vocab_size)`` and ``Y`` has
            shape ``(len(next_words), vocab_size)``.

        Raises:
            KeyError: If a word is not in the vocabulary.
        """
        print("Vectorizing sequences...")
        vocab_size = len(self.unique_words)
        X = np.zeros((len(prev_words), self.word_length, vocab_size), dtype=bool)
        Y = np.zeros((len(next_words), vocab_size), dtype=bool)
        for i, words in tqdm(enumerate(prev_words), total=len(prev_words), desc="Vectorizing"):
            for j, word in enumerate(words):
                X[i, j, self.word2id[word]] = 1
            Y[i, self.word2id[next_words[i]]] = 1
        print(f"X shape: {X.shape}, Y shape: {Y.shape}")
        return X, Y

    def build_model(self):
        """Build and compile the network: LSTM(128) -> Dense -> softmax.

        The model is compiled with RMSprop (learning rate 0.01),
        categorical cross-entropy loss and an accuracy metric, and is stored
        on ``self.model``.

        Returns:
            The compiled model.
        """
        print("Building model...")

        vocab_size = len(self.unique_words)

        self.model = Sequential()
        self.model.add(LSTM(128, input_shape=(self.word_length, vocab_size)))
        self.model.add(Dense(vocab_size))
        self.model.add(Activation('softmax'))

        optimizer = RMSprop(learning_rate=0.01)
        self.model.compile(loss='categorical_crossentropy',
                           optimizer=optimizer,
                           metrics=['accuracy'])

        print("Model built successfully!")
        return self.model

    def train_model(self, X, Y, epochs=2, batch_size=128, validation_split=0.05):
        """Fit the model on the vectorized samples.

        Args:
            X: One-hot context array from ``vectorize_sequences``.
            Y: One-hot target array from ``vectorize_sequences``.
            epochs: Number of training epochs.
            batch_size: Mini-batch size.
            validation_split: Fraction of samples held out for validation.

        Returns:
            The history dict of the fit (per-epoch metric lists). It is also
            kept on ``self.history`` so ``save_model`` can persist it.
        """
        print("Training model...")

        history = self.model.fit(X, Y,
                                 validation_split=validation_split,
                                 batch_size=batch_size,
                                 epochs=epochs,
                                 shuffle=True)

        print("Training completed!")
        self.history = history.history
        return self.history

    def save_model(self, model_path='models/next_word_model.h5', history_path='models/history.p'):
        """Save the trained model and, when available, its training history.

        Missing parent directories of both paths are created first. The
        history is pickled to ``history_path`` only if ``train_model`` has
        been run on this instance.

        Args:
            model_path: Destination file for the model.
            history_path: Destination file for the pickled history dict.

        Raises:
            ValueError: If there is no model to save.
        """
        if self.model is None:
            raise ValueError("Model not trained or loaded")

        for path in (model_path, history_path):
            parent = os.path.dirname(path)
            if parent:
                os.makedirs(parent, exist_ok=True)

        self.model.save(model_path)
        print(f"Model saved to {model_path}")

        if self.history is not None:
            with open(history_path, 'wb') as history_file:
                pickle.dump(self.history, history_file)
            print(f"History saved to {history_path}")

    def load_trained_model(self, model_path='models/next_word_model.h5'):
        """Load a pre-trained model from ``model_path`` into ``self.model``.

        Only the model is restored; the vocabulary still has to be rebuilt
        with ``preprocess_text`` before predictions can be made.
        """
        self.model = load_model(model_path)
        print(f"Model loaded from {model_path}")

    def prepare_input(self, text):
        """One-hot encode ``text`` as a single model input sample.

        Words missing from the vocabulary are reported with a warning and
        left as all-zero rows.

        Args:
            text: String containing exactly ``word_length`` words.

        Returns:
            Float array of shape ``(1, word_length, vocab_size)``.

        Raises:
            ValueError: If the vocabulary has not been built, or ``text``
                does not tokenize to exactly ``word_length`` words.
        """
        if self.unique_words is None or self.word2id is None:
            raise ValueError(
                "Vocabulary not loaded; run preprocess_text() on the training data first"
            )

        words = self.tokenizer.tokenize(text)
        if len(words) != self.word_length:
            raise ValueError(f"Input must contain exactly {self.word_length} words")

        x = np.zeros((1, self.word_length, len(self.unique_words)))
        for t, word in enumerate(words):
            if word in self.word2id:
                x[0, t, self.word2id[word]] = 1.0
            else:
                print(f"Warning: '{word}' not in vocabulary")

        return x

    def sample_predictions(self, preds, top_n=3):
        """Return the indices of the ``top_n`` highest-scoring predictions.

        Args:
            preds: 1-D sequence of scores, one per vocabulary entry.
            top_n: Number of indices to return.

        Returns:
            List of indices ordered from highest to lowest score; ties keep
            the lower index first.
        """
        preds = np.asarray(preds).astype('float64')
        # Rank the raw scores directly. A log -> exp -> renormalise round
        # trip would not change the ordering and warns on zero scores.
        return heapq.nlargest(top_n, range(len(preds)), preds.take)

    def predict_next_word(self, text, n=3):
        """Predict the ``n`` most likely next words for ``text``.

        Args:
            text: String containing exactly ``word_length`` words.
            n: Number of candidate words to return.

        Returns:
            List of words, most likely first.

        Raises:
            ValueError: If no model is available, the vocabulary is missing,
                or ``text`` has the wrong number of words.
        """
        if self.model is None:
            raise ValueError("Model not trained or loaded")

        x = self.prepare_input(text)
        preds = self.model.predict(x, verbose=0)[0]
        next_indices = self.sample_predictions(preds, n)

        return [self.id2word[idx] for idx in next_indices]

    def plot_training_history(self, history):
        """Plot accuracy and loss curves side by side and show the figure.

        Args:
            history: Dict with ``accuracy``, ``val_accuracy``, ``loss`` and
                ``val_loss`` lists, as returned by ``train_model``.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

        # Plot accuracy
        ax1.plot(history['accuracy'], label='Training Accuracy')
        ax1.plot(history['val_accuracy'], label='Validation Accuracy')
        ax1.set_title('Model Accuracy')
        ax1.set_xlabel('Epoch')
        ax1.set_ylabel('Accuracy')
        ax1.legend()

        # Plot loss
        ax2.plot(history['loss'], label='Training Loss')
        ax2.plot(history['val_loss'], label='Validation Loss')
        ax2.set_title('Model Loss')
        ax2.set_xlabel('Epoch')
        ax2.set_ylabel('Loss')
        ax2.legend()

        plt.tight_layout()
        plt.show()


def demo_predictions(predictor):
    """Print the top-5 next-word predictions for a few sample phrases.

    Each phrase is passed to ``predictor.predict_next_word``; a failure on
    one phrase is reported and the remaining phrases are still tried.
    """
    banner = "="*50
    print("\n" + banner)
    print("PREDICTION EXAMPLES")
    print(banner)

    sample_phrases = (
        "I look forward to working",
        "Thank you very much for",
        "Could you please send me",
        "I would like to schedule",
        "Please let me know if",
    )

    for example in sample_phrases:
        # Best-effort demo: report the problem and move on to the next phrase.
        try:
            predictions = predictor.predict_next_word(example, n=5)
            print(f"\nInput: '{example}'")
            print(f"Predictions: {predictions}")
        except Exception as e:
            print(f"Error with '{example}': {e}")


def main():
    """Train, save, plot and demo the model; fall back to a saved model.

    When the training data file is missing, the function tries to load a
    pre-trained model instead and then returns. Only the data-loading step
    is covered by the ``FileNotFoundError`` handler, so a missing path in a
    later step (for example while saving) surfaces as its own error rather
    than being reported as missing training data.
    """
    print("Next Word Prediction with LSTM - BSD Corpus")
    print("="*50)
    # Initialize predictor
    print("[FLAG] Initializing predictor...")
    predictor = NextWordPredictor(word_length=5)

    # Load data
    json_file = "downloads/train.json"  # Updated path
    try:
        print("[FLAG] Loading data...")
        sentences = predictor.load_data(json_file)
    except FileNotFoundError:
        print("Training data file not found. Loading pre-trained model...")
        try:
            predictor.load_trained_model()
            print("Note: For demo purposes, you'll need to ensure vocabulary is loaded")
        except Exception as e:
            # Top-level boundary: report and explain what is required.
            print(f"Error loading model: {e}")
            print("Please ensure you have either:")
            print("1. The training data file (train.json)")
            print("2. A pre-trained model (models/next_word_model.h5) with vocabulary")
        return

    print("[FLAG] Preprocessing text...")
    words = predictor.preprocess_text(sentences, max_sentences=10000)
    print("[FLAG] Creating training sequences...")
    prev_words, next_words = predictor.create_sequences(words)
    print("[FLAG] Vectorizing sequences...")
    X, Y = predictor.vectorize_sequences(prev_words, next_words)
    print("[FLAG] Building model...")
    predictor.build_model()
    print("[FLAG] Training model...")
    history = predictor.train_model(X, Y, epochs=2)
    print("[FLAG] Saving model...")
    predictor.save_model()
    print("[FLAG] Plotting training history...")
    predictor.plot_training_history(history)
    print("[FLAG] Running demo predictions...")
    demo_predictions(predictor)


# Run the training/demo pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Loading