arusl · arusl · May 31, 2025
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,78 @@
+# Virtual Environment
+venv/
+env/
+ENV/
+.venv/
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Model files (optional - uncomment if you don't want to track model files)
+# *.h5
+# *.pkl
+# *.pickle
+
+# Data files (optional - uncomment if you don't want to track large data files)
+# *.json
+# *.csv
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# Logs
+*.log
+
+# Ignore model and history files
+models/ 
diff --git a/Readme.md b/Readme.md
@@ -1,14 +1,45 @@
-# Next Word Prediction Model
+# Next Word Prediction with LSTM
 
-This notebook shows how to train an English next word prediction model using LSTM (w/ Keras and NLTK).
+A neural network model for predicting the next word in a sequence using LSTM, trained on the BSD (Business Scene Dialogue) corpus.
 
-### Required files
-To run the notebook (`next_word_prediction_en_bsd.ipynb`) in Google Colab, you need to download the followings and adjust the paths inside the notebook:
-* training file (`train.json`) from [this repo](https://github.com/tsuruoka-lab/BSD).
+## Installation
 
-### Load a trained model
-If you want, you can skip the training step and load the trained model below to make predictions.
-* [trained model](https://drive.google.com/file/d/1kV8nDT2UGfcLm2klgSr_K-7kMrbupGU3/view?usp=sharing) (`next_word_model.h5`)
+1. **Create and activate virtual environment:**
+   ```bash
+   python -m venv venv
+   source venv/bin/activate  # On Windows: venv\Scripts\activate
+   ```
 
-### Notes
-Detailed explanations can be found inside the notebook.
+2. **Install dependencies:**
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. **Download NLTK data:**
+   ```bash
+   python -c "import nltk; nltk.download('punkt')"
+   ```
+
+## Usage
+
+### Train a new model:
+```bash
+python next_word_prediction.py
+```
+
+### Run test predictions:
+```bash
+python test_next_word_prediction.py
+```
+
+## Data Format
+
+Training data should be a JSON file from the [BSD corpus](https://github.com/tsuruoka-lab/BSD) with business dialogue conversations. Place it in the `downloads/` directory as `train.json`.
+
+## Notes
+
+- Requires `downloads/train.json` file for training and vocabulary recreation
+- Model and history are saved to the `models/` directory by default
+- Model uses a 5-word context window (input to prediction must be exactly 5 words)
+- Outputs top-N most likely next words
+- You can plot training history using `plot_training_history`
diff --git a/next_word_prediction.py b/next_word_prediction.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+"""
+Next Word Prediction with LSTM - BSD Corpus (English only)
+A neural network model for predicting the next word in a sequence using LSTM.
+"""
+
+import heapq
+import os
+import pickle
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from keras.layers import LSTM
+from keras.layers import Dense, Activation
+from keras.models import Sequential, load_model
+from nltk.tokenize import RegexpTokenizer
+from tensorflow.keras.optimizers import RMSprop
+from tqdm import tqdm
+
+
+class NextWordPredictor:
+    """Word-level next-word predictor backed by a single-layer LSTM.
+
+    The vocabulary (``unique_words``, ``word2id``, ``id2word``) is built by
+    :meth:`preprocess_text`; the Keras model is created by :meth:`build_model`
+    or restored by :meth:`load_trained_model`. Model inputs are one-hot
+    encoded windows of ``word_length`` consecutive words.
+    """
+
+    def __init__(self, word_length=5):
+        """Create an untrained predictor.
+
+        Args:
+            word_length: Number of context words fed to the model.
+        """
+        self.word_length = word_length
+        # Tokens are runs of word characters; punctuation is dropped.
+        self.tokenizer = RegexpTokenizer(r'\w+')
+        self.unique_words = None
+        self.word2id = None
+        self.id2word = None
+        self.model = None
+        # Filled by train_model() so that save_model() can persist it.
+        self.history = None
+
+    def load_data(self, json_file_path):
+        """Load the English sentences from a BSD corpus JSON file.
+
+        Args:
+            json_file_path: Path to the corpus file. Each record must carry
+                a ``tag`` and a ``conversation`` list whose items have an
+                ``en_sentence`` field.
+
+        Returns:
+            A flat list with the ``en_sentence`` of every utterance.
+        """
+        print("Loading data...")
+        df = pd.read_json(json_file_path)
+        print(f"Loaded {len(df)} conversations with tags: {df['tag'].unique()}")
+
+        # Flatten the per-conversation utterance lists into a single frame.
+        df_convs = pd.concat(
+            [pd.json_normalize(conversation) for conversation in df['conversation']],
+            ignore_index=True,
+        )
+
+        train_en = df_convs["en_sentence"].values.tolist()
+        print(f"Total English sentences: {len(train_en)}")
+
+        return train_en
+
+    def preprocess_text(self, sentences, max_sentences=10000):
+        """Tokenize the corpus and build the vocabulary mappings.
+
+        Args:
+            sentences: List of raw sentences.
+            max_sentences: Only the first ``max_sentences`` sentences are
+                used, to bound memory use of the one-hot encoding.
+
+        Returns:
+            The list of tokens of the (truncated) corpus, in order.
+        """
+        print("Preprocessing text...")
+
+        text = ' '.join(sentences[:max_sentences])
+        print(f'Corpus length: {len(text)} characters')
+
+        words = self.tokenizer.tokenize(text)
+        print(f'Total words: {len(words)}')
+
+        self.unique_words = np.unique(words)
+        print(f'Unique words: {len(self.unique_words)}')
+
+        # Store plain ``str`` rather than ``np.str_`` so that predicted words
+        # print as 'word' instead of np.str_('word') under NumPy >= 2.
+        self.word2id = {str(word): i for i, word in enumerate(self.unique_words)}
+        self.id2word = {i: word for word, i in self.word2id.items()}
+
+        return words
+
+    def create_sequences(self, words):
+        """Slide a ``word_length`` window over ``words``.
+
+        Args:
+            words: Token list as returned by :meth:`preprocess_text`.
+
+        Returns:
+            ``(prev_words, next_words)`` where ``prev_words[i]`` is a window
+            of ``word_length`` tokens and ``next_words[i]`` the token that
+            follows it.
+
+        Raises:
+            ValueError: If ``words`` is too short to yield a single sequence.
+        """
+        print("Creating training sequences...")
+        if len(words) <= self.word_length:
+            raise ValueError(
+                f"Need more than {self.word_length} words to build training "
+                f"sequences, got {len(words)}"
+            )
+        prev_words = []
+        next_words = []
+        for i in tqdm(range(len(words) - self.word_length), desc="Sequences"):
+            prev_words.append(words[i:i + self.word_length])
+            next_words.append(words[i + self.word_length])
+        print(f"Created {len(prev_words)} training sequences")
+        print(f"Example: {prev_words[0]} -> {next_words[0]}")
+        return prev_words, next_words
+
+    def vectorize_sequences(self, prev_words, next_words):
+        """One-hot encode the context windows and their target words.
+
+        Args:
+            prev_words: List of ``word_length``-token windows.
+            next_words: Target token for each window.
+
+        Returns:
+            ``(X, Y)`` boolean arrays of shape
+            ``(n, word_length, vocab_size)`` and ``(n, vocab_size)``.
+        """
+        print("Vectorizing sequences...")
+        vocab_size = len(self.unique_words)
+        X = np.zeros((len(prev_words), self.word_length, vocab_size), dtype=bool)
+        Y = np.zeros((len(next_words), vocab_size), dtype=bool)
+        pairs = enumerate(zip(prev_words, next_words))
+        for i, (context, target) in tqdm(pairs, total=len(prev_words), desc="Vectorizing"):
+            for j, word in enumerate(context):
+                X[i, j, self.word2id[word]] = True
+            Y[i, self.word2id[target]] = True
+        print(f"X shape: {X.shape}, Y shape: {Y.shape}")
+        return X, Y
+
+    def build_model(self):
+        """Build and compile the LSTM -> Dense -> softmax model.
+
+        Requires the vocabulary to have been built first, because the input
+        and output widths equal the vocabulary size.
+
+        Returns:
+            The compiled Keras model (also stored on ``self.model``).
+        """
+        print("Building model...")
+
+        vocab_size = len(self.unique_words)
+
+        self.model = Sequential()
+        self.model.add(LSTM(128, input_shape=(self.word_length, vocab_size)))
+        self.model.add(Dense(vocab_size))
+        self.model.add(Activation('softmax'))
+
+        optimizer = RMSprop(learning_rate=0.01)
+        self.model.compile(loss='categorical_crossentropy',
+                           optimizer=optimizer,
+                           metrics=['accuracy'])
+
+        print("Model built successfully!")
+        return self.model
+
+    def train_model(self, X, Y, epochs=2, batch_size=128, validation_split=0.05):
+        """Fit the model and remember its training history.
+
+        Args:
+            X: One-hot encoded context windows.
+            Y: One-hot encoded target words.
+            epochs: Number of passes over the data.
+            batch_size: Mini-batch size.
+            validation_split: Fraction of the data held out for validation.
+
+        Returns:
+            The Keras history dict (metric name -> per-epoch values); it is
+            also stored on ``self.history``.
+        """
+        print("Training model...")
+
+        history = self.model.fit(X, Y,
+                                 validation_split=validation_split,
+                                 batch_size=batch_size,
+                                 epochs=epochs,
+                                 shuffle=True)
+
+        print("Training completed!")
+        self.history = history.history
+        return self.history
+
+    def save_model(self, model_path='models/next_word_model.h5', history_path='models/history.p'):
+        """Save the trained model and, when available, its training history.
+
+        Missing parent directories are created. The history is pickled to
+        ``history_path`` only if :meth:`train_model` has run on this instance.
+
+        Args:
+            model_path: Destination file for the Keras model.
+            history_path: Destination file for the pickled history dict.
+
+        Raises:
+            ValueError: If there is no model to save.
+        """
+        if self.model is None:
+            raise ValueError("Model not trained or loaded")
+
+        self._ensure_parent_dir(model_path)
+        self.model.save(model_path)
+        print(f"Model saved to {model_path}")
+
+        if self.history is not None:
+            self._ensure_parent_dir(history_path)
+            with open(history_path, 'wb') as history_file:
+                pickle.dump(self.history, history_file)
+            print(f"History saved to {history_path}")
+
+    @staticmethod
+    def _ensure_parent_dir(path):
+        """Create the directory that will contain ``path`` if it is missing."""
+        parent = os.path.dirname(path)
+        if parent:
+            os.makedirs(parent, exist_ok=True)
+
+    def load_trained_model(self, model_path='models/next_word_model.h5'):
+        """Load a pre-trained model.
+
+        Only the network is restored; the vocabulary must still be rebuilt
+        with :meth:`preprocess_text` before predictions can be made.
+        """
+        self.model = load_model(model_path)
+        print(f"Model loaded from {model_path}")
+
+    def prepare_input(self, text):
+        """One-hot encode ``text`` as a single model input.
+
+        Words missing from the vocabulary are left as all-zero rows and a
+        warning is printed for each of them.
+
+        Args:
+            text: Input containing exactly ``word_length`` tokens.
+
+        Returns:
+            Float array of shape ``(1, word_length, vocab_size)``.
+
+        Raises:
+            ValueError: If the vocabulary has not been built, or ``text``
+                does not tokenize to exactly ``word_length`` words.
+        """
+        if self.unique_words is None or self.word2id is None:
+            raise ValueError("Vocabulary not loaded; run preprocess_text() first")
+
+        words = self.tokenizer.tokenize(text)
+        if len(words) != self.word_length:
+            raise ValueError(f"Input must contain exactly {self.word_length} words")
+
+        x = np.zeros((1, self.word_length, len(self.unique_words)))
+        for t, word in enumerate(words):
+            if word in self.word2id:
+                x[0, t, self.word2id[word]] = 1.0
+            else:
+                print(f"Warning: '{word}' not in vocabulary")
+
+        return x
+
+    def sample_predictions(self, preds, top_n=3):
+        """Return the indices of the ``top_n`` highest-scoring entries.
+
+        Args:
+            preds: 1-D sequence of scores (e.g. softmax probabilities).
+            top_n: Maximum number of indices to return.
+
+        Returns:
+            List of indices ordered from highest to lowest score.
+        """
+        preds = np.asarray(preds).astype('float64')
+        # Rank the raw scores directly: a log/exp round trip would not change
+        # the order, and log(0) raises a RuntimeWarning for zero scores.
+        return heapq.nlargest(top_n, range(len(preds)), key=preds.take)
+
+    def predict_next_word(self, text, n=3):
+        """Predict the ``n`` most likely next words for ``text``.
+
+        Args:
+            text: Input containing exactly ``word_length`` tokens.
+            n: Number of candidate words to return.
+
+        Returns:
+            List of words, most likely first.
+
+        Raises:
+            ValueError: If no model is available or the input is invalid.
+        """
+        if self.model is None:
+            raise ValueError("Model not trained or loaded")
+
+        x = self.prepare_input(text)
+        preds = self.model.predict(x, verbose=0)[0]
+        next_indices = self.sample_predictions(preds, n)
+
+        return [self.id2word[idx] for idx in next_indices]
+
+    def plot_training_history(self, history):
+        """Plot accuracy and loss curves side by side and show the figure.
+
+        Args:
+            history: Dict as returned by :meth:`train_model`. Validation
+                curves are drawn only when the ``val_*`` keys are present
+                (they are absent when training ran without validation).
+        """
+        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
+
+        panels = (
+            (ax1, 'accuracy', 'Accuracy'),
+            (ax2, 'loss', 'Loss'),
+        )
+        for axis, key, label in panels:
+            axis.plot(history[key], label=f'Training {label}')
+            val_key = f'val_{key}'
+            if val_key in history:
+                axis.plot(history[val_key], label=f'Validation {label}')
+            axis.set_title(f'Model {label}')
+            axis.set_xlabel('Epoch')
+            axis.set_ylabel(label)
+            axis.legend()
+
+        plt.tight_layout()
+        plt.show()
+
+
+def demo_predictions(predictor):
+    """Print the top-5 next-word guesses for a fixed set of sample prompts.
+
+    Args:
+        predictor: Object exposing ``predict_next_word(text, n=...)``.
+
+    A failure on one prompt is reported on stdout and does not stop the
+    remaining prompts from being tried.
+    """
+    banner = "=" * 50
+    print("\n" + banner)
+    print("PREDICTION EXAMPLES")
+    print(banner)
+
+    prompts = (
+        "I look forward to working",
+        "Thank you very much for",
+        "Could you please send me",
+        "I would like to schedule",
+        "Please let me know if",
+    )
+
+    for prompt in prompts:
+        try:
+            guesses = predictor.predict_next_word(prompt, n=5)
+            print(f"\nInput: '{prompt}'")
+            print(f"Predictions: {guesses}")
+        except Exception as err:
+            # Best-effort demo: report the problem and move on.
+            print(f"Error with '{prompt}': {err}")
+
+
+def main():
+    """Train on ``downloads/train.json``, save the model, plot and demo.
+
+    If the training file is missing, fall back to loading the pre-trained
+    model from its default path and print guidance instead of training.
+    """
+    print("Next Word Prediction with LSTM - BSD Corpus")
+    print("="*50)
+    print("[FLAG] Initializing predictor...")
+    predictor = NextWordPredictor(word_length=5)
+
+    # Only the data load is guarded: a FileNotFoundError raised by a later
+    # step (e.g. saving) must not be reported as missing training data.
+    try:
+        print("[FLAG] Loading data...")
+        json_file = "downloads/train.json"
+        sentences = predictor.load_data(json_file)
+    except FileNotFoundError:
+        print("Training data file not found. Loading pre-trained model...")
+        try:
+            predictor.load_trained_model()
+            print("Note: For demo purposes, you'll need to ensure vocabulary is loaded")
+        except Exception as e:
+            print(f"Error loading model: {e}")
+            print("Please ensure you have either:")
+            print("1. The training data file (train.json)")
+            print("2. A pre-trained model (models/next_word_model.h5) with vocabulary")
+        return
+
+    print("[FLAG] Preprocessing text...")
+    words = predictor.preprocess_text(sentences, max_sentences=10000)
+    print("[FLAG] Creating training sequences...")
+    prev_words, next_words = predictor.create_sequences(words)
+    print("[FLAG] Vectorizing sequences...")
+    X, Y = predictor.vectorize_sequences(prev_words, next_words)
+    print("[FLAG] Building model...")
+    predictor.build_model()
+    print("[FLAG] Training model...")
+    history = predictor.train_model(X, Y, epochs=2)
+    print("[FLAG] Saving model...")
+    predictor.save_model()
+    print("[FLAG] Plotting training history...")
+    predictor.plot_training_history(history)
+    print("[FLAG] Running demo predictions...")
+    demo_predictions(predictor)
+
+
+# Run the training/demo pipeline only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()