Skip to content

Commit fa9f3a7

Browse files
Copilot authored and bact committed
Add corpus integrity testing workflow and tests
Co-authored-by: bact <128572+bact@users.noreply.github.com>
1 parent 6f00243 commit fa9f3a7

File tree

4 files changed

+330
-0
lines changed

4 files changed

+330
-0
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

name: Corpus Integrity Test

on:
  push:
    paths:
      - ".github/workflows/corpus-integrity.yml"
      - "pythainlp/corpus/**"
      - "tests/corpus_integrity/**"
  pull_request:
    branches:
      - dev
    paths:
      - ".github/workflows/corpus-integrity.yml"
      - "pythainlp/corpus/**"
      - "tests/corpus_integrity/**"

# Avoid duplicate runs for the same source branch and repository
concurrency:
  group: >-
    ${{ github.workflow }}-${{
    github.event.pull_request.head.repo.full_name || github.repository
    }}-${{ github.head_ref || github.ref_name }}
  cancel-in-progress: true

jobs:
  corpus-integrity:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.13"
          cache: "pip"

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install .

      - name: Test built-in corpus files
        env:
          PYTHONIOENCODING: utf-8
        run: |
          python -m unittest discover -s tests/corpus_integrity -p "test_builtin_*.py" -v

      - name: Test downloadable corpus files
        env:
          PYTHONIOENCODING: utf-8
        run: |
          python -m unittest discover -s tests/corpus_integrity -p "test_downloadable_*.py" -v

tests/corpus_integrity/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""
Corpus integrity tests.

These tests verify the integrity, format, and parseability of corpus files.
They are separate from regular unit tests to avoid slowing down development
test cycles with large file downloads and parsing.
"""
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
2+
# SPDX-FileType: SOURCE
3+
# SPDX-License-Identifier: Apache-2.0
4+
"""
5+
Test integrity and parseability of built-in corpus files.
6+
7+
These tests verify that all corpus files included in the package
8+
can be loaded and parsed correctly.
9+
"""
10+
11+
import unittest
12+
13+
from pythainlp.corpus import (
14+
countries,
15+
find_synonyms,
16+
get_corpus,
17+
provinces,
18+
thai_family_names,
19+
thai_female_names,
20+
thai_icu_words,
21+
thai_male_names,
22+
thai_negations,
23+
thai_orst_words,
24+
thai_stopwords,
25+
thai_syllables,
26+
thai_synonyms,
27+
thai_volubilis_words,
28+
thai_wikipedia_titles,
29+
thai_words,
30+
ttc,
31+
)
32+
33+
34+
class BuiltinCorpusIntegrityTestCase(unittest.TestCase):
35+
"""Test integrity of built-in corpus files."""
36+
37+
def test_negations(self):
38+
"""Test thai_negations corpus can be loaded and is not empty."""
39+
result = thai_negations()
40+
self.assertIsInstance(result, frozenset)
41+
self.assertGreater(len(result), 0)
42+
# Verify it contains actual Thai content
43+
self.assertTrue(any('\u0e00' <= char <= '\u0e7f' for item in result for char in item))
44+
45+
def test_stopwords(self):
46+
"""Test thai_stopwords corpus can be loaded and is not empty."""
47+
result = thai_stopwords()
48+
self.assertIsInstance(result, frozenset)
49+
self.assertGreater(len(result), 0)
50+
# Verify it contains actual Thai content
51+
self.assertTrue(any('\u0e00' <= char <= '\u0e7f' for item in result for char in item))
52+
53+
def test_syllables(self):
54+
"""Test thai_syllables corpus can be loaded and is not empty."""
55+
result = thai_syllables()
56+
self.assertIsInstance(result, frozenset)
57+
self.assertGreater(len(result), 0)
58+
# Verify it contains actual Thai content
59+
self.assertTrue(any('\u0e00' <= char <= '\u0e7f' for item in result for char in item))
60+
61+
def test_words(self):
62+
"""Test thai_words corpus can be loaded and is not empty."""
63+
result = thai_words()
64+
self.assertIsInstance(result, frozenset)
65+
self.assertGreater(len(result), 0)
66+
# Verify it contains actual Thai content
67+
self.assertTrue(any('\u0e00' <= char <= '\u0e7f' for item in result for char in item))
68+
69+
def test_synonyms(self):
70+
"""Test thai_synonyms corpus can be loaded and parsed correctly."""
71+
result = thai_synonyms()
72+
self.assertIsInstance(result, dict)
73+
self.assertGreater(len(result), 0)
74+
# Test that find_synonyms works with the loaded data
75+
synonyms = find_synonyms("หมู")
76+
self.assertIsInstance(synonyms, list)
77+
self.assertGreater(len(synonyms), 0)
78+
79+
def test_icu_words(self):
80+
"""Test thai_icu_words corpus can be loaded and is not empty."""
81+
result = thai_icu_words()
82+
self.assertIsInstance(result, frozenset)
83+
self.assertGreater(len(result), 0)
84+
85+
def test_orst_words(self):
86+
"""Test thai_orst_words corpus can be loaded and is not empty."""
87+
result = thai_orst_words()
88+
self.assertIsInstance(result, frozenset)
89+
self.assertGreater(len(result), 0)
90+
91+
def test_volubilis_words(self):
92+
"""Test thai_volubilis_words corpus can be loaded and is not empty."""
93+
result = thai_volubilis_words()
94+
self.assertIsInstance(result, frozenset)
95+
self.assertGreater(len(result), 0)
96+
97+
def test_wikipedia_titles(self):
98+
"""Test thai_wikipedia_titles corpus can be loaded and is not empty."""
99+
result = thai_wikipedia_titles()
100+
self.assertIsInstance(result, frozenset)
101+
self.assertGreater(len(result), 0)
102+
103+
def test_countries(self):
104+
"""Test countries corpus can be loaded and is not empty."""
105+
result = countries()
106+
self.assertIsInstance(result, frozenset)
107+
self.assertGreater(len(result), 0)
108+
109+
def test_provinces(self):
110+
"""Test provinces corpus can be loaded and parsed correctly."""
111+
result = provinces()
112+
self.assertIsInstance(result, frozenset)
113+
self.assertGreater(len(result), 0)
114+
115+
# Test with details
116+
result_details = provinces(details=True)
117+
self.assertIsInstance(result_details, list)
118+
self.assertEqual(len(result), len(result_details))
119+
120+
def test_family_names(self):
121+
"""Test thai_family_names corpus can be loaded and is not empty."""
122+
result = thai_family_names()
123+
self.assertIsInstance(result, frozenset)
124+
self.assertGreater(len(result), 0)
125+
126+
def test_female_names(self):
127+
"""Test thai_female_names corpus can be loaded and is not empty."""
128+
result = thai_female_names()
129+
self.assertIsInstance(result, frozenset)
130+
self.assertGreater(len(result), 0)
131+
132+
def test_male_names(self):
133+
"""Test thai_male_names corpus can be loaded and is not empty."""
134+
result = thai_male_names()
135+
self.assertIsInstance(result, frozenset)
136+
self.assertGreater(len(result), 0)
137+
138+
def test_ttc_freq(self):
139+
"""Test TTC frequency corpus can be loaded and parsed correctly."""
140+
result = ttc.word_freqs()
141+
self.assertIsInstance(result, list)
142+
self.assertGreater(len(result), 0)
143+
# Verify format: list of (word, frequency) tuples
144+
for item in result[:10]: # Check first 10 items
145+
self.assertIsInstance(item, tuple)
146+
self.assertEqual(len(item), 2)
147+
self.assertIsInstance(item[0], str)
148+
self.assertIsInstance(item[1], int)
149+
150+
# Test unigram version
151+
result_unigram = ttc.unigram_word_freqs()
152+
self.assertIsInstance(result_unigram, dict)
153+
self.assertGreater(len(result_unigram), 0)
154+
155+
def test_tnc_freq(self):
156+
"""Test TNC frequency corpus can be loaded and parsed correctly."""
157+
# Test unigram from built-in file
158+
result = get_corpus("tnc_freq.txt")
159+
self.assertIsInstance(result, frozenset)
160+
self.assertGreater(len(result), 0)
161+
# Verify format: tab-separated word and frequency
162+
for line in list(result)[:10]: # Check first 10 lines
163+
parts = line.split('\t')
164+
self.assertGreaterEqual(len(parts), 2)
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
2+
# SPDX-FileType: SOURCE
3+
# SPDX-License-Identifier: Apache-2.0
4+
"""
5+
Test integrity and parseability of downloadable corpus files.
6+
7+
These tests verify that corpus files that need to be downloaded
8+
can be fetched and parsed correctly. These tests may take longer
9+
to run due to network downloads.
10+
"""
11+
12+
import unittest
13+
14+
from pythainlp.corpus import oscar, tnc
15+
16+
17+
class DownloadableCorpusIntegrityTestCase(unittest.TestCase):
18+
"""Test integrity of downloadable corpus files."""
19+
20+
def test_oscar_corpus(self):
21+
"""Test OSCAR corpus can be downloaded and parsed correctly."""
22+
# Test word_freqs
23+
result = oscar.word_freqs()
24+
self.assertIsNotNone(result)
25+
self.assertIsInstance(result, list)
26+
self.assertGreater(len(result), 0)
27+
28+
# Verify format: list of (word, frequency) tuples
29+
for item in result[:10]: # Check first 10 items
30+
self.assertIsInstance(item, tuple)
31+
self.assertEqual(len(item), 2)
32+
self.assertIsInstance(item[0], str)
33+
self.assertIsInstance(item[1], int)
34+
self.assertGreater(item[1], 0)
35+
36+
# Test unigram_word_freqs
37+
result_unigram = oscar.unigram_word_freqs()
38+
self.assertIsNotNone(result_unigram)
39+
self.assertIsInstance(result_unigram, dict)
40+
self.assertGreater(len(result_unigram), 0)
41+
42+
# Verify dict values are integers
43+
for word, freq in list(result_unigram.items())[:10]:
44+
self.assertIsInstance(word, str)
45+
self.assertIsInstance(freq, int)
46+
self.assertGreater(freq, 0)
47+
48+
def test_tnc_unigram(self):
49+
"""Test TNC unigram corpus can be loaded and parsed correctly."""
50+
result = tnc.word_freqs()
51+
self.assertIsNotNone(result)
52+
self.assertIsInstance(result, list)
53+
self.assertGreater(len(result), 0)
54+
55+
# Verify format
56+
for item in result[:10]:
57+
self.assertIsInstance(item, tuple)
58+
self.assertEqual(len(item), 2)
59+
self.assertIsInstance(item[0], str)
60+
self.assertIsInstance(item[1], int)
61+
62+
# Test unigram version
63+
result_unigram = tnc.unigram_word_freqs()
64+
self.assertIsNotNone(result_unigram)
65+
self.assertIsInstance(result_unigram, dict)
66+
self.assertGreater(len(result_unigram), 0)
67+
68+
def test_tnc_bigram(self):
69+
"""Test TNC bigram corpus can be downloaded and parsed correctly."""
70+
result = tnc.bigram_word_freqs()
71+
self.assertIsNotNone(result)
72+
self.assertIsInstance(result, dict)
73+
self.assertGreater(len(result), 0)
74+
75+
# Verify format: dict with tuple keys (word1, word2) -> frequency
76+
for key, freq in list(result.items())[:10]:
77+
self.assertIsInstance(key, tuple)
78+
self.assertEqual(len(key), 2)
79+
self.assertIsInstance(key[0], str)
80+
self.assertIsInstance(key[1], str)
81+
self.assertIsInstance(freq, int)
82+
self.assertGreater(freq, 0)
83+
84+
def test_tnc_trigram(self):
85+
"""Test TNC trigram corpus can be downloaded and parsed correctly."""
86+
result = tnc.trigram_word_freqs()
87+
self.assertIsNotNone(result)
88+
self.assertIsInstance(result, dict)
89+
self.assertGreater(len(result), 0)
90+
91+
# Verify format: dict with tuple keys (word1, word2, word3) -> frequency
92+
for key, freq in list(result.items())[:10]:
93+
self.assertIsInstance(key, tuple)
94+
self.assertEqual(len(key), 3)
95+
self.assertIsInstance(key[0], str)
96+
self.assertIsInstance(key[1], str)
97+
self.assertIsInstance(key[2], str)
98+
self.assertIsInstance(freq, int)
99+
self.assertGreater(freq, 0)

0 commit comments

Comments
 (0)