|
| 1 | +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project |
| 2 | +# SPDX-FileType: SOURCE |
| 3 | +# SPDX-License-Identifier: Apache-2.0 |
| 4 | +""" |
| 5 | +Test integrity and parseability of built-in corpus files. |
| 6 | +
|
| 7 | +These tests verify that all corpus files included in the package |
| 8 | +can be loaded and parsed correctly. |
| 9 | +""" |
| 10 | + |
| 11 | +import unittest |
| 12 | + |
| 13 | +from pythainlp.corpus import ( |
| 14 | + countries, |
| 15 | + find_synonyms, |
| 16 | + get_corpus, |
| 17 | + provinces, |
| 18 | + thai_family_names, |
| 19 | + thai_female_names, |
| 20 | + thai_icu_words, |
| 21 | + thai_male_names, |
| 22 | + thai_negations, |
| 23 | + thai_orst_words, |
| 24 | + thai_stopwords, |
| 25 | + thai_syllables, |
| 26 | + thai_synonyms, |
| 27 | + thai_volubilis_words, |
| 28 | + thai_wikipedia_titles, |
| 29 | + thai_words, |
| 30 | + ttc, |
| 31 | +) |
| 32 | + |
| 33 | + |
class BuiltinCorpusIntegrityTestCase(unittest.TestCase):
    """Test integrity of built-in corpus files.

    Each test loads one corpus through its public accessor and checks the
    returned container type and that it is not empty. Some tests also
    check the content (Thai characters present) or the record format.
    """

    def _assert_nonempty_frozenset(self, result):
        """Assert that *result* is a frozenset holding at least one item."""
        self.assertIsInstance(result, frozenset)
        self.assertGreater(len(result), 0)

    def _assert_contains_thai(self, result):
        """Assert that some item of *result* has a char in U+0E00..U+0E7F."""
        self.assertTrue(
            any(
                "\u0e00" <= char <= "\u0e7f"
                for item in result
                for char in item
            ),
            "corpus contains no character in the range U+0E00..U+0E7F",
        )

    def test_negations(self):
        """Test thai_negations corpus can be loaded and is not empty."""
        result = thai_negations()
        self._assert_nonempty_frozenset(result)
        # Verify it contains actual Thai content
        self._assert_contains_thai(result)

    def test_stopwords(self):
        """Test thai_stopwords corpus can be loaded and is not empty."""
        result = thai_stopwords()
        self._assert_nonempty_frozenset(result)
        # Verify it contains actual Thai content
        self._assert_contains_thai(result)

    def test_syllables(self):
        """Test thai_syllables corpus can be loaded and is not empty."""
        result = thai_syllables()
        self._assert_nonempty_frozenset(result)
        # Verify it contains actual Thai content
        self._assert_contains_thai(result)

    def test_words(self):
        """Test thai_words corpus can be loaded and is not empty."""
        result = thai_words()
        self._assert_nonempty_frozenset(result)
        # Verify it contains actual Thai content
        self._assert_contains_thai(result)

    def test_synonyms(self):
        """Test thai_synonyms corpus can be loaded and parsed correctly."""
        result = thai_synonyms()
        self.assertIsInstance(result, dict)
        self.assertGreater(len(result), 0)
        # Test that find_synonyms works with the loaded data
        synonyms = find_synonyms("หมู")
        self.assertIsInstance(synonyms, list)
        self.assertGreater(len(synonyms), 0)

    def test_icu_words(self):
        """Test thai_icu_words corpus can be loaded and is not empty."""
        self._assert_nonempty_frozenset(thai_icu_words())

    def test_orst_words(self):
        """Test thai_orst_words corpus can be loaded and is not empty."""
        self._assert_nonempty_frozenset(thai_orst_words())

    def test_volubilis_words(self):
        """Test thai_volubilis_words corpus can be loaded and is not empty."""
        self._assert_nonempty_frozenset(thai_volubilis_words())

    def test_wikipedia_titles(self):
        """Test thai_wikipedia_titles corpus can be loaded and is not empty."""
        self._assert_nonempty_frozenset(thai_wikipedia_titles())

    def test_countries(self):
        """Test countries corpus can be loaded and is not empty."""
        self._assert_nonempty_frozenset(countries())

    def test_provinces(self):
        """Test provinces corpus can be loaded and parsed correctly."""
        result = provinces()
        self._assert_nonempty_frozenset(result)

        # Test with details: the detailed list must have one entry per
        # item of the plain set.
        result_details = provinces(details=True)
        self.assertIsInstance(result_details, list)
        self.assertEqual(len(result), len(result_details))

    def test_family_names(self):
        """Test thai_family_names corpus can be loaded and is not empty."""
        self._assert_nonempty_frozenset(thai_family_names())

    def test_female_names(self):
        """Test thai_female_names corpus can be loaded and is not empty."""
        self._assert_nonempty_frozenset(thai_female_names())

    def test_male_names(self):
        """Test thai_male_names corpus can be loaded and is not empty."""
        self._assert_nonempty_frozenset(thai_male_names())

    def test_ttc_freq(self):
        """Test TTC frequency corpus can be loaded and parsed correctly."""
        result = ttc.word_freqs()
        self.assertIsInstance(result, list)
        self.assertGreater(len(result), 0)
        # Verify format: list of (word, frequency) tuples
        for item in result[:10]:  # Check first 10 items
            self.assertIsInstance(item, tuple)
            self.assertEqual(len(item), 2)
            self.assertIsInstance(item[0], str)
            self.assertIsInstance(item[1], int)

        # Test unigram version
        result_unigram = ttc.unigram_word_freqs()
        self.assertIsInstance(result_unigram, dict)
        self.assertGreater(len(result_unigram), 0)

    def test_tnc_freq(self):
        """Test TNC frequency corpus can be loaded and parsed correctly."""
        # Imported here so the file's top-level import block is untouched.
        from itertools import islice

        # Test unigram from built-in file
        result = get_corpus("tnc_freq.txt")
        self._assert_nonempty_frozenset(result)
        # Verify format: tab-separated word and frequency.
        # islice takes the first 10 lines in the set's iteration order
        # without copying the whole corpus into a list first.
        for line in islice(result, 10):
            parts = line.split("\t")
            self.assertGreaterEqual(len(parts), 2)
0 commit comments