Skip to content

Commit fa9f3a7

Browse files
Copilot authored and bact committed
Add corpus integrity testing workflow and tests
Co-authored-by: bact <128572+bact@users.noreply.github.com>
1 parent 6f00243 commit fa9f3a7

File tree

4 files changed

+330
-0
lines changed

4 files changed

+330
-0
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

name: Corpus Integrity Test

on:
  push:
    paths:
      - ".github/workflows/corpus-integrity.yml"
      - "pythainlp/corpus/**"
      - "tests/corpus_integrity/**"
  pull_request:
    branches:
      - dev
    paths:
      - ".github/workflows/corpus-integrity.yml"
      - "pythainlp/corpus/**"
      - "tests/corpus_integrity/**"

# Avoid duplicate runs for the same source branch and repository
concurrency:
  group: >-
    ${{ github.workflow }}-${{
    github.event.pull_request.head.repo.full_name || github.repository
    }}-${{ github.head_ref || github.ref_name }}
  cancel-in-progress: true

jobs:
  corpus-integrity:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.13"
          cache: "pip"

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install .

      - name: Test built-in corpus files
        env:
          PYTHONIOENCODING: utf-8
        run: |
          python -m unittest discover -s tests/corpus_integrity -p "test_builtin_*.py" -v

      - name: Test downloadable corpus files
        env:
          PYTHONIOENCODING: utf-8
        run: |
          python -m unittest discover -s tests/corpus_integrity -p "test_downloadable_*.py" -v

tests/corpus_integrity/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""
Corpus integrity tests.

These tests verify the integrity, format, and parseability of corpus files.
They are separate from regular unit tests to avoid slowing down development
test cycles with large file downloads and parsing.
"""
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
2+
# SPDX-FileType: SOURCE
3+
# SPDX-License-Identifier: Apache-2.0
4+
"""
5+
Test integrity and parseability of built-in corpus files.
6+
7+
These tests verify that all corpus files included in the package
8+
can be loaded and parsed correctly.
9+
"""
10+
11+
import unittest
12+
13+
from pythainlp.corpus import (
14+
countries,
15+
find_synonyms,
16+
get_corpus,
17+
provinces,
18+
thai_family_names,
19+
thai_female_names,
20+
thai_icu_words,
21+
thai_male_names,
22+
thai_negations,
23+
thai_orst_words,
24+
thai_stopwords,
25+
thai_syllables,
26+
thai_synonyms,
27+
thai_volubilis_words,
28+
thai_wikipedia_titles,
29+
thai_words,
30+
ttc,
31+
)
32+
33+
34+
class BuiltinCorpusIntegrityTestCase(unittest.TestCase):
35+
"""Test integrity of built-in corpus files."""
36+
37+
def test_negations(self):
38+
"""Test thai_negations corpus can be loaded and is not empty."""
39+
result = thai_negations()
40+
self.assertIsInstance(result, frozenset)
41+
self.assertGreater(len(result), 0)
42+
# Verify it contains actual Thai content
43+
self.assertTrue(any('\u0e00' <= char <= '\u0e7f' for item in result for char in item))
44+
45+
def test_stopwords(self):
46+
"""Test thai_stopwords corpus can be loaded and is not empty."""
47+
result = thai_stopwords()
48+
self.assertIsInstance(result, frozenset)
49+
self.assertGreater(len(result), 0)
50+
# Verify it contains actual Thai content
51+
self.assertTrue(any('\u0e00' <= char <= '\u0e7f' for item in result for char in item))
52+
53+
def test_syllables(self):
54+
"""Test thai_syllables corpus can be loaded and is not empty."""
55+
result = thai_syllables()
56+
self.assertIsInstance(result, frozenset)
57+
self.assertGreater(len(result), 0)
58+
# Verify it contains actual Thai content
59+
self.assertTrue(any('\u0e00' <= char <= '\u0e7f' for item in result for char in item))
60+
61+
def test_words(self):
62+
"""Test thai_words corpus can be loaded and is not empty."""
63+
result = thai_words()
64+
self.assertIsInstance(result, frozenset)
65+
self.assertGreater(len(result), 0)
66+
# Verify it contains actual Thai content
67+
self.assertTrue(any('\u0e00' <= char <= '\u0e7f' for item in result for char in item))
68+
69+
def test_synonyms(self):
70+
"""Test thai_synonyms corpus can be loaded and parsed correctly."""
71+
result = thai_synonyms()
72+
self.assertIsInstance(result, dict)
73+
self.assertGreater(len(result), 0)
74+
# Test that find_synonyms works with the loaded data
75+
synonyms = find_synonyms("หมู")
76+
self.assertIsInstance(synonyms, list)
77+
self.assertGreater(len(synonyms), 0)
78+
79+
def test_icu_words(self):
80+
"""Test thai_icu_words corpus can be loaded and is not empty."""
81+
result = thai_icu_words()
82+
self.assertIsInstance(result, frozenset)
83+
self.assertGreater(len(result), 0)
84+
85+
def test_orst_words(self):
86+
"""Test thai_orst_words corpus can be loaded and is not empty."""
87+
result = thai_orst_words()
88+
self.assertIsInstance(result, frozenset)
89+
self.assertGreater(len(result), 0)
90+
91+
def test_volubilis_words(self):
92+
"""Test thai_volubilis_words corpus can be loaded and is not empty."""
93+
result = thai_volubilis_words()
94+
self.assertIsInstance(result, frozenset)
95+
self.assertGreater(len(result), 0)
96+
97+
def test_wikipedia_titles(self):
98+
"""Test thai_wikipedia_titles corpus can be loaded and is not empty."""
99+
result = thai_wikipedia_titles()
100+
self.assertIsInstance(result, frozenset)
101+
self.assertGreater(len(result), 0)
102+
103+
def test_countries(self):
104+
"""Test countries corpus can be loaded and is not empty."""
105+
result = countries()
106+
self.assertIsInstance(result, frozenset)
107+
self.assertGreater(len(result), 0)
108+
109+
def test_provinces(self):
110+
"""Test provinces corpus can be loaded and parsed correctly."""
111+
result = provinces()
112+
self.assertIsInstance(result, frozenset)
113+
self.assertGreater(len(result), 0)
114+
115+
# Test with details
116+
result_details = provinces(details=True)
117+
self.assertIsInstance(result_details, list)
118+
self.assertEqual(len(result), len(result_details))
119+
120+
def test_family_names(self):
121+
"""Test thai_family_names corpus can be loaded and is not empty."""
122+
result = thai_family_names()
123+
self.assertIsInstance(result, frozenset)
124+
self.assertGreater(len(result), 0)
125+
126+
def test_female_names(self):
127+
"""Test thai_female_names corpus can be loaded and is not empty."""
128+
result = thai_female_names()
129+
self.assertIsInstance(result, frozenset)
130+
self.assertGreater(len(result), 0)
131+
132+
def test_male_names(self):
133+
"""Test thai_male_names corpus can be loaded and is not empty."""
134+
result = thai_male_names()
135+
self.assertIsInstance(result, frozenset)
136+
self.assertGreater(len(result), 0)
137+
138+
def test_ttc_freq(self):
139+
"""Test TTC frequency corpus can be loaded and parsed correctly."""
140+
result = ttc.word_freqs()
141+
self.assertIsInstance(result, list)
142+
self.assertGreater(len(result), 0)
143+
# Verify format: list of (word, frequency) tuples
144+
for item in result[:10]: # Check first 10 items
145+
self.assertIsInstance(item, tuple)
146+
self.assertEqual(len(item), 2)
147+
self.assertIsInstance(item[0], str)
148+
self.assertIsInstance(item[1], int)
149+
150+
# Test unigram version
151+
result_unigram = ttc.unigram_word_freqs()
152+
self.assertIsInstance(result_unigram, dict)
153+
self.assertGreater(len(result_unigram), 0)
154+
155+
def test_tnc_freq(self):
156+
"""Test TNC frequency corpus can be loaded and parsed correctly."""
157+
# Test unigram from built-in file
158+
result = get_corpus("tnc_freq.txt")
159+
self.assertIsInstance(result, frozenset)
160+
self.assertGreater(len(result), 0)
161+
# Verify format: tab-separated word and frequency
162+
for line in list(result)[:10]: # Check first 10 lines
163+
parts = line.split('\t')
164+
self.assertGreaterEqual(len(parts), 2)
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
2+
# SPDX-FileType: SOURCE
3+
# SPDX-License-Identifier: Apache-2.0
4+
"""
5+
Test integrity and parseability of downloadable corpus files.
6+
7+
These tests verify that corpus files that need to be downloaded
8+
can be fetched and parsed correctly. These tests may take longer
9+
to run due to network downloads.
10+
"""
11+
12+
import unittest
13+
14+
from pythainlp.corpus import oscar, tnc
15+
16+
17+
class DownloadableCorpusIntegrityTestCase(unittest.TestCase):
18+
"""Test integrity of downloadable corpus files."""
19+
20+
def test_oscar_corpus(self):
21+
"""Test OSCAR corpus can be downloaded and parsed correctly."""
22+
# Test word_freqs
23+
result = oscar.word_freqs()
24+
self.assertIsNotNone(result)
25+
self.assertIsInstance(result, list)
26+
self.assertGreater(len(result), 0)
27+
28+
# Verify format: list of (word, frequency) tuples
29+
for item in result[:10]: # Check first 10 items
30+
self.assertIsInstance(item, tuple)
31+
self.assertEqual(len(item), 2)
32+
self.assertIsInstance(item[0], str)
33+
self.assertIsInstance(item[1], int)
34+
self.assertGreater(item[1], 0)
35+
36+
# Test unigram_word_freqs
37+
result_unigram = oscar.unigram_word_freqs()
38+
self.assertIsNotNone(result_unigram)
39+
self.assertIsInstance(result_unigram, dict)
40+
self.assertGreater(len(result_unigram), 0)
41+
42+
# Verify dict values are integers
43+
for word, freq in list(result_unigram.items())[:10]:
44+
self.assertIsInstance(word, str)
45+
self.assertIsInstance(freq, int)
46+
self.assertGreater(freq, 0)
47+
48+
def test_tnc_unigram(self):
49+
"""Test TNC unigram corpus can be loaded and parsed correctly."""
50+
result = tnc.word_freqs()
51+
self.assertIsNotNone(result)
52+
self.assertIsInstance(result, list)
53+
self.assertGreater(len(result), 0)
54+
55+
# Verify format
56+
for item in result[:10]:
57+
self.assertIsInstance(item, tuple)
58+
self.assertEqual(len(item), 2)
59+
self.assertIsInstance(item[0], str)
60+
self.assertIsInstance(item[1], int)
61+
62+
# Test unigram version
63+
result_unigram = tnc.unigram_word_freqs()
64+
self.assertIsNotNone(result_unigram)
65+
self.assertIsInstance(result_unigram, dict)
66+
self.assertGreater(len(result_unigram), 0)
67+
68+
def test_tnc_bigram(self):
69+
"""Test TNC bigram corpus can be downloaded and parsed correctly."""
70+
result = tnc.bigram_word_freqs()
71+
self.assertIsNotNone(result)
72+
self.assertIsInstance(result, dict)
73+
self.assertGreater(len(result), 0)
74+
75+
# Verify format: dict with tuple keys (word1, word2) -> frequency
76+
for key, freq in list(result.items())[:10]:
77+
self.assertIsInstance(key, tuple)
78+
self.assertEqual(len(key), 2)
79+
self.assertIsInstance(key[0], str)
80+
self.assertIsInstance(key[1], str)
81+
self.assertIsInstance(freq, int)
82+
self.assertGreater(freq, 0)
83+
84+
def test_tnc_trigram(self):
85+
"""Test TNC trigram corpus can be downloaded and parsed correctly."""
86+
result = tnc.trigram_word_freqs()
87+
self.assertIsNotNone(result)
88+
self.assertIsInstance(result, dict)
89+
self.assertGreater(len(result), 0)
90+
91+
# Verify format: dict with tuple keys (word1, word2, word3) -> frequency
92+
for key, freq in list(result.items())[:10]:
93+
self.assertIsInstance(key, tuple)
94+
self.assertEqual(len(key), 3)
95+
self.assertIsInstance(key[0], str)
96+
self.assertIsInstance(key[1], str)
97+
self.assertIsInstance(key[2], str)
98+
self.assertIsInstance(freq, int)
99+
self.assertGreater(freq, 0)

0 commit comments

Comments
 (0)