Skip to content

Commit 4613e4f

Browse files
author
Michal Bida
committed
czech: Add czech language support
- Based on the Slovak language, which is very close to Czech, I have created initial support for Czech sentence splitting
1 parent 5905f13 commit 4613e4f

File tree

4 files changed

+142
-1
lines changed

4 files changed

+142
-1
lines changed

pysbd/lang/czech.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# -*- coding: utf-8 -*-
2+
import re
3+
from pysbd.abbreviation_replacer import AbbreviationReplacer
4+
from pysbd.between_punctuation import BetweenPunctuation
5+
from pysbd.lang.common import Common, Standard
6+
from pysbd.processor import Processor
7+
from pysbd.utils import Text
8+
from pysbd.punctuation_replacer import replace_punctuation
9+
from pysbd.lists_item_replacer import ListItemReplacer
10+
11+
12+
class Czech(Common, Standard):
13+
14+
iso_code = 'cz'
15+
16+
class ListItemReplacer(ListItemReplacer):
17+
18+
def add_line_break(self):
    """Run the list formatters used for Czech and return the resulting text.

    Alphabetical list parsing is intentionally left out: it caused a lot of
    problems with abbreviations made of several periods and spaces, such as
    'Company name s. r. o.', so skipping it is the accepted tradeoff.

    Returns:
        ``self.text`` after the roman-numeral, numbered-with-periods and
        numbered-with-parens formatters have run, in that order.
    """
    list_formatters = (
        self.format_roman_numeral_lists,
        self.format_numbered_list_with_periods,
        self.format_numbered_list_with_parens,
    )
    for apply_formatter in list_formatters:
        apply_formatter()
    return self.text
28+
29+
class AbbreviationReplacer(AbbreviationReplacer):
30+
SENTENCE_STARTERS = []
31+
32+
def replace_period_of_abbr(self, txt, abbr):
    """Mask every period of an abbreviation occurrence in ``txt``.

    This is a deliberately simple replacement for the original function: it
    replaces all periods of the abbreviation, not only the last one. Czech
    uses many multi-period abbreviations such as 'Company Name s. r. o.',
    so each of their periods has to be handled.

    Args:
        txt: Text to search.
        abbr: Abbreviation without its final period (e.g. ``'s. r. o'``).

    Returns:
        ``txt`` with each occurrence of ``abbr + '.'`` rewritten so that all
        of its periods become ``∯``.
    """
    dotted = abbr + "."
    masked = dotted.replace(".", "∯")
    return txt.replace(dotted, masked)
41+
42+
class Abbreviation(Standard.Abbreviation):
    # Abbreviations are stored lowercase and without their final period.
    # Multi-part abbreviations are mostly listed both spaced ('s. r. o') and
    # unspaced ('s.r.o').
    # NOTE(review): 'ing' and 'a. r. k' each appear twice, and 'atď.' carries
    # a trailing period unlike every other entry ('atď' is also listed).
    # Possibly 'a.r.k' was intended for one of the 'a. r. k' entries -- confirm.
    ABBREVIATIONS = [
        'č', 'no', 'nr', 's. r. o', 'ing', 'p', 'a. d', 'o. k', 'pol. pr', 'a. s. a. p',
        'p. n. l', 'red', 'o.k', 'a.d', 'm.o', 'pol.pr', 'a.s.a.p', 'p.n.l', 'pp', 'sl',
        'corp', 'plgr', 'tz', 'rtg', 'o.c.p', 'o. c. p', 'c.k', 'c. k', 'n.a', 'n. a',
        'a.m', 'a. m', 'vz', 'i.b', 'i. b', 'ú.p.v.o', 'ú. p. v. o', 'bros', 'rsdr', 'doc',
        'tu', 'ods', 'n.w.a', 'n. w. a', 'nár', 'pedg', 'paeddr', 'rndr', 'naprk', 'napřk',
        'a.g.p', 'a. g. p', 'prof', 'pr', 'př', 'a.v', 'a. v', 'por', 'mvdr', 'nešp',
        'u.s', 'u. s', 'kt', 'vyd', 'e.t', 'e. t', 'al', 'll.m', 'll. m', 'o.f.i',
        'o. f. i', 'mr', 'apod', 'súkr', 'střed', 's.e.g', 's. e. g', 'sr', 'tvz', 'ind',
        'var', 'etc', 'atd', 'n.o', 'n. o', 's.a', 's. a', 'např', 'a.i.i', 'a. i. i',
        'a.k.a', 'a. k. a', 'konkr', 'čsl', 'odd', 'ltd', 't.z', 't. z', 'o.z', 'o. z',
        'obv', 'obr', 'pok', 'tel', 'št', 'skr', 'phdr', 'xx', 'š.p', 'š. p',
        'ph.d', 'ph. d', 'm.n.m', 'm. n. m', 'zz', 'roz', 'atď.', 'ev', 'v.sp', 'v. sp',
        'drsc', 'mudr', 't.č', 't. č', 'el', 'os', 'co', 'r.o', 'r. o', 'str',
        'p.a', 'p. a', 'zdravot', 'prek', 'gen', 'viď', 'dr', 'cca', 'p.s', 'p. s',
        'zák', 'slov', 'arm', 'inc', 'max', 'd.c', 'k.o', 'a. r. k', 'd. c', 'k. o',
        'a. r. k', 'soc', 'bc', 'zs', 'akad', 'sz', 'pozn', 'tr', 'nám', 'kol',
        'csc', 'ul', 'sp', 'o.i', 'jr', 'zb', 'sv', 'tj', 'čs', 'tzn',
        'příp', 'iv', 'hl', 'st', 'pod', 'vi', 'tis', 'stor', 'rozh', 'mld',
        'atď', 'mgr', 'a.s', 'a. s', 'phd', 'z.z', 'z. z', 'judr', 'ing', 'hod',
        'vs', 'písm', 's.r.o', 'min', 'ml', 'iii', 't.j', 't. j', 'spol', 'mil',
        'ii', 'napr', 'resp', 'tzv',
    ]
    # Presumably abbreviations (mostly titles) that precede a name -- TODO
    # confirm against how the base abbreviation replacer consumes this list.
    PREPOSITIVE_ABBREVIATIONS = [
        'st', 'p', 'dr', 'mudr', 'judr', 'ing', 'mgr', 'bc', 'drsc', 'doc', 'prof',
    ]
    # Presumably abbreviations that are followed by a number -- TODO confirm.
    NUMBER_ABBREVIATIONS = ['č', 'no', 'nr']
46+
47+
class BetweenPunctuation(BetweenPunctuation):
48+
# Rubular: https://rubular.com/r/rImWbaYFtHHtf4
49+
BETWEEN_CZECH_DOUBLE_QUOTES_REGEX = r'„(?>[^“\\]+|\\{2}|\\.)*“'
50+
BETWEEN_CZECH_DOUBLE_QUOTES_REGEX_2 = r'\„(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)\“'
51+
52+
def sub_punctuation_between_czech_double_quotes(self, txt):
    """Apply ``replace_punctuation`` to every Czech-quoted („…“) span of ``txt``.

    Args:
        txt: Text to scan.

    Returns:
        ``txt`` with each match of ``BETWEEN_CZECH_DOUBLE_QUOTES_REGEX_2``
        substituted by the result of ``replace_punctuation``.
    """
    czech_quotes = re.compile(self.BETWEEN_CZECH_DOUBLE_QUOTES_REGEX_2)
    return czech_quotes.sub(replace_punctuation, txt)
54+
55+
def sub_punctuation_between_quotes_and_parens(self, txt):
    """Run every between-punctuation substitution over ``txt`` in a fixed order.

    The Czech double-quote substitution runs last, after the other
    substitutions provided on this object.

    Args:
        txt: Text to process.

    Returns:
        The text after all substitutions have been applied in sequence.
    """
    substitutions = (
        self.sub_punctuation_between_single_quotes,
        self.sub_punctuation_between_single_quote_slanted,
        self.sub_punctuation_between_double_quotes,
        self.sub_punctuation_between_square_brackets,
        self.sub_punctuation_between_parens,
        self.sub_punctuation_between_quotes_arrow,
        self.sub_punctuation_between_em_dashes,
        self.sub_punctuation_between_quotes_slanted,
        self.sub_punctuation_between_czech_double_quotes,
    )
    for substitute in substitutions:
        txt = substitute(txt)
    return txt
66+
67+
class Processor(Processor):
68+
69+
def __init__(self, text, lang, char_span=False):
    """Initialise the processor by delegating unchanged to the parent class.

    Args:
        text: Text to segment.
        lang: Language class supplying the rules.
        char_span: Forwarded as-is to the parent initialiser.
    """
    super().__init__(text, lang, char_span)
71+
72+
def process(self):
    """Run the Czech segmentation pipeline over ``self.text``.

    Returns:
        ``self.text`` unchanged when it is empty/falsy; otherwise whatever
        ``self.split_into_segments()`` returns after all replacement steps.
    """
    if not self.text:
        return self.text

    # Line feeds are turned into carriage returns before any rule runs.
    self.text = self.text.replace('\n', '\r')

    # Here we use the language-specific ListItemReplacer.
    self.text = self.lang.ListItemReplacer(self.text).add_line_break()

    self.replace_abbreviations()
    self.replace_numbers()
    self.replace_continuous_punctuation()
    self.replace_periods_before_numeric_references()

    rules = (
        self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
        self.lang.GeoLocationRule,
        self.lang.FileFormatRule,
    )
    self.text = Text(self.text).apply(*rules)
    return self.split_into_segments()
90+
91+
def replace_numbers(self):
    """Apply the number rules, then the Czech-specific period masking steps.

    Returns:
        ``self.text`` after ``self.lang.Numbers.All`` has been applied and
        the date, ordinal-numeral and roman-numeral steps have run, in that
        order.
    """
    self.text = Text(self.text).apply(*self.lang.Numbers.All)
    czech_steps = (
        self.replace_period_in_czech_dates,
        self.replace_period_in_ordinal_numerals,
        self.replace_period_in_roman_numerals,
    )
    for mask_periods in czech_steps:
        mask_periods()
    return self.text
97+
98+
def replace_period_in_ordinal_numerals(self):
    """Mask the period of an ordinal numeral followed by a lowercase word.

    A period that directly follows a digit and is followed by optional
    whitespace and a lowercase letter (e.g. ``10. pokus``) is replaced in
    ``self.text`` with the placeholder ``∯``. Periods followed by an
    uppercase letter are left untouched.

    Updates ``self.text`` in place and returns ``None``.
    """
    # Rubular: https://rubular.com/r/0HkmvzMGTqgWs6
    # The lookahead lists the Czech lowercase letters with diacritics in
    # addition to a-z: a plain [a-z] misses ordinals in front of words such
    # as "část", "článek" or "řada".
    self.text = re.sub(r'(?<=\d)\.(?=\s*[a-záčďéěíňóřšťúůýž])', '∯', self.text)
101+
102+
def replace_period_in_roman_numerals(self):
    """Mask the period that follows a roman numeral made of V, X and I.

    A run of the letters V/X/I (any case) that is preceded by whitespace or
    sits at the very start of the text, and is followed by a period and
    whitespace, has that period replaced with the placeholder ``∯``.

    Updates ``self.text`` in place and returns ``None``.
    """
    # Rubular: https://rubular.com/r/XlzTIi7aBRThSl
    # flags must be passed by keyword: the fourth positional parameter of
    # re.sub is ``count``, so a positional re.IGNORECASE (== 2) caps the
    # substitution at two matches and leaves the match case-sensitive.
    self.text = re.sub(
        r'((\s+[VXI]+)|(^[VXI]+))(\.)(?=\s+)',
        r'\1∯',
        self.text,
        flags=re.IGNORECASE,
    )
105+
106+
def replace_period_in_czech_dates(self):
    """Mask the period between a day number and a Czech month name.

    For each listed month form, a period that directly follows a digit and
    is followed by optional whitespace and that month name (e.g.
    ``7. dubna``) is replaced in ``self.text`` with the placeholder ``∯``.

    Updates ``self.text`` in place and returns ``None``.
    """
    month_names = (
        # Base forms, e.g. "20. březen".
        'leden', 'únor', 'březen', 'duben', 'květen', 'červen',
        'červenec', 'srpen', 'září', 'říjen', 'listopad', 'prosinec',
        # Inflected forms as used after a day number, e.g. "7. dubna".
        'ledna', 'února', 'března', 'dubna', 'května', 'června',
        'července', 'srpna', 'října', 'listopadu', 'prosince',
    )
    # Rubular: https://rubular.com/r/dGLZqsbjcdJvCd
    for name in month_names:
        day_period = r'(?<=\d)\.(?=\s*' + name + ')'
        self.text = re.sub(day_period, '∯', self.text)

pysbd/languages.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from pysbd.lang.deutsch import Deutsch
2323
from pysbd.lang.kazakh import Kazakh
2424
from pysbd.lang.slovak import Slovak
25+
from pysbd.lang.czech import Czech
2526

2627
LANGUAGE_CODES = {
2728
'en': English,
@@ -46,7 +47,8 @@
4647
'ja': Japanese,
4748
'de': Deutsch,
4849
'kk': Kazakh,
49-
'sk': Slovak
50+
'sk': Slovak,
51+
'cz': Czech
5052
}
5153

5254

tests/conftest.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,3 +146,8 @@ def kk_default_fixture():
146146
def sk_default_fixture():
147147
sk_segmenter = pysbd.Segmenter(language="sk", clean=False, char_span=False)
148148
return sk_segmenter
149+
150+
@pytest.fixture()
def cz_default_fixture():
    """Provide a ``pysbd.Segmenter`` for language "cz" with ``clean`` and ``char_span`` off."""
    return pysbd.Segmenter(language="cz", clean=False, char_span=False)

tests/lang/test_czech.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# -*- coding: utf-8 -*-
2+
import pytest
3+
4+
# Each entry is (input text, expected list of segments). Every case below is
# expected to stay a single sentence.
GOLDEN_CZ_RULES_TEST_CASES = [
    # Multi-period abbreviations ("s. r. o.", "např.", "a.s.").
    (
        "Jde o majite firmy ABTrade s. r. o., kteří stojí i za dalšími společnostmi, např. XYZCorp a.s.",
        ["Jde o majite firmy ABTrade s. r. o., kteří stojí i za dalšími společnostmi, např. XYZCorp a.s."],
    ),
    # Period inside Czech double quotes („…“).
    (
        "„Průzkumy beru na lehkou váhu. V podstatě mě to nezajímá,“ reagoval Zeman na průzkum agentury Focus.",
        ["„Průzkumy beru na lehkou váhu. V podstatě mě to nezajímá,“ reagoval Zeman na průzkum agentury Focus."],
    ),
    # Ordinal numeral followed by a lowercase word.
    (
        "Toto se mi podařilo až na 10. pokus, ale stálo to za to.",
        ["Toto se mi podařilo až na 10. pokus, ale stálo to za to."],
    ),
    # Roman numeral followed by a period.
    (
        "Jde o príslušníky XII. Pluku speciálního nasazení.",
        ["Jde o príslušníky XII. Pluku speciálního nasazení."],
    ),
    # Dates written as day number, period, month name.
    (
        "Společnost byla založena 7. dubna 2020, na smlouvě však figuruje datum 20. březen 2020.",
        ["Společnost byla založena 7. dubna 2020, na smlouvě však figuruje datum 20. březen 2020."],
    ),
]
16+
17+
18+
@pytest.mark.parametrize('text,expected_sents', GOLDEN_CZ_RULES_TEST_CASES)
def test_pl_sbd(cz_default_fixture, text, expected_sents):
    """Check Czech sentence boundary detection against the golden cases.

    Segments are compared after stripping surrounding whitespace.
    """
    # NOTE(review): the "pl" in the function name looks like a leftover from
    # another language's test module; this test exercises the Czech fixture.
    stripped = [segment.strip() for segment in cz_default_fixture.segment(text)]
    assert stripped == expected_sents

0 commit comments

Comments
 (0)