Skip to content

Commit 441c935

Browse files
Merge branch 'master' of github.com:nipunsadvilkar/pySBD into master
* 'master' of github.com:nipunsadvilkar/pySBD: 🐛 Fix trailing period/ellipses with spaces
2 parents 9328b41 + 699910f commit 441c935

File tree

3 files changed

+12
-6
lines changed

3 files changed

+12
-6
lines changed

pysbd/about.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
33

44
__title__ = "pysbd"
5-
__version__ = "0.3.3"
5+
__version__ = "0.3.4"
66
__summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
77
__uri__ = "http://nipunsadvilkar.github.io/"
88
__author__ = "Nipun Sadvilkar"

pysbd/segmenter.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -61,18 +61,18 @@ def sentences_with_char_spans(self, sentences):
6161
# \s* is appended as a suffix so each match also captures trailing whitespace,
6262
# to keep non-destructive text after segments joins
6363
sent_spans = []
64-
prior_start_char_idx = 0
64+
prior_end_char_idx = 0
6565
for sent in sentences:
66-
for match in re.finditer(r'{0}\s*'.format(re.escape(sent)), self.original_text):
66+
for match in re.finditer('{0}\s*'.format(re.escape(sent)), self.original_text):
6767
match_str = match.group()
6868
match_start_idx, match_end_idx = match.span()
69-
if match_start_idx >= prior_start_char_idx:
69+
if match_end_idx > prior_end_char_idx:
7070
# making sure that the current sentence and its span
7171
# is either first sentence along with its char spans
7272
# or current sent spans adjacent to prior sentence spans
7373
sent_spans.append(
7474
TextSpan(match_str, match_start_idx, match_end_idx))
75-
prior_start_char_idx = match_start_idx
75+
prior_end_char_idx = match_end_idx
7676
break
7777
return sent_spans
7878

tests/regression/test_issues.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,13 @@
6666
[('This eBook is for the use of anyone anywhere at no cost\n', 0, 56),
6767
('you may copy it, give it away or re-use it under the terms of the this license\n', 56, 135)]),
6868
('#78', 'Sentence. .. Next sentence. Next next sentence.',
69-
[('Sentence. ', 0, 10), ('.. ', 10, 13), ('Next sentence. ', 13, 28), ('Next next sentence.', 28, 47)])
69+
[('Sentence. ', 0, 10), ('.. ', 10, 13), ('Next sentence. ', 13, 28), ('Next next sentence.', 28, 47)]),
70+
('#83', 'Maissen se chargea du reste .. Logiquement,',
71+
[('Maissen se chargea du reste .', 0, 29), ('. ', 29, 31), ('Logiquement,', 31, 43)]),
72+
('#83', 'Maissen se chargea du reste ... Logiquement,',
73+
[('Maissen se chargea du reste ... ', 0, 32), ('Logiquement,', 32, 44)]),
74+
pytest.param('#83', 'Maissen se chargea du reste .... Logiquement,',
75+
[('Maissen se chargea du reste .', 0, 29), ('... ', 29, 33), ('Logiquement,', 33, 45)], marks=pytest.mark.xfail)
7076
]
7177

7278
@pytest.mark.parametrize('issue_no,text,expected_sents', TEST_ISSUE_DATA)

0 commit comments

Comments
 (0)