Merge branch 'master' of github.com:nipunsadvilkar/pySBD into master

nipunsadvilkar · nipunsadvilkar · commit 441c935d68f9 · 2021-02-11T22:07:38.000+05:30
* 'master' of github.com:nipunsadvilkar/pySBD:
  🐛 Fix trailing period/ellipses with spaces
diff --git a/pysbd/about.py b/pysbd/about.py
@@ -2,7 +2,7 @@
 # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
 
 __title__ = "pysbd"
-__version__ = "0.3.3"
+__version__ = "0.3.4"
 __summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
 __uri__ = "http://nipunsadvilkar.github.io/"
 __author__ = "Nipun Sadvilkar"
diff --git a/pysbd/segmenter.py b/pysbd/segmenter.py
@@ -61,18 +61,18 @@ def sentences_with_char_spans(self, sentences):
         # for trailing whitespaces \s* & is used as suffix
         # to keep non-destructive text after segments joins
         sent_spans = []
-        prior_start_char_idx = 0
+        prior_end_char_idx = 0
         for sent in sentences:
-            for match in re.finditer(r'{0}\s*'.format(re.escape(sent)), self.original_text):
+            for match in re.finditer('{0}\s*'.format(re.escape(sent)), self.original_text):
                 match_str = match.group()
                 match_start_idx, match_end_idx = match.span()
-                if match_start_idx >= prior_start_char_idx:
+                if match_end_idx > prior_end_char_idx:
                     # making sure if current sentence and its span
                     # is either first sentence along with its char spans
                     # or current sent spans adjacent to prior sentence spans
                     sent_spans.append(
                         TextSpan(match_str, match_start_idx, match_end_idx))
-                    prior_start_char_idx = match_start_idx
+                    prior_end_char_idx = match_end_idx
                     break
         return sent_spans
 
diff --git a/tests/regression/test_issues.py b/tests/regression/test_issues.py
@@ -66,7 +66,13 @@
 [('This eBook is for the use of anyone anywhere at no cost\n', 0, 56),
  ('you may copy it, give it away or re-use it under the terms of the this license\n', 56, 135)]),
 ('#78', 'Sentence. .. Next sentence. Next next sentence.',
-[('Sentence. ', 0, 10), ('.. ', 10, 13), ('Next sentence. ', 13, 28), ('Next next sentence.', 28, 47)])
+[('Sentence. ', 0, 10), ('.. ', 10, 13), ('Next sentence. ', 13, 28), ('Next next sentence.', 28, 47)]),
+('#83', 'Maissen se chargea du reste .. Logiquement,',
+[('Maissen se chargea du reste .', 0, 29), ('. ', 29, 31), ('Logiquement,', 31, 43)]),
+('#83', 'Maissen se chargea du reste ... Logiquement,',
+[('Maissen se chargea du reste ... ', 0, 32), ('Logiquement,', 32, 44)]),
+pytest.param('#83', 'Maissen se chargea du reste .... Logiquement,',
+[('Maissen se chargea du reste .', 0, 29), ('... ', 29, 33), ('Logiquement,', 33, 45)], marks=pytest.mark.xfail)
 ]
 
 @pytest.mark.parametrize('issue_no,text,expected_sents', TEST_ISSUE_DATA)