Skip to content

Commit f0e71a5

Browse files
🐛 Fix consecutive period bug
Reference allenai/scholarphi#114
1 parent fc61aef commit f0e71a5

File tree

2 files changed

+6
-4
lines changed

2 files changed

+6
-4
lines changed

pysbd/lang/common/common.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,9 @@
44

55
class Common(object):
66

7-
# added special case: r"[。..!!?].*" to handle intermittent dots, exclamation, etc.
8-
SENTENCE_BOUNDARY_REGEX = r"((?:[^)])*)(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。..!!??].*|\S.*?[。..!!??ȸȹ☉☈☇☄]"
7+
# added special case: r"[。..!!? ]{2,}" to handle intermittent dots, exclamation, etc.
8+
# r"[。..!!?]" added at the end to handle inputs that consist of a single one of these symbols
9+
SENTENCE_BOUNDARY_REGEX = r"((?:[^)])*)(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。..!!?? ]{2,}|\S.*?[。..!!??ȸȹ☉☈☇☄]|[。..!!??]"
910

1011
# # Rubular: http://rubular.com/r/NqCqv372Ix
1112
QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[!?\.-][\"\'“”]\s{1}[A-Z]'

tests/regression/test_issues.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -64,8 +64,9 @@
6464
you may copy it, give it away or re-use it under the terms of the this license
6565
""",
6666
[('This eBook is for the use of anyone anywhere at no cost\n', 0, 56),
67-
('you may copy it, give it away or re-use it under the terms of the this license\n', 56, 135)])
68-
67+
('you may copy it, give it away or re-use it under the terms of the this license\n', 56, 135)]),
68+
('#78', 'Sentence. .. Next sentence. Next next sentence.',
69+
[('Sentence. ', 0, 10), ('.. ', 10, 13), ('Next sentence. ', 13, 28), ('Next next sentence.', 28, 47)])
6970
]
7071

7172
@pytest.mark.parametrize('issue_no,text,expected_sents', TEST_ISSUE_DATA)

0 commit comments

Comments
 (0)