Skip to content

Commit

Permalink
Fix edge cases in split_sentence
Browse files (browse the repository at this point in the history)
  • Loading branch information
hankcs committed Jun 15, 2022
1 parent 584ce7e commit c9317ae
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 1 deletion.
6 changes: 5 additions & 1 deletion hanlp/utils/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@ def split_sentence(text, best=True):
yield chunk
continue
processed = replace_with_separator(chunk, SEPARATOR, [AB_SENIOR, AB_ACRONYM])
for sentence in RE_SENTENCE.finditer(processed):
sents = list(RE_SENTENCE.finditer(processed))
if not sents:
yield chunk
continue
for sentence in sents:
sentence = replace_with_separator(sentence.group(), r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM])
yield sentence

Expand Down
17 changes: 17 additions & 0 deletions tests/test_rules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-03-22 17:17
import unittest

from hanlp.utils.rules import split_sentence


class TestRules(unittest.TestCase):
    """Regression tests for the rule-based ``split_sentence`` helper."""

    def test_eos(self):
        """Check the sentence boundaries produced for three edge-case inputs."""
        # A single character with no terminating punctuation must still be
        # yielded as one sentence instead of producing an empty result.
        self.assertListEqual(list(split_sentence('叶')), ['叶'])
        # The closing quotation mark stays attached to the quoted sentence;
        # the trailing text without a terminator becomes its own sentence.
        self.assertListEqual(list(split_sentence('他说:“加油。”谢谢')), ['他说:“加油。”', '谢谢'])
        # The period inside the domain name is not a sentence boundary; only
        # the periods followed by whitespace or end of text split the input.
        self.assertListEqual(list(split_sentence('Go to hankcs.com. Yes.')), ['Go to hankcs.com.', 'Yes.'])


if __name__ == '__main__':
    # Run every test case in this module when the file is executed directly.
    unittest.main()

0 comments on commit c9317ae

Please sign in to comment.