fix user-defined POS in OOV handlers bug for user dictionaries (#214)

WorksApplications · Jun 17, 2022 · 6a77652 · 6a77652
1 parent df1b04d
commit 6a77652
Show file tree

Hide file tree

Showing 7 changed files with 48 additions and 13 deletions.
diff --git a/sudachi/src/dic/lexicon_set.rs b/sudachi/src/dic/lexicon_set.rs
@@ -42,17 +42,19 @@ pub enum LexiconSetError {
 pub struct LexiconSet<'a> {
     lexicons: Vec<Lexicon<'a>>,
     pos_offsets: Vec<usize>,
+    num_system_pos: usize,
 }
 
 impl<'a> LexiconSet<'a> {
     /// Creates a LexiconSet given a lexicon
     ///
     /// It is assumed that the passed lexicon is the system dictionary
-    pub fn new(mut system_lexicon: Lexicon) -> LexiconSet {
+    pub fn new(mut system_lexicon: Lexicon, num_system_pos: usize) -> LexiconSet {
         system_lexicon.set_dic_id(0);
         LexiconSet {
             lexicons: vec![system_lexicon],
             pos_offsets: vec![0],
+            num_system_pos,
         }
     }
 
@@ -111,10 +113,10 @@ impl LexiconSet<'_> {
             .into();
 
         if subset.contains(InfoSubset::POS_ID) {
-            let pos_id = word_info.pos_id;
-            if dict_id > 0 && pos_id as usize >= self.pos_offsets[1] {
+            let pos_id = word_info.pos_id as usize;
+            if dict_id > 0 && pos_id >= self.num_system_pos {
                 // user defined part-of-speech
-                word_info.pos_id = (pos_id as usize - self.pos_offsets[1]
+                word_info.pos_id = (pos_id as usize - self.num_system_pos
                     + self.pos_offsets[dict_id as usize]) as u16;
             }
         }

diff --git a/sudachi/src/dic/mod.rs b/sudachi/src/dic/mod.rs
@@ -64,9 +64,10 @@ impl<'a> LoadedDictionary<'a> {
             .ok_or(SudachiError::InvalidDictionaryGrammar)?;
         grammar.set_character_category(character_category);
 
+        let num_system_pos = grammar.pos_list.len();
         Ok(LoadedDictionary {
             grammar,
-            lexicon_set: LexiconSet::new(system_dict.lexicon),
+            lexicon_set: LexiconSet::new(system_dict.lexicon, num_system_pos),
         })
     }
 
@@ -169,10 +170,13 @@ impl<'a> DictionaryLoader<'a> {
         lexicon.set_dic_id(0);
         match self.grammar {
             None => None,
-            Some(grammar) => Some(LoadedDictionary {
-                grammar,
-                lexicon_set: LexiconSet::new(lexicon),
-            }),
+            Some(grammar) => {
+                let num_system_pos = grammar.pos_list.len();
+                Some(LoadedDictionary {
+                    grammar,
+                    lexicon_set: LexiconSet::new(lexicon, num_system_pos),
+                })
+            }
         }
     }
 }
diff --git a/sudachi/tests/common/mod.rs b/sudachi/tests/common/mod.rs
@@ -199,6 +199,12 @@ pub struct TestStatefulTokenizer {
 #[allow(unused)]
 pub const LEX_CSV: &[u8] = include_bytes!("../resources/lex.csv");
 
+#[allow(unused)]
+pub const USER1_CSV: &[u8] = include_bytes!("../resources/user1.csv");
+
+#[allow(unused)]
+pub const USER2_CSV: &[u8] = include_bytes!("../resources/user2.csv");
+
 #[allow(unused)]
 impl TestStatefulTokenizer {
     pub fn new_built(mode: Mode) -> TestStatefulTokenizer {

diff --git a/sudachi/tests/regex_oov.rs b/sudachi/tests/regex_oov.rs
@@ -14,7 +14,7 @@
  *  limitations under the License.
  */
 
-use crate::common::{TestStatefulTokenizer, LEX_CSV};
+use crate::common::{TestStatefulTokenizer, LEX_CSV, USER1_CSV, USER2_CSV};
 use std::ops::Deref;
 
 mod common;
@@ -73,3 +73,20 @@ fn very_long_word_not_added() {
     assert_eq!(&data, tokens.get(0).surface().deref());
     assert_eq!("数詞", tokens.get(0).part_of_speech()[1])
 }
+
+#[test]
+fn user_dictionaries_have_correct_pos() {
+    let mut tok = TestStatefulTokenizer::builder(LEX_CSV)
+        .config(REGEX_CONFIG)
+        .user(USER1_CSV)
+        .user(USER2_CSV)
+        .build();
+    let tokens = tok.tokenize("すだちASDF12かぼす");
+    assert_eq!(3, tokens.len());
+    assert_eq!("すだち", tokens.get(0).surface().deref());
+    assert_eq!("スダチ", tokens.get(0).part_of_speech()[5]);
+    assert_eq!("ASDF12", tokens.get(1).surface().deref());
+    assert_eq!("REGEX", tokens.get(1).part_of_speech()[5]);
+    assert_eq!("かぼす", tokens.get(2).surface().deref());
+    assert_eq!("カボス", tokens.get(2).part_of_speech()[5]);
+}
diff --git a/sudachi/tests/resources/sudachi.regex.json b/sudachi/tests/resources/sudachi.regex.json
@@ -22,9 +22,9 @@
         "名詞",
         "普通名詞",
         "REGEX",
-        "*",
-        "*",
-        "*"
+        "REGEX",
+        "REGEX",
+        "REGEX"
       ],
       "leftId": 5,
       "rightId": 5,

diff --git a/sudachi/tests/resources/user1.csv b/sudachi/tests/resources/user1.csv
@@ -0,0 +1,4 @@
+ぴらる,8,8,-32768,ぴらる,名詞,普通名詞,一般,*,*,*,ピラル,ぴらる,*,A,*,*,*,*
+府,8,8,2914,府,名詞,普通名詞,一般,*,*,*,フ,府,*,A,*,*,*,*
+東京府,6,6,2816,東京府,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,*,B,5/U1,*,5/U1,1/3
+すだち,6,6,2816,すだち,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,すだち,*,A,*,*,*,*
diff --git a/sudachi/tests/resources/user2.csv b/sudachi/tests/resources/user2.csv
@@ -0,0 +1,2 @@
+ぴさる,8,8,-32768,ぴさる,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,*,A,*,*,*,*
+かぼす,6,6,2816,かぼす,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,*,A,*,*,*,*
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		ぴさる,8,8,-32768,ぴさる,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,*,A,*,*,*,*
		かぼす,6,6,2816,かぼす,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,*,A,*,*,*,*