Skip to content

Commit

Permalink
fix user-defined POS in OOV handlers bug for user dictionaries (#214)
Browse files Browse the repository at this point in the history
  • Loading branch information
eiennohito authored Jun 17, 2022
1 parent df1b04d commit 6a77652
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 13 deletions.
10 changes: 6 additions & 4 deletions sudachi/src/dic/lexicon_set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,19 @@ pub enum LexiconSetError {
/// A set of lexicons: the system lexicon plus any further dictionaries, together with the
/// bookkeeping needed to remap part-of-speech ids of non-system dictionaries.
pub struct LexiconSet<'a> {
/// Member lexicons; `new` places the system lexicon at index 0.
lexicons: Vec<Lexicon<'a>>,
/// Part-of-speech id offsets, indexed by dictionary id. The entry for the system
/// dictionary (index 0) is 0.
pos_offsets: Vec<usize>,
/// Number of part-of-speech entries belonging to the system dictionary, as passed to `new`.
/// A part-of-speech id at or above this value coming from a dictionary with id > 0 is
/// treated as user-defined and shifted by that dictionary's entry in `pos_offsets`.
num_system_pos: usize,
}

impl<'a> LexiconSet<'a> {
/// Creates a LexiconSet given a lexicon
///
/// It is assumed that the passed lexicon is the system dictionary
///
/// `num_system_pos` is stored unchanged; it is expected to be the number of
/// part-of-speech entries defined by the system dictionary's grammar.
pub fn new(mut system_lexicon: Lexicon) -> LexiconSet {
pub fn new(mut system_lexicon: Lexicon, num_system_pos: usize) -> LexiconSet {
// The system lexicon is always registered under dictionary id 0.
system_lexicon.set_dic_id(0);
LexiconSet {
lexicons: vec![system_lexicon],
// The system dictionary's part-of-speech ids need no shifting, hence offset 0.
pos_offsets: vec![0],
num_system_pos,
}
}

Expand Down Expand Up @@ -111,10 +113,10 @@ impl LexiconSet<'_> {
.into();

if subset.contains(InfoSubset::POS_ID) {
let pos_id = word_info.pos_id;
if dict_id > 0 && pos_id as usize >= self.pos_offsets[1] {
let pos_id = word_info.pos_id as usize;
if dict_id > 0 && pos_id >= self.num_system_pos {
// user defined part-of-speech
word_info.pos_id = (pos_id as usize - self.pos_offsets[1]
word_info.pos_id = (pos_id as usize - self.num_system_pos
+ self.pos_offsets[dict_id as usize]) as u16;
}
}
Expand Down
14 changes: 9 additions & 5 deletions sudachi/src/dic/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,10 @@ impl<'a> LoadedDictionary<'a> {
.ok_or(SudachiError::InvalidDictionaryGrammar)?;
grammar.set_character_category(character_category);

let num_system_pos = grammar.pos_list.len();
Ok(LoadedDictionary {
grammar,
lexicon_set: LexiconSet::new(system_dict.lexicon),
lexicon_set: LexiconSet::new(system_dict.lexicon, num_system_pos),
})
}

Expand Down Expand Up @@ -169,10 +170,13 @@ impl<'a> DictionaryLoader<'a> {
lexicon.set_dic_id(0);
match self.grammar {
None => None,
Some(grammar) => Some(LoadedDictionary {
grammar,
lexicon_set: LexiconSet::new(lexicon),
}),
Some(grammar) => {
let num_system_pos = grammar.pos_list.len();
Some(LoadedDictionary {
grammar,
lexicon_set: LexiconSet::new(lexicon, num_system_pos),
})
}
}
}
}
6 changes: 6 additions & 0 deletions sudachi/tests/common/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,12 @@ pub struct TestStatefulTokenizer {
// NOTE(review): `allow(unused)` on these constants is presumably needed because not every
// test binary that includes this module references each of them — confirm.

/// Raw bytes of `../resources/lex.csv`, embedded into the test binary at compile time.
#[allow(unused)]
pub const LEX_CSV: &[u8] = include_bytes!("../resources/lex.csv");

/// Raw bytes of `../resources/user1.csv`, embedded into the test binary at compile time.
#[allow(unused)]
pub const USER1_CSV: &[u8] = include_bytes!("../resources/user1.csv");

/// Raw bytes of `../resources/user2.csv`, embedded into the test binary at compile time.
#[allow(unused)]
pub const USER2_CSV: &[u8] = include_bytes!("../resources/user2.csv");

#[allow(unused)]
impl TestStatefulTokenizer {
pub fn new_built(mode: Mode) -> TestStatefulTokenizer {
Expand Down
19 changes: 18 additions & 1 deletion sudachi/tests/regex_oov.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

use crate::common::{TestStatefulTokenizer, LEX_CSV};
use crate::common::{TestStatefulTokenizer, LEX_CSV, USER1_CSV, USER2_CSV};
use std::ops::Deref;

mod common;
Expand Down Expand Up @@ -73,3 +73,20 @@ fn very_long_word_not_added() {
assert_eq!(&data, tokens.get(0).surface().deref());
assert_eq!("数詞", tokens.get(0).part_of_speech()[1])
}

/// Builds a tokenizer from the system lexicon, the regex OOV configuration and two user
/// dictionaries, then checks that the input splits into three tokens and that each token
/// has the expected surface and the expected sixth part-of-speech component.
#[test]
fn user_dictionaries_have_correct_pos() {
    // (token index, expected surface, expected `part_of_speech()[5]`), in token order.
    let expected = [
        (0, "すだち", "スダチ"),
        (1, "ASDF12", "REGEX"),
        (2, "かぼす", "カボス"),
    ];

    let builder = TestStatefulTokenizer::builder(LEX_CSV)
        .config(REGEX_CONFIG)
        .user(USER1_CSV)
        .user(USER2_CSV);
    let mut tokenizer = builder.build();
    let morphemes = tokenizer.tokenize("すだちASDF12かぼす");

    assert_eq!(3, morphemes.len());
    for &(index, surface, pos_component) in expected.iter() {
        assert_eq!(surface, morphemes.get(index).surface().deref());
        assert_eq!(pos_component, morphemes.get(index).part_of_speech()[5]);
    }
}
6 changes: 3 additions & 3 deletions sudachi/tests/resources/sudachi.regex.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@
"名詞",
"普通名詞",
"REGEX",
"*",
"*",
"*"
"REGEX",
"REGEX",
"REGEX"
],
"leftId": 5,
"rightId": 5,
Expand Down
4 changes: 4 additions & 0 deletions sudachi/tests/resources/user1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ぴらる,8,8,-32768,ぴらる,名詞,普通名詞,一般,*,*,*,ピラル,ぴらる,*,A,*,*,*,*
府,8,8,2914,府,名詞,普通名詞,一般,*,*,*,フ,府,*,A,*,*,*,*
東京府,6,6,2816,東京府,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,*,B,5/U1,*,5/U1,1/3
すだち,6,6,2816,すだち,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,すだち,*,A,*,*,*,*
2 changes: 2 additions & 0 deletions sudachi/tests/resources/user2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ぴさる,8,8,-32768,ぴさる,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,*,A,*,*,*,*
かぼす,6,6,2816,かぼす,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,*,A,*,*,*,*

0 comments on commit 6a77652

Please sign in to comment.