diff --git a/vaporetto/src/kytea_model.rs b/vaporetto/src/kytea_model.rs index bf7451f..f15f006 100644 --- a/vaporetto/src/kytea_model.rs +++ b/vaporetto/src/kytea_model.rs @@ -479,7 +479,7 @@ impl TryFrom for Model { } let mut type_ngrams = vec![]; - for (type_ngram, v) in type_dict.dump_items() { + 'a: for (type_ngram, v) in type_dict.dump_items() { let weight_size = config.type_w as usize * 2 - type_ngram.len() + 1; let mut ngram = type_ngram .into_iter() @@ -494,6 +494,12 @@ impl TryFrom for Model { b'T' => CharacterType::Katakana as u8, b'K' => CharacterType::Kanji as u8, b'O' => CharacterType::Other as u8, + // https://github.com/daac-tools/vaporetto/issues/110 + // Some models distributed on KyTea's web site contain the invalid character + // type `0x04`. The following supports them. + 4 => { + continue 'a; + } t => { return Err(VaporettoError::invalid_model(format!( "unsupported character type: {t}"