Skip to content

Commit

Permalink
refactor: restructure;
Browse files Browse the repository at this point in the history
  • Loading branch information
codybloemhard committed Aug 16, 2022
1 parent 1d4f8bf commit d665f3b
Show file tree
Hide file tree
Showing 5 changed files with 570 additions and 551 deletions.
299 changes: 299 additions & 0 deletions src/japanese.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,299 @@
pub fn romanize(string: &str) -> String{
let mut res = String::new();
let chars: Vec<char> = string.chars().collect();
if chars.is_empty() { return res; }
let lm1 = chars.len() - 1;
let mut i = 0;
let mut prev = chars[0];
let mut tsu = false;

fn push(res: &mut String, roman: &str, tsu: &mut bool, prev: &mut char){
if *tsu{
let next = roman.chars().next();
if let Some(next) = next{
res.push_str(&next.to_string());
}
*tsu = false;
}
res.push_str(roman);
if let Some(last) = roman.chars().last(){ *prev = last; }
}

while i < lm1{
let a = chars[i];
let b = chars[i + 1];
let comb = format!("{}{}", a, b);
if let Hepburn::Roman(roman) = Hepburn::from(&comb){
push(&mut res, &roman, &mut tsu, &mut prev);
i += 2;
continue;
}
let a = a.to_string();
match Hepburn::from(&a){
Hepburn::Roman(roman) => push(&mut res, &roman, &mut tsu, &mut prev),
Hepburn::SmallTsu => tsu = true,
Hepburn::Enlongate => res.push(prev),
Hepburn::Fail => res.push_str(&a),
}
i += 1;
}
if i == lm1{
let last = chars[lm1].to_string();
if last == "っ"{
res.push('h');
} else if let Hepburn::Roman(roman) = Hepburn::from(&last){
push(&mut res, &roman, &mut tsu, &mut prev);
} else {
res.push_str(&last);
}
}
res
}

pub enum Hepburn{ Roman(String), SmallTsu, Enlongate, Fail }

impl Hepburn{
fn from(string: &str) -> Self{
let temp = match string{
"あ" => "a", "ア" => "a", "い" => "i", "イ" => "i", "う" => "u", "ウ" => "u",
"え" => "e", "エ" => "e", "お" => "o", "オ" => "o",
"か" => "ka", "カ" => "ka", "き" => "ki", "キ" => "ki", "く" => "ku", "ク" => "ku",
"け" => "ke", "ケ" => "ke", "こ" => "ko", "コ" => "ko",
"さ" => "sa", "サ" => "sa", "し" => "shi", "シ" => "shi", "す" => "su", "ス" => "su",
"せ" => "se", "セ" => "se", "そ" => "so", "ソ" => "so",
"た" => "ta", "タ" => "ta", "ち" => "chi", "チ" => "chi", "つ" => "tsu", "ツ" => "tsu",
"て" => "te", "テ" => "te", "と" => "to", "ト" => "to", "な" => "na", "ナ" => "na",
"に" => "ni", "ニ" => "ni", "ぬ" => "nu", "ヌ" => "nu", "ね" => "ne", "ネ" => "ne",
"の" => "no", "ノ" => "no",
"は" => "ha", "ハ" => "ha", "ひ" => "hi", "ヒ" => "hi", "ふ" => "fu", "フ" => "fu",
"へ" => "he", "ヘ" => "he", "ほ" => "ho", "ホ" => "ho",
"ま" => "ma", "マ" => "ma", "み" => "mi", "ミ" => "mi", "む" => "mu", "ム" => "mu",
"め" => "me", "メ" => "me", "も" => "mo", "モ" => "mo",
"や" => "ya", "ヤ" => "ya", "ゆ" => "yu", "ユ" => "yu", "よ" => "yo", "ヨ" => "yo",
"ら" => "ra", "ラ" => "ra", "り" => "ri", "リ" => "ri", "る" => "ru", "ル" => "ru",
"れ" => "re", "レ" => "re", "ろ" => "ro", "ロ" => "ro",
"わ" => "wa", "ワ" => "wa", "ゐ" => "i", "を" => "o", "ヲ" => "o", "ん" => "n", "ン" => "n",
"が" => "ga", "ガ" => "ga", "ぎ" => "gi", "ギ" => "gi", "ぐ" => "gu", "グ" => "gu",
"げ" => "ge", "ゲ" => "ge", "ご" => "go", "ゴ" => "go", "ざ" => "za", "ザ" => "za",
"じ" => "ji", "ジ" => "ji", "ず" => "zu", "ズ" => "zu",
"ぜ" => "ze", "ゼ" => "ze", "ぞ" => "zo", "ゾ" => "zo",
"だ" => "da", "ダ" => "da", "ぢ" => "ji", "ヂ" => "ji", "づ" => "zu", "ヅ" => "zu",
"で" => "de", "デ" => "de", "ど" => "do", "ド" => "do",
"ば" => "ba", "バ" => "ba", "び" => "bi", "ビ" => "bi", "ぶ" => "bu", "ブ" => "bu",
"べ" => "be", "ベ" => "be", "ぼ" => "bo", "ボ" => "bo",
"ぱ" => "pa", "パ" => "pa", "ぴ" => "pi", "ピ" => "pi", "ぷ" => "pu", "プ" => "pu",
"ぺ" => "pe", "ペ" => "pe", "ぽ" => "po", "ポ" => "po",
"きゃ" => "kya", "キャ" => "kya", "きゅ" => "kyu", "キュ" => "kyu",
"きょ" => "kyo", "キョ" => "kyo",
"しゃ" => "sha", "シャ" => "sha", "しゅ" => "shu", "シュ" => "shu",
"しょ" => "sho", "ショ" => "sho",
"ちゃ" => "cha", "チャ" => "cha", "ちゅ" => "chu", "チュ" => "chu",
"ちょ" => "cho", "チョ" => "cho",
"にゃ" => "nya", "ニャ" => "nya", "にゅ" => "nyu", "ニュ" => "nyu",
"にょ" => "nyo", "ニョ" => "nyo",
"ひゃ" => "hya", "ヒャ" => "hya", "ひゅ" => "hyu", "ヒュ" => "hyu",
"ひょ" => "hyo", "ヒョ" => "hyo",
"みゃ" => "mya", "ミャ" => "mya", "みゅ" => "myu", "ミュ" => "myu",
"みょ" => "myo", "ミョ" => "myo",
"りゃ" => "rya", "リャ" => "rya", "りゅ" => "ryu", "リュ" => "ryu",
"りょ" => "ryo", "リョ" => "ryo",
"ぎゃ" => "gya", "ギャ" => "gya", "ぎゅ" => "gyu", "ギュ" => "gyu",
"ぎょ" => "gyo", "ギョ" => "gyo",
"じゃ" => "ja", "ジャ" => "ja", "じゅ" => "ju", "ジュ" => "ju",
"じょ" => "jo", "ジョ" => "jo",
"びゃ" => "bya", "ビャ" => "bya", "びゅ" => "byu", "ビュ" => "byu",
"びょ" => "byo", "ビョ" => "byo",
"ぴゃ" => "pya", "ピャ" => "pya", "ぴゅ" => "pyu", "ピュ" => "pyu",
"ぴょ" => "pyo", "ピョ" => "pyo",
"〜" => "~", "?" => "?", "!" => "!", " " => " ", "「" => "\"", "」" => "\"",
"。" => ".", "、" => ",",
"っ" => "_", "ッ" => "_",
"ー" => "-",
_ => "",
}.to_string();
match temp.as_str(){
"" => Self::Fail,
"_" => Self::SmallTsu,
"-" => Self::Enlongate,
_ => Self::Roman(temp),
}
}
}

pub fn map_kanjis(strings: &[String], subs: &[[String; 2]]) -> Vec<String>{
let mut replaceds = strings.to_vec();
for [replacee, replacant] in subs{
for (i, string) in replaceds.iter().enumerate(){
let new = string.replacen(replacee, replacant, 1);
if &new != string {
replaceds[i] = new;
break;
}
}
}
replaceds
}

pub fn could_contain_kanji(strings: &[String]) -> bool{
for string in strings{
for c in string.chars(){
if could_be_kanji(c) { return true; }
}
}
false
}

pub fn could_be_kanji(c: char) -> bool{
!is_latin(c) && !is_hiragana(c) && !is_katakana(c) && !is_punctuation(c) && !is_whitespace(c)
}

pub fn is_latin(c: char) -> bool{
"qgmlwyfubdstnriaeohzxcvjkpQGMLWYFUBDSTNRIAEOHZXCVJKP0123456789".contains(c)
}

pub fn is_hiragana(c: char) -> bool{
"あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよらりるれろ
わをんがぎぐげござじずぜぞだぢづでどばびぶべぼぱぴぷぺぽゐゃゅょっ".contains(c)
}

pub fn is_katakana(c: char) -> bool{
"アイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロ
ワヲンガギグゲゴザジズゼゾダヂヅデドバビブベボパピプペポャュョッ".contains(c)
}

pub fn is_punctuation(c: char) -> bool{
"-_=+`~,./<>?\\|[]{}!@#$%^&*()〜ー!?・「」、。".contains(c)
}

pub fn is_whitespace(c: char) -> bool{
"  \t\n".contains(c)
}

pub fn to_mora(c: char) -> usize{
if "ゃゅょャュョ  〜!?・「」、。-_=+`~,./<>?\\|[]{}!@#$%^&*(\"'".contains(c) { return 0; }
if is_latin(c) { return 0; }
if is_whitespace(c) { return 0; }
1
}

#[cfg(test)]
mod tests{
use super::*;

#[test]
fn test_romanize(){
assert_eq!(
&romanize("ちょうしょく は じぶんで!! つくって ください!!"),
"choushoku ha jibunde!! tsukutte kudasai!!"
);
assert_eq!(
&romanize("いってキーまーす!!"),
"ittekiimaasu!!"
);
}

#[test]
fn test_map_kanjis(){
// normal one of each
let map = vec!
[
("今", "きょ"), ("日", "う"), ("雨", "あめ"), ("降", "ふ"),
("作", "つく"), ("下", "くだ")
]
.into_iter().map(|(a, b)| [a.to_string(), b.to_string()]).collect::<Vec<_>>();
assert_eq!(
map_kanjis(&[
"今日雨降るって".to_string(),
"作って 下さい!!".to_string()
], &map),
vec![
"きょうあめふるって".to_string(),
"つくって ください!!".to_string()
]
);
// more than kanji mapping in one map entry
let map = vec!
[
("今日", "きょう"), ("雨", "あめ"), ("降", "ふ"), ("作", "つく"), ("下", "くだ")
]
.into_iter().map(|(a, b)| [a.to_string(), b.to_string()]).collect::<Vec<_>>();
assert_eq!(
map_kanjis(&[
"今日雨降るって".to_string(),
"作って 下さい!!".to_string()
], &map),
vec![
"きょうあめふるって".to_string(),
"つくって ください!!".to_string()
]
);
// two of the same in one line
let map = vec!
[
("考", "かんが"), ("不", "ふ"), ("幸", "こう"), ("中", "ちゅ"), ("幸", "さいわ"),
]
.into_iter().map(|(a, b)| [a.to_string(), b.to_string()]).collect::<Vec<_>>();
assert_eq!(
map_kanjis(&[
"そう考えると".to_string(),
"不幸中の幸いって".to_string(),
"ヤツだね".to_string()
], &map),
vec![
"そうかんがえると".to_string(),
"ふこうちゅのさいわいって".to_string(),
"ヤツだね".to_string()
]
);
// two of the same in two lines
let map = vec!
[
("考", "かんが"), ("不", "ふ"), ("幸", "こう"), ("中", "ちゅ"), ("幸", "さいわ"),
]
.into_iter().map(|(a, b)| [a.to_string(), b.to_string()]).collect::<Vec<_>>();
assert_eq!(
map_kanjis(&[
"そう考えると".to_string(),
"不幸中の".to_string(),
"幸いって".to_string(),
"ヤツだね".to_string()
], &map),
vec![
"そうかんがえると".to_string(),
"ふこうちゅの".to_string(),
"さいわいって".to_string(),
"ヤツだね".to_string()
]
);
// all of it
let map = vec!
[
("今日", "きょう"), ("考", "かんが"), ("不", "ふ"),
("幸", "こう"), ("中", "ちゅ"), ("幸", "さいわ"), ("幸", "justatest")
]
.into_iter().map(|(a, b)| [a.to_string(), b.to_string()]).collect::<Vec<_>>();
assert_eq!(
map_kanjis(&[
"今日考".to_string(),
"不".to_string(),
"幸幸中".to_string(),
"幸".to_string()
], &map),
vec![
"きょうかんが".to_string(),
"ふ".to_string(),
"こうさいわちゅ".to_string(),
"justatest".to_string()
]
);
}

#[test]
fn to_mora_test(){
fn morae(string: &str) -> usize{
string.chars().fold(0, |a, c| a + to_mora(c))
}
assert_eq!(morae("きょう"), 2);
assert_eq!(morae(" いってキーまーす!"), 8);
}
}
Loading

0 comments on commit d665f3b

Please sign in to comment.