// SPDX-License-Identifier: MIT

/// Onset (initial consonant) spellings, grouped into classes.
///
/// Each row is a whitespace-separated list of spellings. The row index is the
/// class id used to index [`CV_MATRIX`]. A spelling may appear in more than
/// one row (e.g. "đ", "l", "h").
///
/// NOTE(review): "z" is listed in row 0, but `is_valid_vietnamese_syllable`
/// rejects any word containing 'z' before the table is consulted.
const FIRST_CONSONANT_SEQS: &[&str] = &[
    "b d đ g gh m n nh p ph r s t tr v z",
    "c h k kh qu th",
    "ch gi l ng ngh x",
    "đ l",
    "h",
];

/// Tone-less vowel cluster spellings, grouped into classes.
///
/// The row index is the class id referenced by the entries of [`CV_MATRIX`]
/// and used to index [`VC_MATRIX`]. A spelling may appear in more than one row
/// (e.g. "ă", "i").
const VOWEL_SEQS: &[&str] = &[
    "ê i ua uê uy y",
    "a iê oa uyê yê",
    "â ă e o oo ô ơ oe u ư uâ uô ươ",
    "oă",
    "uơ",
    "ai ao au âu ay ây eo êu ia iêu iu oai oao oay oeo oi ôi ơi ưa uây ui ưi uôi ươi ươu ưu uya uyu yêu",
    "ă",
    "i",
];

/// Coda (final consonant) spellings, grouped into classes.
///
/// The row index is the class id referenced by the entries of [`VC_MATRIX`].
const LAST_CONSONANT_SEQS: &[&str] = &["ch nh", "c ng", "m n p t", "k", "c"];

/// `CV_MATRIX[onset_row]` lists the [`VOWEL_SEQS`] rows that may follow an
/// onset from that [`FIRST_CONSONANT_SEQS`] row.
const CV_MATRIX: &[&[usize]] = &[
    &[0, 1, 2, 5],
    &[0, 1, 2, 3, 4, 5],
    &[0, 1, 2, 3, 5],
    &[6],
    &[7],
];

/// `VC_MATRIX[vowel_row]` lists the [`LAST_CONSONANT_SEQS`] rows that may
/// follow a vowel cluster from that [`VOWEL_SEQS`] row. An empty list means
/// the vowel cluster cannot take a coda.
const VC_MATRIX: &[&[usize]] = &[
    &[0, 2],
    &[0, 1, 2],
    &[1, 2],
    &[1, 2],
    &[],
    &[],
    &[3],
    &[4],
];

/// Remove the tone mark from a lowercase, precomposed Vietnamese vowel.
///
/// The vowel quality (breve, circumflex, horn) is preserved; any other
/// character is returned unchanged.
fn strip_tone(c: char) -> char {
    match c {
        'à' | 'á' | 'ả' | 'ã' | 'ạ' => 'a',
        'ằ' | 'ắ' | 'ẳ' | 'ẵ' | 'ặ' => 'ă',
        'ầ' | 'ấ' | 'ẩ' | 'ẫ' | 'ậ' => 'â',
        'è' | 'é' | 'ẻ' | 'ẽ' | 'ẹ' => 'e',
        'ề' | 'ế' | 'ể' | 'ễ' | 'ệ' => 'ê',
        'ì' | 'í' | 'ỉ' | 'ĩ' | 'ị' => 'i',
        'ò' | 'ó' | 'ỏ' | 'õ' | 'ọ' => 'o',
        'ồ' | 'ố' | 'ổ' | 'ỗ' | 'ộ' => 'ô',
        'ờ' | 'ớ' | 'ở' | 'ỡ' | 'ợ' => 'ơ',
        'ù' | 'ú' | 'ủ' | 'ũ' | 'ụ' => 'u',
        'ừ' | 'ứ' | 'ử' | 'ữ' | 'ự' => 'ư',
        'ỳ' | 'ý' | 'ỷ' | 'ỹ' | 'ỵ' => 'y',
        _ => c,
    }
}

/// Return `true` if `c` is a lowercase Vietnamese vowel, with or without a
/// tone mark.
fn is_vowel(c: char) -> bool {
    // Reuse the tone table so the toned-vowel list lives in one place only.
    matches!(
        strip_tone(c),
        'a' | 'ă' | 'â' | 'e' | 'ê' | 'i' | 'o' | 'ô' | 'ơ' | 'u' | 'ư' | 'y'
    )
}

/// Partition a word into `(first_consonant, vowel_cluster, last_consonant)`.
///
/// * The onset is everything before the first vowel. When the word starts
///   with "qu" or "gi" followed by another vowel, the "u"/"i" is treated as
///   part of the onset.
/// * The vowel cluster is the maximal run of vowels after the onset.
/// * The coda is whatever remains; it is not validated here.
///
/// A word without any vowel (including the empty string) is returned whole as
/// the first component. Vowels are recognised in lowercase only.
pub fn partition(word: &str) -> (String, String, String) {
    let chars: Vec<char> = word.chars().collect();

    let first_vowel = match chars.iter().position(|&c| is_vowel(c)) {
        Some(idx) => idx,
        None => return (word.to_string(), String::new(), String::new()),
    };

    // "qu"/"gi" act as a two-letter onset only when another vowel follows,
    // so that e.g. "gi" on its own still splits as g + i.
    let glide_onset = matches!(
        chars.as_slice(),
        ['q', 'u', next, ..] | ['g', 'i', next, ..] if is_vowel(*next)
    );
    let fc_end = if glide_onset { 2 } else { first_vowel };

    let vo_len = chars[fc_end..]
        .iter()
        .take_while(|&&c| is_vowel(c))
        .count();
    let vo_end = fc_end + vo_len;

    let fc: String = chars[..fc_end].iter().collect();
    let vo: String = chars[fc_end..vo_end].iter().collect();
    let lc: String = chars[vo_end..].iter().collect();
    (fc, vo, lc)
}

/// Return the indices of every row in `seqs` that lists `input` as one of its
/// whitespace-separated entries, in ascending order.
///
/// An empty `input` matches nothing.
fn lookup(seqs: &[&str], input: &str) -> Vec<usize> {
    if input.is_empty() {
        return Vec::new();
    }
    seqs.iter()
        .enumerate()
        .filter(|(_, row)| row.split_whitespace().any(|entry| entry == input))
        .map(|(index, _)| index)
        .collect()
}

/// Check one concrete onset / vowel / coda split against the spelling tables.
///
/// `vo` must be a known vowel cluster. A non-empty `fc` must belong to an
/// onset class that [`CV_MATRIX`] allows before one of the vowel's classes,
/// and a non-empty `lc` must belong to a coda class that [`VC_MATRIX`] allows
/// after one of the vowel's classes.
fn is_valid_split(fc: &str, vo: &str, lc: &str) -> bool {
    let vo_rows = lookup(VOWEL_SEQS, vo);
    if vo_rows.is_empty() {
        return false;
    }

    if !fc.is_empty() {
        // An unknown onset yields no rows, so `any` is false and we reject.
        let cv_ok = lookup(FIRST_CONSONANT_SEQS, fc).iter().any(|&fc_row| {
            CV_MATRIX
                .get(fc_row)
                .map_or(false, |allowed| allowed.iter().any(|v| vo_rows.contains(v)))
        });
        if !cv_ok {
            return false;
        }
    }

    if !lc.is_empty() {
        let lc_rows = lookup(LAST_CONSONANT_SEQS, lc);
        // An unknown coda yields no rows, so nothing can match and we reject.
        let vc_ok = vo_rows.iter().any(|&vo_row| {
            VC_MATRIX
                .get(vo_row)
                .map_or(false, |allowed| allowed.iter().any(|l| lc_rows.contains(l)))
        });
        if !vc_ok {
            return false;
        }
    }

    true
}

/// Check if a word is a valid Vietnamese syllable according to the spelling
/// tables in this file.
///
/// The check is case-insensitive and ignores tone marks. Words containing
/// 'f', 'j', 'w' or 'z' are rejected outright. A word without any vowel is
/// accepted only if it is, as a whole, a known onset spelling (e.g. "d", "đ",
/// "ngh"), which allows initials/abbreviations. The empty string is invalid.
pub fn is_valid_vietnamese_syllable(word: &str) -> bool {
    let lowercase_word = word.to_lowercase();

    // Quick reject if it has foreign letters.
    if lowercase_word
        .chars()
        .any(|c| matches!(c, 'f' | 'j' | 'w' | 'z'))
    {
        return false;
    }

    // Tones do not affect spelling structure, so validate the tone-less form.
    let cleaned_word: String = lowercase_word.chars().map(strip_tone).collect();
    let (fc, vo, lc) = partition(&cleaned_word);

    if vo.is_empty() {
        // `lookup` returns nothing for an empty onset, so "" is rejected too.
        return !lookup(FIRST_CONSONANT_SEQS, &fc).is_empty();
    }

    if is_valid_split(&fc, &vo, &lc) {
        return true;
    }

    // Onset "gi" followed by the vowel "iê" is written "giê" (e.g. "giếng",
    // "giêng"): the single written "i" belongs to both. `partition` hands the
    // "i" to the onset, so retry with it as the start of the vowel cluster.
    fc == "gi" && is_valid_split("g", &format!("i{}", vo), &lc)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_valid_vietnamese_syllables() {
        assert!(is_valid_vietnamese_syllable("chuyên"));
        assert!(is_valid_vietnamese_syllable("tiếng"));
        assert!(is_valid_vietnamese_syllable("việt"));
        assert!(is_valid_vietnamese_syllable("quang"));
        assert!(is_valid_vietnamese_syllable("giá"));
        assert!(is_valid_vietnamese_syllable("oanh"));
        assert!(is_valid_vietnamese_syllable("anh"));
        assert!(is_valid_vietnamese_syllable("thuở"));
        assert!(is_valid_vietnamese_syllable("gì"));
    }

    #[test]
    fn test_gi_onset_sharing_i_with_vowel() {
        assert!(is_valid_vietnamese_syllable("giếng"));
        assert!(is_valid_vietnamese_syllable("giêng"));
        assert!(is_valid_vietnamese_syllable("giếc"));
        assert!(is_valid_vietnamese_syllable("giết"));
    }

    #[test]
    fn test_invalid_vietnamese_syllables() {
        assert!(!is_valid_vietnamese_syllable("fast"));
        assert!(!is_valid_vietnamese_syllable("box"));
        assert!(!is_valid_vietnamese_syllable("study"));
        assert!(!is_valid_vietnamese_syllable("fát"));
        assert!(!is_valid_vietnamese_syllable("făst"));
        assert!(!is_valid_vietnamese_syllable("cargo"));
        assert!(!is_valid_vietnamese_syllable("rust"));
        assert!(!is_valid_vietnamese_syllable("status"));
    }
}