// vietc/engine/src/spelling.rs

// SPDX-License-Identifier: MIT
/// Onset (initial consonant) spellings, grouped into classes.
///
/// Each row is a whitespace-separated list of spellings. A row's index is the
/// onset class that `is_valid_vietnamese_syllable` uses to index `CV_MATRIX`.
/// `đ`, `l` and `h` each appear in two rows and therefore belong to two classes.
///
/// `z` is listed in row 0, but `is_valid_vietnamese_syllable` rejects any word
/// containing `z` before this table is consulted.
const FIRST_CONSONANT_SEQS: &[&str] = &[
    "b d đ g gh m n nh p ph r s t tr v z",
    "c h k kh qu th",
    "ch gi l ng ngh x",
    "đ l",
    "h",
];

/// Toneless vowel-cluster spellings, grouped into classes.
///
/// Each row is a whitespace-separated list of spellings. A row's index is the
/// vowel class: it is the value stored in the rows of `CV_MATRIX` and the index
/// used to select a row of `VC_MATRIX`. `ă` (rows 2 and 6) and `i` (rows 0 and 7)
/// each belong to two classes.
const VOWEL_SEQS: &[&str] = &[
    "ê i ua uê uy y",
    "a iê oa uyê yê",
    "â ă e o oo ô ơ oe u ư uâ uô ươ",
    "oă",
    "uơ",
    "ai ao au âu ay ây eo êu ia iêu iu oai oao oay oeo oi ôi ơi ưa uây ui ưi uôi ươi ươu ưu uya uyu yêu",
    "ă",
    "i",
];

/// Coda (final consonant) spellings, grouped into classes.
///
/// Each entry is a whitespace-separated list of spellings. An entry's index is the
/// coda class, i.e. the value stored in the rows of `VC_MATRIX`. `c` appears in
/// entries 1 and 4 and therefore belongs to two classes.
const LAST_CONSONANT_SEQS: &[&str] = &["ch nh", "c ng", "m n p t", "k", "c"];

/// Onset-to-vowel compatibility table.
///
/// `CV_MATRIX[onset_class]` lists the vowel classes (row indices of `VOWEL_SEQS`)
/// that may follow an onset of that class, where `onset_class` is a row index of
/// `FIRST_CONSONANT_SEQS`.
const CV_MATRIX: &[&[usize]] = &[
    &[0, 1, 2, 5],
    &[0, 1, 2, 3, 4, 5],
    &[0, 1, 2, 3, 5],
    &[6],
    &[7],
];

/// Vowel-to-coda compatibility table.
///
/// `VC_MATRIX[vowel_class]` lists the coda classes (entry indices of
/// `LAST_CONSONANT_SEQS`) that may follow a vowel cluster of that class, where
/// `vowel_class` is a row index of `VOWEL_SEQS`. Classes 4 and 5 have empty lists,
/// so a vowel cluster found only in those classes cannot be followed by a coda.
const VC_MATRIX: &[&[usize]] = &[&[0, 2], &[0, 1, 2], &[1, 2], &[1, 2], &[], &[], &[3], &[4]];

/// Removes the tone mark from a lowercase Vietnamese vowel.
///
/// The vowel's quality mark (breve, circumflex, horn) is kept, so `'ế'` becomes
/// `'ê'`, not `'e'`. Any character that is not one of the sixty toned lowercase
/// vowels listed below is returned unchanged.
fn strip_tone(c: char) -> char {
    // Each entry pairs a toneless vowel with its five toned forms.
    const TONED_FORMS: [(char, &str); 12] = [
        ('a', "àáảãạ"),
        ('ă', "ằắẳẵặ"),
        ('â', "ầấẩẫậ"),
        ('e', "èéẻẽẹ"),
        ('ê', "ềếểễệ"),
        ('i', "ìíỉĩị"),
        ('o', "òóỏõọ"),
        ('ô', "ồốổỗộ"),
        ('ơ', "ờớởỡợ"),
        ('u', "ùúủũụ"),
        ('ư', "ừứửữự"),
        ('y', "ỳýỷỹỵ"),
    ];

    TONED_FORMS
        .iter()
        .find(|(_, toned)| toned.contains(c))
        .map_or(c, |&(base, _)| base)
}

/// Reports whether `c` is a lowercase Vietnamese vowel letter, with or without a
/// tone mark.
///
/// Uppercase letters and all other characters yield `false`.
fn is_vowel(c: char) -> bool {
    // One string per toneless vowel: the bare letter followed by its toned forms.
    const VOWEL_FORMS: [&str; 12] = [
        "aàáảãạ",
        "ăằắẳẵặ",
        "âầấẩẫậ",
        "eèéẻẽẹ",
        "êềếểễệ",
        "iìíỉĩị",
        "oòóỏõọ",
        "ôồốổỗộ",
        "ơờớởỡợ",
        "uùúủũụ",
        "ưừứửữự",
        "yýỳỷỹỵ",
    ];

    VOWEL_FORMS.iter().any(|forms| forms.contains(c))
}

/// Splits a word into `(first_consonant, vowel_cluster, last_consonant)`.
///
/// * The first consonant is everything before the first vowel. When the word
///   starts with `qu` or `gi` and the third character is also a vowel, those two
///   letters form the first consonant instead.
/// * The vowel cluster is the run of consecutive vowels that follows.
/// * The last consonant is whatever remains, vowels included.
///
/// A word without any vowel (the empty string included) is returned whole as the
/// first consonant with the other two parts empty. Vowels are recognised by
/// `is_vowel`, which only knows lowercase letters.
pub fn partition(word: &str) -> (String, String, String) {
    let letters: Vec<char> = word.chars().collect();

    let first_vowel_at = match letters.iter().position(|&ch| is_vowel(ch)) {
        Some(pos) => pos,
        None => return (word.to_string(), String::new(), String::new()),
    };

    // In "qu"/"gi" + vowel, the `u`/`i` belongs to the onset rather than the
    // vowel cluster. Matching the first two letters already implies that the
    // first vowel sits at index 1.
    let onset_len = match letters.as_slice() {
        ['q', 'u', next, ..] | ['g', 'i', next, ..] if is_vowel(*next) => 2,
        _ => first_vowel_at,
    };

    let (onset, rest) = letters.split_at(onset_len);
    let cluster_len = rest.iter().take_while(|&&ch| is_vowel(ch)).count();
    let (cluster, coda) = rest.split_at(cluster_len);

    (
        onset.iter().collect(),
        cluster.iter().collect(),
        coda.iter().collect(),
    )
}

/// Returns the index of every row in `seqs` that lists `input`.
///
/// Each row is treated as a whitespace-separated list and `input` must equal one
/// of its entries exactly. Indices come back in ascending order, at most once per
/// row. An empty `input` never matches.
fn lookup(seqs: &[&str], input: &str) -> Vec<usize> {
    if input.is_empty() {
        return Vec::new();
    }

    seqs.iter()
        .enumerate()
        .filter(|(_, row)| row.split_whitespace().any(|entry| entry == input))
        .map(|(row_index, _)| row_index)
        .collect()
}

/// Check if a word is a valid Vietnamese syllable according to phonology rules
pub fn is_valid_vietnamese_syllable(word: &str) -> bool {
    let lowercase_word = word.to_lowercase();

    // Quick reject if it has foreign letters 'f', 'j', 'w', 'z'
    if lowercase_word
        .chars()
        .any(|c| matches!(c, 'f' | 'j' | 'w' | 'z'))
    {
        return false;
    }

    // Clean tones from the word to validate spelling structure
    let cleaned_word: String = lowercase_word.chars().map(strip_tone).collect();

    let (fc, vo, lc) = partition(&cleaned_word);

    // If there is no vowel, it must be a valid standalone consonant (like "d", "đ", etc.)
    // but typically a full syllable must have a vowel. Let's allow empty vowel only if it's
    // a valid first consonant of length 1 or 2 (e.g. for initials/abbreviations).
    if vo.is_empty() {
        return !fc.is_empty() && !lookup(FIRST_CONSONANT_SEQS, &fc).is_empty();
    }

    let fc_indices = if !fc.is_empty() {
        let indices = lookup(FIRST_CONSONANT_SEQS, &fc);
        if indices.is_empty() {
            return false; // Invalid onset consonant
        }
        Some(indices)
    } else {
        None
    };

    let vo_indices = lookup(VOWEL_SEQS, &vo);
    if vo_indices.is_empty() {
        return false; // Invalid vowel cluster
    }

    let lc_indices = if !lc.is_empty() {
        let indices = lookup(LAST_CONSONANT_SEQS, &lc);
        if indices.is_empty() {
            return false; // Invalid coda consonant
        }
        Some(indices)
    } else {
        None
    };

    // If we have an onset, check CV compatibility
    if let Some(ref fcs) = fc_indices {
        let mut cv_valid = false;
        for &fc_idx in fcs {
            if let Some(allowed_vos) = CV_MATRIX.get(fc_idx) {
                for &allowed_vo in *allowed_vos {
                    if vo_indices.contains(&allowed_vo) {
                        cv_valid = true;
                        break;
                    }
                }
            }
            if cv_valid {
                break;
            }
        }
        if !cv_valid {
            return false;
        }
    }

    // If we have a coda, check VC compatibility
    if let Some(ref lcs) = lc_indices {
        let mut vc_valid = false;
        for &vo_idx in &vo_indices {
            if let Some(allowed_lcs) = VC_MATRIX.get(vo_idx) {
                for &allowed_lc in *allowed_lcs {
                    if lcs.contains(&allowed_lc) {
                        vc_valid = true;
                        break;
                    }
                }
            }
            if vc_valid {
                break;
            }
        }
        if !vc_valid {
            return false;
        }
    } else {
        // If there's no coda, we must verify that the vowel allows having no coda
        // (all vowel sequences allow no coda, except some specific ones in matrix, but let's see:
        // vowel groups 4, 5 have no allowed last consonants in matrix, which is correct).
    }

    true
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Real syllables must be accepted, including `qu`/`gi` onsets, words with no
    /// onset, and toned input.
    #[test]
    fn test_valid_vietnamese_syllables() {
        let accepted = [
            "chuyên", "tiếng", "việt", "quang", "giá", "oanh", "anh", "thuở", "gì",
        ];
        for word in accepted.iter() {
            assert!(
                is_valid_vietnamese_syllable(word),
                "{:?} should be accepted",
                word
            );
        }
    }

    /// Foreign words must be rejected, whether because of a foreign letter, an
    /// unknown consonant cluster, or an impossible coda.
    #[test]
    fn test_invalid_vietnamese_syllables() {
        let rejected = [
            "fast", "box", "study", "fát", "făst", "cargo", "rust", "status",
        ];
        for word in rejected.iter() {
            assert!(
                !is_valid_vietnamese_syllable(word),
                "{:?} should be rejected",
                word
            );
        }
    }
}