vietc/engine/src/english.rs
Devin AI 7569e7e218 feat: auto-restore English words and invalid Vietnamese syllables
When Vietnamese mode is on, the engine transformed every word including
English (test->tét, cargo->cảgo, status->státu). This wires up the
previously-dead english.rs dictionary and spelling.rs validator so that on
word commit, words that are clearly English or not phonologically valid
Vietnamese are reverted to the raw keystrokes typed. Genuine Vietnamese
(tiếng, việt, quả) is kept. Gated by the existing [auto_restore] enabled
config (default on).

Co-Authored-By: vndangkhoa <vonguyendangkhoa@gmail.com>
2026-06-26 10:31:37 +00:00

388 lines
8 KiB
Rust

use std::collections::HashSet;
/// Dictionary used to decide whether a typed word should be kept as English
/// rather than being treated as Vietnamese.
///
/// Both word sets are seeded with lowercase entries. Lookups first try the
/// word exactly as given and then fall back to its lowercase form, so
/// capitalised input such as `"The"` or `"HELLO"` matches the stored
/// `"the"` / `"hello"`.
#[derive(Debug, Clone)]
pub struct EnglishDict {
    /// Common English words that shouldn't be converted to Vietnamese
    words: HashSet<String>,
    /// Words that are definitely Vietnamese (even if they look like English)
    vietnamese_overrides: HashSet<String>,
}

impl Default for EnglishDict {
    /// Equivalent to [`EnglishDict::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl EnglishDict {
    /// Builds a dictionary pre-populated with the built-in English word list
    /// and the built-in Vietnamese override list.
    pub fn new() -> Self {
        // Every word is listed exactly once; the backing store is a set, so
        // repeated entries would only add noise.
        const COMMON_WORDS: &[&str] = &[
            // High-frequency three-letter words
            "the", "and", "for", "are", "but", "not", "you", "all", "can", "had",
            "her", "was", "one", "our", "out", "day", "get", "has", "him", "his",
            "how", "its", "may", "new", "now", "old", "see", "way", "who", "did",
            // General high-frequency words
            "does", "each", "from", "have", "here", "just", "like", "long", "look", "made",
            "make", "many", "most", "over", "such", "take", "than", "them", "then", "that",
            "this", "time", "very", "when", "what", "will", "with", "also", "back", "been",
            "call", "came", "come", "could", "done", "down", "even", "find", "first", "give",
            "goes", "going", "good", "great", "hand", "head", "help", "high", "home", "hope",
            "into", "keep", "know", "last", "left", "life", "line", "live", "mean", "more",
            "much", "must", "name", "need", "next", "only", "open", "part", "place", "point",
            "right", "same", "said", "second", "should",
            "show", "small", "some", "something", "still",
            "sure", "tell", "there", "these", "they", "thing", "think", "those", "turn", "upon",
            "want", "well", "went", "were", "where", "which", "while", "work", "would", "year",
            "your",
            // Short words that conflict with Vietnamese
            "ok", "no", "so", "do", "go", "to", "in", "on", "at", "by",
            "up", "an", "as", "be", "he", "if", "is", "it", "me", "my",
            "of", "or", "am", "we", "us", "set", "run", "put", "let", "say",
            "ask", "try", "use", "add", "end", "few", "far", "got", "big", "off",
            "own", "red", "hot", "top", "low", "six", "ten",
            // Greetings & common
            "hello", "hi", "hey", "bye", "thanks", "thank", "please", "sorry", "yes", "yeah",
            "okay", "too",
            // More common English
            "about", "after", "again", "being", "below", "between", "both", "kind",
        ];
        // Vietnamese words that must never be reported as restorable English.
        const OVERRIDES: &[&str] = &["không", "xin", "chào", "cảm", "ơn", "tôi", "bạn"];

        Self {
            words: COMMON_WORDS.iter().copied().map(String::from).collect(),
            vietnamese_overrides: OVERRIDES.iter().copied().map(String::from).collect(),
        }
    }

    /// Returns `true` when `set` holds `word` verbatim or in lowercase form.
    ///
    /// The exact lookup runs first, so anything that matched before the
    /// case fallback existed still matches.
    fn contains_ignoring_case(set: &HashSet<String>, word: &str) -> bool {
        if set.contains(word) {
            return true;
        }
        // Only allocate a lowercase copy when there is a letter to fold.
        word.chars().any(char::is_uppercase) && set.contains(word.to_lowercase().as_str())
    }

    /// Returns `true` if `word` (or its lowercase form) is in the English
    /// word set.
    pub fn is_english_word(&self, word: &str) -> bool {
        Self::contains_ignoring_case(&self.words, word)
    }

    /// Returns `true` if `word` (or its lowercase form) is in the Vietnamese
    /// override set.
    pub fn is_vietnamese_override(&self, word: &str) -> bool {
        Self::contains_ignoring_case(&self.vietnamese_overrides, word)
    }

    /// Returns `true` when `word` is a known English word and is not listed
    /// as a Vietnamese override; overrides always win.
    // Kept from the original: silences the unused-item lint where this
    // method has no caller.
    #[allow(dead_code)]
    pub fn should_restore(&self, word: &str) -> bool {
        !self.is_vietnamese_override(word) && self.is_english_word(word)
    }

    /// Adds `word` to the English word set exactly as given.
    ///
    /// Pass lowercase words so that the case-insensitive fallback of
    /// [`EnglishDict::is_english_word`] can find them.
    // Kept from the original: silences the unused-item lint where this
    // method has no caller.
    #[allow(dead_code)]
    pub fn add_word(&mut self, word: String) {
        self.words.insert(word);
    }

    /// Removes the entry that is exactly equal to `word` from the English
    /// word set; does nothing if there is no such entry.
    // Kept from the original: silences the unused-item lint where this
    // method has no caller.
    #[allow(dead_code)]
    pub fn remove_word(&mut self, word: &str) {
        self.words.remove(word);
    }
}