feat: auto-restore English words and invalid Vietnamese syllables

When Vietnamese mode is on, the engine transformed every word including
English (test->tét, cargo->cảgo, status->státu). This wires up the
previously-dead english.rs dictionary and spelling.rs validator so that on
word commit, words that are clearly English or not phonologically valid
Vietnamese are reverted to the raw keystrokes typed. Genuine Vietnamese
(tiếng, việt, quả) is kept. Gated by the existing [auto_restore] enabled
config (default on).

Co-Authored-By: vndangkhoa <vonguyendangkhoa@gmail.com>
This commit is contained in:
Devin AI 2026-06-26 10:31:37 +00:00
parent 6e48d8b2fb
commit 7569e7e218
5 changed files with 174 additions and 2 deletions

View file

@ -127,6 +127,7 @@ impl Daemon {
}; };
let mut engine = Engine::new(method); let mut engine = Engine::new(method);
engine.set_enabled(config.start_enabled); engine.set_enabled(config.start_enabled);
engine.set_auto_restore(config.auto_restore.enabled);
engine_enabled.store(config.start_enabled, Ordering::SeqCst); engine_enabled.store(config.start_enabled, Ordering::SeqCst);
for (shortcut, expansion) in &config.macros { for (shortcut, expansion) in &config.macros {
@ -197,6 +198,8 @@ impl Daemon {
_ => InputMethod::Telex, _ => InputMethod::Telex,
}; };
self.engine.set_method(method); self.engine.set_method(method);
self.engine
.set_auto_restore(new_config.auto_restore.enabled);
self.engine.clear_macros(); self.engine.clear_macros();
for (shortcut, expansion) in &new_config.macros { for (shortcut, expansion) in &new_config.macros {
@ -287,7 +290,7 @@ impl Daemon {
if !self.screen_output.is_empty() { if !self.screen_output.is_empty() {
let backspaces = self.screen_output.chars().count(); let backspaces = self.screen_output.chars().count();
commands.push(OutputCommand::Backspace(backspaces)); commands.push(OutputCommand::Backspace(backspaces));
commands.push(OutputCommand::Type(self.screen_output.clone())); commands.push(OutputCommand::Type(self.word_to_commit()));
} }
// Type the flush character itself // Type the flush character itself
commands.push(OutputCommand::Type(ch.to_string())); commands.push(OutputCommand::Type(ch.to_string()));
@ -317,7 +320,7 @@ impl Daemon {
if !self.screen_output.is_empty() { if !self.screen_output.is_empty() {
let backspaces = self.screen_output.chars().count(); let backspaces = self.screen_output.chars().count();
commands.push(OutputCommand::Backspace(backspaces)); commands.push(OutputCommand::Backspace(backspaces));
commands.push(OutputCommand::Type(self.screen_output.clone())); commands.push(OutputCommand::Type(self.word_to_commit()));
} }
self.keystroke_history.clear(); self.keystroke_history.clear();
self.screen_output.clear(); self.screen_output.clear();
@ -379,6 +382,19 @@ impl Daemon {
commands commands
} }
/// Pick the text to emit when the word currently being composed is committed.
///
/// With auto-restore switched off in the config this is always the composed
/// screen output. Otherwise the keystroke history is joined into the literal
/// text typed, and that literal text is returned instead whenever
/// `Engine::should_restore_word` says the composition should be reverted.
fn word_to_commit(&self) -> String {
    if !self.config.auto_restore.enabled {
        return self.screen_output.clone();
    }
    let typed: String = self.keystroke_history.iter().collect();
    if Engine::should_restore_word(&self.screen_output, &typed) {
        typed
    } else {
        self.screen_output.clone()
    }
}
/// Reset the replay state (on flush, focus loss, modifier key, etc.) /// Reset the replay state (on flush, focus loss, modifier key, etc.)
fn replay_reset(&mut self) { fn replay_reset(&mut self) {
self.keystroke_history.clear(); self.keystroke_history.clear();

View file

@ -1,6 +1,13 @@
use crate::bamboo::BambooEngine; use crate::bamboo::BambooEngine;
use crate::english::EnglishDict;
use crate::input_method::InputMethod; use crate::input_method::InputMethod;
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::OnceLock;
/// Shared English dictionary: built by `EnglishDict::new` the first time this
/// is called, then handed out by reference on every later call.
fn english_dict() -> &'static EnglishDict {
    static SHARED_DICT: OnceLock<EnglishDict> = OnceLock::new();
    SHARED_DICT.get_or_init(EnglishDict::new)
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)] #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
pub enum EngineEvent { pub enum EngineEvent {
@ -17,6 +24,7 @@ pub struct Engine {
macros: HashMap<String, String>, macros: HashMap<String, String>,
raw_buffer: String, raw_buffer: String,
paste_mode: bool, paste_mode: bool,
auto_restore: bool,
} }
impl Engine { impl Engine {
@ -26,9 +34,41 @@ impl Engine {
macros: HashMap::new(), macros: HashMap::new(),
raw_buffer: String::new(), raw_buffer: String::new(),
paste_mode: false, paste_mode: false,
auto_restore: true,
} }
} }
/// Turn auto-restore on or off.
///
/// The flag is read when a word is flushed: only while it is `true` may the
/// composed word be replaced by the raw keystrokes. `Engine::new` starts with
/// it set to `true`.
pub fn set_auto_restore(&mut self, enabled: bool) {
self.auto_restore = enabled;
}
/// Decide whether a committed word should be reverted to the raw keystrokes
/// the user typed instead of its Vietnamese transformation.
///
/// * `composed` — the transformed output.
/// * `raw` — the literal keystrokes typed.
///
/// Returns `false` when nothing was transformed (`composed == raw`) or when
/// the lowercased composition is listed as a Vietnamese override in the
/// English dictionary. Otherwise returns `true` when the lowercased raw
/// keystrokes are a known English word, or when the lowercased composition is
/// rejected by `spelling::is_valid_vietnamese_syllable`.
pub fn should_restore_word(composed: &str, raw: &str) -> bool {
    // No transformation happened — English already passed through untouched.
    if composed == raw {
        return false;
    }

    let dict = english_dict();
    let composed_lower = composed.to_lowercase();

    // Genuine Vietnamese words that happen to look like English stay as-is.
    if dict.is_vietnamese_override(&composed_lower) {
        return false;
    }
    if dict.is_english_word(&raw.to_lowercase()) {
        return true;
    }

    // Validate the same case-folded form used for the dictionary lookups, so a
    // capitalised word ("Tiếng") is judged exactly like its lowercase form
    // whether or not the validator normalises case itself.
    !crate::spelling::is_valid_vietnamese_syllable(composed_lower.as_str())
}
pub fn set_enabled(&mut self, enabled: bool) { pub fn set_enabled(&mut self, enabled: bool) {
self.bamboo.set_enabled(enabled); self.bamboo.set_enabled(enabled);
if !enabled { if !enabled {
@ -171,8 +211,17 @@ impl Engine {
}); });
} }
let raw = self.raw_buffer.clone();
self.reset(); self.reset();
if prev_len > 0 { if prev_len > 0 {
// Auto-restore: if the committed word is English / not valid
// Vietnamese, revert to the raw keystrokes the user typed.
if self.auto_restore && Engine::should_restore_word(&previous, &raw) {
return Some(EngineEvent::Replace {
backspaces: prev_len,
insert: raw,
});
}
// Don't include flush char in insert — daemon forwards it separately // Don't include flush char in insert — daemon forwards it separately
return Some(EngineEvent::Replace { return Some(EngineEvent::Replace {
backspaces: prev_len, backspaces: prev_len,

View file

@ -364,6 +364,11 @@ impl EnglishDict {
self.words.contains(word) self.words.contains(word)
} }
/// Report whether this dictionary's `vietnamese_overrides` contains `word`.
/// The string is looked up as given; no case folding is applied here.
pub fn is_vietnamese_override(&self, word: &str) -> bool {
self.vietnamese_overrides.contains(word)
}
#[allow(dead_code)]
pub fn should_restore(&self, word: &str) -> bool { pub fn should_restore(&self, word: &str) -> bool {
if self.vietnamese_overrides.contains(word) { if self.vietnamese_overrides.contains(word) {
return false; return false;

View file

@ -1,5 +1,6 @@
mod bamboo; mod bamboo;
mod engine; mod engine;
mod english;
mod input_method; mod input_method;
pub mod spelling; pub mod spelling;

View file

@ -0,0 +1,101 @@
//! Tests for smart English auto-restore: when Vietnamese mode is on, words that
//! are clearly English / not valid Vietnamese revert to the raw keystrokes the
//! user typed, while genuine Vietnamese is kept.
use std::collections::HashMap;
use vietc_engine::{Engine, InputMethod};
/// Replay `keys` through the Telex input method with an empty macro table and
/// return the first element of the replay result, which the tests treat as the
/// composed text.
fn telex(keys: &str) -> String {
    let keystrokes = keys.chars().collect::<Vec<_>>();
    let no_macros = HashMap::new();
    let replayed = Engine::replay_keystrokes(InputMethod::Telex, &no_macros, &keystrokes);
    replayed.0
}
/// Work out what ends up committed for a Telex keystroke sequence by applying
/// the same restore decision the daemon applies on word commit: the raw
/// keystrokes when `Engine::should_restore_word` says so, the composition
/// otherwise.
fn committed(keys: &str) -> String {
    let composed = telex(keys);
    if !Engine::should_restore_word(&composed, keys) {
        return composed;
    }
    keys.to_owned()
}
#[test]
fn english_words_are_restored() {
    // Pairs of (telex keystrokes, expected committed word). Every word here
    // must come back exactly as typed; the trailing notes say why the
    // Vietnamese composition is expected to be rejected.
    const CASES: [(&str, &str); 6] = [
        ("fix", "fix"),       // foreign letter f
        ("cargo", "cargo"),   // invalid onset/coda
        ("status", "status"), // invalid cluster
        ("world", "world"),   // invalid coda
        ("english", "english"),
        ("sweet", "sweet"),   // invalid onset "sw"
    ];
    for &(keys, want) in &CASES {
        let got = committed(keys);
        assert_eq!(got, want, "expected {keys} to restore to {want}");
    }
}
#[test]
fn vietnamese_words_are_kept() {
    // Pairs of (telex keystrokes, expected committed word). Each composition is
    // genuine Vietnamese and must survive the restore decision unchanged.
    let cases = [
        ("tieengs", "tiếng"),
        ("vieejt", "việt"),
        ("quar", "quả"),
        // Telex `f` is the huyền tone key (compare "banhf" -> "bành" below), so
        // the kept word is "gì". An empty expectation could never match a word
        // that is supposed to be kept.
        ("gif", "gì"),
        ("khoong", "không"),
        ("tooi", "tôi"),
        ("banhf", "bành"),
        ("ddi", "đi"),
    ];
    for (keys, want) in cases {
        assert_eq!(committed(keys), want, "expected {keys} to stay {want}");
    }
}
#[test]
fn untransformed_english_passes_through() {
    // These words contain no tone/mark letters, so the composition is expected
    // to equal the keystrokes and there is nothing to restore.
    let plain_words = ["type", "code", "hello", "the", "and"];
    for keys in plain_words {
        assert_eq!(committed(keys), keys);
        let composed = telex(keys);
        assert!(!Engine::should_restore_word(&composed, keys));
    }
}
#[test]
fn process_key_restores_on_flush() {
    // Feed the word through the per-keystroke API, one character at a time.
    let mut engine = Engine::new(InputMethod::Telex);
    engine.set_enabled(true);
    "cargo".chars().for_each(|ch| {
        engine.process_key(ch);
    });

    // Before the flush, the buffer still holds the Vietnamese composition.
    assert_eq!(engine.buffer(), "cảgo");

    // The space flushes the word; the resulting Replace must carry the raw
    // English keystrokes.
    match engine.process_key(' ') {
        Some(vietc_engine::EngineEvent::Replace { insert, .. }) => assert_eq!(insert, "cargo"),
        other => panic!("expected Replace to 'cargo', got {other:?}"),
    }
}
#[test]
fn auto_restore_can_be_disabled() {
    // Same keystrokes as the flush test, but with auto-restore switched off
    // before any key is processed.
    let mut engine = Engine::new(InputMethod::Telex);
    engine.set_enabled(true);
    engine.set_auto_restore(false);
    "cargo".chars().for_each(|ch| {
        engine.process_key(ch);
    });

    // The flush must now commit the Vietnamese composition untouched.
    let flushed = engine.process_key(' ');
    match flushed {
        Some(vietc_engine::EngineEvent::Replace { insert, .. }) => {
            assert_eq!(insert, "cảgo", "with auto-restore off the VN form is kept");
        }
        other => panic!("expected Replace to 'cảgo', got {other:?}"),
    }
}