diff --git a/src/apps/desktop/src/computer_use/desktop_host.rs b/src/apps/desktop/src/computer_use/desktop_host.rs index d016d28d..accbbc72 100644 --- a/src/apps/desktop/src/computer_use/desktop_host.rs +++ b/src/apps/desktop/src/computer_use/desktop_host.rs @@ -739,6 +739,12 @@ struct ComputerUseSessionMutableState { /// After `screenshot`, block `pointer_move_rel` / `ComputerUseMouseStep` until an absolute move /// from AX/OCR/globals (`mouse_move`, `move_to_text`, `click_element`, `click_label`) clears this. block_vision_pixel_nudge_after_screenshot: bool, + /// After click / key / type / scroll / drag: recommend a **`screenshot`** to confirm UI state (Cowork verify). + /// Cleared on the next successful `screenshot_display`. + pending_verify_screenshot: bool, + /// After `move_to_text` (global OCR coordinates): next guarded **`click`** may run without a prior + /// `screenshot_display` / fine-crop basis — same idea as `click_element` relaxed guard. + pointer_trusted_after_ocr_move: bool, /// Action optimizer for loop detection, history, and visual verification. optimizer: ComputerUseOptimizer, } @@ -752,6 +758,8 @@ impl ComputerUseSessionMutableState { navigation_focus: None, screenshot_cache: None, block_vision_pixel_nudge_after_screenshot: false, + pending_verify_screenshot: false, + pointer_trusted_after_ocr_move: false, optimizer: ComputerUseOptimizer::new(), } } @@ -767,17 +775,27 @@ impl ComputerUseSessionMutableState { self.last_shot_refinement = Some(refinement); self.navigation_focus = nav_focus; self.click_needs_fresh_screenshot = false; + self.pending_verify_screenshot = false; + self.pointer_trusted_after_ocr_move = false; self.block_vision_pixel_nudge_after_screenshot = true; } /// Called after pointer mutation (move, step, relative), click, scroll, key_chord, or type_text. 
fn transition_after_pointer_mutation(&mut self) { self.click_needs_fresh_screenshot = true; + self.pointer_trusted_after_ocr_move = false; } /// Called after click (same effect as pointer mutation for freshness). fn transition_after_click(&mut self) { self.click_needs_fresh_screenshot = true; + self.pending_verify_screenshot = true; + self.pointer_trusted_after_ocr_move = false; + } + + /// Called after key, typing, scroll, or drag — UI likely changed; next `screenshot` should confirm. + fn transition_after_committed_ui_action(&mut self) { + self.pending_verify_screenshot = true; } } @@ -1040,10 +1058,10 @@ end tell"#]) "space" => Key::Space, "backspace" => Key::Backspace, "delete" => Key::Delete, - "up" => Key::UpArrow, - "down" => Key::DownArrow, - "left" => Key::LeftArrow, - "right" => Key::RightArrow, + "up" | "arrow_up" | "arrowup" => Key::UpArrow, + "down" | "arrow_down" | "arrowdown" => Key::DownArrow, + "left" | "arrow_left" | "arrowleft" => Key::LeftArrow, + "right" | "arrow_right" | "arrowright" => Key::RightArrow, "home" => Key::Home, "end" => Key::End, "pageup" | "page_up" => Key::PageUp, @@ -1938,8 +1956,9 @@ impl ComputerUseHost for DesktopComputerUseHost { let s = self.state.lock().unwrap(); let last_ref = s.last_shot_refinement; let click_needs_fresh = s.click_needs_fresh_screenshot; + let pending_verify = s.pending_verify_screenshot; - let (click_ready, screenshot_kind, recommended_next_action) = match last_ref { + let (click_ready, screenshot_kind, mut recommended_next_action) = match last_ref { Some(ComputerUseScreenshotRefinement::RegionAroundPoint { .. 
}) => ( !click_needs_fresh, Some(ComputerUseInteractionScreenshotKind::RegionCrop), @@ -1967,11 +1986,16 @@ impl ComputerUseHost for DesktopComputerUseHost { None => (false, None, Some("screenshot".to_string())), }; + if pending_verify && recommended_next_action.is_none() { + recommended_next_action = Some("screenshot".to_string()); + } + ComputerUseInteractionState { click_ready, enter_ready: !click_needs_fresh, requires_fresh_screenshot_before_click: click_needs_fresh, requires_fresh_screenshot_before_enter: click_needs_fresh, + recommend_screenshot_to_verify_last_action: pending_verify, last_screenshot_kind: screenshot_kind, last_mutation: None, recommended_next_action, @@ -2464,6 +2488,7 @@ impl ComputerUseHost for DesktopComputerUseHost { .await .map_err(|e| BitFunError::tool(e.to_string()))??; ComputerUseHost::computer_use_after_pointer_mutation(self); + ComputerUseHost::computer_use_after_committed_ui_action(self); Ok(()) } @@ -2482,7 +2507,6 @@ impl ComputerUseHost for DesktopComputerUseHost { .iter() .map(|s| Self::map_key(s)) .collect::>()?; - #[cfg(target_os = "macos")] let chord_has_modifier = keys_for_job.iter().any(|s| { matches!( s.to_lowercase().as_str(), @@ -2493,21 +2517,29 @@ impl ComputerUseHost for DesktopComputerUseHost { e.key(mapped[0], Direction::Click) .map_err(|err| BitFunError::tool(format!("key: {}", err)))?; } else { - for k in &mapped[..mapped.len() - 1] { + let mods = &mapped[..mapped.len() - 1]; + let last = *mapped.last().unwrap(); + for k in mods { e.key(*k, Direction::Press) .map_err(|err| BitFunError::tool(format!("key press: {}", err)))?; } - let last = *mapped.last().unwrap(); + if chord_has_modifier { + // Modifiers must be registered before the main key; otherwise macOS / IME + // treats the letter as plain typing (e.g. Cmd+F becomes "f" in the text box). 
+ #[cfg(target_os = "macos")] + std::thread::sleep(std::time::Duration::from_millis(160)); + #[cfg(not(target_os = "macos"))] + std::thread::sleep(std::time::Duration::from_millis(55)); + } e.key(last, Direction::Click) .map_err(|err| BitFunError::tool(format!("key click: {}", err)))?; - for k in mapped[..mapped.len() - 1].iter().rev() { + for k in mods.iter().rev() { e.key(*k, Direction::Release) .map_err(|err| BitFunError::tool(format!("key release: {}", err)))?; } - } - #[cfg(target_os = "macos")] - if chord_has_modifier { - std::thread::sleep(std::time::Duration::from_millis(95)); + if chord_has_modifier { + std::thread::sleep(std::time::Duration::from_millis(35)); + } } Ok(()) }) @@ -2515,6 +2547,7 @@ impl ComputerUseHost for DesktopComputerUseHost { .await .map_err(|e| BitFunError::tool(e.to_string()))??; ComputerUseHost::computer_use_after_pointer_mutation(self); + ComputerUseHost::computer_use_after_committed_ui_action(self); Ok(()) } @@ -2531,7 +2564,9 @@ impl ComputerUseHost for DesktopComputerUseHost { }) .await .map_err(|e| BitFunError::tool(e.to_string()))??; - ComputerUseHost::computer_use_after_pointer_mutation(self); + // Typing does not move the pointer; do not set click_needs (would block Enter after search). + ComputerUseHost::computer_use_after_committed_ui_action(self); + ComputerUseHost::computer_use_trust_pointer_after_text_input(self); Ok(()) } @@ -2562,6 +2597,26 @@ impl ComputerUseHost for DesktopComputerUseHost { } } + fn computer_use_after_committed_ui_action(&self) { + if let Ok(mut s) = self.state.lock() { + s.transition_after_committed_ui_action(); + } + } + + fn computer_use_trust_pointer_after_ocr_move(&self) { + if let Ok(mut s) = self.state.lock() { + // `mouse_move` already set click_needs; OCR globals are authoritative like AX. 
+ s.click_needs_fresh_screenshot = false; + s.pointer_trusted_after_ocr_move = true; + } + } + + fn computer_use_trust_pointer_after_text_input(&self) { + if let Ok(mut s) = self.state.lock() { + s.click_needs_fresh_screenshot = false; + } + } + fn computer_use_guard_click_allowed(&self) -> BitFunResult<()> { let s = self .state @@ -2570,6 +2625,9 @@ impl ComputerUseHost for DesktopComputerUseHost { if s.click_needs_fresh_screenshot { return Err(BitFunError::tool(STALE_CAPTURE_TOOL_MESSAGE.to_string())); } + if s.pointer_trusted_after_ocr_move { + return Ok(()); + } match s.last_shot_refinement { Some(ComputerUseScreenshotRefinement::RegionAroundPoint { .. }) => {} Some(ComputerUseScreenshotRefinement::QuadrantNavigation { diff --git a/src/apps/desktop/src/computer_use/macos_ax_ui.rs b/src/apps/desktop/src/computer_use/macos_ax_ui.rs index f1722452..bfef6dd9 100644 --- a/src/apps/desktop/src/computer_use/macos_ax_ui.rs +++ b/src/apps/desktop/src/computer_use/macos_ax_ui.rs @@ -224,6 +224,20 @@ impl CandidateMatch { score += 20; } + // WeChat (and similar): global search field is often the first AXTextField match but is the wrong target + // when the user wants the **chat composer**. Deprioritize known search chrome. + if let Some(ref id) = self.identifier { + if id.contains("_SC_SEARCH_FIELD") { + score -= 1500; + } + } + + // Among text inputs, the composer is usually **lower** on screen than the top search bar. 
+ let rl = self.role.to_lowercase(); + if rl.contains("textfield") || rl.contains("textarea") { + score += ((self.gy / 8.0) as i64).clamp(0, 400); + } + score } @@ -352,13 +366,28 @@ pub fn locate_ui_element_center(query: &UiElementLocateQuery) -> BitFunResult { + let a_txt = a.role.contains("TextField") || a.role.contains("TextArea"); + let b_txt = b.role.contains("TextField") || b.role.contains("TextArea"); + if a_txt && b_txt { + b.gy.partial_cmp(&a.gy).unwrap_or(std::cmp::Ordering::Equal) + } else { + std::cmp::Ordering::Equal + } + } + o => o, + } + }); let total = candidates.len() as u32; let best = &candidates[0]; diff --git a/src/apps/desktop/src/computer_use/screen_ocr.rs b/src/apps/desktop/src/computer_use/screen_ocr.rs index df7e3733..caf1872c 100644 --- a/src/apps/desktop/src/computer_use/screen_ocr.rs +++ b/src/apps/desktop/src/computer_use/screen_ocr.rs @@ -139,13 +139,91 @@ fn normalize_query(text_query: &str) -> BitFunResult { Ok(q.to_string()) } +/// Normalize for substring / fuzzy matching. Strips **all** Unicode whitespace so that +/// Vision output like `"尉 怡 青"` or `"尉怡 青"` still matches query `"尉怡青"` (CJK UIs often +/// insert spaces between glyphs). Latin phrases become `"helloworld"`-style; substring checks +/// remain meaningful for short tokens. fn normalize_for_match(s: &str) -> String { - s.split_whitespace() - .collect::>() - .join(" ") + s.chars() + .filter(|c| !c.is_whitespace()) + .collect::() .to_lowercase() } +/// Levenshtein distance on Unicode scalar values (not UTF-8 bytes). 
+fn levenshtein_chars(a: &str, b: &str) -> usize { + let a: Vec = a.chars().collect(); + let b: Vec = b.chars().collect(); + let n = a.len(); + let m = b.len(); + if n == 0 { + return m; + } + if m == 0 { + return n; + } + let mut prev: Vec = (0..=m).collect(); + let mut curr = vec![0usize; m + 1]; + for i in 0..n { + curr[0] = i + 1; + for j in 0..m { + let cost = usize::from(a[i] != b[j]); + curr[j + 1] = (prev[j] + cost) + .min(prev[j + 1] + 1) + .min(curr[j] + 1); + } + std::mem::swap(&mut prev, &mut curr); + } + prev[m] +} + +/// Max allowed edit distance for fuzzy OCR match (Vision mis-reads one CJK glyph, etc.). +fn fuzzy_max_distance(query_len_chars: usize) -> usize { + match query_len_chars { + 0 => 0, + 1 => 0, + 2 | 3 | 4 => 1, + 5..=8 => 2, + _ => 3, + } +} + +fn fuzzy_text_matches_query(ocr_text: &str, query: &str) -> bool { + let t = normalize_for_match(ocr_text); + let q = normalize_for_match(query); + if q.is_empty() { + return false; + } + if t.contains(&q) { + return true; + } + let ql = q.chars().count(); + let dist = levenshtein_chars(&t, &q); + dist <= fuzzy_max_distance(ql) +} + +#[cfg(test)] +mod ocr_match_tests { + use super::*; + + #[test] + fn normalize_strips_whitespace_for_cjk_substring() { + let q = normalize_for_match("尉怡青"); + assert!(normalize_for_match("尉 怡 青").contains(&q)); + assert!(normalize_for_match(" 尉怡 青 ").contains(&q)); + } + + #[test] + fn fuzzy_one_glyph_substitution_three_chars() { + assert!(fuzzy_text_matches_query("卫怡青", "尉怡青")); + } + + #[test] + fn levenshtein_ascii() { + assert_eq!(levenshtein_chars("cat", "cats"), 1); + } +} + fn rank_matches(mut matches: Vec, query: &str) -> Vec { let normalized_query = normalize_for_match(query); matches.sort_by(|a, b| compare_match(a, b, &normalized_query)); @@ -195,7 +273,10 @@ fn filter_and_rank(query: &str, raw_matches: Vec) -> Vec>(); rank_matches(filtered, query) } @@ -384,8 +465,8 @@ pub fn crop_shot_to_ocr_region( #[cfg(target_os = "macos")] mod macos { use super::{ - 
filter_and_rank, image_box_to_global_match, image_content_rect_or_full, normalize_for_match, - OcrTextMatch, + filter_and_rank, fuzzy_text_matches_query, image_box_to_global_match, + image_content_rect_or_full, levenshtein_chars, normalize_for_match, OcrTextMatch, }; use bitfun_core::agentic::tools::computer_use_host::ComputerScreenshot; use bitfun_core::util::errors::{BitFunError, BitFunResult}; @@ -426,7 +507,8 @@ mod macos { if ranked.is_empty() { return Err(BitFunError::tool(format!( "No OCR text matched {:?} on screen (macOS Vision found {} text regions total). \ - If the UI is Chinese, try a shorter substring (e.g. one or two characters) or ensure the text is visible in the capture; Vision may mis-read stylized UI.", + Matching strips whitespace between glyphs and allows small edit distance for OCR errors. \ + If the UI is Chinese, try a shorter substring or ensure the text is visible in the capture.", text_query, observations.len() ))); @@ -518,6 +600,32 @@ mod macos { } } + // Fuzzy fallback: Vision may insert spaces in CJK, mis-read one character, or split labels. + if chosen_text.is_none() { + let mut best: Option<(String, f32, usize)> = None; + for i in 0..n { + let candidate = unsafe { candidates.objectAtIndex_unchecked(i) }; + let text = candidate.string().to_string(); + if !fuzzy_text_matches_query(&text, text_query) { + continue; + } + let nt = normalize_for_match(&text); + let dist = levenshtein_chars(&nt, &q_norm); + let conf = candidate.confidence(); + let take = match &best { + None => true, + Some((_, bf, bd)) => dist < *bd || (dist == *bd && conf > *bf), + }; + if take { + best = Some((text, conf, dist)); + } + } + if let Some((t, c, _)) = best { + chosen_text = Some(t); + chosen_confidence = c; + } + } + let text = chosen_text?; // Vision bounding box is normalized to the **full** image (JPEG), not the content rect. 
@@ -554,8 +662,8 @@ mod macos { #[cfg(target_os = "windows")] mod windows_backend { use super::{ - filter_and_rank, image_box_to_global_match, image_content_rect_or_full, normalize_for_match, - OcrTextMatch, + filter_and_rank, fuzzy_text_matches_query, image_box_to_global_match, + image_content_rect_or_full, normalize_for_match, OcrTextMatch, }; use bitfun_core::agentic::tools::computer_use_host::ComputerScreenshot; use bitfun_core::util::errors::{BitFunError, BitFunResult}; @@ -687,8 +795,10 @@ mod windows_backend { ) -> Option { let text = word.Text().ok()?.to_string(); - // Pre-filter - if !normalize_for_match(&text).contains(&normalize_for_match(text_query)) { + // Pre-filter (same normalization + fuzzy as macOS / Linux) + let nq = normalize_for_match(text_query); + let nt = normalize_for_match(&text); + if !nt.contains(&nq) && !fuzzy_text_matches_query(&text, text_query) { return None; } @@ -717,8 +827,8 @@ mod windows_backend { #[cfg(target_os = "linux")] mod linux_backend { use super::{ - filter_and_rank, image_box_to_global_match, image_content_rect_or_full, normalize_for_match, - OcrTextMatch, + filter_and_rank, fuzzy_text_matches_query, image_box_to_global_match, + image_content_rect_or_full, normalize_for_match, OcrTextMatch, }; use bitfun_core::agentic::tools::computer_use_host::ComputerScreenshot; use bitfun_core::util::errors::{BitFunError, BitFunResult}; @@ -841,8 +951,9 @@ mod linux_backend { _content_width: u32, _content_height: u32, ) -> Option { - // Pre-filter - if !normalize_for_match(text).contains(&normalize_for_match(text_query)) { + let nq = normalize_for_match(text_query); + let nt = normalize_for_match(text); + if !nt.contains(&nq) && !fuzzy_text_matches_query(text, text_query) { return None; } diff --git a/src/apps/desktop/src/computer_use/ui_locate_common.rs b/src/apps/desktop/src/computer_use/ui_locate_common.rs index e1ccb225..55a755e6 100644 --- a/src/apps/desktop/src/computer_use/ui_locate_common.rs +++ 
b/src/apps/desktop/src/computer_use/ui_locate_common.rs @@ -83,6 +83,28 @@ fn contains_ci(hay: &str, needle: &str) -> bool { hay.to_lowercase().contains(&needle.to_lowercase()) } +/// `role_substring` match with macOS AX aliases: chat apps often expose compose as **`AXTextField`** +/// while models ask for `TextArea`; treat those as overlapping for locate/click_element. +pub fn role_substring_matches_ax_role(ax_role: &str, want: &str) -> bool { + let w = want.trim(); + if w.is_empty() { + return true; + } + if contains_ci(ax_role, w) { + return true; + } + let wl = w.to_lowercase(); + match wl.as_str() { + "textarea" | "text area" | "text_area" | "axtextarea" => { + contains_ci(ax_role, "TextArea") || contains_ci(ax_role, "TextField") + } + "textfield" | "text field" | "text_field" | "axtextfield" => { + contains_ci(ax_role, "TextField") || contains_ci(ax_role, "TextArea") + } + _ => false, + } +} + fn combine_is_any(query: &UiElementLocateQuery) -> bool { matches!( query.filter_combine.as_deref(), @@ -102,7 +124,7 @@ pub fn matches_filters_any( if let Some(ref want) = query.role_substring { if !want.trim().is_empty() { has_filter = true; - if contains_ci(role.unwrap_or(""), want.trim()) { + if role_substring_matches_ax_role(role.unwrap_or(""), want.trim()) { matched = true; } } @@ -136,7 +158,7 @@ pub fn matches_filters_all( if let Some(ref want) = query.role_substring { if !want.trim().is_empty() { let r = role.unwrap_or(""); - if !contains_ci(r, want.trim()) { + if !role_substring_matches_ax_role(r, want.trim()) { return false; } } @@ -234,6 +256,23 @@ pub fn ok_result_with_context( }) } +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn role_textarea_alias_matches_axtextfield() { + assert!(role_substring_matches_ax_role("AXTextField", "TextArea")); + assert!(role_substring_matches_ax_role("AXTextField", "textarea")); + assert!(!role_substring_matches_ax_role("AXButton", "TextArea")); + } + + #[test] + fn role_textfield_alias_matches_axtextarea() { + 
assert!(role_substring_matches_ax_role("AXTextArea", "TextField")); + } +} + /// Whether an element's global bounds fall within any visible display. pub fn is_element_on_screen(gx: f64, gy: f64, width: f64, height: f64) -> bool { // Element must have reasonable size (not a giant container) diff --git a/src/crates/core/src/agentic/agents/prompts/claw_mode.md b/src/crates/core/src/agentic/agents/prompts/claw_mode.md index a4dc8079..0d0f49d4 100644 --- a/src/crates/core/src/agentic/agents/prompts/claw_mode.md +++ b/src/crates/core/src/agentic/agents/prompts/claw_mode.md @@ -11,7 +11,17 @@ Narrate only when it helps: multi-step work, complex/challenging problems, sensi Keep narration brief and value-dense; avoid repeating obvious steps. Use plain human language for narration unless in a technical context. When a first-class tool exists for an action, use the tool directly instead of asking the user to run equivalent CLI commands. -**Computer use (desktop automation):** If the user's request needs **more than one** ComputerUse call (or spans **multiple apps/windows**), first state a **short numbered plan**: (a) whether `Bash` applies (e.g. `open -a "AppName"` or osascript/PowerShell/xdotool), (b) whether **`key_chord`** / **`type_text`** can replace mouse steps (Enter, Escape, Tab, shortcuts), (c) which `click_element` / `move_to_text` / `locate` calls to try if pointing is required, (d) target app/window, (e) how you will verify focus. Then execute step-by-step. +**Computer use (desktop automation):** When doing desktop automation, prefer script/command-line automation where possible, but execute steps ONE AT A TIME (like you would with GUI automation), not in a single huge script. + +For script automation: +- **Step-by-step**: One simple script/command per step (e.g., activate app → open search → type name → press Enter, etc.) 
+- **macOS**: Use simple `osascript` commands (one per step), or `open -a "App"` +- **Windows**: Use simple `powershell`/`cmd` commands (one per step) +- **Linux**: Use simple `xdotool`/`wmctrl` commands (one per step) + +Only use ComputerUse when scripts can't do the job, or when you need visual confirmation. + +If the user's request needs **more than one** ComputerUse call (or spans **multiple apps/windows**), first state a **short numbered plan**: (a) whether **script automation applies** (one step at a time), (b) whether `Bash` applies (e.g. `open -a "AppName"`), (c) whether **`key_chord`** / **`type_text`** can replace mouse steps (Enter, Escape, Tab, shortcuts), (d) which `click_element` / `move_to_text` / `locate` calls to try if pointing is required, (e) target app/window, (f) how you will verify focus. Then execute step-by-step. # Session Coordination For complex coding tasks or office-style multi-step tasks, prefer multi-session coordination over doing everything in the current session. @@ -75,12 +85,12 @@ Everything is in one tool: **`ComputerUse`** with these actions: `click_element` **GUI automation (`ComputerUse`) is a fallback, not the default.** 1. **Direct command/script automation (HIGHEST PRIORITY)**: - - **macOS**: Use `osascript` (AppleScript or JavaScript for Automation) via `Bash` for app automation - - **Windows**: Use `powershell` or `cmd` via `Bash` for app automation - - **Linux**: Use `xdotool`, `ydotool`, `wmctrl`, etc. via `Bash` for window/UI automation + - **Step-by-step**: Execute one simple command/script per step, not a single huge script + - **macOS**: `osascript` (simple one-liners), `open -a "App"`, etc. + - **Windows**: `powershell`/`cmd` (simple one-liners), `start`, etc. + - **Linux**: `xdotool`, `ydotool`, `wmctrl` (simple one-liners), etc. - **App-specific CLI tools**: Use CLI versions of apps when available (e.g. `subl`, `code`, `git`, etc.) 
- - **Shell commands**: `open -a "App"` on macOS, `start` on Windows to launch/focus apps - - Prefer this over **any** GUI automation when a script/command can complete the task + - Prefer this over **any** GUI automation when a script/command can complete the task (one step at a time) 2. **`key_chord`** -- OS and app keyboard shortcuts; **Enter/Return/Escape/Tab/Space** and clipboard (copy/cut/paste). **Prefer over mouse** whenever a key completes the same step (see **Keyboard before mouse**). **No** mandatory screenshot before non-Enter chords (see Screenshot policy). @@ -157,8 +167,11 @@ The system automatically tracks your action history. If `loop_warning` appears i - **If stuck after trying alternatives:** explain what you attempted and ask the user for guidance rather than continuing to loop. ## Key rules +- **Script automation FIRST:** For common app tasks (sending messages, opening files, etc.), FIRST consider using scripts (osascript on macOS, PowerShell on Windows, xdotool on Linux) instead of multiple GUI automation steps -- run one simple command per step, not a single huge script. - **macOS apps:** Use `open -a "AppName"` via Bash to launch/focus, or `osascript` for more complex automation; not Spotlight. - **Foreground safety:** Check `computer_use_context.foreground_application` -- if wrong app is focused, fix focus first. `locate` and `click_element` search the **foreground** app only. +- **Multi-monitor safety:** If you have multiple displays and actions keep targeting the wrong screen, STOP and use a script (osascript/xdotool) or re-verify which app is on which display. +- **Minimize `wait`:** Use `wait` only when you explicitly need to wait for an app to launch or a UI to load. Do not add `wait` after every single action "just in case." 
- **Targeting order (when the pointer is required):** `click_element` → **`move_to_text`** (when text is visible) → **`click_label`** if SoM is already on a screenshot → **screenshot** drill / crop + **`mouse_move`** + **`click`** last. Apply **Keyboard before mouse** first -- do not use this order to click a control that **Enter** / **Escape** / focus keys could handle. - **Screenshot cadence:** Only when you need pixels, SoM, or a **fine** basis before guarded **`click`**; and always immediately before **`key_chord`** with Enter/Return (host). **Do not** treat `screenshot` as the default next step after every non-click action. - **No blind Enter:** Fresh `screenshot` required before `key_chord` with Return/Enter only (not before other chords). diff --git a/src/crates/core/src/agentic/tools/computer_use_host.rs b/src/crates/core/src/agentic/tools/computer_use_host.rs index f99fe108..27f546d8 100644 --- a/src/crates/core/src/agentic/tools/computer_use_host.rs +++ b/src/crates/core/src/agentic/tools/computer_use_host.rs @@ -416,6 +416,18 @@ pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { /// After `mouse_click`, require a fresh screenshot before the next click (unless pointer moved, which also invalidates). fn computer_use_after_click(&self) {} + /// After a committed UI action that should be **visually confirmed** on the next `screenshot` + /// (Cowork-style: observe → act → verify). Desktop sets a pending flag; cleared when `screenshot_display` runs. + fn computer_use_after_committed_ui_action(&self) {} + + /// After `move_to_text` positioned the pointer with **trusted global OCR coordinates** (not JPEG guesses), + /// clear the stale-capture guard so the next **`click`** or Enter **`key_chord`** may proceed without another `screenshot`. 
+ fn computer_use_trust_pointer_after_ocr_move(&self) {} + + /// After `type_text`: the pointer did not move; clear the stale-capture guard so Enter **`key_chord`** + /// is not blocked solely because of a prior click / scroll. The desktop host clears the same flag that guarded **`click`** checks first, so the stale-capture check for **`click`** is cleared as well. + fn computer_use_trust_pointer_after_text_input(&self) {} + /// Refuse `mouse_click` if the pointer moved (or a click happened) since the last screenshot, /// or if the latest capture is not a valid “fine” basis (desktop: ~500×500 point crop **or** /// quadrant navigation region with longest side < [`COMPUTER_USE_QUADRANT_CLICK_READY_MAX_LONG_EDGE`]). @@ -555,6 +567,10 @@ pub struct ComputerUseInteractionState { pub enter_ready: bool, pub requires_fresh_screenshot_before_click: bool, pub requires_fresh_screenshot_before_enter: bool, + /// When true, the last action (click, key, typing, scroll, etc.) changed the UI; take **`screenshot`** + /// next to **confirm** the outcome (Cowork-style verify step), ideally after **`wait`** if the UI animates. + #[serde(default, skip_serializing_if = "is_false")] + pub recommend_screenshot_to_verify_last_action: bool, #[serde(default, skip_serializing_if = "Option::is_none")] pub last_screenshot_kind: Option, #[serde(default, skip_serializing_if = "Option::is_none")] @@ -576,6 +592,7 @@ mod tests { enter_ready: true, requires_fresh_screenshot_before_click: true, requires_fresh_screenshot_before_enter: false, + recommend_screenshot_to_verify_last_action: true, last_screenshot_kind: Some(ComputerUseInteractionScreenshotKind::FullDisplay), last_mutation: Some(ComputerUseLastMutationKind::Screenshot), recommended_next_action: Some("screenshot_navigate_quadrant".to_string()), @@ -602,5 +619,9 @@ mod tests { value["recommended_next_action"], serde_json::json!("screenshot_navigate_quadrant") ); + assert_eq!( + value["recommend_screenshot_to_verify_last_action"], + serde_json::json!(true) + ); } } diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs 
b/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs index 541c7e8b..07256460 100644 --- a/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs @@ -57,6 +57,7 @@ mod tests { enter_ready: true, requires_fresh_screenshot_before_click: true, requires_fresh_screenshot_before_enter: false, + recommend_screenshot_to_verify_last_action: true, last_screenshot_kind: Some(ComputerUseInteractionScreenshotKind::FullDisplay), last_mutation: None, recommended_next_action: Some("screenshot_navigate_quadrant".to_string()), @@ -70,6 +71,10 @@ mod tests { body["interaction_state"]["recommended_next_action"], json!("screenshot_navigate_quadrant") ); + assert_eq!( + body["interaction_state"]["recommend_screenshot_to_verify_last_action"], + json!(true) + ); } #[test] @@ -104,6 +109,7 @@ mod tests { enter_ready: true, requires_fresh_screenshot_before_click: true, requires_fresh_screenshot_before_enter: false, + recommend_screenshot_to_verify_last_action: false, last_screenshot_kind: Some(ComputerUseInteractionScreenshotKind::FullDisplay), last_mutation: None, recommended_next_action: Some("screenshot_navigate_quadrant".to_string()), diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs index 7825fb92..3dc9ecbe 100644 --- a/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs @@ -7,7 +7,7 @@ use super::computer_use_input::{ use super::computer_use_locate::execute_computer_use_locate; use crate::agentic::tools::computer_use_capability::computer_use_desktop_available; use crate::agentic::tools::computer_use_host::{ - ComputerScreenshot, ComputerUseNavigateQuadrant, ComputerUseScreenshotRefinement, + ComputerScreenshot, ComputerUseHost, ComputerUseNavigateQuadrant, 
ComputerUseScreenshotRefinement, OcrRegionNative, ScreenshotCropCenter, UiElementLocateQuery, COMPUTER_USE_POINT_CROP_HALF_MAX, COMPUTER_USE_POINT_CROP_HALF_MIN, COMPUTER_USE_QUADRANT_CLICK_READY_MAX_LONG_EDGE, COMPUTER_USE_QUADRANT_EDGE_EXPAND_PX, @@ -377,7 +377,8 @@ impl ComputerUseTool { "som_label_note": som_note, }); let shortcut_policy = format!( - "**Targeting priority:** `click_element` → **`move_to_text`** (OCR + move; no prior `screenshot` for targeting) → **`click_label`** if SoM exists on a shot → **`screenshot`** (confirm / drill) + **`mouse_move`** (**`use_screen_coordinates`: true only**) + **`click`** last. **Screenshots are for confirmation and navigation — do not guess move targets from JPEG pixels.** **`click`** never moves the pointer. **Host-only mandatory screenshot:** before **`click`** or Enter **`key_chord`** when the pointer changed since the last capture — **not** before `mouse_move`, `scroll`, `type_text`, `locate`, `wait`, or non-Enter `key_chord`. **Valid basis for a guarded `click`:** `FullDisplay`, `quadrant_navigation_click_ready`, or point crop; or bare **`screenshot`** after a pointer-changing action (**~500×500** implicit confirmation around mouse/caret). **`mouse_move`** must use **global** coordinates (from `move_to_text` global_center_*, `locate`, AX, or `pointer_global`). **Bare confirmation `screenshot`:** whenever the host still requires a capture before **`click`** or Enter **`key_chord`** (`requires_fresh_screenshot_*`), a bare `screenshot` (no crop / no reset) is **~500×500** centered on **mouse** (`screenshot_implicit_center` default `mouse`) — **including during quadrant drill** and the **first** such capture in a session. Before Enter in a text field, set **`screenshot_implicit_center`: `text_caret`**. Use **`screenshot_reset_navigation`**: true for a **full-screen** capture instead. **If AX failed:** try **`move_to_text`** before a long screenshot drill. 
**Optional refinement** for tiny targets: `screenshot_navigate_quadrant` until `quadrant_navigation_click_ready` (long edge < {} px) or point crop. Small moves: **ComputerUseMouseStep** over tiny **ComputerUseMousePrecise** (screen globals only).", + "**Verify step:** after **`click`**, **`key_chord`**, **`type_text`**, **`scroll`**, or **`drag`**, check **`interaction_state.recommend_screenshot_to_verify_last_action`** — when true, call **`screenshot`** next to confirm UI state (Cowork-style). \ +**Targeting priority:** `click_element` → **`move_to_text`** (OCR + move; no prior `screenshot` for targeting) → **`click_label`** if SoM exists on a shot → **`screenshot`** (confirm / drill) + **`mouse_move`** (**`use_screen_coordinates`: true only**) + **`click`** last. **Screenshots are for confirmation and navigation — do not guess move targets from JPEG pixels.** **`click`** never moves the pointer. **Host-only mandatory screenshot:** before **`click`** or Enter **`key_chord`** when the pointer changed since the last capture — **not** before `mouse_move`, `scroll`, `type_text`, `locate`, `wait`, or non-Enter `key_chord`. **Valid basis for a guarded `click`:** `FullDisplay`, `quadrant_navigation_click_ready`, or point crop; or bare **`screenshot`** after a pointer-changing action (**~500×500** implicit confirmation around mouse/caret). **`mouse_move`** must use **global** coordinates (from `move_to_text` global_center_*, `locate`, AX, or `pointer_global`). **Bare confirmation `screenshot`:** whenever the host still requires a capture before **`click`** or Enter **`key_chord`** (`requires_fresh_screenshot_*`), a bare `screenshot` (no crop / no reset) is **~500×500** centered on **mouse** (`screenshot_implicit_center` default `mouse`) — **including during quadrant drill** and the **first** such capture in a session. Before Enter in a text field, set **`screenshot_implicit_center`: `text_caret`**. 
Use **`screenshot_reset_navigation`**: true for a **full-screen** capture instead. **If AX failed:** try **`move_to_text`** before a long screenshot drill. **Optional refinement** for tiny targets: `screenshot_navigate_quadrant` until `quadrant_navigation_click_ready` (long edge < {} px) or point crop. Small moves: **ComputerUseMouseStep** over tiny **ComputerUseMousePrecise** (screen globals only).", COMPUTER_USE_QUADRANT_CLICK_READY_MAX_LONG_EDGE ); let region_crop_size_note = shot @@ -844,17 +845,18 @@ impl Tool for ComputerUseTool { let keys = Self::key_chord_os_hint(); Ok(format!( "Desktop automation (host OS: {}). {} All actions in one tool. Send only parameters that apply to the chosen `action`. \ +**Cowork-style loop (default rhythm):** **`screenshot`** (observe current UI) → **one** input action (`key_chord`, `type_text`, `move_to_text` + `click`, `click_element`, `scroll`, …) → **`screenshot`** again to **verify** the outcome before the next decision. Use **`wait`** if the UI animates. When **`interaction_state.recommend_screenshot_to_verify_last_action`** is true, your **next** call should usually be **`screenshot`** (optionally **`screenshot_reset_navigation`**: true for full-screen context). This is separate from the host rule that requires a **fresh** capture **before** guarded **`click`** / Enter **`key_chord`** after pointer moves — follow both. \ **Input priority:** Prefer **`key_chord`** / **`type_text`** over mouse when one key or typing completes the step (e.g. **Enter** to confirm default, **Escape** to cancel, **Tab** to move focus). Do not click “OK”/“Submit” when **Enter** is equivalent; use **`screenshot`** then Enter **`key_chord`** per host when required. \ **Targeting priority (when pointing is required):** `click_element` → **`move_to_text`** (OCR + move pointer only) → `click_label` (when SoM exists) → **`screenshot`** (confirm / drill) + **`mouse_move`** (**`use_screen_coordinates`: true only**) + **`click`** last. 
**Screenshots are for confirmation — do not guess move targets from JPEG pixels.** \ -**`click_element`:** Accessibility tree (AX/UIA/AT-SPI) locate + click. Provide `title_contains` / `role_substring` / `identifier_contains`. Bypasses coordinate screenshot guard. \ -**`move_to_text`:** OCR-match visible text (`text_query`) and **move the pointer** to it (no click, no keys); **no prior `screenshot` required for targeting** (host captures **raw** pixels for Vision — no agent screenshot overlays; on macOS defaults to the **frontmost window** unless **`ocr_region_native`** overrides). If **several** hits match, the host returns **preview JPEGs + accessibility** per candidate — pick **`move_to_text_match_index`** (1-based) and call **`move_to_text` again** with the same query/region. Use **`click`** afterward if you need a mouse press. Prefer after `click_element` misses when text is visible. \ +**`click_element`:** Accessibility tree (AX/UIA/AT-SPI) locate + click. Provide `title_contains` / `role_substring` / `identifier_contains`. On macOS, **`TextArea`** and **`TextField`** match both `AXTextArea` and `AXTextField` (many chat apps use TextField for compose). If several text fields match, the host deprioritizes known **search** controls (e.g. WeChat `_SC_SEARCH_FIELD`) and prefers **lower** on-screen fields (composer). Bypasses coordinate screenshot guard. \ +**`move_to_text`:** OCR-match visible text (`text_query`) and **move the pointer** to it (no click, no keys); **no prior `screenshot` required for targeting** (host captures **raw** pixels for Vision — no agent screenshot overlays; on macOS defaults to the **frontmost window** unless **`ocr_region_native`** overrides). Matching **strips whitespace** between CJK glyphs and allows **small edit distance** when Vision mis-reads one character. The host **trusts** the resulting globals — **next `click`** does **not** require an extra `screenshot` (same as AX). 
If **several** hits match, the host returns **preview JPEGs + accessibility** per candidate — pick **`move_to_text_match_index`** (1-based) and call **`move_to_text` again** with the same query/region, or narrow with **`ocr_region_native`**. When **`click_label`** lists a results table (`AXTable`), prefer **`click_label`** on that label over guessing OCR row text. Use **`click`** afterward if you need a mouse press. Prefer after `click_element` misses when text is visible. \ **`click_label`:** After `screenshot` with `som_labels`, click by label number. Bypasses coordinate guard. \ **`click`:** Press at **current pointer only** — **never** pass `x`, `y`, `coordinate_mode`, or `use_screen_coordinates`. Position first with **`move_to_text`**, **`mouse_move`** (**globals only**), or **`click_element`**. After pointer moves, **`screenshot`** again before the next guarded **`click`** when the host requires it. \ **`mouse_move` / `drag`:** **`use_screen_coordinates`: true** required — global coordinates from **`move_to_text`**, **`locate`**, AX, or **`pointer_global`**; never JPEG pixel guesses. \ **`scroll` / `type_text` / `pointer_move_rel` / `wait` / `locate`:** No mandatory pre-screenshot by themselves. **`pointer_move_rel`** (and **ComputerUseMouseStep**) are **blocked immediately after `screenshot`** until **`move_to_text`**, **`mouse_move`** (globals), **`click_element`**, or **`click_label`** — do not nudge from the JPEG. \ **`key_chord`:** Press key combination; prefer over **`click`** when shortcuts or **Enter**/**Escape**/**Tab** suffice. **Mandatory fresh screenshot only** when chord includes Return/Enter. \ **`screenshot`:** JPEG for **confirmation** (optional pointer + SoM). When the host requires a fresh capture before **`click`** or Enter **`key_chord`**, a bare `screenshot` is **~500×500** around the **mouse** or **caret** (also during quadrant drill). Use **`screenshot_reset_navigation`**: true to force **full-screen** for wide context. 
\ -**`type_text`:** Type text; prefer clipboard for long content.", +**`type_text`:** Type text; prefer clipboard for long content. Does **not** move the pointer — **Enter** **`key_chord`** may follow without a mandatory `screenshot` unless you moved the pointer since the last capture. If **`screenshot`** shows the correct chat is already open and the input may be focused, **try `type_text` first** before spending steps on `click_element` / `move_to_text`.", os, keys, )) } @@ -887,7 +889,7 @@ impl Tool for ComputerUseTool { "start_y": { "type": "integer", "description": "For `drag`: start Y coordinate." }, "end_x": { "type": "integer", "description": "For `drag`: end X coordinate." }, "end_y": { "type": "integer", "description": "For `drag`: end Y coordinate." }, - "keys": { "type": "array", "items": { "type": "string" }, "description": "For `key_chord`: key names to press together. Use OS-appropriate modifier names. Host requires a fresh screenshot only before chords that include Return/Enter (not before other chords)." }, + "keys": { "type": "array", "items": { "type": "string" }, "description": "For `key_chord`: keys in order — **modifiers first**, then the main key (e.g. `[\"command\",\"f\"]`). Desktop host waits after pressing modifiers so shortcuts register (important on macOS with IME). Modifiers: command, control, shift, alt/option. Arrows: `up`, `down`, … Host may require a fresh screenshot before Return/Enter when the pointer is stale." }, "text": { "type": "string", "description": "For `type_text`: text to type. Prefer clipboard paste (key_chord) for long content." }, "ms": { "type": "integer", "description": "For `wait`: duration in milliseconds." }, "label": { "type": "integer", "minimum": 1, "description": "For `click_label`: 1-based Set-of-Mark label number from the latest screenshot." 
}, @@ -1166,6 +1168,7 @@ impl Tool for ComputerUseTool { host_ref .mouse_move_global_f64(matched.center_x, matched.center_y) .await?; + ComputerUseHost::computer_use_trust_pointer_after_ocr_move(host_ref); let other_matches = matches .iter() @@ -1212,7 +1215,7 @@ impl Tool for ComputerUseTool { ) .await; let summary = format!( - "OCR move_to_text: matched {:?} at ({:.0}, {:.0}) [index {} of {}].", + "OCR move_to_text: matched {:?} at ({:.0}, {:.0}) [index {} of {}]. Pointer is from trusted global OCR — you may **`click`** next without a separate **`screenshot`** (host clears stale-capture guard).", matched.text, matched.center_x, matched.center_y, @@ -1337,6 +1340,7 @@ impl Tool for ComputerUseTool { host_ref.mouse_move_global_f64(sx1, sy1).await?; host_ref.wait_ms(50).await?; host_ref.mouse_up(button).await?; + ComputerUseHost::computer_use_after_committed_ui_action(host_ref); let input_coords = json!({ "kind": "drag", diff --git a/src/web-ui/src/locales/en-US/settings/session-config.json b/src/web-ui/src/locales/en-US/settings/session-config.json index 9c5cbde4..0d0cf02c 100644 --- a/src/web-ui/src/locales/en-US/settings/session-config.json +++ b/src/web-ui/src/locales/en-US/settings/session-config.json @@ -14,7 +14,7 @@ }, "computerUse": { "sectionTitle": "Computer use (Claw)", - "sectionDescription": "Let the assistant capture the screen and control the mouse and keyboard.", + "sectionDescription": "In the BitFun desktop app, the assistant can capture the screen and control the mouse and keyboard; requires a multimodal model for vision.", "enable": "Enable Computer use", "enableDesc": "When off, the ComputerUse tool stays disabled even in Claw mode.", "accessibility": "Accessibility", diff --git a/src/web-ui/src/locales/zh-CN/settings/session-config.json b/src/web-ui/src/locales/zh-CN/settings/session-config.json index c84250df..9d611335 100644 --- a/src/web-ui/src/locales/zh-CN/settings/session-config.json +++ 
b/src/web-ui/src/locales/zh-CN/settings/session-config.json @@ -14,7 +14,7 @@ }, "computerUse": { "sectionTitle": "Computer use(助理 Claw)", - "sectionDescription": "在 BitFun 桌面端允许助理截取屏幕并控制键鼠。", + "sectionDescription": "在 BitFun 桌面端允许助理截取屏幕并控制键鼠;需多模态模型理解画面。", "enable": "启用 Computer use", "enableDesc": "关闭时,即使在 Claw 模式下也不会启用 ComputerUse 工具。", "accessibility": "辅助功能",