Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 72 additions & 14 deletions src/apps/desktop/src/computer_use/desktop_host.rs
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,12 @@ struct ComputerUseSessionMutableState {
/// After `screenshot`, block `pointer_move_rel` / `ComputerUseMouseStep` until an absolute move
/// from AX/OCR/globals (`mouse_move`, `move_to_text`, `click_element`, `click_label`) clears this.
block_vision_pixel_nudge_after_screenshot: bool,
/// After click / key / type / scroll / drag: recommend a **`screenshot`** to confirm UI state (Cowork verify).
/// Cleared on the next successful `screenshot_display`.
pending_verify_screenshot: bool,
/// After `move_to_text` (global OCR coordinates): next guarded **`click`** may run without a prior
/// `screenshot_display` / fine-crop basis — same idea as `click_element` relaxed guard.
pointer_trusted_after_ocr_move: bool,
/// Action optimizer for loop detection, history, and visual verification.
optimizer: ComputerUseOptimizer,
}
Expand All @@ -752,6 +758,8 @@ impl ComputerUseSessionMutableState {
navigation_focus: None,
screenshot_cache: None,
block_vision_pixel_nudge_after_screenshot: false,
pending_verify_screenshot: false,
pointer_trusted_after_ocr_move: false,
optimizer: ComputerUseOptimizer::new(),
}
}
Expand All @@ -767,17 +775,27 @@ impl ComputerUseSessionMutableState {
self.last_shot_refinement = Some(refinement);
self.navigation_focus = nav_focus;
self.click_needs_fresh_screenshot = false;
self.pending_verify_screenshot = false;
self.pointer_trusted_after_ocr_move = false;
self.block_vision_pixel_nudge_after_screenshot = true;
}

/// Records that the pointer position was changed by a pointer mutation.
///
/// Drops any trust granted after an OCR-based move and marks the next guarded
/// click as needing a fresh screenshot. The verify-screenshot flag is left
/// untouched here.
fn transition_after_pointer_mutation(&mut self) {
    // The two flags are independent; order does not matter.
    self.pointer_trusted_after_ocr_move = false;
    self.click_needs_fresh_screenshot = true;
}

/// Records that a click was performed.
///
/// Has the same freshness effect as a pointer mutation (the next click needs a
/// fresh screenshot and OCR-move trust is dropped), and additionally flags that
/// a screenshot is recommended to verify the result of the click.
fn transition_after_click(&mut self) {
    self.pointer_trusted_after_ocr_move = false;
    self.pending_verify_screenshot = true;
    self.click_needs_fresh_screenshot = true;
}

/// Called after key, typing, scroll, or drag — UI likely changed; next `screenshot` should confirm.
///
/// Only raises `pending_verify_screenshot`; it does not modify
/// `click_needs_fresh_screenshot` or `pointer_trusted_after_ocr_move`.
fn transition_after_committed_ui_action(&mut self) {
self.pending_verify_screenshot = true;
}
}

Expand Down Expand Up @@ -1040,10 +1058,10 @@ end tell"#])
"space" => Key::Space,
"backspace" => Key::Backspace,
"delete" => Key::Delete,
"up" => Key::UpArrow,
"down" => Key::DownArrow,
"left" => Key::LeftArrow,
"right" => Key::RightArrow,
"up" | "arrow_up" | "arrowup" => Key::UpArrow,
"down" | "arrow_down" | "arrowdown" => Key::DownArrow,
"left" | "arrow_left" | "arrowleft" => Key::LeftArrow,
"right" | "arrow_right" | "arrowright" => Key::RightArrow,
"home" => Key::Home,
"end" => Key::End,
"pageup" | "page_up" => Key::PageUp,
Expand Down Expand Up @@ -1938,8 +1956,9 @@ impl ComputerUseHost for DesktopComputerUseHost {
let s = self.state.lock().unwrap();
let last_ref = s.last_shot_refinement;
let click_needs_fresh = s.click_needs_fresh_screenshot;
let pending_verify = s.pending_verify_screenshot;

let (click_ready, screenshot_kind, recommended_next_action) = match last_ref {
let (click_ready, screenshot_kind, mut recommended_next_action) = match last_ref {
Some(ComputerUseScreenshotRefinement::RegionAroundPoint { .. }) => (
!click_needs_fresh,
Some(ComputerUseInteractionScreenshotKind::RegionCrop),
Expand Down Expand Up @@ -1967,11 +1986,16 @@ impl ComputerUseHost for DesktopComputerUseHost {
None => (false, None, Some("screenshot".to_string())),
};

if pending_verify && recommended_next_action.is_none() {
recommended_next_action = Some("screenshot".to_string());
}

ComputerUseInteractionState {
click_ready,
enter_ready: !click_needs_fresh,
requires_fresh_screenshot_before_click: click_needs_fresh,
requires_fresh_screenshot_before_enter: click_needs_fresh,
recommend_screenshot_to_verify_last_action: pending_verify,
last_screenshot_kind: screenshot_kind,
last_mutation: None,
recommended_next_action,
Expand Down Expand Up @@ -2464,6 +2488,7 @@ impl ComputerUseHost for DesktopComputerUseHost {
.await
.map_err(|e| BitFunError::tool(e.to_string()))??;
ComputerUseHost::computer_use_after_pointer_mutation(self);
ComputerUseHost::computer_use_after_committed_ui_action(self);
Ok(())
}

Expand All @@ -2482,7 +2507,6 @@ impl ComputerUseHost for DesktopComputerUseHost {
.iter()
.map(|s| Self::map_key(s))
.collect::<BitFunResult<_>>()?;
#[cfg(target_os = "macos")]
let chord_has_modifier = keys_for_job.iter().any(|s| {
matches!(
s.to_lowercase().as_str(),
Expand All @@ -2493,28 +2517,37 @@ impl ComputerUseHost for DesktopComputerUseHost {
e.key(mapped[0], Direction::Click)
.map_err(|err| BitFunError::tool(format!("key: {}", err)))?;
} else {
for k in &mapped[..mapped.len() - 1] {
let mods = &mapped[..mapped.len() - 1];
let last = *mapped.last().unwrap();
for k in mods {
e.key(*k, Direction::Press)
.map_err(|err| BitFunError::tool(format!("key press: {}", err)))?;
}
let last = *mapped.last().unwrap();
if chord_has_modifier {
// Modifiers must be registered before the main key; otherwise macOS / IME
// treats the letter as plain typing (e.g. Cmd+F becomes "f" in the text box).
#[cfg(target_os = "macos")]
std::thread::sleep(std::time::Duration::from_millis(160));
#[cfg(not(target_os = "macos"))]
std::thread::sleep(std::time::Duration::from_millis(55));
}
e.key(last, Direction::Click)
.map_err(|err| BitFunError::tool(format!("key click: {}", err)))?;
for k in mapped[..mapped.len() - 1].iter().rev() {
for k in mods.iter().rev() {
e.key(*k, Direction::Release)
.map_err(|err| BitFunError::tool(format!("key release: {}", err)))?;
}
}
#[cfg(target_os = "macos")]
if chord_has_modifier {
std::thread::sleep(std::time::Duration::from_millis(95));
if chord_has_modifier {
std::thread::sleep(std::time::Duration::from_millis(35));
}
}
Ok(())
})
})
.await
.map_err(|e| BitFunError::tool(e.to_string()))??;
ComputerUseHost::computer_use_after_pointer_mutation(self);
ComputerUseHost::computer_use_after_committed_ui_action(self);
Ok(())
}

Expand All @@ -2531,7 +2564,9 @@ impl ComputerUseHost for DesktopComputerUseHost {
})
.await
.map_err(|e| BitFunError::tool(e.to_string()))??;
ComputerUseHost::computer_use_after_pointer_mutation(self);
// Typing does not move the pointer; do not set click_needs (would block Enter after search).
ComputerUseHost::computer_use_after_committed_ui_action(self);
ComputerUseHost::computer_use_trust_pointer_after_text_input(self);
Ok(())
}

Expand Down Expand Up @@ -2562,6 +2597,26 @@ impl ComputerUseHost for DesktopComputerUseHost {
}
}

/// Notifies the session state that a UI-changing action was committed.
///
/// Best-effort: if `self.state.lock()` returns an error the call is a silent no-op.
fn computer_use_after_committed_ui_action(&self) {
    if let Ok(mut session) = self.state.lock() {
        session.transition_after_committed_ui_action();
    }
}

/// Marks the current pointer position as trusted following an OCR-based move.
///
/// Sets `pointer_trusted_after_ocr_move` and clears `click_needs_fresh_screenshot`,
/// so the click guard's stale-capture check no longer rejects the next click.
/// Best-effort: if `self.state.lock()` returns an error the call is a silent no-op.
fn computer_use_trust_pointer_after_ocr_move(&self) {
    if let Ok(mut session) = self.state.lock() {
        session.pointer_trusted_after_ocr_move = true;
        // `mouse_move` already set click_needs; OCR globals are authoritative like AX.
        session.click_needs_fresh_screenshot = false;
    }
}

/// Clears the "click needs a fresh screenshot" requirement after text input.
///
/// Only `click_needs_fresh_screenshot` is reset; no other session flag is touched.
/// Best-effort: if `self.state.lock()` returns an error the call is a silent no-op.
fn computer_use_trust_pointer_after_text_input(&self) {
    if let Ok(mut session) = self.state.lock() {
        session.click_needs_fresh_screenshot = false;
    }
}

fn computer_use_guard_click_allowed(&self) -> BitFunResult<()> {
let s = self
.state
Expand All @@ -2570,6 +2625,9 @@ impl ComputerUseHost for DesktopComputerUseHost {
if s.click_needs_fresh_screenshot {
return Err(BitFunError::tool(STALE_CAPTURE_TOOL_MESSAGE.to_string()));
}
if s.pointer_trusted_after_ocr_move {
return Ok(());
}
match s.last_shot_refinement {
Some(ComputerUseScreenshotRefinement::RegionAroundPoint { .. }) => {}
Some(ComputerUseScreenshotRefinement::QuadrantNavigation {
Expand Down
35 changes: 32 additions & 3 deletions src/apps/desktop/src/computer_use/macos_ax_ui.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,20 @@ impl CandidateMatch {
score += 20;
}

// WeChat (and similar): global search field is often the first AXTextField match but is the wrong target
// when the user wants the **chat composer**. Deprioritize known search chrome.
if let Some(ref id) = self.identifier {
if id.contains("_SC_SEARCH_FIELD") {
score -= 1500;
}
}

// Among text inputs, the composer is usually **lower** on screen than the top search bar.
let rl = self.role.to_lowercase();
if rl.contains("textfield") || rl.contains("textarea") {
score += ((self.gy / 8.0) as i64).clamp(0, 400);
}

score
}

Expand Down Expand Up @@ -352,13 +366,28 @@ pub fn locate_ui_element_center(query: &UiElementLocateQuery) -> BitFunResult<Ui

if candidates.is_empty() {
return Err(BitFunError::tool(
"No accessibility element matched in the frontmost app. Tips: use `filter_combine: \"any\"` for OR matching; use only `role_substring` or only `title_contains`; match UI language; ensure the target app is focused. Or fall back to `screenshot` + vision path."
"No accessibility element matched in the frontmost app. Tips: `role_substring` **`TextArea`** also matches **`AXTextField`** (WeChat compose is often TextField). Use `filter_combine: \"any\"` for OR matching; match UI language; ensure the target app is focused. For chat apps, if the conversation is already open, **`type_text`** may work without clicking. Or use `move_to_text` / `screenshot` + `click_label`."
.to_string(),
));
}

// Sort by rank score (descending)
candidates.sort_by(|a, b| b.rank_score().cmp(&a.rank_score()));
// Sort by rank score (descending); tie-break text fields toward **lower on screen** (chat input).
candidates.sort_by(|a, b| {
let sa = a.rank_score();
let sb = b.rank_score();
match sb.cmp(&sa) {
std::cmp::Ordering::Equal => {
let a_txt = a.role.contains("TextField") || a.role.contains("TextArea");
let b_txt = b.role.contains("TextField") || b.role.contains("TextArea");
if a_txt && b_txt {
b.gy.partial_cmp(&a.gy).unwrap_or(std::cmp::Ordering::Equal)
} else {
std::cmp::Ordering::Equal
}
}
o => o,
}
});

let total = candidates.len() as u32;
let best = &candidates[0];
Expand Down
Loading
Loading