diff --git a/src/apps/desktop/src/api/ssh_api.rs b/src/apps/desktop/src/api/ssh_api.rs
index 8572cae8..ab177059 100644
--- a/src/apps/desktop/src/api/ssh_api.rs
+++ b/src/apps/desktop/src/api/ssh_api.rs
@@ -6,7 +6,7 @@ use tauri::State;

 use bitfun_core::service::remote_ssh::{
     SSHConnectionConfig, SSHConnectionResult, SavedConnection, RemoteTreeNode,
-    SSHConfigLookupResult, SSHConfigEntry,
+    SSHConfigLookupResult, SSHConfigEntry, ServerInfo,
 };
 use crate::api::app_state::SSHServiceError;
 use crate::AppState;
@@ -119,6 +119,20 @@ pub async fn ssh_is_connected(
     Ok(is_connected)
 }

+/// Tauri command: look up [`ServerInfo`] for the SSH connection `connection_id`.
+///
+/// Forwards to the SSH manager's `resolve_remote_home_if_missing` and returns its
+/// result unchanged (`Ok(None)` when it yields nothing). The only error path is
+/// failing to obtain the SSH manager from `state`.
+#[tauri::command]
+pub async fn ssh_get_server_info(
+    state: State<'_, AppState>,
+    connection_id: String,
+) -> Result<Option<ServerInfo>, String> {
+    let manager = state.get_ssh_manager_async().await?;
+    Ok(manager.resolve_remote_home_if_missing(&connection_id).await)
+}
+
 #[tauri::command]
 pub async fn ssh_get_config(
     state: State<'_, AppState>,
diff --git a/src/apps/desktop/src/computer_use/desktop_host.rs b/src/apps/desktop/src/computer_use/desktop_host.rs
index ea91577c..d016d28d 100644
--- a/src/apps/desktop/src/computer_use/desktop_host.rs
+++ b/src/apps/desktop/src/computer_use/desktop_host.rs
@@ -1161,6 +1161,38 @@ end tell"#])
         }
     }

+    /// Square region in global logical coordinates for raw OCR preview crops around `(cx, cy)`.
+    ///
+    /// The top-left corner is `floor(center - half)` on each axis and the side length is
+    /// `2 * half` (saturating, at least 1).
+    ///
+    /// # Errors
+    /// Returns a tool error when `cx` or `cy` is NaN or infinite, instead of casting such a
+    /// value to a meaningless corner.
+    fn ocr_region_square_around_point(
+        cx: f64,
+        cy: f64,
+        half: u32,
+    ) -> BitFunResult<OcrRegionNative> {
+        if !cx.is_finite() || !cy.is_finite() {
+            return Err(BitFunError::tool(format!(
+                "OCR preview center must be finite, got ({}, {}).",
+                cx, cy
+            )));
+        }
+        let hh = f64::from(half);
+        // Float-to-int `as` casts saturate, so far-off-screen centers clamp to the i32 range.
+        let x0 = (cx - hh).floor() as i32;
+        let y0 = (cy - hh).floor() as i32;
+        let side = half.saturating_mul(2).max(1);
+        Ok(OcrRegionNative {
+            x0,
+            y0,
+            width: side,
+            height: side,
+        })
+    }
+
     /// Capture **raw** display pixels (no pointer/SoM overlay), cropped to `region` intersected with the chosen display.
     ///
     /// `region` and [`DisplayInfo::width`]/[`height`] are **global logical points** (CG / AX). The framebuffer
@@ -2124,6 +2156,58 @@ impl ComputerUseHost for DesktopComputerUseHost {
             .collect())
     }

+    /// Hit-tests the platform accessibility tree at the global point `(gx, gy)`.
+    ///
+    /// macOS and Windows run their blocking backend on the blocking thread pool; every other
+    /// target has no backend here and reports `Ok(None)`.
+    async fn accessibility_hit_at_global_point(
+        &self,
+        gx: f64,
+        gy: f64,
+    ) -> BitFunResult<Option<bitfun_core::agentic::tools::computer_use_host::OcrAccessibilityHit>>
+    {
+        #[cfg(target_os = "macos")]
+        {
+            let hit = tokio::task::spawn_blocking(move || {
+                crate::computer_use::macos_ax_ui::accessibility_hit_at_global_point(gx, gy)
+            })
+            .await
+            .map_err(|e| BitFunError::tool(e.to_string()))?;
+            return Ok(hit);
+        }
+        #[cfg(target_os = "windows")]
+        {
+            return tokio::task::spawn_blocking(move || {
+                crate::computer_use::windows_ax_ui::accessibility_hit_at_global_point(gx, gy)
+            })
+            .await
+            .map_err(|e| BitFunError::tool(e.to_string()))?;
+        }
+        // Linux and any other target: one shared fallback instead of two identical blocks.
+        #[cfg(not(any(target_os = "macos", target_os = "windows")))]
+        {
+            let _ = (gx, gy);
+            Ok(None)
+        }
+    }
+
+    /// Captures raw display pixels for the square of half-extent `half_extent_native` centered
+    /// on `(gx, gy)` and returns the captured image bytes.
+    ///
+    /// The capture runs on the blocking thread pool; join and capture errors are propagated.
+    async fn ocr_preview_crop_jpeg(
+        &self,
+        gx: f64,
+        gy: f64,
+        half_extent_native: u32,
+    ) -> BitFunResult<Vec<u8>> {
+        let region = Self::ocr_region_square_around_point(gx, gy, half_extent_native)?;
+        let shot = tokio::task::spawn_blocking(move || Self::screenshot_raw_native_region(region))
+            .await
+            .map_err(|e| BitFunError::tool(e.to_string()))??;
+        Ok(shot.bytes)
+    }
+
     fn last_screenshot_refinement(&self) -> Option {
         self.state.lock().ok().and_then(|s| s.last_shot_refinement)
     }
diff --git a/src/apps/desktop/src/computer_use/macos_ax_ui.rs b/src/apps/desktop/src/computer_use/macos_ax_ui.rs
index cd969cd7..f1722452 100644
--- a/src/apps/desktop/src/computer_use/macos_ax_ui.rs
+++ b/src/apps/desktop/src/computer_use/macos_ax_ui.rs
@@ -3,7 +3,9 @@
 //! Coordinates match CoreGraphics global space used by [`crate::computer_use::DesktopComputerUseHost`].

 use crate::computer_use::ui_locate_common;
-use bitfun_core::agentic::tools::computer_use_host::{SomElement, UiElementLocateQuery, UiElementLocateResult};
+use bitfun_core::agentic::tools::computer_use_host::{
+    OcrAccessibilityHit, SomElement, UiElementLocateQuery, UiElementLocateResult,
+};
 use bitfun_core::util::errors::{BitFunError, BitFunResult};
 use core_foundation::array::{CFArray, CFArrayRef};
 use core_foundation::base::{CFTypeRef, TCFType};
@@ -24,6 +26,14 @@ unsafe extern "C" {
         attribute: CFStringRef,
         value: *mut CFTypeRef,
     ) -> i32;
+    // Hit-test: writes the element under screen point (x, y) to `out_elem`; the caller owns
+    // (must release) the returned reference. Returns an `AXError` code (0 = success).
+    fn AXUIElementCopyElementAtPosition(
+        element: AXUIElementRef,
+        x: f32,
+        y: f32,
+        out_elem: *mut AXUIElementRef,
+    ) -> i32;
     fn AXValueGetType(value: AXValueRef) -> u32;
     fn AXValueGetValue(value: AXValueRef, the_type: u32, ptr: *mut c_void) -> bool;
 }
@@ -489,6 +499,65 @@ pub fn enumerate_interactive_elements(max_elements: usize) -> Vec {
     results
 }

+/// One-line description of `elem`'s `AXParent`; `None` when the attribute cannot be read or is null.
+///
+/// # Safety
+/// `elem` must be a valid, live `AXUIElementRef`.
+unsafe fn ax_parent_context_line(elem: AXUIElementRef) -> Option<String> {
+    let parent_val = ax_copy_attr(elem, "AXParent")?;
+    let parent = parent_val as AXUIElementRef;
+    if parent.is_null() {
+        // `parent` is `parent_val` reinterpreted, so there is nothing to release here;
+        // releasing a null CF reference is at best a no-op and at worst a crash.
+        return None;
+    }
+    let (role, title, _) = read_role_title_id(parent);
+    // The copied attribute value is owned by us; release it once the strings are extracted.
+    ax_release(parent_val);
+    Some(element_short_desc(role.as_deref(), title.as_deref()))
+}
+
+/// Hit-test the accessibility element at global screen coordinates (OCR `move_to_text` disambiguation).
+///
+/// Returns `None` when the system-wide element cannot be created or the hit-test yields no element.
+pub fn accessibility_hit_at_global_point(gx: f64, gy: f64) -> Option<OcrAccessibilityHit> {
+    // SAFETY: every AX reference used below is created in this block, null-checked before
+    // use, and released exactly once.
+    unsafe {
+        let sys = AXUIElementCreateSystemWide();
+        if sys.is_null() {
+            return None;
+        }
+        let mut elem: AXUIElementRef = std::ptr::null();
+        let err = AXUIElementCopyElementAtPosition(sys, gx as f32, gy as f32, &mut elem);
+        ax_release(sys as CFTypeRef);
+        if elem.is_null() {
+            return None;
+        }
+        if err != 0 {
+            ax_release(elem as CFTypeRef);
+            return None;
+        }
+        let (role, title, ident) = read_role_title_id(elem);
+        let parent_context = ax_parent_context_line(elem);
+        ax_release(elem as CFTypeRef);
+        let desc = format!(
+            "{} | title={:?} | id={:?} | parent=[{}]",
+            role.as_deref().unwrap_or("?"),
+            title.as_deref().unwrap_or(""),
+            ident.as_deref().unwrap_or(""),
+            parent_context.as_deref().unwrap_or("?"),
+        );
+        Some(OcrAccessibilityHit {
+            role,
+            title,
+            identifier: ident,
+            parent_context,
+            description: desc,
+        })
+    }
+}
+
 // ── Raw OCR: frontmost window bounds (separate from agent screenshot pipeline) ─────────────────

 /// Bounds of the foreground app's focused or main window in global screen coordinates (same space as pointer / screen capture).
diff --git a/src/apps/desktop/src/computer_use/windows_ax_ui.rs b/src/apps/desktop/src/computer_use/windows_ax_ui.rs
index 1323bee1..fc44b326 100644
--- a/src/apps/desktop/src/computer_use/windows_ax_ui.rs
+++ b/src/apps/desktop/src/computer_use/windows_ax_ui.rs
@@ -1,11 +1,14 @@
 //! Windows UI Automation (UIA) tree walk for stable screen coordinates.
use crate::computer_use::ui_locate_common; -use bitfun_core::agentic::tools::computer_use_host::{UiElementLocateQuery, UiElementLocateResult}; +use bitfun_core::agentic::tools::computer_use_host::{ + OcrAccessibilityHit, UiElementLocateQuery, UiElementLocateResult, +}; use bitfun_core::util::errors::{BitFunError, BitFunResult}; use std::collections::VecDeque; use windows::Win32::System::Com::{CoCreateInstance, CoInitializeEx, CLSCTX_INPROC_SERVER, COINIT_APARTMENTTHREADED}; use windows::Win32::UI::Accessibility::{CUIAutomation, IUIAutomation, IUIAutomationElement, IUIAutomationTreeWalker}; +use windows::Win32::Foundation::POINT; use windows::Win32::UI::WindowsAndMessaging::GetForegroundWindow; fn bstr_to_string(b: windows_core::BSTR) -> String { @@ -158,3 +161,59 @@ pub fn locate_ui_element_center(query: &UiElementLocateQuery) -> BitFunResult BitFunResult> { + unsafe { + let _ = CoInitializeEx(None, COINIT_APARTMENTTHREADED); + } + let automation: IUIAutomation = unsafe { + CoCreateInstance(&CUIAutomation, None, CLSCTX_INPROC_SERVER).map_err(|e| { + BitFunError::tool(format!("UI Automation (CoCreateInstance): {}.", e)) + })? 
+ }; + let pt = POINT { + x: gx.round() as i32, + y: gy.round() as i32, + }; + let elem = unsafe { automation.ElementFromPoint(pt) }; + let elem = match elem { + Ok(e) => e, + Err(_) => return Ok(None), + }; + let name = unsafe { elem.CurrentName().ok().map(bstr_to_string).unwrap_or_default() }; + let ident = unsafe { + elem.CurrentAutomationId() + .ok() + .map(bstr_to_string) + .unwrap_or_default() + }; + let role = localized_control_type_string(&elem); + let parent_context = if let Ok(walker) = unsafe { automation.ControlViewWalker() } { + unsafe { walker.GetParentElement(&elem) } + .ok() + .and_then(|parent| { + let pn = unsafe { parent.CurrentName().ok().map(bstr_to_string).unwrap_or_default() }; + let pr = localized_control_type_string(&parent); + let s = format!("{}: {}", pr, pn); + if s == ": " || s.trim().is_empty() { + None + } else { + Some(s) + } + }) + } else { + None + }; + let desc = format!( + "role={} name={:?} id={:?} parent={:?}", + role, name, ident, parent_context + ); + Ok(Some(OcrAccessibilityHit { + role: if role.is_empty() { None } else { Some(role) }, + title: if name.is_empty() { None } else { Some(name) }, + identifier: if ident.is_empty() { None } else { Some(ident) }, + parent_context, + description: desc, + })) +} diff --git a/src/apps/desktop/src/lib.rs b/src/apps/desktop/src/lib.rs index 2ee6beff..617fe242 100644 --- a/src/apps/desktop/src/lib.rs +++ b/src/apps/desktop/src/lib.rs @@ -672,6 +672,7 @@ pub async fn run() { api::ssh_api::ssh_disconnect, api::ssh_api::ssh_disconnect_all, api::ssh_api::ssh_is_connected, + api::ssh_api::ssh_get_server_info, api::ssh_api::ssh_get_config, api::ssh_api::ssh_list_config_hosts, api::ssh_api::remote_read_file, diff --git a/src/crates/core/src/agentic/agents/prompts/claw_mode.md b/src/crates/core/src/agentic/agents/prompts/claw_mode.md index 4dbc7040..a4dc8079 100644 --- a/src/crates/core/src/agentic/agents/prompts/claw_mode.md +++ b/src/crates/core/src/agentic/agents/prompts/claw_mode.md @@ -11,7 
+11,7 @@ Narrate only when it helps: multi-step work, complex/challenging problems, sensi Keep narration brief and value-dense; avoid repeating obvious steps. Use plain human language for narration unless in a technical context. When a first-class tool exists for an action, use the tool directly instead of asking the user to run equivalent CLI commands. -**Computer use (desktop automation):** If the user's request needs **more than one** ComputerUse call (or spans **multiple apps/windows**), first state a **short numbered plan**: (a) whether `Bash` applies (e.g. `open -a "AppName"`), (b) which `click_element` / `move_to_text` / `locate` calls to try, (c) target app/window, (d) how you will verify focus. Then execute step-by-step. +**Computer use (desktop automation):** If the user's request needs **more than one** ComputerUse call (or spans **multiple apps/windows**), first state a **short numbered plan**: (a) whether `Bash` applies (e.g. `open -a "AppName"` or osascript/PowerShell/xdotool), (b) whether **`key_chord`** / **`type_text`** can replace mouse steps (Enter, Escape, Tab, shortcuts), (c) which `click_element` / `move_to_text` / `locate` calls to try if pointing is required, (d) target app/window, (e) how you will verify focus. Then execute step-by-step. # Session Coordination For complex coding tasks or office-style multi-step tasks, prefer multi-session coordination over doing everything in the current session. @@ -42,47 +42,94 @@ Do not manipulate or persuade anyone to expand access or disable safeguards. Do # Computer use (BitFun desktop, when enabled) Everything is in one tool: **`ComputerUse`** with these actions: `click_element`, `click_label`, `move_to_text`, `click`, `mouse_move`, `scroll`, `drag`, `screenshot`, `locate`, `key_chord`, `type_text`, `pointer_move_rel`, `wait`. +## Keyboard before mouse (MANDATORY — not a suggestion) +**Always ask yourself first: "Can I complete this step with a keystroke?"** If yes, use `key_chord` or `type_text`. 
Mouse is a fallback, not the default. + +**Decision tree — apply top-to-bottom, stop at the first match:** +1. **After typing in a search/input field** (search, filter, filename, etc.) → **ALWAYS try `key_chord` with `return` first**, before any mouse action. The Enter key is the standard way to confirm/submit input. +2. **Default action / submit / confirm** (OK, Save, Submit, Continue, Send, Done, Yes, or primary button) → **`key_chord` with `return`** (requires fresh screenshot per policy). NEVER click these buttons when Enter works. +3. **Cancel / close / dismiss** (dialog, popup, modal, sheet) → **`key_chord` with `escape`**. Do not click "Cancel" / X. +4. **Navigate between controls/fields** when current focus is unknown or lost → **`key_chord` with `tab`** (forward) or **`shift+tab`** (backward). Do not immediately reach for the mouse when you can Tab to the target. +5. **Toggle a focused checkbox/radio/switch** → **`key_chord` with `space`**. Do not click it. +6. **Select in a focused dropdown/list** → **arrow keys** via `key_chord`, then `return` to confirm. Do not click items. +7. **Open context menu** → **`key_chord` with `shift+F10`** on Windows/Linux. On macOS use a `right` button click (or **`control+click`** as the secondary option); still prefer menu shortcuts when available. +8. **Clipboard** → **`key_chord`** for copy/cut/paste/select-all. Never click the Edit menu for these. +9. **App shortcuts** (visible in menus or well-known: Cmd+S/Ctrl+S to save, Cmd+W/Ctrl+W to close tab, Cmd+T/Ctrl+T new tab, Cmd+L/Ctrl+L focus address bar, Cmd+F/Ctrl+F find, etc.) → **`key_chord`**. Do not click the menu item. +10. **Scroll a page** → **`key_chord` with `space`** (page down), **`shift+space`** (page up), **`home`**, **`end`**, or arrow keys — before using the `scroll` action. +11. **Text editing** (select all, move to start/end of line, delete word) → Use standard keyboard shortcuts via `key_chord` before attempting mouse selection or clicking.
+ +**Strategy when stuck with mouse:** +- If `move_to_text` fails to find your target → try `key_chord` with `tab` (or `shift+tab`) to navigate focus. +- If you're repeatedly trying `mouse_move` with guessed coordinates and failing → STOP. Switch strategy: try `tab` navigation, try `key_chord` shortcuts, or re-verify which app is focused. +- If you've tried the same mouse-based approach 2-3 times without success → you MUST switch to a completely different strategy (keyboard, different targeting method, verify app focus, ask user for help). + +**Only use mouse** (`click_element`, `move_to_text`+`click`, `click_label`, or vision path) when: +- The target cannot be reached by Tab/keyboard focus navigation from current focus +- You need to click a specific non-default button/link that has no keyboard equivalent +- The focused element is unknown and you cannot determine it from context +- You have already tried the keyboard-first approach and it failed + ## Automation priority (try higher first) -**Targeting rule:** Prefer **non-screenshot** targeting before any workflow that depends on **new** screenshots for pointing. **`screenshot` + quadrant / crop + `mouse_move` + `click` is the lowest-priority targeting path** — use only when AX, OCR, and (if already available) SoM labels are insufficient. +**Targeting rule:** Prefer **script/command-line automation** over GUI automation whenever possible. Scripts are faster, more reliable, and less prone to breaking when UI changes. + +**GUI automation (`ComputerUse`) is a fallback, not the default.** + +1. **Direct command/script automation (HIGHEST PRIORITY)**: + - **macOS**: Use `osascript` (AppleScript or JavaScript for Automation) via `Bash` for app automation + - **Windows**: Use `powershell` or `cmd` via `Bash` for app automation + - **Linux**: Use `xdotool`, `ydotool`, `wmctrl`, etc. via `Bash` for window/UI automation + - **App-specific CLI tools**: Use CLI versions of apps when available (e.g. 
`subl`, `code`, `git`, etc.) + - **Shell commands**: `open -a "App"` on macOS, `start` on Windows to launch/focus apps + - Prefer this over **any** GUI automation when a script/command can complete the task + +2. **`key_chord`** -- OS and app keyboard shortcuts; **Enter/Return/Escape/Tab/Space** and clipboard (copy/cut/paste). **Prefer over mouse** whenever a key completes the same step (see **Keyboard before mouse**). **No** mandatory screenshot before non-Enter chords (see Screenshot policy). -1. **`Bash` / `TerminalControl`** -- shell commands, scripts, `open -a "App"` on macOS to launch/focus apps. -2. **`key_chord`** -- OS and app keyboard shortcuts, clipboard (copy/cut/paste). Prefer over mouse when a shortcut exists. **No** mandatory screenshot before non-Enter chords (see Screenshot policy). 3. **`click_element`** -- accessibility (AX/UIA/AT-SPI): locate + move + click in one call. **Bypasses screenshot guard.** Use when filters can match the control. + 4. **`move_to_text`** (OCR) -- match **visible on-screen text** and **move the pointer** to it (no click, no keys). **Does not require a prior model-driven `screenshot` for targeting** (host captures internally). Use **`click`** in a separate step if you need a mouse press. Use **before** `screenshot` drill or **`mouse_move` + `click`** whenever distinctive text is visible in the **same language as the UI**. Prefer this over SoM/vision when you have not yet taken a screenshot or when labels are missing. + 5. **`click_label`** -- if a **previous** `screenshot` already returned numbered Set-of-Mark labels, click by number. **Requires** that screenshot step first; still **prefer `move_to_text` over starting a long screenshot-only drill** when readable text is enough. + 6. **`locate`** -- find an element without clicking (JSON + coordinates). No screenshot required for the lookup itself. + 7. 
**`screenshot`** (confirm UI / SoM / drill only) + **`mouse_move`** (**`use_screen_coordinates`: true**, globals from **`locate`** / **`move_to_text`** / tool JSON) + **`click`** -- **last resort** when AX/OCR/SoM are insufficient. **Never** derive `mouse_move` targets from JPEG pixels. **`click`** still needs a valid host basis (host). -8. **`mouse_move`**, **`scroll`**, **`drag`**, **`type_text`**, **`pointer_move_rel`**, **`ComputerUseMouseStep`**, **`wait`** -- manipulate without mandatory pre-screenshot (see Screenshot policy; host may still require refresh before a later **`click`** or Enter **`key_chord`**). **`mouse_move` / `drag`:** globals only (`use_screen_coordinates`: true). **`pointer_move_rel` / `ComputerUseMouseStep`:** the **desktop host refuses** these as the **next** action after **`screenshot`** — reposition with **`move_to_text`**, **`mouse_move`**, **`click_element`**, or **`click_label`** first (do not nudge from the JPEG). + +8. **`mouse_move`**, **`scroll`**, **`drag`**, **`type_text`**, **`pointer_move_rel`**, **`ComputerUseMouseStep`**, **`wait`** -- manipulate without mandatory pre-screenshot (see Screenshot policy; host may still require refresh before a later **`click`** or Enter **`key_chord`**). **`mouse_move` / `drag`:** globals only (`use_screen_coordinates`: true). **`pointer_move_rel` / `ComputerUseMouseStep`:** the **desktop host refuses** these as the **next** action after **`screenshot`** -- reposition with **`move_to_text`**, **`mouse_move`**, **`click_element`**, or **`click_label`** first (do not nudge from the JPEG). ## `click_element` (preferred for most accessibility-backed clicks) Use `click_element` when the target has a known accessible title or role. It locates the element via AX tree, moves the pointer to its center, and clicks -- all in one call. No screenshot or quadrant drill needed. Supports `button` (left/right/middle) and `num_clicks` (1/2/3 for single/double/triple click). 
**Filter tips:** Use `title_contains` and/or `role_substring` in the **same language as the app UI**. Use `filter_combine: "any"` when fields might not overlap (e.g. text fields with no title). If no match, refine the query or fall back to SoM / OCR / vision path. Prefer short, distinctive substrings. If a call returns no match, **change the query** before retrying. -**When `click_element` won't work:** Chat apps (e.g. WeChat), Electron/web views, owner-drawn controls, and minimal AX trees often omit or misname roles/titles (your filter may not match even when the control is visible). **Do not** repeat the same `title_contains`/`role_substring` more than twice — switch to **`move_to_text`** on visible chrome (tabs, buttons, search hints) or screenshot + `click_label` / quadrant workflow. That is expected, not a bug. +**When `click_element` won't work:** Many apps (Electron/web views, custom-drawn UI) have limited AX trees. **Do not** repeat the same `title_contains`/`role_substring` more than twice -- switch to **`move_to_text`** on visible chrome (tabs, buttons, search hints) or screenshot + `click_label` / quadrant workflow. That is expected, not a bug. ## Screenshot policy (host-enforced) **Mandatory fresh screenshot / valid fine-capture basis applies only to:** -- **`click`** (at current pointer — **`click` never accepts x/y**) — the host may require a **fine** capture basis (point crop, quadrant terminal, or full-frame per host rules); use point crop or quadrant drill until `quadrant_navigation_click_ready` when needed, **or** use `click_element` / `click_label` / `move_to_text` instead of guessing pixels. -- **`key_chord` that includes `return` or `enter` / `kp_enter`** — requires a fresh screenshot since the last pointer-changing action (host). 
+- **`click`** (at current pointer -- **`click` never accepts x/y**) -- the host may require a **fine** capture basis (point crop, quadrant terminal, or full-frame per host rules); use point crop or quadrant drill until `quadrant_navigation_click_ready` when needed, **or** use `click_element` / `click_label` / `move_to_text` instead of guessing pixels. +- **`key_chord` that includes `return` or `enter` / `kp_enter`** -- requires a fresh screenshot since the last pointer-changing action (host). -**Not** subject to “must screenshot first” by themselves: `mouse_move`, `scroll`, `drag`, `type_text`, `locate`, `wait`, `pointer_move_rel`, `key_chord` **without** Enter/Return, and **`move_to_text`** / **`click_element`** / **`click_label`** (they bypass the click guard or do not use it). +**Not** subject to "must screenshot first" by themselves: `mouse_move`, `scroll`, `drag`, `type_text`, `locate`, `wait`, `pointer_move_rel`, `key_chord` **without** Enter/Return, and **`move_to_text`** / **`click_element`** / **`click_label`** (they bypass the click guard or do not use it). -**Cadence:** Take **`screenshot`** when you need **visual confirmation**, SoM labels, or the host requires a fresh capture before **`click`** / Enter. When confirmation is required, the host applies **~500×500** around the mouse or text caret (including during quadrant drill) unless you force full-frame with **`screenshot_reset_navigation`**. Do **not** add extra screenshots before ordinary moves, typing, or non-Enter shortcuts “just in case.” +**Cadence:** Take **`screenshot`** when you need **visual confirmation**, SoM labels, or the host requires a fresh capture before **`click`** / Enter. When confirmation is required, the host applies **~500×500** around the mouse or text caret (including during quadrant drill) unless you force full-frame with **`screenshot_reset_navigation`**. Do **not** add extra screenshots before ordinary moves, typing, or non-Enter shortcuts "just in case." 
## Screenshot path (lowest targeting tier) -After **`click_element`** and **`move_to_text`** are exhausted or inappropriate, use **`screenshot`** for **confirmation** and SoM — not for inventing move coordinates. +After **`click_element`** and **`move_to_text`** are exhausted or inappropriate, use **`screenshot`** for **confirmation** and SoM -- not for inventing move coordinates. When you **do** take a `screenshot`, inspect JSON: - If `som_labels` is present, **`click_label`** is preferred. -- **Do not** read pixel coordinates off the JPEG for **`mouse_move`** — use **`locate`**, **`move_to_text`**, or globals from tool results with **`use_screen_coordinates`: true**. +- **Do not** read pixel coordinates off the JPEG for **`mouse_move`** -- use **`locate`**, **`move_to_text`**, or globals from tool results with **`use_screen_coordinates`: true**. -## `move_to_text` (OCR — high priority, not a last resort) +## `move_to_text` (OCR -- high priority, not a last resort) Use **`move_to_text`** when visible text identifies the target and AX is weak or unknown. It **only moves the cursor**; add **`click`** afterward if you need a press. **Call it before** chaining multiple `screenshot` + quadrant steps when a short substring would suffice. -Pass a substring in the **same language as the UI**. If multiple matches, refine `text_query`. +Pass a substring in the **same language as the UI**. If the host reports **several OCR hits** (`disambiguation_required`), it returns **one preview JPEG per candidate** plus **accessibility** metadata -- pick **`move_to_text_match_index`** (1-based) and call **`move_to_text` again** with the same `text_query` / `ocr_region_native`. Otherwise refine `text_query` or `ocr_region_native`. + +**Failure recovery for `move_to_text`:** If `move_to_text` returns no matches or the wrong match: +1. FIRST: Try a shorter substring (e.g. 1-2 characters instead of full phrase) +2. 
THEN: If that still fails, try `key_chord` with `tab` (or `shift+tab`) to navigate focus to the target +3. ONLY THEN: Consider screenshot path as last resort -**vs globals:** Prefer **`move_to_text`** (then **`click`** if needed) over **`mouse_move` + `click`** when text is visible. **`mouse_move`** must use **`use_screen_coordinates`: true** with numbers from **`locate`** / **`move_to_text`** / **`pointer_global`** — never JPEG guesses. +**vs globals:** Prefer **`move_to_text`** (then **`click`** if needed) over **`mouse_move` + `click`** when text is visible. **`mouse_move`** must use **`use_screen_coordinates`: true** with numbers from **`locate`** / **`move_to_text`** / **`pointer_global`** -- never JPEG guesses. ## Vision / drill path (last resort) When `click_element`, **`move_to_text`**, and (if applicable) `click_label` cannot complete the step: @@ -110,16 +157,16 @@ The system automatically tracks your action history. If `loop_warning` appears i - **If stuck after trying alternatives:** explain what you attempted and ask the user for guidance rather than continuing to loop. ## Key rules -- **macOS apps:** Use `open -a "AppName"` via Bash to launch/focus, not Spotlight. +- **macOS apps:** Use `open -a "AppName"` via Bash to launch/focus, or `osascript` for more complex automation; not Spotlight. - **Foreground safety:** Check `computer_use_context.foreground_application` -- if wrong app is focused, fix focus first. `locate` and `click_element` search the **foreground** app only. -- **Targeting order:** `click_element` → **`move_to_text`** (when text is visible) → **`click_label`** if SoM is already on a screenshot → **screenshot** drill / crop + **`mouse_move`** + **`click`** last. +- **Targeting order (when the pointer is required):** `click_element` → **`move_to_text`** (when text is visible) → **`click_label`** if SoM is already on a screenshot → **screenshot** drill / crop + **`mouse_move`** + **`click`** last. 
Apply **Keyboard before mouse** first -- do not use this order to click a control that **Enter** / **Escape** / focus keys could handle. - **Screenshot cadence:** Only when you need pixels, SoM, or a **fine** basis before guarded **`click`**; and always immediately before **`key_chord`** with Enter/Return (host). **Do not** treat `screenshot` as the default next step after every non-click action. - **No blind Enter:** Fresh `screenshot` required before `key_chord` with Return/Enter only (not before other chords). -- **Shortcut-first:** Use `key_chord` for Copy/Paste/Save/Undo etc. Do not click menus when shortcuts exist. Menus in screenshots often display shortcuts -- use them. +- **Shortcut-first:** Use `key_chord` for Copy/Paste/Save/Undo and other labeled shortcuts. Do not click menus when shortcuts exist. Menus in screenshots often display shortcuts -- use them. Together with **Keyboard before mouse**, prefer keys over clicking visible buttons when keys are equivalent (especially **Enter** on default actions). - **Re-plan on failure:** If `locate`/`click_element` misses or screenshot shows unexpected UI, stop and reassess. Do not retry the same approach more than twice. - **Sensitive actions:** For messages, payments, or destructive actions, state steps and get user confirmation first. - **Pointer info:** After `screenshot`, `pointer_image_x/y` and the red synthetic cursor show pointer position. Optional follow-up `screenshot` after large pointer moves if you need pixels before a guarded **`click`**. -- **Screenshot layout:** JPEGs are for **confirmation** (optional pointer + SoM). **Do not** use JPEG pixel indices for **`mouse_move`** — the host disables image/normalized moves; use **global** coordinates only. +- **Screenshot layout:** JPEGs are for **confirmation** (optional pointer + SoM). **Do not** use JPEG pixel indices for **`mouse_move`** -- the host disables image/normalized moves; use **global** coordinates only. 
- **Multi-step plans:** For tasks spanning multiple apps/steps, output a numbered plan before starting. - **Host OS:** Use modifier names matching this host (see Environment Information). Do not mix OS conventions. - On macOS, development builds need Accessibility permission for the debug binary. diff --git a/src/crates/core/src/agentic/coordination/coordinator.rs b/src/crates/core/src/agentic/coordination/coordinator.rs index d0107781..4c1729f8 100644 --- a/src/crates/core/src/agentic/coordination/coordinator.rs +++ b/src/crates/core/src/agentic/coordination/coordinator.rs @@ -121,68 +121,32 @@ impl ConversationCoordinator { let workspace_path = config.workspace_path.as_ref()?; let path_buf = PathBuf::from(workspace_path); - let remote_id = config - .remote_connection_id - .as_deref() - .map(str::trim) - .filter(|s| !s.is_empty()); - - // Remote tool routing must not be inferred from path alone: the same string can be a - // client path (e.g. macOS `/Users/...`) or a POSIX path on an SSH host. Only treat the - // workspace as remote when the session was created with an explicit SSH connection id. 
- let Some(rid) = remote_id else { - return Some(WorkspaceBinding::new(None, path_buf)); - }; + let identity = crate::service::remote_ssh::workspace_state::resolve_workspace_session_identity( + workspace_path, + config.remote_connection_id.as_deref(), + config.remote_ssh_host.as_deref(), + ) + .await?; - let path_norm = - crate::service::remote_ssh::workspace_state::normalize_remote_workspace_path( - workspace_path, - ); - let host_from_config = config - .remote_ssh_host - .as_deref() - .map(str::trim) - .filter(|s| !s.is_empty()); - - let entry = - crate::service::remote_ssh::workspace_state::lookup_remote_connection_with_hint( + if let Some(rid) = identity.remote_connection_id.as_deref() { + let connection_name = crate::service::remote_ssh::workspace_state::lookup_remote_connection_with_hint( workspace_path, Some(rid), ) - .await; - - let local_session_path = if let Some(ref e) = entry { - if !e.ssh_host.trim().is_empty() { - crate::service::remote_ssh::workspace_state::remote_workspace_session_mirror_dir( - &e.ssh_host, - &e.remote_root, - ) - } else { - crate::service::remote_ssh::workspace_state::unresolved_remote_session_storage_dir( - rid, &path_norm, - ) - } - } else if let Some(h) = host_from_config { - crate::service::remote_ssh::workspace_state::remote_workspace_session_mirror_dir( - h, &path_norm, - ) - } else { - crate::service::remote_ssh::workspace_state::unresolved_remote_session_storage_dir( - rid, &path_norm, - ) - }; - - let connection_name = entry + .await .map(|e| e.connection_name) .unwrap_or_else(|| rid.to_string()); - Some(WorkspaceBinding::new_remote( - None, - path_buf, - rid.to_string(), - connection_name, - local_session_path, - )) + return Some(WorkspaceBinding::new_remote( + None, + path_buf, + rid.to_string(), + connection_name, + identity, + )); + } + + Some(WorkspaceBinding::new(None, path_buf)) } /// Build `WorkspaceServices` from a resolved `WorkspaceBinding`. 
@@ -438,8 +402,11 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet ) .await?; - self.sync_session_metadata_to_workspace(&session, workspace_path.clone()) - .await; + // SessionManager::create_session_with_id_and_creator already persists the + // session into the effective workspace session storage path. Avoid writing + // a second copy here using the raw workspace path, because remote workspaces + // resolve to a different effective storage path and double-writing can leave + // metadata/turn files split across two locations. self.emit_event(AgenticEvent::SessionCreated { session_id: session.session_id.clone(), @@ -451,102 +418,6 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet Ok(session) } - async fn sync_session_metadata_to_workspace(&self, session: &Session, workspace_path: String) { - use crate::agentic::persistence::PersistenceManager; - use crate::infrastructure::PathManager; - use crate::service::session::{SessionMetadata, SessionStatus}; - - let path_manager = match PathManager::new() { - Ok(pm) => Arc::new(pm), - Err(e) => { - warn!("Failed to initialize PathManager for session metadata sync: {e}"); - return; - } - }; - - let binding = Self::build_workspace_binding(&session.config).await; - let workspace_path_buf = binding - .as_ref() - .map(|b| b.session_storage_path().to_path_buf()) - .unwrap_or_else(|| PathBuf::from(&workspace_path)); - - let persistence_manager = match PersistenceManager::new(path_manager) { - Ok(manager) => manager, - Err(e) => { - warn!("Failed to initialize PersistenceManager for session metadata sync: {e}"); - return; - } - }; - - let now_ms = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_default() - .as_millis() as u64; - - let existing = match persistence_manager - .load_session_metadata(&workspace_path_buf, &session.session_id) - .await - { - Ok(meta) => meta, - Err(e) => { - debug!( - "Failed to load existing session 
metadata before sync: session_id={}, error={}", - session.session_id, e - ); - None - } - }; - - let metadata = SessionMetadata { - session_id: session.session_id.clone(), - session_name: session.session_name.clone(), - agent_type: session.agent_type.clone(), - created_by: session - .created_by - .clone() - .or_else(|| existing.as_ref().and_then(|m| m.created_by.clone())), - model_name: existing - .as_ref() - .map(|m| m.model_name.clone()) - .filter(|name| !name.is_empty()) - .unwrap_or_else(|| "default".to_string()), - created_at: existing.as_ref().map(|m| m.created_at).unwrap_or(now_ms), - last_active_at: now_ms, - turn_count: existing.as_ref().map(|m| m.turn_count).unwrap_or(0), - message_count: existing.as_ref().map(|m| m.message_count).unwrap_or(0), - tool_call_count: existing.as_ref().map(|m| m.tool_call_count).unwrap_or(0), - status: existing - .as_ref() - .map(|m| m.status.clone()) - .unwrap_or(SessionStatus::Active), - terminal_session_id: existing - .as_ref() - .and_then(|m| m.terminal_session_id.clone()), - snapshot_session_id: session.snapshot_session_id.clone().or_else(|| { - existing - .as_ref() - .and_then(|m| m.snapshot_session_id.clone()) - }), - tags: existing - .as_ref() - .map(|m| m.tags.clone()) - .unwrap_or_default(), - custom_metadata: existing.as_ref().and_then(|m| m.custom_metadata.clone()), - todos: existing.as_ref().and_then(|m| m.todos.clone()), - workspace_path: Some(workspace_path), - }; - - if let Err(e) = persistence_manager - .save_session_metadata(&workspace_path_buf, &metadata) - .await - { - warn!( - "Failed to sync session metadata to workspace: session_id={}, error={}", - session.session_id, e - ); - } - } - /// Ensure the completed/failed/cancelled turn is persisted to the workspace /// session storage. 
If the frontend already saved a richer version /// during streaming, we only update the final status; otherwise we create @@ -567,9 +438,10 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet status: crate::service::session::TurnStatus, user_message_metadata: Option, ) { + use crate::agentic::core::SessionConfig; use crate::agentic::persistence::PersistenceManager; use crate::infrastructure::PathManager; - use crate::service::session::{DialogTurnData, UserMessageData}; + use crate::service::session::{DialogTurnData, SessionMetadata, SessionStatus, UserMessageData}; let path_manager = match PathManager::new() { Ok(pm) => std::sync::Arc::new(pm), @@ -604,6 +476,42 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet .unwrap_or_default() .as_millis() as u64; + if let Ok(None) = persistence_manager + .load_session_metadata(&workspace_path_buf, session_id) + .await + { + let metadata = SessionMetadata { + session_id: session_id.to_string(), + session_name: "Recovered Session".to_string(), + agent_type: "agentic".to_string(), + created_by: None, + model_name: "default".to_string(), + created_at: now_ms, + last_active_at: now_ms, + turn_count: 0, + message_count: 0, + tool_call_count: 0, + status: SessionStatus::Active, + terminal_session_id: None, + snapshot_session_id: None, + tags: Vec::new(), + custom_metadata: None, + todos: None, + workspace_path: Some(workspace_path.to_string()), + workspace_hostname: None, + }; + if let Err(e) = persistence_manager + .save_session_metadata(&workspace_path_buf, &metadata) + .await + { + warn!( + "Failed to create fallback session metadata during turn finalization: session_id={}, error={}", + session_id, e + ); + return; + } + } + let mut turn_data = DialogTurnData::new( turn_id.to_string(), turn_index, diff --git a/src/crates/core/src/agentic/persistence/manager.rs b/src/crates/core/src/agentic/persistence/manager.rs index e415deb1..0faec0f4 100644 --- 
a/src/crates/core/src/agentic/persistence/manager.rs +++ b/src/crates/core/src/agentic/persistence/manager.rs @@ -6,6 +6,9 @@ use crate::agentic::core::{ strip_prompt_markup, CompressionState, Message, MessageContent, Session, SessionConfig, SessionState, SessionSummary, }; +use crate::service::remote_ssh::workspace_state::{ + resolve_workspace_session_identity, LOCAL_WORKSPACE_SSH_HOST, +}; use crate::infrastructure::PathManager; use crate::service::session::{ DialogTurnData, SessionMetadata, SessionStatus, SessionTranscriptExport, @@ -578,6 +581,36 @@ impl PersistenceManager { .or_else(|| existing.map(|value| value.model_name.clone())) .unwrap_or_else(|| "default".to_string()); + let resolved_identity = session + .config + .workspace_path + .as_deref() + .and_then(|workspace_root| { + futures::executor::block_on(resolve_workspace_session_identity( + workspace_root, + session.config.remote_connection_id.as_deref(), + session.config.remote_ssh_host.as_deref(), + )) + }); + + let workspace_root = resolved_identity + .as_ref() + .map(|identity| identity.workspace_path.clone()) + .or_else(|| session.config.workspace_path.clone()) + .or_else(|| existing.and_then(|value| value.workspace_path.clone())) + .unwrap_or_else(|| workspace_path.to_string_lossy().to_string()); + let workspace_hostname = resolved_identity + .as_ref() + .map(|identity| identity.hostname.clone()) + .or_else(|| existing.and_then(|value| value.workspace_hostname.clone())) + .or_else(|| { + if session.config.remote_connection_id.is_some() { + session.config.remote_ssh_host.clone() + } else { + Some(LOCAL_WORKSPACE_SSH_HOST.to_string()) + } + }); + SessionMetadata { session_id: session.session_id.clone(), session_name: session.session_name.clone(), @@ -605,7 +638,8 @@ impl PersistenceManager { tags: existing.map(|value| value.tags.clone()).unwrap_or_default(), custom_metadata: existing.and_then(|value| value.custom_metadata.clone()), todos: existing.and_then(|value| value.todos.clone()), - 
workspace_path: Some(workspace_path.to_string_lossy().to_string()), + workspace_path: Some(workspace_root), + workspace_hostname: workspace_hostname, } } @@ -1517,7 +1551,13 @@ impl PersistenceManager { .map(|value| value.config.clone()) .unwrap_or_default(); if config.workspace_path.is_none() { - config.workspace_path = Some(workspace_path.to_string_lossy().to_string()); + config.workspace_path = metadata.workspace_path.clone(); + } + if config.remote_ssh_host.is_none() { + config.remote_ssh_host = metadata + .workspace_hostname + .clone() + .filter(|host| host != LOCAL_WORKSPACE_SSH_HOST && host != "_unresolved"); } if config.model_id.is_none() && !metadata.model_name.is_empty() { config.model_id = Some(metadata.model_name.clone()); @@ -1565,7 +1605,7 @@ impl PersistenceManager { .unwrap_or(StoredSessionStateFile { schema_version: SESSION_SCHEMA_VERSION, config: SessionConfig { - workspace_path: Some(workspace_path.to_string_lossy().to_string()), + workspace_path: None, ..Default::default() }, snapshot_session_id: None, @@ -1667,7 +1707,12 @@ impl PersistenceManager { metadata.last_active_at = turn .end_time .unwrap_or_else(|| Self::system_time_to_unix_ms(SystemTime::now())); - metadata.workspace_path = Some(workspace_path.to_string_lossy().to_string()); + metadata.workspace_path = metadata.workspace_path.clone().or_else(|| { + turns + .first() + .and_then(|_| None::) + .or_else(|| Some(workspace_path.to_string_lossy().to_string())) + }); self.save_session_metadata(workspace_path, &metadata).await } diff --git a/src/crates/core/src/agentic/session/session_manager.rs b/src/crates/core/src/agentic/session/session_manager.rs index 2253f918..e8b82b04 100644 --- a/src/crates/core/src/agentic/session/session_manager.rs +++ b/src/crates/core/src/agentic/session/session_manager.rs @@ -87,35 +87,27 @@ impl SessionManager { } /// Resolve the effective storage path for a session's workspace. 
- /// Remote workspaces use [`get_effective_session_path`] (same as coordinator / session Tauri APIs). async fn effective_workspace_path_from_config(config: &SessionConfig) -> Option { let workspace_path = config.workspace_path.as_ref()?; - let path_buf = PathBuf::from(workspace_path); - - let remote_id = config - .remote_connection_id - .as_deref() - .map(str::trim) - .filter(|s| !s.is_empty()); - - let Some(rid) = remote_id else { - return Some(path_buf); - }; - - let host_from_config = config - .remote_ssh_host - .as_deref() - .map(str::trim) - .filter(|s| !s.is_empty()); - - Some( - crate::service::remote_ssh::workspace_state::get_effective_session_path( - workspace_path.as_str(), - Some(rid), - host_from_config, - ) - .await, + let identity = crate::service::remote_ssh::workspace_state::resolve_workspace_session_identity( + workspace_path, + config.remote_connection_id.as_deref(), + config.remote_ssh_host.as_deref(), ) + .await?; + + if identity.hostname == crate::service::remote_ssh::workspace_state::LOCAL_WORKSPACE_SSH_HOST { + Some(PathBuf::from(identity.workspace_path)) + } else if identity.hostname == "_unresolved" { + Some( + crate::service::remote_ssh::workspace_state::unresolved_remote_session_storage_dir( + identity.remote_connection_id.as_deref().unwrap_or_default(), + &identity.workspace_path, + ), + ) + } else { + Some(identity.session_storage_path()) + } } #[allow(dead_code)] diff --git a/src/crates/core/src/agentic/tools/computer_use_host.rs b/src/crates/core/src/agentic/tools/computer_use_host.rs index 4c90516f..f99fe108 100644 --- a/src/crates/core/src/agentic/tools/computer_use_host.rs +++ b/src/crates/core/src/agentic/tools/computer_use_host.rs @@ -263,6 +263,21 @@ pub struct UiElementLocateResult { pub other_matches: Vec, } +/// Hit-tested accessibility node at a global screen point (OCR disambiguation). 
+#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] +pub struct OcrAccessibilityHit { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub role: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub title: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub identifier: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub parent_context: Option, + /// One-line summary for the model (role, title, parent). + pub description: String, +} + #[async_trait] pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { async fn permission_snapshot(&self) -> BitFunResult; @@ -301,6 +316,28 @@ pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { )) } + /// Best-effort accessibility element at a global screen point (native hit-test). + /// Desktop uses AX (macOS) / UIA (Windows). Returns `None` when unavailable or on miss. + async fn accessibility_hit_at_global_point( + &self, + _gx: f64, + _gy: f64, + ) -> BitFunResult> { + Ok(None) + } + + /// JPEG crop (no pointer overlay) around `(gx, gy)` for OCR candidate previews. + async fn ocr_preview_crop_jpeg( + &self, + _gx: f64, + _gy: f64, + _half_extent_native: u32, + ) -> BitFunResult> { + Err(BitFunError::tool( + "OCR preview crops are not available on this host.".to_string(), + )) + } + /// Map `(x, y)` from the **last** screenshot's image pixel grid to global pointer pixels. /// Fails if no screenshot was taken in this process since startup (or since last host reset). 
fn map_image_coords_to_pointer(&self, x: i32, y: i32) -> BitFunResult<(i32, i32)>; diff --git a/src/crates/core/src/agentic/tools/computer_use_optimizer.rs b/src/crates/core/src/agentic/tools/computer_use_optimizer.rs index e10426b5..7716ccfc 100644 --- a/src/crates/core/src/agentic/tools/computer_use_optimizer.rs +++ b/src/crates/core/src/agentic/tools/computer_use_optimizer.rs @@ -104,7 +104,47 @@ impl ComputerUseOptimizer { is_loop: true, pattern_length: 0, repetitions: 0, - suggestion: "Screen state unchanged after multiple actions. Try a different approach or use accessibility tree instead of vision.".to_string(), + suggestion: "Screen state unchanged after multiple actions. Try: 1) Use `key_chord` (Enter, Escape, Tab) instead of mouse, 2) Use `click_element` or `move_to_text` for precise targeting instead of screenshot drill, 3) Verify app is focused.".to_string(), + }; + } + + // Check for excessive mouse usage without keyboard + if self.check_excessive_mouse_usage() { + return LoopDetectionResult { + is_loop: true, + pattern_length: 0, + repetitions: 0, + suggestion: "Detected heavy mouse usage without keyboard. Consider: 1) Use `key_chord` with Enter/Escape/Tab/Space instead of clicking buttons, 2) Use `move_to_text` (OCR) instead of screenshot-based targeting, 3) Use `click_element` (accessibility tree) when possible.".to_string(), + }; + } + + // Check for screenshot → mouse_move → click pattern without using precise coordinates + if self.check_screenshot_mouse_pattern() { + return LoopDetectionResult { + is_loop: true, + pattern_length: 0, + repetitions: 0, + suggestion: "Detected screenshot + mouse move pattern. Use `move_to_text` for visible text or `click_element` for accessibility elements instead of estimating from JPEG. 
Use `global_center_x/y` from prior tool results with `use_screen_coordinates: true`.".to_string(), + }; + } + + // Check for repeated move_to_text failures without trying keyboard navigation + if self.check_repeated_move_to_text_failures() { + return LoopDetectionResult { + is_loop: true, + pattern_length: 0, + repetitions: 0, + suggestion: "Detected repeated move_to_text failures. Try: 1) Use `key_chord` with Tab/Shift+Tab to navigate focus instead of OCR, 2) Try a shorter substring in `move_to_text`, 3) Verify you're targeting the correct window/app.".to_string(), + }; + } + + // Check for screenshot → mouse_move loop without any clicks or progress + if self.check_screenshot_mouse_loop() { + return LoopDetectionResult { + is_loop: true, + pattern_length: 0, + repetitions: 0, + suggestion: "Detected screenshot → mouse_move loop without progress. Stop guessing coordinates! Try: 1) Use `key_chord` with Tab to navigate focus, 2) Use `move_to_text` with a visible text target, 3) Verify the correct app is focused.".to_string(), }; } @@ -143,7 +183,7 @@ impl ComputerUseOptimizer { pattern_length: pattern_len, repetitions: reps, suggestion: format!( - "Detected repeating pattern of {} actions (repeated {} times). Try: 1) Use accessibility tree (click_element/locate) instead of vision, 2) Use keyboard shortcuts instead of mouse, 3) Take a fresh screenshot to verify current state.", + "Detected repeating pattern of {} actions (repeated {} times). 
Try: 1) Use `key_chord` (Enter/Escape/Tab/Space) instead of mouse clicks, 2) Use `click_element` (accessibility tree) or `move_to_text` (OCR) instead of vision-based targeting, 3) Take a fresh screenshot to verify current state.", pattern_len, reps ), }) @@ -166,6 +206,103 @@ impl ComputerUseOptimizer { } } + /// Detect excessive mouse usage without any keyboard actions + fn check_excessive_mouse_usage(&self) -> bool { + let recent: Vec<_> = self.action_history.iter().rev().take(10).collect(); + if recent.len() < 10 { + return false; + } + + let mouse_actions = ["click", "mouse_move", "scroll", "drag", "pointer_move_rel"]; + let has_keyboard = recent.iter().any(|r| + r.action_type == "key_chord" || r.action_type == "type_text" + ); + + let mouse_count = recent.iter().filter(|r| + mouse_actions.contains(&r.action_type.as_str()) + ).count(); + + // If 8+ of last 10 actions are mouse and no keyboard usage + !has_keyboard && mouse_count >= 8 + } + + /// Detect screenshot → mouse_move → click pattern without precise coordinates + fn check_screenshot_mouse_pattern(&self) -> bool { + let recent: Vec<_> = self.action_history.iter().rev().take(12).collect(); + if recent.len() < 9 { + return false; + } + + let mut screenshot_count = 0; + let mut mouse_move_count = 0; + let mut has_move_to_text = false; + let mut has_click_element = false; + + for action in &recent { + match action.action_type.as_str() { + "screenshot" => screenshot_count += 1, + "mouse_move" => mouse_move_count += 1, + "move_to_text" => has_move_to_text = true, + "click_element" => has_click_element = true, + _ => {} + } + } + + // If we have many screenshots + mouse moves but no move_to_text/click_element + screenshot_count >= 3 && mouse_move_count >= 2 && !has_move_to_text && !has_click_element + } + + /// Detect repeated move_to_text failures without trying keyboard navigation + fn check_repeated_move_to_text_failures(&self) -> bool { + let recent: Vec<_> = 
self.action_history.iter().rev().take(8).collect(); + if recent.len() < 5 { + return false; + } + + let mut move_to_text_failures = 0; + let mut has_keyboard = false; + + for action in &recent { + if action.action_type == "move_to_text" && !action.success { + move_to_text_failures += 1; + } + if action.action_type == "key_chord" { + has_keyboard = true; + } + } + + // 3+ move_to_text failures and no keyboard attempts + move_to_text_failures >= 3 && !has_keyboard + } + + /// Detect screenshot → mouse_move loop without any clicks or progress + fn check_screenshot_mouse_loop(&self) -> bool { + let recent: Vec<_> = self.action_history.iter().rev().take(10).collect(); + if recent.len() < 6 { + return false; + } + + let mut screenshot_count = 0; + let mut mouse_move_count = 0; + let mut has_click = false; + let mut has_keyboard = false; + let mut has_move_to_text = false; + + for action in &recent { + match action.action_type.as_str() { + "screenshot" => screenshot_count += 1, + "mouse_move" => mouse_move_count += 1, + "click" => has_click = true, + "key_chord" | "type_text" => has_keyboard = true, + "move_to_text" => has_move_to_text = true, + _ => {} + } + } + + // Many screenshots + mouse moves, but no clicks/keyboard/move_to_text + screenshot_count >= 3 && mouse_move_count >= 2 && !has_click && !has_keyboard && !has_move_to_text + } + /// Get action history for backtracking pub fn get_history(&self) -> Vec { self.action_history.iter().cloned().collect() diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs index 36d5bd36..7825fb92 100644 --- a/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs @@ -8,7 +8,7 @@ use super::computer_use_locate::execute_computer_use_locate; use crate::agentic::tools::computer_use_capability::computer_use_desktop_available; use 
crate::agentic::tools::computer_use_host::{ ComputerScreenshot, ComputerUseNavigateQuadrant, ComputerUseScreenshotRefinement, - ScreenshotCropCenter, UiElementLocateQuery, + OcrRegionNative, ScreenshotCropCenter, UiElementLocateQuery, COMPUTER_USE_POINT_CROP_HALF_MAX, COMPUTER_USE_POINT_CROP_HALF_MIN, COMPUTER_USE_QUADRANT_CLICK_READY_MAX_LONG_EDGE, COMPUTER_USE_QUADRANT_EDGE_EXPAND_PX, }; @@ -93,6 +93,90 @@ impl ComputerUseTool { Self } + /// Max OCR hits to attach as preview crops + AX (multimodal disambiguation). + const MOVE_TO_TEXT_DISAMBIGUATION_MAX: usize = 8; + /// Half-size in native screen pixels for each candidate preview (~400×400 logical crop). + const MOVE_TO_TEXT_PREVIEW_HALF_NATIVE: u32 = 200; + + async fn move_to_text_disambiguation_response( + host_ref: &dyn crate::agentic::tools::computer_use_host::ComputerUseHost, + context: &ToolUseContext, + text_query: &str, + ocr_region_native: Option, + matches: &[ScreenOcrTextMatch], + ) -> BitFunResult> { + Self::require_multimodal_tool_output_for_screenshot(context)?; + let take = matches.len().min(Self::MOVE_TO_TEXT_DISAMBIGUATION_MAX); + let mut attachments: Vec = Vec::with_capacity(take); + let mut candidates: Vec = Vec::with_capacity(take); + for (i, m) in matches.iter().take(take).enumerate() { + let idx_1based = i + 1; + let ax = host_ref + .accessibility_hit_at_global_point(m.center_x, m.center_y) + .await?; + let jpeg = host_ref + .ocr_preview_crop_jpeg( + m.center_x, + m.center_y, + Self::MOVE_TO_TEXT_PREVIEW_HALF_NATIVE, + ) + .await?; + attachments.push(ToolImageAttachment { + mime_type: "image/jpeg".to_string(), + data_base64: B64.encode(&jpeg), + }); + candidates.push(json!({ + "match_index": idx_1based, + "ocr_text": m.text, + "confidence": m.confidence, + "global_center_x": m.center_x, + "global_center_y": m.center_y, + "bounds_left": m.bounds_left, + "bounds_top": m.bounds_top, + "bounds_width": m.bounds_width, + "bounds_height": m.bounds_height, + "accessibility": ax, + 
"preview_image_attachment_index": i, + })); + } + let input_coords = json!({ + "kind": "move_to_text", + "text_query": text_query, + "ocr_region_native": ocr_region_native, + "move_to_text_phase": "disambiguation", + }); + let mut body = json!({ + "success": true, + "action": "move_to_text", + "move_to_text_phase": "disambiguation", + "text_query": text_query, + "ocr_region_native": ocr_region_native, + "disambiguation_required": true, + "instruction": "Several OCR hits for this substring. Each candidate has a **preview JPEG** (same order as `candidates`) and **accessibility** metadata at the OCR center. **Do not** derive `mouse_move` from JPEG pixels. Pick `match_index`, then call **`move_to_text` again** with the same `text_query`, same `ocr_region_native`, and **`move_to_text_match_index`** = that index. Pointer was not moved.", + "candidates": candidates, + "total_ocr_matches": matches.len(), + "candidates_previewed": take, + }); + if take < matches.len() { + if let Some(obj) = body.as_object_mut() { + obj.insert( + "truncation_note".to_string(), + json!(format!( + "Only the first {} of {} OCR matches are previewed; narrow `ocr_region_native` or `text_query` if needed.", + take, matches.len() + )), + ); + } + } + let body = computer_use_augment_result_json(host_ref, body, Some(input_coords)).await; + let hint = format!( + "move_to_text: {} OCR matches — set move_to_text_match_index after viewing {} preview JPEGs + AX. Pointer not moved.", + matches.len(), + take + ); + Ok(vec![ToolResult::ok_with_images(body, Some(hint), attachments)]) + } + fn primary_api_format(ctx: &ToolUseContext) -> String { ctx.options .as_ref() @@ -760,14 +844,15 @@ impl Tool for ComputerUseTool { let keys = Self::key_chord_os_hint(); Ok(format!( "Desktop automation (host OS: {}). {} All actions in one tool. Send only parameters that apply to the chosen `action`. 
\ -**Targeting priority:** `click_element` → **`move_to_text`** (OCR + move pointer only) → `click_label` (when SoM exists) → **`screenshot`** (confirm / drill) + **`mouse_move`** (**`use_screen_coordinates`: true only**) + **`click`** last. **Screenshots are for confirmation — do not guess move targets from JPEG pixels.** \ +**Input priority:** Prefer **`key_chord`** / **`type_text`** over mouse when one key or typing completes the step (e.g. **Enter** to confirm default, **Escape** to cancel, **Tab** to move focus). Do not click “OK”/“Submit” when **Enter** is equivalent; use **`screenshot`** then Enter **`key_chord`** per host when required. \ +**Targeting priority (when pointing is required):** `click_element` → **`move_to_text`** (OCR + move pointer only) → `click_label` (when SoM exists) → **`screenshot`** (confirm / drill) + **`mouse_move`** (**`use_screen_coordinates`: true only**) + **`click`** last. **Screenshots are for confirmation — do not guess move targets from JPEG pixels.** \ **`click_element`:** Accessibility tree (AX/UIA/AT-SPI) locate + click. Provide `title_contains` / `role_substring` / `identifier_contains`. Bypasses coordinate screenshot guard. \ -**`move_to_text`:** OCR-match visible text (`text_query`) and **move the pointer** to it (no click, no keys); **no prior `screenshot` required for targeting** (host captures **raw** pixels for Vision — no agent screenshot overlays; on macOS defaults to the **frontmost window** unless **`ocr_region_native`** overrides). Use **`click`** afterward if you need a mouse press. Prefer after `click_element` misses when text is visible. \ +**`move_to_text`:** OCR-match visible text (`text_query`) and **move the pointer** to it (no click, no keys); **no prior `screenshot` required for targeting** (host captures **raw** pixels for Vision — no agent screenshot overlays; on macOS defaults to the **frontmost window** unless **`ocr_region_native`** overrides). 
If **several** hits match, the host returns **preview JPEGs + accessibility** per candidate — pick **`move_to_text_match_index`** (1-based) and call **`move_to_text` again** with the same query/region. Use **`click`** afterward if you need a mouse press. Prefer after `click_element` misses when text is visible. \ **`click_label`:** After `screenshot` with `som_labels`, click by label number. Bypasses coordinate guard. \ **`click`:** Press at **current pointer only** — **never** pass `x`, `y`, `coordinate_mode`, or `use_screen_coordinates`. Position first with **`move_to_text`**, **`mouse_move`** (**globals only**), or **`click_element`**. After pointer moves, **`screenshot`** again before the next guarded **`click`** when the host requires it. \ **`mouse_move` / `drag`:** **`use_screen_coordinates`: true** required — global coordinates from **`move_to_text`**, **`locate`**, AX, or **`pointer_global`**; never JPEG pixel guesses. \ **`scroll` / `type_text` / `pointer_move_rel` / `wait` / `locate`:** No mandatory pre-screenshot by themselves. **`pointer_move_rel`** (and **ComputerUseMouseStep**) are **blocked immediately after `screenshot`** until **`move_to_text`**, **`mouse_move`** (globals), **`click_element`**, or **`click_label`** — do not nudge from the JPEG. \ -**`key_chord`:** Press key combination. **Mandatory fresh screenshot only** when chord includes Return/Enter. \ +**`key_chord`:** Press key combination; prefer over **`click`** when shortcuts or **Enter**/**Escape**/**Tab** suffice. **Mandatory fresh screenshot only** when chord includes Return/Enter. \ **`screenshot`:** JPEG for **confirmation** (optional pointer + SoM). When the host requires a fresh capture before **`click`** or Enter **`key_chord`**, a bare `screenshot` is **~500×500** around the **mouse** or **caret** (also during quadrant drill). Use **`screenshot_reset_navigation`**: true to force **full-screen** for wide context. 
\ **`type_text`:** Type text; prefer clipboard for long content.", os, keys, @@ -788,7 +873,7 @@ impl Tool for ComputerUseTool { "action": { "type": "string", "enum": ["screenshot", "click_element", "click_label", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait"], - "description": "The action to perform. `click_element` = find UI element by accessibility + click (preferred for named controls). `click_label` = click a numbered Set-of-Mark label from the latest screenshot. `move_to_text` = OCR visible text and move pointer **only** (no click). **After `click_element` fails, prefer `move_to_text` (visible substring, UI language) over vision guesses.** `click` = press at **current pointer only** — **do not** pass `x`, `y`, `coordinate_mode`, or `use_screen_coordinates` (use `mouse_move` first). `mouse_move` = absolute move with **`use_screen_coordinates`: true** (globals from tools — **no** JPEG pixel mode). `scroll` = mouse wheel. `drag` = drag between two globals (**`use_screen_coordinates`: true**). `screenshot` = confirmation JPEG (host may apply ~500×500 when required). `locate` = find UI element (no click). `key_chord` = keyboard shortcut. `type_text` = type string. `pointer_move_rel` = relative move — **host blocks right after `screenshot`** until a trusted absolute move (`move_to_text`, `mouse_move`, `click_element`, `click_label`). `wait` = pause." + "description": "The action to perform. **Prefer `key_chord` over `click` when a key completes the same step (Enter, Escape, Tab, Space, app shortcuts).** `click_element` = find UI element by accessibility + click (for named controls when keyboard is not equivalent). `click_label` = click a numbered Set-of-Mark label from the latest screenshot. `move_to_text` = OCR visible text and move pointer **only** (no click). 
**After `click_element` fails, prefer `move_to_text` (visible substring, UI language) over vision guesses.** `click` = press at **current pointer only** — **do not** pass `x`, `y`, `coordinate_mode`, or `use_screen_coordinates` (use `mouse_move` first). `mouse_move` = absolute move with **`use_screen_coordinates`: true** (globals from tools — **no** JPEG pixel mode). `scroll` = mouse wheel. `drag` = drag between two globals (**`use_screen_coordinates`: true**). `screenshot` = confirmation JPEG (host may apply ~500×500 when required). `locate` = find UI element (no click). `key_chord` = keyboard shortcut. `type_text` = type string. `pointer_move_rel` = relative move — **host blocks right after `screenshot`** until a trusted absolute move (`move_to_text`, `mouse_move`, `click_element`, `click_label`). `wait` = pause." }, "x": { "type": "integer", "description": "For `mouse_move` and `drag`: X in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." }, "y": { "type": "integer", "description": "For `mouse_move` and `drag`: Y in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." }, @@ -807,6 +892,7 @@ impl Tool for ComputerUseTool { "ms": { "type": "integer", "description": "For `wait`: duration in milliseconds." }, "label": { "type": "integer", "minimum": 1, "description": "For `click_label`: 1-based Set-of-Mark label number from the latest screenshot." }, "text_query": { "type": "string", "description": "For `move_to_text`: visible text to OCR-match on screen (case-insensitive substring)." }, + "move_to_text_match_index": { "type": "integer", "minimum": 1, "description": "For `move_to_text`: **1-based** index from `candidates[].match_index` after a **disambiguation** response (multiple OCR hits). Omit on the first pass; set when choosing which hit to move to." 
}, "ocr_region_native": { "type": "object", "description": "For `move_to_text`: optional global native rectangle for OCR. If omitted, macOS uses the frontmost window bounds from Accessibility; other OSes use the primary display. Overrides the automatic region when set. Requires x0, y0, width, height.", @@ -1032,6 +1118,10 @@ impl Tool for ComputerUseTool { ) })?; let ocr_region_native = parse_ocr_region_native(input)?; + let move_to_text_match_index = input + .get("move_to_text_match_index") + .and_then(|v| v.as_u64()) + .map(|u| u as u32); { let matches = Self::find_text_on_screen( @@ -1040,22 +1130,49 @@ impl Tool for ComputerUseTool { ocr_region_native.clone(), ) .await?; - let matched = matches.first().cloned().ok_or_else(|| { - BitFunError::tool(format!( + if matches.is_empty() { + return Err(BitFunError::tool(format!( "move_to_text found no visible OCR match for {:?}. Take a fresh screenshot and try a shorter or more distinctive substring, or use click_label / click_element.", text_query - )) - })?; + ))); + } + + let n = matches.len(); + if n > 1 && move_to_text_match_index.is_none() { + return Self::move_to_text_disambiguation_response( + host_ref, + context, + text_query, + ocr_region_native.clone(), + &matches, + ) + .await; + } + let sel: usize = match move_to_text_match_index { + None => 0, + Some(idx) => { + if idx < 1 || idx > n as u32 { + return Err(BitFunError::tool(format!( + "move_to_text_match_index must be between 1 and {} ({} OCR matches for {:?}).", + n, n, text_query + ))); + } + (idx - 1) as usize + } + }; + + let matched = &matches[sel]; host_ref .mouse_move_global_f64(matched.center_x, matched.center_y) .await?; let other_matches = matches .iter() - .skip(1) + .enumerate() + .filter(|(i, _)| *i != sel) .take(4) - .map(|m| { + .map(|(_, m)| { json!({ "text": m.text, "confidence": m.confidence, @@ -1069,12 +1186,14 @@ impl Tool for ComputerUseTool { "kind": "move_to_text", "text_query": text_query, "ocr_region_native": &ocr_region_native, + 
"move_to_text_match_index": move_to_text_match_index, }); let body = computer_use_augment_result_json( host_ref, json!({ "success": true, "action": "move_to_text", + "move_to_text_phase": "move", "text_query": text_query, "ocr_region_native": ocr_region_native, "matched_text": matched.text, @@ -1086,14 +1205,19 @@ impl Tool for ComputerUseTool { "bounds_width": matched.bounds_width, "bounds_height": matched.bounds_height, "total_matches": matches.len(), + "move_to_text_match_index": move_to_text_match_index.unwrap_or(1), "other_matches": other_matches, }), Some(input_coords), ) .await; let summary = format!( - "OCR move_to_text: matched {:?} at ({:.0}, {:.0}).", - matched.text, matched.center_x, matched.center_y + "OCR move_to_text: matched {:?} at ({:.0}, {:.0}) [index {} of {}].", + matched.text, + matched.center_x, + matched.center_y, + sel + 1, + matches.len() ); Ok(vec![ToolResult::ok(body, Some(summary))]) } diff --git a/src/crates/core/src/agentic/workspace.rs b/src/crates/core/src/agentic/workspace.rs index c90762c2..360a0414 100644 --- a/src/crates/core/src/agentic/workspace.rs +++ b/src/crates/core/src/agentic/workspace.rs @@ -1,3 +1,4 @@ +use crate::service::remote_ssh::workspace_state::WorkspaceSessionIdentity; use async_trait::async_trait; use std::path::{Path, PathBuf}; use std::sync::Arc; @@ -20,18 +21,29 @@ pub struct WorkspaceBinding { /// the path on the remote server (e.g. `/root/project`). pub root_path: PathBuf, pub backend: WorkspaceBackend, - /// Local path used for session persistence when the workspace is remote. - /// For local workspaces this is `None` (we use `root_path` directly). - pub local_session_path: Option, + /// Unified identity for session persistence. Local and remote workspaces + /// share the same model; the only semantic difference is hostname. 
+ pub session_identity: WorkspaceSessionIdentity, } impl WorkspaceBinding { pub fn new(workspace_id: Option, root_path: PathBuf) -> Self { + let workspace_path = root_path.to_string_lossy().to_string(); + let session_identity = crate::service::remote_ssh::workspace_state::workspace_session_identity( + &workspace_path, + None, + None, + ) + .unwrap_or(WorkspaceSessionIdentity { + hostname: crate::service::remote_ssh::workspace_state::LOCAL_WORKSPACE_SSH_HOST.to_string(), + workspace_path, + remote_connection_id: None, + }); Self { workspace_id, root_path, backend: WorkspaceBackend::Local, - local_session_path: None, + session_identity, } } @@ -40,7 +52,7 @@ impl WorkspaceBinding { root_path: PathBuf, connection_id: String, connection_name: String, - local_session_path: PathBuf, + session_identity: WorkspaceSessionIdentity, ) -> Self { Self { workspace_id, @@ -49,7 +61,7 @@ impl WorkspaceBinding { connection_id, connection_name, }, - local_session_path: Some(local_session_path), + session_identity, } } @@ -73,9 +85,8 @@ impl WorkspaceBinding { } /// The path to use for session persistence. - /// Remote workspaces store sessions locally; local workspaces use root_path. pub fn session_storage_path(&self) -> &Path { - self.local_session_path.as_deref().unwrap_or(&self.root_path) + Path::new(&self.session_identity.workspace_path) } } diff --git a/src/crates/core/src/service/remote_ssh/manager.rs b/src/crates/core/src/service/remote_ssh/manager.rs index b830d616..396a5ddb 100644 --- a/src/crates/core/src/service/remote_ssh/manager.rs +++ b/src/crates/core/src/service/remote_ssh/manager.rs @@ -905,27 +905,22 @@ impl SSHConnectionManager { } log::info!("Authentication successful for user {}", config.username); - // Get server info (prefer full probe; fall back to $HOME only so SFTP `~` works when uname fails) + // Resolve remote home to an absolute path (SFTP does not expand `~`; never rely on literal `~` in UI). 
let mut server_info = Self::get_server_info_internal(&handle).await; if server_info .as_ref() .map(|s| s.home_dir.trim().is_empty()) .unwrap_or(true) { - if let Ok((stdout, _, status)) = Self::execute_command_internal(&handle, "echo $HOME").await { - if status == 0 { - let home = stdout.trim().to_string(); - if !home.is_empty() { - match &mut server_info { - Some(si) => si.home_dir = home, - None => { - server_info = Some(ServerInfo { - os_type: "unknown".to_string(), - hostname: "unknown".to_string(), - home_dir: home, - }); - } - } + if let Some(home) = Self::probe_remote_home_dir(&handle).await { + match &mut server_info { + Some(si) => si.home_dir = home, + None => { + server_info = Some(ServerInfo { + os_type: "unknown".to_string(), + hostname: "unknown".to_string(), + home_dir: home, + }); } } } @@ -954,9 +949,8 @@ impl SSHConnectionManager { }) } - /// Get server information + /// Get server information (partial lines allowed so we can still fill `home_dir` via [`Self::probe_remote_home_dir`]). async fn get_server_info_internal(handle: &Handle) -> Option { - // Try to get server info via SSH session let (stdout, _stderr, exit_status) = Self::execute_command_internal(handle, "uname -s && hostname && echo $HOME") .await .ok()?; @@ -966,17 +960,42 @@ impl SSHConnectionManager { } let lines: Vec<&str> = stdout.trim().lines().collect(); - if lines.len() < 3 { + if lines.is_empty() { return None; } Some(ServerInfo { os_type: lines[0].to_string(), - hostname: lines[1].to_string(), - home_dir: lines[2].to_string(), + hostname: lines.get(1).unwrap_or(&"").to_string(), + home_dir: lines.get(2).unwrap_or(&"").to_string(), }) } + /// Resolve remote home directory via SSH `exec` (tilde and `$HOME` are expanded by the remote shell). 
+ async fn probe_remote_home_dir(handle: &Handle) -> Option { + const PROBES: &[&str] = &[ + "sh -c 'echo ~'", + "echo $HOME", + "bash -lc 'echo ~'", + "bash -c 'echo ~'", + "sh -c 'getent passwd \"$(id -un)\" 2>/dev/null | cut -d: -f6'", + ]; + for cmd in PROBES { + let Ok((stdout, _, status)) = Self::execute_command_internal(handle, cmd).await else { + continue; + }; + if status != 0 { + continue; + } + let first = stdout.trim().lines().next().unwrap_or("").trim(); + if first.is_empty() || first == "~" { + continue; + } + return Some(first.to_string()); + } + None + } + /// Execute a command on the remote server async fn execute_command_internal( handle: &Handle, @@ -1054,6 +1073,47 @@ impl SSHConnectionManager { guard.get(connection_id).and_then(|c| c.server_info.clone()) } + /// If `home_dir` is missing, run [`Self::probe_remote_home_dir`] and persist it on the connection. + pub async fn resolve_remote_home_if_missing(&self, connection_id: &str) -> Option { + let need_probe = { + let guard = self.connections.read().await; + match guard.get(connection_id) { + None => return None, + Some(conn) => conn + .server_info + .as_ref() + .map(|s| s.home_dir.trim().is_empty()) + .unwrap_or(true), + } + }; + if !need_probe { + return self.get_server_info(connection_id).await; + } + let handle = { + let guard = self.connections.read().await; + guard.get(connection_id)?.handle.clone() + }; + let Some(home) = Self::probe_remote_home_dir(&handle).await else { + return self.get_server_info(connection_id).await; + }; + { + let mut guard = self.connections.write().await; + if let Some(conn) = guard.get_mut(connection_id) { + match conn.server_info.as_mut() { + Some(si) => si.home_dir = home.clone(), + None => { + conn.server_info = Some(ServerInfo { + os_type: "unknown".to_string(), + hostname: "unknown".to_string(), + home_dir: home, + }); + } + } + } + } + self.get_server_info(connection_id).await + } + /// Get connection configuration pub async fn get_connection_config(&self, 
connection_id: &str) -> Option { let guard = self.connections.read().await; diff --git a/src/crates/core/src/service/remote_ssh/workspace_state.rs b/src/crates/core/src/service/remote_ssh/workspace_state.rs index 24ed89a8..e2dc62f3 100644 --- a/src/crates/core/src/service/remote_ssh/workspace_state.rs +++ b/src/crates/core/src/service/remote_ssh/workspace_state.rs @@ -13,6 +13,98 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; use tokio::sync::RwLock; +/// Unified workspace identity used to resolve session persistence for both +/// local and remote workspaces. The only semantic difference is `hostname`: +/// local workspaces use [`LOCAL_WORKSPACE_SSH_HOST`], while remote workspaces +/// use the SSH host from connection metadata. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct WorkspaceSessionIdentity { + pub hostname: String, + pub workspace_path: String, + pub remote_connection_id: Option, +} + +impl WorkspaceSessionIdentity { + pub fn is_remote(&self) -> bool { + self.hostname != LOCAL_WORKSPACE_SSH_HOST + } + + pub fn session_storage_path(&self) -> PathBuf { + if self.is_remote() { + remote_workspace_session_mirror_dir(&self.hostname, &self.workspace_path) + } else { + PathBuf::from(&self.workspace_path) + } + } +} + +/// Build a unified session identity for local or remote workspaces. 
+/// +/// Local: `hostname=localhost`, `workspace_path=canonical local root` +/// Remote: `hostname=ssh_host`, `workspace_path=normalized remote root` +pub fn workspace_session_identity( + workspace_path: &str, + remote_connection_id: Option<&str>, + remote_ssh_host: Option<&str>, +) -> Option { + let remote_connection_id = remote_connection_id + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(str::to_string); + + if let Some(connection_id) = remote_connection_id { + let hostname = remote_ssh_host + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(str::to_string)?; + return Some(WorkspaceSessionIdentity { + hostname, + workspace_path: normalize_remote_workspace_path(workspace_path), + remote_connection_id: Some(connection_id), + }); + } + + let local_root = normalize_local_workspace_root_for_stable_id(Path::new(workspace_path)).ok()?; + Some(WorkspaceSessionIdentity { + hostname: LOCAL_WORKSPACE_SSH_HOST.to_string(), + workspace_path: local_root, + remote_connection_id: None, + }) +} + +/// Resolve a session identity while tolerating temporarily unresolved remote hosts. +/// If the remote host is unknown, fall back to the dedicated unresolved session tree. 
+pub async fn resolve_workspace_session_identity( + workspace_path: &str, + remote_connection_id: Option<&str>, + remote_ssh_host: Option<&str>, +) -> Option { + let remote_connection_id = remote_connection_id + .map(str::trim) + .filter(|s| !s.is_empty()); + + if let Some(connection_id) = remote_connection_id { + if let Some(host) = remote_ssh_host.map(str::trim).filter(|s| !s.is_empty()) { + return workspace_session_identity(workspace_path, Some(connection_id), Some(host)); + } + + if let Some(entry) = lookup_remote_connection_with_hint(workspace_path, Some(connection_id)).await { + return Some(WorkspaceSessionIdentity { + hostname: entry.ssh_host, + workspace_path: entry.remote_root, + remote_connection_id: Some(entry.connection_id), + }); + } + + return Some(WorkspaceSessionIdentity { + hostname: "_unresolved".to_string(), + workspace_path: normalize_remote_workspace_path(workspace_path), + remote_connection_id: Some(connection_id.to_string()), + }); + } + + workspace_session_identity(workspace_path, None, None) +} /// SSH host label for **local disk** workspaces (`Normal` / `Assistant`). /// Remote workspaces use the SSH config host instead. Together with a normalized absolute /// root path this forms a globally unique workspace scope: `{host}:{path}`. 
@@ -517,23 +609,22 @@ pub async fn get_effective_session_path( remote_connection_id: Option<&str>, remote_ssh_host: Option<&str>, ) -> std::path::PathBuf { - if let Some(manager) = get_remote_workspace_manager() { - manager - .get_effective_session_path(workspace_path, remote_connection_id, remote_ssh_host) - .await - } else { - let path_norm = normalize_remote_workspace_path(workspace_path); - if let (Some(_), Some(host)) = ( - remote_connection_id.map(str::trim).filter(|s| !s.is_empty()), - remote_ssh_host.map(str::trim).filter(|s| !s.is_empty()), - ) { - return remote_workspace_session_mirror_dir(host, &path_norm); - } - if let Some(rid) = remote_connection_id.map(str::trim).filter(|s| !s.is_empty()) { - return unresolved_remote_session_storage_dir(rid, &path_norm); + if let Some(identity) = resolve_workspace_session_identity( + workspace_path, + remote_connection_id, + remote_ssh_host, + ) + .await + { + if identity.hostname == "_unresolved" { + if let Some(connection_id) = identity.remote_connection_id.as_deref() { + return unresolved_remote_session_storage_dir(connection_id, &identity.workspace_path); + } } - std::path::PathBuf::from(workspace_path) + return identity.session_storage_path(); } + + std::path::PathBuf::from(workspace_path) } /// Check if a specific path belongs to any registered remote workspace. 
diff --git a/src/crates/core/src/service/session/types.rs b/src/crates/core/src/service/session/types.rs index cb8ef23b..975bc39a 100644 --- a/src/crates/core/src/service/session/types.rs +++ b/src/crates/core/src/service/session/types.rs @@ -73,9 +73,14 @@ pub struct SessionMetadata { #[serde(skip_serializing_if = "Option::is_none")] pub todos: Option, - /// Workspace path this session belongs to (set at creation time) + /// Workspace path this session belongs to (normalized source workspace root, not mirror dir) #[serde(skip_serializing_if = "Option::is_none", alias = "workspace_path")] pub workspace_path: Option, + + /// Unified hostname for workspace identity: `localhost` for local workspaces, + /// SSH host for remote workspaces. + #[serde(default, skip_serializing_if = "Option::is_none", alias = "workspace_hostname")] + pub workspace_hostname: Option, } /// Session status @@ -442,6 +447,7 @@ impl SessionMetadata { custom_metadata: None, todos: None, workspace_path: None, + workspace_hostname: None, } } diff --git a/src/crates/core/src/service/workspace/manager.rs b/src/crates/core/src/service/workspace/manager.rs index 035fc1f5..d3065c76 100644 --- a/src/crates/core/src/service/workspace/manager.rs +++ b/src/crates/core/src/service/workspace/manager.rs @@ -347,7 +347,20 @@ impl WorkspaceInfo { metadata: HashMap::new(), }; - if !is_remote { + if is_remote { + if let Some(ssh_host) = options.remote_ssh_host.as_ref().filter(|s| !s.trim().is_empty()) { + workspace.metadata.insert( + "sshHost".to_string(), + serde_json::Value::String(ssh_host.trim().to_string()), + ); + } + if let Some(conn_id) = options.remote_connection_id.as_ref().filter(|s| !s.trim().is_empty()) { + workspace.metadata.insert( + "connectionId".to_string(), + serde_json::Value::String(conn_id.trim().to_string()), + ); + } + } else { workspace.metadata.insert( "sshHost".to_string(), serde_json::Value::String(LOCAL_WORKSPACE_SSH_HOST.to_string()), @@ -982,6 +995,20 @@ impl WorkspaceManager { if 
let Some(display_name) = &options.display_name { workspace.name = display_name.clone(); } + if options.workspace_kind == WorkspaceKind::Remote { + if let Some(ssh_host) = options.remote_ssh_host.as_ref().filter(|s| !s.trim().is_empty()) { + workspace.metadata.insert( + "sshHost".to_string(), + serde_json::Value::String(ssh_host.trim().to_string()), + ); + } + if let Some(conn_id) = options.remote_connection_id.as_ref().filter(|s| !s.trim().is_empty()) { + workspace.metadata.insert( + "connectionId".to_string(), + serde_json::Value::String(conn_id.trim().to_string()), + ); + } + } workspace.load_identity().await; workspace.load_worktree().await; } diff --git a/src/crates/core/src/service/workspace/service.rs b/src/crates/core/src/service/workspace/service.rs index 0f17ed66..2138cc88 100644 --- a/src/crates/core/src/service/workspace/service.rs +++ b/src/crates/core/src/service/workspace/service.rs @@ -1047,10 +1047,33 @@ impl WorkspaceService { if let Some(data) = workspace_data { let mut manager = self.manager.write().await; - *manager.get_workspaces_mut() = data.workspaces; - manager.set_opened_workspace_ids(data.opened_workspace_ids.clone()); - manager.set_recent_workspaces(data.recent_workspaces); - manager.set_recent_assistant_workspaces(data.recent_assistant_workspaces); + let mut workspaces = data.workspaces; + // Filter out legacy remote workspaces that don't have the required metadata (sshHost and connectionId) + workspaces.retain(|_id, ws| { + if ws.workspace_kind == WorkspaceKind::Remote { + // Check if this remote workspace has the required metadata + let has_ssh_host = ws.metadata.get("sshHost").and_then(|v| v.as_str()).map_or(false, |s| !s.trim().is_empty()); + let has_connection_id = ws.metadata.get("connectionId").and_then(|v| v.as_str()).map_or(false, |s| !s.trim().is_empty()); + if !has_ssh_host || !has_connection_id { + // Skip this legacy remote workspace + info!("Skipping legacy remote workspace without required metadata: id={}, root_path={}", 
_id, ws.root_path.display()); + return false; + } + } + true + }); + + *manager.get_workspaces_mut() = workspaces; + // Also filter opened/recent lists to remove references to removed legacy workspaces + let filtered_opened_ids: Vec = data.opened_workspace_ids.clone().into_iter().filter(|id| manager.get_workspaces().contains_key(id)).collect(); + manager.set_opened_workspace_ids(filtered_opened_ids); + + let filtered_recent: Vec = data.recent_workspaces.clone().into_iter().filter(|id| manager.get_workspaces().contains_key(id)).collect(); + manager.set_recent_workspaces(filtered_recent); + + let filtered_recent_assistant: Vec = data.recent_assistant_workspaces.clone().into_iter().filter(|id| manager.get_workspaces().contains_key(id)).collect(); + manager.set_recent_assistant_workspaces(filtered_recent_assistant); + let id_remap = manager.migrate_local_workspace_ids_to_stable_storage(); let raw_current = data diff --git a/src/web-ui/src/features/ssh-remote/RemoteFileBrowser.tsx b/src/web-ui/src/features/ssh-remote/RemoteFileBrowser.tsx index 14228b65..34fe9d6d 100644 --- a/src/web-ui/src/features/ssh-remote/RemoteFileBrowser.tsx +++ b/src/web-ui/src/features/ssh-remote/RemoteFileBrowser.tsx @@ -26,7 +26,7 @@ import './RemoteFileBrowser.scss'; interface RemoteFileBrowserProps { connectionId: string; - /** Defaults to `~` (remote home) to avoid listing `/` on restricted hosts. */ + /** Defaults to `/tmp` if parent does not pass a resolved absolute home (avoid literal `~` for SFTP). */ initialPath?: string; /** Used by the Home button; defaults to `initialPath`. 
*/ homePath?: string; @@ -82,7 +82,7 @@ function isTauriDesktop(): boolean { export const RemoteFileBrowser: React.FC = ({ connectionId, - initialPath = '~', + initialPath = '/tmp', homePath, onSelect, onCancel, diff --git a/src/web-ui/src/features/ssh-remote/SSHRemoteProvider.tsx b/src/web-ui/src/features/ssh-remote/SSHRemoteProvider.tsx index cd3252b7..6dde64c0 100644 --- a/src/web-ui/src/features/ssh-remote/SSHRemoteProvider.tsx +++ b/src/web-ui/src/features/ssh-remote/SSHRemoteProvider.tsx @@ -87,7 +87,8 @@ export const SSHRemoteProvider: React.FC = ({ children } const [showFileBrowser, setShowFileBrowser] = useState(false); const [error, setError] = useState(null); const [connectionError, setConnectionError] = useState(null); - const [remoteFileBrowserInitialPath, setRemoteFileBrowserInitialPath] = useState('~'); + /** Fallback only when home cannot be resolved (never use literal `~` for SFTP). */ + const [remoteFileBrowserInitialPath, setRemoteFileBrowserInitialPath] = useState('/tmp'); // Per-workspace connection statuses (keyed by connectionId) const [workspaceStatuses, setWorkspaceStatuses] = useState>({}); const heartbeatInterval = useRef(null); @@ -272,8 +273,9 @@ export const SSHRemoteProvider: React.FC = ({ children } for (const [, workspace] of toReconnect) { const isAlreadyOpened = openedRemote.some( ws => + ws.connectionId === workspace.connectionId && normalizeRemoteWorkspacePath(ws.rootPath) === - normalizeRemoteWorkspacePath(workspace.remotePath) + normalizeRemoteWorkspacePath(workspace.remotePath) ); // Check if SSH is already live @@ -342,7 +344,7 @@ export const SSHRemoteProvider: React.FC = ({ children } log.warn('Auto-reconnect failed, removing workspace from sidebar', { connectionId: workspace.connectionId, }); - await workspaceManager.removeRemoteWorkspace(workspace.connectionId).catch(() => {}); + await workspaceManager.removeRemoteWorkspace(workspace.connectionId, workspace.remotePath).catch(() => {}); } } } catch (e) { @@ -397,9 +399,17 
@@ export const SSHRemoteProvider: React.FC = ({ children } if (result.success && result.connectionId) { log.info('SSH connection successful', { connectionId: result.connectionId }); - const home = result.serverInfo?.homeDir?.trim(); + let home = result.serverInfo?.homeDir?.trim(); + if (!home && result.connectionId) { + try { + const info = await sshApi.getServerInfo(result.connectionId); + home = info?.homeDir?.trim(); + } catch { + /* non-desktop or probe skipped */ + } + } setRemoteFileBrowserInitialPath( - home && home.length > 0 ? normalizeRemoteWorkspacePath(home) : '~' + home && home.length > 0 ? normalizeRemoteWorkspacePath(home) : '/tmp' ); setStatus('connected'); setIsConnected(true); @@ -453,7 +463,7 @@ export const SSHRemoteProvider: React.FC = ({ children } setRemoteWorkspace(null); setIsConnected(false); setShowFileBrowser(false); - setRemoteFileBrowserInitialPath('~'); + setRemoteFileBrowserInitialPath('/tmp'); if (currentRemoteWorkspace) { setWorkspaceStatus(currentRemoteWorkspace.connectionId, 'disconnected'); diff --git a/src/web-ui/src/features/ssh-remote/sshApi.ts b/src/web-ui/src/features/ssh-remote/sshApi.ts index e159967f..9e22bee8 100644 --- a/src/web-ui/src/features/ssh-remote/sshApi.ts +++ b/src/web-ui/src/features/ssh-remote/sshApi.ts @@ -11,6 +11,7 @@ import type { RemoteWorkspace, SSHConfigLookupResult, SSHConfigEntry, + ServerInfo, } from './types'; // API adapter for Tauri/Server Mode compatibility @@ -68,6 +69,13 @@ export const sshApi = { return api.invoke('ssh_is_connected', { connectionId }); }, + /** + * Server info for an active connection; may probe `echo ~` / `$HOME` if `homeDir` was missing. 
+ */ + async getServerInfo(connectionId: string): Promise { + return api.invoke('ssh_get_server_info', { connectionId }); + }, + /** * Get SSH config for a host from ~/.ssh/config */ diff --git a/src/web-ui/src/flow_chat/store/FlowChatStore.ts b/src/web-ui/src/flow_chat/store/FlowChatStore.ts index c76f28dc..f7bedf73 100644 --- a/src/web-ui/src/flow_chat/store/FlowChatStore.ts +++ b/src/web-ui/src/flow_chat/store/FlowChatStore.ts @@ -1583,7 +1583,8 @@ export class FlowChatStore { mode: validatedAgentType, workspacePath: (metadata as any).workspacePath || workspacePath, remoteConnectionId: metadata.remoteConnectionId || remoteConnectionId, - remoteSshHost: metadata.remoteSshHost || remoteSshHost, + remoteSshHost: + metadata.remoteSshHost || metadata.workspaceHostname || remoteSshHost, parentSessionId: relationship.parentSessionId, sessionKind: relationship.sessionKind, btwThreads: [], diff --git a/src/web-ui/src/infrastructure/services/business/workspaceManager.ts b/src/web-ui/src/infrastructure/services/business/workspaceManager.ts index c9461ef3..5b0edd90 100644 --- a/src/web-ui/src/infrastructure/services/business/workspaceManager.ts +++ b/src/web-ui/src/infrastructure/services/business/workspaceManager.ts @@ -454,9 +454,9 @@ class WorkspaceManager { } } - public async removeRemoteWorkspace(connectionId: string): Promise { + public async removeRemoteWorkspace(connectionId: string, remotePath?: string): Promise { try { - const workspace = this.findRemoteWorkspaceByConnectionId(connectionId); + const workspace = this.findRemoteWorkspace(connectionId, remotePath); if (!workspace) { return; } @@ -480,18 +480,26 @@ class WorkspaceManager { this.emit({ type: 'workspace:active-changed', workspace: currentWorkspace }); } catch (error) { - log.error('Failed to remove remote workspace', { connectionId, error }); + log.error('Failed to remove remote workspace', { connectionId, remotePath, error }); const errorMessage = error instanceof Error ? 
error.message : String(error); this.updateState({ error: errorMessage }, { type: 'workspace:error', error: errorMessage }); throw error; } } - private findRemoteWorkspaceByConnectionId(connectionId: string): WorkspaceInfo | undefined { + private findRemoteWorkspace(connectionId: string, remotePath?: string): WorkspaceInfo | undefined { + const normalizedRemotePath = remotePath ? normalizeRemoteWorkspacePath(remotePath) : null; for (const [, ws] of this.state.openedWorkspaces) { - if (ws.connectionId === connectionId && ws.workspaceKind === WorkspaceKind.Remote) { - return ws; + if (ws.workspaceKind !== WorkspaceKind.Remote) { + continue; } + if (ws.connectionId !== connectionId) { + continue; + } + if (normalizedRemotePath && normalizeRemoteWorkspacePath(ws.rootPath) !== normalizedRemotePath) { + continue; + } + return ws; } return undefined; } diff --git a/src/web-ui/src/shared/types/session-history.ts b/src/web-ui/src/shared/types/session-history.ts index ff9c48ea..e373d9e2 100644 --- a/src/web-ui/src/shared/types/session-history.ts +++ b/src/web-ui/src/shared/types/session-history.ts @@ -33,6 +33,8 @@ export interface SessionMetadata { workspacePath?: string; remoteConnectionId?: string; remoteSshHost?: string; + /** Backend unified workspace identity field: localhost for local, SSH host for remote. */ + workspaceHostname?: string; } export type SessionStatus = 'active' | 'archived' | 'completed';