/** * CLI `ComputerExecutor` implementation. Wraps two native modules: * - `@ant/computer-use-input` (Rust/enigo) — mouse, keyboard, frontmost app * - `@ant/computer-use-swift` — SCContentFilter screenshots, NSWorkspace apps, TCC * * Contract: `packages/desktop/computer-use-mcp/src/executor.ts` in the apps * repo. The reference impl is Cowork's `apps/desktop/src/main/nest-only/ * computer-use/executor.ts` — see notable deviations under "CLI deltas" below. * * ── CLI deltas from Cowork ───────────────────────────────────────────────── * * No `withClickThrough`. Cowork wraps every mouse op in * `BrowserWindow.setIgnoreMouseEvents(true)` so clicks fall through the * overlay. We're a terminal — no window — so the click-through bracket is * a no-op. The sentinel `CLI_HOST_BUNDLE_ID` never matches frontmost. * * Terminal as surrogate host. `getTerminalBundleId()` detects the emulator * we're running inside. It's passed as `hostBundleId` to `prepareDisplay`/ * `resolvePrepareCapture` so the Swift side exempts it from hide AND skips * it in the activate z-order walk (so the terminal being frontmost doesn't * eat clicks meant for the target app). Also stripped from `allowedBundleIds` * via `withoutTerminal()` so screenshots don't capture it (Swift 0.2.1's * captureExcluding takes an allow-list despite the name — apps#30355). * `capabilities.hostBundleId` stays as the sentinel — the package's * frontmost gate uses that, and the terminal being frontmost is fine. * * Clipboard via `pbcopy`/`pbpaste`. No Electron `clipboard` module. */ import type { ComputerExecutor, DisplayGeometry, FrontmostApp, InstalledApp, ResolvePrepareCaptureResult, RunningApp, ScreenshotResult, } from '@ant/computer-use-mcp' import { API_RESIZE_PARAMS, targetImageSize } from '@ant/computer-use-mcp' import { logForDebugging } from '../debug.js' import { errorMessage } from '../errors.js' import { execFileNoThrow } from '../execFileNoThrow.js' import { sleep } from '../sleep.js' import { CLI_CU_CAPABILITIES, CLI_HOST_BUNDLE_ID, getTerminalBundleId, } from './common.js' import { drainRunLoop } from './drainRunLoop.js' import { notifyExpectedEscape } from './escHotkey.js' import { requireComputerUseInput } from './inputLoader.js' import { requireComputerUseSwift } from './swiftLoader.js' // ── Helpers ─────────────────────────────────────────────────────────────────── const SCREENSHOT_JPEG_QUALITY = 0.75 /** Logical → physical → API target dims. See `targetImageSize` + COORDINATES.md. */ function computeTargetDims( logicalW: number, logicalH: number, scaleFactor: number, ): [number, number] { const physW = Math.round(logicalW * scaleFactor) const physH = Math.round(logicalH * scaleFactor) return targetImageSize(physW, physH, API_RESIZE_PARAMS) } async function readClipboardViaPbpaste(): Promise { const { stdout, code } = await execFileNoThrow('pbpaste', [], { useCwd: false, }) if (code !== 0) { throw new Error(`pbpaste exited with code ${code}`) } return stdout } async function writeClipboardViaPbcopy(text: string): Promise { const { code } = await execFileNoThrow('pbcopy', [], { input: text, useCwd: false, }) if (code !== 0) { throw new Error(`pbcopy exited with code ${code}`) } } type Input = ReturnType /** * Single-element key sequence matching "escape" or "esc" (case-insensitive). * Used to hole-punch the CGEventTap abort for model-synthesized Escape — enigo * accepts both spellings, so the tap must too. */ function isBareEscape(parts: readonly string[]): boolean { if (parts.length !== 1) return false const lower = parts[0]!.toLowerCase() return lower === 'escape' || lower === 'esc' } /** * Instant move, then 50ms — an input→HID→AppKit→NSEvent round-trip before the * caller reads `NSEvent.mouseLocation` or dispatches a click. Used for click, * scroll, and drag-from; `animatedMove` is reserved for drag-to only. The * intermediate animation frames were triggering hover states and, on the * decomposed mouseDown/moveMouse path, emitting stray `.leftMouseDragged` * events (toolCalls.ts handleScroll's mouse_full workaround). */ const MOVE_SETTLE_MS = 50 async function moveAndSettle( input: Input, x: number, y: number, ): Promise { await input.moveMouse(x, y, false) await sleep(MOVE_SETTLE_MS) } /** * Release `pressed` in reverse (last pressed = first released). Errors are * swallowed so a release failure never masks the real error. * * Drains via pop() rather than snapshotting length: if a drainRunLoop- * orphaned press lambda resolves an in-flight input.key() AFTER finally * calls us, that late push is still released on the next iteration. The * orphaned flag stops the lambda at its NEXT check, not the current await. */ async function releasePressed(input: Input, pressed: string[]): Promise { let k: string | undefined while ((k = pressed.pop()) !== undefined) { try { await input.key(k, 'release') } catch { // Swallow — best-effort release. } } } /** * Bracket `fn()` with modifier press/release. `pressed` tracks which presses * actually landed, so a mid-press throw only releases what was pressed — no * stuck modifiers. The finally covers both press-phase and fn() throws. * * Caller must already be inside drainRunLoop() — key() dispatches to the * main queue and needs the pump to resolve. */ async function withModifiers( input: Input, mods: string[], fn: () => Promise, ): Promise { const pressed: string[] = [] try { for (const m of mods) { await input.key(m, 'press') pressed.push(m) } return await fn() } finally { await releasePressed(input, pressed) } } /** * Port of Cowork's `typeViaClipboard`. Sequence: * 1. Save the user's clipboard. * 2. Write our text. * 3. READ-BACK VERIFY — clipboard writes can silently fail. If the * read-back doesn't match, never press Cmd+V (would paste junk). * 4. Cmd+V via keys(). * 5. Sleep 100ms — battle-tested threshold for the paste-effect vs * clipboard-restore race. Restoring too soon means the target app * pastes the RESTORED content. * 6. Restore — in a `finally`, so a throw between 2-5 never leaves the * user's clipboard clobbered. Restore failures are swallowed. */ async function typeViaClipboard(input: Input, text: string): Promise { let saved: string | undefined try { saved = await readClipboardViaPbpaste() } catch { logForDebugging( '[computer-use] pbpaste before paste failed; proceeding without restore', ) } try { await writeClipboardViaPbcopy(text) if ((await readClipboardViaPbpaste()) !== text) { throw new Error('Clipboard write did not round-trip.') } await input.keys(['command', 'v']) await sleep(100) } finally { if (typeof saved === 'string') { try { await writeClipboardViaPbcopy(saved) } catch { logForDebugging('[computer-use] clipboard restore after paste failed') } } } } /** * Port of Cowork's `animateMouseMovement` + `animatedMove`. Ease-out-cubic at * 60fps; distance-proportional duration at 2000 px/sec, capped at 0.5s. When * the sub-gate is off (or distance < ~2 frames), falls through to * `moveAndSettle`. Called only from `drag` for the press→to motion — target * apps may watch for `.leftMouseDragged` specifically (not just "button down + * position changed") and the slow motion gives them time to process * intermediate positions (scrollbars, window resizes). */ async function animatedMove( input: Input, targetX: number, targetY: number, mouseAnimationEnabled: boolean, ): Promise { if (!mouseAnimationEnabled) { await moveAndSettle(input, targetX, targetY) return } const start = await input.mouseLocation() const deltaX = targetX - start.x const deltaY = targetY - start.y const distance = Math.hypot(deltaX, deltaY) if (distance < 1) return const durationSec = Math.min(distance / 2000, 0.5) if (durationSec < 0.03) { await moveAndSettle(input, targetX, targetY) return } const frameRate = 60 const frameIntervalMs = 1000 / frameRate const totalFrames = Math.floor(durationSec * frameRate) for (let frame = 1; frame <= totalFrames; frame++) { const t = frame / totalFrames const eased = 1 - Math.pow(1 - t, 3) await input.moveMouse( Math.round(start.x + deltaX * eased), Math.round(start.y + deltaY * eased), false, ) if (frame < totalFrames) { await sleep(frameIntervalMs) } } // Last frame has no trailing sleep — same HID round-trip before the // caller's mouseButton reads NSEvent.mouseLocation. await sleep(MOVE_SETTLE_MS) } // ── Factory ─────────────────────────────────────────────────────────────── export function createCliExecutor(opts: { getMouseAnimationEnabled: () => boolean getHideBeforeActionEnabled: () => boolean }): ComputerExecutor { if (process.platform !== 'darwin') { throw new Error( `createCliExecutor called on ${process.platform}. Computer control is macOS-only.`, ) } // Swift loaded once at factory time — every executor method needs it. // Input loaded lazily via requireComputerUseInput() on first mouse/keyboard // call — it caches internally, so screenshot-only flows never pull the // enigo .node. const cu = requireComputerUseSwift() const { getMouseAnimationEnabled, getHideBeforeActionEnabled } = opts const terminalBundleId = getTerminalBundleId() const surrogateHost = terminalBundleId ?? CLI_HOST_BUNDLE_ID // Swift 0.2.1's captureExcluding/captureRegion take an ALLOW list despite the // name (apps#30355 — complement computed Swift-side against running apps). // The terminal isn't in the user's grants so it's naturally excluded, but if // the package ever passes it through we strip it here so the terminal never // photobombs a screenshot. const withoutTerminal = (allowed: readonly string[]): string[] => terminalBundleId === null ? [...allowed] : allowed.filter(id => id !== terminalBundleId) logForDebugging( terminalBundleId ? `[computer-use] terminal ${terminalBundleId} → surrogate host (hide-exempt, activate-skip, screenshot-excluded)` : '[computer-use] terminal not detected; falling back to sentinel host', ) return { capabilities: { ...CLI_CU_CAPABILITIES, hostBundleId: CLI_HOST_BUNDLE_ID, }, // ── Pre-action sequence (hide + defocus) ──────────────────────────── async prepareForAction( allowlistBundleIds: string[], displayId?: number, ): Promise { if (!getHideBeforeActionEnabled()) { return [] } // prepareDisplay isn't @MainActor (plain Task{}), but its .hide() calls // trigger window-manager events that queue on CFRunLoop. Without the // pump, those pile up during Swift's ~1s of usleeps and flush all at // once when the next pumped call runs — visible window flashing. // Electron drains CFRunLoop continuously so Cowork doesn't see this. // Worst-case 100ms + 5×200ms safety-net ≈ 1.1s, well under the 30s // drainRunLoop ceiling. // // "Continue with action execution even if switching fails" — the // frontmost gate in toolCalls.ts catches any actual unsafe state. return drainRunLoop(async () => { try { const result = await cu.apps.prepareDisplay( allowlistBundleIds, surrogateHost, displayId, ) if (result.activated) { logForDebugging( `[computer-use] prepareForAction: activated ${result.activated}`, ) } return result.hidden } catch (err) { logForDebugging( `[computer-use] prepareForAction failed; continuing to action: ${errorMessage(err)}`, { level: 'warn' }, ) return [] } }) }, async previewHideSet( allowlistBundleIds: string[], displayId?: number, ): Promise> { return cu.apps.previewHideSet( [...allowlistBundleIds, surrogateHost], displayId, ) }, // ── Display ────────────────────────────────────────────────────────── async getDisplaySize(displayId?: number): Promise { return cu.display.getSize(displayId) }, async listDisplays(): Promise { return cu.display.listAll() }, async findWindowDisplays( bundleIds: string[], ): Promise> { return cu.apps.findWindowDisplays(bundleIds) }, async resolvePrepareCapture(opts: { allowedBundleIds: string[] preferredDisplayId?: number autoResolve: boolean doHide?: boolean }): Promise { const d = cu.display.getSize(opts.preferredDisplayId) const [targetW, targetH] = computeTargetDims( d.width, d.height, d.scaleFactor, ) return drainRunLoop(() => cu.resolvePrepareCapture( withoutTerminal(opts.allowedBundleIds), surrogateHost, SCREENSHOT_JPEG_QUALITY, targetW, targetH, opts.preferredDisplayId, opts.autoResolve, opts.doHide, ), ) }, /** * Pre-size to `targetImageSize` output so the API transcoder's early-return * fires — no server-side resize, `scaleCoord` stays coherent. See * packages/desktop/computer-use-mcp/COORDINATES.md. */ async screenshot(opts: { allowedBundleIds: string[] displayId?: number }): Promise { const d = cu.display.getSize(opts.displayId) const [targetW, targetH] = computeTargetDims( d.width, d.height, d.scaleFactor, ) return drainRunLoop(() => cu.screenshot.captureExcluding( withoutTerminal(opts.allowedBundleIds), SCREENSHOT_JPEG_QUALITY, targetW, targetH, opts.displayId, ), ) }, async zoom( regionLogical: { x: number; y: number; w: number; h: number }, allowedBundleIds: string[], displayId?: number, ): Promise<{ base64: string; width: number; height: number }> { const d = cu.display.getSize(displayId) const [outW, outH] = computeTargetDims( regionLogical.w, regionLogical.h, d.scaleFactor, ) return drainRunLoop(() => cu.screenshot.captureRegion( withoutTerminal(allowedBundleIds), regionLogical.x, regionLogical.y, regionLogical.w, regionLogical.h, outW, outH, SCREENSHOT_JPEG_QUALITY, displayId, ), ) }, // ── Keyboard ───────────────────────────────────────────────────────── /** * xdotool-style sequence e.g. "ctrl+shift+a" → split on '+' and pass to * keys(). keys() dispatches to DispatchQueue.main — drainRunLoop pumps * CFRunLoop so it resolves. Rust's error-path cleanup (enigo_wrap.rs) * releases modifiers on each invocation, so a mid-loop throw leaves * nothing stuck. 8ms between iterations — 125Hz USB polling cadence. */ async key(keySequence: string, repeat?: number): Promise { const input = requireComputerUseInput() const parts = keySequence.split('+').filter(p => p.length > 0) // Bare-only: the CGEventTap checks event.flags.isEmpty so ctrl+escape // etc. pass through without aborting. const isEsc = isBareEscape(parts) const n = repeat ?? 1 await drainRunLoop(async () => { for (let i = 0; i < n; i++) { if (i > 0) { await sleep(8) } if (isEsc) { notifyExpectedEscape() } await input.keys(parts) } }) }, async holdKey(keyNames: string[], durationMs: number): Promise { const input = requireComputerUseInput() // Press/release each wrapped in drainRunLoop; the sleep sits outside so // durationMs isn't bounded by drainRunLoop's 30s timeout. `pressed` // tracks which presses landed so a mid-press throw still releases // everything that was actually pressed. // // `orphaned` guards against a timeout-orphan race: if the press-phase // drainRunLoop times out while the esc-hotkey pump-retain keeps the // pump running, the orphaned lambda would continue pushing to `pressed` // after finally's releasePressed snapshotted the length — leaving keys // stuck. The flag stops the lambda at the next iteration. const pressed: string[] = [] let orphaned = false try { await drainRunLoop(async () => { for (const k of keyNames) { if (orphaned) return // Bare Escape: notify the CGEventTap so it doesn't fire the // abort callback for a model-synthesized press. Same as key(). if (isBareEscape([k])) { notifyExpectedEscape() } await input.key(k, 'press') pressed.push(k) } }) await sleep(durationMs) } finally { orphaned = true await drainRunLoop(() => releasePressed(input, pressed)) } }, async type(text: string, opts: { viaClipboard: boolean }): Promise { const input = requireComputerUseInput() if (opts.viaClipboard) { // keys(['command','v']) inside needs the pump. await drainRunLoop(() => typeViaClipboard(input, text)) return } // `toolCalls.ts` handles the grapheme loop + 8ms sleeps and calls this // once per grapheme. typeText doesn't dispatch to the main queue. await input.typeText(text) }, readClipboard: readClipboardViaPbpaste, writeClipboard: writeClipboardViaPbcopy, // ── Mouse ──────────────────────────────────────────────────────────── async moveMouse(x: number, y: number): Promise { await moveAndSettle(requireComputerUseInput(), x, y) }, /** * Move, then click. Modifiers are press/release bracketed via withModifiers * — same pattern as Cowork. AppKit computes NSEvent.clickCount from timing * + position proximity, so double/triple click work without setting the * CGEvent clickState field. key() inside withModifiers needs the pump; * the modifier-less path doesn't. */ async click( x: number, y: number, button: 'left' | 'right' | 'middle', count: 1 | 2 | 3, modifiers?: string[], ): Promise { const input = requireComputerUseInput() await moveAndSettle(input, x, y) if (modifiers && modifiers.length > 0) { await drainRunLoop(() => withModifiers(input, modifiers, () => input.mouseButton(button, 'click', count), ), ) } else { await input.mouseButton(button, 'click', count) } }, async mouseDown(): Promise { await requireComputerUseInput().mouseButton('left', 'press') }, async mouseUp(): Promise { await requireComputerUseInput().mouseButton('left', 'release') }, async getCursorPosition(): Promise<{ x: number; y: number }> { return requireComputerUseInput().mouseLocation() }, /** * `from === undefined` → drag from current cursor (training's * left_click_drag with start_coordinate omitted). Inner `finally`: the * button is ALWAYS released even if the move throws — otherwise the * user's left button is stuck-pressed until they physically click. * 50ms sleep after press: enigo's move_mouse reads NSEvent.pressedMouseButtons * to decide .leftMouseDragged vs .mouseMoved; the synthetic leftMouseDown * needs a HID-tap round-trip to show up there. */ async drag( from: { x: number; y: number } | undefined, to: { x: number; y: number }, ): Promise { const input = requireComputerUseInput() if (from !== undefined) { await moveAndSettle(input, from.x, from.y) } await input.mouseButton('left', 'press') await sleep(MOVE_SETTLE_MS) try { await animatedMove(input, to.x, to.y, getMouseAnimationEnabled()) } finally { await input.mouseButton('left', 'release') } }, /** * Move first, then scroll each axis. Vertical-first — it's the common * axis; a horizontal failure shouldn't lose the vertical. */ async scroll(x: number, y: number, dx: number, dy: number): Promise { const input = requireComputerUseInput() await moveAndSettle(input, x, y) if (dy !== 0) { await input.mouseScroll(dy, 'vertical') } if (dx !== 0) { await input.mouseScroll(dx, 'horizontal') } }, // ── App management ─────────────────────────────────────────────────── async getFrontmostApp(): Promise { const info = requireComputerUseInput().getFrontmostAppInfo() if (!info || !info.bundleId) return null return { bundleId: info.bundleId, displayName: info.appName } }, async appUnderPoint( x: number, y: number, ): Promise<{ bundleId: string; displayName: string } | null> { return cu.apps.appUnderPoint(x, y) }, async listInstalledApps(): Promise { // `ComputerUseInstalledApp` is `{bundleId, displayName, path}`. // `InstalledApp` adds optional `iconDataUrl` — left unpopulated; // the approval dialog fetches lazily via getAppIcon() below. return drainRunLoop(() => cu.apps.listInstalled()) }, async getAppIcon(path: string): Promise { return cu.apps.iconDataUrl(path) ?? undefined }, async listRunningApps(): Promise { return cu.apps.listRunning() }, async openApp(bundleId: string): Promise { await cu.apps.open(bundleId) }, } } /** * Module-level export (not on the executor object) — called at turn-end from * `stopHooks.ts` / `query.ts`, outside the executor lifecycle. Fire-and-forget * at the call site; the caller `.catch()`es. */ export async function unhideComputerUseApps( bundleIds: readonly string[], ): Promise { if (bundleIds.length === 0) return const cu = requireComputerUseSwift() await cu.apps.unhide([...bundleIds]) }