From 8efa8e18c128ddf975ee4dbcc95cfb6740d520d2 Mon Sep 17 00:00:00 2001 From: Simon <10131203+gaomeng1900@users.noreply.github.com> Date: Mon, 26 Jan 2026 21:03:51 +0800 Subject: [PATCH] refactor(ext): rewrite ext. totally-broken -> still-broken; THIS IS NOT WORKING --- .../extension/src/agent/AgentController.ts | 316 +++------------- .../src/agent/RemotePageController.ts | 159 +------- packages/extension/src/agent/TabsManager.ts | 49 +-- packages/extension/src/agent/protocol.ts | 116 ++---- packages/extension/src/agent/rpc.ts | 87 +---- packages/extension/src/agent/useAgent.ts | 51 --- .../extension/src/entrypoints/background.ts | 165 ++------- packages/extension/src/entrypoints/content.ts | 241 ++++-------- packages/extension/structure.md | 347 ++++++------------ 9 files changed, 333 insertions(+), 1198 deletions(-) diff --git a/packages/extension/src/agent/AgentController.ts b/packages/extension/src/agent/AgentController.ts index 692ec1c..81922f7 100644 --- a/packages/extension/src/agent/AgentController.ts +++ b/packages/extension/src/agent/AgentController.ts @@ -1,14 +1,8 @@ /** * AgentController - Manages agent lifecycle in SidePanel context * - * This class encapsulates all agent logic, keeping it isolated from the React UI. - * It runs entirely in the SidePanel frontend context, using the Background Script - * only as a stateless message relay for communicating with content scripts. - * - * Design goals: - * - Agent state lives here, not in Service Worker - * - SW is only a relay - no agent logic there - * - Future-proof: can be moved to other contexts (e.g., a controlling web page) + * Agent state lives here, SW is only a relay. + * Mask visibility is managed via chrome.storage (content scripts poll it). 
*/ import { PageAgentCore } from '@page-agent/core' import type { AgentActivity, AgentStatus, ExecutionResult, HistoricalEvent } from '@page-agent/core' @@ -16,8 +10,7 @@ import type { AgentActivity, AgentStatus, ExecutionResult, HistoricalEvent } fro import { DEMO_API_KEY, DEMO_BASE_URL, DEMO_MODEL } from '../utils/constants' import { RemotePageController } from './RemotePageController' import { type TabInfo, TabsManager } from './TabsManager' -import type { TabEventMessage } from './protocol' -import { isExtensionMessage } from './protocol' +import type { AgentState as StorageAgentState } from './protocol' import { createTabTools } from './tabTools' /** LLM configuration */ @@ -34,16 +27,6 @@ export interface AgentState { history: HistoricalEvent[] } -/** Event types emitted by AgentController */ -export interface AgentControllerEvents { - statuschange: AgentStatus - historychange: HistoricalEvent[] - activity: AgentActivity -} - -/** - * Format tab list for browser state header - */ function formatTabListHeader(tabs: TabInfo[], currentTabId: number | null): string { if (tabs.length === 0) return '' @@ -74,102 +57,53 @@ function formatTabListHeader(tabs: TabInfo[], currentTabId: number | null): stri return lines.join('\n') } -/** - * AgentController manages the agent lifecycle in the SidePanel. - * Emits events for React UI to subscribe to. 
- */ export class AgentController extends EventTarget { private agent: PageAgentCore | null = null private tabsManager: TabsManager | null = null private pageController: RemotePageController | null = null private llmConfig: LLMConfig - /** Current task being executed */ currentTask = '' - // ===== Mask State Management ===== - /** Browser's currently active tab (the one user sees) */ - private browserActiveTabId: number | null = null - /** Whether the browser window has focus */ - private windowHasFocus = true - /** Bound handler for tab events */ - private tabEventHandler: (message: unknown) => void - constructor() { super() - // Default to demo config this.llmConfig = { apiKey: DEMO_API_KEY, baseURL: DEMO_BASE_URL, model: DEMO_MODEL, } - // Bind tab event handler - this.tabEventHandler = this.handleTabEvent.bind(this) } - /** - * Initialize controller and load saved config - */ async init(): Promise { await this.loadConfig() - - // Initialize browser active tab - const [activeTab] = await chrome.tabs.query({ active: true, currentWindow: true }) - if (activeTab?.id) { - this.browserActiveTabId = activeTab.id - } - - // Register tab event listener - chrome.runtime.onMessage.addListener(this.tabEventHandler) - - console.log('[AgentController] Initialized, browserActiveTabId:', this.browserActiveTabId) + this.updateStorageState(null, false) + console.log('[AgentController] Initialized') } - /** - * Load LLM configuration from storage - */ private async loadConfig(): Promise { const result = await chrome.storage.local.get('llmConfig') if (result.llmConfig) { this.llmConfig = result.llmConfig as LLMConfig - console.log('[AgentController] Loaded LLM config from storage') - } else { - console.log('[AgentController] Using default demo config') } } - /** - * Save LLM configuration to storage - */ async configure(config: LLMConfig): Promise { this.llmConfig = config await chrome.storage.local.set({ llmConfig: config }) - console.log('[AgentController] Saved LLM config') - 
// Dispose existing agent if any if (this.agent && !this.agent.disposed) { this.agent.dispose() this.agent = null } } - /** - * Get current LLM config - */ getConfig(): LLMConfig { return { ...this.llmConfig } } - /** - * Get current agent state - */ getState(): AgentState { if (!this.agent) { - return { - status: 'idle', - task: '', - history: [], - } + return { status: 'idle', task: '', history: [] } } return { status: this.agent.status, @@ -178,86 +112,64 @@ export class AgentController extends EventTarget { } } - /** - * Get current agent status - */ get status(): AgentStatus { return this.agent?.status ?? 'idle' } - /** - * Get agent history - */ get history(): HistoricalEvent[] { return this.agent?.history ?? [] } - /** - * Check if a tab is managed by this controller - */ isTabManaged(tabId: number): boolean { return this.tabsManager?.isTabManaged(tabId) ?? false } - /** - * Get current tab ID - */ getCurrentTabId(): number | null { return this.tabsManager?.getCurrentTabId() ?? null } - /** - * Check if mask should be shown for a specific tab. - * Used by content script queries on page load. 
- */ - shouldShowMaskForTab(tabId: number): boolean { - const agentCurrentTabId = this.tabsManager?.getCurrentTabId() - const isRunning = this.status === 'running' - const isBrowserActiveTab = this.browserActiveTabId === tabId - const isAgentCurrentTab = agentCurrentTabId === tabId - const shouldShow = isRunning && this.windowHasFocus && isBrowserActiveTab && isAgentCurrentTab - - console.debug('[AgentController] shouldShowMaskForTab:', { - queryTabId: tabId, - agentStatus: this.status, - isRunning, - windowHasFocus: this.windowHasFocus, - browserActiveTabId: this.browserActiveTabId, - isBrowserActiveTab, - agentCurrentTabId, - isAgentCurrentTab, - shouldShow, - }) - - return shouldShow + /** Update storage state (fire-and-forget, no need to await) */ + private updateStorageState(tabId: number | null, running: boolean): void { + const agentState: StorageAgentState = { tabId, running } + chrome.storage.local.set({ agentState }) } - /** - * Create and configure agent instance - */ - private async createAgent(): Promise { - // Create page controller - this.pageController = new RemotePageController() + /** Synchronously dispose current agent and clear state */ + private disposeCurrentAgent(): void { + if (this.agent && !this.agent.disposed) { + this.agent.dispose() + } + if (this.tabsManager) { + this.tabsManager.dispose() + } + this.agent = null + this.tabsManager = null + this.pageController = null + this.updateStorageState(null, false) + } - // Create tabs manager + private async createAgent(): Promise { + this.pageController = new RemotePageController() this.tabsManager = new TabsManager() - // Generate task ID const taskId = Math.random().toString(36).slice(2, 10) - // Initialize tabs manager - await this.tabsManager.init(taskId, this.pageController) + // Pass callback to update storage when tab changes + await this.tabsManager.init(taskId, this.pageController, (tabId) => { + this.updateStorageState(tabId, true) + }) - // Create tab tools const tabTools = 
createTabTools(this.tabsManager) + // eslint-disable-next-line @typescript-eslint/no-this-alias + const controller = this + const newAgent = new PageAgentCore({ ...this.llmConfig, pageController: this.createPageControllerProxy(this.pageController, this.tabsManager) as any, language: 'en-US', customTools: tabTools, onBeforeStep: async (agentInstance: PageAgentCore) => { - // Check for tab changes and push observations if (this.tabsManager) { const changes = this.tabsManager.getAndClearChanges() @@ -278,7 +190,6 @@ export class AgentController extends EventTarget { }, }) - // Forward agent events newAgent.addEventListener('statuschange', () => { this.dispatchEvent(new CustomEvent('statuschange', { detail: newAgent.status })) }) @@ -292,19 +203,13 @@ export class AgentController extends EventTarget { this.dispatchEvent(new CustomEvent('activity', { detail: activity })) }) - newAgent.addEventListener('dispose', async () => { - console.debug('[AgentController] Agent dispose event received') + newAgent.addEventListener('dispose', () => { if (this.agent === newAgent) { - // Dispose all PageControllers on all managed tabs - if (this.tabsManager) { - console.debug('[AgentController] Disposing all PageControllers...') - await this.tabsManager.disposeAllPageControllers() - this.tabsManager.dispose() - } + this.tabsManager?.dispose() this.agent = null this.tabsManager = null this.pageController = null - console.debug('[AgentController] Agent and TabsManager disposed') + controller.updateStorageState(null, false) } this.dispatchEvent(new CustomEvent('statuschange', { detail: 'idle' })) }) @@ -312,17 +217,11 @@ export class AgentController extends EventTarget { return newAgent } - /** - * Create a proxy for PageController that: - * 1. Injects tab info into BrowserState.header - * 2. 
Syncs mask state after setTargetTab - */ + /** Proxy that injects tab list into browser state header */ private createPageControllerProxy( controller: RemotePageController, tabs: TabsManager ): RemotePageController { - // eslint-disable-next-line @typescript-eslint/no-this-alias - const agentController = this return new Proxy(controller, { get(target, prop, receiver) { if (prop === 'getBrowserState') { @@ -338,58 +237,28 @@ export class AgentController extends EventTarget { } } } - if (prop === 'setTargetTab') { - return async function (tabId: number) { - await target.setTargetTab(tabId) - // Sync mask after tab switch - await agentController.syncMaskState() - } - } return Reflect.get(target, prop, receiver) }, }) } - /** - * Execute a task - */ async execute(task: string): Promise { - console.log('[AgentController] ===== EXECUTE TASK =====') - console.log('[AgentController] Task:', task) + console.log('[AgentController] Execute:', task) this.currentTask = task - - // Emit running status immediately this.dispatchEvent(new CustomEvent('statuschange', { detail: 'running' })) try { - // Clean up any existing agent - if (this.agent && !this.agent.disposed) { - console.log('[AgentController] Disposing existing agent before new task') - this.agent.dispose() - await new Promise((r) => setTimeout(r, 100)) - } + // Clean up previous agent synchronously + this.disposeCurrentAgent() - // Clear old references - this.agent = null - this.tabsManager = null - this.pageController = null - - // Create fresh agent - console.log('[AgentController] Creating new agent...') this.agent = await this.createAgent() - console.log('[AgentController] Agent created successfully') + // Note: storage state is updated by TabsManager.init() via onTabSwitch callback - // Show mask if conditions are met (agent running + tab in foreground) - await this.syncMaskState() - - // Execute task - console.log('[AgentController] Starting task execution...') const result = await this.agent.execute(task) - 
console.log('[AgentController] Task completed:', result) return result } catch (error) { - console.error('[AgentController] Task execution error:', error) + console.error('[AgentController] Error:', error) const message = error instanceof Error ? error.message : String(error) this.dispatchEvent( new CustomEvent('historychange', { @@ -401,115 +270,20 @@ export class AgentController extends EventTarget { } } - /** - * Stop current task - */ stop(): void { - console.log('[AgentController] Stopping agent') - if (this.agent) { - this.agent.dispose() - } + console.log('[AgentController] Stop') + this.agent?.dispose() } - // ===== Mask State Management ===== - - /** - * Handle tab events from background script - */ - private handleTabEvent(message: unknown): void { - if (!isExtensionMessage(message)) return - if (message.type !== 'tab:event') return - - const event = message as TabEventMessage - - switch (event.eventType) { - case 'activated': - this.browserActiveTabId = event.tabId - console.debug('[AgentController] Tab activated:', event.tabId) - this.syncMaskState() - break - - case 'windowFocusChanged': - this.windowHasFocus = event.data?.focused ?? false - console.debug('[AgentController] Window focus changed:', this.windowHasFocus) - this.syncMaskState() - break - } - } - - /** - * Calculate whether mask should be visible. - * Mask is shown only when: - * 1. Agent is running - * 2. Window has focus - * 3. Browser's active tab === agent's current tab - */ - private get shouldMaskBeVisible(): boolean { - const agentCurrentTabId = this.tabsManager?.getCurrentTabId() - return ( - this.status === 'running' && - this.windowHasFocus && - this.browserActiveTabId !== null && - agentCurrentTabId !== null && - this.browserActiveTabId === agentCurrentTabId - ) - } - - /** - * Sync mask visibility based on current state. - * Shows mask on agent's current tab if conditions are met, hides otherwise. 
- */ - async syncMaskState(): Promise { - const agentCurrentTabId = this.tabsManager?.getCurrentTabId() - if (!this.pageController || agentCurrentTabId === null) { - return - } - - const shouldShow = this.shouldMaskBeVisible - console.debug('[AgentController] syncMaskState:', { - shouldShow, - agentCurrentTabId, - browserActiveTabId: this.browserActiveTabId, - windowHasFocus: this.windowHasFocus, - status: this.status, - }) - - try { - if (shouldShow) { - await this.pageController.showMask() - } else { - await this.pageController.hideMask() - } - } catch (e) { - console.debug('[AgentController] syncMaskState failed (ignored):', e) - } - } - - /** - * Dispose controller and clean up - */ dispose(): void { - console.log('[AgentController] Disposing controller') - - // Remove tab event listener - chrome.runtime.onMessage.removeListener(this.tabEventHandler) - - if (this.agent && !this.agent.disposed) { - this.agent.dispose() - } - this.agent = null - this.tabsManager = null - this.pageController = null + console.log('[AgentController] Dispose') + this.disposeCurrentAgent() this.currentTask = '' } } -// Singleton instance let controllerInstance: AgentController | null = null -/** - * Get or create the AgentController singleton - */ export function getAgentController(): AgentController { if (!controllerInstance) { controllerInstance = new AgentController() diff --git a/packages/extension/src/agent/RemotePageController.ts b/packages/extension/src/agent/RemotePageController.ts index 3e71251..25800be 100644 --- a/packages/extension/src/agent/RemotePageController.ts +++ b/packages/extension/src/agent/RemotePageController.ts @@ -1,11 +1,8 @@ /** * RemotePageController - Proxy for PageController in ContentScript * - * This class implements the same interface as PageController but forwards - * all method calls via RPC to the real PageController running in ContentScript. - * This allows PageAgentCore to work transparently with remote DOM operations. 
- * - * Tab targeting is managed externally by TabsManager via setTargetTab(). + * Forwards method calls via RPC to the real PageController in ContentScript. + * Mask visibility is managed by content script via storage polling. */ import type { ActionResult, @@ -15,16 +12,12 @@ import type { } from './protocol' import { type RPCClient, createRPCClient } from './rpc' -const DEBUG_PREFIX = '[RemotePageController]' - /** * Check if a URL can run content scripts. - * Chrome extensions cannot inject content scripts into certain pages. */ export function isContentScriptAllowed(url: string | undefined): boolean { if (!url) return false - // Restricted URL patterns const restrictedPatterns = [ /^chrome:\/\//, /^chrome-extension:\/\//, @@ -41,95 +34,50 @@ export function isContentScriptAllowed(url: string | undefined): boolean { return !restrictedPatterns.some((pattern) => pattern.test(url)) } -/** - * RemotePageController is a proxy that implements the PageController interface. - * All methods are async and forward to ContentScript via RPC. - * - * This class extends EventTarget to maintain API compatibility with PageController, - * though events in the remote context are not currently bridged. - */ export class RemotePageController { private rpc: RPCClient | null = null private _currentTabId: number | null = null private _currentTabUrl: string | undefined = undefined - private _previousTabId: number | null = null - /** Get the current target tab ID */ get currentTabId(): number | null { return this._currentTabId } - /** Get the current target tab URL */ get currentTabUrl(): string | undefined { return this._currentTabUrl } - /** Check if current tab supports content scripts */ get isCurrentTabAccessible(): boolean { return isContentScriptAllowed(this._currentTabUrl) } - // Tab ID is now set externally via setTargetTab() - - /** - * Set the target tab for all RPC operations. - * Called by TabsManager when switching tabs. 
- * Only handles cleanup on old tab - mask control is managed by AgentController. - */ async setTargetTab(tabId: number): Promise { - const previousTabId = this._currentTabId - const previousRpc = this.rpc - - console.debug(`${DEBUG_PREFIX} setTargetTab: ${previousTabId} → ${tabId}`) - - // Get tab info to check URL const tab = await chrome.tabs.get(tabId) - const tabUrl = tab.url - // Update state - this._previousTabId = previousTabId this._currentTabId = tabId - this._currentTabUrl = tabUrl + this._currentTabUrl = tab.url - // Check if this tab can run content scripts - if (!isContentScriptAllowed(tabUrl)) { - console.debug(`${DEBUG_PREFIX} Tab ${tabId} cannot run content scripts: ${tabUrl}`) - // Clear RPC - operations will return restricted page state + if (!isContentScriptAllowed(tab.url)) { this.rpc = null return } - // Create new RPC client for the new tab this.rpc = createRPCClient(tabId) - // Verify content script is ready by making a test call - // This uses the retry mechanism to wait for content script initialization + // Verify content script is ready try { await this.rpc.getLastUpdateTime() - console.debug(`${DEBUG_PREFIX} Content script ready on tab ${tabId}`) - } catch (error) { - console.error(`${DEBUG_PREFIX} Content script not ready on tab ${tabId}:`, error) - // Don't clear rpc - subsequent calls will retry and may succeed + } catch { + // Don't clear rpc - subsequent calls will retry } - - // Note: Mask show/hide is now controlled by AgentController.syncMaskState() - console.debug(`${DEBUG_PREFIX} Target tab set to ${tabId}`) } - /** - * Ensure RPC client is initialized - * @throws Error if setTargetTab() has not been called - */ private ensureInitialized(): void { if (!this._currentTabId) { throw new Error('RemotePageController not initialized. Call setTargetTab() first.') } } - /** - * Create a browser state for restricted pages that cannot run content scripts. - * Treats restricted pages as empty pages rather than errors. 
- */ private createRestrictedPageState(): BrowserState { return { url: this._currentTabUrl || '', @@ -140,9 +88,6 @@ export class RemotePageController { } } - /** - * Create a no-op action result for restricted pages - */ private createRestrictedActionResult(action: string): ActionResult { return { success: false, @@ -150,157 +95,77 @@ export class RemotePageController { } } - // ======= State Queries ======= - - /** - * Get current page URL - */ async getCurrentUrl(): Promise { - // Can return URL even for restricted pages return this._currentTabUrl || '' } - /** - * Get last tree update timestamp - */ async getLastUpdateTime(): Promise { if (!this.rpc) return Date.now() return this.rpc.getLastUpdateTime() } - /** - * Get structured browser state for LLM consumption. - */ async getBrowserState(): Promise { - // Return restricted page state if content scripts cannot run if (!this.rpc) { return this.createRestrictedPageState() } return this.rpc.getBrowserState() } - // ======= DOM Tree Operations ======= - - /** - * Update DOM tree, returns simplified HTML for LLM. 
- */ async updateTree(): Promise { this.ensureInitialized() if (!this.rpc) return '(empty page)' return this.rpc.updateTree() } - /** - * Clean up all element highlights - */ async cleanUpHighlights(): Promise { if (!this.rpc) return return this.rpc.cleanUpHighlights() } - // ======= Element Actions ======= - - /** - * Click element by index - */ async clickElement(index: number): Promise { this.ensureInitialized() if (!this.rpc) return this.createRestrictedActionResult('click') return this.rpc.clickElement(index) } - /** - * Input text into element by index - */ async inputText(index: number, text: string): Promise { this.ensureInitialized() if (!this.rpc) return this.createRestrictedActionResult('input text') return this.rpc.inputText(index, text) } - /** - * Select dropdown option by index and option text - */ async selectOption(index: number, optionText: string): Promise { this.ensureInitialized() if (!this.rpc) return this.createRestrictedActionResult('select option') return this.rpc.selectOption(index, optionText) } - /** - * Scroll vertically - */ async scroll(options: ScrollOptions): Promise { this.ensureInitialized() if (!this.rpc) return this.createRestrictedActionResult('scroll') return this.rpc.scroll(options) } - /** - * Scroll horizontally - */ async scrollHorizontally(options: ScrollHorizontallyOptions): Promise { this.ensureInitialized() if (!this.rpc) return this.createRestrictedActionResult('scroll') return this.rpc.scrollHorizontally(options) } - /** - * Execute arbitrary JavaScript on the page - */ async executeJavascript(script: string): Promise { this.ensureInitialized() if (!this.rpc) return this.createRestrictedActionResult('execute script') return this.rpc.executeJavascript(script) } - // ======= Mask Operations ======= + /** @note Mask visibility is managed by content script via storage polling. */ + async showMask(): Promise {} + /** @note Mask visibility is managed by content script via storage polling. 
*/ + async hideMask(): Promise {} - /** - * Show the visual mask overlay. - */ - async showMask(): Promise { - if (!this.rpc) return - return this.rpc.showMask() - } - - /** - * Hide the visual mask overlay. - */ - async hideMask(): Promise { - if (!this.rpc) return - await this.cleanUpHighlights() - return this.rpc.hideMask() - } - - /** - * Dispose and clean up resources on current tab - */ + /** Clear local state. Content script PageControllers clean up via storage polling. */ dispose(): void { - console.debug(`${DEBUG_PREFIX} dispose() called, current tab: ${this._currentTabId}`) - if (this.rpc) { - this.rpc.dispose().catch((e) => { - console.debug(`${DEBUG_PREFIX} dispose RPC failed (ignored):`, e) - }) - } this._currentTabId = null - this._previousTabId = null this.rpc = null } - - /** - * Dispose PageController on a specific tab (cleanup for multi-tab scenarios) - */ - async disposeTab(tabId: number): Promise { - console.debug(`${DEBUG_PREFIX} disposeTab(${tabId})`) - try { - const rpc = createRPCClient(tabId) - await rpc.cleanUpHighlights() - await rpc.hideMask() - await rpc.dispose() - console.debug(`${DEBUG_PREFIX} Tab ${tabId} disposed successfully`) - } catch (e) { - console.debug(`${DEBUG_PREFIX} disposeTab(${tabId}) failed (ignored):`, e) - } - } } diff --git a/packages/extension/src/agent/TabsManager.ts b/packages/extension/src/agent/TabsManager.ts index 8abd045..ac56c2f 100644 --- a/packages/extension/src/agent/TabsManager.ts +++ b/packages/extension/src/agent/TabsManager.ts @@ -83,16 +83,25 @@ export class TabsManager { /** Bound handler for cleanup */ private onTabRemovedHandler: (tabId: number) => void + /** Callback when current tab changes */ + private onTabSwitch: ((tabId: number) => void) | null = null + constructor() { this.onTabRemovedHandler = this.onTabRemoved.bind(this) } /** * Initialize the manager with current active tab + * @param onTabSwitch - Callback when current tab changes (for storage updates) */ - async init(taskId: string, 
pageController: RemotePageController): Promise { + async init( + taskId: string, + pageController: RemotePageController, + onTabSwitch?: (tabId: number) => void + ): Promise { this.taskId = taskId this.pageController = pageController + this.onTabSwitch = onTabSwitch ?? null this.disposed = false // Get current active tab as initial tab @@ -104,6 +113,8 @@ export class TabsManager { throw new Error('No active tab found') } + console.log(`${DEBUG_PREFIX} Initialized with tab:`, activeTab.id) + this.initialTabId = activeTab.id this.currentTabId = activeTab.id this.currentTabHistory = [] @@ -118,11 +129,10 @@ export class TabsManager { // Set target tab on page controller await pageController.setTargetTab(activeTab.id) + this.onTabSwitch?.(activeTab.id) // Register tab removal listener chrome.tabs.onRemoved.addListener(this.onTabRemovedHandler) - - console.debug(`${DEBUG_PREFIX} Initialized with tab:`, activeTab.id) } /** @@ -264,6 +274,7 @@ export class TabsManager { // Update page controller target await this.pageController.setTargetTab(tabId) + this.onTabSwitch?.(tabId) // Update tab info cache const tab = await chrome.tabs.get(tabId) @@ -411,34 +422,10 @@ export class TabsManager { } /** - * Dispose PageController on all managed tabs. - * This cleans up highlights and masks on every tab. - * Should be called before dispose() to ensure clean state. 
- */ - async disposeAllPageControllers(): Promise { - if (!this.pageController) return - - const allTabIds = this.getAllManagedTabIds() - console.debug( - `${DEBUG_PREFIX} Disposing PageControllers on ${allTabIds.length} tabs:`, - allTabIds - ) - - // Dispose each tab in parallel - await Promise.all( - allTabIds.map((tabId) => - this.pageController!.disposeTab(tabId).catch((e) => { - console.debug(`${DEBUG_PREFIX} disposeTab(${tabId}) failed:`, e) - }) - ) - ) - - console.debug(`${DEBUG_PREFIX} All PageControllers disposed`) - } - - /** - * Dispose manager and clean up - * Note: Tab group is intentionally kept - only internal state is cleared + * Dispose manager and clean up. + * Tab group is intentionally kept for user. + * PageControllers in content scripts are not explicitly disposed - they are + * lazy-loaded and will clean up via storage polling (running=false). */ dispose(): void { if (this.disposed) return diff --git a/packages/extension/src/agent/protocol.ts b/packages/extension/src/agent/protocol.ts index b15f118..a9fee85 100644 --- a/packages/extension/src/agent/protocol.ts +++ b/packages/extension/src/agent/protocol.ts @@ -1,15 +1,15 @@ /** * Message Protocol for PageAgentExt * - * MV3 Compliant Architecture: - * - SidePanel hosts the agent, all state lives there - * - Background (SW) is a stateless message relay - * - Content Script runs PageController + * Simple unidirectional architecture: + * - AGENT_TO_PAGE: SidePanel → SW → ContentScript (RPC calls) + * - TAB_CHANGE: SW broadcasts tab events to all extension pages * - * Message flows: - * 1. RPC: SidePanel → SW → ContentScript → sendResponse (PageController calls) - * 2. Query: ContentScript → SW → SidePanel → SW → ContentScript (mask state check) - * 3. 
Events: SW → SidePanel (tab events from chrome.tabs API) + * Key principles: + * - SW is stateless, only relays messages + * - No long-lived connections + * - All responses via sendResponse callback + * - Content script never sends messages, only responds */ // ============================================================================ @@ -46,117 +46,53 @@ export interface ScrollHorizontallyOptions { index?: number } +/** Agent state stored in chrome.storage for mask coordination */ +export interface AgentState { + tabId: number | null + running: boolean +} + // ============================================================================ -// Message Types +// Message Types (only 2) // ============================================================================ /** Message type identifier */ -type MessageType = - | 'rpc:call' // SidePanel → SW: RPC call to content script (response via sendResponse) - | 'cs:rpc' // SW → ContentScript: Forwarded RPC call - | 'cs:query' // ContentScript → SW: Query to sidepanel - | 'query:response' // SW → ContentScript: Query response - | 'tab:event' // SW → SidePanel: Tab event notification +export type MessageType = 'AGENT_TO_PAGE' | 'TAB_CHANGE' -/** Base message structure */ -interface BaseMessage { - type: MessageType - id: string // Unique message ID for request-response matching -} - -// ============================================================================ -// RPC Messages (SidePanel ↔ SW ↔ ContentScript) -// ============================================================================ - -/** SidePanel → SW: Request to call PageController method */ -export interface RPCCallMessage extends BaseMessage { - type: 'rpc:call' +/** SidePanel → SW → ContentScript: RPC call to PageController */ +export interface AgentToPageMessage { + type: 'AGENT_TO_PAGE' tabId: number method: string args: unknown[] } -/** SW → ContentScript: Forwarded RPC call */ -export interface CSRPCMessage extends BaseMessage { - type: 'cs:rpc' - method: 
string - args: unknown[] -} - -// ============================================================================ -// Query Messages (ContentScript → SW → SidePanel) -// ============================================================================ - -/** Query types that content script can ask */ -export type QueryType = 'shouldShowMask' - -/** ContentScript → SW: Query to sidepanel */ -export interface CSQueryMessage extends BaseMessage { - type: 'cs:query' - queryType: QueryType - tabId: number -} - -/** SW → ContentScript: Query response */ -export interface QueryResponseMessage extends BaseMessage { - type: 'query:response' - result: unknown -} - -// ============================================================================ -// Tab Event Messages (SW → SidePanel) -// ============================================================================ - /** Tab event types */ export type TabEventType = 'removed' | 'updated' | 'activated' | 'windowFocusChanged' -/** SW → SidePanel: Tab event notification */ -export interface TabEventMessage extends BaseMessage { - type: 'tab:event' +/** SW → All: Tab event broadcast */ +export interface TabChangeMessage { + type: 'TAB_CHANGE' eventType: TabEventType tabId: number data?: { - // For 'updated' events status?: string url?: string - // For 'activated' events windowId?: number - // For 'windowFocusChanged' events focused?: boolean } } -// ============================================================================ -// Union Types -// ============================================================================ - /** All message types */ -export type ExtensionMessage = - | RPCCallMessage - | CSRPCMessage - | CSQueryMessage - | QueryResponseMessage - | TabEventMessage +export type ExtensionMessage = AgentToPageMessage | TabChangeMessage // ============================================================================ -// Utility Functions +// Type Guard // ============================================================================ 
-/** Generate unique message ID */ -export function generateMessageId(): string { - return `${Date.now()}-${Math.random().toString(36).slice(2, 8)}` -} +const MESSAGE_TYPES = new Set(['AGENT_TO_PAGE', 'TAB_CHANGE']) -/** Known message types for type guard */ -const MESSAGE_TYPES = new Set([ - 'rpc:call', - 'cs:rpc', - 'cs:query', - 'query:response', - 'tab:event', -]) - -/** Type guard - checks if message has a known type */ +/** Type guard - checks if message is a known extension message */ export function isExtensionMessage(msg: unknown): msg is ExtensionMessage { return typeof msg === 'object' && msg !== null && MESSAGE_TYPES.has((msg as any).type) } diff --git a/packages/extension/src/agent/rpc.ts b/packages/extension/src/agent/rpc.ts index ddca2e8..2153e09 100644 --- a/packages/extension/src/agent/rpc.ts +++ b/packages/extension/src/agent/rpc.ts @@ -1,41 +1,25 @@ /** * RPC Client for PageController remote calls * - * This module provides RPC functionality from SidePanel to ContentScript - * via the Background (SW) relay. - * - * Flow: SidePanel → SW (relay) → ContentScript → sendResponse → SidePanel - * - * MV3 Compliant: Uses chrome.runtime.sendMessage with direct sendResponse, - * no pending calls map or custom response listeners needed. 
+ * Flow: SidePanel → SW (relay) → ContentScript → sendResponse */ -import { - type ActionResult, - type BrowserState, - type RPCCallMessage, - type ScrollHorizontallyOptions, - type ScrollOptions, - generateMessageId, +import type { + ActionResult, + AgentToPageMessage, + BrowserState, + ScrollHorizontallyOptions, + ScrollOptions, } from './protocol' -/** RPC configuration */ const RPC_CONFIG = { - /** Maximum retry attempts for transient failures */ maxRetries: 3, - /** Base delay between retries in ms (exponential backoff) */ retryDelayMs: 500, } -/** - * Sleep for a given number of milliseconds - */ function sleep(ms: number): Promise { return new Promise((resolve) => setTimeout(resolve, ms)) } -/** - * Check if a tab exists - */ async function tabExists(tabId: number): Promise { try { await chrome.tabs.get(tabId) @@ -45,9 +29,6 @@ async function tabExists(tabId: number): Promise { } } -/** - * Error thrown when RPC call fails - */ export class RPCError extends Error { constructor( message: string, @@ -58,21 +39,15 @@ export class RPCError extends Error { } } -/** Response type from background script */ interface RPCResponse { success: boolean result?: unknown error?: string } -/** - * Make a single RPC call (no retry) - * Uses chrome.runtime.sendMessage which returns the response directly via sendResponse - */ async function callOnce(tabId: number, method: string, args: unknown[]): Promise { - const message: RPCCallMessage = { - type: 'rpc:call', - id: generateMessageId(), + const message: AgentToPageMessage = { + type: 'AGENT_TO_PAGE', tabId, method, args, @@ -87,9 +62,6 @@ async function callOnce(tabId: number, method: string, args: unknown[]): Promise } } -/** - * Make an RPC call with retry logic - */ async function call(tabId: number, method: string, args: unknown[]): Promise { let lastError: Error | null = null @@ -100,38 +72,33 @@ async function call(tabId: number, method: string, args: unknown[]): Promise scrollHorizontally(options: 
ScrollHorizontallyOptions): Promise executeJavascript(script: string): Promise - showMask(): Promise - hideMask(): Promise - dispose(): Promise } -/** - * Create an RPC client bound to a specific tab - */ export function createRPCClient(tabId: number): RPCClient { - console.debug(`[RPC] Creating client for tab ${tabId}`) - return { tabId, @@ -203,27 +162,5 @@ export function createRPCClient(tabId: number): RPCClient { async executeJavascript(script: string): Promise { return call(tabId, 'executeJavascript', [script]) as Promise }, - - async showMask(): Promise { - await call(tabId, 'showMask', []) - }, - - async hideMask(): Promise { - // Best effort - don't throw if content script is gone - try { - await callOnce(tabId, 'hideMask', []) - } catch (e) { - console.debug('[RPC] hideMask failed (ignored):', e) - } - }, - - async dispose(): Promise { - // Best effort - don't throw if content script is gone - try { - await callOnce(tabId, 'dispose', []) - } catch (e) { - console.debug('[RPC] dispose failed (ignored):', e) - } - }, } } diff --git a/packages/extension/src/agent/useAgent.ts b/packages/extension/src/agent/useAgent.ts index fd52592..d4b06e6 100644 --- a/packages/extension/src/agent/useAgent.ts +++ b/packages/extension/src/agent/useAgent.ts @@ -1,25 +1,17 @@ /** * React hook for using AgentController - * - * This hook provides a React-friendly interface to the AgentController, - * handling event subscriptions and state updates. 
*/ import type { AgentActivity, AgentStatus, HistoricalEvent } from '@page-agent/core' import { useCallback, useEffect, useRef, useState } from 'react' import { type AgentController, type LLMConfig, getAgentController } from './AgentController' -import type { CSQueryMessage } from './protocol' -import { isExtensionMessage } from './protocol' export interface UseAgentResult { - // State status: AgentStatus history: HistoricalEvent[] activity: AgentActivity | null currentTask: string config: LLMConfig - - // Actions execute: (task: string) => Promise stop: () => void configure: (config: LLMConfig) => Promise @@ -37,17 +29,14 @@ export function useAgent(): UseAgentResult { model: '', }) - // Initialize controller and subscribe to events useEffect(() => { const controller = getAgentController() controllerRef.current = controller - // Initialize controller.init().then(() => { setConfig(controller.getConfig()) }) - // Event handlers const handleStatusChange = (e: Event) => { const newStatus = (e as CustomEvent).detail as AgentStatus setStatus(newStatus) @@ -70,50 +59,10 @@ export function useAgent(): UseAgentResult { controller.addEventListener('historychange', handleHistoryChange) controller.addEventListener('activity', handleActivity) - // Handle shouldShowMask queries from content scripts - const handleMessage = ( - message: unknown, - _sender: chrome.runtime.MessageSender, - sendResponse: (response?: unknown) => void - ): boolean => { - if (!isExtensionMessage(message)) return false - if (message.type !== 'cs:query') return false - - const query = message as CSQueryMessage - if (query.queryType === 'shouldShowMask') { - const ctrl = controllerRef.current - if (!ctrl) { - sendResponse(false) - return true - } - - // Use AgentController's shouldShowMaskForTab which checks: - // 1. Agent is running - // 2. Window has focus - // 3. Browser's active tab === query.tabId - // 4. 
Agent's current tab === query.tabId - const shouldShow = ctrl.shouldShowMaskForTab(query.tabId) - - console.debug('[useAgent] shouldShowMask query:', { - tabId: query.tabId, - shouldShow, - }) - - sendResponse(shouldShow) - return true - } - - return false - } - - chrome.runtime.onMessage.addListener(handleMessage) - - // Cleanup return () => { controller.removeEventListener('statuschange', handleStatusChange) controller.removeEventListener('historychange', handleHistoryChange) controller.removeEventListener('activity', handleActivity) - chrome.runtime.onMessage.removeListener(handleMessage) controller.dispose() } }, []) diff --git a/packages/extension/src/entrypoints/background.ts b/packages/extension/src/entrypoints/background.ts index 8db42bb..6b41dac 100644 --- a/packages/extension/src/entrypoints/background.ts +++ b/packages/extension/src/entrypoints/background.ts @@ -1,82 +1,51 @@ /** * Background Script (Service Worker) - Stateless Message Relay * - * MV3 COMPLIANT: This script is completely stateless. - * It only relays messages between contexts: - * - SidePanel ↔ ContentScript (RPC for PageController) - * - ContentScript → SidePanel (queries like shouldShowMask) - * - Tab events → SidePanel (chrome.tabs API events) - * - * NO agent logic, NO state, NO long-running operations. + * Completely stateless. Only two responsibilities: + * 1. Relay AGENT_TO_PAGE messages from SidePanel to ContentScript + * 2. 
Broadcast TAB_CHANGE events to all extension pages */ import { - type CSQueryMessage, - type CSRPCMessage, - type ExtensionMessage, - type QueryResponseMessage, - type RPCCallMessage, - type TabEventMessage, - generateMessageId, + type AgentToPageMessage, + type TabChangeMessage, isExtensionMessage, } from '../agent/protocol' // ============================================================================ -// Message Relay Handlers +// Message Relay // ============================================================================ -/** - * Handle messages from SidePanel and ContentScript - */ chrome.runtime.onMessage.addListener( ( message: unknown, - sender: chrome.runtime.MessageSender, + _sender: chrome.runtime.MessageSender, sendResponse: (response?: unknown) => void ): boolean => { if (!isExtensionMessage(message)) { return false } - const msg = message as ExtensionMessage - - switch (msg.type) { - case 'rpc:call': - // SidePanel → SW: Forward RPC to content script, return result via sendResponse - handleRPCCall(msg as RPCCallMessage, sendResponse) - return true // Async response - - case 'cs:query': - // ContentScript → SW: Forward query to sidepanel - handleCSQuery(msg as CSQueryMessage, sender) - return false - - default: - return false + if (message.type === 'AGENT_TO_PAGE') { + handleAgentToPage(message as AgentToPageMessage, sendResponse) + return true // Async response } + + return false } ) /** * Forward RPC call from SidePanel to ContentScript - * Uses sendResponse to return result directly (MV3 compliant) */ -async function handleRPCCall( - msg: RPCCallMessage, +async function handleAgentToPage( + msg: AgentToPageMessage, sendResponse: (response: { success: boolean; result?: unknown; error?: string }) => void ): Promise { const { tabId, method, args } = msg - // Create message for content script - const csMessage: CSRPCMessage = { - type: 'cs:rpc', - id: msg.id, - method, - args, - } - try { - // Send to content script and wait for response - const 
result = await chrome.tabs.sendMessage(tabId, csMessage) + // Forward directly to content script, same message format + const result = await chrome.tabs.sendMessage(tabId, msg) sendResponse({ success: true, result }) } catch (error) { sendResponse({ @@ -86,122 +55,59 @@ async function handleRPCCall( } } -/** - * Forward query from ContentScript to SidePanel - */ -async function handleCSQuery( - msg: CSQueryMessage, - sender: chrome.runtime.MessageSender -): Promise { - const { id, queryType, tabId } = msg +// ============================================================================ +// Tab Event Broadcasting +// ============================================================================ - // For shouldShowMask, we need to ask the sidepanel - // Since sidepanel may not be open, we'll use a timeout approach - // The sidepanel registers a listener for these queries - - try { - // Broadcast to sidepanel (it will respond via query:response) - const response = await chrome.runtime.sendMessage(msg) - - // Forward response back to content script - if (sender.tab?.id) { - const queryResponse: QueryResponseMessage = { - type: 'query:response', - id, - result: response, - } - await chrome.tabs.sendMessage(sender.tab.id, queryResponse) - } - } catch (error) { - // Sidepanel not open or no response, return default - if (sender.tab?.id) { - const queryResponse: QueryResponseMessage = { - type: 'query:response', - id, - result: queryType === 'shouldShowMask' ? 
false : null, - } - await chrome.tabs.sendMessage(sender.tab.id, queryResponse).catch(() => {}) - } - } +function broadcastTabChange(message: TabChangeMessage): void { + chrome.runtime.sendMessage(message).catch(() => { + // No listeners (sidepanel not open) + }) } -// ============================================================================ -// Tab Event Forwarding -// ============================================================================ - -/** - * Forward tab removed events to sidepanel - */ chrome.tabs.onRemoved.addListener((tabId) => { - const message: TabEventMessage = { - type: 'tab:event', - id: generateMessageId(), + broadcastTabChange({ + type: 'TAB_CHANGE', eventType: 'removed', tabId, - } - chrome.runtime.sendMessage(message).catch(() => { - // Sidepanel may not be open }) }) -/** - * Forward tab updated events to sidepanel - */ chrome.tabs.onUpdated.addListener((tabId, changeInfo) => { - // Only forward loading/complete status changes if (!changeInfo.status) return - const message: TabEventMessage = { - type: 'tab:event', - id: generateMessageId(), + broadcastTabChange({ + type: 'TAB_CHANGE', eventType: 'updated', tabId, data: { status: changeInfo.status, url: changeInfo.url, }, - } - chrome.runtime.sendMessage(message).catch(() => { - // Sidepanel may not be open }) }) -/** - * Forward tab activated events to sidepanel (user switches tabs) - */ chrome.tabs.onActivated.addListener((activeInfo) => { - const message: TabEventMessage = { - type: 'tab:event', - id: generateMessageId(), + broadcastTabChange({ + type: 'TAB_CHANGE', eventType: 'activated', tabId: activeInfo.tabId, data: { windowId: activeInfo.windowId, }, - } - chrome.runtime.sendMessage(message).catch(() => { - // Sidepanel may not be open }) }) -/** - * Forward window focus changed events to sidepanel - */ chrome.windows.onFocusChanged.addListener((windowId) => { - // windowId is chrome.windows.WINDOW_ID_NONE (-1) when all windows lose focus const focused = windowId !== 
chrome.windows.WINDOW_ID_NONE - const message: TabEventMessage = { - type: 'tab:event', - id: generateMessageId(), + broadcastTabChange({ + type: 'TAB_CHANGE', eventType: 'windowFocusChanged', - tabId: -1, // Not applicable for window focus events + tabId: -1, data: { windowId: focused ? windowId : undefined, focused, }, - } - chrome.runtime.sendMessage(message).catch(() => { - // Sidepanel may not be open }) }) @@ -210,10 +116,7 @@ chrome.windows.onFocusChanged.addListener((windowId) => { // ============================================================================ export default defineBackground(() => { - console.log('[Background] Service Worker started (stateless relay mode)') + console.log('[Background] Service Worker started') - // Open sidepanel on action click - chrome.sidePanel.setPanelBehavior({ openPanelOnActionClick: true }).catch(() => { - // Side panel may not be supported - }) + chrome.sidePanel.setPanelBehavior({ openPanelOnActionClick: true }).catch(() => {}) }) diff --git a/packages/extension/src/entrypoints/content.ts b/packages/extension/src/entrypoints/content.ts index 37bc656..2fa26e6 100644 --- a/packages/extension/src/entrypoints/content.ts +++ b/packages/extension/src/entrypoints/content.ts @@ -1,68 +1,81 @@ /** * Content Script Entry Point * - * This script runs in the context of web pages and hosts the real PageController. - * It listens for RPC messages relayed through the Background Script and - * dispatches them to PageController. - * - * Message flow: - * - RPC: SidePanel → SW → ContentScript (this file) → response → SW → SidePanel - * - Query: ContentScript → SW → SidePanel → SW → ContentScript (for shouldShowMask) + * Runs in web page context, hosts PageController. 
+ * - Receives AGENT_TO_PAGE messages and responds via sendResponse + * - Polls chrome.storage to manage mask visibility (no outgoing messages) */ import { PageController } from '@page-agent/page-controller' -import type { CSQueryMessage, CSRPCMessage, QueryResponseMessage } from '../agent/protocol' -import { generateMessageId, isExtensionMessage } from '../agent/protocol' +import type { AgentState, AgentToPageMessage } from '../agent/protocol' +import { isExtensionMessage } from '../agent/protocol' -const DEBUG_PREFIX = '[ContentScript]' +const DEBUG_PREFIX = '[Content]' export default defineContentScript({ matches: [''], runAt: 'document_idle', async main() { - const pageUrl = window.location.href - console.debug(`${DEBUG_PREFIX} Content script loaded on ${pageUrl}`) + console.debug(`${DEBUG_PREFIX} Loaded on ${window.location.href}`) - // Lazy-initialized controller - created on demand, disposed between tasks + // Lazy-initialized controller let controller: PageController | null = null let initError: Error | null = null + let myTabId: number | null = null function getController(): PageController { - if (initError) { - console.debug(`${DEBUG_PREFIX} getController: re-throwing init error`) - throw initError - } + if (initError) throw initError if (!controller) { try { controller = new PageController({ enableMask: true }) console.debug(`${DEBUG_PREFIX} PageController created`) } catch (error) { initError = error instanceof Error ? 
error : new Error(String(error)) - console.error(`${DEBUG_PREFIX} Failed to create PageController:`, initError) throw initError } } return controller } - function disposeController(): void { - console.debug(`${DEBUG_PREFIX} Disposing controller...`) - controller?.dispose() - controller = null - initError = null - console.debug(`${DEBUG_PREFIX} PageController disposed`) - } + // Register message handler + chrome.runtime.onMessage.addListener( + ( + message: unknown, + _sender: chrome.runtime.MessageSender, + sendResponse: (response?: unknown) => void + ): boolean => { + if (!isExtensionMessage(message)) return false + if (message.type !== 'AGENT_TO_PAGE') return false - // Register RPC message handler - registerRPCHandler(getController, () => controller, disposeController) + const msg = message as AgentToPageMessage - // Check if there's an active task that needs mask to be shown - setTimeout(() => queryShouldShowMask(getController), 100) + // Cache our tab ID from the first message + if (myTabId === null) { + myTabId = msg.tabId + console.debug(`${DEBUG_PREFIX} Tab ID: ${myTabId}`) + } - // Cleanup on page unload + handleRPC(msg.method, msg.args, getController, () => controller) + .then(sendResponse) + .catch((error) => { + console.error(`${DEBUG_PREFIX} RPC ${msg.method} failed:`, error) + sendResponse({ error: error instanceof Error ? error.message : String(error) }) + }) + + return true // Async response + } + ) + + // Start mask polling + startMaskPolling( + () => myTabId, + getController, + () => controller + ) + + // Cleanup on unload window.addEventListener('beforeunload', () => { - console.debug(`${DEBUG_PREFIX} Page unloading, disposing controller`) controller?.dispose() controller = null }) @@ -70,137 +83,59 @@ export default defineContentScript({ }) /** - * Query the sidepanel (via SW) whether mask should be shown + * Poll storage every second to manage mask visibility. 
+ * Content script is autonomous - decides mask state based on: + * - agentState in storage (tabId, running) + * - document.visibilityState */ -async function queryShouldShowMask(getController: () => PageController): Promise { - const tabId = await getCurrentTabId() - if (!tabId) { - console.debug(`${DEBUG_PREFIX} Cannot query shouldShowMask: no tab ID`) - return - } +function startMaskPolling( + getTabId: () => number | null, + getController: () => PageController, + getControllerIfExists: () => PageController | null +): void { + let maskVisible = false - const queryId = generateMessageId() - const queryMessage: CSQueryMessage = { - type: 'cs:query', - id: queryId, - queryType: 'shouldShowMask', - tabId, - } + const poll = async () => { + const tabId = getTabId() + if (tabId === null) return // Don't know our tab ID yet - console.debug(`${DEBUG_PREFIX} shouldShowMask query:`, { - tabId, - url: window.location.href, - queryId, - }) - - try { - // Set up response listener - const responsePromise = new Promise((resolve) => { - const timeout = setTimeout(() => { - console.debug(`${DEBUG_PREFIX} shouldShowMask query timeout (3s)`) - chrome.runtime.onMessage.removeListener(listener) - resolve(false) - }, 3000) - - const listener = (message: unknown) => { - if (!isExtensionMessage(message)) return - if (message.type !== 'query:response') return - if ((message as QueryResponseMessage).id !== queryId) return - - clearTimeout(timeout) - chrome.runtime.onMessage.removeListener(listener) - resolve((message as QueryResponseMessage).result as boolean) + try { + const { agentState } = (await chrome.storage.local.get('agentState')) as { + agentState?: AgentState } - chrome.runtime.onMessage.addListener(listener) - }) + const shouldShow = + agentState?.running === true && + agentState?.tabId === tabId && + document.visibilityState === 'visible' - // Send query - await chrome.runtime.sendMessage(queryMessage) - - // Wait for response - const shouldShowMask = await responsePromise - 
- console.debug(`${DEBUG_PREFIX} shouldShowMask response:`, { - tabId, - shouldShowMask, - action: shouldShowMask ? 'showMask' : 'noAction', - }) - - if (shouldShowMask) { - await getController().showMask() - console.debug(`${DEBUG_PREFIX} Mask shown after page load`) + if (shouldShow && !maskVisible) { + await getController().showMask() + maskVisible = true + } else if (!shouldShow && maskVisible) { + await getControllerIfExists()?.hideMask() + maskVisible = false + } + } catch { + // Storage access failed, ignore } - } catch (error) { - console.debug(`${DEBUG_PREFIX} shouldShowMask query failed:`, error) } + + setInterval(poll, 1000) + // Also poll on visibility change for faster response + document.addEventListener('visibilitychange', poll) } /** - * Get current tab ID + * Handle RPC method call */ -async function getCurrentTabId(): Promise { - try { - const response = await chrome.runtime.sendMessage({ type: 'getTabId' }) - return response?.tabId ?? null - } catch { - // Fallback: we're in content script, tab ID comes from sender in SW - return null - } -} - -/** - * Register RPC message handler - */ -function registerRPCHandler( - getController: () => PageController, - getControllerIfExists: () => PageController | null, - disposeController: () => void -): void { - chrome.runtime.onMessage.addListener( - ( - message: unknown, - _sender: chrome.runtime.MessageSender, - sendResponse: (response?: unknown) => void - ): boolean => { - if (!isExtensionMessage(message)) return false - if (message.type !== 'cs:rpc') return false - - const rpcMessage = message as CSRPCMessage - const { method, args } = rpcMessage - - console.debug(`${DEBUG_PREFIX} RPC: ${method}`, args) - - // Handle the RPC call - handleRPCCall(method, args, getController, getControllerIfExists, disposeController) - .then((result) => { - sendResponse(result) - }) - .catch((error) => { - console.error(`${DEBUG_PREFIX} RPC ${method} failed:`, error) - sendResponse({ error: error instanceof Error ? 
error.message : String(error) }) - }) - - // Return true to indicate async response - return true - } - ) - - console.debug(`${DEBUG_PREFIX} RPC handler registered`) -} - -/** - * Handle an RPC call - */ -async function handleRPCCall( +async function handleRPC( method: string, args: unknown[], getController: () => PageController, - getControllerIfExists: () => PageController | null, - disposeController: () => void + getControllerIfExists: () => PageController | null ): Promise { switch (method) { - // State queries case 'getCurrentUrl': return getController().getCurrentUrl() @@ -210,7 +145,6 @@ async function handleRPCCall( case 'getBrowserState': return getController().getBrowserState() - // DOM operations case 'updateTree': return getController().updateTree() @@ -218,7 +152,6 @@ async function handleRPCCall( await getControllerIfExists()?.cleanUpHighlights() return undefined - // Element actions case 'clickElement': return getController().clickElement(args[0] as number) @@ -239,20 +172,6 @@ async function handleRPCCall( case 'executeJavascript': return getController().executeJavascript(args[0] as string) - // Mask operations - case 'showMask': - await getController().showMask() - return undefined - - case 'hideMask': - await getControllerIfExists()?.hideMask() - return undefined - - // Lifecycle - case 'dispose': - disposeController() - return undefined - default: throw new Error(`Unknown RPC method: ${method}`) } diff --git a/packages/extension/structure.md b/packages/extension/structure.md index 7f02c53..1c9e34b 100644 --- a/packages/extension/structure.md +++ b/packages/extension/structure.md @@ -1,247 +1,147 @@ # PageAgentExt Architecture -This document describes the MV3-compliant architecture of the Chrome extension version of PageAgent. +MV3-compliant Chrome extension architecture. ## Design Principles -The architecture follows Chrome MV3 Service Worker constraints: +1. **Service Worker is stateless** - Only relays messages, no state +2. 
**Agent runs in SidePanel** - All agent logic lives there +3. **Unidirectional communication** - Agent → SW → Content +4. **Storage-based coordination** - Mask state via chrome.storage -1. **Service Worker is stateless** - No long-running loops, no in-memory state -2. **Agent runs in frontend context** - SidePanel hosts all agent logic -3. **SW is a message relay** - Only forwards messages between contexts -4. **Event-driven** - All operations are triggered by user actions or message events +## Environments -## Environment Definitions - -The extension operates across three isolated JavaScript contexts: - -### 1. Side Panel (Frontend - Agent Host) +### 1. Side Panel (Agent Host) **Files:** `src/entrypoints/sidepanel/` -**Responsibilities:** - -- Hosts `PageAgentCore` instance and main execution loop +- Hosts `PageAgentCore` and execution loop - Manages `TabsManager` for multi-tab control -- Uses `RemotePageController` to proxy DOM operations via SW -- Stores agent state (task, history, status) -- Provides React UI for user interaction -- Handles `shouldShowMask` queries from content scripts +- Uses `RemotePageController` for RPC to content script +- Writes agent state to storage for mask coordination **Key Components:** -- `AgentController` - Encapsulates agent lifecycle, isolated from UI -- `useAgent` hook - React integration for AgentController -- `App.tsx` - Main UI component -- `ConfigPanel` - LLM settings +- `AgentController` - Agent lifecycle, writes `agentState` to storage +- `useAgent` hook - React integration +- `App.tsx` - Main UI -**Lifecycle:** When sidepanel closes, agent disposes naturally. No state persists in SW. - -### 2. Background (Service Worker - Stateless Relay) +### 2. 
Background (Service Worker) **File:** `src/entrypoints/background.ts` -**Responsibilities:** +**Only two responsibilities:** -- Relays RPC messages from SidePanel to ContentScript -- Forwards tab events (onRemoved, onUpdated, onActivated, onFocusChanged) to SidePanel -- Opens sidepanel on action click -- **NO** agent logic, **NO** state +1. Relay `AGENT_TO_PAGE` messages to content script +2. Broadcast `TAB_CHANGE` events -**Message Flows:** - -``` -SidePanel → SW → ContentScript (RPC calls) -ContentScript → SW → SidePanel (mask state queries) -SW → SidePanel (tab events) -``` +**No state, no agent logic.** ### 3. Content Script **File:** `src/entrypoints/content.ts` -**Responsibilities:** - -- Runs in web page context -- Hosts real `PageController` instance (lazy-initialized) +- Hosts `PageController` (lazy-initialized) - Handles RPC messages for DOM operations -- Queries SidePanel for mask state on page load -- Manages visual mask overlay - -**Lifecycle:** PageController is created on first RPC call and disposed between tasks. 
+- Polls storage every 1s for mask state +- Uses `document.visibilityState` to manage mask visibility ## Architecture Diagram ``` ┌─────────────────────────────────────────────────────────────────┐ -│ Side Panel (Frontend) │ +│ Side Panel │ │ ┌────────────────────────────────────────────────────────────┐ │ │ │ AgentController │ │ │ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────────┐ │ │ │ │ │ PageAgentCore│ │ TabsManager │ │RemotePageController│ │ │ │ │ └──────────────┘ └──────────────┘ └────────┬─────────┘ │ │ │ └───────────────────────────────────────────────┼────────────┘ │ -│ │ │ -│ ┌──────────────┐ ┌──────────────┐ │ │ -│ │ React UI │ │ Query Handler│◄─────────────┼───────────┐ │ -│ │ (App.tsx) │ │(shouldShowMask) │ │ │ -│ └──────────────┘ └──────────────┘ │ │ │ -└──────────────────────────────────────────────────┼───────────┼───┘ - │ │ - RPC Call │ Query │ - ▼ │ -┌─────────────────────────────────────────────────────────────────┐ -│ Background (Service Worker) │ -│ │ -│ ┌────────────────┐ │ -│ │ Message Relay │ │ -│ │ (stateless) │ │ -│ └───────┬────────┘ │ -│ │ │ -│ Tab Events ─────────────────┼─────────────────► SidePanel │ -│ (removed, updated, │ │ -│ activated, focusChanged) │ │ -└──────────────────────────────┼───────────────────────────────────┘ - │ RPC Forward - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ Content Script │ -│ ┌────────────────────────────────────────────────────────────┐ │ -│ │ PageController │ │ -│ │ ┌─────────────┐ ┌─────────────┐ ┌──────────────────┐ │ │ -│ │ │ DOM Tree │ │ Actions │ │ Mask │ │ │ -│ │ └─────────────┘ └─────────────┘ └──────────────────┘ │ │ -│ └────────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ - │ - ▼ - ┌───────────────┐ - │ Web Page │ - │ DOM │ - └───────────────┘ +│ │ │ │ +│ │ write agentState │ AGENT_TO_PAGE │ +│ ▼ ▼ │ +└─────────────────────────┼────────────────────────┼───────────────┘ + │ │ + 
┌─────────┴─────────┐ │ + │ chrome.storage │ │ + └─────────┬─────────┘ │ + │ │ + │ poll │ + │ ▼ +┌─────────────────────────┼─────────────────────────────────────────┐ +│ │ Background (SW) │ +│ │ ┌────────────────┐ │ +│ │ │ Message Relay │ │ +│ │ │ (stateless) │ │ +│ │ └───────┬────────┘ │ +│ │ │ │ +│ TAB_CHANGE broadcast ──┼─────────────┼─────────────► │ +└─────────────────────────┼─────────────┼────────────────────────────┘ + │ │ forward + │ ▼ +┌─────────────────────────┼─────────────────────────────────────────┐ +│ Content Script │ │ +│ ┌──────────────────────┴───────────────────────────────────────┐ │ +│ │ PageController │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌──────────────────┐ │ │ +│ │ │ DOM Tree │ │ Actions │ │ Mask (storage │ │ │ +│ │ │ │ │ │ │ polling + vis) │ │ │ +│ │ └─────────────┘ └─────────────┘ └──────────────────┘ │ │ +│ └──────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────┘ ``` ## Message Protocol -All messages use a simple type-based protocol defined in `src/messaging/protocol.ts`. 
- -### Message Types +Only two message types: | Type | Direction | Purpose | |------|-----------|---------| -| `rpc:call` | SidePanel → SW | Request to call PageController method | -| `rpc:response` | SW → SidePanel | Response from PageController | -| `cs:rpc` | SW → ContentScript | Forwarded RPC call | -| `cs:query` | ContentScript → SW | Query to SidePanel (e.g., shouldShowMask) | -| `query:response` | SW → ContentScript | Response to query | -| `tab:event` | SW → SidePanel | Tab events (removed/updated/activated/focusChanged) | +| `AGENT_TO_PAGE` | SidePanel → SW → Content | RPC call to PageController | +| `TAB_CHANGE` | SW → All | Tab events broadcast | ### RPC Methods -All PageController methods are available via RPC: - - State: `getCurrentUrl`, `getLastUpdateTime`, `getBrowserState` - DOM: `updateTree`, `cleanUpHighlights` - Actions: `clickElement`, `inputText`, `selectOption`, `scroll`, `scrollHorizontally`, `executeJavascript` -- Mask: `showMask`, `hideMask` - Lifecycle: `dispose` -## Communication Flow +## Mask Management -### Task Execution +Mask visibility is managed autonomously by content script via storage polling. -``` -1. User enters task in SidePanel - └─> AgentController.execute(task) +### Storage State -2. AgentController creates agent instances - ├─> new PageAgentCore() - ├─> new TabsManager() - └─> new RemotePageController() - -3. Agent executes step loop: - ├─> LLM generates next action - ├─> RemotePageController.method() called - │ └─> RPC message → SW → ContentScript - ├─> ContentScript executes on real PageController - │ └─> Response → SW → SidePanel - ├─> Agent updates history - └─> React UI re-renders via events - -4. Task completes or user stops - └─> Agent disposes, status changes +```typescript +interface AgentState { + tabId: number | null // Agent's current tab + running: boolean // Agent is executing +} +// Key: 'agentState' ``` -### Page Reload During Task +### Content Script Logic -``` -1. Page reloads/navigates -2. 
Content script initializes -3. Content script queries: shouldShowMask? - └─> cs:query → SW → SidePanel -4. SidePanel checks: agentRunning + windowFocus + (browserActiveTab === agentCurrentTab) - └─> query:response → SW → ContentScript -5. Content script shows/hides mask accordingly +```typescript +setInterval(async () => { + const { agentState } = await chrome.storage.local.get('agentState') + + const shouldShow = + agentState?.running && + agentState?.tabId === myTabId && + document.visibilityState === 'visible' + + if (shouldShow) showMask() + else hideMask() +}, 1000) ``` -## File Structure +### Agent Updates Storage -``` -packages/extension/src/ -├── agent/ -│ ├── RemotePageController.ts # Proxy for PageController RPC -│ ├── TabsManager.ts # Multi-tab management -│ └── tabTools.ts # Agent tools for tab control -├── entrypoints/ -│ ├── background.ts # Stateless SW relay -│ ├── content.ts # Content script with PageController -│ └── sidepanel/ -│ ├── AgentController.ts # Agent lifecycle management -│ ├── useAgent.ts # React hook for agent -│ ├── App.tsx # Main UI component -│ ├── components/ -│ │ ├── ConfigPanel.tsx -│ │ ├── cards/ -│ │ └── index.tsx -│ ├── index.html -│ └── main.tsx -├── messaging/ -│ ├── protocol.ts # Message type definitions -│ ├── rpc.ts # RPC client for SidePanel -│ └── index.ts -├── components/ui/ # shadcn components -├── lib/utils.ts -└── utils/constants.ts -``` - -## Design Decisions - -### Why Agent in SidePanel? 
- -MV3 Service Workers have strict lifecycle constraints: -- Terminate after ~30s of inactivity -- Cannot maintain long-running loops -- State is lost on termination - -By hosting the agent in SidePanel (a visible frontend page), we get: -- Persistent execution while panel is open -- Natural disposal when panel closes -- No SW wake-up complexity - -### Agent Isolation from UI - -`AgentController` is a separate class from the React UI for: -- **Testability** - Can test agent logic without React -- **Portability** - Future: move agent to popup, options page, or external page -- **Clean separation** - UI concerns don't pollute agent logic - -### Simplified Messaging - -Previous architecture had complex retry/wake-up logic for SW. New architecture: -- SW is stateless, always ready -- No ping/wake-up needed -- Simple request-response pattern -- Retry logic only for content script initialization +- Task start: `{ tabId, running: true }` +- Tab switch: `{ tabId: newTabId, running: true }` +- Task end: `{ tabId: null, running: false }` ## Multi-Tab Control @@ -252,69 +152,34 @@ Previous architecture had complex retry/wake-up logic for SW. New architecture: ### Tab Grouping -Agent-opened tabs are grouped in a Chrome tab group named `Task()`. +Agent-opened tabs are grouped into a Chrome tab group named `Task()`. -### Tab Switching - -Only initial tab and managed tabs can be switched to. This prevents the agent from accessing unrelated tabs. - -## Mask Management - -The visual mask overlay blocks user interaction during automation. 
Mask visibility is centrally controlled by `AgentController` based on three conditions: +## File Structure ``` -shouldMaskBeVisible = agentRunning && windowHasFocus && (browserActiveTab === agentCurrentTab) +packages/extension/src/ +├── agent/ +│ ├── AgentController.ts # Agent lifecycle, storage updates +│ ├── RemotePageController.ts # RPC proxy for PageController +│ ├── TabsManager.ts # Multi-tab management +│ ├── protocol.ts # Message types (AGENT_TO_PAGE, TAB_CHANGE) +│ ├── rpc.ts # RPC client +│ ├── tabTools.ts # Agent tools for tab control +│ └── useAgent.ts # React hook +├── entrypoints/ +│ ├── background.ts # Stateless SW relay +│ ├── content.ts # Content script with storage polling +│ └── sidepanel/ +│ ├── App.tsx +│ ├── components/ +│ ├── index.html +│ └── main.tsx +├── components/ui/ +└── utils/ ``` -### Key Concepts - -- **browserActiveTab** - The tab currently visible to the user (tracked via `chrome.tabs.onActivated`) -- **agentCurrentTab** - The tab agent is operating on (`TabsManager.currentTabId`) -- **windowHasFocus** - Whether browser window has focus (tracked via `chrome.windows.onFocusChanged`) - -### State Transitions - -| Event | Action | -|-------|--------| -| Agent starts | Show mask if current tab is in foreground | -| Agent stops | Hide mask | -| User switches to agent's tab | Show mask | -| User switches away from agent's tab | Hide mask | -| Window loses focus | Hide mask | -| Window regains focus | Show mask if on agent's tab | -| Agent switches to different tab | Sync mask based on new state | -| Page reloads | Content script queries `shouldShowMask` | - -### Implementation - -- `AgentController.syncMaskState()` - Syncs mask visibility based on current state -- `AgentController.shouldShowMaskForTab(tabId)` - Used by content script queries -- Background forwards `activated` and `windowFocusChanged` events to SidePanel -- `RemotePageController` does NOT auto-show mask on tab switch (controlled by AgentController) - -## Configuration - 
-LLM config (apiKey, baseURL, model) is stored in `chrome.storage.local`. This persists across sessions and is managed via the ConfigPanel. - ## Security -1. **API Key Storage** - Keys in `chrome.storage.local` (extension-only access) +1. **API Key Storage** - Keys in `chrome.storage.local` 2. **Content Script Isolation** - Runs in isolated world -3. **Tab Restriction** - Agent can only control tabs it opened or started from -4. **No Arbitrary Tab Access** - Cannot switch to unmanaged tabs - -## Development - -```bash -# Install dependencies -npm install - -# Start development server -npm run dev - -# Build for production -npm run build - -# Package extension -npm run zip -``` +3. **Tab Restriction** - Agent only controls its own tabs