feat: multi tabs control

This commit is contained in:
Simon
2026-01-24 19:29:27 +08:00
parent 2aa9c3b978
commit fa5ab9d567
17 changed files with 2303 additions and 1061 deletions

View File

@@ -4,7 +4,11 @@
* This class implements the same interface as PageController but forwards
* all method calls via RPC to the real PageController running in ContentScript.
* This allows PageAgentCore to work transparently with remote DOM operations.
*
* Tab targeting is managed externally by TabsManager via setTargetTab().
*/
import type { PageController } from '@page-agent/page-controller'
import type {
ActionResult,
BrowserState,
@@ -13,6 +17,32 @@ import type {
} from '../messaging/protocol'
import { type RPCClient, createRPCClient } from '../messaging/rpc'
const DEBUG_PREFIX = '[RemotePageController]'
/**
 * URL patterns on which extension content scripts cannot run.
 * Built once at module load instead of on every call.
 * Matching is case-insensitive because URL schemes and host names are.
 */
const RESTRICTED_URL_PATTERNS: readonly RegExp[] = [
	/^chrome:\/\//i,
	/^chrome-extension:\/\//i,
	/^about:/i,
	/^edge:\/\//i,
	/^brave:\/\//i,
	/^opera:\/\//i,
	/^vivaldi:\/\//i,
	/^file:\/\//i,
	/^view-source:/i,
	/^devtools:\/\//i,
	// The extension gallery is a regular https origin, but it cannot be scripted.
	/^https?:\/\/chrome\.google\.com\/webstore(?:[/?#]|$)/i,
	/^https?:\/\/chromewebstore\.google\.com(?:[:/?#]|$)/i,
]

/**
 * Check if a URL can run content scripts.
 * Chrome extensions cannot inject content scripts into certain pages
 * (browser-internal schemes, local files, the extension gallery).
 *
 * @param url - Tab URL; `undefined` or empty is treated as not allowed
 * @returns `true` when none of the restricted patterns match
 */
export function isContentScriptAllowed(url: string | undefined): boolean {
	if (!url) return false
	return !RESTRICTED_URL_PATTERNS.some((pattern) => pattern.test(url))
}
/**
* RemotePageController is a proxy that implements the PageController interface.
* All methods are async and forward to ContentScript via RPC.
@@ -20,30 +50,133 @@ import { type RPCClient, createRPCClient } from '../messaging/rpc'
* This class extends EventTarget to maintain API compatibility with PageController,
* though events in the remote context are not currently bridged.
*/
export class RemotePageController extends EventTarget {
private rpc: RPCClient
private _tabId: number | null = null
private _tabIdPromise: Promise<number>
export class RemotePageController {
private rpc: RPCClient | null = null
private _currentTabId: number | null = null
private _currentTabUrl: string | undefined = undefined
private _previousTabId: number | null = null
/** Get the target tab ID (null if not yet resolved) */
get tabId(): number | null {
return this._tabId
/** Get the current target tab ID */
get currentTabId(): number | null {
return this._currentTabId
}
/** Get the promise that resolves to the target tab ID */
get tabIdPromise(): Promise<number> {
return this._tabIdPromise
/** Get the current target tab URL */
get currentTabUrl(): string | undefined {
return this._currentTabUrl
}
constructor() {
super()
// Capture the active tab ID at construction time to avoid issues when tab loses focus
this._tabIdPromise = chrome.tabs.query({ active: true, currentWindow: true }).then(([tab]) => {
if (!tab?.id) throw new Error('No active tab found')
this._tabId = tab.id
return tab.id
})
this.rpc = createRPCClient(this._tabIdPromise)
/** Check if current tab supports content scripts */
get isCurrentTabAccessible(): boolean {
return isContentScriptAllowed(this._currentTabUrl)
}
// Tab ID is now set externally via setTargetTab()
/**
 * Set the target tab for all RPC operations.
 * Called by TabsManager when switching tabs.
 *
 * Steps, in order:
 * 1. Look up the target tab. If the lookup rejects, nothing has been changed yet,
 *    so the previously targeted tab keeps its mask and stays the target.
 * 2. Best-effort cleanup (highlights, then mask) on the previously targeted tab.
 * 3. Record the new tab id and URL.
 * 4. Restricted page: drop the RPC client and return.
 *    Otherwise: create an RPC client, probe the content script, then show the mask.
 *
 * @param tabId - ID of the tab that subsequent operations should target
 * @throws If `chrome.tabs.get(tabId)` rejects
 */
async setTargetTab(tabId: number): Promise<void> {
	const previousTabId = this._currentTabId
	const previousRpc = this.rpc

	console.debug(`${DEBUG_PREFIX} setTargetTab: ${previousTabId} -> ${tabId}`)

	// Resolve the target before touching the previous tab, so an invalid
	// target does not leave the old tab stripped of its mask.
	const tab = await chrome.tabs.get(tabId)
	const tabUrl = tab.url

	// Clean up old tab completely (highlights + mask). Each step is best-effort
	// on its own: a failure of one must not prevent the other.
	if (previousTabId !== null && previousTabId !== tabId && previousRpc) {
		console.debug(`${DEBUG_PREFIX} Cleaning up previous tab ${previousTabId}`)
		try {
			await previousRpc.cleanUpHighlights()
		} catch (e) {
			console.debug(
				`${DEBUG_PREFIX} cleanUpHighlights on tab ${previousTabId} failed (ignored):`,
				e
			)
		}
		try {
			await previousRpc.hideMask()
		} catch (e) {
			console.debug(`${DEBUG_PREFIX} hideMask on tab ${previousTabId} failed (ignored):`, e)
		}
	}

	// Update state
	this._previousTabId = previousTabId
	this._currentTabId = tabId
	this._currentTabUrl = tabUrl

	// Restricted pages cannot host a content script: with no RPC client the
	// public methods answer with the restricted-page fallbacks instead.
	if (!isContentScriptAllowed(tabUrl)) {
		console.debug(`${DEBUG_PREFIX} Tab ${tabId} cannot run content scripts: ${tabUrl}`)
		this.rpc = null
		return
	}

	// Create new RPC client for the new tab
	const rpc = createRPCClient(tabId)
	this.rpc = rpc

	// Probe the content script with a cheap call.
	try {
		await rpc.getLastUpdateTime()
		console.debug(`${DEBUG_PREFIX} Content script ready on tab ${tabId}`)
	} catch (error) {
		console.error(`${DEBUG_PREFIX} Content script not ready on tab ${tabId}:`, error)
		// Keep the client - subsequent calls will retry and may succeed
	}

	// Show mask on new tab (optional - failure is logged and ignored)
	try {
		await rpc.showMask()
		console.debug(`${DEBUG_PREFIX} Mask shown on tab ${tabId}`)
	} catch (error) {
		console.error(`${DEBUG_PREFIX} Failed to show mask on tab ${tabId}:`, error)
	}

	console.debug(`${DEBUG_PREFIX} Target tab set to ${tabId}`)
}
/**
 * Guard for operations that need a target tab.
 * @throws Error when no tab id has been recorded yet (setTargetTab() not called)
 */
private ensureInitialized(): void {
	if (this._currentTabId) return
	throw new Error('RemotePageController not initialized. Call setTargetTab() first.')
}
/**
 * Build the browser state reported for pages that cannot run content scripts.
 * Such pages are presented as empty pages rather than as errors.
 */
private createRestrictedPageState(): BrowserState {
	const url = this._currentTabUrl || ''
	const emptyState: BrowserState = {
		url,
		title: '',
		header: '',
		content: '(empty page)',
		footer: '',
	}
	return emptyState
}
/**
 * Build the failed action result returned when an action is attempted on a
 * page without a content script.
 * @param action - Verb phrase inserted into the message (e.g. "click")
 */
private createRestrictedActionResult(action: string): ActionResult {
	const message = `Cannot ${action} on this page. Use open_new_tab to navigate to a web page first.`
	return { success: false, message }
}
// ======= State Queries =======
@@ -52,13 +185,15 @@ export class RemotePageController extends EventTarget {
* Get current page URL
*/
async getCurrentUrl(): Promise<string> {
return this.rpc.getCurrentUrl()
// Can return URL even for restricted pages
return this._currentTabUrl || ''
}
/**
 * Timestamp of the last tree update as reported by the content script.
 * Without an RPC client the current time is returned instead.
 */
async getLastUpdateTime(): Promise<number> {
	const client = this.rpc
	return client ? client.getLastUpdateTime() : Date.now()
}
@@ -66,6 +201,10 @@ export class RemotePageController extends EventTarget {
* Get structured browser state for LLM consumption.
*/
async getBrowserState(): Promise<BrowserState> {
	const client = this.rpc
	// No RPC client: content scripts cannot run, so report the restricted-page state.
	return client ? client.getBrowserState() : this.createRestrictedPageState()
}
@@ -75,6 +214,8 @@ export class RemotePageController extends EventTarget {
* Update DOM tree, returns simplified HTML for LLM.
*/
async updateTree(): Promise<string> {
	this.ensureInitialized()
	const client = this.rpc
	// A page without an RPC client is presented as an empty page.
	return client ? client.updateTree() : '(empty page)'
}
@@ -82,6 +223,7 @@ export class RemotePageController extends EventTarget {
* Clean up all element highlights
*/
async cleanUpHighlights(): Promise<void> {
	// Nothing to clean when no RPC client is attached.
	const client = this.rpc
	if (client) {
		return client.cleanUpHighlights()
	}
}
@@ -91,6 +233,8 @@ export class RemotePageController extends EventTarget {
* Click element by index
*/
async clickElement(index: number): Promise<ActionResult> {
	this.ensureInitialized()
	const client = this.rpc
	// Without an RPC client, report a failed action instead of throwing.
	return client ? client.clickElement(index) : this.createRestrictedActionResult('click')
}
@@ -98,6 +242,8 @@ export class RemotePageController extends EventTarget {
* Input text into element by index
*/
async inputText(index: number, text: string): Promise<ActionResult> {
	this.ensureInitialized()
	const client = this.rpc
	// Without an RPC client, report a failed action instead of throwing.
	return client ? client.inputText(index, text) : this.createRestrictedActionResult('input text')
}
@@ -105,6 +251,8 @@ export class RemotePageController extends EventTarget {
* Select dropdown option by index and option text
*/
async selectOption(index: number, optionText: string): Promise<ActionResult> {
	this.ensureInitialized()
	const client = this.rpc
	// Without an RPC client, report a failed action instead of throwing.
	if (client) {
		return client.selectOption(index, optionText)
	}
	return this.createRestrictedActionResult('select option')
}
@@ -112,6 +260,8 @@ export class RemotePageController extends EventTarget {
* Scroll vertically
*/
async scroll(options: ScrollOptions): Promise<ActionResult> {
	this.ensureInitialized()
	const client = this.rpc
	// Without an RPC client, report a failed action instead of throwing.
	return client ? client.scroll(options) : this.createRestrictedActionResult('scroll')
}
@@ -119,6 +269,8 @@ export class RemotePageController extends EventTarget {
* Scroll horizontally
*/
async scrollHorizontally(options: ScrollHorizontallyOptions): Promise<ActionResult> {
	this.ensureInitialized()
	const client = this.rpc
	// Without an RPC client, report a failed action instead of throwing.
	if (client) {
		return client.scrollHorizontally(options)
	}
	return this.createRestrictedActionResult('scroll')
}
@@ -126,6 +278,8 @@ export class RemotePageController extends EventTarget {
* Execute arbitrary JavaScript on the page
*/
async executeJavascript(script: string): Promise<ActionResult> {
	this.ensureInitialized()
	const client = this.rpc
	// Without an RPC client, report a failed action instead of throwing.
	if (client) {
		return client.executeJavascript(script)
	}
	return this.createRestrictedActionResult('execute script')
}
@@ -135,6 +289,7 @@ export class RemotePageController extends EventTarget {
* Show the visual mask overlay.
*/
async showMask(): Promise<void> {
	// No RPC client means there is no page to draw the mask on.
	const client = this.rpc
	if (client) {
		return client.showMask()
	}
}
@@ -142,15 +297,38 @@ export class RemotePageController extends EventTarget {
* Hide the visual mask overlay.
*/
async hideMask(): Promise<void> {
	// No RPC client means there is no mask to hide.
	const client = this.rpc
	if (client) {
		return client.hideMask()
	}
}
/**
* Dispose and clean up resources
* Dispose and clean up resources on current tab
*/
dispose(): void {
this.rpc.dispose().catch(() => {
// Ignore errors on dispose
console.debug(`${DEBUG_PREFIX} dispose() called, current tab: ${this._currentTabId}`)
if (this.rpc) {
this.rpc.dispose().catch((e) => {
console.debug(`${DEBUG_PREFIX} dispose RPC failed (ignored):`, e)
})
}
this._currentTabId = null
this._previousTabId = null
this.rpc = null
}
/**
 * Tear down the page controller on one specific tab (multi-tab cleanup).
 *
 * Uses a fresh RPC client for `tabId` and runs, in order: highlight cleanup,
 * mask hiding, dispose. The first failure aborts the remaining steps; it is
 * logged at debug level and never rethrown.
 */
async disposeTab(tabId: number): Promise<void> {
	console.debug(`${DEBUG_PREFIX} disposeTab(${tabId})`)

	try {
		const client = createRPCClient(tabId)
		const teardownSteps = [
			() => client.cleanUpHighlights(),
			() => client.hideMask(),
			() => client.dispose(),
		]
		for (const runStep of teardownSteps) {
			await runStep()
		}
		console.debug(`${DEBUG_PREFIX} Tab ${tabId} disposed successfully`)
	} catch (e) {
		console.debug(`${DEBUG_PREFIX} disposeTab(${tabId}) failed (ignored):`, e)
	}
}
}

View File

@@ -0,0 +1,566 @@
/**
* TabsManager - Manages multiple browser tabs for agent automation
*
* Responsibilities:
* - Maintain initialTabId (tab where user started the task)
* - Maintain currentTabId (current operation target)
* - Maintain currentTabHistory (history stack for fallback)
* - Maintain managedTabIds (tabs opened by agent)
* - Manage Chrome Tab Group (named "Task(<taskId>)")
* - Listen to chrome.tabs.onRemoved for tab close handling
*/
import { type RemotePageController, isContentScriptAllowed } from './RemotePageController'
const DEBUG_PREFIX = '[TabsManager]'
/**
 * Tab info for display in browser state.
 * Describes one tab under the manager's control (the initial tab or a tab
 * opened by the agent).
 */
export interface TabInfo {
/** Chrome tab ID */
id: number
/** Tab URL as reported by chrome.tabs ('' when unavailable) */
url: string
/** Tab title as reported by chrome.tabs ('' when unavailable) */
title: string
/** True for the tab where the user started the task */
isInitial: boolean
/** True when this tab is the current operation target */
isCurrent: boolean
/** Whether content scripts can run on this page */
isAccessible: boolean
}
/** Changes since last getAndClearChanges() call */
export interface TabChanges {
/** Tabs opened through openNewTab() */
opened: TabInfo[]
/** Tabs that were closed, with the URL/title last cached for them */
closed: { id: number; url: string; title: string }[]
/**
 * Set when the current tab changed because the previous current tab went away.
 * NOTE(review): only 'user_close' is assigned in the code visible here;
 * confirm where 'explicit' is produced.
 */
currentSwitched?: { from: number; to: number; reason: 'user_close' | 'explicit' }
}
/** Color names accepted for a Chrome tab group */
const TAB_GROUP_COLORS = [
	'grey',
	'blue',
	'red',
	'yellow',
	'green',
	'pink',
	'purple',
	'cyan',
] as const

type TabGroupColor = (typeof TAB_GROUP_COLORS)[number]

/** Pick one of the supported tab group colors at random. */
function randomColor(): TabGroupColor {
	const slot = Math.floor(Math.random() * TAB_GROUP_COLORS.length)
	return TAB_GROUP_COLORS[slot]
}
export class TabsManager {
	/** Tab where user started the task */
	private initialTabId: number | null = null

	/** Current operation target tab */
	private currentTabId: number | null = null

	/** History stack for current tab (for fallback on close) */
	private currentTabHistory: number[] = []

	/** Tabs opened by agent (not including initial tab) */
	private managedTabIds = new Set<number>()

	/** Tab group ID for managed tabs */
	private tabGroupId: number | null = null

	/** Task ID for group naming */
	private taskId = ''

	/** Reference to RemotePageController for tab switching */
	private pageController: RemotePageController | null = null

	/** Pending changes for observation generation */
	private pendingChanges: TabChanges = { opened: [], closed: [] }

	/** Tab info cache for closed tab reporting */
	private tabInfoCache = new Map<number, { url: string; title: string }>()

	/** Whether manager is disposed */
	private disposed = false

	/** Bound handler, kept so the exact same function can be removed again */
	private readonly onTabRemovedHandler: (tabId: number) => void

	constructor() {
		this.onTabRemovedHandler = this.onTabRemoved.bind(this)
	}

	/**
	 * Initialize the manager with the currently active tab as initial tab.
	 *
	 * All per-task state is reset, including the tab group id and the tab info
	 * cache, so a manager that is re-initialized without dispose() does not
	 * reuse the previous task's group or cached entries.
	 *
	 * @param taskId - Task identifier used for naming the tab group
	 * @param pageController - Controller whose target tab this manager drives
	 * @throws Error if there is no active tab in the current window
	 */
	async init(taskId: string, pageController: RemotePageController): Promise<void> {
		this.taskId = taskId
		this.pageController = pageController
		this.disposed = false

		// Get current active tab as initial tab
		const [activeTab] = await chrome.tabs.query({
			active: true,
			currentWindow: true,
		})

		if (!activeTab?.id) {
			throw new Error('No active tab found')
		}

		this.initialTabId = activeTab.id
		this.currentTabId = activeTab.id
		this.currentTabHistory = []
		this.managedTabIds.clear()
		this.tabGroupId = null
		this.tabInfoCache.clear()
		this.pendingChanges = { opened: [], closed: [] }

		// Cache initial tab info
		this.tabInfoCache.set(activeTab.id, {
			url: activeTab.url || '',
			title: activeTab.title || '',
		})

		// Set target tab on page controller
		await pageController.setTargetTab(activeTab.id)

		// Register tab removal listener
		chrome.tabs.onRemoved.addListener(this.onTabRemovedHandler)

		console.debug(`${DEBUG_PREFIX} Initialized with tab:`, activeTab.id)
	}

	/**
	 * Open a new tab, wait for it to finish loading and make it the current tab.
	 *
	 * @param url - URL to open
	 * @returns The new tab id and a human-readable message
	 * @throws Error if the manager is not initialized, the tab cannot be
	 *   created, or loading does not complete in time
	 */
	async openNewTab(url: string): Promise<{ tabId: number; message: string }> {
		if (!this.initialTabId || !this.pageController) {
			throw new Error('TabsManager not initialized')
		}

		// Create new tab next to current tab
		const newTab = await chrome.tabs.create({
			url,
			active: false, // Don't activate - agent controls focus via mask
			openerTabId: this.currentTabId ?? this.initialTabId,
		})

		if (!newTab.id) {
			throw new Error('Failed to create new tab')
		}

		const tabId = newTab.id

		// Add to managed tabs
		this.managedTabIds.add(tabId)

		// Create or update tab group
		await this.ensureTabGroup(tabId)

		// Wait for page to complete loading before switching
		// This ensures content script is ready when we set target tab
		await this.waitForTabComplete(tabId)

		// Get updated tab info after load
		const loadedTab = await chrome.tabs.get(tabId)
		const loadedUrl = loadedTab.url || url
		const loadedTitle = loadedTab.title || url

		// Cache tab info
		this.tabInfoCache.set(tabId, { url: loadedUrl, title: loadedTitle })

		// Record change
		this.pendingChanges.opened.push({
			id: tabId,
			url: loadedUrl,
			title: loadedTitle,
			isInitial: false,
			isCurrent: true,
			isAccessible: isContentScriptAllowed(loadedUrl),
		})

		// Switch to new tab (content script should be ready now)
		await this.switchToTab(tabId)

		return {
			tabId,
			message: `Opened new tab [${tabId}] with URL: ${url}`,
		}
	}

	/**
	 * Wait for a tab to reach status "complete".
	 *
	 * The onUpdated listener is attached before the current status is queried
	 * so a completion that happens in between is not missed.
	 *
	 * @param tabId - Tab to wait for
	 * @param timeoutMs - Maximum time to wait before rejecting
	 */
	private waitForTabComplete(tabId: number, timeoutMs = 30_000): Promise<void> {
		return new Promise((resolve, reject) => {
			let settled = false
			let timeout: ReturnType<typeof setTimeout> | undefined

			// Single exit point: releases timer and listener exactly once.
			const settle = (error?: Error): void => {
				if (settled) return
				settled = true
				clearTimeout(timeout)
				chrome.tabs.onUpdated.removeListener(listener)
				if (error) {
					reject(error)
				} else {
					resolve()
				}
			}

			const listener = (updatedTabId: number, changeInfo: { status?: string }): void => {
				if (updatedTabId === tabId && changeInfo.status === 'complete') {
					settle()
				}
			}

			timeout = setTimeout(() => {
				settle(new Error(`Tab ${tabId} did not complete loading within ${timeoutMs}ms`))
			}, timeoutMs)

			// Add listener FIRST to avoid race condition
			chrome.tabs.onUpdated.addListener(listener)

			// Then check if already complete
			chrome.tabs
				.get(tabId)
				.then((tab) => {
					if (tab.status === 'complete') {
						settle()
					}
				})
				.catch((error: unknown) => {
					settle(error instanceof Error ? error : new Error(String(error)))
				})
		})
	}

	/**
	 * Switch the current tab to the specified tab.
	 *
	 * @param tabId - Tab to switch to; must be the initial tab or a managed tab
	 * @returns Human-readable message describing the switch
	 * @throws Error if the manager is not initialized, the tab does not exist,
	 *   or the tab is not under this manager's control
	 */
	async switchToTab(tabId: number): Promise<string> {
		const controller = this.pageController
		if (!controller) {
			throw new Error('TabsManager not initialized')
		}

		// Verify tab exists
		try {
			await chrome.tabs.get(tabId)
		} catch {
			throw new Error(`Tab ${tabId} does not exist`)
		}

		// Verify tab is in our control list
		if (!this.isTabManaged(tabId)) {
			throw new Error(
				`Tab ${tabId} is not in the managed tab list. Only initial tab and tabs opened by agent can be switched to.`
			)
		}

		const previousTabId = this.currentTabId

		// Push current to history (if different)
		if (previousTabId && previousTabId !== tabId) {
			this.currentTabHistory.push(previousTabId)
		}

		this.currentTabId = tabId

		// Update page controller target
		await controller.setTargetTab(tabId)

		// Update tab info cache
		const tab = await chrome.tabs.get(tabId)
		this.tabInfoCache.set(tabId, {
			url: tab.url || '',
			title: tab.title || '',
		})

		console.debug(`${DEBUG_PREFIX} Switched to tab:`, tabId)

		return `Switched to tab [${tabId}]${previousTabId ? ` (from tab [${previousTabId}])` : ''}`
	}

	/**
	 * Close a managed tab.
	 *
	 * Switching rules:
	 * - If `switchTo` is given (and is not the tab being closed), the manager
	 *   switches to it unless it is already the current tab.
	 * - Otherwise, if the tab being closed is the current tab, the manager
	 *   falls back to the most recent usable tab from history, then to the
	 *   initial tab.
	 *
	 * The returned message mentions a switch only when one actually happened.
	 *
	 * @param tabId - Tab to close; must have been opened by the agent
	 * @param switchTo - Optional tab to make current
	 * @throws Error if not initialized, if `tabId` is the initial tab or is not
	 *   managed, if the switch fails, or if chrome.tabs.remove() rejects
	 */
	async closeTab(tabId: number, switchTo?: number): Promise<string> {
		if (!this.pageController) {
			throw new Error('TabsManager not initialized')
		}

		// Cannot close initial tab
		if (tabId === this.initialTabId) {
			throw new Error('Cannot close the initial tab')
		}

		// Verify tab is managed
		if (!this.managedTabIds.has(tabId)) {
			throw new Error(`Tab ${tabId} is not in the managed tab list`)
		}

		// Get tab info before closing
		const tabInfo = this.tabInfoCache.get(tabId)

		// A request to switch to the tab being closed cannot be honoured;
		// treat it as "no preference".
		const requested = switchTo !== undefined && switchTo !== tabId ? switchTo : null

		let switchedTo: number | null = null
		if (tabId === this.currentTabId) {
			const target = requested ?? this.findFallbackTab(tabId)
			if (target !== null) {
				await this.switchToTab(target)
				switchedTo = target
			}
		} else if (requested !== null && requested !== this.currentTabId) {
			await this.switchToTab(requested)
			switchedTo = requested
		}

		// Un-register the tab BEFORE removing it: chrome.tabs.onRemoved can be
		// dispatched before remove() resolves, and onTabRemoved() would otherwise
		// record this closure a second time.
		this.managedTabIds.delete(tabId)
		try {
			await chrome.tabs.remove(tabId)
		} catch (error) {
			// Removal failed - keep tracking the tab as before.
			this.managedTabIds.add(tabId)
			throw error
		}

		// Clean up
		this.tabInfoCache.delete(tabId)
		this.currentTabHistory = this.currentTabHistory.filter((id) => id !== tabId)

		// Record change
		if (tabInfo) {
			this.pendingChanges.closed.push({
				id: tabId,
				url: tabInfo.url,
				title: tabInfo.title,
			})
		}

		const switchNote = switchedTo !== null ? ` and switched to tab [${switchedTo}]` : ''
		return `Closed tab [${tabId}]${switchNote}`
	}

	/**
	 * Get list of all tabs under control (initial tab first, then managed tabs
	 * in insertion order). Tabs that can no longer be looked up are omitted.
	 * The tab info cache is refreshed for every tab that is found.
	 */
	async getTabList(): Promise<TabInfo[]> {
		const initialId = this.initialTabId
		const lookups = this.getAllManagedTabIds().map((tabId) =>
			this.describeTab(tabId, tabId === initialId)
		)
		const described = await Promise.all(lookups)
		return described.filter((info): info is TabInfo => info !== null)
	}

	/**
	 * Look up one tab and convert it to TabInfo, refreshing the cache.
	 * Returns null when the tab cannot be looked up (e.g. already closed).
	 */
	private async describeTab(tabId: number, isInitial: boolean): Promise<TabInfo | null> {
		try {
			const tab = await chrome.tabs.get(tabId)
			const url = tab.url || ''
			const title = tab.title || ''
			this.tabInfoCache.set(tabId, { url, title })
			return {
				id: tabId,
				url,
				title,
				isInitial,
				isCurrent: tabId === this.currentTabId,
				isAccessible: isContentScriptAllowed(url),
			}
		} catch {
			// Tab was closed - bookkeeping is done by onTabRemoved
			return null
		}
	}

	/**
	 * Get current tab ID
	 */
	getCurrentTabId(): number | null {
		return this.currentTabId
	}

	/**
	 * Get and clear pending changes (for observation generation)
	 */
	getAndClearChanges(): TabChanges {
		const changes = this.pendingChanges
		this.pendingChanges = { opened: [], closed: [] }
		return changes
	}

	/**
	 * Check if a tab is managed by this manager (initial or opened by agent)
	 */
	isTabManaged(tabId: number): boolean {
		return tabId === this.initialTabId || this.managedTabIds.has(tabId)
	}

	/**
	 * Get all managed tab IDs (initial + agent-opened tabs)
	 */
	getAllManagedTabIds(): number[] {
		const ids: number[] = []
		if (this.initialTabId) ids.push(this.initialTabId)
		ids.push(...this.managedTabIds)
		return ids
	}

	/**
	 * Dispose PageController on all managed tabs (in parallel).
	 * Should be called before dispose(), which clears the tab list.
	 * Failures on individual tabs are logged and ignored.
	 */
	async disposeAllPageControllers(): Promise<void> {
		const controller = this.pageController
		if (!controller) return

		const allTabIds = this.getAllManagedTabIds()
		console.debug(
			`${DEBUG_PREFIX} Disposing PageControllers on ${allTabIds.length} tabs:`,
			allTabIds
		)

		await Promise.all(
			allTabIds.map((tabId) =>
				controller.disposeTab(tabId).catch((e: unknown) => {
					console.debug(`${DEBUG_PREFIX} disposeTab(${tabId}) failed:`, e)
				})
			)
		)

		console.debug(`${DEBUG_PREFIX} All PageControllers disposed`)
	}

	/**
	 * Dispose manager and clean up.
	 * Note: Tab group is intentionally kept - only internal state is cleared.
	 * Calling dispose() more than once is a no-op.
	 */
	dispose(): void {
		if (this.disposed) return
		this.disposed = true

		console.debug(`${DEBUG_PREFIX} dispose() called`)

		// Remove listener
		chrome.tabs.onRemoved.removeListener(this.onTabRemovedHandler)

		// Clear internal state only - keep tab group intact for user
		this.initialTabId = null
		this.currentTabId = null
		this.currentTabHistory = []
		this.managedTabIds.clear()
		this.tabGroupId = null
		this.pageController = null
		this.tabInfoCache.clear()
		this.pendingChanges = { opened: [], closed: [] }

		console.debug(`${DEBUG_PREFIX} Disposed`)
	}

	/**
	 * Handle a chrome.tabs.onRemoved event.
	 *
	 * Tabs that are neither the initial tab nor managed are ignored (this
	 * includes tabs closed via closeTab(), which un-registers them first).
	 * For tracked tabs the closure is recorded, bookkeeping is cleaned up, and
	 * if the closed tab was the current one a fallback switch is started.
	 */
	private async onTabRemoved(tabId: number): Promise<void> {
		if (this.disposed) return

		// Check if it's a tab we care about
		const isInitial = tabId === this.initialTabId
		const isManaged = this.managedTabIds.has(tabId)

		if (!isInitial && !isManaged) return

		console.debug(`${DEBUG_PREFIX} Tab removed:`, tabId, { isInitial, isManaged })

		// Get cached info for change reporting
		const tabInfo = this.tabInfoCache.get(tabId)
		if (tabInfo) {
			this.pendingChanges.closed.push({
				id: tabId,
				url: tabInfo.url,
				title: tabInfo.title,
			})
		}

		// Clean up
		this.managedTabIds.delete(tabId)
		this.tabInfoCache.delete(tabId)
		this.currentTabHistory = this.currentTabHistory.filter((id) => id !== tabId)

		// If initial tab was closed, this is fatal
		if (isInitial) {
			this.initialTabId = null
			console.error(`${DEBUG_PREFIX} Initial tab was closed - task should fail`)
			// The agent will detect this via getTabList() and handle appropriately
			return
		}

		// If current tab was closed, fallback to previous
		if (tabId === this.currentTabId && this.pageController) {
			const fallbackTabId = this.findFallbackTab(tabId)
			if (fallbackTabId) {
				this.pendingChanges.currentSwitched = {
					from: tabId,
					to: fallbackTabId,
					reason: 'user_close',
				}
				// Don't await - fire and forget to avoid blocking the event handler
				this.switchToTab(fallbackTabId).catch((e: unknown) => {
					// Already in error recovery: log for diagnosis, do not rethrow
					console.debug(`${DEBUG_PREFIX} Fallback switch to tab ${fallbackTabId} failed:`, e)
				})
			}
		}
	}

	/**
	 * Find fallback tab when current tab is closed.
	 *
	 * Pops entries off the history stack (most recent first) until one is found
	 * that is still under control and is not the closed tab; discarded entries
	 * are removed from the history. Falls back to the initial tab.
	 *
	 * @returns The fallback tab id, or null when none is available
	 */
	private findFallbackTab(closedTabId: number): number | null {
		// Try history stack (most recent first)
		for (
			let candidate = this.currentTabHistory.pop();
			candidate !== undefined;
			candidate = this.currentTabHistory.pop()
		) {
			if (candidate !== closedTabId && this.isTabManaged(candidate)) {
				return candidate
			}
		}

		// Fall back to initial tab
		if (this.initialTabId && this.initialTabId !== closedTabId) {
			return this.initialTabId
		}

		return null
	}

	/**
	 * Ensure tab group exists and add tab to it.
	 *
	 * If adding to the remembered group fails (for example because the group
	 * no longer exists), the stale id is dropped and a fresh group is created.
	 * Any remaining failure is logged and ignored - grouping is cosmetic.
	 */
	private async ensureTabGroup(tabId: number): Promise<void> {
		try {
			if (this.tabGroupId !== null) {
				try {
					// Add to existing group
					await chrome.tabs.group({
						tabIds: [tabId],
						groupId: this.tabGroupId,
					})
					return
				} catch (error) {
					console.debug(
						`${DEBUG_PREFIX} Tab group ${this.tabGroupId} unusable, creating a new one:`,
						error
					)
					this.tabGroupId = null
				}
			}

			// Create new group
			const groupId: number = await chrome.tabs.group({ tabIds: [tabId] })
			this.tabGroupId = groupId

			// Set group properties
			await chrome.tabGroups.update(groupId, {
				title: `Task(${this.taskId.slice(0, 8)})`,
				color: randomColor(),
				collapsed: false,
			})

			console.debug(`${DEBUG_PREFIX} Created tab group:`, groupId)
		} catch (error) {
			console.debug(`${DEBUG_PREFIX} Failed to manage tab group:`, error)
			// Non-fatal - continue without grouping
		}
	}
}

View File

@@ -0,0 +1,70 @@
/**
* Tab control tools for browser extension
*
* These tools allow the agent to manage multiple browser tabs:
* - open_new_tab: Open a new tab and set it as current
* - switch_to_tab: Switch to an existing tab
* - close_tab: Close a tab (optionally switch to another)
*/
import zod from 'zod'
import type { TabsManager } from './TabsManager'
/** Tool definition compatible with PageAgentCore customTools */
interface TabTool {
/** Natural-language description of what the tool does */
description: string
/** Zod schema describing the tool's input object */
inputSchema: zod.ZodType
/** Runs the tool on the given input and resolves to a human-readable result message */
execute: (input: unknown) => Promise<string>
}
/**
 * Create tab control tools bound to a TabsManager instance.
 * These tools are injected into PageAgentCore via customTools config.
 *
 * Each `execute` parses its input through the tool's own schema instead of
 * casting it, so malformed input is rejected with a validation error before
 * any tab operation runs.
 *
 * @param tabsManager - Manager that performs the actual tab operations
 * @returns Tool definitions keyed by tool name
 */
export function createTabTools(tabsManager: TabsManager): Record<string, TabTool> {
	const openNewTabInput = zod.object({
		url: zod.string().describe('The URL to open in the new tab'),
	})

	const switchToTabInput = zod.object({
		tab_id: zod.number().int().describe('The tab ID to switch to'),
	})

	const closeTabInput = zod.object({
		tab_id: zod.number().int().describe('The tab ID to close'),
		switch_to: zod
			.number()
			.int()
			.optional()
			.describe(
				'Optional: Tab ID to switch to after closing. If not specified, will switch to previous tab in history.'
			),
	})

	return {
		open_new_tab: {
			description:
				'Open a new browser tab with the specified URL. The new tab becomes the current tab for all subsequent page operations.',
			inputSchema: openNewTabInput,
			execute: async (input: unknown) => {
				const { url } = openNewTabInput.parse(input)
				const result = await tabsManager.openNewTab(url)
				return result.message
			},
		},

		switch_to_tab: {
			description:
				'Switch to an existing tab by its ID. After switching, all page operations will target the new current tab. You can only switch to tabs in the tab list shown in browser state.',
			inputSchema: switchToTabInput,
			execute: async (input: unknown) => {
				const { tab_id } = switchToTabInput.parse(input)
				return tabsManager.switchToTab(tab_id)
			},
		},

		close_tab: {
			description:
				'Close a tab by its ID. Cannot close the initial tab. Optionally specify which tab to switch to after closing.',
			inputSchema: closeTabInput,
			execute: async (input: unknown) => {
				const { tab_id, switch_to } = closeTabInput.parse(input)
				return tabsManager.closeTab(tab_id, switch_to)
			},
		},
	}
}

View File

@@ -1,259 +1,191 @@
/**
* Background Script Entry Point
* Background Script (Service Worker) - Stateless Message Relay
*
* This script runs as the extension's service worker and hosts:
* - PageAgentCore (headless agent)
* - RemotePageController (proxy to ContentScript)
* - Command handlers for SidePanel
* - Event broadcasting to SidePanel
* MV3 COMPLIANT: This script is completely stateless.
* It only relays messages between contexts:
* - SidePanel ↔ ContentScript (RPC for PageController)
* - ContentScript → SidePanel (queries like shouldShowMask)
* - Tab events → SidePanel (chrome.tabs API events)
*
* NO agent logic, NO state, NO long-running operations.
*/
import { PageAgentCore } from '@page-agent/core'
import { RemotePageController } from '../agent/RemotePageController'
import { eventBroadcaster } from '../messaging/events'
import {
type AgentActivity,
type AgentState,
type AgentStatus,
type HistoricalEvent,
agentCommands,
contentScriptQuery,
type CSQueryMessage,
type CSRPCMessage,
type ExtensionMessage,
type QueryResponseMessage,
type RPCCallMessage,
type RPCResponseMessage,
type TabEventMessage,
generateMessageId,
isExtensionMessage,
} from '../messaging/protocol'
import { DEMO_API_KEY, DEMO_BASE_URL, DEMO_MODEL } from '../utils/constants'
// Agent instance (singleton for now - single page control)
let agent: PageAgentCore | null = null
// Track the target tab ID for event filtering
let targetTabId: number | null = null
// ============================================================================
// Message Relay Handlers
// ============================================================================
// LLM configuration (persisted in storage)
interface LLMConfig {
apiKey: string
baseURL: string
model: string
/**
* Handle messages from SidePanel and ContentScript
*/
chrome.runtime.onMessage.addListener(
(
message: unknown,
sender: chrome.runtime.MessageSender,
sendResponse: (response?: unknown) => void
): boolean => {
if (!isExtensionMessage(message)) {
return false
}
// Default to demo config
let llmConfig: LLMConfig = {
apiKey: DEMO_API_KEY,
baseURL: DEMO_BASE_URL,
model: DEMO_MODEL,
const msg = message as ExtensionMessage
switch (msg.type) {
case 'rpc:call':
// SidePanel → SW: Forward RPC to content script
handleRPCCall(msg as RPCCallMessage)
return false // No sync response needed
case 'cs:query':
// ContentScript → SW: Forward query to sidepanel
handleCSQuery(msg as CSQueryMessage, sender)
return false
default:
return false
}
}
)
/**
 * Forward RPC call from SidePanel to ContentScript and relay the outcome back.
 *
 * Exactly one `rpc:response` is sent per call. Calling the content script and
 * delivering the response are handled separately, so a delivery failure
 * (e.g. sidepanel closed) no longer produces a second, contradictory failure
 * response for a call that succeeded.
 *
 * @param msg - The `rpc:call` message received from the sidepanel
 */
async function handleRPCCall(msg: RPCCallMessage): Promise<void> {
	const { id, tabId, method, args } = msg

	// Create message for content script
	const csMessage: CSRPCMessage = {
		type: 'cs:rpc',
		id,
		method,
		args,
	}

	let response: RPCResponseMessage
	try {
		// Send to content script and wait for response
		const result = await chrome.tabs.sendMessage(tabId, csMessage)
		response = {
			type: 'rpc:response',
			id,
			success: true,
			result,
		}
	} catch (error) {
		response = {
			type: 'rpc:response',
			id,
			success: false,
			error: error instanceof Error ? error.message : String(error),
		}
	}

	try {
		// Forward outcome back to sidepanel
		await chrome.runtime.sendMessage(response)
	} catch {
		// Sidepanel may be closed - nobody left to notify
	}
}
/**
 * Forward query from ContentScript to SidePanel and relay the answer back.
 *
 * The query is broadcast via chrome.runtime.sendMessage. If that rejects
 * (sidepanel not open) or yields no answer, a default is used: `false` for
 * `shouldShowMask`, `null` for any other query type. The answer is sent to
 * the originating tab exactly once; a failure to deliver it is ignored.
 *
 * @param msg - The `cs:query` message received from a content script
 * @param sender - Sender info; the reply goes to `sender.tab.id` when present
 */
async function handleCSQuery(
	msg: CSQueryMessage,
	sender: chrome.runtime.MessageSender
): Promise<void> {
	const { id, queryType } = msg
	const fallback = queryType === 'shouldShowMask' ? false : null

	let result: unknown
	try {
		// Broadcast to sidepanel and wait for its answer
		result = await chrome.runtime.sendMessage(msg)
	} catch {
		// Sidepanel not open
		result = fallback
	}
	if (result === undefined) {
		// No response
		result = fallback
	}

	const senderTabId = sender.tab?.id
	if (senderTabId === undefined) return

	const queryResponse: QueryResponseMessage = {
		type: 'query:response',
		id,
		result,
	}
	await chrome.tabs.sendMessage(senderTabId, queryResponse).catch(() => {
		// The tab may have navigated or closed in the meantime
	})
}
// ============================================================================
// Tab Event Forwarding
// ============================================================================
/**
 * Relay tab-closed notifications to the sidepanel as `tab:event` messages.
 */
chrome.tabs.onRemoved.addListener((tabId) => {
	const removedEvent: TabEventMessage = {
		type: 'tab:event',
		id: generateMessageId(),
		eventType: 'removed',
		tabId,
	}

	// A rejection here means the sidepanel may not be open; ignore it.
	chrome.runtime.sendMessage(removedEvent).catch(() => {})
})
/**
 * Relay tab update notifications to the sidepanel as `tab:event` messages.
 * Updates that do not carry a status change are dropped.
 */
chrome.tabs.onUpdated.addListener((tabId, changeInfo) => {
	const { status, url } = changeInfo
	if (!status) {
		return
	}

	const updatedEvent: TabEventMessage = {
		type: 'tab:event',
		id: generateMessageId(),
		eventType: 'updated',
		tabId,
		data: { status, url },
	}

	// A rejection here means the sidepanel may not be open; ignore it.
	chrome.runtime.sendMessage(updatedEvent).catch(() => {})
})
// ============================================================================
// Extension Setup
// ============================================================================
export default defineBackground(() => {
console.log('[PageAgentExt] Background script started')
// Load saved config from storage
loadConfig()
// Register command handlers
registerCommandHandlers()
// Register tab event listeners for page reload/close detection
registerTabEventListeners()
// Register content script notification handlers
registerContentScriptHandlers()
console.log('[Background] Service Worker started (stateless relay mode)')
// Open sidepanel on action click
chrome.sidePanel
.setPanelBehavior({ openPanelOnActionClick: true })
.catch((error) => console.error('[PageAgentExt] Failed to set panel behavior:', error))
})
/**
 * Restore the LLM configuration stored under the `llmConfig` key of
 * chrome.storage.local. When nothing is stored, the module-level `llmConfig`
 * is left untouched (the demo configuration, per the log message).
 */
async function loadConfig(): Promise<void> {
	const { llmConfig: stored } = await chrome.storage.local.get('llmConfig')

	if (!stored) {
		console.log('[PageAgentExt] Using default demo config')
		return
	}

	llmConfig = stored as LLMConfig
	console.log('[PageAgentExt] Loaded LLM config from storage')
}
/**
 * Adopt `config` as the active LLM configuration and persist it under the
 * `llmConfig` key of chrome.storage.local.
 *
 * @param config - Configuration to use from now on.
 */
async function saveConfig(config: LLMConfig): Promise<void> {
	// The in-memory copy is updated before the storage write is awaited.
	llmConfig = config

	const persisted = { llmConfig: config }
	await chrome.storage.local.set(persisted)

	console.log('[PageAgentExt] Saved LLM config')
}
/**
 * Build a snapshot of the current agent for the UI.
 *
 * @returns The agent's status, task and history, or an idle/empty snapshot
 *          when no agent exists.
 */
function getAgentState(): AgentState {
	if (agent) {
		const { status, task, history } = agent
		return {
			status: status as AgentStatus,
			task,
			history: history as HistoricalEvent[],
		}
	}

	return { status: 'idle', task: '', history: [] }
}
/**
 * Build a PageAgentCore wired to a RemotePageController and forward its
 * events to the side panel through `eventBroadcaster`.
 *
 * @returns The newly created agent (not yet executing anything).
 */
function createAgent(): PageAgentCore {
	const remoteController = new RemotePageController()

	// Remember which tab the controller resolved to; tab listeners filter on it.
	remoteController.tabIdPromise.then((resolvedTabId) => {
		targetTabId = resolvedTabId
		console.log('[PageAgentExt] Tracking tab:', resolvedTabId)
	})

	const core = new PageAgentCore({
		...llmConfig,
		pageController: remoteController as any, // Type assertion for interface compatibility
		language: 'en-US',
	})

	// Forward agent events to SidePanel
	const forwardStatus = (): void => {
		eventBroadcaster.status(core.status as AgentStatus)
	}
	const forwardHistory = (): void => {
		eventBroadcaster.history(core.history as HistoricalEvent[])
	}
	const forwardActivity = (event: Event): void => {
		const activity = (event as CustomEvent).detail as AgentActivity
		eventBroadcaster.activity(activity)
	}
	const handleDispose = (): void => {
		// Only reset the module state if this agent is still the active one.
		if (agent === core) {
			agent = null
			targetTabId = null
		}
		eventBroadcaster.status('idle')
	}

	core.addEventListener('statuschange', forwardStatus)
	core.addEventListener('historychange', forwardHistory)
	core.addEventListener('activity', forwardActivity)
	core.addEventListener('dispose', handleDispose)

	return core
}
/**
* Register command handlers for SidePanel communication
*/
function registerCommandHandlers(): void {
// Execute task
agentCommands.onMessage('agent:execute', async ({ data: task }) => {
console.log('[PageAgentExt] Executing task:', task)
// Create new agent if needed
if (!agent || agent.disposed) {
agent = createAgent()
}
// Execute task (don't await - runs in background)
agent.execute(task).catch((error) => {
console.error('[PageAgentExt] Task execution error:', error)
const message = error instanceof Error ? error.message : String(error)
// Broadcast error as a history event so it persists in UI
const errorEvent: HistoricalEvent = { type: 'error', message }
eventBroadcaster.history([errorEvent])
eventBroadcaster.status('error')
chrome.sidePanel.setPanelBehavior({ openPanelOnActionClick: true }).catch(() => {
// Side panel may not be supported
})
})
// Stop agent
agentCommands.onMessage('agent:stop', async () => {
console.log('[PageAgentExt] Stopping agent')
if (agent) {
agent.dispose('User requested stop')
agent = null
}
})
// Get current state
agentCommands.onMessage('agent:getState', async () => {
return getAgentState()
})
// Configure LLM
agentCommands.onMessage('agent:configure', async ({ data: config }) => {
await saveConfig(config)
// Recreate agent with new config if it exists
if (agent && !agent.disposed) {
agent.dispose('Configuration changed')
agent = null
}
})
console.log('[PageAgentExt] Command handlers registered')
}
/**
 * Watch the agent's target tab and push an observation to the running agent
 * when that tab starts loading or is closed.
 */
function registerTabEventListeners(): void {
	// Tab updates (page reload, navigation)
	chrome.tabs.onUpdated.addListener((tabId, changeInfo, _tab) => {
		const activeAgent = agent

		// Ignore everything unless a live agent is working on this very tab.
		if (!activeAgent || activeAgent.disposed) return
		if (tabId !== targetTabId) return
		if (changeInfo.status !== 'loading') return

		console.log('[PageAgentExt] Target page is reloading/navigating')
		activeAgent.pushObservation(
			'⚠️ Page is reloading. DOM state will change - wait for page to stabilize before next action.'
		)
	})

	// Tab close
	chrome.tabs.onRemoved.addListener((tabId, _removeInfo) => {
		const activeAgent = agent

		// Ignore everything unless a live agent is working on this very tab.
		if (!activeAgent || activeAgent.disposed) return
		if (tabId !== targetTabId) return

		console.log('[PageAgentExt] Target page was closed')
		activeAgent.pushObservation(
			'⚠️ Target page was closed by user. If this page is required for the task, consider marking the task as failed.'
		)

		// The tab no longer exists, so stop tracking it.
		targetTabId = null
	})

	console.log('[PageAgentExt] Tab event listeners registered')
}
/**
 * Answer queries and error reports sent by content scripts.
 */
function registerContentScriptHandlers(): void {
	// shouldShowMask: true only for the tab a live agent is currently targeting.
	contentScriptQuery.onMessage('content:shouldShowMask', async ({ sender }) => {
		const tabId = sender.tab?.id
		const shouldShow = !!tabId && !!agent && !agent.disposed && tabId === targetTabId

		console.log('[PageAgentExt] shouldShowMask query:', { tabId, targetTabId, shouldShow })
		return shouldShow
	})

	// content:error: log it and surface it in the side panel history.
	contentScriptQuery.onMessage('content:error', async ({ data }) => {
		const { message, url } = data
		console.error('[PageAgentExt] Content script error:', message, 'on', url)

		const errorEvent: HistoricalEvent = {
			type: 'error',
			message: `Content script error on ${url}: ${message}`,
		}
		eventBroadcaster.history([errorEvent])
	})

	console.log('[PageAgentExt] Content script handlers registered')
}

View File

@@ -2,78 +2,72 @@
* Content Script Entry Point
*
* This script runs in the context of web pages and hosts the real PageController.
* It listens for RPC messages from Background and dispatches them to PageController.
* It listens for RPC messages relayed through the Background Script and
* dispatches them to PageController.
*
* PageController is created lazily on first RPC call and can be disposed/recreated
* between tasks. This supports multi-page workflows and ensures clean state.
* Message flow:
* - RPC: SidePanel → SW → ContentScript (this file) → response → SW → SidePanel
* - Query: ContentScript → SW → SidePanel → SW → ContentScript (for shouldShowMask)
*/
import { PageController } from '@page-agent/page-controller'
import { contentScriptQuery, pageControllerRPC } from '../messaging/protocol'
import type {
CSQueryMessage,
CSRPCMessage,
QueryResponseMessage,
RPCMethod,
} from '../messaging/protocol'
import { generateMessageId, isExtensionMessage } from '../messaging/protocol'
const DEBUG_PREFIX = '[ContentScript]'
export default defineContentScript({
matches: ['<all_urls>'],
runAt: 'document_idle',
async main() {
console.log('[PageAgentExt] Content script loaded on', window.location.href)
const pageUrl = window.location.href
console.debug(`${DEBUG_PREFIX} Content script loaded on ${pageUrl}`)
// Lazy-initialized controller - created on demand, disposed between tasks
let controller: PageController | null = null
let initError: Error | null = null
function getController(): PageController {
// Re-throw init error if controller creation previously failed
if (initError) {
console.debug(`${DEBUG_PREFIX} getController: re-throwing init error`)
throw initError
}
if (!controller) {
try {
controller = new PageController({ enableMask: true })
console.log('[PageAgentExt] PageController created')
console.debug(`${DEBUG_PREFIX} PageController created`)
} catch (error) {
initError = error instanceof Error ? error : new Error(String(error))
console.error('[PageAgentExt] Failed to create PageController:', initError)
// Report error to background
reportError(initError.message)
console.error(`${DEBUG_PREFIX} Failed to create PageController:`, initError)
throw initError
}
}
return controller
}
// Register RPC handlers with lazy controller access
registerRPCHandlers(
getController,
() => controller,
() => {
function disposeController(): void {
console.debug(`${DEBUG_PREFIX} Disposing controller...`)
controller?.dispose()
controller = null
initError = null // Clear error on dispose to allow retry
console.log('[PageAgentExt] PageController disposed')
initError = null
console.debug(`${DEBUG_PREFIX} PageController disposed`)
}
)
// Register RPC message handler
registerRPCHandler(getController, () => controller, disposeController)
// Check if there's an active task that needs mask to be shown
// This handles page reload/navigation during task execution
setTimeout(async () => {
try {
const shouldShowMask = await contentScriptQuery.sendMessage(
'content:shouldShowMask',
undefined
)
if (shouldShowMask) {
console.log('[PageAgentExt] Restoring mask after page reload')
await getController().showMask()
}
} catch (error) {
// Ignore errors - background may not be ready
console.log('[PageAgentExt] shouldShowMask check skipped:', error)
}
}, 100)
setTimeout(() => queryShouldShowMask(getController), 100)
// Cleanup on page unload
window.addEventListener('beforeunload', () => {
console.debug(`${DEBUG_PREFIX} Page unloading, disposing controller`)
controller?.dispose()
controller = null
})
@@ -81,84 +75,178 @@ export default defineContentScript({
})
/**
* Report content script error to background for user visibility
* Query the sidepanel (via SW) whether mask should be shown
*/
function reportError(message: string): void {
contentScriptQuery
.sendMessage('content:error', { message, url: window.location.href })
.catch(() => {
// Silently ignore if background is not available
/**
 * Ask (through the extension runtime) whether the mask should be shown on
 * this tab and, if so, show it via the page controller.
 *
 * The answer arrives as a separate `query:response` message carrying the same
 * id as the query; if none arrives within 3 seconds the answer is `false`.
 * Failures are logged at debug level and never thrown.
 *
 * @param getController - Accessor for the page controller; only called when
 *                        the mask has to be shown.
 */
async function queryShouldShowMask(getController: () => PageController): Promise<void> {
	const tabId = await getCurrentTabId()
	if (!tabId) {
		console.debug(`${DEBUG_PREFIX} Cannot query shouldShowMask: no tab ID`)
		return
	}

	const queryId = generateMessageId()
	const queryMessage: CSQueryMessage = {
		type: 'cs:query',
		id: queryId,
		queryType: 'shouldShowMask',
		tabId,
	}

	try {
		// Start listening before the query is sent so the reply cannot be missed.
		const responsePromise = new Promise<boolean>((resolve) => {
			function onResponse(message: unknown): void {
				if (!isExtensionMessage(message) || message.type !== 'query:response') return

				const response = message as QueryResponseMessage
				if (response.id !== queryId) return

				clearTimeout(fallbackTimer)
				chrome.runtime.onMessage.removeListener(onResponse)
				resolve(response.result as boolean)
			}

			// No reply in time: stop listening and treat it as "do not show".
			const fallbackTimer = setTimeout(() => {
				chrome.runtime.onMessage.removeListener(onResponse)
				resolve(false)
			}, 3000)

			chrome.runtime.onMessage.addListener(onResponse)
		})

		await chrome.runtime.sendMessage(queryMessage)

		const shouldShowMask = await responsePromise
		console.debug(`${DEBUG_PREFIX} shouldShowMask result:`, shouldShowMask)

		if (shouldShowMask) {
			console.debug(`${DEBUG_PREFIX} Restoring mask after page reload`)
			await getController().showMask()
		}
	} catch (error) {
		console.debug(`${DEBUG_PREFIX} shouldShowMask query failed:`, error)
	}
}
/**
* Register all RPC message handlers for PageController methods
* Get current tab ID
*/
function registerRPCHandlers(
/**
 * Ask the extension runtime for this tab's ID with a `getTabId` message.
 *
 * @returns The `tabId` field of the reply, or `null` when the reply has none
 *          or the request fails.
 */
async function getCurrentTabId(): Promise<number | null> {
	let tabId: number | null = null

	try {
		const reply = await chrome.runtime.sendMessage({ type: 'getTabId' })
		tabId = reply?.tabId ?? null
	} catch {
		// Request failed: fall through and report "unknown" as null.
	}

	return tabId
}
/**
 * Install the runtime message listener that serves `cs:rpc` messages.
 *
 * Each RPC is executed by `handleRPCCall`; its result is passed to
 * `sendResponse`, and a failure is reported as `{ error: <message> }`.
 *
 * @param getController - Returns the page controller (see caller for creation policy).
 * @param getControllerIfExists - Returns the current controller or `null`.
 * @param disposeController - Disposes the current controller.
 */
function registerRPCHandler(
	getController: () => PageController,
	getControllerIfExists: () => PageController | null,
	disposeController: () => void
): void {
	chrome.runtime.onMessage.addListener(
		(
			message: unknown,
			_sender: chrome.runtime.MessageSender,
			sendResponse: (response?: unknown) => void
		): boolean => {
			// Anything that is not an RPC request is left for other listeners.
			if (!isExtensionMessage(message) || message.type !== 'cs:rpc') return false

			const { method, args } = message as CSRPCMessage
			console.debug(`${DEBUG_PREFIX} RPC: ${method}`, args)

			void (async () => {
				try {
					sendResponse(
						await handleRPCCall(
							method,
							args,
							getController,
							getControllerIfExists,
							disposeController
						)
					)
				} catch (error) {
					console.error(`${DEBUG_PREFIX} RPC ${method} failed:`, error)
					sendResponse({ error: error instanceof Error ? error.message : String(error) })
				}
			})()

			// Returning true keeps the channel open for the asynchronous sendResponse.
			return true
		}
	)

	console.debug(`${DEBUG_PREFIX} RPC handler registered`)
}
/**
* Handle an RPC call
*/
async function handleRPCCall(
method: RPCMethod,
args: unknown[],
getController: () => PageController,
getControllerIfExists: () => PageController | null,
disposeController: () => void
): Promise<unknown> {
switch (method) {
// State queries
pageControllerRPC.onMessage('rpc:getCurrentUrl', async () => {
case 'getCurrentUrl':
return getController().getCurrentUrl()
})
pageControllerRPC.onMessage('rpc:getLastUpdateTime', async () => {
case 'getLastUpdateTime':
return getController().getLastUpdateTime()
})
pageControllerRPC.onMessage('rpc:getBrowserState', async () => {
case 'getBrowserState':
return getController().getBrowserState()
})
// DOM operations
pageControllerRPC.onMessage('rpc:updateTree', async () => {
case 'updateTree':
return getController().updateTree()
})
pageControllerRPC.onMessage('rpc:cleanUpHighlights', async () => {
case 'cleanUpHighlights':
await getControllerIfExists()?.cleanUpHighlights()
})
return undefined
// Element actions
pageControllerRPC.onMessage('rpc:clickElement', async ({ data: index }) => {
return getController().clickElement(index)
})
case 'clickElement':
return getController().clickElement(args[0] as number)
pageControllerRPC.onMessage('rpc:inputText', async ({ data }) => {
return getController().inputText(data.index, data.text)
})
case 'inputText':
return getController().inputText(args[0] as number, args[1] as string)
pageControllerRPC.onMessage('rpc:selectOption', async ({ data }) => {
return getController().selectOption(data.index, data.optionText)
})
case 'selectOption':
return getController().selectOption(args[0] as number, args[1] as string)
pageControllerRPC.onMessage('rpc:scroll', async ({ data: options }) => {
return getController().scroll(options)
})
case 'scroll':
return getController().scroll(args[0] as Parameters<PageController['scroll']>[0])
pageControllerRPC.onMessage('rpc:scrollHorizontally', async ({ data: options }) => {
return getController().scrollHorizontally(options)
})
case 'scrollHorizontally':
return getController().scrollHorizontally(
args[0] as Parameters<PageController['scrollHorizontally']>[0]
)
pageControllerRPC.onMessage('rpc:executeJavascript', async ({ data: script }) => {
return getController().executeJavascript(script)
})
case 'executeJavascript':
return getController().executeJavascript(args[0] as string)
// Mask operations
pageControllerRPC.onMessage('rpc:showMask', async () => {
case 'showMask':
await getController().showMask()
})
return undefined
pageControllerRPC.onMessage('rpc:hideMask', async () => {
case 'hideMask':
await getControllerIfExists()?.hideMask()
})
return undefined
// Lifecycle - dispose clears the controller, next call will create fresh one
pageControllerRPC.onMessage('rpc:dispose', async () => {
// Lifecycle
case 'dispose':
disposeController()
})
return undefined
console.log('[PageAgentExt] RPC handlers registered')
default:
throw new Error(`Unknown RPC method: ${method}`)
}
}

View File

@@ -0,0 +1,378 @@
/**
* AgentController - Manages agent lifecycle in SidePanel context
*
* This class encapsulates all agent logic, keeping it isolated from the React UI.
* It runs entirely in the SidePanel frontend context, using the Background Script
* only as a stateless message relay for communicating with content scripts.
*
* Design goals:
* - Agent state lives here, not in Service Worker
* - SW is only a relay - no agent logic there
* - Future-proof: can be moved to other contexts (e.g., a controlling web page)
*/
import { PageAgentCore } from '@page-agent/core'
import type { AgentActivity, AgentStatus, ExecutionResult, HistoricalEvent } from '@page-agent/core'
import { RemotePageController } from '../../agent/RemotePageController'
import { type TabInfo, TabsManager } from '../../agent/TabsManager'
import { createTabTools } from '../../agent/tabTools'
import { DEMO_API_KEY, DEMO_BASE_URL, DEMO_MODEL } from '../../utils/constants'
/**
* LLM configuration.
*
* AgentController persists this object under the `llmConfig` key of
* chrome.storage.local and spreads it into the PageAgentCore options.
*/
export interface LLMConfig {
// Presumably the credential for the LLM provider — confirm against PageAgentCore
apiKey: string
// Presumably the provider endpoint root — confirm against PageAgentCore
baseURL: string
// Presumably the model identifier — confirm against PageAgentCore
model: string
}
/**
* Agent state snapshot for UI.
*
* Returned by AgentController.getState(); when no agent exists the snapshot is
* `{ status: 'idle', task: '', history: [] }`.
*/
export interface AgentState {
// Status of the agent at the time of the snapshot
status: AgentStatus
// Task string reported by the agent ('' when there is no agent)
task: string
// History events reported by the agent ([] when there is no agent)
history: HistoricalEvent[]
}
/**
* Event types emitted by AgentController.
*
* Each key is an event name that AgentController dispatches as a CustomEvent;
* the value is the type of that event's `detail` payload.
*/
export interface AgentControllerEvents {
// detail: the new agent status
statuschange: AgentStatus
// detail: the history event list
historychange: HistoricalEvent[]
// detail: the activity forwarded from the agent's own `activity` event
activity: AgentActivity
}
/**
 * Render the managed tabs as a text header for the browser state.
 *
 * The output is a "Tab List:" heading, one bullet per tab annotated with
 * `current` / `initial` / `restricted` markers, a blank line, a one-line hint
 * (a warning when the current tab is restricted, a usage note otherwise) and
 * a trailing blank line.
 *
 * @param tabs - Tabs to list, in display order.
 * @param currentTabId - ID shown in the hint line.
 * @returns The header text, or `''` when `tabs` is empty.
 */
function formatTabListHeader(tabs: TabInfo[], currentTabId: number | null): string {
	if (tabs.length === 0) return ''

	const describeTab = (tab: TabInfo): string => {
		const flags = [
			tab.isCurrent ? 'current' : null,
			tab.isInitial ? 'initial' : null,
			tab.isAccessible ? null : 'restricted',
		].filter((flag): flag is string => flag !== null)

		const suffix = flags.length > 0 ? ` (${flags.join(', ')})` : ''
		return `- [Tab ${tab.id}] ${tab.url}${suffix}`
	}

	const activeTab = tabs.find((tab) => tab.isCurrent)
	const onRestrictedPage = activeTab !== undefined && !activeTab.isAccessible

	const hint = onRestrictedPage
		? `⚠️ Current tab [${currentTabId}] is a restricted page. Use open_new_tab to navigate to a regular web page.`
		: `Note: All page info below belongs to current tab [${currentTabId}]. To view or operate on other tabs, use switch_to_tab first.`

	return ['Tab List:', ...tabs.map((tab) => describeTab(tab)), '', hint, ''].join('\n')
}
/**
 * AgentController manages the agent lifecycle in the SidePanel.
 *
 * It owns at most one PageAgentCore at a time, together with the
 * RemotePageController and TabsManager created for it, and re-emits the
 * agent's events as CustomEvents for the React UI:
 * - `statuschange`  (detail: AgentStatus)
 * - `historychange` (detail: HistoricalEvent[])
 * - `activity`      (detail: AgentActivity)
 */
export class AgentController extends EventTarget {
	private agent: PageAgentCore | null = null
	private tabsManager: TabsManager | null = null
	private pageController: RemotePageController | null = null
	private llmConfig: LLMConfig

	/** Current task being executed */
	currentTask = ''

	constructor() {
		super()
		// Default to demo config
		this.llmConfig = {
			apiKey: DEMO_API_KEY,
			baseURL: DEMO_BASE_URL,
			model: DEMO_MODEL,
		}
	}

	/**
	 * Initialize controller and load saved config
	 */
	async init(): Promise<void> {
		await this.loadConfig()
		console.log('[AgentController] Initialized')
	}

	/**
	 * Load LLM configuration from storage.
	 * Keeps the current (demo) config when nothing is stored.
	 */
	private async loadConfig(): Promise<void> {
		const result = await chrome.storage.local.get('llmConfig')
		if (result.llmConfig) {
			this.llmConfig = result.llmConfig as LLMConfig
			console.log('[AgentController] Loaded LLM config from storage')
		} else {
			console.log('[AgentController] Using default demo config')
		}
	}

	/**
	 * Save LLM configuration to storage and dispose the current agent, if any,
	 * so that the next task is started with the new configuration.
	 *
	 * @param config - Configuration to persist and use from now on.
	 */
	async configure(config: LLMConfig): Promise<void> {
		this.llmConfig = config
		await chrome.storage.local.set({ llmConfig: config })
		console.log('[AgentController] Saved LLM config')

		// Dispose existing agent if any
		if (this.agent && !this.agent.disposed) {
			this.agent.dispose()
			this.agent = null
		}
	}

	/**
	 * Get a shallow copy of the current LLM config
	 */
	getConfig(): LLMConfig {
		return { ...this.llmConfig }
	}

	/**
	 * Get current agent state (idle/empty snapshot when there is no agent)
	 */
	getState(): AgentState {
		if (!this.agent) {
			return {
				status: 'idle',
				task: '',
				history: [],
			}
		}

		return {
			status: this.agent.status,
			task: this.agent.task,
			history: this.agent.history,
		}
	}

	/**
	 * Get current agent status ('idle' when there is no agent)
	 */
	get status(): AgentStatus {
		return this.agent?.status ?? 'idle'
	}

	/**
	 * Get agent history (empty when there is no agent)
	 */
	get history(): HistoricalEvent[] {
		return this.agent?.history ?? []
	}

	/**
	 * Check if a tab is managed by this controller
	 */
	isTabManaged(tabId: number): boolean {
		return this.tabsManager?.isTabManaged(tabId) ?? false
	}

	/**
	 * Get current tab ID (null when there is no tabs manager)
	 */
	getCurrentTabId(): number | null {
		return this.tabsManager?.getCurrentTabId() ?? null
	}

	/**
	 * Create and configure an agent instance together with its own
	 * RemotePageController and TabsManager.
	 *
	 * The per-agent instances are kept in local constants and captured by the
	 * agent's callbacks. The `this.*` fields can be reset or replaced by
	 * execute()/dispose() while an older agent is still cleaning up, so the
	 * callbacks must never reach their resources through `this`.
	 */
	private async createAgent(): Promise<PageAgentCore> {
		const pageController = new RemotePageController()
		const tabsManager = new TabsManager()
		this.pageController = pageController
		this.tabsManager = tabsManager

		// Generate task ID
		const taskId = Math.random().toString(36).slice(2, 10)

		// Initialize tabs manager
		await tabsManager.init(taskId, pageController)

		// Create tab tools
		const tabTools = createTabTools(tabsManager)

		const newAgent = new PageAgentCore({
			...this.llmConfig,
			// Type assertion: the proxy is used in place of a PageController
			pageController: this.createPageControllerProxy(pageController, tabsManager) as any,
			language: 'en-US',
			customTools: tabTools,
			onBeforeStep: async (agentInstance: PageAgentCore) => {
				// Skip once this agent's tabs manager is no longer the active one,
				// so a stale agent never drains another task's change queue.
				if (this.tabsManager !== tabsManager) return

				// Check for tab changes and push observations
				const changes = tabsManager.getAndClearChanges()

				for (const tab of changes.opened) {
					agentInstance.pushObservation(`New tab opened: [Tab ${tab.id}] ${tab.url}`)
				}

				for (const tab of changes.closed) {
					agentInstance.pushObservation(`Tab closed: [Tab ${tab.id}] ${tab.url}`)
				}

				if (changes.currentSwitched?.reason === 'user_close') {
					agentInstance.pushObservation(
						`⚠️ Current tab [${changes.currentSwitched.from}] was closed. Auto-switched to tab [${changes.currentSwitched.to}].`
					)
				}
			},
		})

		// Forward agent events
		newAgent.addEventListener('statuschange', () => {
			this.dispatchEvent(new CustomEvent('statuschange', { detail: newAgent.status }))
		})

		newAgent.addEventListener('historychange', () => {
			this.dispatchEvent(new CustomEvent('historychange', { detail: newAgent.history }))
		})

		newAgent.addEventListener('activity', (e: Event) => {
			const activity = (e as CustomEvent).detail as AgentActivity
			this.dispatchEvent(new CustomEvent('activity', { detail: activity }))
		})

		// This listener is async and nobody awaits it: every caller of
		// agent.dispose() in this class carries on immediately and may reset or
		// replace the fields. Clean up only the captured per-agent instances and
		// reset a field only if it still points at this agent's instance.
		newAgent.addEventListener('dispose', async () => {
			console.debug('[AgentController] Agent dispose event received')

			try {
				// Dispose all PageControllers on all managed tabs
				console.debug('[AgentController] Disposing all PageControllers...')
				await tabsManager.disposeAllPageControllers()
			} catch (error) {
				// Best effort: still release the tabs manager and report idle below.
				console.error('[AgentController] Failed to dispose PageControllers:', error)
			}
			tabsManager.dispose()

			if (this.agent === newAgent) this.agent = null
			if (this.tabsManager === tabsManager) this.tabsManager = null
			if (this.pageController === pageController) this.pageController = null
			console.debug('[AgentController] Agent and TabsManager disposed')

			this.dispatchEvent(new CustomEvent('statuschange', { detail: 'idle' }))
		})

		return newAgent
	}

	/**
	 * Create a proxy for PageController that injects tab info into BrowserState.header.
	 *
	 * Only `getBrowserState` is intercepted; every other property is read
	 * straight from the wrapped controller.
	 */
	private createPageControllerProxy(
		controller: RemotePageController,
		tabs: TabsManager
	): RemotePageController {
		return new Proxy(controller, {
			get(target, prop, receiver) {
				if (prop === 'getBrowserState') {
					return async function () {
						const state = await target.getBrowserState()
						const tabList = await tabs.getTabList()
						const currentTabId = tabs.getCurrentTabId()

						// Prepend the tab list to whatever header the page already has
						const tabHeader = formatTabListHeader(tabList, currentTabId)

						return {
							...state,
							header: tabHeader + (state.header || ''),
						}
					}
				}
				return Reflect.get(target, prop, receiver)
			},
		})
	}

	/**
	 * Execute a task with a freshly created agent.
	 *
	 * Any existing agent is disposed first. Errors are not thrown: they are
	 * reported through a `historychange` event containing a single error entry
	 * followed by a `statuschange` event with status 'error'.
	 *
	 * @param task - Task description handed to the agent.
	 * @returns The agent's execution result, or `null` when execution failed.
	 */
	async execute(task: string): Promise<ExecutionResult | null> {
		console.log('[AgentController] ===== EXECUTE TASK =====')
		console.log('[AgentController] Task:', task)

		this.currentTask = task

		// Emit running status immediately
		this.dispatchEvent(new CustomEvent('statuschange', { detail: 'running' }))

		try {
			// Clean up any existing agent
			if (this.agent && !this.agent.disposed) {
				console.log('[AgentController] Disposing existing agent before new task')
				this.agent.dispose()
				// Short grace period before starting over (the old agent's cleanup is async)
				await new Promise((r) => setTimeout(r, 100))
			}

			// Clear old references
			this.agent = null
			this.tabsManager = null
			this.pageController = null

			// Create fresh agent
			console.log('[AgentController] Creating new agent...')
			this.agent = await this.createAgent()
			console.log('[AgentController] Agent created successfully')

			// Execute task
			console.log('[AgentController] Starting task execution...')
			const result = await this.agent.execute(task)
			console.log('[AgentController] Task completed:', result)
			return result
		} catch (error) {
			console.error('[AgentController] Task execution error:', error)
			const message = error instanceof Error ? error.message : String(error)

			this.dispatchEvent(
				new CustomEvent('historychange', {
					detail: [{ type: 'error', message } as HistoricalEvent],
				})
			)
			this.dispatchEvent(new CustomEvent('statuschange', { detail: 'error' }))
			return null
		}
	}

	/**
	 * Stop current task by disposing the agent; cleanup happens in the agent's
	 * `dispose` listener.
	 */
	stop(): void {
		console.log('[AgentController] Stopping agent')
		if (this.agent) {
			this.agent.dispose()
		}
	}

	/**
	 * Dispose controller and clean up
	 */
	dispose(): void {
		console.log('[AgentController] Disposing controller')
		if (this.agent && !this.agent.disposed) {
			this.agent.dispose()
		}
		// The agent's dispose listener works on its own captured instances, so
		// dropping the references here does not interfere with its cleanup.
		this.agent = null
		this.tabsManager = null
		this.pageController = null
		this.currentTask = ''
	}
}
// Lazily created singleton; stays null until the first getAgentController() call
let controllerInstance: AgentController | null = null

/**
 * Return the shared AgentController, creating it on first use.
 */
export function getAgentController(): AgentController {
	controllerInstance = controllerInstance ?? new AgentController()
	return controllerInstance
}

View File

@@ -8,65 +8,19 @@ import {
InputGroupButton,
InputGroupTextarea,
} from '@/components/ui/input-group'
import { subscribeToEvents } from '@/messaging/events'
import { agentCommands } from '@/messaging/protocol'
import type { AgentActivity, AgentState, AgentStatus, HistoricalEvent } from '@/messaging/protocol'
import { DEMO_API_KEY, DEMO_BASE_URL, DEMO_MODEL } from '@/utils/constants'
import { EmptyState, Logo, StatusDot } from './components'
import { ConfigPanel } from './components/ConfigPanel'
import { ActivityCard, EventCard } from './components/cards'
import { EmptyState, Logo, StatusDot } from './components/misc'
import { useAgent } from './useAgent'
export default function App() {
const [showConfig, setShowConfig] = useState(false)
const [task, setTask] = useState('')
const [status, setStatus] = useState<AgentStatus>('idle')
const [history, setHistory] = useState<HistoricalEvent[]>([])
const [activity, setActivity] = useState<AgentActivity | null>(null)
const [currentTask, setCurrentTask] = useState('')
const historyRef = useRef<HTMLDivElement>(null)
const textareaRef = useRef<HTMLTextAreaElement>(null)
// Subscribe to agent events
useEffect(() => {
// Initialize with demo config if not set
chrome.storage.local.get('llmConfig').then((result) => {
if (!result.llmConfig) {
chrome.storage.local.set({
llmConfig: { apiKey: DEMO_API_KEY, baseURL: DEMO_BASE_URL, model: DEMO_MODEL },
})
}
})
const unsubscribe = subscribeToEvents({
onStatus: (newStatus) => {
setStatus(newStatus)
if (newStatus === 'idle' || newStatus === 'completed' || newStatus === 'error') {
setActivity(null)
}
},
onHistory: (newHistory) => {
setHistory(newHistory)
},
onActivity: (newActivity) => {
setActivity(newActivity)
},
onStateSnapshot: (state) => {
setStatus(state.status)
setHistory(state.history)
setCurrentTask(state.task)
},
})
// Get initial state
agentCommands.sendMessage('agent:getState', undefined).then((state: AgentState) => {
setStatus(state.status)
setHistory(state.history)
setCurrentTask(state.task)
})
return unsubscribe
}, [])
const { status, history, activity, currentTask, config, execute, stop, configure } = useAgent()
// Auto-scroll to bottom on new events
useEffect(() => {
@@ -76,21 +30,25 @@ export default function App() {
}, [history, activity])
const handleSubmit = useCallback(
async (e?: React.FormEvent) => {
(e?: React.FormEvent) => {
e?.preventDefault()
if (!task.trim() || status === 'running') return
setCurrentTask(task)
setHistory([])
await agentCommands.sendMessage('agent:execute', task)
const taskToExecute = task.trim()
setTask('')
console.log('[SidePanel] Executing task:', taskToExecute)
execute(taskToExecute).catch((error) => {
console.error('[SidePanel] Failed to execute task:', error)
})
},
[task, status]
[task, status, execute]
)
const handleStop = useCallback(async () => {
await agentCommands.sendMessage('agent:stop', undefined)
}, [])
const handleStop = useCallback(() => {
console.log('[SidePanel] Stopping task...')
stop()
}, [stop])
const handleKeyDown = (e: React.KeyboardEvent) => {
if (e.key === 'Enter' && !e.shiftKey) {
@@ -100,7 +58,16 @@ export default function App() {
}
if (showConfig) {
return <ConfigPanel onClose={() => setShowConfig(false)} />
return (
<ConfigPanel
config={config}
onSave={async (newConfig) => {
await configure(newConfig)
setShowConfig(false)
}}
onClose={() => setShowConfig(false)}
/>
)
}
const isRunning = status === 'running'
@@ -157,7 +124,6 @@ export default function App() {
onChange={(e) => setTask(e.target.value)}
onKeyDown={handleKeyDown}
disabled={isRunning}
// rows={2}
className="text-xs pr-12 min-h-10"
/>
<InputGroupAddon align="inline-end" className="absolute bottom-0 right-0">

View File

@@ -1,34 +1,35 @@
import { Loader2 } from 'lucide-react'
import { useEffect, useState } from 'react'
import { Button } from '@/components/ui/button'
import { Input } from '@/components/ui/input'
import { agentCommands } from '@/messaging'
import { DEMO_API_KEY, DEMO_BASE_URL, DEMO_MODEL } from '@/utils/constants'
// Configuration panel component
export function ConfigPanel({ onClose }: { onClose: () => void }) {
const [apiKey, setApiKey] = useState(DEMO_API_KEY)
const [baseURL, setBaseURL] = useState(DEMO_BASE_URL)
const [model, setModel] = useState(DEMO_MODEL)
import type { LLMConfig } from '../AgentController'
interface ConfigPanelProps {
config: LLMConfig
onSave: (config: LLMConfig) => Promise<void>
onClose: () => void
}
export function ConfigPanel({ config, onSave, onClose }: ConfigPanelProps) {
const [apiKey, setApiKey] = useState(config.apiKey || DEMO_API_KEY)
const [baseURL, setBaseURL] = useState(config.baseURL || DEMO_BASE_URL)
const [model, setModel] = useState(config.model || DEMO_MODEL)
const [saving, setSaving] = useState(false)
// Update local state when config prop changes
useEffect(() => {
chrome.storage.local.get('llmConfig').then((result) => {
const config = result.llmConfig as
| { apiKey?: string; baseURL?: string; model?: string }
| undefined
if (config) {
setApiKey(config.apiKey || DEMO_API_KEY)
setBaseURL(config.baseURL || DEMO_BASE_URL)
setModel(config.model || DEMO_MODEL)
}
})
}, [])
}, [config])
const handleSave = async () => {
setSaving(true)
try {
await agentCommands.sendMessage('agent:configure', { apiKey, baseURL, model })
onClose()
await onSave({ apiKey, baseURL, model })
} finally {
setSaving(false)
}

View File

@@ -1,8 +1,10 @@
import {
type AgentErrorEvent,
type AgentStepEvent,
type ObservationEvent,
type RetryEvent,
import type {
AgentActivity,
AgentErrorEvent,
AgentStepEvent,
HistoricalEvent,
ObservationEvent,
RetryEvent,
} from '@page-agent/core'
import {
CheckCircle,
@@ -21,7 +23,6 @@ import {
import { Fragment, useState } from 'react'
import { cn } from '@/lib/utils'
import { AgentActivity, HistoricalEvent } from '@/messaging'
// Result card for done action
function ResultCard({

View File

@@ -1,5 +1,6 @@
import type { AgentStatus } from '@page-agent/core'
import { cn } from '@/lib/utils'
import { AgentStatus } from '@/messaging'
// Status dot indicator
export function StatusDot({ status }: { status: AgentStatus }) {

View File

@@ -0,0 +1,153 @@
/**
* React hook for using AgentController
*
* This hook provides a React-friendly interface to the AgentController,
* handling event subscriptions and state updates.
*/
import type { AgentActivity, AgentStatus, HistoricalEvent } from '@page-agent/core'
import { useCallback, useEffect, useRef, useState } from 'react'
import type { CSQueryMessage } from '../../messaging/protocol'
import { isExtensionMessage } from '../../messaging/protocol'
import { type AgentController, type LLMConfig, getAgentController } from './AgentController'
/**
* Value returned by the useAgent() hook: React state mirroring the
* AgentController plus the actions the UI can trigger.
*/
export interface UseAgentResult {
// State
// Latest status from the controller's `statuschange` event (initially 'idle')
status: AgentStatus
// Copy of the latest `historychange` payload (initially empty)
history: HistoricalEvent[]
// Latest `activity` payload; reset to null when status becomes idle/completed/error
activity: AgentActivity | null
// Initially '' — NOTE(review): where it is updated is outside the visible code
currentTask: string
// Empty strings until controller.init() resolves, then controller.getConfig()
config: LLMConfig
// Actions
// NOTE(review): implementations are outside the visible code; presumably they
// delegate to the AgentController methods of the same name — confirm
execute: (task: string) => Promise<void>
stop: () => void
configure: (config: LLMConfig) => Promise<void>
}
/**
 * React hook that connects a component to the AgentController returned by
 * `getAgentController()`.
 *
 * On mount it initializes the controller, mirrors its 'statuschange',
 * 'historychange' and 'activity' events into React state, and answers
 * `shouldShowMask` queries arriving through `chrome.runtime.onMessage`.
 * On unmount it removes every listener it added and disposes the controller.
 *
 * @returns Current agent state together with `execute`, `stop` and
 *          `configure` callbacks (stable across renders).
 */
export function useAgent(): UseAgentResult {
	const controllerRef = useRef<AgentController | null>(null)
	const [status, setStatus] = useState<AgentStatus>('idle')
	const [history, setHistory] = useState<HistoricalEvent[]>([])
	const [activity, setActivity] = useState<AgentActivity | null>(null)
	const [currentTask, setCurrentTask] = useState('')
	const [config, setConfig] = useState<LLMConfig>({
		apiKey: '',
		baseURL: '',
		model: '',
	})

	// Initialize controller and subscribe to events
	useEffect(() => {
		const controller = getAgentController()
		controllerRef.current = controller

		// Flipped by the cleanup below so that an init() that settles after
		// unmount does not read from a disposed controller or update state.
		let unmounted = false

		// Initialize. A failed init is logged instead of surfacing as an
		// unhandled promise rejection.
		controller
			.init()
			.then(() => {
				if (!unmounted) setConfig(controller.getConfig())
			})
			.catch((error: unknown) => {
				console.error('[useAgent] Failed to initialize AgentController:', error)
			})

		// Event handlers
		const handleStatusChange = (e: Event) => {
			const newStatus = (e as CustomEvent).detail as AgentStatus
			setStatus(newStatus)
			// Activity is transient: drop it once the agent is no longer running
			if (newStatus === 'idle' || newStatus === 'completed' || newStatus === 'error') {
				setActivity(null)
			}
		}

		const handleHistoryChange = (e: Event) => {
			const newHistory = (e as CustomEvent).detail as HistoricalEvent[]
			// Copy so React sees a new array reference on every event
			setHistory([...newHistory])
		}

		const handleActivity = (e: Event) => {
			const newActivity = (e as CustomEvent).detail as AgentActivity
			setActivity(newActivity)
		}

		controller.addEventListener('statuschange', handleStatusChange)
		controller.addEventListener('historychange', handleHistoryChange)
		controller.addEventListener('activity', handleActivity)

		// Handle shouldShowMask queries from content scripts.
		// Returning true means this listener answered via sendResponse;
		// returning false leaves the message to other listeners.
		const handleMessage = (
			message: unknown,
			_sender: chrome.runtime.MessageSender,
			sendResponse: (response?: unknown) => void
		): boolean => {
			if (!isExtensionMessage(message)) return false
			if (message.type !== 'cs:query') return false

			const query = message as CSQueryMessage

			if (query.queryType === 'shouldShowMask') {
				const ctrl = controllerRef.current
				if (!ctrl) {
					sendResponse(false)
					return true
				}

				// The mask is shown only on the managed tab that is currently
				// targeted while a task is running.
				const isManaged = ctrl.isTabManaged(query.tabId)
				const isCurrent = ctrl.getCurrentTabId() === query.tabId
				const isRunning = ctrl.status === 'running'
				const shouldShow = isManaged && isCurrent && isRunning

				console.debug('[useAgent] shouldShowMask query:', {
					tabId: query.tabId,
					isManaged,
					isCurrent,
					isRunning,
					shouldShow,
				})

				sendResponse(shouldShow)
				return true
			}

			return false
		}

		chrome.runtime.onMessage.addListener(handleMessage)

		// Cleanup
		return () => {
			unmounted = true
			controller.removeEventListener('statuschange', handleStatusChange)
			controller.removeEventListener('historychange', handleHistoryChange)
			controller.removeEventListener('activity', handleActivity)
			chrome.runtime.onMessage.removeListener(handleMessage)
			controller.dispose()
		}
	}, [])

	const execute = useCallback(async (task: string) => {
		const controller = controllerRef.current
		if (!controller) return

		setCurrentTask(task)
		setHistory([])
		await controller.execute(task)
	}, [])

	const stop = useCallback(() => {
		controllerRef.current?.stop()
	}, [])

	const configure = useCallback(async (newConfig: LLMConfig) => {
		const controller = controllerRef.current
		if (!controller) return

		await controller.configure(newConfig)
		setConfig(newConfig)
	}, [])

	return {
		status,
		history,
		activity,
		currentTask,
		config,
		execute,
		stop,
		configure,
	}
}

View File

@@ -1,98 +0,0 @@
/**
* Agent Event Broadcasting
*
* This module handles broadcasting agent events from Background to SidePanel.
* Uses chrome.runtime API for broadcasting to all extension contexts.
*/
import type { AgentActivity, AgentState, AgentStatus, HistoricalEvent } from './protocol'
// Event type constants
const EVENT_TYPES = {
STATUS: 'event:status',
HISTORY: 'event:history',
ACTIVITY: 'event:activity',
STATE_SNAPSHOT: 'event:stateSnapshot',
} as const
type EventType = (typeof EVENT_TYPES)[keyof typeof EVENT_TYPES]
interface EventMessage<T = unknown> {
type: EventType
payload: T
}
/**
 * Wrap `payload` in an `{ type, payload }` envelope and hand it to
 * `chrome.runtime.sendMessage`. A rejected send is deliberately discarded,
 * which is what happens when no listener is active.
 */
function broadcast<T>(type: EventType, payload: T): void {
	const envelope: EventMessage<T> = { type, payload }
	// Rejections are expected when nothing is listening, so they are dropped.
	const discardRejection = (): void => {}
	chrome.runtime.sendMessage(envelope).catch(discardRejection)
}
/**
 * Typed front-end over `broadcast`: one sender per agent event kind.
 * Intended to be called from Background so that SidePanel learns of changes.
 */
export const eventBroadcaster = {
	/** Send the new agent status. */
	status: (status: AgentStatus): void => broadcast(EVENT_TYPES.STATUS, status),

	/** Send the updated history list. */
	history: (history: HistoricalEvent[]): void => broadcast(EVENT_TYPES.HISTORY, history),

	/** Send a transient activity notification. */
	activity: (activity: AgentActivity): void => broadcast(EVENT_TYPES.ACTIVITY, activity),

	/** Send a complete state snapshot. */
	stateSnapshot: (state: AgentState): void => broadcast(EVENT_TYPES.STATE_SNAPSHOT, state),
}
/**
* Event listener type for SidePanel.
*
* Every callback is optional; subscribeToEvents() invokes the one matching
* the incoming message type and skips the message when that callback is absent.
*/
export interface EventListener {
/** Called with the payload of an 'event:status' message. */
onStatus?: (status: AgentStatus) => void
/** Called with the payload of an 'event:history' message. */
onHistory?: (history: HistoricalEvent[]) => void
/** Called with the payload of an 'event:activity' message. */
onActivity?: (activity: AgentActivity) => void
/** Called with the payload of an 'event:stateSnapshot' message. */
onStateSnapshot?: (state: AgentState) => void
}
/**
 * Register a `chrome.runtime.onMessage` listener that routes agent event
 * messages to the matching callback on `listener`.
 *
 * Callbacks are looked up on `listener` each time a message arrives; messages
 * whose type is not one of EVENT_TYPES are ignored.
 *
 * @param listener Optional callbacks, one per event kind.
 * @returns A function that removes the registered listener.
 */
export function subscribeToEvents(listener: EventListener): () => void {
	const onRuntimeMessage = (message: EventMessage): void => {
		const kind = message.type
		if (kind === EVENT_TYPES.STATUS) {
			listener.onStatus?.(message.payload as AgentStatus)
		} else if (kind === EVENT_TYPES.HISTORY) {
			listener.onHistory?.(message.payload as HistoricalEvent[])
		} else if (kind === EVENT_TYPES.ACTIVITY) {
			listener.onActivity?.(message.payload as AgentActivity)
		} else if (kind === EVENT_TYPES.STATE_SNAPSHOT) {
			listener.onStateSnapshot?.(message.payload as AgentState)
		}
	}

	chrome.runtime.onMessage.addListener(onRuntimeMessage)

	const unsubscribe = (): void => {
		chrome.runtime.onMessage.removeListener(onRuntimeMessage)
	}
	return unsubscribe
}

View File

@@ -3,4 +3,3 @@
*/
export * from './protocol'
export * from './rpc'
export * from './events'

View File

@@ -1,15 +1,19 @@
/**
* Message Protocol for PageAgentExt
*
* This file defines all message types for cross-context communication:
* - RPC: Background <-> ContentScript (PageController remote calls)
* - Commands: SidePanel -> Background (user actions)
* - Events: Background -> SidePanel (agent state updates)
* NEW ARCHITECTURE (MV3 compliant):
* - SidePanel hosts the agent, all state lives there
* - Background (SW) is a stateless message relay
* - Content Script runs PageController
*
* Message flows:
* 1. RPC: SidePanel → SW → ContentScript → SW → SidePanel (PageController calls)
* 2. Query: ContentScript → SW → SidePanel → SW → ContentScript (mask state check)
* 3. Events: SW → SidePanel (tab events from chrome.tabs API)
*/
import { defineExtensionMessaging } from '@webext-core/messaging'
// ============================================================================
// Shared Types (re-exported from core packages for convenience)
// Shared Types
// ============================================================================
/** Action result from PageController operations */
@@ -42,146 +46,138 @@ export interface ScrollHorizontallyOptions {
index?: number
}
/** Agent execution status */
export type AgentStatus = 'idle' | 'running' | 'completed' | 'error'
// ============================================================================
// Message Types
// ============================================================================
/** Agent activity for real-time UI feedback */
export type AgentActivity =
| { type: 'thinking' }
| { type: 'executing'; tool: string; input: unknown }
| { type: 'executed'; tool: string; input: unknown; output: string; duration: number }
| { type: 'retrying'; attempt: number; maxAttempts: number }
| { type: 'error'; message: string }
/** Message type identifier */
type MessageType =
| 'rpc:call' // SidePanel → SW: RPC call to content script
| 'rpc:response' // SW → SidePanel: RPC response from content script
| 'cs:rpc' // SW → ContentScript: Forwarded RPC call
| 'cs:query' // ContentScript → SW: Query to sidepanel
| 'query:response' // SW → ContentScript: Query response
| 'tab:event' // SW → SidePanel: Tab event notification
/** Historical event (simplified for serialization) */
export interface HistoricalEvent {
type: 'step' | 'observation' | 'user_takeover' | 'retry' | 'error'
// For 'step' type
stepIndex?: number
reflection?: {
evaluation_previous_goal?: string
memory?: string
next_goal?: string
}
action?: {
name: string
input: unknown
output: string
}
// For 'observation' type
content?: string
// For 'retry' type
attempt?: number
maxAttempts?: number
// For 'error' and 'retry' types
message?: string
// Raw LLM response for debugging (step and error types)
rawResponse?: unknown
}
/** Agent state snapshot */
export interface AgentState {
status: AgentStatus
task: string
history: HistoricalEvent[]
/** Base message structure */
interface BaseMessage {
type: MessageType
id: string // Unique message ID for request-response matching
}
// ============================================================================
// RPC Protocol: Background <-> ContentScript
// Used by RemotePageController to call PageController methods
// RPC Messages (SidePanel ↔ SW ↔ ContentScript)
// ============================================================================
export interface PageControllerRPCProtocol {
// State queries
'rpc:getCurrentUrl': () => string
'rpc:getLastUpdateTime': () => number
'rpc:getBrowserState': () => BrowserState
/** RPC method names matching PageController interface */
export type RPCMethod =
| 'getCurrentUrl'
| 'getLastUpdateTime'
| 'getBrowserState'
| 'updateTree'
| 'cleanUpHighlights'
| 'clickElement'
| 'inputText'
| 'selectOption'
| 'scroll'
| 'scrollHorizontally'
| 'executeJavascript'
| 'showMask'
| 'hideMask'
| 'dispose'
// DOM operations
'rpc:updateTree': () => string
'rpc:cleanUpHighlights': () => void
/** SidePanel → SW: Request to call PageController method */
export interface RPCCallMessage extends BaseMessage {
type: 'rpc:call'
/** Tab the call is addressed to (the tab the RPC client was created for). */
tabId: number
/** Name of the PageController method to invoke. */
method: RPCMethod
/** Positional arguments for the method, e.g. [index, text] for inputText; [] when it takes none. */
args: unknown[]
}
// Element actions
'rpc:clickElement': (index: number) => ActionResult
'rpc:inputText': (data: { index: number; text: string }) => ActionResult
'rpc:selectOption': (data: { index: number; optionText: string }) => ActionResult
'rpc:scroll': (options: ScrollOptions) => ActionResult
'rpc:scrollHorizontally': (options: ScrollHorizontallyOptions) => ActionResult
'rpc:executeJavascript': (script: string) => ActionResult
/**
 * SW → SidePanel: Response from PageController.
 * The RPC client matches it to a pending call by `id`.
 */
export interface RPCResponseMessage extends BaseMessage {
type: 'rpc:response'
/** true: the pending call resolves with `result`; false: it rejects with `error`. */
success: boolean
/** Value the pending call resolves with when `success` is true. */
result?: unknown
/** Rejection message when `success` is false; the client falls back to 'RPC call failed' if empty or missing. */
error?: string
}
// Mask operations
'rpc:showMask': () => void
'rpc:hideMask': () => void
// Lifecycle
'rpc:dispose': () => void
/** SW → ContentScript: Forwarded RPC call */
export interface CSRPCMessage extends BaseMessage {
type: 'cs:rpc'
method: RPCMethod
args: unknown[]
}
// ============================================================================
// Command Protocol: SidePanel -> Background
// Used by SidePanel UI to control the agent
// Query Messages (ContentScript → SW → SidePanel)
// ============================================================================
export interface AgentCommandProtocol {
// Task control
'agent:execute': (task: string) => void
'agent:stop': () => void
/** Query types that content script can ask */
export type QueryType = 'shouldShowMask'
// State queries
'agent:getState': () => AgentState
/** ContentScript → SW: Query to sidepanel */
export interface CSQueryMessage extends BaseMessage {
type: 'cs:query'
/** Which question is being asked; 'shouldShowMask' is the only kind defined. */
queryType: QueryType
/**
 * Tab the query is about. The sidepanel compares it with its managed and current tab ids.
 * NOTE(review): which context fills this in is not visible here — confirm against the sender.
 */
tabId: number
}
// Configuration
'agent:configure': (config: { apiKey: string; baseURL: string; model: string }) => void
/** SW → ContentScript: Query response */
export interface QueryResponseMessage extends BaseMessage {
type: 'query:response'
result: unknown
}
// ============================================================================
// Content Script Query Protocol: ContentScript -> Background
// Used by ContentScript to query Background state
// Tab Event Messages (SW → SidePanel)
// ============================================================================
export interface ContentScriptQueryProtocol {
/** Check if there's an active task for this tab, returns true if mask should be shown */
'content:shouldShowMask': () => boolean
/** Report content script initialization error to background */
'content:error': (error: { message: string; url: string }) => void
/** Tab event types */
export type TabEventType = 'removed' | 'updated'
/** SW → SidePanel: Tab event notification */
export interface TabEventMessage extends BaseMessage {
type: 'tab:event'
eventType: TabEventType
tabId: number
data?: {
// For 'updated' events
status?: string
url?: string
}
}
// ============================================================================
// Event Protocol: Background -> SidePanel
// Used by Background to push updates to SidePanel
// Union Types
// ============================================================================
export interface AgentEventProtocol {
'event:status': (status: AgentStatus) => void
'event:history': (history: HistoricalEvent[]) => void
'event:activity': (activity: AgentActivity) => void
'event:stateSnapshot': (state: AgentState) => void
/** All message types */
export type ExtensionMessage =
| RPCCallMessage
| RPCResponseMessage
| CSRPCMessage
| CSQueryMessage
| QueryResponseMessage
| TabEventMessage
// ============================================================================
// Utility Functions
// ============================================================================
/**
 * Build an identifier for a new message: the current epoch-millisecond
 * timestamp, a dash, then up to six base-36 characters taken from
 * Math.random().
 */
export function generateMessageId(): string {
	const timestamp = Date.now()
	const randomSuffix = Math.random().toString(36).slice(2, 8)
	return [timestamp, randomSuffix].join('-')
}
// ============================================================================
// Messaging Instances
// ============================================================================
/**
* RPC messaging for PageController remote calls
* Background sends, ContentScript receives
*/
export const pageControllerRPC = defineExtensionMessaging<PageControllerRPCProtocol>()
/**
* Command messaging for agent control
* SidePanel sends, Background receives
*/
export const agentCommands = defineExtensionMessaging<AgentCommandProtocol>()
/**
* Event messaging for agent updates
* Background sends, SidePanel receives
*/
export const agentEvents = defineExtensionMessaging<AgentEventProtocol>()
/**
* Content script query messaging
* ContentScript sends, Background receives
*/
export const contentScriptQuery = defineExtensionMessaging<ContentScriptQueryProtocol>()
/**
 * Type guard for messages of this extension's protocol.
 *
 * Accepts any non-null object that has both a `type` and an `id` property
 * whose values are strings. Only the shape is checked; the value of `type`
 * is not compared against the known message types.
 */
export function isExtensionMessage(msg: unknown): msg is ExtensionMessage {
	if (typeof msg !== 'object' || msg === null) return false
	if (!('type' in msg) || !('id' in msg)) return false

	const candidate = msg as { type: unknown; id: unknown }
	return typeof candidate.type === 'string' && typeof candidate.id === 'string'
}

View File

@@ -1,38 +1,75 @@
/**
* RPC utilities for PageController remote calls
* RPC Client for PageController remote calls
*
* This module provides helper functions for making RPC calls
* from Background to ContentScript with proper error handling.
* This module provides RPC functionality from SidePanel to ContentScript
* via the Background (SW) relay.
*
* Flow: SidePanel → SW (relay) → ContentScript → SW → SidePanel
*/
import { pageControllerRPC } from './protocol'
import type {
ActionResult,
BrowserState,
ScrollHorizontallyOptions,
ScrollOptions,
import {
type ActionResult,
type BrowserState,
type RPCCallMessage,
type RPCMethod,
type RPCResponseMessage,
type ScrollHorizontallyOptions,
type ScrollOptions,
generateMessageId,
isExtensionMessage,
} from './protocol'
/** RPC call configuration */
/** RPC configuration */
const RPC_CONFIG = {
/** Maximum retry attempts for transient failures */
maxRetries: 3,
/** Base delay between retries in ms (exponential backoff) */
retryDelayMs: 500,
/** Timeout for waiting for content script to be ready */
readyTimeoutMs: 5000,
/** Timeout for individual RPC call in ms */
callTimeoutMs: 30000,
}
/**
* Error thrown when RPC call fails due to tab/content script issues
*/
export class RPCError extends Error {
constructor(
message: string,
public readonly code: 'TAB_CLOSED' | 'CONTENT_SCRIPT_NOT_READY' | 'RPC_FAILED'
) {
super(message)
this.name = 'RPCError'
/**
 * Pending RPC calls waiting for response, keyed by the message id of the call.
 * An entry is added when a call is sent and removed when its response arrives,
 * its timeout fires, or sending fails.
 */
const pendingCalls = new Map<
string,
{
// Settles the caller's promise with the response result.
resolve: (value: unknown) => void
// Settles the caller's promise with a failure.
reject: (error: Error) => void
// Timer that fails the call if no response arrives; cleared when the call settles earlier.
timeout: ReturnType<typeof setTimeout>
}
>()
/** Whether the response listener is registered */
let listenerRegistered = false
/**
 * Install the `chrome.runtime.onMessage` listener that settles pending RPC
 * calls. Only the first invocation registers anything; later ones return
 * immediately.
 */
function ensureResponseListener(): void {
	if (listenerRegistered) return
	listenerRegistered = true

	// Routes one 'rpc:response' message to the call waiting on its id.
	const onRuntimeMessage = (message: unknown): void => {
		if (!isExtensionMessage(message) || message.type !== 'rpc:response') return

		const { id, success, result, error } = message as RPCResponseMessage
		const entry = pendingCalls.get(id)
		if (!entry) {
			console.debug('[RPC] Received response for unknown call:', id)
			return
		}

		// The call is settled from here on: drop it and cancel its timeout.
		pendingCalls.delete(id)
		clearTimeout(entry.timeout)

		if (!success) {
			entry.reject(new Error(error || 'RPC call failed'))
			return
		}
		entry.resolve(result)
	}

	chrome.runtime.onMessage.addListener(onRuntimeMessage)
	console.debug('[RPC] Response listener registered')
}
/**
@@ -55,167 +92,97 @@ async function tabExists(tabId: number): Promise<boolean> {
}
/**
* Wrap an RPC call with error handling and retry logic
* Error thrown when RPC call fails
*/
async function withRetry<T>(tabId: number, operation: string, fn: () => Promise<T>): Promise<T> {
/**
 * Error raised for a failed RPC call, carrying a machine-readable `code`
 * next to the human-readable message.
 */
export class RPCError extends Error {
	/** Failure category of this error. */
	public readonly code: 'TAB_CLOSED' | 'CONTENT_SCRIPT_NOT_READY' | 'RPC_FAILED' | 'TIMEOUT'

	/**
	 * @param message Human-readable description of the failure.
	 * @param code Failure category.
	 */
	constructor(
		message: string,
		code: 'TAB_CLOSED' | 'CONTENT_SCRIPT_NOT_READY' | 'RPC_FAILED' | 'TIMEOUT'
	) {
		super(message)
		this.code = code
		this.name = 'RPCError'
	}
}
/**
 * Send one RPC call and wait for its response, without retrying.
 *
 * The call is registered in `pendingCalls` under a fresh message id; the
 * returned promise is settled by the response listener, by the timeout below,
 * or by a failure to send the message.
 *
 * @param tabId Tab the call is addressed to.
 * @param method PageController method to invoke.
 * @param args Positional arguments for the method.
 * @returns The result carried by the matching response.
 * @throws RPCError with code 'TIMEOUT' when no response arrives within
 *         `RPC_CONFIG.callTimeoutMs`; the send error itself when sending fails.
 */
async function callOnce(tabId: number, method: RPCMethod, args: unknown[]): Promise<unknown> {
	ensureResponseListener()

	const callId = generateMessageId()
	const request: RPCCallMessage = { type: 'rpc:call', id: callId, tabId, method, args }

	return new Promise((resolve, reject) => {
		// No response in time: forget the call and fail it.
		const onTimeout = (): void => {
			pendingCalls.delete(callId)
			reject(new RPCError(`RPC ${method} timed out`, 'TIMEOUT'))
		}
		const timer = setTimeout(onTimeout, RPC_CONFIG.callTimeoutMs)

		pendingCalls.set(callId, { resolve, reject, timeout: timer })

		// The message could not be sent: undo the registration and fail the call.
		const onSendFailure = (error: Error): void => {
			pendingCalls.delete(callId)
			clearTimeout(timer)
			reject(error)
		}
		chrome.runtime.sendMessage(request).catch(onSendFailure)
	})
}
/**
* Make an RPC call with retry logic
*/
async function call(tabId: number, method: RPCMethod, args: unknown[]): Promise<unknown> {
let lastError: Error | null = null
for (let attempt = 0; attempt < RPC_CONFIG.maxRetries; attempt++) {
try {
return await fn()
return await callOnce(tabId, method, args)
} catch (error) {
lastError = error as Error
const message = lastError.message || String(error)
// Check if tab still exists
if (!(await tabExists(tabId))) {
throw new RPCError(`Tab ${tabId} was closed during ${operation}`, 'TAB_CLOSED')
throw new RPCError(`Tab ${tabId} was closed`, 'TAB_CLOSED')
}
// Check for content script not ready errors
// Check for retryable errors
if (
message.includes('Could not establish connection') ||
message.includes('Receiving end does not exist')
message.includes('Receiving end does not exist') ||
message.includes('content script not ready')
) {
console.log(
`[RPC] Content script not ready for ${operation}, attempt ${attempt + 1}/${RPC_CONFIG.maxRetries}`
const delay = RPC_CONFIG.retryDelayMs * Math.pow(2, attempt)
console.debug(
`[RPC] Retry ${attempt + 1}/${RPC_CONFIG.maxRetries} for ${method}, waiting ${delay}ms`
)
// Wait before retry with exponential backoff
await sleep(RPC_CONFIG.retryDelayMs * Math.pow(2, attempt))
await sleep(delay)
continue
}
// For other errors, throw immediately
throw new RPCError(`RPC ${operation} failed: ${message}`, 'RPC_FAILED')
// Non-retryable error
throw lastError
}
}
// All retries exhausted
throw new RPCError(
`Content script not ready after ${RPC_CONFIG.maxRetries} attempts for ${operation}`,
`Content script not ready after ${RPC_CONFIG.maxRetries} attempts for ${method}`,
'CONTENT_SCRIPT_NOT_READY'
)
}
/**
* Create an RPC client bound to a specific tab.
* The tabId is captured at creation time to ensure messages are sent to the correct tab
* even if the user switches tabs or the page loses focus.
* RPC client interface matching PageController methods
*/
export function createRPCClient(tabIdPromise: Promise<number>): RPCClient {
return {
// State queries
async getCurrentUrl(): Promise<string> {
const tabId = await tabIdPromise
return withRetry(tabId, 'getCurrentUrl', () =>
pageControllerRPC.sendMessage('rpc:getCurrentUrl', undefined, tabId)
)
},
async getLastUpdateTime(): Promise<number> {
const tabId = await tabIdPromise
return withRetry(tabId, 'getLastUpdateTime', () =>
pageControllerRPC.sendMessage('rpc:getLastUpdateTime', undefined, tabId)
)
},
async getBrowserState(): Promise<BrowserState> {
const tabId = await tabIdPromise
return withRetry(tabId, 'getBrowserState', () =>
pageControllerRPC.sendMessage('rpc:getBrowserState', undefined, tabId)
)
},
// DOM operations
async updateTree(): Promise<string> {
const tabId = await tabIdPromise
return withRetry(tabId, 'updateTree', () =>
pageControllerRPC.sendMessage('rpc:updateTree', undefined, tabId)
)
},
async cleanUpHighlights(): Promise<void> {
const tabId = await tabIdPromise
return withRetry(tabId, 'cleanUpHighlights', () =>
pageControllerRPC.sendMessage('rpc:cleanUpHighlights', undefined, tabId)
)
},
// Element actions
async clickElement(index: number): Promise<ActionResult> {
const tabId = await tabIdPromise
return withRetry(tabId, 'clickElement', () =>
pageControllerRPC.sendMessage('rpc:clickElement', index, tabId)
)
},
async inputText(index: number, text: string): Promise<ActionResult> {
const tabId = await tabIdPromise
return withRetry(tabId, 'inputText', () =>
pageControllerRPC.sendMessage('rpc:inputText', { index, text }, tabId)
)
},
async selectOption(index: number, optionText: string): Promise<ActionResult> {
const tabId = await tabIdPromise
return withRetry(tabId, 'selectOption', () =>
pageControllerRPC.sendMessage('rpc:selectOption', { index, optionText }, tabId)
)
},
async scroll(options: ScrollOptions): Promise<ActionResult> {
const tabId = await tabIdPromise
return withRetry(tabId, 'scroll', () =>
pageControllerRPC.sendMessage('rpc:scroll', options, tabId)
)
},
async scrollHorizontally(options: ScrollHorizontallyOptions): Promise<ActionResult> {
const tabId = await tabIdPromise
return withRetry(tabId, 'scrollHorizontally', () =>
pageControllerRPC.sendMessage('rpc:scrollHorizontally', options, tabId)
)
},
async executeJavascript(script: string): Promise<ActionResult> {
const tabId = await tabIdPromise
return withRetry(tabId, 'executeJavascript', () =>
pageControllerRPC.sendMessage('rpc:executeJavascript', script, tabId)
)
},
// Mask operations
async showMask(): Promise<void> {
const tabId = await tabIdPromise
return withRetry(tabId, 'showMask', () =>
pageControllerRPC.sendMessage('rpc:showMask', undefined, tabId)
)
},
async hideMask(): Promise<void> {
const tabId = await tabIdPromise
// Don't retry hideMask - if content script is gone, mask is already hidden
try {
return await pageControllerRPC.sendMessage('rpc:hideMask', undefined, tabId)
} catch {
// Ignore errors - mask is effectively hidden if content script is gone
}
},
// Lifecycle
async dispose(): Promise<void> {
const tabId = await tabIdPromise
// Don't retry dispose - best effort cleanup
try {
return await pageControllerRPC.sendMessage('rpc:dispose', undefined, tabId)
} catch {
// Ignore errors - resources are already cleaned up if content script is gone
}
},
}
}
export interface RPCClient {
tabId: number
getCurrentUrl(): Promise<string>
getLastUpdateTime(): Promise<number>
getBrowserState(): Promise<BrowserState>
@@ -231,3 +198,80 @@ export interface RPCClient {
hideMask(): Promise<void>
dispose(): Promise<void>
}
/**
 * Build an RPC client whose every method targets the given tab.
 *
 * All methods go through the retrying `call`, except `hideMask` and
 * `dispose`, which make a single attempt and only log a failure.
 *
 * @param tabId Tab that every call made through the client is addressed to.
 * @returns Client exposing the PageController methods as async calls.
 */
export function createRPCClient(tabId: number): RPCClient {
	console.debug(`[RPC] Creating client for tab ${tabId}`)

	// Best effort: one attempt, and a failure is logged instead of thrown
	// (e.g. when the content script is already gone).
	const attemptQuietly = async (
		method: 'hideMask' | 'dispose',
		failureLabel: string
	): Promise<void> => {
		try {
			await callOnce(tabId, method, [])
		} catch (e) {
			console.debug(failureLabel, e)
		}
	}

	const client: RPCClient = {
		tabId,

		// State queries
		getCurrentUrl: async () => call(tabId, 'getCurrentUrl', []) as Promise<string>,
		getLastUpdateTime: async () => call(tabId, 'getLastUpdateTime', []) as Promise<number>,
		getBrowserState: async () => call(tabId, 'getBrowserState', []) as Promise<BrowserState>,

		// DOM operations
		updateTree: async () => call(tabId, 'updateTree', []) as Promise<string>,
		cleanUpHighlights: async () => {
			await call(tabId, 'cleanUpHighlights', [])
		},

		// Element actions
		clickElement: async (index: number) =>
			call(tabId, 'clickElement', [index]) as Promise<ActionResult>,
		inputText: async (index: number, text: string) =>
			call(tabId, 'inputText', [index, text]) as Promise<ActionResult>,
		selectOption: async (index: number, optionText: string) =>
			call(tabId, 'selectOption', [index, optionText]) as Promise<ActionResult>,
		scroll: async (options: ScrollOptions) =>
			call(tabId, 'scroll', [options]) as Promise<ActionResult>,
		scrollHorizontally: async (options: ScrollHorizontallyOptions) =>
			call(tabId, 'scrollHorizontally', [options]) as Promise<ActionResult>,
		executeJavascript: async (script: string) =>
			call(tabId, 'executeJavascript', [script]) as Promise<ActionResult>,

		// Mask operations
		showMask: async () => {
			await call(tabId, 'showMask', [])
		},
		hideMask: () => attemptQuietly('hideMask', '[RPC] hideMask failed (ignored):'),

		// Lifecycle
		dispose: () => attemptQuietly('dispose', '[RPC] dispose failed (ignored):'),
	}

	return client
}

View File

@@ -1,97 +1,116 @@
# PageAgentExt Architecture
This document describes the architecture of the Chrome extension version of PageAgent, including environment definitions, communication protocols, and extension considerations.
This document describes the MV3-compliant architecture of the Chrome extension version of PageAgent.
## Design Principles
The architecture follows Chrome MV3 Service Worker constraints:
1. **Service Worker is stateless** - No long-running loops, no in-memory state
2. **Agent runs in frontend context** - SidePanel hosts all agent logic
3. **SW is a message relay** - Only forwards messages between contexts
4. **Event-driven** - All operations are triggered by user actions or message events
## Environment Definitions
The extension operates across three isolated JavaScript contexts:
### 1. Background (Service Worker)
**File:** `src/entrypoints/background.ts`
**Responsibilities:**
- Hosts the headless `PageAgentCore` instance
- Manages agent lifecycle (create, execute, stop, dispose)
- Stores LLM configuration in `chrome.storage.local`
- Receives commands from SidePanel via messaging
- Broadcasts events to SidePanel for UI updates
- Uses `RemotePageController` to proxy DOM operations to ContentScript
**Key Components:**
- `PageAgentCore` - The AI agent (from `@page-agent/core`)
- `RemotePageController` - Proxy that forwards calls to ContentScript
- Command handlers for `agent:execute`, `agent:stop`, `agent:configure`
### 2. Content Script
**File:** `src/entrypoints/content.ts`
**Responsibilities:**
- Runs in the context of web pages
- Hosts the real `PageController` instance (lazy-initialized)
- Performs actual DOM operations (click, input, scroll, etc.)
- Responds to RPC messages from Background
- Manages visual mask overlay during automation
**Key Components:**
- `PageController` - DOM controller (from `@page-agent/page-controller`)
- RPC handlers for all PageController methods
**Lifecycle:** PageController is created lazily on first RPC call and disposed between tasks. This ensures clean state for each task and enables future multi-page support.
### 3. Side Panel (React UI)
### 1. Side Panel (Frontend - Agent Host)
**Files:** `src/entrypoints/sidepanel/`
**Responsibilities:**
- Provides user interface for controlling the agent
- Displays task input and execution history
- Shows real-time agent activity (thinking, executing, etc.)
- Manages LLM configuration settings
- Sends commands to Background and receives event updates
- Hosts `PageAgentCore` instance and main execution loop
- Manages `TabsManager` for multi-tab control
- Uses `RemotePageController` to proxy DOM operations via SW
- Stores agent state (task, history, status)
- Provides React UI for user interaction
- Handles `shouldShowMask` queries from content scripts
**Key Components:**
- `App.tsx` - Main React component with chat-style UI
- `ConfigPanel` - Settings form for LLM configuration
- Event subscription for real-time updates
- `AgentController` - Encapsulates agent lifecycle, isolated from UI
- `useAgent` hook - React integration for AgentController
- `App.tsx` - Main UI component
- `ConfigPanel` - LLM settings
## Communication Architecture
**Lifecycle:** When sidepanel closes, agent disposes naturally. No state persists in SW.
### 2. Background (Service Worker - Stateless Relay)
**File:** `src/entrypoints/background.ts`
**Responsibilities:**
- Relays RPC messages from SidePanel to ContentScript
- Forwards tab events (onRemoved, onUpdated) to SidePanel
- Opens sidepanel on action click
- **NO** agent logic, **NO** state
**Message Flows:**
```
SidePanel → SW → ContentScript (RPC calls)
ContentScript → SW → SidePanel (mask state queries)
SW → SidePanel (tab events)
```
### 3. Content Script
**File:** `src/entrypoints/content.ts`
**Responsibilities:**
- Runs in web page context
- Hosts real `PageController` instance (lazy-initialized)
- Handles RPC messages for DOM operations
- Queries SidePanel for mask state on page load
- Manages visual mask overlay
**Lifecycle:** PageController is created on first RPC call and disposed between tasks.
## Architecture Diagram
```
┌─────────────────────────────────────────────────────────────────┐
Side Panel
│ ┌──────────────┐ ┌──────────────┐ ┌───────────────────────┐ │
│ │ Task Input │ Event Stream │ History Display │ │
─────────────┘ └───────────── └───────────────────────
└─────────┼─────────────────┼─────────────────────────────────────┘
│ Commands │ Events
│ Side Panel (Frontend)
│ ┌────────────────────────────────────────────────────────────┐ │
│ │ AgentController │ │
│ ┌──────────────┐ ┌────────────── ──────────────────┐ │
│ │ │ PageAgentCore│ │ TabsManager │ │RemotePageController│ │ │
└──────────────┘ └──────────────┘ └────────┬─────────┘ │ │
│ └───────────────────────────────────────────────┼────────────┘ │
│ │ │
│ ┌──────────────┐ ┌──────────────┐ │ │
│ │ React UI │ │ Query Handler│◄─────────────┼───────────┐ │
│ │ (App.tsx) │ │(shouldShowMask) │ │ │
│ └──────────────┘ └──────────────┘ │ │ │
└──────────────────────────────────────────────────┼───────────┼───┘
│ │
RPC Call │ Query │
▼ │
┌─────────────────────────────────────────────────────────────────┐
Background
┌──────────────────────────────────────────────────────────┐
PageAgentCore
│ ┌─────────────┐ ┌─────────────┐ ┌──────────────────┐
LLM Tools │ │ RemotePageCtrl │ │
│ └─────────────┘ └─────────────┘ └───────┬────────
└─────────────────────────────────────────────┼────────────┘
└────────────────────────────────────────────────┼────────────────
RPC
│ Background (Service Worker)
┌────────────────┐
│ Message Relay │
│ (stateless) │
└───────┬────────┘
│ Tab Events ─────────────────┼─────────────────► SidePanel │
(onRemoved, onUpdated)
└──────────────────────────────┼───────────────────────────────────┘
│ RPC Forward
┌─────────────────────────────────────────────────────────────────┐
│ Content Script │
│ ┌──────────────────────────────────────────────────────────┐
│ ┌────────────────────────────────────────────────────────────┐ │
│ │ PageController │ │
│ │ ┌─────────────┐ ┌─────────────┐ ┌──────────────────┐ │ │
│ │ │ DOM Tree │ │ Actions │ │ Mask │ │ │
│ │ └─────────────┘ └─────────────┘ └──────────────────┘ │ │
│ └──────────────────────────────────────────────────────────┘
│ └────────────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────────┘
@@ -103,106 +122,65 @@ The extension operates across three isolated JavaScript contexts:
## Message Protocol
All cross-context communication uses `@webext-core/messaging` for type safety.
All messages use a simple type-based protocol defined in `src/messaging/protocol.ts`.
### Protocol Definition
### Message Types
**File:** `src/messaging/protocol.ts`
| Type | Direction | Purpose |
|------|-----------|---------|
| `rpc:call` | SidePanel → SW | Request to call PageController method |
| `rpc:response` | SW → SidePanel | Response from PageController |
| `cs:rpc` | SW → ContentScript | Forwarded RPC call |
| `cs:query` | ContentScript → SW | Query to SidePanel (e.g., shouldShowMask) |
| `query:response` | SW → ContentScript | Response to query |
| `tab:event` | SW → SidePanel | Tab removed/updated notification |
### 1. RPC Protocol (Background → ContentScript)
### RPC Methods
Used by `RemotePageController` to call `PageController` methods.
All PageController methods are available via RPC:
```typescript
interface PageControllerRPCProtocol {
// State queries
'rpc:getCurrentUrl': () => string
'rpc:getLastUpdateTime': () => number
'rpc:getBrowserState': () => BrowserState
// DOM operations
'rpc:updateTree': () => string
'rpc:cleanUpHighlights': () => void
// Element actions
'rpc:clickElement': (index: number) => ActionResult
'rpc:inputText': (data: { index: number; text: string }) => ActionResult
'rpc:selectOption': (data: { index: number; optionText: string }) => ActionResult
'rpc:scroll': (options: ScrollOptions) => ActionResult
'rpc:scrollHorizontally': (options: ScrollHorizontallyOptions) => ActionResult
'rpc:executeJavascript': (script: string) => ActionResult
// Mask operations
'rpc:showMask': () => void
'rpc:hideMask': () => void
// Lifecycle
'rpc:dispose': () => void
}
```
### 2. Command Protocol (SidePanel → Background)
Used by SidePanel UI to control the agent.
```typescript
interface AgentCommandProtocol {
'agent:execute': (task: string) => void
'agent:stop': () => void
'agent:getState': () => AgentState
'agent:configure': (config: LLMConfig) => void
}
```
### 3. Event Protocol (Background → SidePanel)
Used by Background to push updates to SidePanel.
```typescript
interface AgentEventProtocol {
'event:status': (status: AgentStatus) => void
'event:history': (history: HistoricalEvent[]) => void
'event:activity': (activity: AgentActivity) => void
'event:stateSnapshot': (state: AgentState) => void
}
```
- State: `getCurrentUrl`, `getLastUpdateTime`, `getBrowserState`
- DOM: `updateTree`, `cleanUpHighlights`
- Actions: `clickElement`, `inputText`, `selectOption`, `scroll`, `scrollHorizontally`, `executeJavascript`
- Mask: `showMask`, `hideMask`
- Lifecycle: `dispose`
## Communication Flow
### Task Execution Flow
### Task Execution
```
1. User enters task in SidePanel
└─> SidePanel sends 'agent:execute' command
└─> AgentController.execute(task)
2. Background receives command
├─> Creates PageAgentCore with RemotePageController
└─> Starts task execution
2. AgentController creates agent instances
├─> new PageAgentCore()
├─> new TabsManager()
└─> new RemotePageController()
3. Agent executes step loop:
├─> LLM generates next action
├─> Agent calls RemotePageController method
│ └─> RPC message sent to ContentScript
├─> RemotePageController.method() called
│ └─> RPC message → SW → ContentScript
├─> ContentScript executes on real PageController
│ └─> RPC response returned
│ └─> Response → SW → SidePanel
├─> Agent updates history
└─> Background broadcasts events to SidePanel
└─> React UI re-renders via events
4. SidePanel receives events
└─> Updates UI (status, history, activity)
5. Task completes or user stops
└─> Agent disposes, status changes to idle/completed/error
4. Task completes or user stops
└─> Agent disposes, status changes
```
### Configuration Flow
### Page Reload During Task
```
1. User opens Settings in SidePanel
2. User enters API credentials
3. SidePanel sends 'agent:configure' command
4. Background saves config to chrome.storage.local
5. Next agent creation uses new config
1. Page reloads/navigates
2. Content script initializes
3. Content script queries: shouldShowMask?
└─> cs:query → SW → SidePanel
4. SidePanel checks whether the tab is the current target tab and the agent is running
└─> query:response → SW → ContentScript
5. Content script shows/hides mask accordingly
```
## File Structure
@@ -210,99 +188,85 @@ interface AgentEventProtocol {
```
packages/extension/src/
├── agent/
│ └── RemotePageController.ts # Proxy for PageController
│ ├── RemotePageController.ts # Proxy for PageController RPC
│ ├── TabsManager.ts # Multi-tab management
│ └── tabTools.ts # Agent tools for tab control
├── entrypoints/
│ ├── background.ts # Service worker
│ ├── content.ts # Content script
│ ├── background.ts # Stateless SW relay
│ ├── content.ts # Content script with PageController
│ └── sidepanel/
│ ├── AgentController.ts # Agent lifecycle management
│ ├── useAgent.ts # React hook for agent
│ ├── App.tsx # Main UI component
│ ├── components/
│ │ ├── ConfigPanel.tsx
│ │ ├── cards/
│ │ └── index.tsx
│ ├── index.html
── main.tsx
│ └── App.tsx # Main UI component
── main.tsx
├── messaging/
│ ├── protocol.ts # Message type definitions
│ ├── rpc.ts # RPC client for PageController
│ ├── events.ts # Event broadcasting utilities
│ └── index.ts # Module exports
│ ├── rpc.ts # RPC client for SidePanel
│ └── index.ts
├── components/ui/ # shadcn components
├── lib/utils.ts # Utility functions
└── assets/index.css # Tailwind styles
├── lib/utils.ts
└── utils/constants.ts
```
## Design Decisions
### Tab ID Binding
### Why Agent in SidePanel?
**Problem:** When a task completes while the page is not focused (user switched tabs), RPC messages like `hideMask` or `dispose` would be sent to the wrong tab because `chrome.tabs.query({ active: true })` returns the currently active tab, not the original target tab.
MV3 Service Workers have strict lifecycle constraints:
- Terminate after ~30s of inactivity
- Cannot maintain long-running loops
- State is lost on termination
**Solution:** `RemotePageController` captures the target tab ID at construction time and binds it to its RPC client. All subsequent RPC calls use this fixed tab ID regardless of which tab is currently active.
By hosting the agent in SidePanel (a visible frontend page), we get:
- Persistent execution while panel is open
- Natural disposal when panel closes
- No SW wake-up complexity
```
Task starts → RemotePageController created → tabId captured (e.g., 123)
User switches to another tab (456 is now active)
Task completes → hideMask RPC sent to tab 123 (correct!)
```
### Agent Isolation from UI
### Lazy PageController Lifecycle
`AgentController` is a separate class from the React UI for:
- **Testability** - Can test agent logic without React
- **Portability** - Future: move agent to popup, options page, or external page
- **Clean separation** - UI concerns don't pollute agent logic
**Problem:** PageController was created once when content script loaded and persisted until page unload. If the mask was disposed mid-task, subsequent tasks couldn't show it again.
### Simplified Messaging
**Solution:** PageController is now lazy-initialized on first RPC call and fully disposed between tasks. Each task gets a fresh PageController instance with its own mask.
The previous architecture had complex retry/wake-up logic for the Service Worker (SW). In the new architecture:
- SW is stateless, always ready
- No ping/wake-up needed
- Simple request-response pattern
- Retry logic only for content script initialization
```
Task 1: showMask → creates PageController + Mask → execute → hideMask → dispose → null
Task 2: showMask → creates new PageController + Mask → ...
```
## Multi-Tab Control
This also prepares for future multi-page workflows where PageController may need to be recreated when navigating between pages.
### Tab Types
## Extension Considerations
- **Initial Tab** - Where user started the task
- **Managed Tabs** - Tabs opened by agent via `open_new_tab`
### Current Limitations (v1)
### Tab Grouping
1. **Single page control only** - Agent controls the active tab where SidePanel was opened
2. **No cross-tab navigation** - Cannot follow links that open in new tabs
3. **Session-based** - Agent state is not persisted across extension restarts
Agent-opened tabs are grouped in a Chrome tab group named `Task(<taskId>)`.
### Future Extension Points
### Tab Switching
#### Multi-tab Control
The agent can switch only to the initial tab and to managed tabs. This prevents it from accessing unrelated tabs.
To support controlling multiple tabs:
## Configuration
1. Add `tabId` parameter to RPC messages
2. Track tab-to-controller mapping in Background
3. Allow SidePanel to switch between controlled tabs
LLM config (apiKey, baseURL, model) is stored in `chrome.storage.local`. This persists across sessions and is managed via the ConfigPanel.
#### Persistent Sessions
## Security
To persist agent sessions:
1. Store session state in `chrome.storage.local`
2. Restore agent on extension startup
3. Handle service worker restarts gracefully
#### Cross-tab Navigation
To follow links in new tabs:
1. Listen to `chrome.tabs.onCreated` events
2. Inject content script into new tabs
3. Transfer control to new tab when navigation occurs
#### Screenshot/Vision Support
To add visual context for the agent:
1. Use `chrome.tabs.captureVisibleTab` for screenshots
2. Send images to vision-capable LLM models
3. Add screenshot tool to agent toolkit
## Security Considerations
1. **API Key Storage** - Keys stored in `chrome.storage.local` (extension-only access)
2. **Content Script Isolation** - Runs in isolated world, not accessible to page scripts
3. **Message Validation** - Only trusted extension contexts can send/receive messages
4. **Permission Scope** - Request minimal permissions needed for functionality
1. **API Key Storage** - Keys in `chrome.storage.local` (extension-only access)
2. **Content Script Isolation** - Runs in isolated world
3. **Tab Restriction** - The agent can only control tabs it opened or the tab it started from
4. **No Arbitrary Tab Access** - Cannot switch to unmanaged tabs
## Development

View File

@@ -15,6 +15,9 @@ export default defineConfig({
},
vite: () => ({
plugins: [tailwindcss()],
optimizeDeps: {
force: true,
},
build: {
minify: false,
chunkSizeWarningLimit: 2000,
@@ -32,7 +35,7 @@ export default defineConfig({
description:
'AI-powered browser automation assistant. Control web pages with natural language.',
homepage_url: 'https://alibaba.github.io/page-agent/',
permissions: ['tabs', 'sidePanel', 'storage'],
permissions: ['tabs', 'tabGroups', 'sidePanel', 'storage'],
host_permissions: ['<all_urls>'],
icons: {
64: 'assets/page-agent-64.png',