feat: multi tabs control

This commit is contained in:
Simon
2026-01-24 19:29:27 +08:00
parent 2aa9c3b978
commit fa5ab9d567
17 changed files with 2303 additions and 1061 deletions

View File

@@ -4,7 +4,11 @@
* This class implements the same interface as PageController but forwards * This class implements the same interface as PageController but forwards
* all method calls via RPC to the real PageController running in ContentScript. * all method calls via RPC to the real PageController running in ContentScript.
* This allows PageAgentCore to work transparently with remote DOM operations. * This allows PageAgentCore to work transparently with remote DOM operations.
*
* Tab targeting is managed externally by TabsManager via setTargetTab().
*/ */
import type { PageController } from '@page-agent/page-controller'
import type { import type {
ActionResult, ActionResult,
BrowserState, BrowserState,
@@ -13,6 +17,32 @@ import type {
} from '../messaging/protocol' } from '../messaging/protocol'
import { type RPCClient, createRPCClient } from '../messaging/rpc' import { type RPCClient, createRPCClient } from '../messaging/rpc'
const DEBUG_PREFIX = '[RemotePageController]'
/**
 * Check whether a URL can host this extension's content script.
 *
 * Browsers refuse content-script injection on internal/privileged schemes and on
 * the Chrome Web Store, even though the store is served over http(s).
 *
 * @param url - Tab URL to test; `undefined` or an empty string is treated as not injectable
 * @returns `true` when no restricted pattern matches the URL
 */
export function isContentScriptAllowed(url: string | undefined): boolean {
	if (!url) return false

	// Restricted URL patterns
	const restrictedPatterns = [
		/^chrome:\/\//,
		/^chrome-extension:\/\//,
		/^about:/,
		/^edge:\/\//,
		/^brave:\/\//,
		/^opera:\/\//,
		/^vivaldi:\/\//,
		/^file:\/\//,
		/^view-source:/,
		/^devtools:\/\//,
		// Extension gallery pages: ordinary https URLs, but injection is blocked there.
		/^https?:\/\/chromewebstore\.google\.com(?:[/:?#]|$)/,
		/^https?:\/\/chrome\.google\.com\/webstore(?:[/?#]|$)/,
	]

	return !restrictedPatterns.some((pattern) => pattern.test(url))
}
/** /**
* RemotePageController is a proxy that implements the PageController interface. * RemotePageController is a proxy that implements the PageController interface.
* All methods are async and forward to ContentScript via RPC. * All methods are async and forward to ContentScript via RPC.
@@ -20,30 +50,133 @@ import { type RPCClient, createRPCClient } from '../messaging/rpc'
* This class extends EventTarget to maintain API compatibility with PageController, * This class extends EventTarget to maintain API compatibility with PageController,
* though events in the remote context are not currently bridged. * though events in the remote context are not currently bridged.
*/ */
export class RemotePageController extends EventTarget { export class RemotePageController {
private rpc: RPCClient private rpc: RPCClient | null = null
private _tabId: number | null = null private _currentTabId: number | null = null
private _tabIdPromise: Promise<number> private _currentTabUrl: string | undefined = undefined
private _previousTabId: number | null = null
/** Get the target tab ID (null if not yet resolved) */ /** Get the current target tab ID */
get tabId(): number | null { get currentTabId(): number | null {
return this._tabId return this._currentTabId
} }
/** Get the promise that resolves to the target tab ID */ /** Get the current target tab URL */
get tabIdPromise(): Promise<number> { get currentTabUrl(): string | undefined {
return this._tabIdPromise return this._currentTabUrl
} }
constructor() { /** Check if current tab supports content scripts */
super() get isCurrentTabAccessible(): boolean {
// Capture the active tab ID at construction time to avoid issues when tab loses focus return isContentScriptAllowed(this._currentTabUrl)
this._tabIdPromise = chrome.tabs.query({ active: true, currentWindow: true }).then(([tab]) => { }
if (!tab?.id) throw new Error('No active tab found')
this._tabId = tab.id // Tab ID is now set externally via setTargetTab()
return tab.id
}) /**
this.rpc = createRPCClient(this._tabIdPromise) * Set the target tab for all RPC operations.
* Called by TabsManager when switching tabs.
* Handles cleanup on old tab and mask show on new tab.
*/
async setTargetTab(tabId: number): Promise<void> {
	// Snapshot the outgoing target first: both fields are reassigned further down.
	const previousTabId = this._currentTabId
	const previousRpc = this.rpc

	// An explicit separator keeps the two IDs from running together in the log.
	console.debug(`${DEBUG_PREFIX} setTargetTab: ${previousTabId} -> ${tabId}`)

	// Clean up old tab completely (highlights + mask). Each step is best-effort:
	// the previous tab may already be closed or navigated away.
	if (previousTabId && previousTabId !== tabId && previousRpc) {
		console.debug(`${DEBUG_PREFIX} Cleaning up previous tab ${previousTabId}`)
		try {
			// Clean up highlights first - this is important for visual cleanup
			await previousRpc.cleanUpHighlights()
		} catch (e) {
			console.debug(
				`${DEBUG_PREFIX} cleanUpHighlights on tab ${previousTabId} failed (ignored):`,
				e
			)
		}
		try {
			await previousRpc.hideMask()
		} catch (e) {
			console.debug(`${DEBUG_PREFIX} hideMask on tab ${previousTabId} failed (ignored):`, e)
		}
	}

	// Get tab info to check URL. A rejection here propagates to the caller and
	// leaves the previous target state untouched.
	const tab = await chrome.tabs.get(tabId)
	const tabUrl = tab.url

	// Update state
	this._previousTabId = previousTabId
	this._currentTabId = tabId
	this._currentTabUrl = tabUrl

	// Check if this tab can run content scripts
	if (!isContentScriptAllowed(tabUrl)) {
		console.debug(`${DEBUG_PREFIX} Tab ${tabId} cannot run content scripts: ${tabUrl}`)
		// Clear RPC - operations will return restricted page state
		this.rpc = null
		return
	}

	// Create new RPC client for the new tab
	this.rpc = createRPCClient(tabId)

	// Probe the content script with a cheap call before relying on it.
	try {
		await this.rpc.getLastUpdateTime()
		console.debug(`${DEBUG_PREFIX} Content script ready on tab ${tabId}`)
	} catch (error) {
		console.error(`${DEBUG_PREFIX} Content script not ready on tab ${tabId}:`, error)
		// Don't clear rpc - subsequent calls will retry and may succeed
	}

	// Show mask on new tab
	try {
		await this.rpc.showMask()
		console.debug(`${DEBUG_PREFIX} Mask shown on tab ${tabId}`)
	} catch (error) {
		console.error(`${DEBUG_PREFIX} Failed to show mask on tab ${tabId}:`, error)
		// Continue anyway - mask is optional
	}

	console.debug(`${DEBUG_PREFIX} Target tab set to ${tabId}`)
}
/**
 * Guard for operations that require a target tab.
 * Passes silently once a tab ID has been recorded.
 * @throws Error if no target tab has been set via setTargetTab()
 */
private ensureInitialized(): void {
	if (this._currentTabId) return

	throw new Error('RemotePageController not initialized. Call setTargetTab() first.')
}
/**
 * Build the placeholder browser state reported when there is no RPC client,
 * i.e. the page cannot be inspected. Only the URL is filled in; the page is
 * presented as empty instead of raising an error.
 */
private createRestrictedPageState(): BrowserState {
	const placeholder: BrowserState = {
		url: this._currentTabUrl ? this._currentTabUrl : '',
		title: '',
		header: '',
		content: '(empty page)',
		footer: '',
	}
	return placeholder
}
/**
* Create a no-op action result for restricted pages
*/
private createRestrictedActionResult(action: string): ActionResult {
return {
success: false,
message: `Cannot ${action} on this page. Use open_new_tab to navigate to a web page first.`,
}
} }
// ======= State Queries ======= // ======= State Queries =======
@@ -52,13 +185,15 @@ export class RemotePageController extends EventTarget {
* Get current page URL * Get current page URL
*/ */
async getCurrentUrl(): Promise<string> { async getCurrentUrl(): Promise<string> {
return this.rpc.getCurrentUrl() // Can return URL even for restricted pages
return this._currentTabUrl || ''
} }
/** /**
* Get last tree update timestamp * Get last tree update timestamp
*/ */
async getLastUpdateTime(): Promise<number> { async getLastUpdateTime(): Promise<number> {
if (!this.rpc) return Date.now()
return this.rpc.getLastUpdateTime() return this.rpc.getLastUpdateTime()
} }
@@ -66,6 +201,10 @@ export class RemotePageController extends EventTarget {
* Get structured browser state for LLM consumption. * Get structured browser state for LLM consumption.
*/ */
async getBrowserState(): Promise<BrowserState> { async getBrowserState(): Promise<BrowserState> {
// Return restricted page state if content scripts cannot run
if (!this.rpc) {
return this.createRestrictedPageState()
}
return this.rpc.getBrowserState() return this.rpc.getBrowserState()
} }
@@ -75,6 +214,8 @@ export class RemotePageController extends EventTarget {
* Update DOM tree, returns simplified HTML for LLM. * Update DOM tree, returns simplified HTML for LLM.
*/ */
async updateTree(): Promise<string> { async updateTree(): Promise<string> {
this.ensureInitialized()
if (!this.rpc) return '(empty page)'
return this.rpc.updateTree() return this.rpc.updateTree()
} }
@@ -82,6 +223,7 @@ export class RemotePageController extends EventTarget {
* Clean up all element highlights * Clean up all element highlights
*/ */
async cleanUpHighlights(): Promise<void> { async cleanUpHighlights(): Promise<void> {
if (!this.rpc) return
return this.rpc.cleanUpHighlights() return this.rpc.cleanUpHighlights()
} }
@@ -91,6 +233,8 @@ export class RemotePageController extends EventTarget {
* Click element by index * Click element by index
*/ */
async clickElement(index: number): Promise<ActionResult> { async clickElement(index: number): Promise<ActionResult> {
this.ensureInitialized()
if (!this.rpc) return this.createRestrictedActionResult('click')
return this.rpc.clickElement(index) return this.rpc.clickElement(index)
} }
@@ -98,6 +242,8 @@ export class RemotePageController extends EventTarget {
* Input text into element by index * Input text into element by index
*/ */
async inputText(index: number, text: string): Promise<ActionResult> { async inputText(index: number, text: string): Promise<ActionResult> {
this.ensureInitialized()
if (!this.rpc) return this.createRestrictedActionResult('input text')
return this.rpc.inputText(index, text) return this.rpc.inputText(index, text)
} }
@@ -105,6 +251,8 @@ export class RemotePageController extends EventTarget {
* Select dropdown option by index and option text * Select dropdown option by index and option text
*/ */
async selectOption(index: number, optionText: string): Promise<ActionResult> { async selectOption(index: number, optionText: string): Promise<ActionResult> {
this.ensureInitialized()
if (!this.rpc) return this.createRestrictedActionResult('select option')
return this.rpc.selectOption(index, optionText) return this.rpc.selectOption(index, optionText)
} }
@@ -112,6 +260,8 @@ export class RemotePageController extends EventTarget {
* Scroll vertically * Scroll vertically
*/ */
async scroll(options: ScrollOptions): Promise<ActionResult> { async scroll(options: ScrollOptions): Promise<ActionResult> {
this.ensureInitialized()
if (!this.rpc) return this.createRestrictedActionResult('scroll')
return this.rpc.scroll(options) return this.rpc.scroll(options)
} }
@@ -119,6 +269,8 @@ export class RemotePageController extends EventTarget {
* Scroll horizontally * Scroll horizontally
*/ */
async scrollHorizontally(options: ScrollHorizontallyOptions): Promise<ActionResult> { async scrollHorizontally(options: ScrollHorizontallyOptions): Promise<ActionResult> {
this.ensureInitialized()
if (!this.rpc) return this.createRestrictedActionResult('scroll')
return this.rpc.scrollHorizontally(options) return this.rpc.scrollHorizontally(options)
} }
@@ -126,6 +278,8 @@ export class RemotePageController extends EventTarget {
* Execute arbitrary JavaScript on the page * Execute arbitrary JavaScript on the page
*/ */
async executeJavascript(script: string): Promise<ActionResult> { async executeJavascript(script: string): Promise<ActionResult> {
this.ensureInitialized()
if (!this.rpc) return this.createRestrictedActionResult('execute script')
return this.rpc.executeJavascript(script) return this.rpc.executeJavascript(script)
} }
@@ -135,6 +289,7 @@ export class RemotePageController extends EventTarget {
* Show the visual mask overlay. * Show the visual mask overlay.
*/ */
async showMask(): Promise<void> { async showMask(): Promise<void> {
if (!this.rpc) return
return this.rpc.showMask() return this.rpc.showMask()
} }
@@ -142,15 +297,38 @@ export class RemotePageController extends EventTarget {
* Hide the visual mask overlay. * Hide the visual mask overlay.
*/ */
async hideMask(): Promise<void> { async hideMask(): Promise<void> {
if (!this.rpc) return
return this.rpc.hideMask() return this.rpc.hideMask()
} }
/** /**
* Dispose and clean up resources * Dispose and clean up resources on current tab
*/ */
dispose(): void { dispose(): void {
this.rpc.dispose().catch(() => { console.debug(`${DEBUG_PREFIX} dispose() called, current tab: ${this._currentTabId}`)
// Ignore errors on dispose if (this.rpc) {
}) this.rpc.dispose().catch((e) => {
console.debug(`${DEBUG_PREFIX} dispose RPC failed (ignored):`, e)
})
}
this._currentTabId = null
this._previousTabId = null
this.rpc = null
}
/**
* Dispose PageController on a specific tab (cleanup for multi-tab scenarios)
*/
async disposeTab(tabId: number): Promise<void> {
console.debug(`${DEBUG_PREFIX} disposeTab(${tabId})`)
try {
const rpc = createRPCClient(tabId)
await rpc.cleanUpHighlights()
await rpc.hideMask()
await rpc.dispose()
console.debug(`${DEBUG_PREFIX} Tab ${tabId} disposed successfully`)
} catch (e) {
console.debug(`${DEBUG_PREFIX} disposeTab(${tabId}) failed (ignored):`, e)
}
} }
} }

View File

@@ -0,0 +1,566 @@
/**
* TabsManager - Manages multiple browser tabs for agent automation
*
* Responsibilities:
* - Maintain initialTabId (tab where user started the task)
* - Maintain currentTabId (current operation target)
* - Maintain currentTabHistory (history stack for fallback)
* - Maintain managedTabIds (tabs opened by agent)
* - Manage Chrome Tab Group (named "Task(<taskId>)")
* - Listen to chrome.tabs.onRemoved for tab close handling
*/
import { type RemotePageController, isContentScriptAllowed } from './RemotePageController'
const DEBUG_PREFIX = '[TabsManager]'
/**
 * Tab info for display in browser state.
 * One entry is produced per controlled tab by TabsManager.getTabList() and
 * for each newly opened tab recorded in TabChanges.opened.
 */
export interface TabInfo {
/** Chrome tab ID */
id: number
/** Tab URL at the time the entry was built (empty string when Chrome reports none) */
url: string
/** Tab title at the time the entry was built */
title: string
/** True for the tab that was active when the manager was initialized */
isInitial: boolean
/** True when this tab is the manager's current operation target */
isCurrent: boolean
/** Whether content scripts can run on this page */
isAccessible: boolean
}
/**
 * Changes since last getAndClearChanges() call.
 * Accumulated by TabsManager and handed out (then reset) as one snapshot.
 */
export interface TabChanges {
/** Tabs opened through openNewTab() */
opened: TabInfo[]
/** Tabs that were closed, described by their last cached URL and title */
closed: { id: number; url: string; title: string }[]
/**
 * Set when the current tab changed as a side effect of a tab closing.
 * NOTE(review): within TabsManager only the 'user_close' reason is ever written;
 * 'explicit' appears to be reserved for other callers - confirm.
 */
currentSwitched?: { from: number; to: number; reason: 'user_close' | 'explicit' }
}
/** Color names that may be assigned to the task's tab group */
const TAB_GROUP_COLORS = [
	'grey',
	'blue',
	'red',
	'yellow',
	'green',
	'pink',
	'purple',
	'cyan',
] as const

/** Union of the literal color names listed above */
type TabGroupColor = (typeof TAB_GROUP_COLORS)[number]

/** Pick one of the supported tab group colors uniformly at random. */
function randomColor(): TabGroupColor {
	const pickedIndex = Math.floor(Math.random() * TAB_GROUP_COLORS.length)
	return TAB_GROUP_COLORS[pickedIndex]
}
export class TabsManager {
/** Tab where user started the task */
private initialTabId: number | null = null
/** Current operation target tab */
private currentTabId: number | null = null
/** History stack for current tab (for fallback on close) */
private currentTabHistory: number[] = []
/** Tabs opened by agent (not including initial tab) */
private managedTabIds = new Set<number>()
/** Tab group ID for managed tabs */
private tabGroupId: number | null = null
/** Task ID for group naming */
private taskId: string = ''
/** Reference to RemotePageController for tab switching */
private pageController: RemotePageController | null = null
/** Pending changes for observation generation */
private pendingChanges: TabChanges = { opened: [], closed: [] }
/** Tab info cache for closed tab reporting */
private tabInfoCache = new Map<number, { url: string; title: string }>()
/** Whether manager is disposed */
private disposed = false
/** Bound handler for cleanup */
private onTabRemovedHandler: (tabId: number) => void
constructor() {
	// Keep one stable function reference so the same listener that gets
	// registered on chrome.tabs.onRemoved can later be removed again.
	this.onTabRemovedHandler = (tabId: number) => this.onTabRemoved(tabId)
}
/**
 * Initialize the manager with the currently active tab.
 *
 * The active tab of the current window becomes both the initial tab and the
 * current tab, and the page controller is pointed at it. All per-task state is
 * reset, so calling init() again starts from a clean slate.
 *
 * @param taskId - Task identifier, used when naming the tab group
 * @param pageController - Controller whose target tab this manager drives
 * @throws Error when no active tab can be found
 */
async init(taskId: string, pageController: RemotePageController): Promise<void> {
	this.taskId = taskId
	this.pageController = pageController
	this.disposed = false

	// Get current active tab as initial tab
	const [activeTab] = await chrome.tabs.query({
		active: true,
		currentWindow: true,
	})
	if (!activeTab?.id) {
		throw new Error('No active tab found')
	}

	this.initialTabId = activeTab.id
	this.currentTabId = activeTab.id
	this.currentTabHistory = []
	this.managedTabIds.clear()
	this.pendingChanges = { opened: [], closed: [] }
	// Also drop leftovers that dispose() would have cleared: without this a
	// re-init would add new tabs to the previous task's group and keep stale
	// tab info around.
	this.tabGroupId = null
	this.tabInfoCache.clear()

	// Cache initial tab info
	this.tabInfoCache.set(activeTab.id, {
		url: activeTab.url || '',
		title: activeTab.title || '',
	})

	// Set target tab on page controller
	await pageController.setTargetTab(activeTab.id)

	// Register tab removal listener (only once, even if init() runs again)
	if (!chrome.tabs.onRemoved.hasListener(this.onTabRemovedHandler)) {
		chrome.tabs.onRemoved.addListener(this.onTabRemovedHandler)
	}

	console.debug(`${DEBUG_PREFIX} Initialized with tab:`, activeTab.id)
}
/**
 * Open a background tab for `url`, track it as agent-managed, wait for it to
 * finish loading and make it the current tab.
 *
 * @param url - URL to load in the new tab
 * @returns The new tab's ID and a human-readable result message
 * @throws Error when the manager is not initialized, the tab cannot be created,
 *   or the tab does not finish loading in time
 */
async openNewTab(url: string): Promise<{ tabId: number; message: string }> {
	if (!this.initialTabId || !this.pageController) {
		throw new Error('TabsManager not initialized')
	}

	// The tab is created inactive: focus is driven by the agent via the mask.
	const openerTabId = this.currentTabId ?? this.initialTabId
	const created = await chrome.tabs.create({ url, active: false, openerTabId })
	if (!created.id) {
		throw new Error('Failed to create new tab')
	}
	const tabId = created.id

	// Track the tab and place it in the task's tab group.
	this.managedTabIds.add(tabId)
	await this.ensureTabGroup(tabId)

	// Only switch once loading has completed, so the content script can answer.
	await this.waitForTabComplete(tabId)

	// Re-read the tab: URL and title may have changed while loading.
	const loaded = await chrome.tabs.get(tabId)
	const finalUrl = loaded.url || url
	const finalTitle = loaded.title || url

	this.tabInfoCache.set(tabId, { url: finalUrl, title: finalTitle })

	const openedInfo: TabInfo = {
		id: tabId,
		url: finalUrl,
		title: finalTitle,
		isInitial: false,
		isCurrent: true,
		isAccessible: isContentScriptAllowed(finalUrl),
	}
	this.pendingChanges.opened.push(openedInfo)

	await this.switchToTab(tabId)

	const message = `Opened new tab [${tabId}] with URL: ${url}`
	return { tabId, message }
}
/**
 * Resolve once the given tab reports status 'complete'.
 *
 * @param tabId - Tab to watch
 * @param timeoutMs - Maximum wait before rejecting (default 30 seconds)
 * @returns Promise that rejects on timeout or when the tab cannot be looked up
 */
private waitForTabComplete(tabId: number, timeoutMs = 30_000): Promise<void> {
	return new Promise((resolve, reject) => {
		let settled = false

		// Tear down timer and listener exactly once, whichever path finishes first.
		const release = () => {
			if (settled) return
			settled = true
			clearTimeout(timer)
			chrome.tabs.onUpdated.removeListener(onUpdated)
		}

		const timer = setTimeout(() => {
			release()
			reject(new Error(`Tab ${tabId} did not complete loading within ${timeoutMs}ms`))
		}, timeoutMs)

		const onUpdated = (updatedTabId: number, changeInfo: { status?: string }) => {
			if (updatedTabId !== tabId || changeInfo.status !== 'complete') return
			release()
			resolve()
		}

		// Subscribe before querying, so a completion between the two is not missed.
		chrome.tabs.onUpdated.addListener(onUpdated)

		// The tab may already have finished loading before we subscribed.
		chrome.tabs
			.get(tabId)
			.then((tab) => {
				const alreadyComplete = tab.status === 'complete'
				if (alreadyComplete && !settled) {
					release()
					resolve()
				}
			})
			.catch((error: unknown) => {
				release()
				reject(error instanceof Error ? error : new Error(String(error)))
			})
	})
}
/**
 * Make `tabId` the current operation target.
 *
 * @param tabId - Initial tab or a tab previously opened by the agent
 * @returns Human-readable result message
 * @throws Error when the manager is not initialized, the tab does not exist,
 *   or the tab is not under this manager's control
 */
async switchToTab(tabId: number): Promise<string> {
	if (!this.pageController) {
		throw new Error('TabsManager not initialized')
	}

	// The tab must still exist...
	try {
		await chrome.tabs.get(tabId)
	} catch {
		throw new Error(`Tab ${tabId} does not exist`)
	}

	// ...and must be one of the tabs we control.
	if (!this.isTabManaged(tabId)) {
		throw new Error(
			`Tab ${tabId} is not in the managed tab list. Only initial tab and tabs opened by agent can be switched to.`
		)
	}

	// Remember where we came from so a later close can fall back to it.
	const cameFrom = this.currentTabId
	if (cameFrom && cameFrom !== tabId) {
		this.currentTabHistory.push(cameFrom)
	}
	this.currentTabId = tabId

	// Retarget the page controller.
	await this.pageController.setTargetTab(tabId)

	// Refresh the cached URL/title for this tab.
	const { url, title } = await chrome.tabs.get(tabId)
	this.tabInfoCache.set(tabId, { url: url || '', title: title || '' })

	console.debug(`${DEBUG_PREFIX} Switched to tab:`, tabId)

	const origin = cameFrom ? ` (from tab [${cameFrom}])` : ''
	return `Switched to tab [${tabId}]${origin}`
}
/**
 * Close an agent-opened tab.
 *
 * When the tab being closed is the current tab, the manager first switches to
 * `switchTo` if given, otherwise to the most recent usable tab in the history
 * (falling back to the initial tab). When a different tab is closed, the
 * current tab is left unchanged and `switchTo` is not applied.
 *
 * @param tabId - Tab to close; must have been opened by the agent
 * @param switchTo - Optional tab to switch to when closing the current tab
 * @returns Result message; it mentions a switch only when one was performed
 * @throws Error when the manager is not initialized, `tabId` is the initial tab
 *   or is not managed, `switchTo` equals `tabId`, or the switch/removal fails
 */
async closeTab(tabId: number, switchTo?: number): Promise<string> {
	if (!this.pageController) {
		throw new Error('TabsManager not initialized')
	}

	// Cannot close initial tab
	if (tabId === this.initialTabId) {
		throw new Error('Cannot close the initial tab')
	}

	// Verify tab is managed
	if (!this.managedTabIds.has(tabId)) {
		throw new Error(`Tab ${tabId} is not in the managed tab list`)
	}

	// Switching to the tab that is about to disappear would leave the manager
	// pointing at a closed tab.
	if (switchTo === tabId) {
		throw new Error(`Cannot switch to tab ${tabId} because it is the tab being closed`)
	}

	// Get tab info before closing
	const tabInfo = this.tabInfoCache.get(tabId)

	// If closing current tab, determine switch target
	let switchedTo: number | null = null
	if (tabId === this.currentTabId) {
		const targetTabId = switchTo ?? this.findFallbackTab(tabId)
		if (targetTabId) {
			await this.switchToTab(targetTabId)
			switchedTo = targetTabId
		}
	}

	// Stop tracking the tab BEFORE removing it: the onRemoved listener ignores
	// untracked tabs, so the closure is reported once (below) instead of twice.
	const wasTracked = this.managedTabIds.delete(tabId)
	try {
		await chrome.tabs.remove(tabId)
	} catch (error) {
		// Removal failed: resume tracking, unless the tab had already been
		// untracked elsewhere while we were switching.
		if (wasTracked) this.managedTabIds.add(tabId)
		throw error
	}

	// Clean up
	this.tabInfoCache.delete(tabId)
	this.currentTabHistory = this.currentTabHistory.filter((id) => id !== tabId)

	// Record change
	if (tabInfo) {
		this.pendingChanges.closed.push({
			id: tabId,
			url: tabInfo.url,
			title: tabInfo.title,
		})
	}

	// Report the switch that actually happened (explicit or fallback), if any.
	const switchNote = switchedTo !== null ? ` and switched to tab [${switchedTo}]` : ''
	return `Closed tab [${tabId}]${switchNote}`
}
/**
 * List every tab under control: the initial tab first (if still known),
 * then the tabs opened by the agent. Tabs that can no longer be looked up
 * are left out. The tab info cache is refreshed along the way.
 */
async getTabList(): Promise<TabInfo[]> {
	const listed: TabInfo[] = []

	// Look up one tab, append its description and refresh its cache entry.
	const describe = async (tabId: number, isInitial: boolean): Promise<void> => {
		try {
			const tab = await chrome.tabs.get(tabId)
			const url = tab.url || ''
			const title = tab.title || ''
			listed.push({
				id: tab.id!,
				url,
				title,
				isInitial,
				isCurrent: tab.id === this.currentTabId,
				isAccessible: isContentScriptAllowed(url),
			})
			this.tabInfoCache.set(tab.id!, { url, title })
		} catch {
			// Tab was closed - will be handled by onRemoved
		}
	}

	if (this.initialTabId) {
		await describe(this.initialTabId, true)
	}
	for (const managedId of this.managedTabIds) {
		await describe(managedId, false)
	}

	return listed
}
/**
 * ID of the tab that operations currently target, or null when there is none.
 */
getCurrentTabId(): number | null {
	const { currentTabId } = this
	return currentTabId
}
/**
 * Hand out the changes accumulated so far and start a fresh, empty record.
 * Used when generating observations.
 */
getAndClearChanges(): TabChanges {
	const drained = this.pendingChanges
	const fresh: TabChanges = { opened: [], closed: [] }
	this.pendingChanges = fresh
	return drained
}
/**
 * Tell whether a tab is under this manager's control, i.e. it is the
 * initial tab or one of the tabs opened by the agent.
 */
isTabManaged(tabId: number): boolean {
	if (tabId === this.initialTabId) {
		return true
	}
	return this.managedTabIds.has(tabId)
}
/**
 * Collect the IDs of all controlled tabs: the initial tab (when set)
 * followed by the agent-opened tabs in insertion order.
 */
getAllManagedTabIds(): number[] {
	const agentOpened = [...this.managedTabIds]
	return this.initialTabId ? [this.initialTabId, ...agentOpened] : agentOpened
}
/**
 * Ask the page controller to dispose itself on every controlled tab
 * (initial tab plus agent-opened tabs), all at once.
 * Failures on individual tabs are logged and do not abort the others.
 * Intended to run before dispose(), which forgets the tab list.
 */
async disposeAllPageControllers(): Promise<void> {
	const controller = this.pageController
	if (!controller) return

	const tabIds = this.getAllManagedTabIds()
	console.debug(`${DEBUG_PREFIX} Disposing PageControllers on ${tabIds.length} tabs:`, tabIds)

	// Kick off every disposal first, then wait for all of them together.
	const disposals = tabIds.map((tabId) =>
		controller.disposeTab(tabId).catch((e) => {
			console.debug(`${DEBUG_PREFIX} disposeTab(${tabId}) failed:`, e)
		})
	)
	await Promise.all(disposals)

	console.debug(`${DEBUG_PREFIX} All PageControllers disposed`)
}
/**
 * Tear the manager down: stop listening for tab removals and forget all
 * tracked state. Safe to call more than once.
 * The browser's tab group itself is left in place for the user; only the
 * remembered group ID is cleared.
 */
dispose(): void {
	if (this.disposed) {
		return
	}
	this.disposed = true
	console.debug(`${DEBUG_PREFIX} dispose() called`)

	// Detach from Chrome first so no removal event reaches a half-cleared manager.
	chrome.tabs.onRemoved.removeListener(this.onTabRemovedHandler)

	// Collections
	this.managedTabIds.clear()
	this.tabInfoCache.clear()
	this.currentTabHistory = []
	this.pendingChanges = { opened: [], closed: [] }

	// References and IDs
	this.pageController = null
	this.tabGroupId = null
	this.currentTabId = null
	this.initialTabId = null

	console.debug(`${DEBUG_PREFIX} Disposed`)
}
/**
 * React to a tab being closed outside of closeTab().
 *
 * Untracked tabs are ignored. For a tracked tab the closure is recorded in the
 * pending changes and the tab is dropped from all bookkeeping. If the initial
 * tab was closed, its ID is cleared and nothing else is done. If the current
 * tab was closed, the manager switches to a fallback tab in the background.
 *
 * @param tabId - ID of the tab Chrome reported as removed
 */
private async onTabRemoved(tabId: number): Promise<void> {
	if (this.disposed) return

	// Check if it's a tab we care about
	const isInitial = tabId === this.initialTabId
	const isManaged = this.managedTabIds.has(tabId)
	if (!isInitial && !isManaged) return

	console.debug(`${DEBUG_PREFIX} Tab removed:`, tabId, { isInitial, isManaged })

	// Get cached info for change reporting
	const tabInfo = this.tabInfoCache.get(tabId)
	if (tabInfo) {
		this.pendingChanges.closed.push({
			id: tabId,
			url: tabInfo.url,
			title: tabInfo.title,
		})
	}

	// Clean up
	this.managedTabIds.delete(tabId)
	this.tabInfoCache.delete(tabId)
	this.currentTabHistory = this.currentTabHistory.filter((id) => id !== tabId)

	// If initial tab was closed, this is fatal
	if (isInitial) {
		this.initialTabId = null
		console.error(`${DEBUG_PREFIX} Initial tab was closed - task should fail`)
		// The agent will detect this via getTabList() and handle appropriately
		return
	}

	// If current tab was closed, fallback to previous
	if (tabId === this.currentTabId && this.pageController) {
		const fallbackTabId = this.findFallbackTab(tabId)
		if (fallbackTabId) {
			this.pendingChanges.currentSwitched = {
				from: tabId,
				to: fallbackTabId,
				reason: 'user_close',
			}
			// Don't await - fire and forget to avoid blocking.
			// A failed switch is still non-fatal, but it is logged so the cause
			// of a stale current tab can be traced.
			this.switchToTab(fallbackTabId).catch((error: unknown) => {
				console.debug(
					`${DEBUG_PREFIX} Fallback switch to tab ${fallbackTabId} failed (ignored):`,
					error
				)
			})
		}
	}
}
/**
 * Choose the tab to fall back to when `closedTabId` goes away.
 *
 * Walks the history stack from the most recent entry, consuming entries as it
 * goes, and returns the first one that is still controlled and is not the
 * closed tab. If the history yields nothing, the initial tab is used.
 *
 * @returns The fallback tab ID, or null when no candidate is left
 */
private findFallbackTab(closedTabId: number): number | null {
	// Entries are popped, so rejected candidates are discarded for good.
	for (
		let candidate = this.currentTabHistory.pop();
		candidate !== undefined;
		candidate = this.currentTabHistory.pop()
	) {
		if (candidate === closedTabId) continue
		const stillControlled =
			candidate === this.initialTabId || this.managedTabIds.has(candidate)
		if (stillControlled) {
			return candidate
		}
	}

	// History exhausted: the initial tab is the last resort.
	const initial = this.initialTabId
	return initial && initial !== closedTabId ? initial : null
}
/**
 * Put a tab into the task's tab group, creating the group when needed.
 *
 * If a group ID is remembered but adding the tab to it fails (for example
 * because the group no longer exists), the stale ID is dropped and a new
 * group is created, so later tabs are grouped again. Grouping is best-effort:
 * any remaining failure is logged and otherwise ignored.
 *
 * @param tabId - Tab to add to the group
 */
private async ensureTabGroup(tabId: number): Promise<void> {
	try {
		if (this.tabGroupId !== null) {
			try {
				// Add to existing group
				await chrome.tabs.group({
					tabIds: [tabId],
					groupId: this.tabGroupId,
				})
				return
			} catch (error) {
				console.debug(
					`${DEBUG_PREFIX} Tab group ${this.tabGroupId} is unusable, creating a new one:`,
					error
				)
				// Forget the stale ID and fall through to group creation.
				this.tabGroupId = null
			}
		}

		// Create new group
		this.tabGroupId = await chrome.tabs.group({ tabIds: [tabId] })

		// Set group properties
		await chrome.tabGroups.update(this.tabGroupId, {
			title: `Task(${this.taskId.slice(0, 8)})`,
			color: randomColor(),
			collapsed: false,
		})
		console.debug(`${DEBUG_PREFIX} Created tab group:`, this.tabGroupId)
	} catch (error) {
		console.debug(`${DEBUG_PREFIX} Failed to manage tab group:`, error)
		// Non-fatal - continue without grouping
	}
}
}

View File

@@ -0,0 +1,70 @@
/**
* Tab control tools for browser extension
*
* These tools allow the agent to manage multiple browser tabs:
* - open_new_tab: Open a new tab and set it as current
* - switch_to_tab: Switch to an existing tab
* - close_tab: Close a tab (optionally switch to another)
*/
import zod from 'zod'
import type { TabsManager } from './TabsManager'
/**
 * Tool definition compatible with PageAgentCore customTools.
 * Describes one tab-control tool as produced by createTabTools().
 */
interface TabTool {
/** Natural-language explanation of what the tool does */
description: string
/** Zod schema describing the tool's input object */
inputSchema: zod.ZodType
/** Runs the tool with the given input and resolves to a result message */
execute: (input: unknown) => Promise<string>
}
/**
 * Create tab control tools bound to a TabsManager instance.
 * These tools are injected into PageAgentCore via customTools config.
 *
 * Each tool validates its input against its own schema before acting, so a
 * malformed call is rejected with a validation error instead of reaching the
 * tabs API with undefined or wrongly typed values.
 *
 * @param tabsManager - Manager that performs the actual tab operations
 * @returns Tools keyed by name: `open_new_tab`, `switch_to_tab`, `close_tab`
 */
export function createTabTools(tabsManager: TabsManager): Record<string, TabTool> {
	const openNewTabInput = zod.object({
		url: zod.string().describe('The URL to open in the new tab'),
	})

	const switchToTabInput = zod.object({
		tab_id: zod.number().int().describe('The tab ID to switch to'),
	})

	const closeTabInput = zod.object({
		tab_id: zod.number().int().describe('The tab ID to close'),
		switch_to: zod
			.number()
			.int()
			.optional()
			.describe(
				'Optional: Tab ID to switch to after closing. If not specified, will switch to previous tab in history.'
			),
	})

	return {
		open_new_tab: {
			description:
				'Open a new browser tab with the specified URL. The new tab becomes the current tab for all subsequent page operations.',
			inputSchema: openNewTabInput,
			execute: async (input: unknown) => {
				// parse() throws on invalid input rather than trusting a cast
				const { url } = openNewTabInput.parse(input)
				const result = await tabsManager.openNewTab(url)
				return result.message
			},
		},

		switch_to_tab: {
			description:
				'Switch to an existing tab by its ID. After switching, all page operations will target the new current tab. You can only switch to tabs in the tab list shown in browser state.',
			inputSchema: switchToTabInput,
			execute: async (input: unknown) => {
				const { tab_id } = switchToTabInput.parse(input)
				return tabsManager.switchToTab(tab_id)
			},
		},

		close_tab: {
			description:
				'Close a tab by its ID. Cannot close the initial tab. Optionally specify which tab to switch to after closing.',
			inputSchema: closeTabInput,
			execute: async (input: unknown) => {
				const { tab_id, switch_to } = closeTabInput.parse(input)
				return tabsManager.closeTab(tab_id, switch_to)
			},
		},
	}
}

View File

@@ -1,259 +1,191 @@
/** /**
* Background Script Entry Point * Background Script (Service Worker) - Stateless Message Relay
* *
* This script runs as the extension's service worker and hosts: * MV3 COMPLIANT: This script is completely stateless.
* - PageAgentCore (headless agent) * It only relays messages between contexts:
* - RemotePageController (proxy to ContentScript) * - SidePanel ↔ ContentScript (RPC for PageController)
* - Command handlers for SidePanel * - ContentScript → SidePanel (queries like shouldShowMask)
* - Event broadcasting to SidePanel * - Tab events → SidePanel (chrome.tabs API events)
*
* NO agent logic, NO state, NO long-running operations.
*/ */
import { PageAgentCore } from '@page-agent/core'
import { RemotePageController } from '../agent/RemotePageController'
import { eventBroadcaster } from '../messaging/events'
import { import {
type AgentActivity, type CSQueryMessage,
type AgentState, type CSRPCMessage,
type AgentStatus, type ExtensionMessage,
type HistoricalEvent, type QueryResponseMessage,
agentCommands, type RPCCallMessage,
contentScriptQuery, type RPCResponseMessage,
type TabEventMessage,
generateMessageId,
isExtensionMessage,
} from '../messaging/protocol' } from '../messaging/protocol'
import { DEMO_API_KEY, DEMO_BASE_URL, DEMO_MODEL } from '../utils/constants'
// Agent instance (singleton for now - single page control) // ============================================================================
let agent: PageAgentCore | null = null // Message Relay Handlers
// Track the target tab ID for event filtering // ============================================================================
let targetTabId: number | null = null
// LLM configuration (persisted in storage) /**
interface LLMConfig { * Handle messages from SidePanel and ContentScript
apiKey: string */
baseURL: string chrome.runtime.onMessage.addListener(
model: string (
message: unknown,
sender: chrome.runtime.MessageSender,
sendResponse: (response?: unknown) => void
): boolean => {
if (!isExtensionMessage(message)) {
return false
}
const msg = message as ExtensionMessage
switch (msg.type) {
case 'rpc:call':
// SidePanel → SW: Forward RPC to content script
handleRPCCall(msg as RPCCallMessage)
return false // No sync response needed
case 'cs:query':
// ContentScript → SW: Forward query to sidepanel
handleCSQuery(msg as CSQueryMessage, sender)
return false
default:
return false
}
}
)
/**
* Forward RPC call from SidePanel to ContentScript
*/
async function handleRPCCall(msg: RPCCallMessage): Promise<void> {
const { id, tabId, method, args } = msg
// Create message for content script
const csMessage: CSRPCMessage = {
type: 'cs:rpc',
id,
method,
args,
}
try {
// Send to content script and wait for response
const result = await chrome.tabs.sendMessage(tabId, csMessage)
// Forward response back to sidepanel
const response: RPCResponseMessage = {
type: 'rpc:response',
id,
success: true,
result,
}
await chrome.runtime.sendMessage(response)
} catch (error) {
// Forward error back to sidepanel
const response: RPCResponseMessage = {
type: 'rpc:response',
id,
success: false,
error: error instanceof Error ? error.message : String(error),
}
await chrome.runtime.sendMessage(response).catch(() => {
// Sidepanel may be closed
})
}
} }
// Default to demo config /**
let llmConfig: LLMConfig = { * Forward query from ContentScript to SidePanel
apiKey: DEMO_API_KEY, */
baseURL: DEMO_BASE_URL, async function handleCSQuery(
model: DEMO_MODEL, msg: CSQueryMessage,
sender: chrome.runtime.MessageSender
): Promise<void> {
const { id, queryType, tabId } = msg
// For shouldShowMask, we need to ask the sidepanel
// Since sidepanel may not be open, we'll use a timeout approach
// The sidepanel registers a listener for these queries
try {
// Broadcast to sidepanel (it will respond via query:response)
const response = await chrome.runtime.sendMessage(msg)
// Forward response back to content script
if (sender.tab?.id) {
const queryResponse: QueryResponseMessage = {
type: 'query:response',
id,
result: response,
}
await chrome.tabs.sendMessage(sender.tab.id, queryResponse)
}
} catch (error) {
// Sidepanel not open or no response, return default
if (sender.tab?.id) {
const queryResponse: QueryResponseMessage = {
type: 'query:response',
id,
result: queryType === 'shouldShowMask' ? false : null,
}
await chrome.tabs.sendMessage(sender.tab.id, queryResponse).catch(() => {})
}
}
} }
export default defineBackground(() => { // ============================================================================
console.log('[PageAgentExt] Background script started') // Tab Event Forwarding
// ============================================================================
// Load saved config from storage /**
loadConfig() * Forward tab removed events to sidepanel
*/
// Register command handlers chrome.tabs.onRemoved.addListener((tabId) => {
registerCommandHandlers() const message: TabEventMessage = {
type: 'tab:event',
// Register tab event listeners for page reload/close detection id: generateMessageId(),
registerTabEventListeners() eventType: 'removed',
tabId,
// Register content script notification handlers }
registerContentScriptHandlers() chrome.runtime.sendMessage(message).catch(() => {
// Sidepanel may not be open
// Open sidepanel on action click })
chrome.sidePanel
.setPanelBehavior({ openPanelOnActionClick: true })
.catch((error) => console.error('[PageAgentExt] Failed to set panel behavior:', error))
}) })
/** /**
* Load LLM configuration from storage (falls back to demo config) * Forward tab updated events to sidepanel
*/ */
async function loadConfig(): Promise<void> { chrome.tabs.onUpdated.addListener((tabId, changeInfo) => {
const result = await chrome.storage.local.get('llmConfig') // Only forward loading/complete status changes
if (result.llmConfig) { if (!changeInfo.status) return
llmConfig = result.llmConfig as LLMConfig
console.log('[PageAgentExt] Loaded LLM config from storage') const message: TabEventMessage = {
} else { type: 'tab:event',
console.log('[PageAgentExt] Using default demo config') id: generateMessageId(),
eventType: 'updated',
tabId,
data: {
status: changeInfo.status,
url: changeInfo.url,
},
} }
} chrome.runtime.sendMessage(message).catch(() => {
// Sidepanel may not be open
/**
* Save LLM configuration to storage
*/
async function saveConfig(config: LLMConfig): Promise<void> {
llmConfig = config
await chrome.storage.local.set({ llmConfig: config })
console.log('[PageAgentExt] Saved LLM config')
}
/**
* Get current agent state snapshot
*/
function getAgentState(): AgentState {
if (!agent) {
return {
status: 'idle',
task: '',
history: [],
}
}
return {
status: agent.status as AgentStatus,
task: agent.task,
history: agent.history as HistoricalEvent[],
}
}
/**
* Create and configure agent instance
*/
function createAgent(): PageAgentCore {
const pageController = new RemotePageController()
// Track the target tab ID for event filtering
pageController.tabIdPromise.then((tabId) => {
targetTabId = tabId
console.log('[PageAgentExt] Tracking tab:', tabId)
}) })
})
const newAgent = new PageAgentCore({ // ============================================================================
...llmConfig, // Extension Setup
pageController: pageController as any, // Type assertion for interface compatibility // ============================================================================
language: 'en-US',
export default defineBackground(() => {
console.log('[Background] Service Worker started (stateless relay mode)')
// Open sidepanel on action click
chrome.sidePanel.setPanelBehavior({ openPanelOnActionClick: true }).catch(() => {
// Side panel may not be supported
}) })
})
// Forward agent events to SidePanel
newAgent.addEventListener('statuschange', () => {
eventBroadcaster.status(newAgent.status as AgentStatus)
})
newAgent.addEventListener('historychange', () => {
eventBroadcaster.history(newAgent.history as HistoricalEvent[])
})
newAgent.addEventListener('activity', (e) => {
const activity = (e as CustomEvent).detail as AgentActivity
eventBroadcaster.activity(activity)
})
newAgent.addEventListener('dispose', () => {
if (agent === newAgent) {
agent = null
targetTabId = null
}
eventBroadcaster.status('idle')
})
return newAgent
}
/**
* Register command handlers for SidePanel communication
*/
function registerCommandHandlers(): void {
// Execute task
agentCommands.onMessage('agent:execute', async ({ data: task }) => {
console.log('[PageAgentExt] Executing task:', task)
// Create new agent if needed
if (!agent || agent.disposed) {
agent = createAgent()
}
// Execute task (don't await - runs in background)
agent.execute(task).catch((error) => {
console.error('[PageAgentExt] Task execution error:', error)
const message = error instanceof Error ? error.message : String(error)
// Broadcast error as a history event so it persists in UI
const errorEvent: HistoricalEvent = { type: 'error', message }
eventBroadcaster.history([errorEvent])
eventBroadcaster.status('error')
})
})
// Stop agent
agentCommands.onMessage('agent:stop', async () => {
console.log('[PageAgentExt] Stopping agent')
if (agent) {
agent.dispose('User requested stop')
agent = null
}
})
// Get current state
agentCommands.onMessage('agent:getState', async () => {
return getAgentState()
})
// Configure LLM
agentCommands.onMessage('agent:configure', async ({ data: config }) => {
await saveConfig(config)
// Recreate agent with new config if it exists
if (agent && !agent.disposed) {
agent.dispose('Configuration changed')
agent = null
}
})
console.log('[PageAgentExt] Command handlers registered')
}
/**
* Register tab event listeners for detecting page reload/navigation/close
*/
function registerTabEventListeners(): void {
// Listen for tab updates (page reload, navigation)
chrome.tabs.onUpdated.addListener((tabId, changeInfo, _tab) => {
// Only handle events for the target tab when agent is running
if (!agent || agent.disposed || tabId !== targetTabId) return
if (changeInfo.status === 'loading') {
// Page is reloading or navigating
console.log('[PageAgentExt] Target page is reloading/navigating')
agent.pushObservation(
'⚠️ Page is reloading. DOM state will change - wait for page to stabilize before next action.'
)
}
})
// Listen for tab close
chrome.tabs.onRemoved.addListener((tabId, _removeInfo) => {
// Only handle events for the target tab when agent is running
if (!agent || agent.disposed || tabId !== targetTabId) return
console.log('[PageAgentExt] Target page was closed')
agent.pushObservation(
'⚠️ Target page was closed by user. If this page is required for the task, consider marking the task as failed.'
)
// Clear target tab ID since it no longer exists
targetTabId = null
})
console.log('[PageAgentExt] Tab event listeners registered')
}
/**
* Register handlers for content script queries
*/
function registerContentScriptHandlers(): void {
// Handle shouldShowMask query - content script asks if mask should be shown
contentScriptQuery.onMessage('content:shouldShowMask', async ({ sender }) => {
const tabId = sender.tab?.id
// Check if there's an active task for this tab
const shouldShow = Boolean(tabId && agent && !agent.disposed && tabId === targetTabId)
console.log('[PageAgentExt] shouldShowMask query:', { tabId, targetTabId, shouldShow })
return shouldShow
})
// Handle content script errors - broadcast to sidepanel for user visibility
contentScriptQuery.onMessage('content:error', async ({ data }) => {
console.error('[PageAgentExt] Content script error:', data.message, 'on', data.url)
// Broadcast error to sidepanel
const errorEvent: HistoricalEvent = {
type: 'error',
message: `Content script error on ${data.url}: ${data.message}`,
}
eventBroadcaster.history([errorEvent])
})
console.log('[PageAgentExt] Content script handlers registered')
}

View File

@@ -2,78 +2,72 @@
* Content Script Entry Point * Content Script Entry Point
* *
* This script runs in the context of web pages and hosts the real PageController. * This script runs in the context of web pages and hosts the real PageController.
* It listens for RPC messages from Background and dispatches them to PageController. * It listens for RPC messages relayed through the Background Script and
* dispatches them to PageController.
* *
* PageController is created lazily on first RPC call and can be disposed/recreated * Message flow:
* between tasks. This supports multi-page workflows and ensures clean state. * - RPC: SidePanel → SW → ContentScript (this file) → response → SW → SidePanel
* - Query: ContentScript → SW → SidePanel → SW → ContentScript (for shouldShowMask)
*/ */
import { PageController } from '@page-agent/page-controller' import { PageController } from '@page-agent/page-controller'
import { contentScriptQuery, pageControllerRPC } from '../messaging/protocol' import type {
CSQueryMessage,
CSRPCMessage,
QueryResponseMessage,
RPCMethod,
} from '../messaging/protocol'
import { generateMessageId, isExtensionMessage } from '../messaging/protocol'
const DEBUG_PREFIX = '[ContentScript]'
export default defineContentScript({ export default defineContentScript({
matches: ['<all_urls>'], matches: ['<all_urls>'],
runAt: 'document_idle', runAt: 'document_idle',
async main() { async main() {
console.log('[PageAgentExt] Content script loaded on', window.location.href) const pageUrl = window.location.href
console.debug(`${DEBUG_PREFIX} Content script loaded on ${pageUrl}`)
// Lazy-initialized controller - created on demand, disposed between tasks // Lazy-initialized controller - created on demand, disposed between tasks
let controller: PageController | null = null let controller: PageController | null = null
let initError: Error | null = null let initError: Error | null = null
function getController(): PageController { function getController(): PageController {
// Re-throw init error if controller creation previously failed
if (initError) { if (initError) {
console.debug(`${DEBUG_PREFIX} getController: re-throwing init error`)
throw initError throw initError
} }
if (!controller) { if (!controller) {
try { try {
controller = new PageController({ enableMask: true }) controller = new PageController({ enableMask: true })
console.log('[PageAgentExt] PageController created') console.debug(`${DEBUG_PREFIX} PageController created`)
} catch (error) { } catch (error) {
initError = error instanceof Error ? error : new Error(String(error)) initError = error instanceof Error ? error : new Error(String(error))
console.error('[PageAgentExt] Failed to create PageController:', initError) console.error(`${DEBUG_PREFIX} Failed to create PageController:`, initError)
// Report error to background
reportError(initError.message)
throw initError throw initError
} }
} }
return controller return controller
} }
// Register RPC handlers with lazy controller access function disposeController(): void {
registerRPCHandlers( console.debug(`${DEBUG_PREFIX} Disposing controller...`)
getController, controller?.dispose()
() => controller, controller = null
() => { initError = null
controller?.dispose() console.debug(`${DEBUG_PREFIX} PageController disposed`)
controller = null }
initError = null // Clear error on dispose to allow retry
console.log('[PageAgentExt] PageController disposed') // Register RPC message handler
} registerRPCHandler(getController, () => controller, disposeController)
)
// Check if there's an active task that needs mask to be shown // Check if there's an active task that needs mask to be shown
// This handles page reload/navigation during task execution setTimeout(() => queryShouldShowMask(getController), 100)
setTimeout(async () => {
try {
const shouldShowMask = await contentScriptQuery.sendMessage(
'content:shouldShowMask',
undefined
)
if (shouldShowMask) {
console.log('[PageAgentExt] Restoring mask after page reload')
await getController().showMask()
}
} catch (error) {
// Ignore errors - background may not be ready
console.log('[PageAgentExt] shouldShowMask check skipped:', error)
}
}, 100)
// Cleanup on page unload // Cleanup on page unload
window.addEventListener('beforeunload', () => { window.addEventListener('beforeunload', () => {
console.debug(`${DEBUG_PREFIX} Page unloading, disposing controller`)
controller?.dispose() controller?.dispose()
controller = null controller = null
}) })
@@ -81,84 +75,178 @@ export default defineContentScript({
}) })
/** /**
* Report content script error to background for user visibility * Query the sidepanel (via SW) whether mask should be shown
*/ */
function reportError(message: string): void { async function queryShouldShowMask(getController: () => PageController): Promise<void> {
contentScriptQuery const tabId = await getCurrentTabId()
.sendMessage('content:error', { message, url: window.location.href }) if (!tabId) {
.catch(() => { console.debug(`${DEBUG_PREFIX} Cannot query shouldShowMask: no tab ID`)
// Silently ignore if background is not available return
}
const queryId = generateMessageId()
const queryMessage: CSQueryMessage = {
type: 'cs:query',
id: queryId,
queryType: 'shouldShowMask',
tabId,
}
try {
// Set up response listener
const responsePromise = new Promise<boolean>((resolve) => {
const timeout = setTimeout(() => {
chrome.runtime.onMessage.removeListener(listener)
resolve(false)
}, 3000)
const listener = (message: unknown) => {
if (!isExtensionMessage(message)) return
if (message.type !== 'query:response') return
if ((message as QueryResponseMessage).id !== queryId) return
clearTimeout(timeout)
chrome.runtime.onMessage.removeListener(listener)
resolve((message as QueryResponseMessage).result as boolean)
}
chrome.runtime.onMessage.addListener(listener)
}) })
// Send query
await chrome.runtime.sendMessage(queryMessage)
// Wait for response
const shouldShowMask = await responsePromise
console.debug(`${DEBUG_PREFIX} shouldShowMask result:`, shouldShowMask)
if (shouldShowMask) {
console.debug(`${DEBUG_PREFIX} Restoring mask after page reload`)
await getController().showMask()
}
} catch (error) {
console.debug(`${DEBUG_PREFIX} shouldShowMask query failed:`, error)
}
} }
/** /**
* Register all RPC message handlers for PageController methods * Get current tab ID
*/ */
function registerRPCHandlers( async function getCurrentTabId(): Promise<number | null> {
try {
const response = await chrome.runtime.sendMessage({ type: 'getTabId' })
return response?.tabId ?? null
} catch {
// Fallback: we're in content script, tab ID comes from sender in SW
return null
}
}
/**
* Register RPC message handler
*/
function registerRPCHandler(
getController: () => PageController, getController: () => PageController,
getControllerIfExists: () => PageController | null, getControllerIfExists: () => PageController | null,
disposeController: () => void disposeController: () => void
): void { ): void {
// State queries chrome.runtime.onMessage.addListener(
pageControllerRPC.onMessage('rpc:getCurrentUrl', async () => { (
return getController().getCurrentUrl() message: unknown,
}) _sender: chrome.runtime.MessageSender,
sendResponse: (response?: unknown) => void
): boolean => {
if (!isExtensionMessage(message)) return false
if (message.type !== 'cs:rpc') return false
pageControllerRPC.onMessage('rpc:getLastUpdateTime', async () => { const rpcMessage = message as CSRPCMessage
return getController().getLastUpdateTime() const { method, args } = rpcMessage
})
pageControllerRPC.onMessage('rpc:getBrowserState', async () => { console.debug(`${DEBUG_PREFIX} RPC: ${method}`, args)
return getController().getBrowserState()
})
// DOM operations // Handle the RPC call
pageControllerRPC.onMessage('rpc:updateTree', async () => { handleRPCCall(method, args, getController, getControllerIfExists, disposeController)
return getController().updateTree() .then((result) => {
}) sendResponse(result)
})
.catch((error) => {
console.error(`${DEBUG_PREFIX} RPC ${method} failed:`, error)
sendResponse({ error: error instanceof Error ? error.message : String(error) })
})
pageControllerRPC.onMessage('rpc:cleanUpHighlights', async () => { // Return true to indicate async response
await getControllerIfExists()?.cleanUpHighlights() return true
}) }
)
// Element actions console.debug(`${DEBUG_PREFIX} RPC handler registered`)
pageControllerRPC.onMessage('rpc:clickElement', async ({ data: index }) => { }
return getController().clickElement(index)
}) /**
* Handle an RPC call
pageControllerRPC.onMessage('rpc:inputText', async ({ data }) => { */
return getController().inputText(data.index, data.text) async function handleRPCCall(
}) method: RPCMethod,
args: unknown[],
pageControllerRPC.onMessage('rpc:selectOption', async ({ data }) => { getController: () => PageController,
return getController().selectOption(data.index, data.optionText) getControllerIfExists: () => PageController | null,
}) disposeController: () => void
): Promise<unknown> {
pageControllerRPC.onMessage('rpc:scroll', async ({ data: options }) => { switch (method) {
return getController().scroll(options) // State queries
}) case 'getCurrentUrl':
return getController().getCurrentUrl()
pageControllerRPC.onMessage('rpc:scrollHorizontally', async ({ data: options }) => {
return getController().scrollHorizontally(options) case 'getLastUpdateTime':
}) return getController().getLastUpdateTime()
pageControllerRPC.onMessage('rpc:executeJavascript', async ({ data: script }) => { case 'getBrowserState':
return getController().executeJavascript(script) return getController().getBrowserState()
})
// DOM operations
// Mask operations case 'updateTree':
pageControllerRPC.onMessage('rpc:showMask', async () => { return getController().updateTree()
await getController().showMask()
}) case 'cleanUpHighlights':
await getControllerIfExists()?.cleanUpHighlights()
pageControllerRPC.onMessage('rpc:hideMask', async () => { return undefined
await getControllerIfExists()?.hideMask()
}) // Element actions
case 'clickElement':
// Lifecycle - dispose clears the controller, next call will create fresh one return getController().clickElement(args[0] as number)
pageControllerRPC.onMessage('rpc:dispose', async () => {
disposeController() case 'inputText':
}) return getController().inputText(args[0] as number, args[1] as string)
console.log('[PageAgentExt] RPC handlers registered') case 'selectOption':
return getController().selectOption(args[0] as number, args[1] as string)
case 'scroll':
return getController().scroll(args[0] as Parameters<PageController['scroll']>[0])
case 'scrollHorizontally':
return getController().scrollHorizontally(
args[0] as Parameters<PageController['scrollHorizontally']>[0]
)
case 'executeJavascript':
return getController().executeJavascript(args[0] as string)
// Mask operations
case 'showMask':
await getController().showMask()
return undefined
case 'hideMask':
await getControllerIfExists()?.hideMask()
return undefined
// Lifecycle
case 'dispose':
disposeController()
return undefined
default:
throw new Error(`Unknown RPC method: ${method}`)
}
} }

View File

@@ -0,0 +1,378 @@
/**
* AgentController - Manages agent lifecycle in SidePanel context
*
* This class encapsulates all agent logic, keeping it isolated from the React UI.
* It runs entirely in the SidePanel frontend context, using the Background Script
* only as a stateless message relay for communicating with content scripts.
*
* Design goals:
* - Agent state lives here, not in Service Worker
* - SW is only a relay - no agent logic there
* - Future-proof: can be moved to other contexts (e.g., a controlling web page)
*/
import { PageAgentCore } from '@page-agent/core'
import type { AgentActivity, AgentStatus, ExecutionResult, HistoricalEvent } from '@page-agent/core'
import { RemotePageController } from '../../agent/RemotePageController'
import { type TabInfo, TabsManager } from '../../agent/TabsManager'
import { createTabTools } from '../../agent/tabTools'
import { DEMO_API_KEY, DEMO_BASE_URL, DEMO_MODEL } from '../../utils/constants'
/**
 * LLM configuration.
 *
 * Stored in `chrome.storage.local` under the `llmConfig` key and spread into
 * the PageAgentCore options when an agent is created.
 */
export interface LLMConfig {
// API key; AgentController falls back to DEMO_API_KEY when nothing is saved
apiKey: string
// Base URL; AgentController falls back to DEMO_BASE_URL when nothing is saved
baseURL: string
// Model name; AgentController falls back to DEMO_MODEL when nothing is saved
model: string
}
/**
 * Agent state snapshot for UI.
 *
 * This is the shape returned by `AgentController.getState()`; when no agent
 * exists the snapshot is `{ status: 'idle', task: '', history: [] }`.
 */
export interface AgentState {
// Status reported by the agent ('idle' when there is no agent)
status: AgentStatus
// Task string reported by the agent (empty when there is no agent)
task: string
// History events reported by the agent (empty when there is no agent)
history: HistoricalEvent[]
}
/**
 * Event types emitted by AgentController.
 *
 * Each key is an event name dispatched as a `CustomEvent`; the value is the
 * type carried in that event's `detail`.
 */
export interface AgentControllerEvents {
// detail: the agent's status (also emitted as 'running' / 'error' / 'idle' by the controller itself)
statuschange: AgentStatus
// detail: the agent's history array (or a single error event when execute() fails)
historychange: HistoricalEvent[]
// detail: the activity payload forwarded from the agent's own 'activity' event
activity: AgentActivity
}
/**
 * Render the tab overview that is prepended to the browser state header.
 *
 * Output layout (joined with newlines, ending with a trailing newline):
 *   "Tab List:", one "- [Tab <id>] <url> (<markers>)" line per tab,
 *   a blank line, a single hint line, and a final blank line.
 *
 * @param tabs - Tabs to list; an empty array yields an empty string.
 * @param currentTabId - ID interpolated into the hint line (may be null).
 * @returns The formatted header text, or '' when there are no tabs.
 */
function formatTabListHeader(tabs: TabInfo[], currentTabId: number | null): string {
  if (!tabs.length) {
    return ''
  }

  // One bullet per tab, annotated with whichever markers apply to it
  const describeTab = (tab: TabInfo): string => {
    const flags = [
      tab.isCurrent ? 'current' : null,
      tab.isInitial ? 'initial' : null,
      tab.isAccessible ? null : 'restricted',
    ].filter((flag): flag is string => flag !== null)
    const suffix = flags.length > 0 ? ` (${flags.join(', ')})` : ''
    return `- [Tab ${tab.id}] ${tab.url}${suffix}`
  }

  // The hint depends on whether the tab flagged as current can be accessed
  const activeTab = tabs.find((tab) => tab.isCurrent)
  const hint =
    activeTab && !activeTab.isAccessible
      ? `⚠️ Current tab [${currentTabId}] is a restricted page. Use open_new_tab to navigate to a regular web page.`
      : `Note: All page info below belongs to current tab [${currentTabId}]. To view or operate on other tabs, use switch_to_tab first.`

  return ['Tab List:', ...tabs.map(describeTab), '', hint, ''].join('\n')
}
/**
 * AgentController manages the agent lifecycle in the SidePanel.
 *
 * It owns at most one PageAgentCore at a time, together with the
 * RemotePageController and TabsManager created for that agent, and re-emits
 * the agent's events as `CustomEvent`s for the React UI to subscribe to:
 * - `statuschange`  (detail: AgentStatus)
 * - `historychange` (detail: HistoricalEvent[])
 * - `activity`      (detail: AgentActivity)
 */
export class AgentController extends EventTarget {
  private agent: PageAgentCore | null = null
  private tabsManager: TabsManager | null = null
  private pageController: RemotePageController | null = null
  private llmConfig: LLMConfig

  /** Current task being executed */
  currentTask = ''

  constructor() {
    super()
    // Default to demo config until init() loads a saved one
    this.llmConfig = {
      apiKey: DEMO_API_KEY,
      baseURL: DEMO_BASE_URL,
      model: DEMO_MODEL,
    }
  }

  /**
   * Initialize controller and load saved config
   */
  async init(): Promise<void> {
    await this.loadConfig()
    console.log('[AgentController] Initialized')
  }

  /**
   * Load LLM configuration from `chrome.storage.local` (key `llmConfig`).
   * Keeps the current (demo) config when nothing is stored.
   */
  private async loadConfig(): Promise<void> {
    const result = await chrome.storage.local.get('llmConfig')
    if (result.llmConfig) {
      this.llmConfig = result.llmConfig as LLMConfig
      console.log('[AgentController] Loaded LLM config from storage')
    } else {
      console.log('[AgentController] Using default demo config')
    }
  }

  /**
   * Replace the LLM configuration and persist it to storage.
   * Any live agent is disposed so the next task picks up the new config.
   *
   * @param config - The configuration to apply and save.
   */
  async configure(config: LLMConfig): Promise<void> {
    this.llmConfig = config
    await chrome.storage.local.set({ llmConfig: config })
    console.log('[AgentController] Saved LLM config')

    // Dispose existing agent if any
    if (this.agent && !this.agent.disposed) {
      this.agent.dispose()
      this.agent = null
    }
  }

  /**
   * Get a shallow copy of the current LLM config
   */
  getConfig(): LLMConfig {
    return { ...this.llmConfig }
  }

  /**
   * Get current agent state; an idle, empty snapshot when no agent exists
   */
  getState(): AgentState {
    if (!this.agent) {
      return {
        status: 'idle',
        task: '',
        history: [],
      }
    }
    return {
      status: this.agent.status,
      task: this.agent.task,
      history: this.agent.history,
    }
  }

  /**
   * Get current agent status ('idle' when no agent exists)
   */
  get status(): AgentStatus {
    return this.agent?.status ?? 'idle'
  }

  /**
   * Get agent history (empty when no agent exists)
   */
  get history(): HistoricalEvent[] {
    return this.agent?.history ?? []
  }

  /**
   * Check if a tab is managed by this controller (false when no task is set up)
   */
  isTabManaged(tabId: number): boolean {
    return this.tabsManager?.isTabManaged(tabId) ?? false
  }

  /**
   * Get current tab ID (null when no task is set up)
   */
  getCurrentTabId(): number | null {
    return this.tabsManager?.getCurrentTabId() ?? null
  }

  /**
   * Create and configure an agent instance together with its own
   * RemotePageController and TabsManager.
   *
   * The per-agent instances are held in local constants and captured by the
   * callbacks below, so each agent only ever touches the resources it was
   * created with, even after the controller fields have moved on to a newer task.
   */
  private async createAgent(): Promise<PageAgentCore> {
    const pageController = new RemotePageController()
    const tabsManager = new TabsManager()
    this.pageController = pageController
    this.tabsManager = tabsManager

    // Generate task ID
    const taskId = Math.random().toString(36).slice(2, 10)

    // Initialize tabs manager
    await tabsManager.init(taskId, pageController)

    // Create tab tools
    const tabTools = createTabTools(tabsManager)

    const newAgent = new PageAgentCore({
      ...this.llmConfig,
      // Type assertion for interface compatibility with PageAgentCore's option type
      pageController: this.createPageControllerProxy(pageController, tabsManager) as any,
      language: 'en-US',
      customTools: tabTools,
      onBeforeStep: async (agentInstance: PageAgentCore) => {
        // Only report tab changes while this agent's manager is still the
        // active one; never drain the change queue of another task.
        if (this.tabsManager !== tabsManager) return

        const changes = tabsManager.getAndClearChanges()
        for (const tab of changes.opened) {
          agentInstance.pushObservation(`New tab opened: [Tab ${tab.id}] ${tab.url}`)
        }
        for (const tab of changes.closed) {
          agentInstance.pushObservation(`Tab closed: [Tab ${tab.id}] ${tab.url}`)
        }
        if (changes.currentSwitched?.reason === 'user_close') {
          agentInstance.pushObservation(
            `⚠️ Current tab [${changes.currentSwitched.from}] was closed. Auto-switched to tab [${changes.currentSwitched.to}].`
          )
        }
      },
    })

    // Forward agent events
    newAgent.addEventListener('statuschange', () => {
      this.dispatchEvent(new CustomEvent('statuschange', { detail: newAgent.status }))
    })
    newAgent.addEventListener('historychange', () => {
      this.dispatchEvent(new CustomEvent('historychange', { detail: newAgent.history }))
    })
    newAgent.addEventListener('activity', (e: Event) => {
      const activity = (e as CustomEvent).detail as AgentActivity
      this.dispatchEvent(new CustomEvent('activity', { detail: activity }))
    })
    newAgent.addEventListener('dispose', async () => {
      console.debug('[AgentController] Agent dispose event received')

      // This listener is async and nobody awaits it: while the await below is
      // pending, stop()/dispose()/execute() may null the controller fields or
      // point them at a newer task. Always clean up the instances captured
      // for THIS agent instead of re-reading `this.tabsManager`.
      try {
        console.debug('[AgentController] Disposing all PageControllers...')
        await tabsManager.disposeAllPageControllers()
      } catch (error) {
        console.error('[AgentController] Failed to dispose PageControllers:', error)
      } finally {
        tabsManager.dispose()
      }

      // Clear a field only if it still belongs to this agent, so a task that
      // started in the meantime keeps its own references.
      if (this.agent === newAgent) this.agent = null
      if (this.tabsManager === tabsManager) this.tabsManager = null
      if (this.pageController === pageController) this.pageController = null
      console.debug('[AgentController] Agent and TabsManager disposed')

      // Do not report 'idle' on behalf of an old agent once a newer one is installed
      if (this.agent === null) {
        this.dispatchEvent(new CustomEvent('statuschange', { detail: 'idle' }))
      }
    })

    return newAgent
  }

  /**
   * Create a proxy for PageController that injects tab info into BrowserState.header.
   * Only `getBrowserState` is intercepted; every other property is read
   * straight from the wrapped controller.
   */
  private createPageControllerProxy(
    controller: RemotePageController,
    tabs: TabsManager
  ): RemotePageController {
    return new Proxy(controller, {
      get(target, prop, receiver) {
        if (prop === 'getBrowserState') {
          return async function () {
            const state = await target.getBrowserState()
            const tabList = await tabs.getTabList()
            const currentTabId = tabs.getCurrentTabId()
            const tabHeader = formatTabListHeader(tabList, currentTabId)
            return {
              ...state,
              header: tabHeader + (state.header || ''),
            }
          }
        }
        return Reflect.get(target, prop, receiver)
      },
    })
  }

  /**
   * Execute a task with a freshly created agent.
   *
   * Emits `statuschange: 'running'` immediately. On failure, emits a
   * `historychange` carrying a single error event followed by
   * `statuschange: 'error'`.
   *
   * @param task - The task text handed to the agent.
   * @returns The agent's execution result, or null when setup or execution threw.
   */
  async execute(task: string): Promise<ExecutionResult | null> {
    console.log('[AgentController] ===== EXECUTE TASK =====')
    console.log('[AgentController] Task:', task)
    this.currentTask = task

    // Emit running status immediately
    this.dispatchEvent(new CustomEvent('statuschange', { detail: 'running' }))

    try {
      // Clean up any existing agent
      if (this.agent && !this.agent.disposed) {
        console.log('[AgentController] Disposing existing agent before new task')
        this.agent.dispose()
        // Short pause before starting over; the old agent's dispose handler
        // is not awaited and finishes its own cleanup independently.
        await new Promise((r) => setTimeout(r, 100))
      }

      // Clear old references
      this.agent = null
      this.tabsManager = null
      this.pageController = null

      // Create fresh agent
      console.log('[AgentController] Creating new agent...')
      this.agent = await this.createAgent()
      console.log('[AgentController] Agent created successfully')

      // Execute task
      console.log('[AgentController] Starting task execution...')
      const result = await this.agent.execute(task)
      console.log('[AgentController] Task completed:', result)
      return result
    } catch (error) {
      console.error('[AgentController] Task execution error:', error)
      const message = error instanceof Error ? error.message : String(error)
      this.dispatchEvent(
        new CustomEvent('historychange', {
          detail: [{ type: 'error', message } as HistoricalEvent],
        })
      )
      this.dispatchEvent(new CustomEvent('statuschange', { detail: 'error' }))
      return null
    }
  }

  /**
   * Stop current task by disposing the agent (no-op when there is none)
   */
  stop(): void {
    console.log('[AgentController] Stopping agent')
    if (this.agent) {
      this.agent.dispose()
    }
  }

  /**
   * Dispose controller and clean up.
   * The agent's dispose handler releases the tab resources it captured; this
   * method only drops the controller's own references and resets the task.
   */
  dispose(): void {
    console.log('[AgentController] Disposing controller')
    if (this.agent && !this.agent.disposed) {
      this.agent.dispose()
    }
    this.agent = null
    this.tabsManager = null
    this.pageController = null
    this.currentTask = ''
  }
}
// Module-wide AgentController, created lazily on first request
let controllerInstance: AgentController | null = null

/**
 * Return the shared AgentController, constructing it the first time it is
 * requested. Every later call returns that same instance.
 */
export function getAgentController(): AgentController {
  controllerInstance ??= new AgentController()
  return controllerInstance
}

View File

@@ -8,65 +8,19 @@ import {
InputGroupButton, InputGroupButton,
InputGroupTextarea, InputGroupTextarea,
} from '@/components/ui/input-group' } from '@/components/ui/input-group'
import { subscribeToEvents } from '@/messaging/events'
import { agentCommands } from '@/messaging/protocol'
import type { AgentActivity, AgentState, AgentStatus, HistoricalEvent } from '@/messaging/protocol'
import { DEMO_API_KEY, DEMO_BASE_URL, DEMO_MODEL } from '@/utils/constants'
import { EmptyState, Logo, StatusDot } from './components'
import { ConfigPanel } from './components/ConfigPanel' import { ConfigPanel } from './components/ConfigPanel'
import { ActivityCard, EventCard } from './components/cards' import { ActivityCard, EventCard } from './components/cards'
import { EmptyState, Logo, StatusDot } from './components/misc'
import { useAgent } from './useAgent'
export default function App() { export default function App() {
const [showConfig, setShowConfig] = useState(false) const [showConfig, setShowConfig] = useState(false)
const [task, setTask] = useState('') const [task, setTask] = useState('')
const [status, setStatus] = useState<AgentStatus>('idle')
const [history, setHistory] = useState<HistoricalEvent[]>([])
const [activity, setActivity] = useState<AgentActivity | null>(null)
const [currentTask, setCurrentTask] = useState('')
const historyRef = useRef<HTMLDivElement>(null) const historyRef = useRef<HTMLDivElement>(null)
const textareaRef = useRef<HTMLTextAreaElement>(null) const textareaRef = useRef<HTMLTextAreaElement>(null)
// Subscribe to agent events const { status, history, activity, currentTask, config, execute, stop, configure } = useAgent()
useEffect(() => {
// Initialize with demo config if not set
chrome.storage.local.get('llmConfig').then((result) => {
if (!result.llmConfig) {
chrome.storage.local.set({
llmConfig: { apiKey: DEMO_API_KEY, baseURL: DEMO_BASE_URL, model: DEMO_MODEL },
})
}
})
const unsubscribe = subscribeToEvents({
onStatus: (newStatus) => {
setStatus(newStatus)
if (newStatus === 'idle' || newStatus === 'completed' || newStatus === 'error') {
setActivity(null)
}
},
onHistory: (newHistory) => {
setHistory(newHistory)
},
onActivity: (newActivity) => {
setActivity(newActivity)
},
onStateSnapshot: (state) => {
setStatus(state.status)
setHistory(state.history)
setCurrentTask(state.task)
},
})
// Get initial state
agentCommands.sendMessage('agent:getState', undefined).then((state: AgentState) => {
setStatus(state.status)
setHistory(state.history)
setCurrentTask(state.task)
})
return unsubscribe
}, [])
// Auto-scroll to bottom on new events // Auto-scroll to bottom on new events
useEffect(() => { useEffect(() => {
@@ -76,21 +30,25 @@ export default function App() {
}, [history, activity]) }, [history, activity])
const handleSubmit = useCallback( const handleSubmit = useCallback(
async (e?: React.FormEvent) => { (e?: React.FormEvent) => {
e?.preventDefault() e?.preventDefault()
if (!task.trim() || status === 'running') return if (!task.trim() || status === 'running') return
setCurrentTask(task) const taskToExecute = task.trim()
setHistory([])
await agentCommands.sendMessage('agent:execute', task)
setTask('') setTask('')
console.log('[SidePanel] Executing task:', taskToExecute)
execute(taskToExecute).catch((error) => {
console.error('[SidePanel] Failed to execute task:', error)
})
}, },
[task, status] [task, status, execute]
) )
const handleStop = useCallback(async () => { const handleStop = useCallback(() => {
await agentCommands.sendMessage('agent:stop', undefined) console.log('[SidePanel] Stopping task...')
}, []) stop()
}, [stop])
const handleKeyDown = (e: React.KeyboardEvent) => { const handleKeyDown = (e: React.KeyboardEvent) => {
if (e.key === 'Enter' && !e.shiftKey) { if (e.key === 'Enter' && !e.shiftKey) {
@@ -100,7 +58,16 @@ export default function App() {
} }
if (showConfig) { if (showConfig) {
return <ConfigPanel onClose={() => setShowConfig(false)} /> return (
<ConfigPanel
config={config}
onSave={async (newConfig) => {
await configure(newConfig)
setShowConfig(false)
}}
onClose={() => setShowConfig(false)}
/>
)
} }
const isRunning = status === 'running' const isRunning = status === 'running'
@@ -157,7 +124,6 @@ export default function App() {
onChange={(e) => setTask(e.target.value)} onChange={(e) => setTask(e.target.value)}
onKeyDown={handleKeyDown} onKeyDown={handleKeyDown}
disabled={isRunning} disabled={isRunning}
// rows={2}
className="text-xs pr-12 min-h-10" className="text-xs pr-12 min-h-10"
/> />
<InputGroupAddon align="inline-end" className="absolute bottom-0 right-0"> <InputGroupAddon align="inline-end" className="absolute bottom-0 right-0">

View File

@@ -1,34 +1,35 @@
import { Loader2 } from 'lucide-react' import { Loader2 } from 'lucide-react'
import { useEffect, useState } from 'react'
import { Button } from '@/components/ui/button' import { Button } from '@/components/ui/button'
import { Input } from '@/components/ui/input' import { Input } from '@/components/ui/input'
import { agentCommands } from '@/messaging' import { DEMO_API_KEY, DEMO_BASE_URL, DEMO_MODEL } from '@/utils/constants'
// Configuration panel component import type { LLMConfig } from '../AgentController'
export function ConfigPanel({ onClose }: { onClose: () => void }) {
const [apiKey, setApiKey] = useState(DEMO_API_KEY) interface ConfigPanelProps {
const [baseURL, setBaseURL] = useState(DEMO_BASE_URL) config: LLMConfig
const [model, setModel] = useState(DEMO_MODEL) onSave: (config: LLMConfig) => Promise<void>
onClose: () => void
}
export function ConfigPanel({ config, onSave, onClose }: ConfigPanelProps) {
const [apiKey, setApiKey] = useState(config.apiKey || DEMO_API_KEY)
const [baseURL, setBaseURL] = useState(config.baseURL || DEMO_BASE_URL)
const [model, setModel] = useState(config.model || DEMO_MODEL)
const [saving, setSaving] = useState(false) const [saving, setSaving] = useState(false)
// Update local state when config prop changes
useEffect(() => { useEffect(() => {
chrome.storage.local.get('llmConfig').then((result) => { setApiKey(config.apiKey || DEMO_API_KEY)
const config = result.llmConfig as setBaseURL(config.baseURL || DEMO_BASE_URL)
| { apiKey?: string; baseURL?: string; model?: string } setModel(config.model || DEMO_MODEL)
| undefined }, [config])
if (config) {
setApiKey(config.apiKey || DEMO_API_KEY)
setBaseURL(config.baseURL || DEMO_BASE_URL)
setModel(config.model || DEMO_MODEL)
}
})
}, [])
const handleSave = async () => { const handleSave = async () => {
setSaving(true) setSaving(true)
try { try {
await agentCommands.sendMessage('agent:configure', { apiKey, baseURL, model }) await onSave({ apiKey, baseURL, model })
onClose()
} finally { } finally {
setSaving(false) setSaving(false)
} }

View File

@@ -1,8 +1,10 @@
import { import type {
type AgentErrorEvent, AgentActivity,
type AgentStepEvent, AgentErrorEvent,
type ObservationEvent, AgentStepEvent,
type RetryEvent, HistoricalEvent,
ObservationEvent,
RetryEvent,
} from '@page-agent/core' } from '@page-agent/core'
import { import {
CheckCircle, CheckCircle,
@@ -21,7 +23,6 @@ import {
import { Fragment, useState } from 'react' import { Fragment, useState } from 'react'
import { cn } from '@/lib/utils' import { cn } from '@/lib/utils'
import { AgentActivity, HistoricalEvent } from '@/messaging'
// Result card for done action // Result card for done action
function ResultCard({ function ResultCard({

View File

@@ -1,5 +1,6 @@
import type { AgentStatus } from '@page-agent/core'
import { cn } from '@/lib/utils' import { cn } from '@/lib/utils'
import { AgentStatus } from '@/messaging'
// Status dot indicator // Status dot indicator
export function StatusDot({ status }: { status: AgentStatus }) { export function StatusDot({ status }: { status: AgentStatus }) {

View File

@@ -0,0 +1,153 @@
/**
* React hook for using AgentController
*
* This hook provides a React-friendly interface to the AgentController,
* handling event subscriptions and state updates.
*/
import type { AgentActivity, AgentStatus, HistoricalEvent } from '@page-agent/core'
import { useCallback, useEffect, useRef, useState } from 'react'
import type { CSQueryMessage } from '../../messaging/protocol'
import { isExtensionMessage } from '../../messaging/protocol'
import { type AgentController, type LLMConfig, getAgentController } from './AgentController'
/**
 * Value returned by the `useAgent` hook: a snapshot of agent state held in
 * React state, plus the actions the UI can invoke on the controller.
 */
export interface UseAgentResult {
 // State
 /** Latest status received from the controller; starts as 'idle'. */
 status: AgentStatus
 /** Latest history received from the controller; reset to [] when `execute` is called. */
 history: HistoricalEvent[]
 /** Most recent transient activity, or null once status becomes idle/completed/error. */
 activity: AgentActivity | null
 /** Task string passed to the most recent `execute` call ('' before the first one). */
 currentTask: string
 /** LLM configuration; empty strings until the controller has finished `init()`. */
 config: LLMConfig
 // Actions
 /** Record the task, clear history, then await the controller's `execute`. */
 execute: (task: string) => Promise<void>
 /** Ask the controller to stop; a no-op when no controller is attached yet. */
 stop: () => void
 /** Await the controller's `configure`, then mirror the new config into state. */
 configure: (config: LLMConfig) => Promise<void>
}
/**
 * React hook exposing the AgentController to components.
 *
 * On mount it obtains the controller via `getAgentController()`, initializes
 * it, mirrors its `statuschange` / `historychange` / `activity` events into
 * React state, and answers `shouldShowMask` queries arriving through
 * `chrome.runtime.onMessage`. On unmount it detaches every listener and
 * disposes the controller.
 *
 * @returns Current agent state plus `execute`, `stop` and `configure` actions.
 */
export function useAgent(): UseAgentResult {
	const controllerRef = useRef<AgentController | null>(null)
	const [status, setStatus] = useState<AgentStatus>('idle')
	const [history, setHistory] = useState<HistoricalEvent[]>([])
	const [activity, setActivity] = useState<AgentActivity | null>(null)
	const [currentTask, setCurrentTask] = useState('')
	const [config, setConfig] = useState<LLMConfig>({
		apiKey: '',
		baseURL: '',
		model: '',
	})

	// Initialize controller and subscribe to events
	useEffect(() => {
		const controller = getAgentController()
		controllerRef.current = controller

		// Flipped by the cleanup so a late init() result is not applied after
		// the effect has been torn down and the controller disposed.
		let active = true

		controller
			.init()
			.then(() => {
				if (active) setConfig(controller.getConfig())
			})
			.catch((error: unknown) => {
				// Without this handler a failed init surfaces only as an
				// unhandled promise rejection.
				console.error('[useAgent] Failed to initialize controller:', error)
			})

		// Event handlers: each controller event carries its payload in `detail`
		const handleStatusChange = (e: Event) => {
			const newStatus = (e as CustomEvent<AgentStatus>).detail
			setStatus(newStatus)
			// Activity is transient; drop it once the run is no longer in progress
			if (newStatus === 'idle' || newStatus === 'completed' || newStatus === 'error') {
				setActivity(null)
			}
		}

		const handleHistoryChange = (e: Event) => {
			const newHistory = (e as CustomEvent<HistoricalEvent[]>).detail
			// Copy so React sees a new array reference on every update
			setHistory([...newHistory])
		}

		const handleActivity = (e: Event) => {
			setActivity((e as CustomEvent<AgentActivity>).detail)
		}

		controller.addEventListener('statuschange', handleStatusChange)
		controller.addEventListener('historychange', handleHistoryChange)
		controller.addEventListener('activity', handleActivity)

		// Handle shouldShowMask queries from content scripts.
		// Returns true only for messages this handler has answered.
		const handleMessage = (
			message: unknown,
			_sender: chrome.runtime.MessageSender,
			sendResponse: (response?: unknown) => void
		): boolean => {
			if (!isExtensionMessage(message)) return false
			if (message.type !== 'cs:query') return false

			const query = message as CSQueryMessage
			if (query.queryType !== 'shouldShowMask') return false

			const ctrl = controllerRef.current
			if (!ctrl) {
				sendResponse(false)
				return true
			}

			// The mask is shown only on the managed tab that is the current
			// target while a task is running.
			const isManaged = ctrl.isTabManaged(query.tabId)
			const isCurrent = ctrl.getCurrentTabId() === query.tabId
			const isRunning = ctrl.status === 'running'
			const shouldShow = isManaged && isCurrent && isRunning

			console.debug('[useAgent] shouldShowMask query:', {
				tabId: query.tabId,
				isManaged,
				isCurrent,
				isRunning,
				shouldShow,
			})

			sendResponse(shouldShow)
			return true
		}

		chrome.runtime.onMessage.addListener(handleMessage)

		// Cleanup
		return () => {
			active = false
			controller.removeEventListener('statuschange', handleStatusChange)
			controller.removeEventListener('historychange', handleHistoryChange)
			controller.removeEventListener('activity', handleActivity)
			chrome.runtime.onMessage.removeListener(handleMessage)
			// NOTE(review): if getAgentController() hands out a shared instance,
			// disposing it here affects any later mount that reuses it — confirm.
			controller.dispose()
		}
	}, [])

	const execute = useCallback(async (task: string) => {
		const controller = controllerRef.current
		if (!controller) return

		// Reset the visible state before the run starts producing events
		setCurrentTask(task)
		setHistory([])
		await controller.execute(task)
	}, [])

	const stop = useCallback(() => {
		controllerRef.current?.stop()
	}, [])

	const configure = useCallback(async (newConfig: LLMConfig) => {
		const controller = controllerRef.current
		if (!controller) return

		await controller.configure(newConfig)
		setConfig(newConfig)
	}, [])

	return {
		status,
		history,
		activity,
		currentTask,
		config,
		execute,
		stop,
		configure,
	}
}

View File

@@ -1,98 +0,0 @@
/**
* Agent Event Broadcasting
*
* This module handles broadcasting agent events from Background to SidePanel.
* Uses chrome.runtime API for broadcasting to all extension contexts.
*/
import type { AgentActivity, AgentState, AgentStatus, HistoricalEvent } from './protocol'
// Event type constants: the `type` discriminators carried by broadcast
// messages, one per kind of agent update.
const EVENT_TYPES = {
 STATUS: 'event:status',
 HISTORY: 'event:history',
 ACTIVITY: 'event:activity',
 STATE_SNAPSHOT: 'event:stateSnapshot',
} as const
// Union of the string literals above ('event:status' | 'event:history' | ...)
type EventType = (typeof EVENT_TYPES)[keyof typeof EVENT_TYPES]
// Envelope sent over chrome.runtime messaging: a discriminator plus its payload
interface EventMessage<T = unknown> {
 type: EventType
 payload: T
}
/**
 * Send one event envelope to every listening extension context
 * (sidepanel, popup, etc.) via chrome.runtime messaging.
 *
 * Fire-and-forget: a rejected send is deliberately swallowed, since it is
 * expected whenever no context is currently listening.
 */
function broadcast<T>(type: EventType, payload: T): void {
	const envelope: EventMessage<T> = { type, payload }
	const ignoreMissingReceiver = (): void => {
		// Ignore errors when no listeners are active
	}
	chrome.runtime.sendMessage(envelope).catch(ignoreMissingReceiver)
}
/**
 * Typed front-end over `broadcast` for agent state updates.
 * Each member wraps its argument in the matching event type and sends it
 * to all extension contexts.
 */
export const eventBroadcaster = {
	/** Announce a status change */
	status: (status: AgentStatus): void => {
		broadcast(EVENT_TYPES.STATUS, status)
	},

	/** Announce an updated history list */
	history: (history: HistoricalEvent[]): void => {
		broadcast(EVENT_TYPES.HISTORY, history)
	},

	/** Announce a transient activity */
	activity: (activity: AgentActivity): void => {
		broadcast(EVENT_TYPES.ACTIVITY, activity)
	},

	/** Announce a complete state snapshot */
	stateSnapshot: (state: AgentState): void => {
		broadcast(EVENT_TYPES.STATE_SNAPSHOT, state)
	},
}
/**
 * Event listener type for SidePanel.
 *
 * Every callback is optional; `subscribeToEvents` invokes only the ones that
 * are present, passing the payload of the matching event message.
 */
export interface EventListener {
 /** Called with the payload of an 'event:status' message. */
 onStatus?: (status: AgentStatus) => void
 /** Called with the payload of an 'event:history' message. */
 onHistory?: (history: HistoricalEvent[]) => void
 /** Called with the payload of an 'event:activity' message. */
 onActivity?: (activity: AgentActivity) => void
 /** Called with the payload of an 'event:stateSnapshot' message. */
 onStateSnapshot?: (state: AgentState) => void
}
/**
 * Attach a chrome.runtime.onMessage handler that routes agent event
 * messages to the matching optional callback on `listener`.
 *
 * Messages whose `type` is not one of the known event types are ignored.
 *
 * @param listener Callbacks to invoke; looked up on each message.
 * @returns A function that detaches the handler again.
 */
export function subscribeToEvents(listener: EventListener): () => void {
	const onRuntimeMessage = (message: EventMessage) => {
		const kind = message.type

		if (kind === EVENT_TYPES.STATUS) {
			listener.onStatus?.(message.payload as AgentStatus)
			return
		}
		if (kind === EVENT_TYPES.HISTORY) {
			listener.onHistory?.(message.payload as HistoricalEvent[])
			return
		}
		if (kind === EVENT_TYPES.ACTIVITY) {
			listener.onActivity?.(message.payload as AgentActivity)
			return
		}
		if (kind === EVENT_TYPES.STATE_SNAPSHOT) {
			listener.onStateSnapshot?.(message.payload as AgentState)
		}
	}

	chrome.runtime.onMessage.addListener(onRuntimeMessage)

	const unsubscribe = (): void => {
		chrome.runtime.onMessage.removeListener(onRuntimeMessage)
	}
	return unsubscribe
}

View File

@@ -3,4 +3,3 @@
*/ */
export * from './protocol' export * from './protocol'
export * from './rpc' export * from './rpc'
export * from './events'

View File

@@ -1,15 +1,19 @@
/** /**
* Message Protocol for PageAgentExt * Message Protocol for PageAgentExt
* *
* This file defines all message types for cross-context communication: * NEW ARCHITECTURE (MV3 compliant):
* - RPC: Background <-> ContentScript (PageController remote calls) * - SidePanel hosts the agent, all state lives there
* - Commands: SidePanel -> Background (user actions) * - Background (SW) is a stateless message relay
* - Events: Background -> SidePanel (agent state updates) * - Content Script runs PageController
*
* Message flows:
* 1. RPC: SidePanel → SW → ContentScript → SW → SidePanel (PageController calls)
* 2. Query: ContentScript → SW → SidePanel → SW → ContentScript (mask state check)
* 3. Events: SW → SidePanel (tab events from chrome.tabs API)
*/ */
import { defineExtensionMessaging } from '@webext-core/messaging'
// ============================================================================ // ============================================================================
// Shared Types (re-exported from core packages for convenience) // Shared Types
// ============================================================================ // ============================================================================
/** Action result from PageController operations */ /** Action result from PageController operations */
@@ -42,146 +46,138 @@ export interface ScrollHorizontallyOptions {
index?: number index?: number
} }
/** Agent execution status */ // ============================================================================
export type AgentStatus = 'idle' | 'running' | 'completed' | 'error' // Message Types
// ============================================================================
/** Agent activity for real-time UI feedback */ /** Message type identifier */
export type AgentActivity = type MessageType =
| { type: 'thinking' } | 'rpc:call' // SidePanel → SW: RPC call to content script
| { type: 'executing'; tool: string; input: unknown } | 'rpc:response' // SW → SidePanel: RPC response from content script
| { type: 'executed'; tool: string; input: unknown; output: string; duration: number } | 'cs:rpc' // SW → ContentScript: Forwarded RPC call
| { type: 'retrying'; attempt: number; maxAttempts: number } | 'cs:query' // ContentScript → SW: Query to sidepanel
| { type: 'error'; message: string } | 'query:response' // SW → ContentScript: Query response
| 'tab:event' // SW → SidePanel: Tab event notification
/** Historical event (simplified for serialization) */ /** Base message structure */
export interface HistoricalEvent { interface BaseMessage {
type: 'step' | 'observation' | 'user_takeover' | 'retry' | 'error' type: MessageType
// For 'step' type id: string // Unique message ID for request-response matching
stepIndex?: number }
reflection?: {
evaluation_previous_goal?: string // ============================================================================
memory?: string // RPC Messages (SidePanel ↔ SW ↔ ContentScript)
next_goal?: string // ============================================================================
/** RPC method names matching PageController interface */
export type RPCMethod =
| 'getCurrentUrl'
| 'getLastUpdateTime'
| 'getBrowserState'
| 'updateTree'
| 'cleanUpHighlights'
| 'clickElement'
| 'inputText'
| 'selectOption'
| 'scroll'
| 'scrollHorizontally'
| 'executeJavascript'
| 'showMask'
| 'hideMask'
| 'dispose'
/**
 * SidePanel → SW: Request to call PageController method.
 *
 * The inherited `id` is what the matching 'rpc:response' echoes back so the
 * caller can pair the response with this request.
 */
export interface RPCCallMessage extends BaseMessage {
 type: 'rpc:call'
 /** Tab the call is addressed to. */
 tabId: number
 /** Name of the PageController method to invoke. */
 method: RPCMethod
 /** Positional arguments for `method`; empty array when it takes none. */
 args: unknown[]
}
/**
 * SW → SidePanel: Response from PageController.
 *
 * `id` equals the id of the 'rpc:call' being answered.
 */
export interface RPCResponseMessage extends BaseMessage {
 type: 'rpc:response'
 /** True when the call succeeded; selects which of the fields below is meaningful. */
 success: boolean
 /** Value produced by the call (used when `success` is true). */
 result?: unknown
 /** Failure description (used when `success` is false). */
 error?: string
}
/** SW → ContentScript: Forwarded RPC call */
export interface CSRPCMessage extends BaseMessage {
type: 'cs:rpc'
method: RPCMethod
args: unknown[]
}
// ============================================================================
// Query Messages (ContentScript → SW → SidePanel)
// ============================================================================
/** Query types that content script can ask */
export type QueryType = 'shouldShowMask'
/** ContentScript → SW: Query to sidepanel */
export interface CSQueryMessage extends BaseMessage {
type: 'cs:query'
queryType: QueryType
tabId: number
}
/** SW → ContentScript: Query response */
export interface QueryResponseMessage extends BaseMessage {
type: 'query:response'
result: unknown
}
// ============================================================================
// Tab Event Messages (SW → SidePanel)
// ============================================================================
/** Tab event types */
export type TabEventType = 'removed' | 'updated'
/** SW → SidePanel: Tab event notification */
export interface TabEventMessage extends BaseMessage {
type: 'tab:event'
eventType: TabEventType
tabId: number
data?: {
// For 'updated' events
status?: string
url?: string
} }
action?: {
name: string
input: unknown
output: string
}
// For 'observation' type
content?: string
// For 'retry' type
attempt?: number
maxAttempts?: number
// For 'error' and 'retry' types
message?: string
// Raw LLM response for debugging (step and error types)
rawResponse?: unknown
}
/** Agent state snapshot */
export interface AgentState {
status: AgentStatus
task: string
history: HistoricalEvent[]
} }
// ============================================================================ // ============================================================================
// RPC Protocol: Background <-> ContentScript // Union Types
// Used by RemotePageController to call PageController methods
// ============================================================================ // ============================================================================
export interface PageControllerRPCProtocol { /** All message types */
// State queries export type ExtensionMessage =
'rpc:getCurrentUrl': () => string | RPCCallMessage
'rpc:getLastUpdateTime': () => number | RPCResponseMessage
'rpc:getBrowserState': () => BrowserState | CSRPCMessage
| CSQueryMessage
| QueryResponseMessage
| TabEventMessage
// DOM operations // ============================================================================
'rpc:updateTree': () => string // Utility Functions
'rpc:cleanUpHighlights': () => void // ============================================================================
// Element actions /** Generate unique message ID */
'rpc:clickElement': (index: number) => ActionResult export function generateMessageId(): string {
'rpc:inputText': (data: { index: number; text: string }) => ActionResult return `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
'rpc:selectOption': (data: { index: number; optionText: string }) => ActionResult
'rpc:scroll': (options: ScrollOptions) => ActionResult
'rpc:scrollHorizontally': (options: ScrollHorizontallyOptions) => ActionResult
'rpc:executeJavascript': (script: string) => ActionResult
// Mask operations
'rpc:showMask': () => void
'rpc:hideMask': () => void
// Lifecycle
'rpc:dispose': () => void
} }
// ============================================================================ /** Type guard for our messages */
// Command Protocol: SidePanel -> Background export function isExtensionMessage(msg: unknown): msg is ExtensionMessage {
// Used by SidePanel UI to control the agent return (
// ============================================================================ typeof msg === 'object' &&
msg !== null &&
export interface AgentCommandProtocol { 'type' in msg &&
// Task control 'id' in msg &&
'agent:execute': (task: string) => void typeof (msg as ExtensionMessage).type === 'string' &&
'agent:stop': () => void typeof (msg as ExtensionMessage).id === 'string'
)
// State queries
'agent:getState': () => AgentState
// Configuration
'agent:configure': (config: { apiKey: string; baseURL: string; model: string }) => void
} }
// ============================================================================
// Content Script Query Protocol: ContentScript -> Background
// Used by ContentScript to query Background state
// ============================================================================
export interface ContentScriptQueryProtocol {
/** Check if there's an active task for this tab, returns true if mask should be shown */
'content:shouldShowMask': () => boolean
/** Report content script initialization error to background */
'content:error': (error: { message: string; url: string }) => void
}
// ============================================================================
// Event Protocol: Background -> SidePanel
// Used by Background to push updates to SidePanel
// ============================================================================
export interface AgentEventProtocol {
'event:status': (status: AgentStatus) => void
'event:history': (history: HistoricalEvent[]) => void
'event:activity': (activity: AgentActivity) => void
'event:stateSnapshot': (state: AgentState) => void
}
// ============================================================================
// Messaging Instances
// ============================================================================
/**
* RPC messaging for PageController remote calls
* Background sends, ContentScript receives
*/
export const pageControllerRPC = defineExtensionMessaging<PageControllerRPCProtocol>()
/**
* Command messaging for agent control
* SidePanel sends, Background receives
*/
export const agentCommands = defineExtensionMessaging<AgentCommandProtocol>()
/**
* Event messaging for agent updates
* Background sends, SidePanel receives
*/
export const agentEvents = defineExtensionMessaging<AgentEventProtocol>()
/**
* Content script query messaging
* ContentScript sends, Background receives
*/
export const contentScriptQuery = defineExtensionMessaging<ContentScriptQueryProtocol>()

View File

@@ -1,38 +1,75 @@
/** /**
* RPC utilities for PageController remote calls * RPC Client for PageController remote calls
* *
* This module provides helper functions for making RPC calls * This module provides RPC functionality from SidePanel to ContentScript
* from Background to ContentScript with proper error handling. * via the Background (SW) relay.
*
* Flow: SidePanel → SW (relay) → ContentScript → SW → SidePanel
*/ */
import { pageControllerRPC } from './protocol' import {
import type { type ActionResult,
ActionResult, type BrowserState,
BrowserState, type RPCCallMessage,
ScrollHorizontallyOptions, type RPCMethod,
ScrollOptions, type RPCResponseMessage,
type ScrollHorizontallyOptions,
type ScrollOptions,
generateMessageId,
isExtensionMessage,
} from './protocol' } from './protocol'
/** RPC call configuration */ /** RPC configuration */
const RPC_CONFIG = { const RPC_CONFIG = {
/** Maximum retry attempts for transient failures */ /** Maximum retry attempts for transient failures */
maxRetries: 3, maxRetries: 3,
/** Base delay between retries in ms (exponential backoff) */ /** Base delay between retries in ms (exponential backoff) */
retryDelayMs: 500, retryDelayMs: 500,
/** Timeout for waiting for content script to be ready */ /** Timeout for individual RPC call in ms */
readyTimeoutMs: 5000, callTimeoutMs: 30000,
} }
/** /** Pending RPC calls waiting for response */
* Error thrown when RPC call fails due to tab/content script issues const pendingCalls = new Map<
*/ string,
export class RPCError extends Error { {
constructor( resolve: (value: unknown) => void
message: string, reject: (error: Error) => void
public readonly code: 'TAB_CLOSED' | 'CONTENT_SCRIPT_NOT_READY' | 'RPC_FAILED' timeout: ReturnType<typeof setTimeout>
) {
super(message)
this.name = 'RPCError'
} }
>()
/** Whether the response listener is registered */
let listenerRegistered = false
/**
* Register the RPC response listener (called once)
*/
function ensureResponseListener(): void {
if (listenerRegistered) return
listenerRegistered = true
chrome.runtime.onMessage.addListener((message: unknown) => {
if (!isExtensionMessage(message)) return
if (message.type !== 'rpc:response') return
const response = message as RPCResponseMessage
const pending = pendingCalls.get(response.id)
if (!pending) {
console.debug('[RPC] Received response for unknown call:', response.id)
return
}
pendingCalls.delete(response.id)
clearTimeout(pending.timeout)
if (response.success) {
pending.resolve(response.result)
} else {
pending.reject(new Error(response.error || 'RPC call failed'))
}
})
console.debug('[RPC] Response listener registered')
} }
/** /**
@@ -55,167 +92,97 @@ async function tabExists(tabId: number): Promise<boolean> {
} }
/** /**
* Wrap an RPC call with error handling and retry logic * Error thrown when RPC call fails
*/ */
async function withRetry<T>(tabId: number, operation: string, fn: () => Promise<T>): Promise<T> { export class RPCError extends Error {
constructor(
message: string,
public readonly code: 'TAB_CLOSED' | 'CONTENT_SCRIPT_NOT_READY' | 'RPC_FAILED' | 'TIMEOUT'
) {
super(message)
this.name = 'RPCError'
}
}
/**
 * Issue one RPC request and wait for its response, without retrying.
 *
 * The request is registered in `pendingCalls` under a fresh id before it is
 * sent; the shared response listener settles it when a matching
 * 'rpc:response' arrives. The returned promise rejects with an
 * `RPCError('TIMEOUT')` if no response arrives within
 * `RPC_CONFIG.callTimeoutMs`, or with the raw send error if
 * `chrome.runtime.sendMessage` itself rejects.
 */
async function callOnce(tabId: number, method: RPCMethod, args: unknown[]): Promise<unknown> {
	ensureResponseListener()

	const id = generateMessageId()
	const request: RPCCallMessage = { type: 'rpc:call', id, tabId, method, args }

	return new Promise((resolve, reject) => {
		// No response in time: forget the call and fail it
		const onTimeout = (): void => {
			pendingCalls.delete(id)
			reject(new RPCError(`RPC ${method} timed out`, 'TIMEOUT'))
		}
		const timeout = setTimeout(onTimeout, RPC_CONFIG.callTimeoutMs)

		// Register before sending so the response listener can find the entry
		pendingCalls.set(id, { resolve, reject, timeout })

		// Sending failed outright: undo the registration and surface the error as-is
		const onSendFailure = (error: Error): void => {
			pendingCalls.delete(id)
			clearTimeout(timeout)
			reject(error)
		}
		chrome.runtime.sendMessage(request).catch(onSendFailure)
	})
}
/**
* Make an RPC call with retry logic
*/
async function call(tabId: number, method: RPCMethod, args: unknown[]): Promise<unknown> {
let lastError: Error | null = null let lastError: Error | null = null
for (let attempt = 0; attempt < RPC_CONFIG.maxRetries; attempt++) { for (let attempt = 0; attempt < RPC_CONFIG.maxRetries; attempt++) {
try { try {
return await fn() return await callOnce(tabId, method, args)
} catch (error) { } catch (error) {
lastError = error as Error lastError = error as Error
const message = lastError.message || String(error) const message = lastError.message || String(error)
// Check if tab still exists // Check if tab still exists
if (!(await tabExists(tabId))) { if (!(await tabExists(tabId))) {
throw new RPCError(`Tab ${tabId} was closed during ${operation}`, 'TAB_CLOSED') throw new RPCError(`Tab ${tabId} was closed`, 'TAB_CLOSED')
} }
// Check for content script not ready errors // Check for retryable errors
if ( if (
message.includes('Could not establish connection') || message.includes('Could not establish connection') ||
message.includes('Receiving end does not exist') message.includes('Receiving end does not exist') ||
message.includes('content script not ready')
) { ) {
console.log( const delay = RPC_CONFIG.retryDelayMs * Math.pow(2, attempt)
`[RPC] Content script not ready for ${operation}, attempt ${attempt + 1}/${RPC_CONFIG.maxRetries}` console.debug(
`[RPC] Retry ${attempt + 1}/${RPC_CONFIG.maxRetries} for ${method}, waiting ${delay}ms`
) )
// Wait before retry with exponential backoff await sleep(delay)
await sleep(RPC_CONFIG.retryDelayMs * Math.pow(2, attempt))
continue continue
} }
// For other errors, throw immediately // Non-retryable error
throw new RPCError(`RPC ${operation} failed: ${message}`, 'RPC_FAILED') throw lastError
} }
} }
// All retries exhausted
throw new RPCError( throw new RPCError(
`Content script not ready after ${RPC_CONFIG.maxRetries} attempts for ${operation}`, `Content script not ready after ${RPC_CONFIG.maxRetries} attempts for ${method}`,
'CONTENT_SCRIPT_NOT_READY' 'CONTENT_SCRIPT_NOT_READY'
) )
} }
/** /**
* Create an RPC client bound to a specific tab. * RPC client interface matching PageController methods
* The tabId is captured at creation time to ensure messages are sent to the correct tab
* even if the user switches tabs or the page loses focus.
*/ */
export function createRPCClient(tabIdPromise: Promise<number>): RPCClient {
return {
// State queries
async getCurrentUrl(): Promise<string> {
const tabId = await tabIdPromise
return withRetry(tabId, 'getCurrentUrl', () =>
pageControllerRPC.sendMessage('rpc:getCurrentUrl', undefined, tabId)
)
},
async getLastUpdateTime(): Promise<number> {
const tabId = await tabIdPromise
return withRetry(tabId, 'getLastUpdateTime', () =>
pageControllerRPC.sendMessage('rpc:getLastUpdateTime', undefined, tabId)
)
},
async getBrowserState(): Promise<BrowserState> {
const tabId = await tabIdPromise
return withRetry(tabId, 'getBrowserState', () =>
pageControllerRPC.sendMessage('rpc:getBrowserState', undefined, tabId)
)
},
// DOM operations
async updateTree(): Promise<string> {
const tabId = await tabIdPromise
return withRetry(tabId, 'updateTree', () =>
pageControllerRPC.sendMessage('rpc:updateTree', undefined, tabId)
)
},
async cleanUpHighlights(): Promise<void> {
const tabId = await tabIdPromise
return withRetry(tabId, 'cleanUpHighlights', () =>
pageControllerRPC.sendMessage('rpc:cleanUpHighlights', undefined, tabId)
)
},
// Element actions
async clickElement(index: number): Promise<ActionResult> {
const tabId = await tabIdPromise
return withRetry(tabId, 'clickElement', () =>
pageControllerRPC.sendMessage('rpc:clickElement', index, tabId)
)
},
async inputText(index: number, text: string): Promise<ActionResult> {
const tabId = await tabIdPromise
return withRetry(tabId, 'inputText', () =>
pageControllerRPC.sendMessage('rpc:inputText', { index, text }, tabId)
)
},
async selectOption(index: number, optionText: string): Promise<ActionResult> {
const tabId = await tabIdPromise
return withRetry(tabId, 'selectOption', () =>
pageControllerRPC.sendMessage('rpc:selectOption', { index, optionText }, tabId)
)
},
async scroll(options: ScrollOptions): Promise<ActionResult> {
const tabId = await tabIdPromise
return withRetry(tabId, 'scroll', () =>
pageControllerRPC.sendMessage('rpc:scroll', options, tabId)
)
},
async scrollHorizontally(options: ScrollHorizontallyOptions): Promise<ActionResult> {
const tabId = await tabIdPromise
return withRetry(tabId, 'scrollHorizontally', () =>
pageControllerRPC.sendMessage('rpc:scrollHorizontally', options, tabId)
)
},
async executeJavascript(script: string): Promise<ActionResult> {
const tabId = await tabIdPromise
return withRetry(tabId, 'executeJavascript', () =>
pageControllerRPC.sendMessage('rpc:executeJavascript', script, tabId)
)
},
// Mask operations
async showMask(): Promise<void> {
const tabId = await tabIdPromise
return withRetry(tabId, 'showMask', () =>
pageControllerRPC.sendMessage('rpc:showMask', undefined, tabId)
)
},
async hideMask(): Promise<void> {
const tabId = await tabIdPromise
// Don't retry hideMask - if content script is gone, mask is already hidden
try {
return await pageControllerRPC.sendMessage('rpc:hideMask', undefined, tabId)
} catch {
// Ignore errors - mask is effectively hidden if content script is gone
}
},
// Lifecycle
async dispose(): Promise<void> {
const tabId = await tabIdPromise
// Don't retry dispose - best effort cleanup
try {
return await pageControllerRPC.sendMessage('rpc:dispose', undefined, tabId)
} catch {
// Ignore errors - resources are already cleaned up if content script is gone
}
},
}
}
export interface RPCClient { export interface RPCClient {
tabId: number
getCurrentUrl(): Promise<string> getCurrentUrl(): Promise<string>
getLastUpdateTime(): Promise<number> getLastUpdateTime(): Promise<number>
getBrowserState(): Promise<BrowserState> getBrowserState(): Promise<BrowserState>
@@ -231,3 +198,80 @@ export interface RPCClient {
hideMask(): Promise<void> hideMask(): Promise<void>
dispose(): Promise<void> dispose(): Promise<void>
} }
/**
 * Build an RPCClient whose every method targets the given tab.
 *
 * All methods go through the retrying `call`, except `hideMask` and
 * `dispose`, which make a single attempt and log (rather than throw) on
 * failure.
 */
export function createRPCClient(tabId: number): RPCClient {
	console.debug(`[RPC] Creating client for tab ${tabId}`)

	// Retrying call against this client's tab; the result type is asserted, not checked
	const invoke = <T>(method: RPCMethod, ...args: unknown[]): Promise<T> =>
		call(tabId, method, args) as Promise<T>

	return {
		tabId,

		getCurrentUrl: async (): Promise<string> => invoke<string>('getCurrentUrl'),

		getLastUpdateTime: async (): Promise<number> => invoke<number>('getLastUpdateTime'),

		getBrowserState: async (): Promise<BrowserState> => invoke<BrowserState>('getBrowserState'),

		updateTree: async (): Promise<string> => invoke<string>('updateTree'),

		cleanUpHighlights: async (): Promise<void> => {
			await invoke<unknown>('cleanUpHighlights')
		},

		clickElement: async (index: number): Promise<ActionResult> =>
			invoke<ActionResult>('clickElement', index),

		inputText: async (index: number, text: string): Promise<ActionResult> =>
			invoke<ActionResult>('inputText', index, text),

		selectOption: async (index: number, optionText: string): Promise<ActionResult> =>
			invoke<ActionResult>('selectOption', index, optionText),

		scroll: async (options: ScrollOptions): Promise<ActionResult> =>
			invoke<ActionResult>('scroll', options),

		scrollHorizontally: async (options: ScrollHorizontallyOptions): Promise<ActionResult> =>
			invoke<ActionResult>('scrollHorizontally', options),

		executeJavascript: async (script: string): Promise<ActionResult> =>
			invoke<ActionResult>('executeJavascript', script),

		showMask: async (): Promise<void> => {
			await invoke<unknown>('showMask')
		},

		hideMask: async (): Promise<void> => {
			// Best effort - don't throw if content script is gone
			try {
				await callOnce(tabId, 'hideMask', [])
			} catch (e) {
				console.debug('[RPC] hideMask failed (ignored):', e)
			}
		},

		dispose: async (): Promise<void> => {
			// Best effort - don't throw if content script is gone
			try {
				await callOnce(tabId, 'dispose', [])
			} catch (e) {
				console.debug('[RPC] dispose failed (ignored):', e)
			}
		},
	}
}

View File

@@ -1,208 +1,186 @@
# PageAgentExt Architecture # PageAgentExt Architecture
This document describes the architecture of the Chrome extension version of PageAgent, including environment definitions, communication protocols, and extension considerations. This document describes the MV3-compliant architecture of the Chrome extension version of PageAgent.
## Design Principles
The architecture follows Chrome MV3 Service Worker constraints:
1. **Service Worker is stateless** - No long-running loops, no in-memory state
2. **Agent runs in frontend context** - SidePanel hosts all agent logic
3. **SW is a message relay** - Only forwards messages between contexts
4. **Event-driven** - All operations are triggered by user actions or message events
## Environment Definitions ## Environment Definitions
The extension operates across three isolated JavaScript contexts: The extension operates across three isolated JavaScript contexts:
### 1. Background (Service Worker) ### 1. Side Panel (Frontend - Agent Host)
**File:** `src/entrypoints/background.ts`
**Responsibilities:**
- Hosts the headless `PageAgentCore` instance
- Manages agent lifecycle (create, execute, stop, dispose)
- Stores LLM configuration in `chrome.storage.local`
- Receives commands from SidePanel via messaging
- Broadcasts events to SidePanel for UI updates
- Uses `RemotePageController` to proxy DOM operations to ContentScript
**Key Components:**
- `PageAgentCore` - The AI agent (from `@page-agent/core`)
- `RemotePageController` - Proxy that forwards calls to ContentScript
- Command handlers for `agent:execute`, `agent:stop`, `agent:configure`
### 2. Content Script
**File:** `src/entrypoints/content.ts`
**Responsibilities:**
- Runs in the context of web pages
- Hosts the real `PageController` instance (lazy-initialized)
- Performs actual DOM operations (click, input, scroll, etc.)
- Responds to RPC messages from Background
- Manages visual mask overlay during automation
**Key Components:**
- `PageController` - DOM controller (from `@page-agent/page-controller`)
- RPC handlers for all PageController methods
**Lifecycle:** PageController is created lazily on first RPC call and disposed between tasks. This ensures clean state for each task and enables future multi-page support.
### 3. Side Panel (React UI)
**Files:** `src/entrypoints/sidepanel/` **Files:** `src/entrypoints/sidepanel/`
**Responsibilities:** **Responsibilities:**
- Provides user interface for controlling the agent - Hosts `PageAgentCore` instance and main execution loop
- Displays task input and execution history - Manages `TabsManager` for multi-tab control
- Shows real-time agent activity (thinking, executing, etc.) - Uses `RemotePageController` to proxy DOM operations via SW
- Manages LLM configuration settings - Stores agent state (task, history, status)
- Sends commands to Background and receives event updates - Provides React UI for user interaction
- Handles `shouldShowMask` queries from content scripts
**Key Components:** **Key Components:**
- `App.tsx` - Main React component with chat-style UI - `AgentController` - Encapsulates agent lifecycle, isolated from UI
- `ConfigPanel` - Settings form for LLM configuration - `useAgent` hook - React integration for AgentController
- Event subscription for real-time updates - `App.tsx` - Main UI component
- `ConfigPanel` - LLM settings
## Communication Architecture **Lifecycle:** When the sidepanel closes, the agent is disposed along with it. No state persists in the SW.
### 2. Background (Service Worker - Stateless Relay)
**File:** `src/entrypoints/background.ts`
**Responsibilities:**
- Relays RPC messages from SidePanel to ContentScript
- Forwards tab events (onRemoved, onUpdated) to SidePanel
- Opens sidepanel on action click
- **NO** agent logic, **NO** state
**Message Flows:**
```
SidePanel → SW → ContentScript (RPC calls)
ContentScript → SW → SidePanel (mask state queries)
SW → SidePanel (tab events)
```
### 3. Content Script
**File:** `src/entrypoints/content.ts`
**Responsibilities:**
- Runs in web page context
- Hosts real `PageController` instance (lazy-initialized)
- Handles RPC messages for DOM operations
- Queries SidePanel for mask state on page load
- Manages visual mask overlay
**Lifecycle:** PageController is created on first RPC call and disposed between tasks.
## Architecture Diagram
``` ```
┌─────────────────────────────────────────────────────────────────┐ ┌─────────────────────────────────────────────────────────────────┐
Side Panel │ Side Panel (Frontend)
│ ┌──────────────┐ ┌──────────────┐ ┌───────────────────────┐ │ │ ┌────────────────────────────────────────────────────────────┐ │
│ │ Task Input │ Event Stream │ History Display │ │ │ │ AgentController │ │
─────────────┘ └───────────── └─────────────────────── │ ┌──────────────┐ ┌────────────── ──────────────────┐ │
└─────────┼─────────────────┼─────────────────────────────────────┘ │ │ │ PageAgentCore│ │ TabsManager │ │RemotePageController│ │ │
│ Commands │ Events └──────────────┘ └──────────────┘ └────────┬─────────┘ │ │
└───────────────────────────────────────────────┼────────────┘
│ │ │
│ ┌──────────────┐ ┌──────────────┐ │ │
│ │ React UI │ │ Query Handler│◄─────────────┼───────────┐ │
│ │ (App.tsx) │ │(shouldShowMask) │ │ │
│ └──────────────┘ └──────────────┘ │ │ │
└──────────────────────────────────────────────────┼───────────┼───┘
│ │
RPC Call │ Query │
▼ │
┌─────────────────────────────────────────────────────────────────┐ ┌─────────────────────────────────────────────────────────────────┐
Background │ Background (Service Worker)
┌──────────────────────────────────────────────────────────┐
PageAgentCore ┌────────────────┐
│ ┌─────────────┐ ┌─────────────┐ ┌──────────────────┐ │ Message Relay │
LLM Tools │ │ RemotePageCtrl │ │ │ (stateless) │
│ └─────────────┘ └─────────────┘ └───────┬──────── └───────┬────────┘
└─────────────────────────────────────────────┼────────────┘
└────────────────────────────────────────────────┼──────────────── │ Tab Events ─────────────────┼─────────────────► SidePanel │
RPC (onRemoved, onUpdated)
└──────────────────────────────┼───────────────────────────────────┘
│ RPC Forward
┌─────────────────────────────────────────────────────────────────┐ ┌─────────────────────────────────────────────────────────────────┐
│ Content Script │ │ Content Script
│ ┌──────────────────────────────────────────────────────────┐ │ ┌────────────────────────────────────────────────────────────┐ │
│ │ PageController │ │ │ PageController │ │
│ │ ┌─────────────┐ ┌─────────────┐ ┌──────────────────┐ │ │ │ ┌─────────────┐ ┌─────────────┐ ┌──────────────────┐ │ │
│ │ │ DOM Tree │ │ Actions │ │ Mask │ │ │ │ │ DOM Tree │ │ Actions │ │ Mask │ │ │
│ │ └─────────────┘ └─────────────┘ └──────────────────┘ │ │ │ └─────────────┘ └─────────────┘ └──────────────────┘ │ │
│ └──────────────────────────────────────────────────────────┘ │ └────────────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────────┘ └─────────────────────────────────────────────────────────────────┘
┌───────────────┐ ┌───────────────┐
│ Web Page │ │ Web Page │
│ DOM │ │ DOM │
└───────────────┘ └───────────────┘
``` ```
## Message Protocol ## Message Protocol
All cross-context communication uses `@webext-core/messaging` for type safety. All messages use a simple type-based protocol defined in `src/messaging/protocol.ts`.
### Protocol Definition ### Message Types
**File:** `src/messaging/protocol.ts` | Type | Direction | Purpose |
|------|-----------|---------|
| `rpc:call` | SidePanel → SW | Request to call PageController method |
| `rpc:response` | SW → SidePanel | Response from PageController |
| `cs:rpc` | SW → ContentScript | Forwarded RPC call |
| `cs:query` | ContentScript → SW | Query to SidePanel (e.g., shouldShowMask) |
| `query:response` | SW → ContentScript | Response to query |
| `tab:event` | SW → SidePanel | Tab removed/updated notification |
### 1. RPC Protocol (Background → ContentScript) ### RPC Methods
Used by `RemotePageController` to call `PageController` methods. All PageController methods are available via RPC:
```typescript - State: `getCurrentUrl`, `getLastUpdateTime`, `getBrowserState`
interface PageControllerRPCProtocol { - DOM: `updateTree`, `cleanUpHighlights`
// State queries - Actions: `clickElement`, `inputText`, `selectOption`, `scroll`, `scrollHorizontally`, `executeJavascript`
'rpc:getCurrentUrl': () => string - Mask: `showMask`, `hideMask`
'rpc:getLastUpdateTime': () => number - Lifecycle: `dispose`
'rpc:getBrowserState': () => BrowserState
// DOM operations
'rpc:updateTree': () => string
'rpc:cleanUpHighlights': () => void
// Element actions
'rpc:clickElement': (index: number) => ActionResult
'rpc:inputText': (data: { index: number; text: string }) => ActionResult
'rpc:selectOption': (data: { index: number; optionText: string }) => ActionResult
'rpc:scroll': (options: ScrollOptions) => ActionResult
'rpc:scrollHorizontally': (options: ScrollHorizontallyOptions) => ActionResult
'rpc:executeJavascript': (script: string) => ActionResult
// Mask operations
'rpc:showMask': () => void
'rpc:hideMask': () => void
// Lifecycle
'rpc:dispose': () => void
}
```
### 2. Command Protocol (SidePanel → Background)
Used by SidePanel UI to control the agent.
```typescript
interface AgentCommandProtocol {
'agent:execute': (task: string) => void
'agent:stop': () => void
'agent:getState': () => AgentState
'agent:configure': (config: LLMConfig) => void
}
```
### 3. Event Protocol (Background → SidePanel)
Used by Background to push updates to SidePanel.
```typescript
interface AgentEventProtocol {
'event:status': (status: AgentStatus) => void
'event:history': (history: HistoricalEvent[]) => void
'event:activity': (activity: AgentActivity) => void
'event:stateSnapshot': (state: AgentState) => void
}
```
## Communication Flow ## Communication Flow
### Task Execution Flow ### Task Execution
``` ```
1. User enters task in SidePanel 1. User enters task in SidePanel
└─> SidePanel sends 'agent:execute' command └─> AgentController.execute(task)
2. Background receives command 2. AgentController creates agent instances
├─> Creates PageAgentCore with RemotePageController ├─> new PageAgentCore()
─> Starts task execution ─> new TabsManager()
└─> new RemotePageController()
3. Agent executes step loop: 3. Agent executes step loop:
├─> LLM generates next action ├─> LLM generates next action
├─> Agent calls RemotePageController method ├─> RemotePageController.method() called
│ └─> RPC message sent to ContentScript │ └─> RPC message → SW → ContentScript
├─> ContentScript executes on real PageController ├─> ContentScript executes on real PageController
│ └─> RPC response returned │ └─> Response → SW → SidePanel
├─> Agent updates history ├─> Agent updates history
└─> Background broadcasts events to SidePanel └─> React UI re-renders via events
4. SidePanel receives events 4. Task completes or user stops
└─> Updates UI (status, history, activity) └─> Agent disposes, status changes
5. Task completes or user stops
└─> Agent disposes, status changes to idle/completed/error
``` ```
### Configuration Flow ### Page Reload During Task
``` ```
1. User opens Settings in SidePanel 1. Page reloads/navigates
2. User enters API credentials 2. Content script initializes
3. SidePanel sends 'agent:configure' command 3. Content script queries: shouldShowMask?
4. Background saves config to chrome.storage.local └─> cs:query → SW → SidePanel
5. Next agent creation uses new config 4. SidePanel checks if tab is current + agent running
└─> query:response → SW → ContentScript
5. Content script shows/hides mask accordingly
``` ```
## File Structure ## File Structure
@@ -210,99 +188,85 @@ interface AgentEventProtocol {
``` ```
packages/extension/src/ packages/extension/src/
├── agent/ ├── agent/
── RemotePageController.ts # Proxy for PageController ── RemotePageController.ts # Proxy for PageController RPC
│ ├── TabsManager.ts # Multi-tab management
│ └── tabTools.ts # Agent tools for tab control
├── entrypoints/ ├── entrypoints/
│ ├── background.ts # Service worker │ ├── background.ts # Stateless SW relay
│ ├── content.ts # Content script │ ├── content.ts # Content script with PageController
│ └── sidepanel/ │ └── sidepanel/
│ ├── AgentController.ts # Agent lifecycle management
│ ├── useAgent.ts # React hook for agent
│ ├── App.tsx # Main UI component
│ ├── components/
│ │ ├── ConfigPanel.tsx
│ │ ├── cards/
│ │ └── index.tsx
│ ├── index.html │ ├── index.html
── main.tsx ── main.tsx
│ └── App.tsx # Main UI component
├── messaging/ ├── messaging/
│ ├── protocol.ts # Message type definitions │ ├── protocol.ts # Message type definitions
│ ├── rpc.ts # RPC client for PageController │ ├── rpc.ts # RPC client for SidePanel
── events.ts # Event broadcasting utilities ── index.ts
│ └── index.ts # Module exports
├── components/ui/ # shadcn components ├── components/ui/ # shadcn components
├── lib/utils.ts # Utility functions ├── lib/utils.ts
└── assets/index.css # Tailwind styles └── utils/constants.ts
``` ```
## Design Decisions ## Design Decisions
### Tab ID Binding ### Why Agent in SidePanel?
**Problem:** When a task completes while the page is not focused (user switched tabs), RPC messages like `hideMask` or `dispose` would be sent to the wrong tab because `chrome.tabs.query({ active: true })` returns the currently active tab, not the original target tab. MV3 Service Workers have strict lifecycle constraints:
- Terminate after ~30s of inactivity
- Cannot maintain long-running loops
- State is lost on termination
**Solution:** `RemotePageController` captures the target tab ID at construction time and binds it to its RPC client. All subsequent RPC calls use this fixed tab ID regardless of which tab is currently active. By hosting the agent in SidePanel (a visible frontend page), we get:
- Persistent execution while panel is open
- Natural disposal when panel closes
- No SW wake-up complexity
``` ### Agent Isolation from UI
Task starts → RemotePageController created → tabId captured (e.g., 123)
User switches to another tab (456 is now active)
Task completes → hideMask RPC sent to tab 123 (correct!)
```
### Lazy PageController Lifecycle `AgentController` is a separate class from the React UI for:
- **Testability** - Can test agent logic without React
- **Portability** - Future: move agent to popup, options page, or external page
- **Clean separation** - UI concerns don't pollute agent logic
**Problem:** PageController was created once when content script loaded and persisted until page unload. If the mask was disposed mid-task, subsequent tasks couldn't show it again. ### Simplified Messaging
**Solution:** PageController is now lazy-initialized on first RPC call and fully disposed between tasks. Each task gets a fresh PageController instance with its own mask. The previous architecture had complex retry/wake-up logic for the SW. The new architecture:
- SW is stateless, always ready
- No ping/wake-up needed
- Simple request-response pattern
- Retry logic only for content script initialization
``` ## Multi-Tab Control
Task 1: showMask → creates PageController + Mask → execute → hideMask → dispose → null
Task 2: showMask → creates new PageController + Mask → ...
```
This also prepares for future multi-page workflows where PageController may need to be recreated when navigating between pages. ### Tab Types
## Extension Considerations - **Initial Tab** - Where user started the task
- **Managed Tabs** - Tabs opened by agent via `open_new_tab`
### Current Limitations (v1) ### Tab Grouping
1. **Single page control only** - Agent controls the active tab where SidePanel was opened Agent-opened tabs are grouped in a Chrome tab group named `Task(<taskId>)`.
2. **No cross-tab navigation** - Cannot follow links that open in new tabs
3. **Session-based** - Agent state is not persisted across extension restarts
### Future Extension Points ### Tab Switching
#### Multi-tab Control Only the initial tab and managed tabs can be switched to. This prevents the agent from accessing unrelated tabs.
To support controlling multiple tabs: ## Configuration
1. Add `tabId` parameter to RPC messages LLM config (apiKey, baseURL, model) is stored in `chrome.storage.local`. This persists across sessions and is managed via the ConfigPanel.
2. Track tab-to-controller mapping in Background
3. Allow SidePanel to switch between controlled tabs
#### Persistent Sessions ## Security
To persist agent sessions: 1. **API Key Storage** - Keys in `chrome.storage.local` (extension-only access)
2. **Content Script Isolation** - Runs in isolated world
1. Store session state in `chrome.storage.local` 3. **Tab Restriction** - Agent can only control tabs it opened or started from
2. Restore agent on extension startup 4. **No Arbitrary Tab Access** - Cannot switch to unmanaged tabs
3. Handle service worker restarts gracefully
#### Cross-tab Navigation
To follow links in new tabs:
1. Listen to `chrome.tabs.onCreated` events
2. Inject content script into new tabs
3. Transfer control to new tab when navigation occurs
#### Screenshot/Vision Support
To add visual context for the agent:
1. Use `chrome.tabs.captureVisibleTab` for screenshots
2. Send images to vision-capable LLM models
3. Add screenshot tool to agent toolkit
## Security Considerations
1. **API Key Storage** - Keys stored in `chrome.storage.local` (extension-only access)
2. **Content Script Isolation** - Runs in isolated world, not accessible to page scripts
3. **Message Validation** - Only trusted extension contexts can send/receive messages
4. **Permission Scope** - Request minimal permissions needed for functionality
## Development ## Development

View File

@@ -15,6 +15,9 @@ export default defineConfig({
}, },
vite: () => ({ vite: () => ({
plugins: [tailwindcss()], plugins: [tailwindcss()],
optimizeDeps: {
force: true,
},
build: { build: {
minify: false, minify: false,
chunkSizeWarningLimit: 2000, chunkSizeWarningLimit: 2000,
@@ -32,7 +35,7 @@ export default defineConfig({
description: description:
'AI-powered browser automation assistant. Control web pages with natural language.', 'AI-powered browser automation assistant. Control web pages with natural language.',
homepage_url: 'https://alibaba.github.io/page-agent/', homepage_url: 'https://alibaba.github.io/page-agent/',
permissions: ['tabs', 'sidePanel', 'storage'], permissions: ['tabs', 'tabGroups', 'sidePanel', 'storage'],
host_permissions: ['<all_urls>'], host_permissions: ['<all_urls>'],
icons: { icons: {
64: 'assets/page-agent-64.png', 64: 'assets/page-agent-64.png',