From 7e9027167d0c2481d5e7d67c32c40fbf51705b15 Mon Sep 17 00:00:00 2001 From: Simon <10131203+gaomeng1900@users.noreply.github.com> Date: Fri, 3 Apr 2026 19:01:54 +0800 Subject: [PATCH] feat(controller): add `keepSemanticTags` config to keep the semantic structure of the page --- .../src/agent/RemotePageController.content.ts | 5 +- .../page-controller/src/PageController.ts | 6 ++- packages/page-controller/src/dom/index.ts | 51 +++++++++++++++++-- .../docs/advanced/page-controller/page.tsx | 8 +++ 4 files changed, 65 insertions(+), 5 deletions(-) diff --git a/packages/extension/src/agent/RemotePageController.content.ts b/packages/extension/src/agent/RemotePageController.content.ts index ac9cd8f..55eff28 100644 --- a/packages/extension/src/agent/RemotePageController.content.ts +++ b/packages/extension/src/agent/RemotePageController.content.ts @@ -19,7 +19,10 @@ export function initPageController() { function getPC(): PageController { if (!pageController) { - pageController = new PageController({ enableMask: false, viewportExpansion: 400 }) + pageController = new PageController({ + enableMask: false, + viewportExpansion: 400, + }) } return pageController } diff --git a/packages/page-controller/src/PageController.ts b/packages/page-controller/src/PageController.ts index 53b827d..d058001 100644 --- a/packages/page-controller/src/PageController.ts +++ b/packages/page-controller/src/PageController.ts @@ -193,7 +193,11 @@ export class PageController extends EventTarget { interactiveBlacklist: blacklist, }) - this.simplifiedHTML = dom.flatTreeToString(this.flatTree, this.config.includeAttributes) + this.simplifiedHTML = dom.flatTreeToString( + this.flatTree, + this.config.includeAttributes, + this.config.keepSemanticTags + ) this.selectorMap.clear() this.selectorMap = dom.getSelectorMap(this.flatTree) diff --git a/packages/page-controller/src/dom/index.ts b/packages/page-controller/src/dom/index.ts index 2968c0c..a7ae7f8 100644 --- a/packages/page-controller/src/dom/index.ts 
+++ b/packages/page-controller/src/dom/index.ts @@ -28,8 +28,27 @@ export interface DomConfig { includeAttributes?: string[] highlightOpacity?: number highlightLabelOpacity?: number + + /** + * Preserve semantic landmark tags in dehydrated output even if not interactive + * @note maybe confusing for LLM combining with page scrolling, use with caution + **/ + keepSemanticTags?: boolean } +// TODO: corresponding roles +const SEMANTIC_TAGS = new Set([ + 'nav', + 'menu', + // 'main', + 'header', + 'footer', + 'aside', + // 'article', + // 'form', + 'dialog', +]) + /** * 用于检测可交互元素是否是新出现的。 */ @@ -171,7 +190,11 @@ interface TreeNode { * * @todo 数据脱敏过滤器 */ -export function flatTreeToString(flatTree: FlatDomTree, includeAttributes?: string[]): string { +export function flatTreeToString( + flatTree: FlatDomTree, + includeAttributes: string[] = [], + keepSemanticTags = false +): string { const DEFAULT_INCLUDE_ATTRIBUTES = [ 'title', 'type', @@ -203,7 +226,7 @@ export function flatTreeToString(flatTree: FlatDomTree, includeAttributes?: stri 'contenteditable', ] - const includeAttrs = [...(includeAttributes || []), ...DEFAULT_INCLUDE_ATTRIBUTES] + const includeAttrs = [...includeAttributes, ...DEFAULT_INCLUDE_ATTRIBUTES] // Helper function to cap text length const capTextLength = (text: string, maxLength: number): string => { @@ -294,6 +317,8 @@ export function flatTreeToString(flatTree: FlatDomTree, includeAttributes?: stri const depthStr = '\t'.repeat(depth) if (node.type === 'element') { + const isSemantic = keepSemanticTags && node.tagName && SEMANTIC_TAGS.has(node.tagName) + // Add element with highlight_index if (node.highlightIndex !== undefined) { nextDepth += 1 @@ -391,10 +416,30 @@ export function flatTreeToString(flatTree: FlatDomTree, includeAttributes?: stri result.push(line) } - // Process children regardless + // special treatment for semantic tags + // even if they are not interactive, we can keep them for clear context + + const emitSemantic = isSemantic && 
node.highlightIndex === undefined + // to check if this tag is empty + const mark = emitSemantic ? result.length : -1 + + if (emitSemantic) { + result.push(`${depthStr}<${node.tagName}>`) + nextDepth += 1 + } + for (const child of node.children) { processNode(child, nextDepth, result) } + + if (emitSemantic) { + // empty tag should be removed + if (result.length === mark + 1) { + result.pop() + } else { + result.push(`${depthStr}</${node.tagName}>`) + } + } } else if (node.type === 'text') { // Add text only if it doesn't have a highlighted parent if (hasParentWithHighlightIndex(node)) { diff --git a/packages/website/src/pages/docs/advanced/page-controller/page.tsx b/packages/website/src/pages/docs/advanced/page-controller/page.tsx index ee4371a..b1d781d 100644 --- a/packages/website/src/pages/docs/advanced/page-controller/page.tsx +++ b/packages/website/src/pages/docs/advanced/page-controller/page.tsx @@ -108,6 +108,14 @@ const agent = new PageAgentCore({ ? '在 DOM 提取中包含的额外 HTML 属性。支持通配符 *(如 data-* 匹配所有 data- 开头的属性)。默认已包含常见属性如 role, aria-label 等。' : 'Additional HTML attributes to include in DOM extraction. Supports wildcard * (e.g. data-* matches all data- prefixed attributes). Common attributes like role, aria-label are included by default.', }, + { + name: 'keepSemanticTags', + type: 'boolean', + defaultValue: 'false', + description: isZh + ? '在简化输出中保留语义标签(如 nav, main, header, footer, aside 等),即使它们不可交互。帮助 LLM 理解页面结构。' + : 'Preserve semantic landmark tags (e.g. nav, main, header, footer, aside) in dehydrated output even if not interactive. Helps LLM understand page structure.', + }, ]} />