From 01db520881c91f671c51fe1e60a6ac08acdae370 Mon Sep 17 00:00:00 2001 From: Simon <10131203+gaomeng1900@users.noreply.github.com> Date: Mon, 9 Mar 2026 22:02:35 +0800 Subject: [PATCH 1/5] feat: support wildcard in `includeAttributes` --- packages/page-controller/src/dom/index.ts | 46 ++++++++++++++----- .../docs/advanced/page-controller/page.tsx | 4 +- 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/packages/page-controller/src/dom/index.ts b/packages/page-controller/src/dom/index.ts index 0036022..b24c5c1 100644 --- a/packages/page-controller/src/dom/index.ts +++ b/packages/page-controller/src/dom/index.ts @@ -74,6 +74,36 @@ export function getFlatTree(config: DomConfig): FlatDomTree { return elements } +function globToRegex(pattern: string): RegExp { + const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, '\\$&') + return new RegExp(`^${escaped.replace(/\*/g, '.*')}$`) +} + +function matchAttributes( + attrs: Record, + patterns: string[] +): Record { + const result: Record = {} + + for (const pattern of patterns) { + if (pattern.includes('*')) { + const regex = globToRegex(pattern) + for (const key in attrs) { + if (regex.test(key) && attrs[key].trim()) { + result[key] = attrs[key].trim() + } + } + } else { + const value = attrs[pattern] + if (value && value.trim()) { + result[pattern] = value.trim() + } + } + } + + return result +} + /** * elementsToString 内部使用的类型 */ @@ -248,23 +278,15 @@ export function flatTreeToString(flatTree: FlatDomTree, includeAttributes?: stri let attributesHtmlStr = '' if (includeAttrs.length > 0 && node.attributes) { - const attributesToInclude: Record = {} - - // Filter attributes - for (const key of includeAttrs) { - const value = node.attributes[key] - if (value && value.trim() !== '') { - attributesToInclude[key] = value.trim() - } - } + const attributesToInclude = matchAttributes(node.attributes, includeAttrs) // Remove duplicate values (for attributes longer than 5 chars) - const orderedKeys = includeAttrs.filter((key) => key in attributesToInclude) - if (orderedKeys.length > 1) { + const keys = Object.keys(attributesToInclude) + if (keys.length > 1) { const keysToRemove = new Set() const seenValues: Record = {} - for (const key of orderedKeys) { + for (const key of keys) { const value = attributesToInclude[key] if (value.length > 5) { if (value in seenValues) { diff --git a/packages/website/src/pages/docs/advanced/page-controller/page.tsx b/packages/website/src/pages/docs/advanced/page-controller/page.tsx index a955776..ee4371a 100644 --- a/packages/website/src/pages/docs/advanced/page-controller/page.tsx +++ b/packages/website/src/pages/docs/advanced/page-controller/page.tsx @@ -105,8 +105,8 @@ const agent = new PageAgentCore({ name: 'includeAttributes', type: 'string[]', description: isZh - ? '在 DOM 提取中包含的额外 HTML 属性(如 data-testid)。默认已包含常见属性如 role, aria-label 等。' - : 'Additional HTML attributes to include in DOM extraction (e.g. data-testid). Common attributes like role, aria-label are included by default.', + ? '在 DOM 提取中包含的额外 HTML 属性。支持通配符 *(如 data-* 匹配所有 data- 开头的属性)。默认已包含常见属性如 role, aria-label 等。' + : 'Additional HTML attributes to include in DOM extraction. Supports wildcard * (e.g. data-* matches all data- prefixed attributes). Common attributes like role, aria-label are included by default.', }, ]} /> From 0b6a698f6b6504695f9bde1206b4c75c9ecd7364 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:26:53 +0000 Subject: [PATCH 2/5] Initial plan From 5988d4ba24de88719e8135282802a7bf96511add Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:29:10 +0000 Subject: [PATCH 3/5] Initial plan From 5852054e3a452ca4c92f20aa5ceaa53e19d75f01 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:29:56 +0000 Subject: [PATCH 4/5] perf: cache compiled regexes in globToRegex to avoid repeated compilation Co-authored-by: gaomeng1900 <10131203+gaomeng1900@users.noreply.github.com> --- packages/page-controller/src/dom/index.ts | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/packages/page-controller/src/dom/index.ts b/packages/page-controller/src/dom/index.ts index b24c5c1..11ba851 100644 --- a/packages/page-controller/src/dom/index.ts +++ b/packages/page-controller/src/dom/index.ts @@ -74,9 +74,16 @@ export function getFlatTree(config: DomConfig): FlatDomTree { return elements } +const globRegexCache = new Map() + function globToRegex(pattern: string): RegExp { - const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, '\\$&') - return new RegExp(`^${escaped.replace(/\*/g, '.*')}$`) + let regex = globRegexCache.get(pattern) + if (!regex) { + const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, '\\$&') + regex = new RegExp(`^${escaped.replace(/\*/g, '.*')}$`) + globRegexCache.set(pattern, regex) + } + return regex } function matchAttributes( From d46a57f8eff1cbf42ab5e0ee02aee02e41caf3b9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:32:20 +0000 Subject: [PATCH 5/5] refactor: use Object.keys() instead of for...in in matchAttributes Co-authored-by: gaomeng1900 <10131203+gaomeng1900@users.noreply.github.com> --- packages/page-controller/src/dom/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/page-controller/src/dom/index.ts b/packages/page-controller/src/dom/index.ts index b24c5c1..07cf724 100644 --- a/packages/page-controller/src/dom/index.ts +++ b/packages/page-controller/src/dom/index.ts @@ -88,7 +88,7 @@ function matchAttributes( for (const pattern of patterns) { if (pattern.includes('*')) { const regex = globToRegex(pattern) - for (const key in attrs) { + for (const key of Object.keys(attrs)) { if (regex.test(key) && attrs[key].trim()) { result[key] = attrs[key].trim() }