From 01db520881c91f671c51fe1e60a6ac08acdae370 Mon Sep 17 00:00:00 2001 From: Simon <10131203+gaomeng1900@users.noreply.github.com> Date: Mon, 9 Mar 2026 22:02:35 +0800 Subject: [PATCH] feat: support wildcard in `includeAttributes` --- packages/page-controller/src/dom/index.ts | 46 ++++++++++++++----- .../docs/advanced/page-controller/page.tsx | 4 +- 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/packages/page-controller/src/dom/index.ts b/packages/page-controller/src/dom/index.ts index 0036022..b24c5c1 100644 --- a/packages/page-controller/src/dom/index.ts +++ b/packages/page-controller/src/dom/index.ts @@ -74,6 +74,36 @@ export function getFlatTree(config: DomConfig): FlatDomTree { return elements } +function globToRegex(pattern: string): RegExp { + const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, '\\$&') + return new RegExp(`^${escaped.replace(/\*/g, '.*')}$`) +} + +function matchAttributes( + attrs: Record<string, string>, + patterns: string[] +): Record<string, string> { + const result: Record<string, string> = {} + + for (const pattern of patterns) { + if (pattern.includes('*')) { + const regex = globToRegex(pattern) + for (const key in attrs) { + if (regex.test(key) && attrs[key].trim()) { + result[key] = attrs[key].trim() + } + } + } else { + const value = attrs[pattern] + if (value && value.trim()) { + result[pattern] = value.trim() + } + } + } + + return result +} + /** * elementsToString 内部使用的类型 */ @@ -248,23 +278,15 @@ export function flatTreeToString(flatTree: FlatDomTree, includeAttributes?: stri let attributesHtmlStr = '' if (includeAttrs.length > 0 && node.attributes) { - const attributesToInclude: Record<string, string> = {} - - // Filter attributes - for (const key of includeAttrs) { - const value = node.attributes[key] - if (value && value.trim() !== '') { - attributesToInclude[key] = value.trim() - } - } + const attributesToInclude = matchAttributes(node.attributes, includeAttrs) // Remove duplicate values (for attributes longer than 5 chars) - const orderedKeys = 
includeAttrs.filter((key) => key in attributesToInclude) - if (orderedKeys.length > 1) { + const keys = Object.keys(attributesToInclude) + if (keys.length > 1) { const keysToRemove = new Set<string>() const seenValues: Record<string, string> = {} - for (const key of orderedKeys) { + for (const key of keys) { const value = attributesToInclude[key] if (value.length > 5) { if (value in seenValues) { diff --git a/packages/website/src/pages/docs/advanced/page-controller/page.tsx b/packages/website/src/pages/docs/advanced/page-controller/page.tsx index a955776..ee4371a 100644 --- a/packages/website/src/pages/docs/advanced/page-controller/page.tsx +++ b/packages/website/src/pages/docs/advanced/page-controller/page.tsx @@ -105,8 +105,8 @@ const agent = new PageAgentCore({ name: 'includeAttributes', type: 'string[]', description: isZh - ? '在 DOM 提取中包含的额外 HTML 属性(如 data-testid)。默认已包含常见属性如 role, aria-label 等。' - : 'Additional HTML attributes to include in DOM extraction (e.g. data-testid). Common attributes like role, aria-label are included by default.', + ? '在 DOM 提取中包含的额外 HTML 属性。支持通配符 *(如 data-* 匹配所有 data- 开头的属性)。默认已包含常见属性如 role, aria-label 等。' + : 'Additional HTML attributes to include in DOM extraction. Supports wildcard * (e.g. data-* matches all data- prefixed attributes). Common attributes like role, aria-label are included by default.', }, ]} />