Merge pull request #173 from alibaba/feat/tolerant-html-cleaning
feat: support wildcard in `includeAttributes`
This commit is contained in:
@@ -74,6 +74,43 @@ export function getFlatTree(config: DomConfig): FlatDomTree {
|
||||
return elements
|
||||
}
|
||||
|
||||
/** Compiled glob patterns, keyed by the raw pattern string, so each pattern is compiled once. */
const globRegexCache = new Map<string, RegExp>()

/**
 * Compile a glob-style pattern into an anchored regular expression.
 *
 * `*` is the only wildcard and matches any run of characters (including none).
 * Every other character is matched literally; the whole input must match.
 *
 * @param pattern - Glob pattern such as `data-*`.
 * @returns The (cached) regular expression equivalent to `pattern`.
 */
function globToRegex(pattern: string): RegExp {
	let regex = globRegexCache.get(pattern)
	if (!regex) {
		// Escape every regex metacharacter except `*`. `?` must be in this set:
		// left unescaped it acts as a quantifier, so `?*` would throw and `a?*`
		// would match any string.
		const escaped = pattern.replace(/[.+?^${}()|[\]\\]/g, '\\$&')
		// `*` is untouched by the escape step, so it can now be widened to `.*`.
		regex = new RegExp(`^${escaped.replace(/\*/g, '.*')}$`)
		globRegexCache.set(pattern, regex)
	}
	return regex
}
|
||||
|
||||
/**
 * Pick the attributes whose names match any of the given patterns.
 *
 * A pattern containing `*` is treated as a glob and tested against every own
 * key of `attrs`; any other pattern is looked up as an exact attribute name.
 * Values are trimmed, and attributes whose trimmed value is empty are dropped.
 * Keys are added in pattern order (and, within a glob, in `attrs` key order).
 *
 * @param attrs - Attribute name → value map of a single element.
 * @param patterns - Exact attribute names and/or glob patterns.
 * @returns A new object holding the matched attributes with trimmed values.
 */
function matchAttributes(
	attrs: Record<string, string>,
	patterns: string[]
): Record<string, string> {
	const result: Record<string, string> = {}

	// Copy one attribute into the result if it is an own, non-blank string.
	// The own-property check keeps names like `constructor` or `toString` from
	// resolving to inherited functions, on which `.trim()` would throw.
	const addIfNonBlank = (key: string): void => {
		if (!Object.prototype.hasOwnProperty.call(attrs, key)) return
		const raw = attrs[key]
		if (typeof raw !== 'string') return
		const trimmed = raw.trim()
		if (trimmed) result[key] = trimmed
	}

	for (const pattern of patterns) {
		if (pattern.includes('*')) {
			const regex = globToRegex(pattern)
			for (const key of Object.keys(attrs)) {
				if (regex.test(key)) addIfNonBlank(key)
			}
		} else {
			addIfNonBlank(pattern)
		}
	}

	return result
}
|
||||
|
||||
/**
|
||||
* elementsToString 内部使用的类型
|
||||
*/
|
||||
@@ -248,23 +285,15 @@ export function flatTreeToString(flatTree: FlatDomTree, includeAttributes?: stri
|
||||
let attributesHtmlStr = ''
|
||||
|
||||
if (includeAttrs.length > 0 && node.attributes) {
|
||||
const attributesToInclude: Record<string, string> = {}
|
||||
|
||||
// Filter attributes
|
||||
for (const key of includeAttrs) {
|
||||
const value = node.attributes[key]
|
||||
if (value && value.trim() !== '') {
|
||||
attributesToInclude[key] = value.trim()
|
||||
}
|
||||
}
|
||||
const attributesToInclude = matchAttributes(node.attributes, includeAttrs)
|
||||
|
||||
// Remove duplicate values (for attributes longer than 5 chars)
|
||||
const orderedKeys = includeAttrs.filter((key) => key in attributesToInclude)
|
||||
if (orderedKeys.length > 1) {
|
||||
const keys = Object.keys(attributesToInclude)
|
||||
if (keys.length > 1) {
|
||||
const keysToRemove = new Set<string>()
|
||||
const seenValues: Record<string, string> = {}
|
||||
|
||||
for (const key of orderedKeys) {
|
||||
for (const key of keys) {
|
||||
const value = attributesToInclude[key]
|
||||
if (value.length > 5) {
|
||||
if (value in seenValues) {
|
||||
|
||||
@@ -105,8 +105,8 @@ const agent = new PageAgentCore({
|
||||
name: 'includeAttributes',
|
||||
type: 'string[]',
|
||||
description: isZh
|
||||
? '在 DOM 提取中包含的额外 HTML 属性(如 data-testid)。默认已包含常见属性如 role, aria-label 等。'
|
||||
: 'Additional HTML attributes to include in DOM extraction (e.g. data-testid). Common attributes like role, aria-label are included by default.',
|
||||
? '在 DOM 提取中包含的额外 HTML 属性。支持通配符 *(如 data-* 匹配所有 data- 开头的属性)。默认已包含常见属性如 role, aria-label 等。'
|
||||
: 'Additional HTML attributes to include in DOM extraction. Supports wildcard * (e.g. data-* matches all data- prefixed attributes). Common attributes like role, aria-label are included by default.',
|
||||
},
|
||||
]}
|
||||
/>
|
||||
|
||||
Reference in New Issue
Block a user