Merge pull request #173 from alibaba/feat/tolerant-html-cleaning
feat: support wildcard in `includeAttributes`
This commit is contained in:
@@ -74,6 +74,43 @@ export function getFlatTree(config: DomConfig): FlatDomTree {
|
|||||||
return elements
|
return elements
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Compiled glob patterns, keyed by the raw pattern string, so each pattern is compiled once. */
const globRegexCache = new Map<string, RegExp>()

/**
 * Compile a glob pattern into an anchored regular expression.
 *
 * `*` matches any run of characters (including none); every other character
 * is matched literally. Compiled expressions are memoized in `globRegexCache`.
 *
 * @param pattern - Glob pattern, e.g. `data-*`.
 * @returns A RegExp that must match the whole input string.
 */
function globToRegex(pattern: string): RegExp {
  let regex = globRegexCache.get(pattern)
  if (!regex) {
    // Escape every regex metacharacter except `*`. `?` has to be in this set:
    // left raw it acts as a quantifier, so `a?*` would match any string and
    // `?*` would throw "Nothing to repeat" when compiled.
    const escaped = pattern.replace(/[.+?^${}()|[\]\\]/g, '\\$&')
    // Only now turn the glob wildcard into its regex equivalent and anchor both ends.
    regex = new RegExp(`^${escaped.replace(/\*/g, '.*')}$`)
    globRegexCache.set(pattern, regex)
  }
  return regex
}
|
||||||
|
|
||||||
|
/**
 * Select the attributes whose names match any of the given patterns.
 *
 * A pattern containing `*` is treated as a glob and tested against every
 * attribute name; any other pattern is an exact attribute name. Values are
 * trimmed, and attributes whose trimmed value is empty are dropped.
 *
 * @param attrs - Attribute name/value map of a single element.
 * @param patterns - Exact attribute names and/or glob patterns.
 * @returns A new map with the matching attributes and their trimmed values.
 */
function matchAttributes(
  attrs: Record<string, string>,
  patterns: string[]
): Record<string, string> {
  const result: Record<string, string> = {}

  // Copy one attribute into the result if it is an own, non-blank string value.
  const include = (key: string): void => {
    // Own-property check: a plain `attrs[key]` walks the prototype chain, so a
    // pattern like `toString` would return a function and crash on `.trim()`.
    if (!Object.prototype.hasOwnProperty.call(attrs, key)) return
    const raw: unknown = attrs[key]
    if (typeof raw !== 'string') return
    const trimmed = raw.trim()
    if (trimmed) {
      result[key] = trimmed
    }
  }

  for (const pattern of patterns) {
    if (pattern.includes('*')) {
      const regex = globToRegex(pattern)
      for (const key of Object.keys(attrs)) {
        if (regex.test(key)) include(key)
      }
    } else {
      include(pattern)
    }
  }

  return result
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* elementsToString 内部使用的类型
|
* elementsToString 内部使用的类型
|
||||||
*/
|
*/
|
||||||
@@ -248,23 +285,15 @@ export function flatTreeToString(flatTree: FlatDomTree, includeAttributes?: stri
|
|||||||
let attributesHtmlStr = ''
|
let attributesHtmlStr = ''
|
||||||
|
|
||||||
if (includeAttrs.length > 0 && node.attributes) {
|
if (includeAttrs.length > 0 && node.attributes) {
|
||||||
const attributesToInclude: Record<string, string> = {}
|
const attributesToInclude = matchAttributes(node.attributes, includeAttrs)
|
||||||
|
|
||||||
// Filter attributes
|
|
||||||
for (const key of includeAttrs) {
|
|
||||||
const value = node.attributes[key]
|
|
||||||
if (value && value.trim() !== '') {
|
|
||||||
attributesToInclude[key] = value.trim()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Remove duplicate values (for attributes longer than 5 chars)
|
// Remove duplicate values (for attributes longer than 5 chars)
|
||||||
const orderedKeys = includeAttrs.filter((key) => key in attributesToInclude)
|
const keys = Object.keys(attributesToInclude)
|
||||||
if (orderedKeys.length > 1) {
|
if (keys.length > 1) {
|
||||||
const keysToRemove = new Set<string>()
|
const keysToRemove = new Set<string>()
|
||||||
const seenValues: Record<string, string> = {}
|
const seenValues: Record<string, string> = {}
|
||||||
|
|
||||||
for (const key of orderedKeys) {
|
for (const key of keys) {
|
||||||
const value = attributesToInclude[key]
|
const value = attributesToInclude[key]
|
||||||
if (value.length > 5) {
|
if (value.length > 5) {
|
||||||
if (value in seenValues) {
|
if (value in seenValues) {
|
||||||
|
|||||||
@@ -105,8 +105,8 @@ const agent = new PageAgentCore({
|
|||||||
name: 'includeAttributes',
|
name: 'includeAttributes',
|
||||||
type: 'string[]',
|
type: 'string[]',
|
||||||
description: isZh
|
description: isZh
|
||||||
? '在 DOM 提取中包含的额外 HTML 属性(如 data-testid)。默认已包含常见属性如 role, aria-label 等。'
|
? '在 DOM 提取中包含的额外 HTML 属性。支持通配符 *(如 data-* 匹配所有 data- 开头的属性)。默认已包含常见属性如 role, aria-label 等。'
|
||||||
: 'Additional HTML attributes to include in DOM extraction (e.g. data-testid). Common attributes like role, aria-label are included by default.',
|
: 'Additional HTML attributes to include in DOM extraction. Supports wildcard * (e.g. data-* matches all data- prefixed attributes). Common attributes like role, aria-label are included by default.',
|
||||||
},
|
},
|
||||||
]}
|
]}
|
||||||
/>
|
/>
|
||||||
|
|||||||
Reference in New Issue
Block a user