26 lines
748 B
Python
26 lines
748 B
Python
import jieba
|
|
import re
|
|
|
|
def tokenize(text: str) -> str:
    """Segment *text* with jieba and return the pieces joined by spaces.

    Args:
        text: The string handed to ``jieba.cut``.

    Returns:
        A single string in which every segment produced by ``jieba.cut`` is
        separated from the next by one space character.
    """
    segments = jieba.cut(text)
    return " ".join(segments)
|
|
|
|
|
|
def build_match_query(text: str, limit: int = 12) -> str:
    """Build a safe FTS5 MATCH query from user/model text.

    The text is segmented with ``tokenize``; every usable token is emitted as
    a double-quoted string (embedded double quotes are doubled) and the
    quoted strings are joined with ``OR``.

    A token is skipped when it:

    * contains no ``\\w`` character (e.g. pure punctuation),
    * equals ``AND``, ``OR``, ``NOT`` or ``NEAR`` in any letter case, or
    * has already been emitted (exact, case-sensitive comparison).

    Args:
        text: Free-form input. ``None`` or an empty string yields ``""``.
        limit: Maximum number of terms to emit. Zero or a negative value
            yields ``""``.

    Returns:
        The query string, or ``""`` when no usable term was found.
    """
    # Guard first: previously the limit was only checked after a term had been
    # appended, so a non-positive limit still produced one term. Returning
    # here also avoids invoking the tokenizer when there is nothing to do.
    if limit <= 0 or not text:
        return ""

    has_word_char = re.compile(r"\w").search  # compiled once, not per token
    reserved = {"AND", "OR", "NOT", "NEAR"}
    terms: list[str] = []
    seen: set[str] = set()

    # str.split() with no arguments never yields empty or whitespace-padded
    # pieces, so the tokens need no further stripping.
    for token in tokenize(text).split():
        if has_word_char(token) is None:
            continue
        if token.upper() in reserved or token in seen:
            continue
        seen.add(token)
        # Quote the token; a literal double quote is escaped by doubling it.
        terms.append('"' + token.replace('"', '""') + '"')
        if len(terms) >= limit:
            break

    return " OR ".join(terms)
|