Files
qiweimanager-master/helper/auto_reply_knowledge.go
yuanzhipeng 849090a627 feat(auto-reply): 优化自动回复逻辑和知识库功能
- 将默认回复详细程度从"detailed"调整为"medium",前后端保持一致
- 新增话题切换检测逻辑,当用户主动要求换话题时提供引导回复
- 优化上下文处理机制,仅在指代型追问时注入历史对话,避免模型复读旧内容
- 改进知识库检索逻辑,区分自包含问题和指代型问题的上下文需求
- 完善知识库完整性指令,确保回复详细程度与知识展开程度一致
- 重构知识库重建逻辑,支持递归扫描子目录中的文件,修复索引为空的问题
- 增强素材匹配算法,引入强信号检测机制,避免仅凭模糊匹配误发素材
- 新增素材开场白AI生成功能,支持图片、视频、文档等类型智能描述
- 改进知识库重建通知,显示具体的文件数、分片数及失败统计信息
2026-06-26 14:25:35 +08:00

1270 lines
32 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package main
import (
"archive/zip"
"bytes"
"crypto/sha1"
"encoding/csv"
"encoding/hex"
"encoding/json"
"encoding/xml"
"fmt"
"html"
"io"
"io/fs"
"math"
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"time"
"unicode"
"github.com/ledongthuc/pdf"
"github.com/xuri/excelize/v2"
)
// maxKnowledgeChunkContentRunes is the rune limit handed to
// splitLongKnowledgeContent when parseKnowledgeFile cuts a block into chunks.
const maxKnowledgeChunkContentRunes = 6000
// maxPDFOCRPages is the highest PDF page number ocrPDFPages will process;
// pages numbered above it are skipped and reported with a warning.
const maxPDFOCRPages = 20
// Function-valued hooks used by ocrPDFPages instead of calling the
// implementations directly (presumably so tests can substitute them — confirm).
var (
// pdfFindRenderer locates the page renderer executable.
pdfFindRenderer = findPDFRenderer
// pdfOCRPageImage overrides the OCR call; when nil, ocrPDFPages falls back to
// ocrPDFPageImage.
pdfOCRPageImage func(string, int) (string, error)
// renderPDFPageFunc renders one PDF page to an image file.
renderPDFPageFunc = renderPDFPage
)
// KnowledgeChunk is one indexed piece of a knowledge file. Chunks are produced
// by parseKnowledgeFile and serialized to JSON inside KnowledgeIndex.
type KnowledgeChunk struct {
// ID is the hex digest from hashKnowledgeChunk; parseKnowledgeFile sets it to
// the same value as Hash.
ID string `json:"id"`
// Source is the file path relative to the knowledge root, with forward slashes.
Source string `json:"source"`
// Title is the block title; when a block is split into several parts it carries
// an "i/n" suffix.
Title string `json:"title"`
// Content is the chunk text.
Content string `json:"content"`
// Line is the line (or row/paragraph) number reported by the file parser.
Line int `json:"line"`
// Page is the page number reported by the parser (set by the PDF parser).
Page int `json:"page"`
// SectionID is a digest shared by every part that came from the same block.
SectionID string `json:"sectionId,omitempty"`
// SectionIndex is the index of the originating block within its file.
SectionIndex int `json:"sectionIndex,omitempty"`
// PartIndex is the zero-based index of this part within its block.
PartIndex int `json:"partIndex,omitempty"`
// UpdatedAt is the file modification time as Unix seconds, or the indexing time
// when the file could not be stat'ed.
UpdatedAt int64 `json:"updatedAt"`
// Hash is the hex digest of source, position and content.
Hash string `json:"hash"`
// Score is not populated in this file; presumably filled in by retrieval code.
Score float64 `json:"score,omitempty"`
}
// KnowledgeIndex is the on-disk (JSON) and in-memory result of one knowledge
// base scan.
type KnowledgeIndex struct {
// Chunks holds every chunk collected from the successfully parsed files.
Chunks []KnowledgeChunk `json:"chunks"`
// FileCount is the number of files that contributed at least one chunk.
FileCount int `json:"fileCount"`
// FailedFiles lists "path: reason" entries for files that failed, produced
// warnings, or yielded no indexable content.
FailedFiles []string `json:"failedFiles"`
// LastIndexedAt is the Unix time (seconds) at which the index was rebuilt.
LastIndexedAt int64 `json:"lastIndexedAt"`
}
// knowledgeParseWarning is an error value that carries a list of warning
// messages collected while parsing a knowledge file.
type knowledgeParseWarning struct {
	Warnings []string
}

// Error implements the error interface by concatenating every warning in
// order, with no separator between them.
func (e knowledgeParseWarning) Error() string {
	var joined strings.Builder
	for _, message := range e.Warnings {
		joined.WriteString(message)
	}
	return joined.String()
}
// NewKnowledgeIndex returns an empty index whose Chunks and FailedFiles are
// non-nil, zero-length slices (so they encode as [] rather than null in JSON).
func NewKnowledgeIndex() *KnowledgeIndex {
	idx := new(KnowledgeIndex)
	idx.Chunks = []KnowledgeChunk{}
	idx.FailedFiles = []string{}
	return idx
}
// loadKnowledgeIndex reads the persisted knowledge index from the configured
// index path and publishes it on the engine.
//
// A missing index file is not an error: an empty index is installed instead.
// Any other read failure, or a JSON decoding failure, is returned unchanged.
func (e *AutoReplyEngine) loadKnowledgeIndex() error {
	cfg := e.getConfig()
	raw, readErr := os.ReadFile(resolveAutoReplyPath(cfg.Knowledge.IndexPath))
	switch {
	case readErr == nil:
		// Fall through to decoding below.
	case os.IsNotExist(readErr):
		e.updateKnowledgeStatus(NewKnowledgeIndex())
		return nil
	default:
		return readErr
	}

	loaded := new(KnowledgeIndex)
	if err := json.Unmarshal(raw, loaded); err != nil {
		return err
	}
	// updateKnowledgeStatus takes the engine lock and copies the same fields
	// (index pointer, counts, timestamp, failed-file list) onto the engine.
	e.updateKnowledgeStatus(loaded)
	return nil
}
// rebuildKnowledgeIndex rescans the configured knowledge directory, writes the
// resulting index to the configured index path as JSON, publishes it on the
// engine and then triggers an embedding index rebuild.
//
// Per-file problems never abort the scan; they are recorded in FailedFiles.
// A failure of the embedding rebuild is stored as the engine's last error in
// the knowledge scope and does not make this function fail.
func (e *AutoReplyEngine) rebuildKnowledgeIndex() (*KnowledgeIndex, error) {
	cfg := e.getConfig()
	root := resolveAutoReplyPath(cfg.Knowledge.Directory)
	idx := NewKnowledgeIndex()

	allowed := map[string]bool{}
	for _, ext := range cfg.Knowledge.SupportedExtensions {
		allowed[strings.ToLower(ext)] = true
	}
	if len(allowed) == 0 {
		// No extensions configured: fall back to the plain-text formats.
		for _, ext := range []string{".md", ".txt", ".csv"} {
			allowed[ext] = true
		}
	}

	if err := os.MkdirAll(root, 0755); err != nil {
		return nil, err
	}

	// Walk sub-directories recursively (filepath.WalkDir). Knowledge bases are
	// usually organised into category folders, matching how material scanning
	// works. Scanning only the root would miss every file in a sub-directory,
	// leaving the index empty and breaking vector recall.
	visit := func(path string, d fs.DirEntry, walkErr error) error {
		// An entry that cannot be read is skipped so one bad entry does not
		// interrupt the whole rebuild; directories themselves carry no content.
		if walkErr != nil || d.IsDir() {
			return nil
		}
		name := d.Name()
		ext := strings.ToLower(filepath.Ext(name))
		if !isRootKnowledgeFile(name, ext, allowed, cfg.Knowledge.IndexPath, cfg.Retrieval.EmbeddingIndexPath) {
			return nil
		}

		chunks, parseErr := parseKnowledgeFile(path, root)
		if parseErr != nil {
			var warning knowledgeParseWarning
			if !errorAs(parseErr, &warning) {
				idx.FailedFiles = append(idx.FailedFiles, fmt.Sprintf("%s: %v", path, parseErr))
				return nil
			}
			// Warnings are recorded one by one; any chunks are still used.
			for _, item := range warning.Warnings {
				idx.FailedFiles = append(idx.FailedFiles, fmt.Sprintf("%s: %s", path, item))
			}
		}
		if len(chunks) == 0 {
			idx.FailedFiles = append(idx.FailedFiles, fmt.Sprintf("%s: 未读取到可索引内容", path))
			return nil
		}
		idx.FileCount++
		idx.Chunks = append(idx.Chunks, chunks...)
		return nil
	}
	if err := filepath.WalkDir(root, visit); err != nil {
		return nil, err
	}

	idx.LastIndexedAt = time.Now().Unix()
	indexPath := resolveAutoReplyPath(cfg.Knowledge.IndexPath)
	if err := os.MkdirAll(filepath.Dir(indexPath), 0755); err != nil {
		return nil, err
	}
	encoded, err := json.MarshalIndent(idx, "", " ")
	if err != nil {
		return nil, err
	}
	if err := os.WriteFile(indexPath, encoded, 0644); err != nil {
		return nil, err
	}

	e.updateKnowledgeStatus(idx)
	if err := e.rebuildEmbeddingIndex(idx); err != nil {
		e.setLastErrorWithScope(autoReplyErrorScopeKnowledge, err.Error())
	}
	return idx, nil
}
// isRootKnowledgeFile reports whether a file should be indexed.
//
// A file is rejected when its trimmed name is empty or ".keep", when its
// lower-cased extension is not in allowed, or when its name matches
// (case-insensitively) the base name of either index file path.
func isRootKnowledgeFile(name string, ext string, allowed map[string]bool, knowledgeIndexPath string, embeddingIndexPath string) bool {
	name = strings.TrimSpace(name)
	switch {
	case name == "", name == ".keep":
		return false
	case !allowed[strings.ToLower(ext)]:
		return false
	}
	// The generated index files may live inside the knowledge directory and
	// must never be indexed themselves.
	for _, reserved := range []string{knowledgeIndexPath, embeddingIndexPath} {
		if strings.EqualFold(name, filepath.Base(reserved)) {
			return false
		}
	}
	return true
}
// errorAs reports whether err is, or wraps, a knowledgeParseWarning and, if
// so, copies it into target.
//
// target must be a non-nil *knowledgeParseWarning; any other target (including
// a nil pointer) yields false instead of panicking. Both the value form and
// the pointer form of the warning are recognised. A nil *knowledgeParseWarning
// stored in err is skipped rather than dereferenced. Wrapped errors are
// followed through their single-error Unwrap method, so a warning decorated
// with fmt.Errorf("...: %w", warning) is still detected.
func errorAs(err error, target interface{}) bool {
	out, ok := target.(*knowledgeParseWarning)
	if !ok || out == nil {
		return false
	}
	for err != nil {
		switch value := err.(type) {
		case knowledgeParseWarning:
			*out = value
			return true
		case *knowledgeParseWarning:
			if value != nil {
				*out = *value
				return true
			}
		}
		wrapper, canUnwrap := err.(interface{ Unwrap() error })
		if !canUnwrap {
			return false
		}
		err = wrapper.Unwrap()
	}
	return false
}
// updateKnowledgeStatus installs idx as the engine's current index and mirrors
// its counters into the engine status, all while holding e.mu.
// The failed-file list is copied so the status does not alias idx.FailedFiles.
func (e *AutoReplyEngine) updateKnowledgeStatus(idx *KnowledgeIndex) {
	e.mu.Lock()
	e.index = idx
	e.status.KnowledgeFailedFiles = append([]string(nil), idx.FailedFiles...)
	e.status.KnowledgeLastIndexedAt = idx.LastIndexedAt
	e.status.KnowledgeChunkCount = len(idx.Chunks)
	e.status.KnowledgeFileCount = idx.FileCount
	e.mu.Unlock()
}
// scoreKnowledgeChunk scores a chunk against a tokenized query.
//
// The chunk's title and content are tokenized together. The score is
// 0.75*coverage + 0.25*density, where coverage is the fraction of distinct
// query tokens present in the chunk and density is the summed per-token
// weight (chunk count capped at query count + 1) divided by
// sqrt(distinct chunk tokens + 1). It returns 0 when the chunk has no tokens
// or no query token matches.
func scoreKnowledgeChunk(queryTokens map[string]int, chunk KnowledgeChunk) float64 {
	docTokens := tokenizeKnowledgeText(chunk.Title + " " + chunk.Content)
	if len(docTokens) == 0 {
		return 0
	}

	hits, weight := 0.0, 0.0
	for token, wanted := range queryTokens {
		found, present := docTokens[token]
		if !present {
			continue
		}
		hits++
		weight += math.Min(float64(found), float64(wanted)+1)
	}
	if hits == 0 {
		return 0
	}

	coverage := hits / float64(len(queryTokens))
	density := weight / math.Sqrt(float64(len(docTokens))+1)
	return coverage*0.75 + density*0.25
}
// parseKnowledgeFile parses one knowledge file into index chunks.
//
// The parser is chosen by the lower-cased file extension (.md/.txt, .csv,
// .docx, .xlsx, .pdf); any other extension is an error. root is used to
// compute the chunk's relative Source path (falling back to the base name).
//
// Return contract:
//   - hard failure: (nil, err);
//   - success: (chunks, nil);
//   - partial success: when the parser returns blocks together with a
//     knowledgeParseWarning (as the PDF parser does when only some pages fail
//     OCR), the chunks built from those blocks are returned together with the
//     warning. Previously the blocks were thrown away, so a PDF with a single
//     failed page was indexed as empty even though the caller treats warnings
//     as non-fatal.
//
// Blocks that are empty or judged low-value are skipped, and long blocks are
// split into parts of at most maxKnowledgeChunkContentRunes runes.
func parseKnowledgeFile(path string, root string) ([]KnowledgeChunk, error) {
	ext := strings.ToLower(filepath.Ext(path))
	var blocks []textBlock
	var err error
	switch ext {
	case ".md", ".txt":
		blocks, err = parsePlainKnowledgeFile(path)
	case ".csv":
		blocks, err = parseCSVKnowledgeFile(path)
	case ".docx":
		blocks, err = parseDocxKnowledgeFile(path)
	case ".xlsx":
		blocks, err = parseXlsxKnowledgeFile(path)
	case ".pdf":
		blocks, err = parsePDFKnowledgeFile(path)
	default:
		err = fmt.Errorf("unsupported extension: %s", ext)
	}

	// A warning that comes with usable blocks is carried through to the caller
	// instead of aborting; everything else is a hard failure.
	var parseWarning error
	if err != nil {
		var warning knowledgeParseWarning
		if len(blocks) == 0 || !errorAs(err, &warning) {
			return nil, err
		}
		parseWarning = err
	}

	rel, relErr := filepath.Rel(root, path)
	if relErr != nil {
		rel = filepath.Base(path)
	}

	// Prefer the file's modification time; fall back to "now" if stat fails.
	updatedAt := time.Now().Unix()
	if info, statErr := os.Stat(path); statErr == nil {
		updatedAt = info.ModTime().Unix()
	}

	chunks := make([]KnowledgeChunk, 0, len(blocks))
	for i, block := range blocks {
		content := strings.TrimSpace(block.Content)
		if content == "" || isLowValueKnowledgeBlock(block.Title, content) {
			continue
		}
		parts := splitLongKnowledgeContent(content, maxKnowledgeChunkContentRunes)
		// Every part of one block shares the same section identifier.
		sectionID := hashKnowledgeChunk(rel, strings.TrimSpace(block.Title), i)
		for partIndex, part := range parts {
			title := block.Title
			if len(parts) > 1 {
				title = fmt.Sprintf("%s %d/%d", strings.TrimSpace(block.Title), partIndex+1, len(parts))
			}
			hash := hashKnowledgeChunk(rel, part, i*1000+partIndex)
			chunks = append(chunks, KnowledgeChunk{
				ID:           hash,
				Source:       filepath.ToSlash(rel),
				Title:        title,
				Content:      part,
				Line:         block.Line,
				Page:         block.Page,
				SectionID:    sectionID,
				SectionIndex: i,
				PartIndex:    partIndex,
				UpdatedAt:    updatedAt,
				Hash:         hash,
			})
		}
	}
	return chunks, parseWarning
}
// textBlock is the intermediate unit produced by the per-format parsers before
// parseKnowledgeFile turns blocks into KnowledgeChunk values.
type textBlock struct {
// Title is the heading or label the parser attached to the block.
Title string
// Content is the block text.
Content string
// Line is the parser-reported position (line, row or paragraph number).
Line int
// Page is the page number; only the PDF parsers set it.
Page int
}
// splitLongKnowledgeContent cuts content into pieces of at most maxRunes runes.
//
// The trimmed content is returned as a single piece when maxRunes <= 0 or the
// content already fits; empty content yields nil. Otherwise the text is packed
// line by line: trimmed non-empty lines are joined with "\n" until the next
// line would exceed the limit, and a single line longer than the limit is
// hard-cut into maxRunes-sized slices.
func splitLongKnowledgeContent(content string, maxRunes int) []string {
	content = strings.TrimSpace(content)
	if content == "" {
		return nil
	}
	if maxRunes <= 0 || len([]rune(content)) <= maxRunes {
		return []string{content}
	}

	var (
		pieces  []string
		pending []rune // runes accumulated for the piece being built
	)
	emit := func() {
		if text := strings.TrimSpace(string(pending)); text != "" {
			pieces = append(pieces, text)
		}
		pending = pending[:0]
	}

	for _, line := range strings.Split(content, "\n") {
		runes := []rune(strings.TrimSpace(line))
		if len(runes) == 0 {
			continue
		}
		// Over-long line: close the piece in progress, then slice the line.
		for len(runes) > maxRunes {
			if len(pending) > 0 {
				emit()
			}
			pieces = append(pieces, strings.TrimSpace(string(runes[:maxRunes])))
			runes = runes[maxRunes:]
		}
		if len(runes) == 0 {
			continue
		}
		// Account for the newline separator when a piece is already in progress.
		needed := len(runes)
		if len(pending) > 0 {
			needed++
		}
		if len(pending)+needed > maxRunes {
			emit()
		}
		if len(pending) > 0 {
			pending = append(pending, '\n')
		}
		pending = append(pending, runes...)
	}
	emit()

	if len(pieces) == 0 {
		return []string{content}
	}
	return pieces
}
// parsePlainKnowledgeFile splits a Markdown or plain-text file into blocks.
//
// Rules, evaluated per trimmed line in this order:
//   - a leading "---" on the first line opens front matter, which is skipped
//     up to and including the closing "---";
//   - a "```dataview" fence opens a block that is skipped up to the next line
//     starting with "```";
//   - a line starting with "#" ends the current block and becomes the title
//     for the blocks that follow;
//   - an empty line ends the current block;
//   - any other line is appended to the current block.
//
// Blocks that are empty or judged low-value are dropped. Line on each block is
// the 1-based number of the block's first line.
func parsePlainKnowledgeFile(path string) ([]textBlock, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}

	var (
		blocks        = make([]textBlock, 0)
		heading       string
		pending       []string
		startLine     = 1
		inFrontMatter bool
		inDataview    bool
	)
	emit := func() {
		body := strings.TrimSpace(strings.Join(pending, "\n"))
		pending = nil
		if body == "" || isLowValueKnowledgeBlock(heading, body) {
			return
		}
		blocks = append(blocks, textBlock{Title: heading, Content: body, Line: startLine})
	}

	for idx, rawLine := range strings.Split(stripBOM(string(raw)), "\n") {
		line := strings.TrimSpace(rawLine)
		lineNo := idx + 1
		switch {
		case idx == 0 && line == "---":
			inFrontMatter = true
		case inFrontMatter:
			if line == "---" {
				inFrontMatter = false
				startLine = lineNo + 1
			}
		case inDataview:
			if strings.HasPrefix(line, "```") {
				inDataview = false
				startLine = lineNo + 1
			}
		case strings.EqualFold(line, "```dataview"):
			emit()
			inDataview = true
		case strings.HasPrefix(line, "#"):
			emit()
			heading = strings.TrimSpace(strings.TrimLeft(line, "#"))
			startLine = lineNo
		case line == "":
			emit()
			startLine = lineNo + 1
		default:
			if len(pending) == 0 {
				startLine = lineNo
			}
			pending = append(pending, line)
		}
	}
	emit()
	return blocks, nil
}
// isLowValueKnowledgeBlock reports whether a block carries nothing worth
// indexing: blank text, text made only of dashes/whitespace, a dataview query
// (a "```dataview" prefix or a "list from [[" clause, case-insensitive),
// front matter delimited by "---", or a "反向链接" section that mentions
// dataview.
func isLowValueKnowledgeBlock(title string, content string) bool {
	body := strings.TrimSpace(content)
	if body == "" || body == "---" {
		return true
	}
	if strings.Trim(body, "- \t\r\n") == "" {
		return true
	}

	folded := strings.ToLower(body)
	hasListQuery := strings.Contains(folded, "list from [[")
	switch {
	case strings.HasPrefix(folded, "```dataview"), hasListQuery:
		return true
	case strings.HasPrefix(body, "---\n") && strings.Contains(body, "\n---"):
		return true
	}

	if strings.TrimSpace(title) != "反向链接" {
		return false
	}
	return strings.Contains(folded, "dataview") || hasListQuery
}
// parseCSVKnowledgeFile turns each data row of a CSV file into one block.
//
// The first record is the header row. Rows may have any number of fields.
// Empty cells are skipped; each remaining cell is rendered as "header: value"
// when a non-blank header exists for its column, otherwise as the bare value.
// Block titles are "row N", with N being the 1-based record number.
func parseCSVKnowledgeFile(path string) ([]textBlock, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}

	reader := csv.NewReader(strings.NewReader(stripBOM(string(raw))))
	reader.FieldsPerRecord = -1 // allow ragged rows
	records, err := reader.ReadAll()
	if err != nil {
		return nil, err
	}
	if len(records) == 0 {
		return nil, nil
	}

	headers, dataRows := records[0], records[1:]
	blocks := make([]textBlock, 0, len(dataRows))
	for offset, record := range dataRows {
		lineNo := offset + 2 // header is record 1
		var fields []string
		for col, cell := range record {
			cell = strings.TrimSpace(cell)
			if cell == "" {
				continue
			}
			label := ""
			if col < len(headers) {
				label = strings.TrimSpace(headers[col])
			}
			if label != "" {
				cell = label + ": " + cell
			}
			fields = append(fields, cell)
		}
		if len(fields) == 0 {
			continue
		}
		blocks = append(blocks, textBlock{
			Title:   fmt.Sprintf("row %d", lineNo),
			Content: strings.Join(fields, "\n"),
			Line:    lineNo,
		})
	}
	return blocks, nil
}
// parseDocxKnowledgeFile opens a .docx archive, reads its
// "word/document.xml" entry and extracts text blocks from it.
// It fails when the archive cannot be opened, the entry cannot be read, or
// the entry is missing or empty.
func parseDocxKnowledgeFile(path string) ([]textBlock, error) {
	archive, err := zip.OpenReader(path)
	if err != nil {
		return nil, err
	}
	defer archive.Close()

	for _, entry := range archive.File {
		if entry.Name != "word/document.xml" {
			continue
		}
		document, readErr := readZipFile(entry)
		if readErr != nil {
			return nil, readErr
		}
		if len(document) == 0 {
			// An empty main document is reported the same way as a missing one.
			break
		}
		return extractDocxBlocks(document), nil
	}
	return nil, fmt.Errorf("word/document.xml not found")
}
// parseXlsxKnowledgeFile reads every sheet of an .xlsx workbook and converts
// its rows into structured blocks. Empty sheets are skipped; a failure to
// read any sheet aborts the whole file.
func parseXlsxKnowledgeFile(path string) ([]textBlock, error) {
	workbook, err := excelize.OpenFile(path)
	if err != nil {
		return nil, err
	}
	defer func() { _ = workbook.Close() }()

	result := make([]textBlock, 0)
	for _, sheet := range workbook.GetSheetList() {
		rows, rowsErr := workbook.GetRows(sheet)
		switch {
		case rowsErr != nil:
			return nil, rowsErr
		case len(rows) == 0:
			continue
		}
		// Copy merged-cell values into the covered cells before structuring.
		applyMergedCellValues(workbook, sheet, rows)
		result = append(result, structuredSheetBlocks(sheet, rows)...)
	}
	return result, nil
}
// applyMergedCellValues copies the value of each merged region that spans
// more than one row into every cell the region covers, leaving cells that
// already hold text untouched. Regions with a blank value, unparsable
// coordinates, or a single row are ignored, as is a failure to list merges.
func applyMergedCellValues(file *excelize.File, sheetName string, rows [][]string) {
	merged, err := file.GetMergeCells(sheetName)
	if err != nil {
		return
	}
	for _, region := range merged {
		fill := strings.TrimSpace(region.GetCellValue())
		if fill == "" {
			continue
		}
		firstCol, firstRow, startErr := excelize.CellNameToCoordinates(region.GetStartAxis())
		if startErr != nil {
			continue
		}
		lastCol, lastRow, endErr := excelize.CellNameToCoordinates(region.GetEndAxis())
		if endErr != nil || firstRow == lastRow {
			continue
		}
		// Coordinates are 1-based; the rows slice is 0-based.
		for r := firstRow - 1; r < lastRow; r++ {
			for c := firstCol - 1; c < lastCol; c++ {
				setSheetCellValue(rows, r, c, fill)
			}
		}
	}
}
// setSheetCellValue writes value into rows[rowIndex][colIndex] only when that
// cell is currently blank. The row is lengthened if it is too short; indexes
// outside the rows slice (or negative) are ignored.
func setSheetCellValue(rows [][]string, rowIndex int, colIndex int, value string) {
	if colIndex < 0 || rowIndex < 0 || rowIndex >= len(rows) {
		return
	}
	cells := rows[rowIndex]
	if colIndex >= len(cells) {
		grown := make([]string, colIndex+1)
		copy(grown, cells)
		cells = grown
		rows[rowIndex] = cells
	}
	if strings.TrimSpace(cells[colIndex]) != "" {
		return
	}
	cells[colIndex] = value
}
// structuredSheetBlocks converts the rows of one worksheet into blocks.
//
// Blank rows are skipped. A row with a single non-empty cell that is not a
// header row becomes the current section title; a header row replaces the
// current headers and resets the carry-forward values; both also emit a
// descriptive block. Every other row is rendered as "label: value" lines
// using the current headers, section and carry-forward values. Line is the
// 1-based row number.
func structuredSheetBlocks(sheetName string, rows [][]string) []textBlock {
	result := make([]textBlock, 0)
	var (
		headers = make([]string, 0)
		carry   = make([]string, 0)
		section string
	)
	for idx, raw := range rows {
		lineNo := idx + 1
		cells := normalizeSheetRow(raw)
		present := nonEmptySheetValues(cells)
		if len(present) == 0 {
			continue
		}
		isHeader := looksLikeSheetHeaderRow(cells)
		switch {
		case len(present) == 1 && !isHeader:
			section = present[0]
			result = append(result, textBlock{
				Title:   sheetName,
				Content: fmt.Sprintf("工作表: %s\n分类: %s", sheetName, section),
				Line:    lineNo,
			})
		case isHeader:
			headers = cells
			carry = make([]string, len(headers))
			result = append(result, textBlock{
				Title:   sheetName,
				Content: fmt.Sprintf("工作表: %s\n表头: %s", sheetName, strings.Join(present, "")),
				Line:    lineNo,
			})
		default:
			filled := applySheetCarryForward(cells, headers, carry)
			content := structuredSheetRowContent(sheetName, section, headers, filled)
			if strings.TrimSpace(content) == "" {
				continue
			}
			result = append(result, textBlock{Title: sheetName, Content: content, Line: lineNo})
		}
	}
	return result
}
// extractDocxBlocks walks the XML of a Word main document and returns one
// block per non-empty top-level paragraph ("docx paragraph") and one block per
// non-empty table row ("docx table", cells joined with " | ").
//
// Elements are matched by local name only. Text is collected from <t>
// elements; <tab> contributes a space and <br> a newline. Paragraphs inside a
// table are not emitted on their own; their text goes into the enclosing cell.
// Line is a running counter kept separately for paragraphs and for table rows.
// Decoding stops silently at the first XML error and the blocks gathered so
// far are returned.
//
// Character data is used exactly as the XML decoder delivers it. The decoder
// has already replaced XML escape sequences, so no further HTML unescaping is
// applied: doing so would decode a second time and corrupt literal text such
// as "&lt;" (stored in the file as "&amp;lt;").
func extractDocxBlocks(data []byte) []textBlock {
	decoder := xml.NewDecoder(bytes.NewReader(data))
	blocks := make([]textBlock, 0)
	var paragraph strings.Builder
	var cell strings.Builder
	var rowCells []string
	inText := false
	inParagraph := false
	tableDepth := 0
	inCell := false
	rowIndex := 0
	paragraphIndex := 0

	// flushParagraph emits the accumulated top-level paragraph, if non-empty.
	flushParagraph := func() {
		text := normalizeWhitespace(paragraph.String())
		paragraph.Reset()
		if text == "" {
			return
		}
		paragraphIndex++
		blocks = append(blocks, textBlock{Title: "docx paragraph", Content: text, Line: paragraphIndex})
	}
	// flushCell moves the accumulated cell text into the current row.
	flushCell := func() {
		text := normalizeWhitespace(cell.String())
		cell.Reset()
		if text != "" {
			rowCells = append(rowCells, text)
		}
	}
	// flushRow emits the current table row, if any cell holds text.
	flushRow := func() {
		cleaned := make([]string, 0, len(rowCells))
		for _, value := range rowCells {
			value = strings.TrimSpace(value)
			if value != "" {
				cleaned = append(cleaned, value)
			}
		}
		rowCells = nil
		if len(cleaned) == 0 {
			return
		}
		rowIndex++
		blocks = append(blocks, textBlock{Title: "docx table", Content: strings.Join(cleaned, " | "), Line: rowIndex})
	}

	for {
		token, err := decoder.Token()
		if err != nil {
			break
		}
		switch t := token.(type) {
		case xml.StartElement:
			switch t.Name.Local {
			case "tbl":
				tableDepth++
			case "tr":
				if tableDepth > 0 {
					rowCells = nil
				}
			case "tc":
				if tableDepth > 0 {
					inCell = true
					cell.Reset()
				}
			case "p":
				if tableDepth == 0 {
					inParagraph = true
					paragraph.Reset()
				}
			case "t":
				inText = true
			case "tab":
				if inCell {
					cell.WriteString(" ")
				} else if inParagraph {
					paragraph.WriteString(" ")
				}
			case "br":
				if inCell {
					cell.WriteString("\n")
				} else if inParagraph {
					paragraph.WriteString("\n")
				}
			}
		case xml.EndElement:
			switch t.Name.Local {
			case "t":
				inText = false
			case "p":
				if tableDepth == 0 && inParagraph {
					flushParagraph()
				}
				inParagraph = false
				// Separate consecutive paragraphs inside the same table cell.
				if inCell {
					cell.WriteString(" ")
				}
			case "tc":
				if inCell {
					flushCell()
				}
				inCell = false
			case "tr":
				if tableDepth > 0 {
					flushRow()
				}
			case "tbl":
				if tableDepth > 0 {
					tableDepth--
				}
			}
		case xml.CharData:
			if !inText {
				continue
			}
			// Already entity-decoded by the XML decoder; copy it as-is.
			text := string(t)
			if inCell {
				cell.WriteString(text)
			} else if inParagraph {
				paragraph.WriteString(text)
			}
		}
	}
	return blocks
}
// normalizeSheetRow returns a copy of row with surrounding whitespace removed
// from every cell. The result has the same length as row and is never nil.
func normalizeSheetRow(row []string) []string {
	trimmed := make([]string, 0, len(row))
	for _, cell := range row {
		trimmed = append(trimmed, strings.TrimSpace(cell))
	}
	return trimmed
}
// nonEmptySheetValues returns the trimmed cells of row that are not blank,
// in their original order. The result is never nil.
func nonEmptySheetValues(row []string) []string {
	kept := make([]string, 0, len(row))
	for _, cell := range row {
		cell = strings.TrimSpace(cell)
		if cell == "" {
			continue
		}
		kept = append(kept, cell)
	}
	return kept
}
// looksLikeSheetHeaderRow reports whether at least two cells of row are
// recognised header labels.
func looksLikeSheetHeaderRow(row []string) bool {
	recognised := 0
	for _, cell := range row {
		label := strings.TrimSpace(cell)
		if label == "" || !isSheetHeaderLabel(label) {
			continue
		}
		recognised++
		if recognised >= 2 {
			return true
		}
	}
	return false
}
// isSheetHeaderLabel reports whether the trimmed value is exactly one of the
// known column header labels.
func isSheetHeaderLabel(value string) bool {
	switch strings.TrimSpace(value) {
	case "星期", "时段", "部门", "会议主题", "会议时间", "会议日期",
		"日期", "时间", "主题", "名称", "类别", "类型",
		"项目", "标准", "要求", "负责人", "内容", "操作指引",
		"检查项目", "核对内容", "详细", "备注", "测试流程":
		return true
	default:
		return false
	}
}
// applySheetCarryForward returns a copy of row in which blank cells under
// carry-forward headers are filled from carry, and updates carry in place
// with the non-blank values seen under those headers.
//
// With no headers the copy is returned untouched. The copy is lengthened to
// len(headers) when shorter. Columns at or beyond len(carry) are left alone.
func applySheetCarryForward(row []string, headers []string, carry []string) []string {
	filled := append([]string(nil), row...)
	if len(headers) == 0 {
		return filled
	}
	if len(filled) < len(headers) {
		widened := make([]string, len(headers))
		copy(widened, filled)
		filled = widened
	}
	for col, rawHeader := range headers {
		if col >= len(carry) {
			// No remembered value exists for this or any later column.
			break
		}
		if !isCarryForwardSheetHeader(strings.TrimSpace(rawHeader)) {
			continue
		}
		if cell := strings.TrimSpace(filled[col]); cell == "" {
			filled[col] = carry[col]
		} else {
			carry[col] = cell
		}
	}
	return filled
}
// isCarryForwardSheetHeader reports whether a header names a column whose
// value should be repeated down blank cells: a non-blank header containing
// any of 星期, 日期, 部门, 类别, 类型 or 项目.
func isCarryForwardSheetHeader(header string) bool {
	header = strings.TrimSpace(header)
	if header == "" {
		return false
	}
	return strings.Contains(header, "星期") ||
		strings.Contains(header, "日期") ||
		strings.Contains(header, "部门") ||
		strings.Contains(header, "类别") ||
		strings.Contains(header, "类型") ||
		strings.Contains(header, "项目")
}
// structuredSheetRowContent renders one worksheet row as newline-separated
// lines: the sheet name, the section title (when non-blank), then one
// "label: value" line per non-blank cell. The label is the trimmed header of
// the cell's column, or "列N" (1-based) when no header text exists.
func structuredSheetRowContent(sheetName string, sectionTitle string, headers []string, row []string) string {
	lines := []string{"工作表: " + sheetName}
	if section := strings.TrimSpace(sectionTitle); section != "" {
		lines = append(lines, "分类: "+section)
	}
	for col, cell := range row {
		cell = strings.TrimSpace(cell)
		if cell == "" {
			continue
		}
		var label string
		if col < len(headers) {
			label = strings.TrimSpace(headers[col])
		}
		if label == "" {
			label = fmt.Sprintf("列%d", col+1)
		}
		lines = append(lines, label+": "+cell)
	}
	return strings.Join(lines, "\n")
}
// parsePDFKnowledgeFile extracts one block per PDF page.
//
// Pages whose embedded text is long enough are used directly; every other
// page (including pages whose text extraction failed) is sent to OCR.
// Results: blocks with a nil error on a clean run; blocks together with a
// knowledgeParseWarning when OCR reported problems; nil blocks with the
// warning, or with a plain error, when nothing could be read at all.
func parsePDFKnowledgeFile(path string) ([]textBlock, error) {
	file, reader, err := pdf.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	blocks := make([]textBlock, 0)
	scanned := make([]int, 0) // page numbers that need OCR
	for pageNum, last := 1, reader.NumPage(); pageNum <= last; pageNum++ {
		text, textErr := reader.Page(pageNum).GetPlainText(nil)
		if textErr != nil || !hasEnoughPDFText(text) {
			scanned = append(scanned, pageNum)
			continue
		}
		blocks = append(blocks, textBlock{Title: fmt.Sprintf("pdf 第%d页", pageNum), Content: text, Page: pageNum})
	}

	var warnings []string
	if len(scanned) > 0 {
		ocrBlocks, ocrWarnings := ocrPDFPages(path, scanned)
		blocks = append(blocks, ocrBlocks...)
		warnings = append(warnings, ocrWarnings...)
	}

	switch {
	case len(blocks) == 0 && len(warnings) > 0:
		return nil, knowledgeParseWarning{Warnings: warnings}
	case len(blocks) == 0:
		return nil, fmt.Errorf("PDF未读取到可索引内容")
	case len(warnings) > 0:
		return blocks, knowledgeParseWarning{Warnings: warnings}
	}
	return blocks, nil
}
// hasEnoughPDFText reports whether extracted page text is substantial enough
// to use without OCR: at least 20 runes after whitespace normalization, or at
// least 5 distinct tokens.
func hasEnoughPDFText(text string) bool {
	normalized := normalizeWhitespace(text)
	return len([]rune(normalized)) >= 20 || len(tokenizeKnowledgeText(normalized)) >= 5
}
// ocrPDFPages renders the given PDF pages to images and runs OCR on each,
// returning one block per page that produced text plus a list of warnings.
//
// Pages numbered above maxPDFOCRPages are skipped and reported with a single
// warning. Per-page rendering or recognition failures, and empty recognition
// results, are added to the warnings and do not stop the remaining pages.
// When the renderer cannot be found or the temporary directory cannot be
// created, no blocks are returned; the failure message is appended to the
// warnings collected so far, so the page-limit warning is no longer lost on
// those paths.
func ocrPDFPages(path string, pageNumbers []int) ([]textBlock, []string) {
	if len(pageNumbers) == 0 {
		return nil, nil
	}

	var warnings []string
	limitedPages := make([]int, 0, len(pageNumbers))
	for _, pageNum := range pageNumbers {
		if pageNum > maxPDFOCRPages {
			continue
		}
		limitedPages = append(limitedPages, pageNum)
	}
	if len(limitedPages) < len(pageNumbers) {
		warnings = append(warnings, fmt.Sprintf("PDF超过%d页后续页面未做视觉识别", maxPDFOCRPages))
	}
	if len(limitedPages) == 0 {
		return nil, warnings
	}

	renderer, err := pdfFindRenderer()
	if err != nil {
		return nil, append(warnings, "PDF扫描页需要pdftoppm渲染但未找到可用工具: "+err.Error())
	}
	tmpDir, err := os.MkdirTemp("", "qiwei_pdf_ocr_*")
	if err != nil {
		return nil, append(warnings, "PDF OCR临时目录创建失败: "+err.Error())
	}
	// Rendered page images are only needed for the duration of this call.
	defer os.RemoveAll(tmpDir)

	// Use the override hook when one is installed, else the default OCR call.
	ocr := pdfOCRPageImage
	if ocr == nil {
		ocr = ocrPDFPageImage
	}

	blocks := make([]textBlock, 0, len(limitedPages))
	for _, pageNum := range limitedPages {
		imagePath, err := renderPDFPageFunc(renderer, path, pageNum, tmpDir)
		if err != nil {
			warnings = append(warnings, fmt.Sprintf("PDF第%d页渲染失败: %v", pageNum, err))
			continue
		}
		text, err := ocr(imagePath, pageNum)
		if err != nil {
			warnings = append(warnings, fmt.Sprintf("PDF第%d页视觉识别失败: %v", pageNum, err))
			continue
		}
		text = normalizeWhitespace(text)
		if text == "" {
			warnings = append(warnings, fmt.Sprintf("PDF第%d页视觉识别为空", pageNum))
			continue
		}
		blocks = append(blocks, textBlock{Title: fmt.Sprintf("pdf 第%d页", pageNum), Content: text, Page: pageNum})
	}
	return blocks, warnings
}
// findPDFRenderer returns the path of a pdftoppm executable.
// It first looks for a bundled copy under tools/pdf (resolved through
// resolveAutoReplyPath), trying "pdftoppm.exe" before "pdftoppm", and then
// searches PATH in the same order.
func findPDFRenderer() (string, error) {
	names := []string{"pdftoppm.exe", "pdftoppm"}

	for _, name := range names {
		bundled := resolveAutoReplyPath(filepath.Join("tools", "pdf", name))
		info, err := os.Stat(bundled)
		if err != nil || info.IsDir() {
			continue
		}
		return bundled, nil
	}
	for _, name := range names {
		if found, err := exec.LookPath(name); err == nil {
			return found, nil
		}
	}
	return "", fmt.Errorf("pdftoppm.exe not found")
}
// renderPDFPage runs the renderer on a single page of pdfPath, producing a
// PNG at 160 DPI inside tmpDir, and returns the path of the first generated
// image. A failing command is reported together with up to 200 characters of
// its combined output.
func renderPDFPage(renderer string, pdfPath string, pageNum int, tmpDir string) (string, error) {
	page := fmt.Sprintf("%d", pageNum)
	prefix := filepath.Join(tmpDir, fmt.Sprintf("page_%d", pageNum))

	output, err := exec.Command(renderer, "-f", page, "-l", page, "-png", "-r", "160", pdfPath, prefix).CombinedOutput()
	if err != nil {
		return "", fmt.Errorf("%v: %s", err, truncateText(string(output), 200))
	}

	// The renderer appends its own "-<n>.png" suffix to the prefix.
	images, err := filepath.Glob(prefix + "-*.png")
	switch {
	case err != nil:
		return "", err
	case len(images) == 0:
		return "", fmt.Errorf("pdftoppm未生成图片")
	}
	return images[0], nil
}
// ocrPDFPageImage sends a rendered page image to the configured vision chat
// endpoint with an OCR prompt and returns the trimmed answer text.
func ocrPDFPageImage(imagePath string, pageNum int) (string, error) {
	const systemPrompt = "你是一个严谨的PDF页面OCR识别器只提取图片中真实可见的文字保留标题、表格行列关系和关键数值不要补充不存在的内容。"

	cfg := getAutoReplyEngine().getConfig()
	dataURL, err := imageDataURLFromFile(imagePath)
	if err != nil {
		return "", err
	}

	userPrompt := fmt.Sprintf("请完整识别这张PDF第%d页中的全部可见文字。若有表格请用每行一段的方式输出。", pageNum)
	reply, err := callOpenAICompatibleVisionChat(cfg.AI, systemPrompt, userPrompt, dataURL)
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(reply.Answer), nil
}
// splitTextToBlocks normalizes text and splits it on blank lines into
// paragraph blocks that all share title and page. Line is the 1-based index
// of the paragraph within the split. Empty text yields nil; if splitting
// produces no paragraph, the whole text becomes a single block.
func splitTextToBlocks(text string, title string, page int) []textBlock {
	text = normalizeWhitespace(text)
	if text == "" {
		return nil
	}

	paragraphs := regexp.MustCompile(`\n{2,}`).Split(text, -1)
	blocks := make([]textBlock, 0, len(paragraphs))
	for idx, paragraph := range paragraphs {
		paragraph = strings.TrimSpace(paragraph)
		if paragraph == "" {
			continue
		}
		blocks = append(blocks, textBlock{Title: title, Content: paragraph, Line: idx + 1, Page: page})
	}
	if len(blocks) > 0 {
		return blocks
	}
	return append(blocks, textBlock{Title: title, Content: text, Page: page})
}
// readZipFile returns the full uncompressed content of one archive entry.
func readZipFile(file *zip.File) ([]byte, error) {
	entry, openErr := file.Open()
	if openErr != nil {
		return nil, openErr
	}
	defer entry.Close()

	payload, readErr := io.ReadAll(entry)
	return payload, readErr
}
// extractXMLText returns every non-blank character-data run of an XML
// document, trimmed, HTML-unescaped and joined with newlines. Decoding stops
// silently at the first token error.
func extractXMLText(data []byte) string {
	decoder := xml.NewDecoder(bytes.NewReader(data))
	var pieces []string
	for token, err := decoder.Token(); err == nil; token, err = decoder.Token() {
		chars, isText := token.(xml.CharData)
		if !isText {
			continue
		}
		if trimmed := strings.TrimSpace(string(chars)); trimmed != "" {
			pieces = append(pieces, html.UnescapeString(trimmed))
		}
	}
	return strings.Join(pieces, "\n")
}
// extractSharedStrings returns the text of every <si> element in the given
// XML, in document order. All character data found inside one <si> is
// concatenated without a separator. Decoding stops silently at the first
// token error.
func extractSharedStrings(data []byte) []string {
	decoder := xml.NewDecoder(bytes.NewReader(data))
	result := make([]string, 0)
	var item strings.Builder
	open := false // true while inside an <si> element
	for {
		token, err := decoder.Token()
		if err != nil {
			return result
		}
		switch el := token.(type) {
		case xml.StartElement:
			if el.Name.Local == "si" {
				open = true
				item.Reset()
			}
		case xml.CharData:
			if open {
				item.WriteString(string(el))
			}
		case xml.EndElement:
			if open && el.Name.Local == "si" {
				result = append(result, item.String())
				open = false
			}
		}
	}
}
// extractSheetRows parses worksheet XML and returns one block per row that
// has at least one non-empty cell, with the cells joined by " | ".
//
// Cell text is taken from <v> and <t> elements that occur inside a <row>.
// A cell whose "t" attribute is "s" is treated as an index into
// sharedStrings when it parses as a valid in-range index. The block title is
// the base name of sheetName and Line is the running row count. Decoding
// stops silently at the first token error.
func extractSheetRows(sheetName string, data []byte, sharedStrings []string) []textBlock {
	decoder := xml.NewDecoder(bytes.NewReader(data))
	result := make([]textBlock, 0)
	var (
		cells     []string
		cellKind  string
		cellText  string
		insideRow bool
		capturing bool // true while inside <v> or <t> within a row
		rowCount  int
	)
	for {
		token, err := decoder.Token()
		if err != nil {
			break
		}
		if start, ok := token.(xml.StartElement); ok {
			switch start.Name.Local {
			case "row":
				insideRow = true
				cells = nil
				rowCount++
			case "c":
				cellKind, cellText = "", ""
				for _, attr := range start.Attr {
					if attr.Name.Local == "t" {
						cellKind = attr.Value
					}
				}
			case "v", "t":
				if insideRow {
					capturing = true
				}
			}
			continue
		}
		if end, ok := token.(xml.EndElement); ok {
			switch end.Name.Local {
			case "v", "t":
				capturing = false
			case "c":
				value := strings.TrimSpace(cellText)
				if cellKind == "s" {
					// Shared-string cell: the stored value is an index.
					if idx, convErr := strconvAtoiSafe(value); convErr == nil && idx >= 0 && idx < len(sharedStrings) {
						value = sharedStrings[idx]
					}
				}
				if value != "" {
					cells = append(cells, value)
				}
			case "row":
				insideRow = false
				if len(cells) > 0 {
					result = append(result, textBlock{
						Title:   filepath.Base(sheetName),
						Content: strings.Join(cells, " | "),
						Line:    rowCount,
					})
				}
			}
			continue
		}
		if chars, ok := token.(xml.CharData); ok && capturing {
			cellText += string(chars)
		}
	}
	return result
}
// extractPDFLikeText pulls the contents of parenthesised runs "( ... )" out of
// raw bytes, keeping the ones that are non-blank and more than 60% printable,
// and joins them with newlines. The escape sequences \( \) and \n inside a
// run are replaced by the characters they stand for. When no run qualifies,
// the whole input is returned with non-printable runes (other than newline,
// carriage return and tab) replaced by spaces.
func extractPDFLikeText(data []byte) string {
	raw := string(data)
	literal := regexp.MustCompile(`\(([^()]*)\)`)
	found := literal.FindAllStringSubmatch(raw, -1)

	parts := make([]string, 0, len(found))
	for _, groups := range found {
		if len(groups) < 2 {
			continue
		}
		text := groups[1]
		for _, escape := range [][2]string{{`\(`, "("}, {`\)`, ")"}, {`\n`, "\n"}} {
			text = strings.ReplaceAll(text, escape[0], escape[1])
		}
		text = strings.TrimSpace(text)
		if text == "" || printableRatio(text) <= 0.6 {
			continue
		}
		parts = append(parts, text)
	}

	if len(parts) == 0 {
		sanitized := strings.Map(func(r rune) rune {
			switch {
			case r == '\n', r == '\r', r == '\t', unicode.IsPrint(r):
				return r
			default:
				return ' '
			}
		}, raw)
		parts = append(parts, sanitized)
	}
	return strings.Join(parts, "\n")
}
// tokenizeKnowledgeText lower-cases text and returns token counts.
//
// Runs of non-Han letters and digits form word tokens; words of a single
// rune are dropped. Each maximal run of Han characters contributes every
// character once (count +1) and every adjacent pair as a bigram (count +2).
// Any other rune only acts as a separator.
func tokenizeKnowledgeText(text string) map[string]int {
	counts := make(map[string]int)
	var word, han []rune

	// endWord records the pending letter/digit run, if longer than one rune.
	endWord := func() {
		if len(word) > 1 {
			counts[string(word)]++
		}
		word = word[:0]
	}
	// endHan records unigrams and weighted bigrams for the pending Han run.
	endHan := func() {
		for _, r := range han {
			counts[string(r)]++
		}
		for i := 1; i < len(han); i++ {
			counts[string(han[i-1:i+1])] += 2
		}
		han = han[:0]
	}

	for _, r := range strings.ToLower(text) {
		if unicode.Is(unicode.Han, r) {
			endWord()
			han = append(han, r)
			continue
		}
		endHan()
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			word = append(word, r)
		} else {
			endWord()
		}
	}
	endWord()
	endHan()
	return counts
}
// addChineseTokens adds the tokens of one run of characters to tokens:
// every character counts once, and every adjacent pair counts twice.
func addChineseTokens(tokens map[string]int, chars []rune) {
	for pos, r := range chars {
		tokens[string(r)]++
		if pos > 0 {
			tokens[string(chars[pos-1:pos+1])] += 2
		}
	}
}
// hashKnowledgeChunk returns the lower-case hex SHA-1 digest of
// "source:idx:content".
func hashKnowledgeChunk(source string, content string, idx int) string {
	digest := sha1.New()
	fmt.Fprintf(digest, "%s:%d:%s", source, idx, content)
	return hex.EncodeToString(digest.Sum(nil))
}
// stripBOM removes a single leading UTF-8 byte order mark (U+FEFF), if any.
func stripBOM(text string) string {
	const bom = "\ufeff"
	if strings.HasPrefix(text, bom) {
		return text[len(bom):]
	}
	return text
}
// knowledgeInlineSpaceRe matches runs of spaces and tabs inside a line.
// It is compiled once here instead of once per line inside
// normalizeWhitespace.
var knowledgeInlineSpaceRe = regexp.MustCompile(`[ \t]+`)

// normalizeWhitespace canonicalizes the whitespace of text:
//   - "\r\n" and lone "\r" become "\n";
//   - runs of spaces/tabs inside a line collapse to one space and each line
//     is trimmed;
//   - consecutive blank lines collapse to a single blank line, and leading
//     blank lines are dropped;
//   - the final result is trimmed.
func normalizeWhitespace(text string) string {
	text = strings.ReplaceAll(text, "\r\n", "\n")
	text = strings.ReplaceAll(text, "\r", "\n")
	lines := strings.Split(text, "\n")
	cleaned := make([]string, 0, len(lines))
	for _, line := range lines {
		line = strings.TrimSpace(knowledgeInlineSpaceRe.ReplaceAllString(line, " "))
		if line != "" {
			cleaned = append(cleaned, line)
		} else if len(cleaned) > 0 && cleaned[len(cleaned)-1] != "" {
			// Keep at most one blank line as a paragraph separator.
			cleaned = append(cleaned, "")
		}
	}
	return strings.TrimSpace(strings.Join(cleaned, "\n"))
}
// printableRatio returns the fraction of runes in text that are printable or
// whitespace, in the range [0, 1]. Empty text yields 0.
func printableRatio(text string) float64 {
	var shown, all int
	for _, r := range text {
		all++
		if unicode.IsSpace(r) || unicode.IsPrint(r) {
			shown++
		}
	}
	if all == 0 {
		return 0
	}
	return float64(shown) / float64(all)
}
// strconvAtoiSafe parses a non-negative decimal integer made only of the
// ASCII digits 0-9, ignoring surrounding whitespace.
//
// It returns an error for an empty string, for any non-digit character
// (including a sign), and for values that do not fit in an int. Previously
// an over-long digit string overflowed silently and produced a wrapped value
// with a nil error.
func strconvAtoiSafe(value string) (int, error) {
	value = strings.TrimSpace(value)
	if value == "" {
		return 0, fmt.Errorf("empty")
	}
	const maxInt = int(^uint(0) >> 1)
	n := 0
	for _, r := range value {
		if r < '0' || r > '9' {
			return 0, fmt.Errorf("invalid int")
		}
		digit := int(r - '0')
		// n*10+digit would exceed maxInt exactly when n > (maxInt-digit)/10.
		if n > (maxInt-digit)/10 {
			return 0, fmt.Errorf("int overflow")
		}
		n = n*10 + digit
	}
	return n, nil
}
// resolveAutoReplyPath turns a configured path into a usable one.
// Absolute paths are returned unchanged. Relative paths are joined onto the
// directory of the running executable; if that cannot be determined, onto the
// current working directory; and if that also fails the path is returned
// as given.
func resolveAutoReplyPath(pathValue string) string {
	if filepath.IsAbs(pathValue) {
		return pathValue
	}
	if exePath, err := os.Executable(); err == nil {
		return filepath.Join(filepath.Dir(exePath), pathValue)
	}
	if wd, err := os.Getwd(); err == nil {
		return filepath.Join(wd, pathValue)
	}
	return pathValue
}