- 将默认回复详细程度从"detailed"调整为"medium",前后端保持一致
- 新增话题切换检测逻辑,当用户主动要求换话题时提供引导回复
- 优化上下文处理机制,仅在指代型追问时注入历史对话,避免模型复读旧内容
- 改进知识库检索逻辑,区分自包含问题和指代型问题的上下文需求
- 完善知识库完整性指令,确保回复详细程度与知识展开程度一致
- 重构知识库重建逻辑,支持递归扫描子目录中的文件,修复索引为空的问题
- 增强素材匹配算法,引入强信号检测机制,避免仅凭模糊匹配误发素材
- 新增素材开场白AI生成功能,支持图片、视频、文档等类型智能描述
- 改进知识库重建通知,显示具体的文件数、分片数及失败统计信息
1270 lines
32 KiB
Go
1270 lines
32 KiB
Go
package main
|
||
|
||
import (
|
||
"archive/zip"
|
||
"bytes"
|
||
"crypto/sha1"
|
||
"encoding/csv"
|
||
"encoding/hex"
|
||
"encoding/json"
|
||
"encoding/xml"
|
||
"fmt"
|
||
"html"
|
||
"io"
|
||
"io/fs"
|
||
"math"
|
||
"os"
|
||
"os/exec"
|
||
"path/filepath"
|
||
"regexp"
|
||
"strings"
|
||
"time"
|
||
"unicode"
|
||
|
||
"github.com/ledongthuc/pdf"
|
||
"github.com/xuri/excelize/v2"
|
||
)
|
||
|
||
// Size limits applied while building the knowledge index.
const (
	// maxKnowledgeChunkContentRunes is the rune budget handed to
	// splitLongKnowledgeContent for the content of a single chunk.
	maxKnowledgeChunkContentRunes = 6000

	// maxPDFOCRPages is the highest PDF page number that ocrPDFPages will
	// OCR; pages numbered above it are skipped with a warning.
	maxPDFOCRPages = 20
)
|
||
|
||
var (
|
||
pdfFindRenderer = findPDFRenderer
|
||
pdfOCRPageImage func(string, int) (string, error)
|
||
renderPDFPageFunc = renderPDFPage
|
||
)
|
||
|
||
// KnowledgeChunk is one indexed piece of a knowledge file, as persisted in
// the JSON knowledge index.
type KnowledgeChunk struct {
	// ID is the chunk identifier; parseKnowledgeFile sets it to the same
	// value as Hash.
	ID string `json:"id"`
	// Source is the slash-separated file path relative to the knowledge root.
	Source string `json:"source"`
	// Title is the heading of the originating block; a "i/n" suffix is added
	// when the block was split into several parts.
	Title string `json:"title"`
	// Content is the chunk text.
	Content string `json:"content"`
	// Line is the line (or row/paragraph) number reported by the parser.
	Line int `json:"line"`
	// Page is the page number reported by the parser (set for PDF blocks).
	Page int `json:"page"`
	// SectionID is a hash shared by all parts cut from the same block.
	SectionID string `json:"sectionId,omitempty"`
	// SectionIndex is the index of the originating block within its file.
	SectionIndex int `json:"sectionIndex,omitempty"`
	// PartIndex is the index of this part within its block.
	PartIndex int `json:"partIndex,omitempty"`
	// UpdatedAt is the file modification time in Unix seconds, or the parse
	// time when the file could not be stat'ed.
	UpdatedAt int64 `json:"updatedAt"`
	// Hash is the SHA-1 hex digest produced by hashKnowledgeChunk.
	Hash string `json:"hash"`
	// Score is not populated by the indexing code in this file.
	// NOTE(review): presumably filled in at retrieval time — confirm.
	Score float64 `json:"score,omitempty"`
}
|
||
|
||
type KnowledgeIndex struct {
|
||
Chunks []KnowledgeChunk `json:"chunks"`
|
||
FileCount int `json:"fileCount"`
|
||
FailedFiles []string `json:"failedFiles"`
|
||
LastIndexedAt int64 `json:"lastIndexedAt"`
|
||
}
|
||
|
||
// knowledgeParseWarning is an error value carrying one or more non-fatal
// messages produced while parsing a knowledge file.
type knowledgeParseWarning struct {
	// Warnings holds the individual messages in the order they occurred.
	Warnings []string
}

// Compile-time check that the value type satisfies error.
var _ error = knowledgeParseWarning{}

// Error joins all warnings with a full-width semicolon.
func (e knowledgeParseWarning) Error() string {
	const separator = ";"
	return strings.Join(e.Warnings, separator)
}
|
||
|
||
func NewKnowledgeIndex() *KnowledgeIndex {
|
||
return &KnowledgeIndex{
|
||
Chunks: make([]KnowledgeChunk, 0),
|
||
FailedFiles: make([]string, 0),
|
||
}
|
||
}
|
||
|
||
func (e *AutoReplyEngine) loadKnowledgeIndex() error {
|
||
cfg := e.getConfig()
|
||
indexPath := resolveAutoReplyPath(cfg.Knowledge.IndexPath)
|
||
data, err := os.ReadFile(indexPath)
|
||
if err != nil {
|
||
if os.IsNotExist(err) {
|
||
e.updateKnowledgeStatus(NewKnowledgeIndex())
|
||
return nil
|
||
}
|
||
return err
|
||
}
|
||
var idx KnowledgeIndex
|
||
if err := json.Unmarshal(data, &idx); err != nil {
|
||
return err
|
||
}
|
||
e.mu.Lock()
|
||
e.index = &idx
|
||
e.status.KnowledgeFileCount = idx.FileCount
|
||
e.status.KnowledgeChunkCount = len(idx.Chunks)
|
||
e.status.KnowledgeLastIndexedAt = idx.LastIndexedAt
|
||
e.status.KnowledgeFailedFiles = append([]string(nil), idx.FailedFiles...)
|
||
e.mu.Unlock()
|
||
return nil
|
||
}
|
||
|
||
func (e *AutoReplyEngine) rebuildKnowledgeIndex() (*KnowledgeIndex, error) {
|
||
cfg := e.getConfig()
|
||
root := resolveAutoReplyPath(cfg.Knowledge.Directory)
|
||
idx := NewKnowledgeIndex()
|
||
allowed := make(map[string]bool)
|
||
for _, ext := range cfg.Knowledge.SupportedExtensions {
|
||
allowed[strings.ToLower(ext)] = true
|
||
}
|
||
if len(allowed) == 0 {
|
||
allowed[".md"] = true
|
||
allowed[".txt"] = true
|
||
allowed[".csv"] = true
|
||
}
|
||
if err := os.MkdirAll(root, 0755); err != nil {
|
||
return nil, err
|
||
}
|
||
// 递归遍历子目录(filepath.WalkDir):知识库常按分类分文件夹组织
|
||
// (如 01_产品与设备/、03_售后支持/01_故障排查/),与素材扫描保持一致。
|
||
// 仅扫根目录会漏掉所有子目录文件,导致索引为空、向量召回失败。
|
||
walkErr := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
|
||
if err != nil {
|
||
return nil // 单个条目出错跳过,不中断整体重建
|
||
}
|
||
if d.IsDir() {
|
||
return nil
|
||
}
|
||
ext := strings.ToLower(filepath.Ext(d.Name()))
|
||
if !isRootKnowledgeFile(d.Name(), ext, allowed, cfg.Knowledge.IndexPath, cfg.Retrieval.EmbeddingIndexPath) {
|
||
return nil
|
||
}
|
||
chunks, err := parseKnowledgeFile(path, root)
|
||
if err != nil {
|
||
var warning knowledgeParseWarning
|
||
if ok := errorAs(err, &warning); ok {
|
||
for _, item := range warning.Warnings {
|
||
idx.FailedFiles = append(idx.FailedFiles, fmt.Sprintf("%s: %s", path, item))
|
||
}
|
||
} else {
|
||
idx.FailedFiles = append(idx.FailedFiles, fmt.Sprintf("%s: %v", path, err))
|
||
return nil
|
||
}
|
||
}
|
||
if len(chunks) == 0 {
|
||
idx.FailedFiles = append(idx.FailedFiles, fmt.Sprintf("%s: 未读取到可索引内容", path))
|
||
return nil
|
||
}
|
||
idx.FileCount++
|
||
idx.Chunks = append(idx.Chunks, chunks...)
|
||
return nil
|
||
})
|
||
if walkErr != nil {
|
||
return nil, walkErr
|
||
}
|
||
idx.LastIndexedAt = time.Now().Unix()
|
||
indexPath := resolveAutoReplyPath(cfg.Knowledge.IndexPath)
|
||
if err := os.MkdirAll(filepath.Dir(indexPath), 0755); err != nil {
|
||
return nil, err
|
||
}
|
||
data, err := json.MarshalIndent(idx, "", " ")
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if err := os.WriteFile(indexPath, data, 0644); err != nil {
|
||
return nil, err
|
||
}
|
||
e.updateKnowledgeStatus(idx)
|
||
if err := e.rebuildEmbeddingIndex(idx); err != nil {
|
||
e.setLastErrorWithScope(autoReplyErrorScopeKnowledge, err.Error())
|
||
}
|
||
return idx, nil
|
||
}
|
||
|
||
// isRootKnowledgeFile reports whether a file should be indexed as knowledge.
//
// A file qualifies when its trimmed name is non-empty and not ".keep", its
// lower-cased extension is present in allowed, and its name does not equal
// (case-insensitively) the base name of either index file path.
func isRootKnowledgeFile(name string, ext string, allowed map[string]bool, knowledgeIndexPath string, embeddingIndexPath string) bool {
	name = strings.TrimSpace(name)
	switch {
	case name == "", name == ".keep":
		return false
	case !allowed[strings.ToLower(ext)]:
		return false
	}
	// Never index the engine's own generated index files.
	for _, reserved := range []string{knowledgeIndexPath, embeddingIndexPath} {
		if strings.EqualFold(name, filepath.Base(reserved)) {
			return false
		}
	}
	return true
}
|
||
|
||
func errorAs(err error, target interface{}) bool {
|
||
switch t := target.(type) {
|
||
case *knowledgeParseWarning:
|
||
if value, ok := err.(knowledgeParseWarning); ok {
|
||
*t = value
|
||
return true
|
||
}
|
||
if value, ok := err.(*knowledgeParseWarning); ok {
|
||
*t = *value
|
||
return true
|
||
}
|
||
}
|
||
return false
|
||
}
|
||
|
||
func (e *AutoReplyEngine) updateKnowledgeStatus(idx *KnowledgeIndex) {
|
||
e.mu.Lock()
|
||
e.index = idx
|
||
e.status.KnowledgeFileCount = idx.FileCount
|
||
e.status.KnowledgeChunkCount = len(idx.Chunks)
|
||
e.status.KnowledgeLastIndexedAt = idx.LastIndexedAt
|
||
e.status.KnowledgeFailedFiles = append([]string(nil), idx.FailedFiles...)
|
||
e.mu.Unlock()
|
||
}
|
||
|
||
func scoreKnowledgeChunk(queryTokens map[string]int, chunk KnowledgeChunk) float64 {
|
||
textTokens := tokenizeKnowledgeText(chunk.Title + " " + chunk.Content)
|
||
if len(textTokens) == 0 {
|
||
return 0
|
||
}
|
||
var matched, weighted float64
|
||
for token, qCount := range queryTokens {
|
||
if count, ok := textTokens[token]; ok {
|
||
matched++
|
||
weighted += math.Min(float64(count), float64(qCount)+1)
|
||
}
|
||
}
|
||
if matched == 0 {
|
||
return 0
|
||
}
|
||
coverage := matched / float64(len(queryTokens))
|
||
density := weighted / math.Sqrt(float64(len(textTokens))+1)
|
||
return coverage*0.75 + density*0.25
|
||
}
|
||
|
||
func parseKnowledgeFile(path string, root string) ([]KnowledgeChunk, error) {
|
||
ext := strings.ToLower(filepath.Ext(path))
|
||
var blocks []textBlock
|
||
var err error
|
||
switch ext {
|
||
case ".md", ".txt":
|
||
blocks, err = parsePlainKnowledgeFile(path)
|
||
case ".csv":
|
||
blocks, err = parseCSVKnowledgeFile(path)
|
||
case ".docx":
|
||
blocks, err = parseDocxKnowledgeFile(path)
|
||
case ".xlsx":
|
||
blocks, err = parseXlsxKnowledgeFile(path)
|
||
case ".pdf":
|
||
blocks, err = parsePDFKnowledgeFile(path)
|
||
default:
|
||
err = fmt.Errorf("unsupported extension: %s", ext)
|
||
}
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
rel, err := filepath.Rel(root, path)
|
||
if err != nil {
|
||
rel = filepath.Base(path)
|
||
}
|
||
info, _ := os.Stat(path)
|
||
updatedAt := time.Now().Unix()
|
||
if info != nil {
|
||
updatedAt = info.ModTime().Unix()
|
||
}
|
||
chunks := make([]KnowledgeChunk, 0, len(blocks))
|
||
for i, block := range blocks {
|
||
content := strings.TrimSpace(block.Content)
|
||
if content == "" || isLowValueKnowledgeBlock(block.Title, content) {
|
||
continue
|
||
}
|
||
parts := splitLongKnowledgeContent(content, maxKnowledgeChunkContentRunes)
|
||
for partIndex, part := range parts {
|
||
title := block.Title
|
||
if len(parts) > 1 {
|
||
title = fmt.Sprintf("%s %d/%d", strings.TrimSpace(block.Title), partIndex+1, len(parts))
|
||
}
|
||
hash := hashKnowledgeChunk(rel, part, i*1000+partIndex)
|
||
sectionID := hashKnowledgeChunk(rel, strings.TrimSpace(block.Title), i)
|
||
chunks = append(chunks, KnowledgeChunk{
|
||
ID: hash,
|
||
Source: filepath.ToSlash(rel),
|
||
Title: title,
|
||
Content: part,
|
||
Line: block.Line,
|
||
Page: block.Page,
|
||
SectionID: sectionID,
|
||
SectionIndex: i,
|
||
PartIndex: partIndex,
|
||
UpdatedAt: updatedAt,
|
||
Hash: hash,
|
||
})
|
||
}
|
||
}
|
||
return chunks, nil
|
||
}
|
||
|
||
// textBlock is the parser-level unit of extracted text, before it is turned
// into one or more KnowledgeChunks.
type textBlock struct {
	// Title is the heading or label the parser attached to the block.
	Title string
	// Content is the extracted text.
	Content string
	// Line is the 1-based line, row or paragraph number, when the parser
	// tracks one.
	Line int
	// Page is the page number; only the PDF code paths set it.
	Page int
}
|
||
|
||
// splitLongKnowledgeContent splits content into pieces of at most maxRunes
// runes, preferring to break at line boundaries.
//
// Content is trimmed first. Empty content yields nil. When maxRunes <= 0 or
// the content already fits, it is returned as a single element. Otherwise
// trimmed lines are packed greedily (joined by "\n") into chunks; a single
// line longer than maxRunes is cut hard every maxRunes runes. Segments that
// consist only of whitespace are dropped, so no returned element is empty.
func splitLongKnowledgeContent(content string, maxRunes int) []string {
	content = strings.TrimSpace(content)
	if content == "" {
		return nil
	}
	if maxRunes <= 0 || len([]rune(content)) <= maxRunes {
		return []string{content}
	}

	var chunks []string
	var current strings.Builder
	currentRunes := 0

	// flush emits the chunk being assembled (if any) and starts a new one.
	flush := func() {
		if text := strings.TrimSpace(current.String()); text != "" {
			chunks = append(chunks, text)
		}
		current.Reset()
		currentRunes = 0
	}

	for _, line := range strings.Split(content, "\n") {
		piece := []rune(strings.TrimSpace(line))

		// Oversized line: close the pending chunk, then cut hard.
		for len(piece) > maxRunes {
			flush()
			// A cut can land inside a run of spaces; skip such segments
			// instead of emitting an empty chunk.
			if segment := strings.TrimSpace(string(piece[:maxRunes])); segment != "" {
				chunks = append(chunks, segment)
			}
			piece = piece[maxRunes:]
		}
		if len(piece) == 0 {
			continue
		}

		// +1 accounts for the "\n" separator joining lines within a chunk.
		if currentRunes > 0 && currentRunes+1+len(piece) > maxRunes {
			flush()
		}
		if currentRunes > 0 {
			current.WriteString("\n")
			currentRunes++
		}
		current.WriteString(string(piece))
		currentRunes += len(piece)
	}
	flush()

	if len(chunks) == 0 {
		return []string{content}
	}
	return chunks
}
|
||
|
||
func parsePlainKnowledgeFile(path string) ([]textBlock, error) {
|
||
data, err := os.ReadFile(path)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
text := stripBOM(string(data))
|
||
lines := strings.Split(text, "\n")
|
||
blocks := make([]textBlock, 0)
|
||
var currentTitle string
|
||
var current []string
|
||
startLine := 1
|
||
inFrontMatter := false
|
||
inDataviewBlock := false
|
||
flush := func() {
|
||
content := strings.TrimSpace(strings.Join(current, "\n"))
|
||
if content != "" && !isLowValueKnowledgeBlock(currentTitle, content) {
|
||
blocks = append(blocks, textBlock{Title: currentTitle, Content: content, Line: startLine})
|
||
}
|
||
current = nil
|
||
}
|
||
for i, line := range lines {
|
||
trimmed := strings.TrimSpace(line)
|
||
if i == 0 && trimmed == "---" {
|
||
inFrontMatter = true
|
||
continue
|
||
}
|
||
if inFrontMatter {
|
||
if trimmed == "---" {
|
||
inFrontMatter = false
|
||
startLine = i + 2
|
||
}
|
||
continue
|
||
}
|
||
if inDataviewBlock {
|
||
if strings.HasPrefix(trimmed, "```") {
|
||
inDataviewBlock = false
|
||
startLine = i + 2
|
||
}
|
||
continue
|
||
}
|
||
if strings.EqualFold(trimmed, "```dataview") {
|
||
flush()
|
||
inDataviewBlock = true
|
||
continue
|
||
}
|
||
if strings.HasPrefix(trimmed, "#") {
|
||
flush()
|
||
currentTitle = strings.TrimSpace(strings.TrimLeft(trimmed, "#"))
|
||
startLine = i + 1
|
||
continue
|
||
}
|
||
if trimmed == "" {
|
||
flush()
|
||
startLine = i + 2
|
||
continue
|
||
}
|
||
if len(current) == 0 {
|
||
startLine = i + 1
|
||
}
|
||
current = append(current, trimmed)
|
||
}
|
||
flush()
|
||
return blocks, nil
|
||
}
|
||
|
||
// isLowValueKnowledgeBlock reports whether a block carries no indexable
// knowledge: empty text, a horizontal rule / dash-only text, a dataview
// query, a front-matter fragment, or a "反向链接" (backlinks) section that
// only contains a dataview query.
func isLowValueKnowledgeBlock(title string, content string) bool {
	content = strings.TrimSpace(content)
	lower := strings.ToLower(content)
	hasListQuery := strings.Contains(lower, "list from [[")

	switch {
	case content == "":
		return true
	case content == "---", strings.Trim(content, "- \t\r\n") == "":
		return true
	case strings.HasPrefix(lower, "```dataview"), hasListQuery:
		return true
	case strings.HasPrefix(content, "---\n") && strings.Contains(content, "\n---"):
		return true
	}

	if strings.TrimSpace(title) != "反向链接" {
		return false
	}
	return strings.Contains(lower, "dataview") || hasListQuery
}
|
||
|
||
func parseCSVKnowledgeFile(path string) ([]textBlock, error) {
|
||
data, err := os.ReadFile(path)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
reader := csv.NewReader(bytes.NewReader([]byte(stripBOM(string(data)))))
|
||
reader.FieldsPerRecord = -1
|
||
records, err := reader.ReadAll()
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if len(records) == 0 {
|
||
return nil, nil
|
||
}
|
||
headers := records[0]
|
||
blocks := make([]textBlock, 0, len(records)-1)
|
||
for i, row := range records[1:] {
|
||
parts := make([]string, 0, len(row))
|
||
for j, val := range row {
|
||
val = strings.TrimSpace(val)
|
||
if val == "" {
|
||
continue
|
||
}
|
||
if j < len(headers) && strings.TrimSpace(headers[j]) != "" {
|
||
parts = append(parts, strings.TrimSpace(headers[j])+": "+val)
|
||
} else {
|
||
parts = append(parts, val)
|
||
}
|
||
}
|
||
if len(parts) > 0 {
|
||
blocks = append(blocks, textBlock{Title: fmt.Sprintf("row %d", i+2), Content: strings.Join(parts, "\n"), Line: i + 2})
|
||
}
|
||
}
|
||
return blocks, nil
|
||
}
|
||
|
||
func parseDocxKnowledgeFile(path string) ([]textBlock, error) {
|
||
zr, err := zip.OpenReader(path)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
defer zr.Close()
|
||
var document []byte
|
||
for _, file := range zr.File {
|
||
if file.Name == "word/document.xml" {
|
||
document, err = readZipFile(file)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
break
|
||
}
|
||
}
|
||
if len(document) == 0 {
|
||
return nil, fmt.Errorf("word/document.xml not found")
|
||
}
|
||
return extractDocxBlocks(document), nil
|
||
}
|
||
|
||
func parseXlsxKnowledgeFile(path string) ([]textBlock, error) {
|
||
file, err := excelize.OpenFile(path)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
defer func() { _ = file.Close() }()
|
||
|
||
blocks := make([]textBlock, 0)
|
||
for _, sheetName := range file.GetSheetList() {
|
||
rows, err := file.GetRows(sheetName)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if len(rows) == 0 {
|
||
continue
|
||
}
|
||
applyMergedCellValues(file, sheetName, rows)
|
||
blocks = append(blocks, structuredSheetBlocks(sheetName, rows)...)
|
||
}
|
||
return blocks, nil
|
||
}
|
||
|
||
func applyMergedCellValues(file *excelize.File, sheetName string, rows [][]string) {
|
||
mergeCells, err := file.GetMergeCells(sheetName)
|
||
if err != nil {
|
||
return
|
||
}
|
||
for _, mergeCell := range mergeCells {
|
||
value := strings.TrimSpace(mergeCell.GetCellValue())
|
||
if value == "" {
|
||
continue
|
||
}
|
||
startCol, startRow, err := excelize.CellNameToCoordinates(mergeCell.GetStartAxis())
|
||
if err != nil {
|
||
continue
|
||
}
|
||
endCol, endRow, err := excelize.CellNameToCoordinates(mergeCell.GetEndAxis())
|
||
if err != nil {
|
||
continue
|
||
}
|
||
if startRow == endRow {
|
||
continue
|
||
}
|
||
for row := startRow; row <= endRow; row++ {
|
||
for col := startCol; col <= endCol; col++ {
|
||
setSheetCellValue(rows, row-1, col-1, value)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// setSheetCellValue writes value into rows[rowIndex][colIndex] only when
// that cell is currently blank. The row is widened as needed; indexes
// outside the existing rows (or negative) are ignored.
func setSheetCellValue(rows [][]string, rowIndex int, colIndex int, value string) {
	outOfRange := rowIndex < 0 || rowIndex >= len(rows) || colIndex < 0
	if outOfRange {
		return
	}

	cells := rows[rowIndex]
	if colIndex >= len(cells) {
		grown := make([]string, colIndex+1)
		copy(grown, cells)
		cells = grown
		rows[rowIndex] = cells
	}
	if strings.TrimSpace(cells[colIndex]) != "" {
		return // never overwrite existing content
	}
	cells[colIndex] = value
}
|
||
|
||
func structuredSheetBlocks(sheetName string, rows [][]string) []textBlock {
|
||
blocks := make([]textBlock, 0)
|
||
headers := make([]string, 0)
|
||
carry := make([]string, 0)
|
||
sectionTitle := ""
|
||
for i, rawRow := range rows {
|
||
row := normalizeSheetRow(rawRow)
|
||
nonEmpty := nonEmptySheetValues(row)
|
||
if len(nonEmpty) == 0 {
|
||
continue
|
||
}
|
||
if len(nonEmpty) == 1 && !looksLikeSheetHeaderRow(row) {
|
||
sectionTitle = nonEmpty[0]
|
||
blocks = append(blocks, textBlock{
|
||
Title: sheetName,
|
||
Content: fmt.Sprintf("工作表: %s\n分类: %s", sheetName, sectionTitle),
|
||
Line: i + 1,
|
||
})
|
||
continue
|
||
}
|
||
if looksLikeSheetHeaderRow(row) {
|
||
headers = row
|
||
carry = make([]string, len(headers))
|
||
blocks = append(blocks, textBlock{
|
||
Title: sheetName,
|
||
Content: fmt.Sprintf("工作表: %s\n表头: %s", sheetName, strings.Join(nonEmpty, ";")),
|
||
Line: i + 1,
|
||
})
|
||
continue
|
||
}
|
||
filled := applySheetCarryForward(row, headers, carry)
|
||
content := structuredSheetRowContent(sheetName, sectionTitle, headers, filled)
|
||
if strings.TrimSpace(content) == "" {
|
||
continue
|
||
}
|
||
blocks = append(blocks, textBlock{Title: sheetName, Content: content, Line: i + 1})
|
||
}
|
||
return blocks
|
||
}
|
||
|
||
func extractDocxBlocks(data []byte) []textBlock {
|
||
decoder := xml.NewDecoder(bytes.NewReader(data))
|
||
blocks := make([]textBlock, 0)
|
||
var paragraph strings.Builder
|
||
var cell strings.Builder
|
||
var rowCells []string
|
||
inText := false
|
||
inParagraph := false
|
||
tableDepth := 0
|
||
inCell := false
|
||
rowIndex := 0
|
||
paragraphIndex := 0
|
||
|
||
flushParagraph := func() {
|
||
text := normalizeWhitespace(paragraph.String())
|
||
paragraph.Reset()
|
||
if text == "" {
|
||
return
|
||
}
|
||
paragraphIndex++
|
||
blocks = append(blocks, textBlock{Title: "docx paragraph", Content: text, Line: paragraphIndex})
|
||
}
|
||
flushCell := func() {
|
||
text := normalizeWhitespace(cell.String())
|
||
cell.Reset()
|
||
if text != "" {
|
||
rowCells = append(rowCells, text)
|
||
}
|
||
}
|
||
flushRow := func() {
|
||
cleaned := make([]string, 0, len(rowCells))
|
||
for _, value := range rowCells {
|
||
value = strings.TrimSpace(value)
|
||
if value != "" {
|
||
cleaned = append(cleaned, value)
|
||
}
|
||
}
|
||
rowCells = nil
|
||
if len(cleaned) == 0 {
|
||
return
|
||
}
|
||
rowIndex++
|
||
blocks = append(blocks, textBlock{Title: "docx table", Content: strings.Join(cleaned, " | "), Line: rowIndex})
|
||
}
|
||
|
||
for {
|
||
token, err := decoder.Token()
|
||
if err != nil {
|
||
break
|
||
}
|
||
switch t := token.(type) {
|
||
case xml.StartElement:
|
||
switch t.Name.Local {
|
||
case "tbl":
|
||
tableDepth++
|
||
case "tr":
|
||
if tableDepth > 0 {
|
||
rowCells = nil
|
||
}
|
||
case "tc":
|
||
if tableDepth > 0 {
|
||
inCell = true
|
||
cell.Reset()
|
||
}
|
||
case "p":
|
||
if tableDepth == 0 {
|
||
inParagraph = true
|
||
paragraph.Reset()
|
||
}
|
||
case "t":
|
||
inText = true
|
||
case "tab":
|
||
if inCell {
|
||
cell.WriteString(" ")
|
||
} else if inParagraph {
|
||
paragraph.WriteString(" ")
|
||
}
|
||
case "br":
|
||
if inCell {
|
||
cell.WriteString("\n")
|
||
} else if inParagraph {
|
||
paragraph.WriteString("\n")
|
||
}
|
||
}
|
||
case xml.EndElement:
|
||
switch t.Name.Local {
|
||
case "t":
|
||
inText = false
|
||
case "p":
|
||
if tableDepth == 0 && inParagraph {
|
||
flushParagraph()
|
||
}
|
||
inParagraph = false
|
||
if inCell {
|
||
cell.WriteString(" ")
|
||
}
|
||
case "tc":
|
||
if inCell {
|
||
flushCell()
|
||
}
|
||
inCell = false
|
||
case "tr":
|
||
if tableDepth > 0 {
|
||
flushRow()
|
||
}
|
||
case "tbl":
|
||
if tableDepth > 0 {
|
||
tableDepth--
|
||
}
|
||
}
|
||
case xml.CharData:
|
||
if !inText {
|
||
continue
|
||
}
|
||
text := html.UnescapeString(string(t))
|
||
if inCell {
|
||
cell.WriteString(text)
|
||
} else if inParagraph {
|
||
paragraph.WriteString(text)
|
||
}
|
||
}
|
||
}
|
||
return blocks
|
||
}
|
||
|
||
// normalizeSheetRow returns a copy of row with every cell trimmed of
// surrounding whitespace. The input is not modified.
func normalizeSheetRow(row []string) []string {
	trimmed := make([]string, 0, len(row))
	for _, cell := range row {
		trimmed = append(trimmed, strings.TrimSpace(cell))
	}
	return trimmed
}
|
||
|
||
// nonEmptySheetValues returns the trimmed cells of row that are not blank,
// in their original order.
func nonEmptySheetValues(row []string) []string {
	kept := make([]string, 0, len(row))
	for _, cell := range row {
		if cell = strings.TrimSpace(cell); cell != "" {
			kept = append(kept, cell)
		}
	}
	return kept
}
|
||
|
||
func looksLikeSheetHeaderRow(row []string) bool {
|
||
matches := 0
|
||
for _, value := range row {
|
||
value = strings.TrimSpace(value)
|
||
if value == "" {
|
||
continue
|
||
}
|
||
if isSheetHeaderLabel(value) {
|
||
matches++
|
||
}
|
||
}
|
||
return matches >= 2
|
||
}
|
||
|
||
// isSheetHeaderLabel reports whether the trimmed value exactly equals one of
// the known spreadsheet column labels. Matching is exact, not by substring.
//
// A switch is used instead of a map literal so that no map is allocated and
// populated on every call.
func isSheetHeaderLabel(value string) bool {
	switch strings.TrimSpace(value) {
	case "星期", "时段", "部门", "会议主题", "会议时间", "会议日期",
		"日期", "时间", "主题", "名称", "类别", "类型",
		"项目", "标准", "要求", "负责人", "内容", "操作指引",
		"检查项目", "核对内容", "详细", "备注", "测试流程":
		return true
	}
	return false
}
|
||
|
||
func applySheetCarryForward(row []string, headers []string, carry []string) []string {
|
||
filled := append([]string(nil), row...)
|
||
if len(headers) == 0 {
|
||
return filled
|
||
}
|
||
if len(filled) < len(headers) {
|
||
expanded := make([]string, len(headers))
|
||
copy(expanded, filled)
|
||
filled = expanded
|
||
}
|
||
for i := range headers {
|
||
header := strings.TrimSpace(headers[i])
|
||
value := strings.TrimSpace(filled[i])
|
||
if value == "" && i < len(carry) && isCarryForwardSheetHeader(header) {
|
||
filled[i] = carry[i]
|
||
continue
|
||
}
|
||
if value != "" && i < len(carry) && isCarryForwardSheetHeader(header) {
|
||
carry[i] = value
|
||
}
|
||
}
|
||
return filled
|
||
}
|
||
|
||
// isCarryForwardSheetHeader reports whether the trimmed header contains one
// of the grouping terms whose column values are carried down to blank cells.
// Matching is by substring.
func isCarryForwardSheetHeader(header string) bool {
	header = strings.TrimSpace(header)
	if header == "" {
		return false
	}
	groupingTerms := [...]string{"星期", "日期", "部门", "类别", "类型", "项目"}
	for i := range groupingTerms {
		if strings.Contains(header, groupingTerms[i]) {
			return true
		}
	}
	return false
}
|
||
|
||
// structuredSheetRowContent renders one data row as newline-separated
// "label: value" lines, preceded by the sheet name and, when present, the
// section title. Blank cells are skipped; a column without a header label is
// labelled "列N" with N being its 1-based position.
func structuredSheetRowContent(sheetName string, sectionTitle string, headers []string, row []string) string {
	lines := make([]string, 0, len(row)+2)
	lines = append(lines, "工作表: "+sheetName)
	if section := strings.TrimSpace(sectionTitle); section != "" {
		lines = append(lines, "分类: "+section)
	}

	for col, cell := range row {
		cell = strings.TrimSpace(cell)
		if cell == "" {
			continue
		}
		var label string
		if col < len(headers) {
			label = strings.TrimSpace(headers[col])
		}
		if label == "" {
			label = fmt.Sprintf("列%d", col+1)
		}
		lines = append(lines, label+": "+cell)
	}
	return strings.Join(lines, "\n")
}
|
||
|
||
func parsePDFKnowledgeFile(path string) ([]textBlock, error) {
|
||
file, reader, err := pdf.Open(path)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
defer file.Close()
|
||
|
||
pageCount := reader.NumPage()
|
||
blocks := make([]textBlock, 0)
|
||
needsOCR := make([]int, 0)
|
||
for pageNum := 1; pageNum <= pageCount; pageNum++ {
|
||
page := reader.Page(pageNum)
|
||
text, pageErr := page.GetPlainText(nil)
|
||
if pageErr == nil && hasEnoughPDFText(text) {
|
||
blocks = append(blocks, textBlock{Title: fmt.Sprintf("pdf 第%d页", pageNum), Content: text, Page: pageNum})
|
||
continue
|
||
}
|
||
needsOCR = append(needsOCR, pageNum)
|
||
}
|
||
|
||
var warnings []string
|
||
if len(needsOCR) > 0 {
|
||
ocrBlocks, ocrWarnings := ocrPDFPages(path, needsOCR)
|
||
blocks = append(blocks, ocrBlocks...)
|
||
warnings = append(warnings, ocrWarnings...)
|
||
}
|
||
if len(blocks) == 0 {
|
||
if len(warnings) > 0 {
|
||
return nil, knowledgeParseWarning{Warnings: warnings}
|
||
}
|
||
return nil, fmt.Errorf("PDF未读取到可索引内容")
|
||
}
|
||
if len(warnings) > 0 {
|
||
return blocks, knowledgeParseWarning{Warnings: warnings}
|
||
}
|
||
return blocks, nil
|
||
}
|
||
|
||
func hasEnoughPDFText(text string) bool {
|
||
text = normalizeWhitespace(text)
|
||
if len([]rune(text)) >= 20 {
|
||
return true
|
||
}
|
||
tokens := tokenizeKnowledgeText(text)
|
||
return len(tokens) >= 5
|
||
}
|
||
|
||
func ocrPDFPages(path string, pageNumbers []int) ([]textBlock, []string) {
|
||
if len(pageNumbers) == 0 {
|
||
return nil, nil
|
||
}
|
||
var warnings []string
|
||
limitedPages := make([]int, 0, len(pageNumbers))
|
||
for _, pageNum := range pageNumbers {
|
||
if pageNum > maxPDFOCRPages {
|
||
continue
|
||
}
|
||
limitedPages = append(limitedPages, pageNum)
|
||
}
|
||
if len(limitedPages) < len(pageNumbers) {
|
||
warnings = append(warnings, fmt.Sprintf("PDF超过%d页,后续页面未做视觉识别", maxPDFOCRPages))
|
||
}
|
||
pageNumbers = limitedPages
|
||
if len(pageNumbers) == 0 {
|
||
return nil, warnings
|
||
}
|
||
renderer, err := pdfFindRenderer()
|
||
if err != nil {
|
||
return nil, []string{"PDF扫描页需要pdftoppm渲染,但未找到可用工具: " + err.Error()}
|
||
}
|
||
tmpDir, err := os.MkdirTemp("", "qiwei_pdf_ocr_*")
|
||
if err != nil {
|
||
return nil, []string{"PDF OCR临时目录创建失败: " + err.Error()}
|
||
}
|
||
defer os.RemoveAll(tmpDir)
|
||
|
||
blocks := make([]textBlock, 0, len(pageNumbers))
|
||
for _, pageNum := range pageNumbers {
|
||
imagePath, err := renderPDFPageFunc(renderer, path, pageNum, tmpDir)
|
||
if err != nil {
|
||
warnings = append(warnings, fmt.Sprintf("PDF第%d页渲染失败: %v", pageNum, err))
|
||
continue
|
||
}
|
||
ocr := pdfOCRPageImage
|
||
if ocr == nil {
|
||
ocr = ocrPDFPageImage
|
||
}
|
||
text, err := ocr(imagePath, pageNum)
|
||
if err != nil {
|
||
warnings = append(warnings, fmt.Sprintf("PDF第%d页视觉识别失败: %v", pageNum, err))
|
||
continue
|
||
}
|
||
text = normalizeWhitespace(text)
|
||
if text == "" {
|
||
warnings = append(warnings, fmt.Sprintf("PDF第%d页视觉识别为空", pageNum))
|
||
continue
|
||
}
|
||
blocks = append(blocks, textBlock{Title: fmt.Sprintf("pdf 第%d页", pageNum), Content: text, Page: pageNum})
|
||
}
|
||
return blocks, warnings
|
||
}
|
||
|
||
func findPDFRenderer() (string, error) {
|
||
candidates := []string{
|
||
resolveAutoReplyPath(filepath.Join("tools", "pdf", "pdftoppm.exe")),
|
||
resolveAutoReplyPath(filepath.Join("tools", "pdf", "pdftoppm")),
|
||
}
|
||
for _, candidate := range candidates {
|
||
if info, err := os.Stat(candidate); err == nil && !info.IsDir() {
|
||
return candidate, nil
|
||
}
|
||
}
|
||
if path, err := exec.LookPath("pdftoppm.exe"); err == nil {
|
||
return path, nil
|
||
}
|
||
if path, err := exec.LookPath("pdftoppm"); err == nil {
|
||
return path, nil
|
||
}
|
||
return "", fmt.Errorf("pdftoppm.exe not found")
|
||
}
|
||
|
||
func renderPDFPage(renderer string, pdfPath string, pageNum int, tmpDir string) (string, error) {
|
||
prefix := filepath.Join(tmpDir, fmt.Sprintf("page_%d", pageNum))
|
||
ctxArgs := []string{"-f", fmt.Sprintf("%d", pageNum), "-l", fmt.Sprintf("%d", pageNum), "-png", "-r", "160", pdfPath, prefix}
|
||
cmd := exec.Command(renderer, ctxArgs...)
|
||
output, err := cmd.CombinedOutput()
|
||
if err != nil {
|
||
return "", fmt.Errorf("%v: %s", err, truncateText(string(output), 200))
|
||
}
|
||
matches, err := filepath.Glob(prefix + "-*.png")
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
if len(matches) == 0 {
|
||
return "", fmt.Errorf("pdftoppm未生成图片")
|
||
}
|
||
return matches[0], nil
|
||
}
|
||
|
||
func ocrPDFPageImage(imagePath string, pageNum int) (string, error) {
|
||
cfg := getAutoReplyEngine().getConfig()
|
||
dataURL, err := imageDataURLFromFile(imagePath)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
systemPrompt := "你是一个严谨的PDF页面OCR识别器,只提取图片中真实可见的文字,保留标题、表格行列关系和关键数值,不要补充不存在的内容。"
|
||
userPrompt := fmt.Sprintf("请完整识别这张PDF第%d页中的全部可见文字。若有表格,请用每行一段的方式输出。", pageNum)
|
||
result, err := callOpenAICompatibleVisionChat(cfg.AI, systemPrompt, userPrompt, dataURL)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
return strings.TrimSpace(result.Answer), nil
|
||
}
|
||
|
||
func splitTextToBlocks(text string, title string, page int) []textBlock {
|
||
text = normalizeWhitespace(text)
|
||
if text == "" {
|
||
return nil
|
||
}
|
||
paragraphs := regexp.MustCompile(`\n{2,}`).Split(text, -1)
|
||
blocks := make([]textBlock, 0, len(paragraphs))
|
||
for i, p := range paragraphs {
|
||
p = strings.TrimSpace(p)
|
||
if p != "" {
|
||
blocks = append(blocks, textBlock{Title: title, Content: p, Line: i + 1, Page: page})
|
||
}
|
||
}
|
||
if len(blocks) == 0 {
|
||
blocks = append(blocks, textBlock{Title: title, Content: text, Page: page})
|
||
}
|
||
return blocks
|
||
}
|
||
|
||
// readZipFile returns the full decompressed content of one archive entry.
func readZipFile(file *zip.File) ([]byte, error) {
	entry, openErr := file.Open()
	if openErr != nil {
		return nil, openErr
	}
	defer entry.Close()

	content, readErr := io.ReadAll(entry)
	return content, readErr
}
|
||
|
||
// extractXMLText returns all non-blank character data found in an XML
// document, trimmed, HTML-unescaped and joined with newlines. Parsing stops
// silently at the first decoder error (including end of input).
func extractXMLText(data []byte) string {
	var pieces []string
	decoder := xml.NewDecoder(bytes.NewReader(data))
	for token, err := decoder.Token(); err == nil; token, err = decoder.Token() {
		chars, isText := token.(xml.CharData)
		if !isText {
			continue
		}
		if text := strings.TrimSpace(string(chars)); text != "" {
			pieces = append(pieces, html.UnescapeString(text))
		}
	}
	return strings.Join(pieces, "\n")
}
|
||
|
||
// extractSharedStrings returns one string per <si> element of the given
// XML, built by concatenating all character data inside that element.
// Text outside <si> elements is ignored. Parsing stops silently at the first
// decoder error (including end of input).
func extractSharedStrings(data []byte) []string {
	result := make([]string, 0)
	var fragments []string
	open := false

	decoder := xml.NewDecoder(bytes.NewReader(data))
	for token, err := decoder.Token(); err == nil; token, err = decoder.Token() {
		switch t := token.(type) {
		case xml.StartElement:
			if t.Name.Local == "si" {
				open, fragments = true, nil
			}
		case xml.EndElement:
			if open && t.Name.Local == "si" {
				result = append(result, strings.Join(fragments, ""))
				open = false
			}
		case xml.CharData:
			if open {
				fragments = append(fragments, string(t))
			}
		}
	}
	return result
}
|
||
|
||
func extractSheetRows(sheetName string, data []byte, sharedStrings []string) []textBlock {
|
||
decoder := xml.NewDecoder(bytes.NewReader(data))
|
||
blocks := make([]textBlock, 0)
|
||
var row []string
|
||
var cellType string
|
||
var cellValue string
|
||
inRow := false
|
||
inV := false
|
||
rowNum := 0
|
||
for {
|
||
token, err := decoder.Token()
|
||
if err != nil {
|
||
break
|
||
}
|
||
switch t := token.(type) {
|
||
case xml.StartElement:
|
||
switch t.Name.Local {
|
||
case "row":
|
||
inRow = true
|
||
row = nil
|
||
rowNum++
|
||
case "c":
|
||
cellType = ""
|
||
cellValue = ""
|
||
for _, attr := range t.Attr {
|
||
if attr.Name.Local == "t" {
|
||
cellType = attr.Value
|
||
}
|
||
}
|
||
case "v", "t":
|
||
if inRow {
|
||
inV = true
|
||
}
|
||
}
|
||
case xml.EndElement:
|
||
switch t.Name.Local {
|
||
case "v", "t":
|
||
inV = false
|
||
case "c":
|
||
value := strings.TrimSpace(cellValue)
|
||
if cellType == "s" {
|
||
if idx, err := strconvAtoiSafe(value); err == nil && idx >= 0 && idx < len(sharedStrings) {
|
||
value = sharedStrings[idx]
|
||
}
|
||
}
|
||
if value != "" {
|
||
row = append(row, value)
|
||
}
|
||
case "row":
|
||
inRow = false
|
||
if len(row) > 0 {
|
||
blocks = append(blocks, textBlock{Title: filepath.Base(sheetName), Content: strings.Join(row, " | "), Line: rowNum})
|
||
}
|
||
}
|
||
case xml.CharData:
|
||
if inV {
|
||
cellValue += string(t)
|
||
}
|
||
}
|
||
}
|
||
return blocks
|
||
}
|
||
|
||
func extractPDFLikeText(data []byte) string {
|
||
raw := string(data)
|
||
re := regexp.MustCompile(`\(([^()]*)\)`)
|
||
matches := re.FindAllStringSubmatch(raw, -1)
|
||
parts := make([]string, 0, len(matches))
|
||
for _, match := range matches {
|
||
if len(match) > 1 {
|
||
text := strings.ReplaceAll(match[1], `\(`, "(")
|
||
text = strings.ReplaceAll(text, `\)`, ")")
|
||
text = strings.ReplaceAll(text, `\n`, "\n")
|
||
text = strings.TrimSpace(text)
|
||
if text != "" && printableRatio(text) > 0.6 {
|
||
parts = append(parts, text)
|
||
}
|
||
}
|
||
}
|
||
if len(parts) == 0 {
|
||
parts = append(parts, strings.Map(func(r rune) rune {
|
||
if r == '\n' || r == '\r' || r == '\t' || unicode.IsPrint(r) {
|
||
return r
|
||
}
|
||
return ' '
|
||
}, raw))
|
||
}
|
||
return strings.Join(parts, "\n")
|
||
}
|
||
|
||
func tokenizeKnowledgeText(text string) map[string]int {
|
||
text = strings.ToLower(text)
|
||
tokens := make(map[string]int)
|
||
var current []rune
|
||
flush := func() {
|
||
if len(current) > 0 {
|
||
token := string(current)
|
||
if len([]rune(token)) > 1 {
|
||
tokens[token]++
|
||
}
|
||
current = nil
|
||
}
|
||
}
|
||
var chineseRunes []rune
|
||
for _, r := range text {
|
||
if unicode.Is(unicode.Han, r) {
|
||
flush()
|
||
chineseRunes = append(chineseRunes, r)
|
||
continue
|
||
}
|
||
if len(chineseRunes) > 0 {
|
||
addChineseTokens(tokens, chineseRunes)
|
||
chineseRunes = nil
|
||
}
|
||
if unicode.IsLetter(r) || unicode.IsDigit(r) {
|
||
current = append(current, r)
|
||
} else {
|
||
flush()
|
||
}
|
||
}
|
||
flush()
|
||
if len(chineseRunes) > 0 {
|
||
addChineseTokens(tokens, chineseRunes)
|
||
}
|
||
return tokens
|
||
}
|
||
|
||
// addChineseTokens adds the tokens of one run of Han characters to tokens:
// every single character counts +1 and every pair of adjacent characters
// (bigram) counts +2.
func addChineseTokens(tokens map[string]int, chars []rune) {
	last := len(chars) - 1
	for i, r := range chars {
		tokens[string(r)]++
		if i < last {
			tokens[string(chars[i:i+2])] += 2
		}
	}
}
|
||
|
||
// hashKnowledgeChunk returns the hex-encoded SHA-1 digest of
// "<source>:<idx>:<content>".
func hashKnowledgeChunk(source string, content string, idx int) string {
	digest := sha1.New()
	fmt.Fprintf(digest, "%s:%d:%s", source, idx, content)
	return hex.EncodeToString(digest.Sum(nil))
}
|
||
|
||
// stripBOM removes a single leading byte-order mark (U+FEFF) from text, if
// present.
func stripBOM(text string) string {
	const bom = "\ufeff"
	if strings.HasPrefix(text, bom) {
		return text[len(bom):]
	}
	return text
}
|
||
|
||
// knowledgeInlineSpaceRe matches a run of spaces and/or tabs. Compiled once
// at package init; it used to be recompiled for every line processed.
var knowledgeInlineSpaceRe = regexp.MustCompile(`[ \t]+`)

// normalizeWhitespace canonicalizes whitespace in text: line endings become
// "\n", runs of spaces/tabs collapse to one space, every line is trimmed,
// consecutive blank lines collapse to a single blank line, and leading and
// trailing blank lines are removed.
func normalizeWhitespace(text string) string {
	text = strings.ReplaceAll(text, "\r\n", "\n")
	text = strings.ReplaceAll(text, "\r", "\n")

	lines := strings.Split(text, "\n")
	cleaned := make([]string, 0, len(lines))
	for _, line := range lines {
		line = strings.TrimSpace(knowledgeInlineSpaceRe.ReplaceAllString(line, " "))
		switch {
		case line != "":
			cleaned = append(cleaned, line)
		case len(cleaned) > 0 && cleaned[len(cleaned)-1] != "":
			// Keep at most one blank line as a paragraph separator.
			cleaned = append(cleaned, "")
		}
	}
	return strings.TrimSpace(strings.Join(cleaned, "\n"))
}
|
||
|
||
// printableRatio returns the fraction of runes in text that are printable
// or whitespace, in the range [0, 1]. Empty text yields 0.
func printableRatio(text string) float64 {
	var total, visible int
	for _, r := range text {
		total++
		if unicode.IsPrint(r) || unicode.IsSpace(r) {
			visible++
		}
	}
	if total == 0 {
		return 0
	}
	return float64(visible) / float64(total)
}
|
||
|
||
// strconvAtoiSafe parses a non-negative decimal integer made only of ASCII
// digits; surrounding whitespace is ignored.
//
// It returns an error for empty input, for any non-digit character (signs
// included), and for values that do not fit in an int. Without the range
// check a long digit string would wrap around silently and could yield an
// arbitrary, seemingly valid number.
func strconvAtoiSafe(value string) (int, error) {
	const maxInt = int(^uint(0) >> 1)

	value = strings.TrimSpace(value)
	if value == "" {
		return 0, fmt.Errorf("empty")
	}

	n := 0
	for _, r := range value {
		if r < '0' || r > '9' {
			return 0, fmt.Errorf("invalid int")
		}
		digit := int(r - '0')
		// n*10+digit would exceed maxInt.
		if n > (maxInt-digit)/10 {
			return 0, fmt.Errorf("int out of range")
		}
		n = n*10 + digit
	}
	return n, nil
}
|
||
|
||
// resolveAutoReplyPath turns a configured path into a usable one.
//
// Absolute paths are returned unchanged. Relative paths are resolved against
// the directory of the running executable; if that cannot be determined,
// against the current working directory; and if that fails too, the value is
// returned as given.
func resolveAutoReplyPath(pathValue string) string {
	if filepath.IsAbs(pathValue) {
		return pathValue
	}
	if exePath, err := os.Executable(); err == nil {
		return filepath.Join(filepath.Dir(exePath), pathValue)
	}
	if wd, err := os.Getwd(); err == nil {
		return filepath.Join(wd, pathValue)
	}
	return pathValue
}
|