Files
qiweimanager-master/helper/auto_reply_knowledge.go

1264 lines
31 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package main
import (
"archive/zip"
"bytes"
"crypto/sha1"
"encoding/csv"
"encoding/hex"
"encoding/json"
"encoding/xml"
"fmt"
"html"
"io"
"math"
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"time"
"unicode"
"github.com/ledongthuc/pdf"
"github.com/xuri/excelize/v2"
)
// maxKnowledgeChunkContentRunes is the rune limit handed to
// splitLongKnowledgeContent when a parsed block is turned into chunks.
const maxKnowledgeChunkContentRunes = 6000

// maxPDFOCRPages is the highest page number ocrPDFPages will render and send
// through visual recognition; pages beyond it are skipped with a warning.
const maxPDFOCRPages = 20

// Replaceable hooks used by the PDF OCR path.
// NOTE(review): presumably these exist so tests can stub the external tools — confirm.
var (
	// pdfFindRenderer locates the pdftoppm executable.
	pdfFindRenderer = findPDFRenderer
	// pdfOCRPageImage, when non-nil, is used by ocrPDFPages instead of ocrPDFPageImage.
	pdfOCRPageImage func(string, int) (string, error)
	// renderPDFPageFunc renders a single PDF page to an image file.
	renderPDFPageFunc = renderPDFPage
)
// KnowledgeChunk is one indexed piece of a knowledge file as it is stored in
// the JSON knowledge index.
type KnowledgeChunk struct {
	// ID is set to the same value as Hash by parseKnowledgeFile.
	ID string `json:"id"`
	// Source is the slash-separated path of the file relative to the knowledge root.
	Source string `json:"source"`
	// Title is the block title; split blocks get a " i/n" suffix.
	Title string `json:"title"`
	// Content is the chunk text.
	Content string `json:"content"`
	// Line and Page are copied from the textBlock the chunk came from.
	Line int `json:"line"`
	Page int `json:"page"`
	// SectionID is a hash of source, block title and block index, shared by all
	// parts of one block.
	SectionID string `json:"sectionId,omitempty"`
	// SectionIndex is the index of the originating block within its file.
	SectionIndex int `json:"sectionIndex,omitempty"`
	// PartIndex is the index of this chunk within its (possibly split) block.
	PartIndex int `json:"partIndex,omitempty"`
	// UpdatedAt is the file modification time in Unix seconds (or the parse
	// time when the file could not be stat'ed).
	UpdatedAt int64 `json:"updatedAt"`
	// Hash is the SHA-1 hex digest produced by hashKnowledgeChunk.
	Hash string `json:"hash"`
	// Score is not populated by the indexing code in this file.
	// NOTE(review): presumably filled in by retrieval — confirm.
	Score float64 `json:"score,omitempty"`
}
// KnowledgeIndex is the persisted result of one knowledge-directory scan.
type KnowledgeIndex struct {
	// Chunks holds every chunk from every successfully indexed file.
	Chunks []KnowledgeChunk `json:"chunks"`
	// FileCount is the number of files that contributed at least one chunk.
	FileCount int `json:"fileCount"`
	// FailedFiles holds "path: reason" entries for parse errors, parse
	// warnings and files without indexable content.
	FailedFiles []string `json:"failedFiles"`
	// LastIndexedAt is the Unix time (seconds) at which the scan finished.
	LastIndexedAt int64 `json:"lastIndexedAt"`
}
// knowledgeParseWarning is an error value that carries non-fatal problems
// encountered while parsing a knowledge file. Parsers may return it together
// with usable blocks.
type knowledgeParseWarning struct {
	Warnings []string
}

// Error returns all warnings as a single message. The warnings are separated
// by "; " so that adjacent entries do not run into each other.
func (e knowledgeParseWarning) Error() string {
	return strings.Join(e.Warnings, "; ")
}
// NewKnowledgeIndex returns an empty index whose slices are non-nil, so they
// are encoded as JSON arrays rather than null.
func NewKnowledgeIndex() *KnowledgeIndex {
	idx := new(KnowledgeIndex)
	idx.Chunks = []KnowledgeChunk{}
	idx.FailedFiles = []string{}
	return idx
}
// loadKnowledgeIndex reads the persisted knowledge index from the configured
// index path and installs it on the engine.
//
// A missing index file is not an error: an empty index is installed instead.
// Read and decode failures are returned wrapped with the index path; the
// underlying error stays reachable through errors.Is / errors.As.
func (e *AutoReplyEngine) loadKnowledgeIndex() error {
	indexPath := resolveAutoReplyPath(e.getConfig().Knowledge.IndexPath)
	data, err := os.ReadFile(indexPath)
	if os.IsNotExist(err) {
		e.updateKnowledgeStatus(NewKnowledgeIndex())
		return nil
	}
	if err != nil {
		return fmt.Errorf("read knowledge index %q: %w", indexPath, err)
	}
	var idx KnowledgeIndex
	if err := json.Unmarshal(data, &idx); err != nil {
		return fmt.Errorf("decode knowledge index %q: %w", indexPath, err)
	}
	// Publish through the shared helper so the index pointer and the status
	// counters are always updated the same way.
	e.updateKnowledgeStatus(&idx)
	return nil
}
// rebuildKnowledgeIndex re-scans the configured knowledge directory, parses
// every eligible top-level file into chunks, writes the resulting index to the
// configured index path as indented JSON, publishes it through
// updateKnowledgeStatus and finally rebuilds the embedding index.
//
// Files that fail to parse or yield no chunks are recorded in FailedFiles
// instead of aborting the rebuild. An error is returned only for directory,
// encoding or write failures; a failure of rebuildEmbeddingIndex is stored via
// setLastErrorWithScope and does not fail the call.
func (e *AutoReplyEngine) rebuildKnowledgeIndex() (*KnowledgeIndex, error) {
	cfg := e.getConfig()
	root := resolveAutoReplyPath(cfg.Knowledge.Directory)
	idx := NewKnowledgeIndex()
	// Set of lower-cased extensions that may be indexed.
	allowed := make(map[string]bool)
	for _, ext := range cfg.Knowledge.SupportedExtensions {
		allowed[strings.ToLower(ext)] = true
	}
	// Fall back to the plain-text formats when nothing is configured.
	if len(allowed) == 0 {
		allowed[".md"] = true
		allowed[".txt"] = true
		allowed[".csv"] = true
	}
	// Create the knowledge directory if needed so ReadDir has something to read.
	if err := os.MkdirAll(root, 0755); err != nil {
		return nil, err
	}
	entries, err := os.ReadDir(root)
	if err != nil {
		return nil, err
	}
	for _, entry := range entries {
		// Only files directly inside root are indexed; directories are skipped.
		if entry.IsDir() {
			continue
		}
		ext := strings.ToLower(filepath.Ext(entry.Name()))
		if !isRootKnowledgeFile(entry.Name(), ext, allowed, cfg.Knowledge.IndexPath, cfg.Retrieval.EmbeddingIndexPath) {
			continue
		}
		path := filepath.Join(root, entry.Name())
		chunks, err := parseKnowledgeFile(path, root)
		if err != nil {
			var warning knowledgeParseWarning
			if ok := errorAs(err, &warning); ok {
				// Warnings are recorded one by one and processing falls through
				// to the chunk check below with whatever chunks were returned.
				for _, item := range warning.Warnings {
					idx.FailedFiles = append(idx.FailedFiles, fmt.Sprintf("%s: %s", path, item))
				}
			} else {
				// Any other error means the file is skipped entirely.
				idx.FailedFiles = append(idx.FailedFiles, fmt.Sprintf("%s: %v", path, err))
				continue
			}
		}
		if len(chunks) == 0 {
			idx.FailedFiles = append(idx.FailedFiles, fmt.Sprintf("%s: 未读取到可索引内容", path))
			continue
		}
		idx.FileCount++
		idx.Chunks = append(idx.Chunks, chunks...)
	}
	idx.LastIndexedAt = time.Now().Unix()
	indexPath := resolveAutoReplyPath(cfg.Knowledge.IndexPath)
	if err := os.MkdirAll(filepath.Dir(indexPath), 0755); err != nil {
		return nil, err
	}
	data, err := json.MarshalIndent(idx, "", " ")
	if err != nil {
		return nil, err
	}
	if err := os.WriteFile(indexPath, data, 0644); err != nil {
		return nil, err
	}
	e.updateKnowledgeStatus(idx)
	// The embedding index is rebuilt after the keyword index has been written
	// and published; its failure is only recorded, not returned.
	if err := e.rebuildEmbeddingIndex(idx); err != nil {
		e.setLastErrorWithScope(autoReplyErrorScopeKnowledge, err.Error())
	}
	return idx, nil
}
// isRootKnowledgeFile reports whether a directory entry should be indexed.
//
// An entry qualifies when its trimmed name is non-empty and not ".keep", its
// lower-cased extension is in allowed, and its name does not match
// (case-insensitively) the base name of either index file path.
func isRootKnowledgeFile(name string, ext string, allowed map[string]bool, knowledgeIndexPath string, embeddingIndexPath string) bool {
	candidate := strings.TrimSpace(name)
	switch {
	case candidate == "", candidate == ".keep":
		return false
	case !allowed[strings.ToLower(ext)]:
		return false
	}
	// Never index the index files themselves.
	for _, reserved := range []string{knowledgeIndexPath, embeddingIndexPath} {
		if strings.EqualFold(candidate, filepath.Base(reserved)) {
			return false
		}
	}
	return true
}
// errorAs reports whether err is, or wraps, a knowledgeParseWarning and, if
// so, copies it into target.
//
// target must be a non-nil *knowledgeParseWarning; any other target yields
// false. Both value and pointer forms of the warning are recognised, a typed
// nil *knowledgeParseWarning is treated as "no match" instead of being
// dereferenced, and errors exposing Unwrap() error are followed down the chain.
func errorAs(err error, target interface{}) bool {
	out, ok := target.(*knowledgeParseWarning)
	if !ok || out == nil {
		return false
	}
	for err != nil {
		switch value := err.(type) {
		case knowledgeParseWarning:
			*out = value
			return true
		case *knowledgeParseWarning:
			if value != nil {
				*out = *value
				return true
			}
		}
		wrapper, canUnwrap := err.(interface{ Unwrap() error })
		if !canUnwrap {
			return false
		}
		err = wrapper.Unwrap()
	}
	return false
}
// updateKnowledgeStatus installs idx as the engine's current index and mirrors
// its counters into the engine status, all under e.mu. The failed-file list is
// copied so the status does not share a backing array with the index.
func (e *AutoReplyEngine) updateKnowledgeStatus(idx *KnowledgeIndex) {
	e.mu.Lock()
	defer e.mu.Unlock()

	e.index = idx
	e.status.KnowledgeFileCount = idx.FileCount
	e.status.KnowledgeChunkCount = len(idx.Chunks)
	e.status.KnowledgeLastIndexedAt = idx.LastIndexedAt
	e.status.KnowledgeFailedFiles = append([]string(nil), idx.FailedFiles...)
}
// scoreKnowledgeChunk rates how well a chunk matches the query tokens.
//
// The chunk title and content are tokenized together. The score is
// 0.75*coverage + 0.25*density, where coverage is the fraction of distinct
// query tokens present in the chunk and density is the capped match weight
// divided by sqrt(distinct chunk tokens + 1). It is 0 when the chunk has no
// tokens or nothing matches.
func scoreKnowledgeChunk(queryTokens map[string]int, chunk KnowledgeChunk) float64 {
	chunkTokens := tokenizeKnowledgeText(chunk.Title + " " + chunk.Content)
	if len(chunkTokens) == 0 {
		return 0
	}

	hits := 0
	overlap := 0.0
	for term, wanted := range queryTokens {
		have, found := chunkTokens[term]
		if !found {
			continue
		}
		hits++
		// Cap each token's contribution at (query count + 1) so one heavily
		// repeated term cannot dominate.
		overlap += math.Min(float64(have), float64(wanted)+1)
	}
	if hits == 0 {
		return 0
	}

	coverage := float64(hits) / float64(len(queryTokens))
	density := overlap / math.Sqrt(float64(len(chunkTokens))+1)
	return coverage*0.75 + density*0.25
}
// parseKnowledgeFile parses one knowledge file into index chunks.
//
// The parser is chosen by the lower-cased file extension (.md/.txt, .csv,
// .docx, .xlsx, .pdf); any other extension is an error. Each non-empty,
// non-low-value block is split to at most maxKnowledgeChunkContentRunes runes
// per chunk; split blocks get an " i/n" title suffix.
//
// Returns:
//   - (chunks, nil) on success;
//   - (chunks, knowledgeParseWarning) when the parser produced usable blocks
//     together with non-fatal warnings (e.g. some PDF pages failed OCR), so
//     the partial content is still indexed;
//   - (nil, err) for every other failure, including a warning with no blocks.
func parseKnowledgeFile(path string, root string) ([]KnowledgeChunk, error) {
	ext := strings.ToLower(filepath.Ext(path))
	var blocks []textBlock
	var err error
	switch ext {
	case ".md", ".txt":
		blocks, err = parsePlainKnowledgeFile(path)
	case ".csv":
		blocks, err = parseCSVKnowledgeFile(path)
	case ".docx":
		blocks, err = parseDocxKnowledgeFile(path)
	case ".xlsx":
		blocks, err = parseXlsxKnowledgeFile(path)
	case ".pdf":
		blocks, err = parsePDFKnowledgeFile(path)
	default:
		err = fmt.Errorf("unsupported extension: %s", ext)
	}

	// A warning that arrives with blocks is non-fatal: keep the blocks and
	// hand the warning back next to the chunks. Everything else aborts.
	var warnErr error
	if err != nil {
		var warning knowledgeParseWarning
		if len(blocks) == 0 || !errorAs(err, &warning) {
			return nil, err
		}
		warnErr = warning
	}

	rel, relErr := filepath.Rel(root, path)
	if relErr != nil {
		rel = filepath.Base(path)
	}
	// Fall back to "now" when the file cannot be stat'ed.
	updatedAt := time.Now().Unix()
	if info, _ := os.Stat(path); info != nil {
		updatedAt = info.ModTime().Unix()
	}

	chunks := make([]KnowledgeChunk, 0, len(blocks))
	for i, block := range blocks {
		content := strings.TrimSpace(block.Content)
		if content == "" || isLowValueKnowledgeBlock(block.Title, content) {
			continue
		}
		parts := splitLongKnowledgeContent(content, maxKnowledgeChunkContentRunes)
		trimmedTitle := strings.TrimSpace(block.Title)
		// The section ID is shared by all parts of this block.
		sectionID := hashKnowledgeChunk(rel, trimmedTitle, i)
		for partIndex, part := range parts {
			title := block.Title
			if len(parts) > 1 {
				title = fmt.Sprintf("%s %d/%d", trimmedTitle, partIndex+1, len(parts))
			}
			hash := hashKnowledgeChunk(rel, part, i*1000+partIndex)
			chunks = append(chunks, KnowledgeChunk{
				ID:           hash,
				Source:       filepath.ToSlash(rel),
				Title:        title,
				Content:      part,
				Line:         block.Line,
				Page:         block.Page,
				SectionID:    sectionID,
				SectionIndex: i,
				PartIndex:    partIndex,
				UpdatedAt:    updatedAt,
				Hash:         hash,
			})
		}
	}
	return chunks, warnErr
}
// textBlock is the format-independent unit produced by the file parsers and
// later turned into one or more KnowledgeChunk values.
type textBlock struct {
	// Title is a heading, sheet name or synthetic label for the block.
	Title string
	// Content is the block text.
	Content string
	// Line is a 1-based position set by the parser (source line, row number or
	// running paragraph/row counter, depending on the format); 0 when unset.
	Line int
	// Page is the 1-based PDF page number; 0 for formats without pages.
	Page int
}
// splitLongKnowledgeContent splits content into chunks of at most maxRunes
// runes each.
//
// Content is trimmed first; empty content yields nil. When maxRunes <= 0 or
// the content already fits, it is returned as a single chunk. Otherwise whole
// lines are packed greedily, joined by "\n"; blank lines are dropped. A single
// line longer than maxRunes is hard-split at rune boundaries. Whitespace-only
// slices produced by such a hard split are discarded, so no returned chunk is
// ever empty.
func splitLongKnowledgeContent(content string, maxRunes int) []string {
	content = strings.TrimSpace(content)
	if content == "" {
		return nil
	}
	if maxRunes <= 0 || len([]rune(content)) <= maxRunes {
		return []string{content}
	}

	var chunks []string
	var current strings.Builder
	currentRunes := 0

	// flush emits the lines accumulated so far as one chunk.
	flush := func() {
		if text := strings.TrimSpace(current.String()); text != "" {
			chunks = append(chunks, text)
		}
		current.Reset()
		currentRunes = 0
	}

	// appendPiece adds one source line to the chunk being built.
	appendPiece := func(piece string) {
		pieceRunes := []rune(strings.TrimSpace(piece))
		for len(pieceRunes) > maxRunes {
			flush()
			if head := strings.TrimSpace(string(pieceRunes[:maxRunes])); head != "" {
				chunks = append(chunks, head)
			}
			pieceRunes = pieceRunes[maxRunes:]
			// Drop whitespace at the cut so the next slice starts with real
			// text and cannot come out empty.
			for len(pieceRunes) > 0 && unicode.IsSpace(pieceRunes[0]) {
				pieceRunes = pieceRunes[1:]
			}
		}
		if len(pieceRunes) == 0 {
			return
		}
		separatorRunes := 0
		if currentRunes > 0 {
			separatorRunes = 1
		}
		if currentRunes+separatorRunes+len(pieceRunes) > maxRunes {
			flush()
		}
		if currentRunes > 0 {
			current.WriteString("\n")
			currentRunes++
		}
		current.WriteString(string(pieceRunes))
		currentRunes += len(pieceRunes)
	}

	for _, line := range strings.Split(content, "\n") {
		appendPiece(line)
	}
	flush()
	if len(chunks) == 0 {
		return []string{content}
	}
	return chunks
}
// parsePlainKnowledgeFile reads a Markdown or plain-text file and splits it
// into paragraph blocks.
//
// A leading BOM is removed. A front-matter section (a first line of "---" up
// to the next "---" line) and fenced "```dataview" sections are skipped. Lines
// starting with "#" end the current block and set the title for the blocks
// that follow. Blank lines end the current block. Blocks that are empty or
// classified as low-value by isLowValueKnowledgeBlock are dropped. Line on
// each block is the 1-based line number of its first content line.
func parsePlainKnowledgeFile(path string) ([]textBlock, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	text := stripBOM(string(data))
	lines := strings.Split(text, "\n")
	blocks := make([]textBlock, 0)
	var currentTitle string
	var current []string
	startLine := 1
	inFrontMatter := false
	inDataviewBlock := false
	// flush turns the accumulated lines into a block (if worth keeping) and
	// starts a new, empty paragraph.
	flush := func() {
		content := strings.TrimSpace(strings.Join(current, "\n"))
		if content != "" && !isLowValueKnowledgeBlock(currentTitle, content) {
			blocks = append(blocks, textBlock{Title: currentTitle, Content: content, Line: startLine})
		}
		current = nil
	}
	for i, line := range lines {
		// TrimSpace also removes a trailing "\r" from CRLF files.
		trimmed := strings.TrimSpace(line)
		// Front matter is only recognised when it opens on the very first line.
		if i == 0 && trimmed == "---" {
			inFrontMatter = true
			continue
		}
		if inFrontMatter {
			// Everything up to and including the closing "---" is skipped.
			if trimmed == "---" {
				inFrontMatter = false
				startLine = i + 2
			}
			continue
		}
		if inDataviewBlock {
			// Any line starting with a fence closes the dataview section.
			if strings.HasPrefix(trimmed, "```") {
				inDataviewBlock = false
				startLine = i + 2
			}
			continue
		}
		if strings.EqualFold(trimmed, "```dataview") {
			flush()
			inDataviewBlock = true
			continue
		}
		// Heading: close the running block and remember the new title with
		// its leading '#' characters removed.
		if strings.HasPrefix(trimmed, "#") {
			flush()
			currentTitle = strings.TrimSpace(strings.TrimLeft(trimmed, "#"))
			startLine = i + 1
			continue
		}
		// Blank line: paragraph boundary.
		if trimmed == "" {
			flush()
			startLine = i + 2
			continue
		}
		if len(current) == 0 {
			startLine = i + 1
		}
		current = append(current, trimmed)
	}
	flush()
	return blocks, nil
}
// isLowValueKnowledgeBlock reports whether a block carries nothing worth
// indexing. That is the case for blank content, content made only of dashes
// and whitespace, dataview queries ("```dataview" prefix or a "list from [["
// clause), an embedded "---" front-matter section, and a "反向链接" section
// that mentions dataview.
func isLowValueKnowledgeBlock(title string, content string) bool {
	body := strings.TrimSpace(content)
	if body == "" {
		return true
	}

	lowered := strings.ToLower(body)
	hasDataview := strings.Contains(lowered, "dataview")
	hasListQuery := strings.Contains(lowered, "list from [[")

	switch {
	case body == "---", strings.Trim(body, "- \t\r\n") == "":
		// Separator-only content.
		return true
	case strings.HasPrefix(lowered, "```dataview"), hasListQuery:
		return true
	case strings.HasPrefix(body, "---\n") && strings.Contains(body, "\n---"):
		// Front-matter style section.
		return true
	}
	return strings.TrimSpace(title) == "反向链接" && (hasDataview || hasListQuery)
}
// parseCSVKnowledgeFile turns every data record of a CSV file into one block.
//
// The first record is the header. Each non-empty cell becomes a
// "header: value" line, or just "value" when the column has no header text.
// Records with no non-empty cell are skipped. Title and Line use the record's
// position counted from 1 with the header as record 1. Records may have
// differing field counts.
func parseCSVKnowledgeFile(path string) ([]textBlock, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}

	csvReader := csv.NewReader(strings.NewReader(stripBOM(string(raw))))
	csvReader.FieldsPerRecord = -1 // tolerate ragged rows
	rows, err := csvReader.ReadAll()
	if err != nil {
		return nil, err
	}
	if len(rows) == 0 {
		return nil, nil
	}

	header, body := rows[0], rows[1:]
	blocks := make([]textBlock, 0, len(body))
	for offset, record := range body {
		rowNumber := offset + 2
		fields := make([]string, 0, len(record))
		for col, cell := range record {
			cell = strings.TrimSpace(cell)
			if cell == "" {
				continue
			}
			label := ""
			if col < len(header) {
				label = strings.TrimSpace(header[col])
			}
			if label != "" {
				cell = label + ": " + cell
			}
			fields = append(fields, cell)
		}
		if len(fields) == 0 {
			continue
		}
		blocks = append(blocks, textBlock{
			Title:   fmt.Sprintf("row %d", rowNumber),
			Content: strings.Join(fields, "\n"),
			Line:    rowNumber,
		})
	}
	return blocks, nil
}
// parseDocxKnowledgeFile opens a .docx archive, reads its word/document.xml
// part and converts it into blocks with extractDocxBlocks. A missing or empty
// document part is reported as an error.
func parseDocxKnowledgeFile(path string) ([]textBlock, error) {
	archive, err := zip.OpenReader(path)
	if err != nil {
		return nil, err
	}
	defer archive.Close()

	var body []byte
	for _, entry := range archive.File {
		if entry.Name != "word/document.xml" {
			continue
		}
		if body, err = readZipFile(entry); err != nil {
			return nil, err
		}
		break
	}
	if len(body) == 0 {
		return nil, fmt.Errorf("word/document.xml not found")
	}
	return extractDocxBlocks(body), nil
}
// parseXlsxKnowledgeFile opens an .xlsx workbook with excelize and converts
// every non-empty sheet into blocks. Merged-cell values are first copied into
// the row data by applyMergedCellValues, then structuredSheetBlocks builds the
// blocks. A failure to read any sheet aborts the whole file.
func parseXlsxKnowledgeFile(path string) ([]textBlock, error) {
	file, err := excelize.OpenFile(path)
	if err != nil {
		return nil, err
	}
	// The close error is deliberately ignored; the workbook is only read.
	defer func() { _ = file.Close() }()
	blocks := make([]textBlock, 0)
	for _, sheetName := range file.GetSheetList() {
		rows, err := file.GetRows(sheetName)
		if err != nil {
			return nil, err
		}
		if len(rows) == 0 {
			continue
		}
		applyMergedCellValues(file, sheetName, rows)
		blocks = append(blocks, structuredSheetBlocks(sheetName, rows)...)
	}
	return blocks, nil
}
// applyMergedCellValues copies the value of each merged range that spans more
// than one row into every still-empty cell of that range inside rows, which is
// modified in place. Ranges with an empty value, ranges confined to a single
// row and ranges whose coordinates cannot be parsed are skipped. If the merge
// list cannot be read, rows is left unchanged.
func applyMergedCellValues(file *excelize.File, sheetName string, rows [][]string) {
	mergeCells, err := file.GetMergeCells(sheetName)
	if err != nil {
		return
	}
	for _, mergeCell := range mergeCells {
		value := strings.TrimSpace(mergeCell.GetCellValue())
		if value == "" {
			continue
		}
		startCol, startRow, err := excelize.CellNameToCoordinates(mergeCell.GetStartAxis())
		if err != nil {
			continue
		}
		endCol, endRow, err := excelize.CellNameToCoordinates(mergeCell.GetEndAxis())
		if err != nil {
			continue
		}
		// Merges within one row are not propagated.
		if startRow == endRow {
			continue
		}
		for row := startRow; row <= endRow; row++ {
			for col := startCol; col <= endCol; col++ {
				// Coordinates are 1-based; rows is 0-based.
				setSheetCellValue(rows, row-1, col-1, value)
			}
		}
	}
}
// setSheetCellValue writes value into rows[rowIndex][colIndex] when that cell
// is currently blank. The row is grown with empty cells if it is too short.
// Out-of-range row indexes and negative column indexes are ignored, and a cell
// that already holds non-blank text is left untouched.
func setSheetCellValue(rows [][]string, rowIndex int, colIndex int, value string) {
	outOfRange := rowIndex < 0 || rowIndex >= len(rows) || colIndex < 0
	if outOfRange {
		return
	}

	cells := rows[rowIndex]
	if colIndex >= len(cells) {
		grown := make([]string, colIndex+1)
		copy(grown, cells)
		cells = grown
		rows[rowIndex] = cells
	}
	if strings.TrimSpace(cells[colIndex]) != "" {
		return
	}
	cells[colIndex] = value
}
// structuredSheetBlocks converts the rows of one worksheet into blocks.
//
// Each non-empty row is classified as one of:
//   - a section row (exactly one non-empty cell and not a header row), which
//     sets the current section title and emits a "分类" block;
//   - a header row (see looksLikeSheetHeaderRow), which replaces the current
//     headers, resets the carry-forward values and emits a "表头" block;
//   - a data row, rendered with structuredSheetRowContent after
//     applySheetCarryForward has filled carried columns.
//
// Every block uses the sheet name as Title and the 1-based row index as Line.
func structuredSheetBlocks(sheetName string, rows [][]string) []textBlock {
	blocks := make([]textBlock, 0)
	headers := make([]string, 0)
	carry := make([]string, 0)
	sectionTitle := ""
	for i, rawRow := range rows {
		row := normalizeSheetRow(rawRow)
		nonEmpty := nonEmptySheetValues(row)
		if len(nonEmpty) == 0 {
			continue
		}
		// A lone value is treated as a section heading for the rows below it.
		if len(nonEmpty) == 1 && !looksLikeSheetHeaderRow(row) {
			sectionTitle = nonEmpty[0]
			blocks = append(blocks, textBlock{
				Title:   sheetName,
				Content: fmt.Sprintf("工作表: %s\n分类: %s", sheetName, sectionTitle),
				Line:    i + 1,
			})
			continue
		}
		if looksLikeSheetHeaderRow(row) {
			headers = row
			// A new header starts a new table, so carried values are reset.
			carry = make([]string, len(headers))
			// NOTE(review): the header labels are concatenated with an empty
			// separator here — confirm that is intended.
			blocks = append(blocks, textBlock{
				Title:   sheetName,
				Content: fmt.Sprintf("工作表: %s\n表头: %s", sheetName, strings.Join(nonEmpty, "")),
				Line:    i + 1,
			})
			continue
		}
		filled := applySheetCarryForward(row, headers, carry)
		content := structuredSheetRowContent(sheetName, sectionTitle, headers, filled)
		if strings.TrimSpace(content) == "" {
			continue
		}
		blocks = append(blocks, textBlock{Title: sheetName, Content: content, Line: i + 1})
	}
	return blocks
}
// extractDocxBlocks walks the XML of a word/document.xml part and returns its
// text as blocks.
//
// Paragraphs outside tables become "docx paragraph" blocks whose Line is a
// running count of non-empty paragraphs. Each table row with at least one
// non-empty cell becomes a "docx table" block whose cells are joined by " | "
// and whose Line is a running count of such rows. Only character data inside
// "t" elements is collected; "tab" elements add a space and "br" elements add
// a newline. Element names are matched on their local name only.
//
// Decoding stops silently at the first token error (including end of input),
// so malformed XML yields the blocks collected up to that point.
func extractDocxBlocks(data []byte) []textBlock {
	decoder := xml.NewDecoder(bytes.NewReader(data))
	blocks := make([]textBlock, 0)
	var paragraph strings.Builder
	var cell strings.Builder
	var rowCells []string
	inText := false
	inParagraph := false
	tableDepth := 0
	inCell := false
	rowIndex := 0
	paragraphIndex := 0
	// flushParagraph emits the collected paragraph text, if any.
	flushParagraph := func() {
		text := normalizeWhitespace(paragraph.String())
		paragraph.Reset()
		if text == "" {
			return
		}
		paragraphIndex++
		blocks = append(blocks, textBlock{Title: "docx paragraph", Content: text, Line: paragraphIndex})
	}
	// flushCell moves the collected cell text into the current row.
	flushCell := func() {
		text := normalizeWhitespace(cell.String())
		cell.Reset()
		if text != "" {
			rowCells = append(rowCells, text)
		}
	}
	// flushRow emits the current table row as one block.
	flushRow := func() {
		cleaned := make([]string, 0, len(rowCells))
		for _, value := range rowCells {
			value = strings.TrimSpace(value)
			if value != "" {
				cleaned = append(cleaned, value)
			}
		}
		rowCells = nil
		if len(cleaned) == 0 {
			return
		}
		rowIndex++
		blocks = append(blocks, textBlock{Title: "docx table", Content: strings.Join(cleaned, " | "), Line: rowIndex})
	}
	for {
		token, err := decoder.Token()
		if err != nil {
			break
		}
		switch t := token.(type) {
		case xml.StartElement:
			switch t.Name.Local {
			case "tbl":
				tableDepth++
			case "tr":
				if tableDepth > 0 {
					rowCells = nil
				}
			case "tc":
				if tableDepth > 0 {
					inCell = true
					cell.Reset()
				}
			case "p":
				// Paragraphs inside tables are collected through the cell
				// buffer instead of the paragraph buffer.
				if tableDepth == 0 {
					inParagraph = true
					paragraph.Reset()
				}
			case "t":
				inText = true
			case "tab":
				if inCell {
					cell.WriteString(" ")
				} else if inParagraph {
					paragraph.WriteString(" ")
				}
			case "br":
				if inCell {
					cell.WriteString("\n")
				} else if inParagraph {
					paragraph.WriteString("\n")
				}
			}
		case xml.EndElement:
			switch t.Name.Local {
			case "t":
				inText = false
			case "p":
				if tableDepth == 0 && inParagraph {
					flushParagraph()
				}
				inParagraph = false
				// Separate consecutive paragraphs within one cell by a space.
				if inCell {
					cell.WriteString(" ")
				}
			case "tc":
				if inCell {
					flushCell()
				}
				inCell = false
			case "tr":
				if tableDepth > 0 {
					flushRow()
				}
			case "tbl":
				if tableDepth > 0 {
					tableDepth--
				}
			}
		case xml.CharData:
			if !inText {
				continue
			}
			// NOTE(review): encoding/xml has normally already decoded entities
			// in CharData — confirm the additional HTML unescape is intended.
			text := html.UnescapeString(string(t))
			if inCell {
				cell.WriteString(text)
			} else if inParagraph {
				paragraph.WriteString(text)
			}
		}
	}
	return blocks
}
// normalizeSheetRow returns a copy of row in which every cell has its
// surrounding whitespace removed. The input is not modified.
func normalizeSheetRow(row []string) []string {
	trimmed := make([]string, 0, len(row))
	for _, cell := range row {
		trimmed = append(trimmed, strings.TrimSpace(cell))
	}
	return trimmed
}
// nonEmptySheetValues returns the trimmed cells of row that are not blank, in
// their original order.
func nonEmptySheetValues(row []string) []string {
	kept := make([]string, 0, len(row))
	for _, cell := range row {
		if cell = strings.TrimSpace(cell); cell != "" {
			kept = append(kept, cell)
		}
	}
	return kept
}
// looksLikeSheetHeaderRow reports whether at least two cells of row are known
// header labels according to isSheetHeaderLabel.
func looksLikeSheetHeaderRow(row []string) bool {
	const requiredLabels = 2
	found := 0
	for _, cell := range row {
		if !isSheetHeaderLabel(strings.TrimSpace(cell)) {
			continue
		}
		found++
		if found >= requiredLabels {
			return true
		}
	}
	return false
}
// isSheetHeaderLabel reports whether value, after trimming surrounding
// whitespace, is exactly one of the known worksheet column labels.
//
// The fixed label set is matched with a switch so that no lookup map has to
// be allocated on every call.
func isSheetHeaderLabel(value string) bool {
	switch strings.TrimSpace(value) {
	case "星期", "时段", "部门", "会议主题", "会议时间", "会议日期",
		"日期", "时间", "主题", "名称", "类别", "类型",
		"项目", "标准", "要求", "负责人", "内容", "操作指引",
		"检查项目", "核对内容", "详细", "备注", "测试流程":
		return true
	}
	return false
}
// applySheetCarryForward returns a copy of row in which blank cells of
// carry-forward columns are filled with the last value seen in that column.
//
// A column takes part when its header satisfies isCarryForwardSheetHeader and
// carry has a slot for it. Non-blank cells in such columns update carry in
// place with their trimmed value. The copy is padded to len(headers) when the
// row is shorter; with no headers the plain copy is returned.
func applySheetCarryForward(row []string, headers []string, carry []string) []string {
	result := append([]string(nil), row...)
	if len(headers) == 0 {
		return result
	}
	if len(result) < len(headers) {
		padded := make([]string, len(headers))
		copy(padded, result)
		result = padded
	}
	for col, rawHeader := range headers {
		if col >= len(carry) || !isCarryForwardSheetHeader(strings.TrimSpace(rawHeader)) {
			continue
		}
		if cell := strings.TrimSpace(result[col]); cell == "" {
			result[col] = carry[col]
		} else {
			carry[col] = cell
		}
	}
	return result
}
// isCarryForwardSheetHeader reports whether a column header names a column
// whose value is carried down into blank cells: the trimmed header must be
// non-empty and contain one of the weekday/date/department/category/type/
// project terms.
func isCarryForwardSheetHeader(header string) bool {
	label := strings.TrimSpace(header)
	if label == "" {
		return false
	}
	return strings.Contains(label, "星期") ||
		strings.Contains(label, "日期") ||
		strings.Contains(label, "部门") ||
		strings.Contains(label, "类别") ||
		strings.Contains(label, "类型") ||
		strings.Contains(label, "项目")
}
// structuredSheetRowContent renders one worksheet data row as newline-separated
// "label: value" lines.
//
// The first line names the sheet; a non-blank section title adds a second
// line. Each non-blank cell follows, labelled with its trimmed header or, when
// the header is missing or blank, with its 1-based column number.
func structuredSheetRowContent(sheetName string, sectionTitle string, headers []string, row []string) string {
	var out strings.Builder
	out.WriteString("工作表: " + sheetName)
	if section := strings.TrimSpace(sectionTitle); section != "" {
		out.WriteByte('\n')
		out.WriteString("分类: " + section)
	}
	for col, cell := range row {
		cell = strings.TrimSpace(cell)
		if cell == "" {
			continue
		}
		label := ""
		if col < len(headers) {
			label = strings.TrimSpace(headers[col])
		}
		if label == "" {
			label = fmt.Sprintf("列%d", col+1)
		}
		out.WriteByte('\n')
		out.WriteString(label + ": " + cell)
	}
	return out.String()
}
// parsePDFKnowledgeFile extracts one block per PDF page.
//
// Pages whose embedded text can be read and passes hasEnoughPDFText are used
// directly. All other pages are handed to ocrPDFPages; the OCR blocks are
// appended after the text blocks, so the result is not necessarily in page
// order.
//
// Returns blocks together with a knowledgeParseWarning when OCR reported
// warnings but at least one block exists, a knowledgeParseWarning alone when
// only warnings were produced, and a plain error when the file cannot be
// opened or nothing at all could be read.
func parsePDFKnowledgeFile(path string) ([]textBlock, error) {
	file, reader, err := pdf.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()
	pageCount := reader.NumPage()
	blocks := make([]textBlock, 0)
	// Page numbers that need rendering + visual recognition.
	needsOCR := make([]int, 0)
	for pageNum := 1; pageNum <= pageCount; pageNum++ {
		page := reader.Page(pageNum)
		text, pageErr := page.GetPlainText(nil)
		if pageErr == nil && hasEnoughPDFText(text) {
			blocks = append(blocks, textBlock{Title: fmt.Sprintf("pdf 第%d页", pageNum), Content: text, Page: pageNum})
			continue
		}
		needsOCR = append(needsOCR, pageNum)
	}
	var warnings []string
	if len(needsOCR) > 0 {
		ocrBlocks, ocrWarnings := ocrPDFPages(path, needsOCR)
		blocks = append(blocks, ocrBlocks...)
		warnings = append(warnings, ocrWarnings...)
	}
	if len(blocks) == 0 {
		if len(warnings) > 0 {
			return nil, knowledgeParseWarning{Warnings: warnings}
		}
		return nil, fmt.Errorf("PDF未读取到可索引内容")
	}
	// Partial success: hand back the blocks along with the warnings.
	if len(warnings) > 0 {
		return blocks, knowledgeParseWarning{Warnings: warnings}
	}
	return blocks, nil
}
// hasEnoughPDFText reports whether the text extracted from a PDF page is
// substantial enough to be used without OCR: at least 20 runes after
// whitespace normalization, or at least 5 distinct tokens.
func hasEnoughPDFText(text string) bool {
	const (
		minRunes  = 20
		minTokens = 5
	)
	cleaned := normalizeWhitespace(text)
	return len([]rune(cleaned)) >= minRunes || len(tokenizeKnowledgeText(cleaned)) >= minTokens
}
// ocrPDFPages renders the given PDF pages to images and runs visual
// recognition on each of them.
//
// Pages numbered above maxPDFOCRPages are skipped and reported with a single
// warning. Returns the recognised blocks (one per page with non-empty text)
// and every warning collected along the way. Setup failures (no renderer, no
// temp directory) are appended to the warnings gathered so far, so the
// page-limit warning is not lost on those paths.
func ocrPDFPages(path string, pageNumbers []int) ([]textBlock, []string) {
	if len(pageNumbers) == 0 {
		return nil, nil
	}
	var warnings []string
	limitedPages := make([]int, 0, len(pageNumbers))
	for _, pageNum := range pageNumbers {
		if pageNum <= maxPDFOCRPages {
			limitedPages = append(limitedPages, pageNum)
		}
	}
	if len(limitedPages) < len(pageNumbers) {
		warnings = append(warnings, fmt.Sprintf("PDF超过%d页后续页面未做视觉识别", maxPDFOCRPages))
	}
	if len(limitedPages) == 0 {
		return nil, warnings
	}

	renderer, err := pdfFindRenderer()
	if err != nil {
		return nil, append(warnings, "PDF扫描页需要pdftoppm渲染但未找到可用工具: "+err.Error())
	}
	tmpDir, err := os.MkdirTemp("", "qiwei_pdf_ocr_*")
	if err != nil {
		return nil, append(warnings, "PDF OCR临时目录创建失败: "+err.Error())
	}
	// Rendered page images are only needed while this function runs.
	defer os.RemoveAll(tmpDir)

	// Use the replaceable hook when set, otherwise the built-in recogniser.
	ocr := pdfOCRPageImage
	if ocr == nil {
		ocr = ocrPDFPageImage
	}

	blocks := make([]textBlock, 0, len(limitedPages))
	for _, pageNum := range limitedPages {
		imagePath, err := renderPDFPageFunc(renderer, path, pageNum, tmpDir)
		if err != nil {
			warnings = append(warnings, fmt.Sprintf("PDF第%d页渲染失败: %v", pageNum, err))
			continue
		}
		text, err := ocr(imagePath, pageNum)
		if err != nil {
			warnings = append(warnings, fmt.Sprintf("PDF第%d页视觉识别失败: %v", pageNum, err))
			continue
		}
		text = normalizeWhitespace(text)
		if text == "" {
			warnings = append(warnings, fmt.Sprintf("PDF第%d页视觉识别为空", pageNum))
			continue
		}
		blocks = append(blocks, textBlock{Title: fmt.Sprintf("pdf 第%d页", pageNum), Content: text, Page: pageNum})
	}
	return blocks, warnings
}
// findPDFRenderer locates the pdftoppm executable. The bundled copies under
// tools/pdf (resolved via resolveAutoReplyPath) are tried first, then the
// system PATH; in both places "pdftoppm.exe" is preferred over "pdftoppm".
func findPDFRenderer() (string, error) {
	binaries := []string{"pdftoppm.exe", "pdftoppm"}

	for _, name := range binaries {
		bundled := resolveAutoReplyPath(filepath.Join("tools", "pdf", name))
		if info, err := os.Stat(bundled); err == nil && !info.IsDir() {
			return bundled, nil
		}
	}
	for _, name := range binaries {
		if found, err := exec.LookPath(name); err == nil {
			return found, nil
		}
	}
	return "", fmt.Errorf("pdftoppm.exe not found")
}
// renderPDFPage runs the renderer (pdftoppm) to convert a single page of
// pdfPath into a PNG at 160 DPI inside tmpDir and returns the path of the
// generated image. A failing command is reported together with the first 200
// characters of its combined output.
func renderPDFPage(renderer string, pdfPath string, pageNum int, tmpDir string) (string, error) {
	page := fmt.Sprintf("%d", pageNum)
	outPrefix := filepath.Join(tmpDir, fmt.Sprintf("page_%d", pageNum))

	// -f and -l both set to the same page restrict rendering to that page.
	out, runErr := exec.Command(renderer, "-f", page, "-l", page, "-png", "-r", "160", pdfPath, outPrefix).CombinedOutput()
	if runErr != nil {
		return "", fmt.Errorf("%v: %s", runErr, truncateText(string(out), 200))
	}

	// The renderer appends "-<page>" plus the extension to the prefix.
	images, globErr := filepath.Glob(outPrefix + "-*.png")
	if globErr != nil {
		return "", globErr
	}
	if len(images) == 0 {
		return "", fmt.Errorf("pdftoppm未生成图片")
	}
	return images[0], nil
}
// ocrPDFPageImage recognises the text on a rendered PDF page image.
//
// The image file is converted to a data URL with imageDataURLFromFile and sent,
// together with a system prompt and a per-page user prompt, to
// callOpenAICompatibleVisionChat using the engine's AI configuration. The
// trimmed answer is returned; errors from either step are passed through.
func ocrPDFPageImage(imagePath string, pageNum int) (string, error) {
	cfg := getAutoReplyEngine().getConfig()
	dataURL, err := imageDataURLFromFile(imagePath)
	if err != nil {
		return "", err
	}
	// The prompts tell the model to transcribe only visible text, keep table
	// structure and not invent content.
	systemPrompt := "你是一个严谨的PDF页面OCR识别器只提取图片中真实可见的文字保留标题、表格行列关系和关键数值不要补充不存在的内容。"
	userPrompt := fmt.Sprintf("请完整识别这张PDF第%d页中的全部可见文字。若有表格请用每行一段的方式输出。", pageNum)
	result, err := callOpenAICompatibleVisionChat(cfg.AI, systemPrompt, userPrompt, dataURL)
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(result.Answer), nil
}
// knowledgeParagraphBreakPattern matches a run of two or more newlines, i.e. a
// paragraph boundary. It is compiled once at package scope instead of on every
// splitTextToBlocks call.
var knowledgeParagraphBreakPattern = regexp.MustCompile(`\n{2,}`)

// splitTextToBlocks normalizes text and splits it into one block per
// paragraph (paragraphs are separated by blank lines). Every block gets the
// given title and page; Line is the 1-based paragraph position. Empty text
// yields nil.
func splitTextToBlocks(text string, title string, page int) []textBlock {
	text = normalizeWhitespace(text)
	if text == "" {
		return nil
	}
	paragraphs := knowledgeParagraphBreakPattern.Split(text, -1)
	blocks := make([]textBlock, 0, len(paragraphs))
	for i, p := range paragraphs {
		p = strings.TrimSpace(p)
		if p != "" {
			blocks = append(blocks, textBlock{Title: title, Content: p, Line: i + 1, Page: page})
		}
	}
	// Safety net: keep the whole text if splitting produced nothing.
	if len(blocks) == 0 {
		blocks = append(blocks, textBlock{Title: title, Content: text, Page: page})
	}
	return blocks
}
// readZipFile opens one entry of a zip archive and returns its complete
// decompressed contents.
func readZipFile(file *zip.File) ([]byte, error) {
	stream, openErr := file.Open()
	if openErr != nil {
		return nil, openErr
	}
	defer stream.Close()

	contents, readErr := io.ReadAll(stream)
	return contents, readErr
}
// extractXMLText returns every non-blank character-data run of an XML
// document, trimmed, HTML-unescaped and joined by newlines. Decoding stops at
// the first token error, which includes the normal end of input.
func extractXMLText(data []byte) string {
	dec := xml.NewDecoder(bytes.NewReader(data))
	var lines []string
	for tok, err := dec.Token(); err == nil; tok, err = dec.Token() {
		chars, isText := tok.(xml.CharData)
		if !isText {
			continue
		}
		if trimmed := strings.TrimSpace(string(chars)); trimmed != "" {
			lines = append(lines, html.UnescapeString(trimmed))
		}
	}
	return strings.Join(lines, "\n")
}
// extractSharedStrings returns one string per "si" element of a shared-strings
// XML part, in document order. All character data inside an item is
// concatenated as-is. Decoding stops at the first token error, returning what
// has been collected so far.
func extractSharedStrings(data []byte) []string {
	dec := xml.NewDecoder(bytes.NewReader(data))
	result := make([]string, 0)
	var item strings.Builder
	insideItem := false
	for {
		tok, err := dec.Token()
		if err != nil {
			return result
		}
		switch node := tok.(type) {
		case xml.StartElement:
			if node.Name.Local == "si" {
				insideItem = true
				item.Reset()
			}
		case xml.EndElement:
			if insideItem && node.Name.Local == "si" {
				result = append(result, item.String())
				insideItem = false
			}
		case xml.CharData:
			if insideItem {
				item.Write(node)
			}
		}
	}
}
// extractSheetRows walks the XML of a worksheet part and returns one block per
// row that has at least one non-empty cell. Cell values are joined by " | ",
// Title is the base name of sheetName and Line is a running count of "row"
// elements. Cells of type "s" are looked up in sharedStrings when their value
// is a valid index. Decoding stops silently at the first token error.
func extractSheetRows(sheetName string, data []byte, sharedStrings []string) []textBlock {
	decoder := xml.NewDecoder(bytes.NewReader(data))
	blocks := make([]textBlock, 0)
	var row []string
	var cellType string
	var cellValue string
	inRow := false
	inV := false
	rowNum := 0
	for {
		token, err := decoder.Token()
		if err != nil {
			break
		}
		switch t := token.(type) {
		case xml.StartElement:
			switch t.Name.Local {
			case "row":
				inRow = true
				row = nil
				rowNum++
			case "c":
				// New cell: forget the previous cell's type and value.
				cellType = ""
				cellValue = ""
				for _, attr := range t.Attr {
					if attr.Name.Local == "t" {
						cellType = attr.Value
					}
				}
			case "v", "t":
				// Both "v" and "t" elements contribute text to the cell value.
				if inRow {
					inV = true
				}
			}
		case xml.EndElement:
			switch t.Name.Local {
			case "v", "t":
				inV = false
			case "c":
				value := strings.TrimSpace(cellValue)
				if cellType == "s" {
					// Shared-string cell: the value is an index into sharedStrings.
					if idx, err := strconvAtoiSafe(value); err == nil && idx >= 0 && idx < len(sharedStrings) {
						value = sharedStrings[idx]
					}
				}
				if value != "" {
					row = append(row, value)
				}
			case "row":
				inRow = false
				if len(row) > 0 {
					blocks = append(blocks, textBlock{Title: filepath.Base(sheetName), Content: strings.Join(row, " | "), Line: rowNum})
				}
			}
		case xml.CharData:
			if inV {
				cellValue += string(t)
			}
		}
	}
	return blocks
}
// pdfLiteralStringPattern matches a parenthesised run without nested
// parentheses, the form in which literal strings appear in raw PDF content.
// It is compiled once at package scope instead of on every
// extractPDFLikeText call.
var pdfLiteralStringPattern = regexp.MustCompile(`\(([^()]*)\)`)

// extractPDFLikeText pulls text out of raw PDF bytes without a PDF parser.
//
// Every parenthesised literal that is non-blank and more than 60% printable
// (see printableRatio) is kept, one per line. When none qualifies, the whole
// input is returned with every non-printable rune except newline, carriage
// return and tab replaced by a space.
func extractPDFLikeText(data []byte) string {
	raw := string(data)
	matches := pdfLiteralStringPattern.FindAllStringSubmatch(raw, -1)
	parts := make([]string, 0, len(matches))
	for _, match := range matches {
		if len(match) > 1 {
			// Resolve the escape sequences for parentheses and newlines.
			text := strings.ReplaceAll(match[1], `\(`, "(")
			text = strings.ReplaceAll(text, `\)`, ")")
			text = strings.ReplaceAll(text, `\n`, "\n")
			text = strings.TrimSpace(text)
			if text != "" && printableRatio(text) > 0.6 {
				parts = append(parts, text)
			}
		}
	}
	if len(parts) == 0 {
		parts = append(parts, strings.Map(func(r rune) rune {
			if r == '\n' || r == '\r' || r == '\t' || unicode.IsPrint(r) {
				return r
			}
			return ' '
		}, raw))
	}
	return strings.Join(parts, "\n")
}
// tokenizeKnowledgeText lower-cases text and counts its tokens.
//
// Runs of non-Han letters and digits form word tokens; words of a single rune
// are dropped. Runs of Han characters are passed to addChineseTokens. Any
// other rune acts as a separator.
func tokenizeKnowledgeText(text string) map[string]int {
	counts := make(map[string]int)
	var word, hanRun []rune

	// endWord records the pending word (if long enough) and starts a new one.
	endWord := func() {
		if len(word) > 1 {
			counts[string(word)]++
		}
		word = nil
	}
	// endHanRun records the pending run of Han characters.
	endHanRun := func() {
		if len(hanRun) > 0 {
			addChineseTokens(counts, hanRun)
			hanRun = nil
		}
	}

	for _, r := range strings.ToLower(text) {
		switch {
		case unicode.Is(unicode.Han, r):
			endWord()
			hanRun = append(hanRun, r)
		case unicode.IsLetter(r) || unicode.IsDigit(r):
			endHanRun()
			word = append(word, r)
		default:
			endHanRun()
			endWord()
		}
	}
	endWord()
	endHanRun()
	return counts
}
// addChineseTokens adds the tokens of one run of Han characters to tokens:
// every single character counts 1 and every pair of adjacent characters
// counts 2.
func addChineseTokens(tokens map[string]int, chars []rune) {
	for i, r := range chars {
		tokens[string(r)]++
		if i > 0 {
			tokens[string(chars[i-1:i+1])] += 2
		}
	}
}
// hashKnowledgeChunk returns the SHA-1 hex digest of "source:idx:content".
func hashKnowledgeChunk(source string, content string, idx int) string {
	hasher := sha1.New()
	fmt.Fprintf(hasher, "%s:%d:%s", source, idx, content)
	return hex.EncodeToString(hasher.Sum(nil))
}
// stripBOM removes a single leading byte-order mark (U+FEFF) from text, if
// present.
func stripBOM(text string) string {
	const bom = "\ufeff"
	if strings.HasPrefix(text, bom) {
		return text[len(bom):]
	}
	return text
}
// horizontalWhitespaceRunPattern matches a run of spaces and tabs. It is
// compiled once at package scope rather than once per line inside
// normalizeWhitespace.
var horizontalWhitespaceRunPattern = regexp.MustCompile(`[ \t]+`)

// normalizeWhitespace canonicalizes the whitespace of text: line endings
// become "\n", runs of spaces and tabs collapse to one space, every line is
// trimmed, consecutive blank lines collapse into a single blank line, and
// leading and trailing blank lines are removed.
func normalizeWhitespace(text string) string {
	text = strings.ReplaceAll(text, "\r\n", "\n")
	text = strings.ReplaceAll(text, "\r", "\n")
	lines := strings.Split(text, "\n")
	cleaned := make([]string, 0, len(lines))
	for _, line := range lines {
		line = strings.TrimSpace(horizontalWhitespaceRunPattern.ReplaceAllString(line, " "))
		switch {
		case line != "":
			cleaned = append(cleaned, line)
		case len(cleaned) > 0 && cleaned[len(cleaned)-1] != "":
			// Keep at most one blank line between paragraphs.
			cleaned = append(cleaned, "")
		}
	}
	return strings.TrimSpace(strings.Join(cleaned, "\n"))
}
// printableRatio returns the fraction of runes in text that are printable or
// whitespace, in the range [0, 1]. Empty text yields 0.
func printableRatio(text string) float64 {
	if text == "" {
		return 0
	}
	var visible, seen int
	for _, r := range text {
		seen++
		if unicode.IsSpace(r) || unicode.IsPrint(r) {
			visible++
		}
	}
	return float64(visible) / float64(seen)
}
// strconvAtoiSafe parses a non-negative decimal integer.
//
// Surrounding whitespace is ignored. An error is returned for empty input, for
// any character other than the ASCII digits 0-9 (so signs are rejected), and
// for values that do not fit in an int.
func strconvAtoiSafe(value string) (int, error) {
	const maxInt = int(^uint(0) >> 1)

	value = strings.TrimSpace(value)
	if value == "" {
		return 0, fmt.Errorf("empty")
	}
	n := 0
	for _, r := range value {
		if r < '0' || r > '9' {
			return 0, fmt.Errorf("invalid int")
		}
		digit := int(r - '0')
		// Reject the digit before n*10+digit could wrap around.
		if n > (maxInt-digit)/10 {
			return 0, fmt.Errorf("int overflow")
		}
		n = n*10 + digit
	}
	return n, nil
}
// resolveAutoReplyPath turns a configured path into the path to use on disk.
// Absolute paths are returned unchanged. Relative paths are joined onto the
// directory of the running executable, or onto the working directory when the
// executable cannot be determined; if neither is available the value is
// returned as given.
func resolveAutoReplyPath(pathValue string) string {
	if filepath.IsAbs(pathValue) {
		return pathValue
	}
	if exePath, err := os.Executable(); err == nil {
		return filepath.Join(filepath.Dir(exePath), pathValue)
	}
	if wd, err := os.Getwd(); err == nil {
		return filepath.Join(wd, pathValue)
	}
	return pathValue
}