package main import ( "archive/zip" "bytes" "crypto/sha1" "encoding/csv" "encoding/hex" "encoding/json" "encoding/xml" "fmt" "html" "io" "math" "os" "os/exec" "path/filepath" "regexp" "strings" "time" "unicode" "github.com/ledongthuc/pdf" "github.com/xuri/excelize/v2" ) const maxKnowledgeChunkContentRunes = 6000 const maxPDFOCRPages = 20 var ( pdfFindRenderer = findPDFRenderer pdfOCRPageImage func(string, int) (string, error) renderPDFPageFunc = renderPDFPage ) type KnowledgeChunk struct { ID string `json:"id"` Source string `json:"source"` Title string `json:"title"` Content string `json:"content"` Line int `json:"line"` Page int `json:"page"` SectionID string `json:"sectionId,omitempty"` SectionIndex int `json:"sectionIndex,omitempty"` PartIndex int `json:"partIndex,omitempty"` UpdatedAt int64 `json:"updatedAt"` Hash string `json:"hash"` Score float64 `json:"score,omitempty"` } type KnowledgeIndex struct { Chunks []KnowledgeChunk `json:"chunks"` FileCount int `json:"fileCount"` FailedFiles []string `json:"failedFiles"` LastIndexedAt int64 `json:"lastIndexedAt"` } type knowledgeParseWarning struct { Warnings []string } func (e knowledgeParseWarning) Error() string { return strings.Join(e.Warnings, ";") } func NewKnowledgeIndex() *KnowledgeIndex { return &KnowledgeIndex{ Chunks: make([]KnowledgeChunk, 0), FailedFiles: make([]string, 0), } } func (e *AutoReplyEngine) loadKnowledgeIndex() error { cfg := e.getConfig() indexPath := resolveAutoReplyPath(cfg.Knowledge.IndexPath) data, err := os.ReadFile(indexPath) if err != nil { if os.IsNotExist(err) { e.updateKnowledgeStatus(NewKnowledgeIndex()) return nil } return err } var idx KnowledgeIndex if err := json.Unmarshal(data, &idx); err != nil { return err } e.mu.Lock() e.index = &idx e.status.KnowledgeFileCount = idx.FileCount e.status.KnowledgeChunkCount = len(idx.Chunks) e.status.KnowledgeLastIndexedAt = idx.LastIndexedAt e.status.KnowledgeFailedFiles = append([]string(nil), idx.FailedFiles...) e.mu.Unlock() return nil } func (e *AutoReplyEngine) rebuildKnowledgeIndex() (*KnowledgeIndex, error) { cfg := e.getConfig() root := resolveAutoReplyPath(cfg.Knowledge.Directory) idx := NewKnowledgeIndex() allowed := make(map[string]bool) for _, ext := range cfg.Knowledge.SupportedExtensions { allowed[strings.ToLower(ext)] = true } if len(allowed) == 0 { allowed[".md"] = true allowed[".txt"] = true allowed[".csv"] = true } if err := os.MkdirAll(root, 0755); err != nil { return nil, err } entries, err := os.ReadDir(root) if err != nil { return nil, err } for _, entry := range entries { if entry.IsDir() { continue } ext := strings.ToLower(filepath.Ext(entry.Name())) if !isRootKnowledgeFile(entry.Name(), ext, allowed, cfg.Knowledge.IndexPath, cfg.Retrieval.EmbeddingIndexPath) { continue } path := filepath.Join(root, entry.Name()) chunks, err := parseKnowledgeFile(path, root) if err != nil { var warning knowledgeParseWarning if ok := errorAs(err, &warning); ok { for _, item := range warning.Warnings { idx.FailedFiles = append(idx.FailedFiles, fmt.Sprintf("%s: %s", path, item)) } } else { idx.FailedFiles = append(idx.FailedFiles, fmt.Sprintf("%s: %v", path, err)) continue } } if len(chunks) == 0 { idx.FailedFiles = append(idx.FailedFiles, fmt.Sprintf("%s: 未读取到可索引内容", path)) continue } idx.FileCount++ idx.Chunks = append(idx.Chunks, chunks...) } idx.LastIndexedAt = time.Now().Unix() indexPath := resolveAutoReplyPath(cfg.Knowledge.IndexPath) if err := os.MkdirAll(filepath.Dir(indexPath), 0755); err != nil { return nil, err } data, err := json.MarshalIndent(idx, "", " ") if err != nil { return nil, err } if err := os.WriteFile(indexPath, data, 0644); err != nil { return nil, err } e.updateKnowledgeStatus(idx) if err := e.rebuildEmbeddingIndex(idx); err != nil { e.setLastErrorWithScope(autoReplyErrorScopeKnowledge, err.Error()) } return idx, nil } func isRootKnowledgeFile(name string, ext string, allowed map[string]bool, knowledgeIndexPath string, embeddingIndexPath string) bool { name = strings.TrimSpace(name) ext = strings.ToLower(ext) if name == "" || name == ".keep" || !allowed[ext] { return false } if strings.EqualFold(name, filepath.Base(knowledgeIndexPath)) || strings.EqualFold(name, filepath.Base(embeddingIndexPath)) { return false } return true } func errorAs(err error, target interface{}) bool { switch t := target.(type) { case *knowledgeParseWarning: if value, ok := err.(knowledgeParseWarning); ok { *t = value return true } if value, ok := err.(*knowledgeParseWarning); ok { *t = *value return true } } return false } func (e *AutoReplyEngine) updateKnowledgeStatus(idx *KnowledgeIndex) { e.mu.Lock() e.index = idx e.status.KnowledgeFileCount = idx.FileCount e.status.KnowledgeChunkCount = len(idx.Chunks) e.status.KnowledgeLastIndexedAt = idx.LastIndexedAt e.status.KnowledgeFailedFiles = append([]string(nil), idx.FailedFiles...) e.mu.Unlock() } func scoreKnowledgeChunk(queryTokens map[string]int, chunk KnowledgeChunk) float64 { textTokens := tokenizeKnowledgeText(chunk.Title + " " + chunk.Content) if len(textTokens) == 0 { return 0 } var matched, weighted float64 for token, qCount := range queryTokens { if count, ok := textTokens[token]; ok { matched++ weighted += math.Min(float64(count), float64(qCount)+1) } } if matched == 0 { return 0 } coverage := matched / float64(len(queryTokens)) density := weighted / math.Sqrt(float64(len(textTokens))+1) return coverage*0.75 + density*0.25 } func parseKnowledgeFile(path string, root string) ([]KnowledgeChunk, error) { ext := strings.ToLower(filepath.Ext(path)) var blocks []textBlock var err error switch ext { case ".md", ".txt": blocks, err = parsePlainKnowledgeFile(path) case ".csv": blocks, err = parseCSVKnowledgeFile(path) case ".docx": blocks, err = parseDocxKnowledgeFile(path) case ".xlsx": blocks, err = parseXlsxKnowledgeFile(path) case ".pdf": blocks, err = parsePDFKnowledgeFile(path) default: err = fmt.Errorf("unsupported extension: %s", ext) } if err != nil { return nil, err } rel, err := filepath.Rel(root, path) if err != nil { rel = filepath.Base(path) } info, _ := os.Stat(path) updatedAt := time.Now().Unix() if info != nil { updatedAt = info.ModTime().Unix() } chunks := make([]KnowledgeChunk, 0, len(blocks)) for i, block := range blocks { content := strings.TrimSpace(block.Content) if content == "" || isLowValueKnowledgeBlock(block.Title, content) { continue } parts := splitLongKnowledgeContent(content, maxKnowledgeChunkContentRunes) for partIndex, part := range parts { title := block.Title if len(parts) > 1 { title = fmt.Sprintf("%s %d/%d", strings.TrimSpace(block.Title), partIndex+1, len(parts)) } hash := hashKnowledgeChunk(rel, part, i*1000+partIndex) sectionID := hashKnowledgeChunk(rel, strings.TrimSpace(block.Title), i) chunks = append(chunks, KnowledgeChunk{ ID: hash, Source: filepath.ToSlash(rel), Title: title, Content: part, Line: block.Line, Page: block.Page, SectionID: sectionID, SectionIndex: i, PartIndex: partIndex, UpdatedAt: updatedAt, Hash: hash, }) } } return chunks, nil } type textBlock struct { Title string Content string Line int Page int } func splitLongKnowledgeContent(content string, maxRunes int) []string { content = strings.TrimSpace(content) if content == "" { return nil } if maxRunes <= 0 || len([]rune(content)) <= maxRunes { return []string{content} } var chunks []string var current strings.Builder currentRunes := 0 flush := func() { text := strings.TrimSpace(current.String()) if text != "" { chunks = append(chunks, text) } current.Reset() currentRunes = 0 } appendPiece := func(piece string) { piece = strings.TrimSpace(piece) if piece == "" { return } pieceRunes := []rune(piece) for len(pieceRunes) > maxRunes { if currentRunes > 0 { flush() } chunks = append(chunks, strings.TrimSpace(string(pieceRunes[:maxRunes]))) pieceRunes = pieceRunes[maxRunes:] } if len(pieceRunes) == 0 { return } separatorRunes := 0 if currentRunes > 0 { separatorRunes = 1 } if currentRunes+separatorRunes+len(pieceRunes) > maxRunes { flush() } if currentRunes > 0 { current.WriteString("\n") currentRunes++ } current.WriteString(string(pieceRunes)) currentRunes += len(pieceRunes) } for _, line := range strings.Split(content, "\n") { appendPiece(line) } flush() if len(chunks) == 0 { return []string{content} } return chunks } func parsePlainKnowledgeFile(path string) ([]textBlock, error) { data, err := os.ReadFile(path) if err != nil { return nil, err } text := stripBOM(string(data)) lines := strings.Split(text, "\n") blocks := make([]textBlock, 0) var currentTitle string var current []string startLine := 1 inFrontMatter := false inDataviewBlock := false flush := func() { content := strings.TrimSpace(strings.Join(current, "\n")) if content != "" && !isLowValueKnowledgeBlock(currentTitle, content) { blocks = append(blocks, textBlock{Title: currentTitle, Content: content, Line: startLine}) } current = nil } for i, line := range lines { trimmed := strings.TrimSpace(line) if i == 0 && trimmed == "---" { inFrontMatter = true continue } if inFrontMatter { if trimmed == "---" { inFrontMatter = false startLine = i + 2 } continue } if inDataviewBlock { if strings.HasPrefix(trimmed, "```") { inDataviewBlock = false startLine = i + 2 } continue } if strings.EqualFold(trimmed, "```dataview") { flush() inDataviewBlock = true continue } if strings.HasPrefix(trimmed, "#") { flush() currentTitle = strings.TrimSpace(strings.TrimLeft(trimmed, "#")) startLine = i + 1 continue } if trimmed == "" { flush() startLine = i + 2 continue } if len(current) == 0 { startLine = i + 1 } current = append(current, trimmed) } flush() return blocks, nil } func isLowValueKnowledgeBlock(title string, content string) bool { content = strings.TrimSpace(content) if content == "" { return true } if content == "---" || strings.Trim(content, "- \t\r\n") == "" { return true } lower := strings.ToLower(content) if strings.HasPrefix(lower, "```dataview") || strings.Contains(lower, "list from [[") { return true } if strings.HasPrefix(content, "---\n") && strings.Contains(content, "\n---") { return true } title = strings.TrimSpace(title) return title == "反向链接" && (strings.Contains(lower, "dataview") || strings.Contains(lower, "list from [[")) } func parseCSVKnowledgeFile(path string) ([]textBlock, error) { data, err := os.ReadFile(path) if err != nil { return nil, err } reader := csv.NewReader(bytes.NewReader([]byte(stripBOM(string(data))))) reader.FieldsPerRecord = -1 records, err := reader.ReadAll() if err != nil { return nil, err } if len(records) == 0 { return nil, nil } headers := records[0] blocks := make([]textBlock, 0, len(records)-1) for i, row := range records[1:] { parts := make([]string, 0, len(row)) for j, val := range row { val = strings.TrimSpace(val) if val == "" { continue } if j < len(headers) && strings.TrimSpace(headers[j]) != "" { parts = append(parts, strings.TrimSpace(headers[j])+": "+val) } else { parts = append(parts, val) } } if len(parts) > 0 { blocks = append(blocks, textBlock{Title: fmt.Sprintf("row %d", i+2), Content: strings.Join(parts, "\n"), Line: i + 2}) } } return blocks, nil } func parseDocxKnowledgeFile(path string) ([]textBlock, error) { zr, err := zip.OpenReader(path) if err != nil { return nil, err } defer zr.Close() var document []byte for _, file := range zr.File { if file.Name == "word/document.xml" { document, err = readZipFile(file) if err != nil { return nil, err } break } } if len(document) == 0 { return nil, fmt.Errorf("word/document.xml not found") } return extractDocxBlocks(document), nil } func parseXlsxKnowledgeFile(path string) ([]textBlock, error) { file, err := excelize.OpenFile(path) if err != nil { return nil, err } defer func() { _ = file.Close() }() blocks := make([]textBlock, 0) for _, sheetName := range file.GetSheetList() { rows, err := file.GetRows(sheetName) if err != nil { return nil, err } if len(rows) == 0 { continue } applyMergedCellValues(file, sheetName, rows) blocks = append(blocks, structuredSheetBlocks(sheetName, rows)...) } return blocks, nil } func applyMergedCellValues(file *excelize.File, sheetName string, rows [][]string) { mergeCells, err := file.GetMergeCells(sheetName) if err != nil { return } for _, mergeCell := range mergeCells { value := strings.TrimSpace(mergeCell.GetCellValue()) if value == "" { continue } startCol, startRow, err := excelize.CellNameToCoordinates(mergeCell.GetStartAxis()) if err != nil { continue } endCol, endRow, err := excelize.CellNameToCoordinates(mergeCell.GetEndAxis()) if err != nil { continue } if startRow == endRow { continue } for row := startRow; row <= endRow; row++ { for col := startCol; col <= endCol; col++ { setSheetCellValue(rows, row-1, col-1, value) } } } } func setSheetCellValue(rows [][]string, rowIndex int, colIndex int, value string) { if rowIndex < 0 || rowIndex >= len(rows) || colIndex < 0 { return } if len(rows[rowIndex]) <= colIndex { expanded := make([]string, colIndex+1) copy(expanded, rows[rowIndex]) rows[rowIndex] = expanded } if strings.TrimSpace(rows[rowIndex][colIndex]) == "" { rows[rowIndex][colIndex] = value } } func structuredSheetBlocks(sheetName string, rows [][]string) []textBlock { blocks := make([]textBlock, 0) headers := make([]string, 0) carry := make([]string, 0) sectionTitle := "" for i, rawRow := range rows { row := normalizeSheetRow(rawRow) nonEmpty := nonEmptySheetValues(row) if len(nonEmpty) == 0 { continue } if len(nonEmpty) == 1 && !looksLikeSheetHeaderRow(row) { sectionTitle = nonEmpty[0] blocks = append(blocks, textBlock{ Title: sheetName, Content: fmt.Sprintf("工作表: %s\n分类: %s", sheetName, sectionTitle), Line: i + 1, }) continue } if looksLikeSheetHeaderRow(row) { headers = row carry = make([]string, len(headers)) blocks = append(blocks, textBlock{ Title: sheetName, Content: fmt.Sprintf("工作表: %s\n表头: %s", sheetName, strings.Join(nonEmpty, ";")), Line: i + 1, }) continue } filled := applySheetCarryForward(row, headers, carry) content := structuredSheetRowContent(sheetName, sectionTitle, headers, filled) if strings.TrimSpace(content) == "" { continue } blocks = append(blocks, textBlock{Title: sheetName, Content: content, Line: i + 1}) } return blocks } func extractDocxBlocks(data []byte) []textBlock { decoder := xml.NewDecoder(bytes.NewReader(data)) blocks := make([]textBlock, 0) var paragraph strings.Builder var cell strings.Builder var rowCells []string inText := false inParagraph := false tableDepth := 0 inCell := false rowIndex := 0 paragraphIndex := 0 flushParagraph := func() { text := normalizeWhitespace(paragraph.String()) paragraph.Reset() if text == "" { return } paragraphIndex++ blocks = append(blocks, textBlock{Title: "docx paragraph", Content: text, Line: paragraphIndex}) } flushCell := func() { text := normalizeWhitespace(cell.String()) cell.Reset() if text != "" { rowCells = append(rowCells, text) } } flushRow := func() { cleaned := make([]string, 0, len(rowCells)) for _, value := range rowCells { value = strings.TrimSpace(value) if value != "" { cleaned = append(cleaned, value) } } rowCells = nil if len(cleaned) == 0 { return } rowIndex++ blocks = append(blocks, textBlock{Title: "docx table", Content: strings.Join(cleaned, " | "), Line: rowIndex}) } for { token, err := decoder.Token() if err != nil { break } switch t := token.(type) { case xml.StartElement: switch t.Name.Local { case "tbl": tableDepth++ case "tr": if tableDepth > 0 { rowCells = nil } case "tc": if tableDepth > 0 { inCell = true cell.Reset() } case "p": if tableDepth == 0 { inParagraph = true paragraph.Reset() } case "t": inText = true case "tab": if inCell { cell.WriteString(" ") } else if inParagraph { paragraph.WriteString(" ") } case "br": if inCell { cell.WriteString("\n") } else if inParagraph { paragraph.WriteString("\n") } } case xml.EndElement: switch t.Name.Local { case "t": inText = false case "p": if tableDepth == 0 && inParagraph { flushParagraph() } inParagraph = false if inCell { cell.WriteString(" ") } case "tc": if inCell { flushCell() } inCell = false case "tr": if tableDepth > 0 { flushRow() } case "tbl": if tableDepth > 0 { tableDepth-- } } case xml.CharData: if !inText { continue } text := html.UnescapeString(string(t)) if inCell { cell.WriteString(text) } else if inParagraph { paragraph.WriteString(text) } } } return blocks } func normalizeSheetRow(row []string) []string { result := make([]string, len(row)) for i, value := range row { result[i] = strings.TrimSpace(value) } return result } func nonEmptySheetValues(row []string) []string { values := make([]string, 0, len(row)) for _, value := range row { if strings.TrimSpace(value) != "" { values = append(values, strings.TrimSpace(value)) } } return values } func looksLikeSheetHeaderRow(row []string) bool { matches := 0 for _, value := range row { value = strings.TrimSpace(value) if value == "" { continue } if isSheetHeaderLabel(value) { matches++ } } return matches >= 2 } func isSheetHeaderLabel(value string) bool { value = strings.TrimSpace(value) if value == "" { return false } headerLabels := map[string]bool{ "星期": true, "时段": true, "部门": true, "会议主题": true, "会议时间": true, "会议日期": true, "日期": true, "时间": true, "主题": true, "名称": true, "类别": true, "类型": true, "项目": true, "标准": true, "要求": true, "负责人": true, "内容": true, "操作指引": true, "检查项目": true, "核对内容": true, "详细": true, "备注": true, "测试流程": true, } return headerLabels[value] } func applySheetCarryForward(row []string, headers []string, carry []string) []string { filled := append([]string(nil), row...) if len(headers) == 0 { return filled } if len(filled) < len(headers) { expanded := make([]string, len(headers)) copy(expanded, filled) filled = expanded } for i := range headers { header := strings.TrimSpace(headers[i]) value := strings.TrimSpace(filled[i]) if value == "" && i < len(carry) && isCarryForwardSheetHeader(header) { filled[i] = carry[i] continue } if value != "" && i < len(carry) && isCarryForwardSheetHeader(header) { carry[i] = value } } return filled } func isCarryForwardSheetHeader(header string) bool { header = strings.TrimSpace(header) if header == "" { return false } for _, term := range []string{"星期", "日期", "部门", "类别", "类型", "项目"} { if strings.Contains(header, term) { return true } } return false } func structuredSheetRowContent(sheetName string, sectionTitle string, headers []string, row []string) string { parts := make([]string, 0, len(row)+2) parts = append(parts, "工作表: "+sheetName) if strings.TrimSpace(sectionTitle) != "" { parts = append(parts, "分类: "+strings.TrimSpace(sectionTitle)) } for i, value := range row { value = strings.TrimSpace(value) if value == "" { continue } label := "" if i < len(headers) { label = strings.TrimSpace(headers[i]) } if label == "" { label = fmt.Sprintf("列%d", i+1) } parts = append(parts, label+": "+value) } return strings.Join(parts, "\n") } func parsePDFKnowledgeFile(path string) ([]textBlock, error) { file, reader, err := pdf.Open(path) if err != nil { return nil, err } defer file.Close() pageCount := reader.NumPage() blocks := make([]textBlock, 0) needsOCR := make([]int, 0) for pageNum := 1; pageNum <= pageCount; pageNum++ { page := reader.Page(pageNum) text, pageErr := page.GetPlainText(nil) if pageErr == nil && hasEnoughPDFText(text) { blocks = append(blocks, textBlock{Title: fmt.Sprintf("pdf 第%d页", pageNum), Content: text, Page: pageNum}) continue } needsOCR = append(needsOCR, pageNum) } var warnings []string if len(needsOCR) > 0 { ocrBlocks, ocrWarnings := ocrPDFPages(path, needsOCR) blocks = append(blocks, ocrBlocks...) warnings = append(warnings, ocrWarnings...) } if len(blocks) == 0 { if len(warnings) > 0 { return nil, knowledgeParseWarning{Warnings: warnings} } return nil, fmt.Errorf("PDF未读取到可索引内容") } if len(warnings) > 0 { return blocks, knowledgeParseWarning{Warnings: warnings} } return blocks, nil } func hasEnoughPDFText(text string) bool { text = normalizeWhitespace(text) if len([]rune(text)) >= 20 { return true } tokens := tokenizeKnowledgeText(text) return len(tokens) >= 5 } func ocrPDFPages(path string, pageNumbers []int) ([]textBlock, []string) { if len(pageNumbers) == 0 { return nil, nil } var warnings []string limitedPages := make([]int, 0, len(pageNumbers)) for _, pageNum := range pageNumbers { if pageNum > maxPDFOCRPages { continue } limitedPages = append(limitedPages, pageNum) } if len(limitedPages) < len(pageNumbers) { warnings = append(warnings, fmt.Sprintf("PDF超过%d页,后续页面未做视觉识别", maxPDFOCRPages)) } pageNumbers = limitedPages if len(pageNumbers) == 0 { return nil, warnings } renderer, err := pdfFindRenderer() if err != nil { return nil, []string{"PDF扫描页需要pdftoppm渲染,但未找到可用工具: " + err.Error()} } tmpDir, err := os.MkdirTemp("", "qiwei_pdf_ocr_*") if err != nil { return nil, []string{"PDF OCR临时目录创建失败: " + err.Error()} } defer os.RemoveAll(tmpDir) blocks := make([]textBlock, 0, len(pageNumbers)) for _, pageNum := range pageNumbers { imagePath, err := renderPDFPageFunc(renderer, path, pageNum, tmpDir) if err != nil { warnings = append(warnings, fmt.Sprintf("PDF第%d页渲染失败: %v", pageNum, err)) continue } ocr := pdfOCRPageImage if ocr == nil { ocr = ocrPDFPageImage } text, err := ocr(imagePath, pageNum) if err != nil { warnings = append(warnings, fmt.Sprintf("PDF第%d页视觉识别失败: %v", pageNum, err)) continue } text = normalizeWhitespace(text) if text == "" { warnings = append(warnings, fmt.Sprintf("PDF第%d页视觉识别为空", pageNum)) continue } blocks = append(blocks, textBlock{Title: fmt.Sprintf("pdf 第%d页", pageNum), Content: text, Page: pageNum}) } return blocks, warnings } func findPDFRenderer() (string, error) { candidates := []string{ resolveAutoReplyPath(filepath.Join("tools", "pdf", "pdftoppm.exe")), resolveAutoReplyPath(filepath.Join("tools", "pdf", "pdftoppm")), } for _, candidate := range candidates { if info, err := os.Stat(candidate); err == nil && !info.IsDir() { return candidate, nil } } if path, err := exec.LookPath("pdftoppm.exe"); err == nil { return path, nil } if path, err := exec.LookPath("pdftoppm"); err == nil { return path, nil } return "", fmt.Errorf("pdftoppm.exe not found") } func renderPDFPage(renderer string, pdfPath string, pageNum int, tmpDir string) (string, error) { prefix := filepath.Join(tmpDir, fmt.Sprintf("page_%d", pageNum)) ctxArgs := []string{"-f", fmt.Sprintf("%d", pageNum), "-l", fmt.Sprintf("%d", pageNum), "-png", "-r", "160", pdfPath, prefix} cmd := exec.Command(renderer, ctxArgs...) output, err := cmd.CombinedOutput() if err != nil { return "", fmt.Errorf("%v: %s", err, truncateText(string(output), 200)) } matches, err := filepath.Glob(prefix + "-*.png") if err != nil { return "", err } if len(matches) == 0 { return "", fmt.Errorf("pdftoppm未生成图片") } return matches[0], nil } func ocrPDFPageImage(imagePath string, pageNum int) (string, error) { cfg := getAutoReplyEngine().getConfig() dataURL, err := imageDataURLFromFile(imagePath) if err != nil { return "", err } systemPrompt := "你是一个严谨的PDF页面OCR识别器,只提取图片中真实可见的文字,保留标题、表格行列关系和关键数值,不要补充不存在的内容。" userPrompt := fmt.Sprintf("请完整识别这张PDF第%d页中的全部可见文字。若有表格,请用每行一段的方式输出。", pageNum) result, err := callOpenAICompatibleVisionChat(cfg.AI, systemPrompt, userPrompt, dataURL) if err != nil { return "", err } return strings.TrimSpace(result.Answer), nil } func splitTextToBlocks(text string, title string, page int) []textBlock { text = normalizeWhitespace(text) if text == "" { return nil } paragraphs := regexp.MustCompile(`\n{2,}`).Split(text, -1) blocks := make([]textBlock, 0, len(paragraphs)) for i, p := range paragraphs { p = strings.TrimSpace(p) if p != "" { blocks = append(blocks, textBlock{Title: title, Content: p, Line: i + 1, Page: page}) } } if len(blocks) == 0 { blocks = append(blocks, textBlock{Title: title, Content: text, Page: page}) } return blocks } func readZipFile(file *zip.File) ([]byte, error) { rc, err := file.Open() if err != nil { return nil, err } defer rc.Close() return io.ReadAll(rc) } func extractXMLText(data []byte) string { decoder := xml.NewDecoder(bytes.NewReader(data)) var parts []string for { token, err := decoder.Token() if err != nil { break } if charData, ok := token.(xml.CharData); ok { text := strings.TrimSpace(string(charData)) if text != "" { parts = append(parts, html.UnescapeString(text)) } } } return strings.Join(parts, "\n") } func extractSharedStrings(data []byte) []string { decoder := xml.NewDecoder(bytes.NewReader(data)) values := make([]string, 0) var current []string inSI := false for { token, err := decoder.Token() if err != nil { break } switch t := token.(type) { case xml.StartElement: if t.Name.Local == "si" { inSI = true current = nil } case xml.EndElement: if t.Name.Local == "si" && inSI { values = append(values, strings.Join(current, "")) inSI = false } case xml.CharData: if inSI { current = append(current, string(t)) } } } return values } func extractSheetRows(sheetName string, data []byte, sharedStrings []string) []textBlock { decoder := xml.NewDecoder(bytes.NewReader(data)) blocks := make([]textBlock, 0) var row []string var cellType string var cellValue string inRow := false inV := false rowNum := 0 for { token, err := decoder.Token() if err != nil { break } switch t := token.(type) { case xml.StartElement: switch t.Name.Local { case "row": inRow = true row = nil rowNum++ case "c": cellType = "" cellValue = "" for _, attr := range t.Attr { if attr.Name.Local == "t" { cellType = attr.Value } } case "v", "t": if inRow { inV = true } } case xml.EndElement: switch t.Name.Local { case "v", "t": inV = false case "c": value := strings.TrimSpace(cellValue) if cellType == "s" { if idx, err := strconvAtoiSafe(value); err == nil && idx >= 0 && idx < len(sharedStrings) { value = sharedStrings[idx] } } if value != "" { row = append(row, value) } case "row": inRow = false if len(row) > 0 { blocks = append(blocks, textBlock{Title: filepath.Base(sheetName), Content: strings.Join(row, " | "), Line: rowNum}) } } case xml.CharData: if inV { cellValue += string(t) } } } return blocks } func extractPDFLikeText(data []byte) string { raw := string(data) re := regexp.MustCompile(`\(([^()]*)\)`) matches := re.FindAllStringSubmatch(raw, -1) parts := make([]string, 0, len(matches)) for _, match := range matches { if len(match) > 1 { text := strings.ReplaceAll(match[1], `\(`, "(") text = strings.ReplaceAll(text, `\)`, ")") text = strings.ReplaceAll(text, `\n`, "\n") text = strings.TrimSpace(text) if text != "" && printableRatio(text) > 0.6 { parts = append(parts, text) } } } if len(parts) == 0 { parts = append(parts, strings.Map(func(r rune) rune { if r == '\n' || r == '\r' || r == '\t' || unicode.IsPrint(r) { return r } return ' ' }, raw)) } return strings.Join(parts, "\n") } func tokenizeKnowledgeText(text string) map[string]int { text = strings.ToLower(text) tokens := make(map[string]int) var current []rune flush := func() { if len(current) > 0 { token := string(current) if len([]rune(token)) > 1 { tokens[token]++ } current = nil } } var chineseRunes []rune for _, r := range text { if unicode.Is(unicode.Han, r) { flush() chineseRunes = append(chineseRunes, r) continue } if len(chineseRunes) > 0 { addChineseTokens(tokens, chineseRunes) chineseRunes = nil } if unicode.IsLetter(r) || unicode.IsDigit(r) { current = append(current, r) } else { flush() } } flush() if len(chineseRunes) > 0 { addChineseTokens(tokens, chineseRunes) } return tokens } func addChineseTokens(tokens map[string]int, chars []rune) { for _, r := range chars { tokens[string(r)]++ } for i := 0; i+1 < len(chars); i++ { tokens[string(chars[i:i+2])] += 2 } } func hashKnowledgeChunk(source string, content string, idx int) string { sum := sha1.Sum([]byte(fmt.Sprintf("%s:%d:%s", source, idx, content))) return hex.EncodeToString(sum[:]) } func stripBOM(text string) string { return strings.TrimPrefix(text, "\ufeff") } func normalizeWhitespace(text string) string { text = strings.ReplaceAll(text, "\r\n", "\n") text = strings.ReplaceAll(text, "\r", "\n") lines := strings.Split(text, "\n") cleaned := make([]string, 0, len(lines)) for _, line := range lines { line = strings.TrimSpace(regexp.MustCompile(`[ \t]+`).ReplaceAllString(line, " ")) if line != "" { cleaned = append(cleaned, line) } else if len(cleaned) > 0 && cleaned[len(cleaned)-1] != "" { cleaned = append(cleaned, "") } } return strings.TrimSpace(strings.Join(cleaned, "\n")) } func printableRatio(text string) float64 { if text == "" { return 0 } printable := 0 total := 0 for _, r := range text { total++ if unicode.IsPrint(r) || unicode.IsSpace(r) { printable++ } } return float64(printable) / float64(total) } func strconvAtoiSafe(value string) (int, error) { value = strings.TrimSpace(value) n := 0 if value == "" { return 0, fmt.Errorf("empty") } for _, r := range value { if r < '0' || r > '9' { return 0, fmt.Errorf("invalid int") } n = n*10 + int(r-'0') } return n, nil } func resolveAutoReplyPath(pathValue string) string { if filepath.IsAbs(pathValue) { return pathValue } exePath, err := os.Executable() if err != nil { wd, wdErr := os.Getwd() if wdErr == nil { return filepath.Join(wd, pathValue) } return pathValue } return filepath.Join(filepath.Dir(exePath), pathValue) }