- 将默认回复详细程度从"detailed"调整为"medium",前后端保持一致
- 新增话题切换检测逻辑,当用户主动要求换话题时提供引导回复
- 优化上下文处理机制,仅在指代型追问时注入历史对话,避免模型复读旧内容
- 改进知识库检索逻辑,区分自包含问题和指代型问题的上下文需求
- 完善知识库完整性指令,确保回复详细程度与知识展开程度一致
- 重构知识库重建逻辑,支持递归扫描子目录中的文件,修复索引为空的问题
- 增强素材匹配算法,引入强信号检测机制,避免仅凭模糊匹配误发素材
- 新增素材开场白AI生成功能,支持图片、视频、文档等类型智能描述
- 改进知识库重建通知,显示具体的文件数、分片数及失败统计信息
275 lines
9.1 KiB
Go
275 lines
9.1 KiB
Go
package main
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"testing"
|
|
|
|
"qiweimanager/config"
|
|
)
|
|
|
|
func TestParseKnowledgeFileSplitsLongBlocksForEmbedding(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "long.txt")
|
|
longLine := strings.Repeat("knowledge content ", 900)
|
|
if err := os.WriteFile(path, []byte(longLine), 0644); err != nil {
|
|
t.Fatalf("write knowledge file failed: %v", err)
|
|
}
|
|
|
|
chunks, err := parseKnowledgeFile(path, dir)
|
|
if err != nil {
|
|
t.Fatalf("parse failed: %v", err)
|
|
}
|
|
if len(chunks) < 2 {
|
|
t.Fatalf("expected long block to be split, got %d chunks", len(chunks))
|
|
}
|
|
for _, chunk := range chunks {
|
|
if got := len([]rune(chunk.Content)); got > maxKnowledgeChunkContentRunes {
|
|
t.Fatalf("chunk exceeded limit: %d", got)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestRebuildKnowledgeIndexCountsOnlyRootKnowledgeFiles(t *testing.T) {
|
|
dir := t.TempDir()
|
|
for _, name := range []string{"a.pdf", "b.pdf", "c.xlsx", "d.xlsx", "e.docx", "f.docx"} {
|
|
if err := os.WriteFile(filepath.Join(dir, name), []byte("placeholder"), 0644); err != nil {
|
|
t.Fatalf("write %s failed: %v", name, err)
|
|
}
|
|
}
|
|
for _, name := range []string{".keep", "index.json", "embedding_index.json"} {
|
|
if err := os.WriteFile(filepath.Join(dir, name), []byte("{}"), 0644); err != nil {
|
|
t.Fatalf("write %s failed: %v", name, err)
|
|
}
|
|
}
|
|
sub := filepath.Join(dir, "after_sales_cases")
|
|
if err := os.MkdirAll(sub, 0755); err != nil {
|
|
t.Fatalf("mkdir subdir failed: %v", err)
|
|
}
|
|
if err := os.WriteFile(filepath.Join(sub, "hidden.md"), []byte("hidden content"), 0644); err != nil {
|
|
t.Fatalf("write hidden failed: %v", err)
|
|
}
|
|
|
|
allowed := map[string]bool{".pdf": true, ".xlsx": true, ".docx": true, ".md": true}
|
|
entries, err := os.ReadDir(dir)
|
|
if err != nil {
|
|
t.Fatalf("read dir failed: %v", err)
|
|
}
|
|
count := 0
|
|
for _, entry := range entries {
|
|
if entry.IsDir() {
|
|
continue
|
|
}
|
|
if isRootKnowledgeFile(entry.Name(), filepath.Ext(entry.Name()), allowed, "index.json", "embedding_index.json") {
|
|
count++
|
|
}
|
|
}
|
|
if count != 6 {
|
|
t.Fatalf("expected 6 root upload files, got %d", count)
|
|
}
|
|
}
|
|
|
|
// TestRebuildKnowledgeIndexScansSubdirectories 锁住递归扫描行为:
|
|
// 知识库按分类分文件夹组织时(文件在子目录里),重建必须把子目录里的文件
|
|
// 一并索引。这是“重置索引后向量仍为空”那个问题的根因回归测试。
|
|
func TestRebuildKnowledgeIndexScansSubdirectories(t *testing.T) {
|
|
dir := t.TempDir()
|
|
// 根目录故意不放任何知识文件,全部放进多层子目录。
|
|
files := map[string]string{
|
|
filepath.Join("01_产品", "数控机床", "VMC850规格.md"): "VMC850 立式加工中心,主轴转速 8000rpm。",
|
|
filepath.Join("03_售后", "故障排查", "常见故障.md"): "报警 E01 表示伺服过载,请检查负载。",
|
|
filepath.Join("readme.txt"): "", // 空文件,应进 FailedFiles 不计入 FileCount
|
|
}
|
|
for rel, content := range files {
|
|
full := filepath.Join(dir, rel)
|
|
if err := os.MkdirAll(filepath.Dir(full), 0755); err != nil {
|
|
t.Fatalf("mkdir for %s: %v", rel, err)
|
|
}
|
|
if err := os.WriteFile(full, []byte(content), 0644); err != nil {
|
|
t.Fatalf("write %s: %v", rel, err)
|
|
}
|
|
}
|
|
|
|
cfg := config.NewDefaultAutoReplyConfig()
|
|
cfg.Knowledge.Directory = dir
|
|
cfg.Knowledge.IndexPath = filepath.Join(dir, "index.json")
|
|
cfg.Retrieval.EmbeddingIndexPath = filepath.Join(dir, "embedding_index.json")
|
|
engine := testAutoReplyEngine(cfg)
|
|
|
|
idx, err := engine.rebuildKnowledgeIndex()
|
|
if err != nil {
|
|
t.Fatalf("rebuildKnowledgeIndex failed: %v", err)
|
|
}
|
|
if idx.FileCount != 2 {
|
|
t.Fatalf("expected 2 indexed files from subdirectories, got %d (chunks=%d failed=%v)", idx.FileCount, len(idx.Chunks), idx.FailedFiles)
|
|
}
|
|
if len(idx.Chunks) == 0 {
|
|
t.Fatal("expected chunks from subdirectory files, got none")
|
|
}
|
|
// 确认子目录文件的相对路径作为 Source 被正确记录(用 / 分隔)。
|
|
sources := make(map[string]bool)
|
|
for _, chunk := range idx.Chunks {
|
|
sources[chunk.Source] = true
|
|
}
|
|
if !sources["01_产品/数控机床/VMC850规格.md"] {
|
|
t.Fatalf("expected nested source path recorded, got sources=%v", sources)
|
|
}
|
|
}
|
|
|
|
func TestParsePDFKnowledgeFileExtractsTextLayer(t *testing.T) {
|
|
path := filepath.Join(t.TempDir(), "text.pdf")
|
|
writeMinimalTextPDF(t, path, "AgentBox PDF content 123")
|
|
|
|
blocks, err := parsePDFKnowledgeFile(path)
|
|
if err != nil {
|
|
t.Fatalf("parse pdf failed: %v", err)
|
|
}
|
|
if got := knowledgeBlockContent(blocks); !strings.Contains(got, "AgentBox PDF content 123") {
|
|
t.Fatalf("expected text-layer content, got %q", got)
|
|
}
|
|
}
|
|
|
|
func TestParsePDFKnowledgeFileUsesOCRForEmptyTextPage(t *testing.T) {
|
|
path := filepath.Join(t.TempDir(), "scan.pdf")
|
|
writeMinimalBlankPDF(t, path, 1)
|
|
restore := stubPDFOCR(t, "OCR page content", nil)
|
|
defer restore()
|
|
|
|
blocks, err := parsePDFKnowledgeFile(path)
|
|
if err != nil {
|
|
t.Fatalf("parse pdf failed: %v", err)
|
|
}
|
|
if got := knowledgeBlockContent(blocks); !strings.Contains(got, "OCR page content") {
|
|
t.Fatalf("expected OCR content, got %q", got)
|
|
}
|
|
}
|
|
|
|
func TestParsePDFKnowledgeFileLimitsOCRToFirstTwentyPages(t *testing.T) {
|
|
path := filepath.Join(t.TempDir(), "long-scan.pdf")
|
|
writeMinimalBlankPDF(t, path, 21)
|
|
calls := 0
|
|
restore := stubPDFOCRFunc(t, func(imagePath string, pageNum int) (string, error) {
|
|
calls++
|
|
if pageNum > maxPDFOCRPages {
|
|
t.Fatalf("unexpected OCR call for page %d", pageNum)
|
|
}
|
|
return fmt.Sprintf("page %d text", pageNum), nil
|
|
})
|
|
defer restore()
|
|
|
|
blocks, err := parsePDFKnowledgeFile(path)
|
|
if err == nil || !strings.Contains(err.Error(), "PDF超过20页") {
|
|
t.Fatalf("expected over-limit warning, got blocks=%d err=%v", len(blocks), err)
|
|
}
|
|
if calls != maxPDFOCRPages {
|
|
t.Fatalf("expected %d OCR calls, got %d", maxPDFOCRPages, calls)
|
|
}
|
|
if len(blocks) != maxPDFOCRPages {
|
|
t.Fatalf("expected %d OCR blocks, got %d", maxPDFOCRPages, len(blocks))
|
|
}
|
|
}
|
|
|
|
func stubPDFOCR(t *testing.T, text string, err error) func() {
|
|
t.Helper()
|
|
return stubPDFOCRFunc(t, func(imagePath string, pageNum int) (string, error) {
|
|
return text, err
|
|
})
|
|
}
|
|
|
|
func stubPDFOCRFunc(t *testing.T, ocr func(string, int) (string, error)) func() {
|
|
t.Helper()
|
|
oldFind := pdfFindRenderer
|
|
oldOCR := pdfOCRPageImage
|
|
tmp := t.TempDir()
|
|
renderer := filepath.Join(tmp, "pdftoppm.exe")
|
|
if err := os.WriteFile(renderer, []byte("stub"), 0644); err != nil {
|
|
t.Fatalf("write renderer stub failed: %v", err)
|
|
}
|
|
pdfFindRenderer = func() (string, error) { return renderer, nil }
|
|
pdfOCRPageImage = ocr
|
|
oldRender := renderPDFPageFunc
|
|
renderPDFPageFunc = func(renderer string, pdfPath string, pageNum int, tmpDir string) (string, error) {
|
|
imagePath := filepath.Join(tmpDir, fmt.Sprintf("page-%d.png", pageNum))
|
|
if err := os.WriteFile(imagePath, []byte{0x89, 0x50, 0x4e, 0x47}, 0644); err != nil {
|
|
return "", err
|
|
}
|
|
return imagePath, nil
|
|
}
|
|
return func() {
|
|
pdfFindRenderer = oldFind
|
|
pdfOCRPageImage = oldOCR
|
|
renderPDFPageFunc = oldRender
|
|
}
|
|
}
|
|
|
|
func writeMinimalTextPDF(t *testing.T, path string, text string) {
|
|
t.Helper()
|
|
writeRawPDF(t, path, []string{fmt.Sprintf("BT /F1 12 Tf 72 720 Td (%s) Tj ET", escapePDFString(text))})
|
|
}
|
|
|
|
func writeMinimalBlankPDF(t *testing.T, path string, pages int) {
|
|
t.Helper()
|
|
streams := make([]string, pages)
|
|
for i := range streams {
|
|
streams[i] = ""
|
|
}
|
|
writeRawPDF(t, path, streams)
|
|
}
|
|
|
|
// writeRawPDF assembles a minimal PDF 1.4 document in memory and writes it to
// path, failing the test if the file cannot be written.
//
// Object layout: object 1 is the catalog, object 2 the page tree, and page i
// (0-based) uses object 3+2*i for the page dictionary and 4+2*i for its
// content stream. Each entry of pageStreams becomes the content stream of one
// page. Every page declares a Helvetica Type1 font under the name F1.
func writeRawPDF(t *testing.T, path string, pageStreams []string) {
	t.Helper()

	var b strings.Builder
	// offsets[id] is the byte offset of object id. Index 0 is a placeholder
	// for the free entry that always heads the cross-reference table.
	offsets := []int{0}
	// writeObj must be called with ids in ascending order starting at 1 so
	// that an object's position in offsets equals its id.
	writeObj := func(id int, body string) {
		offsets = append(offsets, b.Len())
		fmt.Fprintf(&b, "%d 0 obj\n%s\nendobj\n", id, body)
	}

	b.WriteString("%PDF-1.4\n")

	pageCount := len(pageStreams)
	kids := make([]string, 0, pageCount)
	for i := 0; i < pageCount; i++ {
		kids = append(kids, fmt.Sprintf("%d 0 R", 3+i*2))
	}

	writeObj(1, "<< /Type /Catalog /Pages 2 0 R >>")
	writeObj(2, fmt.Sprintf("<< /Type /Pages /Kids [%s] /Count %d >>", strings.Join(kids, " "), pageCount))
	for i, stream := range pageStreams {
		pageID := 3 + i*2
		contentID := pageID + 1
		writeObj(pageID, fmt.Sprintf("<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> >> >> /Contents %d 0 R >>", contentID))
		writeObj(contentID, fmt.Sprintf("<< /Length %d >>\nstream\n%s\nendstream", len(stream), stream))
	}

	// Cross-reference table: one 20-byte entry per object, free entry first.
	xref := b.Len()
	fmt.Fprintf(&b, "xref\n0 %d\n0000000000 65535 f \n", len(offsets))
	for _, off := range offsets[1:] {
		fmt.Fprintf(&b, "%010d 00000 n \n", off)
	}
	fmt.Fprintf(&b, "trailer\n<< /Root 1 0 R /Size %d >>\nstartxref\n%d\n%%%%EOF\n", len(offsets), xref)

	if err := os.WriteFile(path, []byte(b.String()), 0644); err != nil {
		t.Fatalf("write pdf failed: %v", err)
	}
}
|
|
|
|
// escapePDFString returns text with every backslash, "(" and ")" preceded by
// a backslash. All other bytes are passed through unchanged.
func escapePDFString(text string) string {
	// A single pass gives the same result as escaping backslashes first and
	// parentheses afterwards: inserted backslashes are never re-escaped.
	return strings.NewReplacer(
		`\`, `\\`,
		`(`, `\(`,
		`)`, `\)`,
	).Replace(text)
}
|
|
|
|
func TestKnowledgeConfigStillUsesPDFExtension(t *testing.T) {
|
|
cfg := config.NewDefaultAutoReplyConfig()
|
|
found := false
|
|
for _, ext := range cfg.Knowledge.SupportedExtensions {
|
|
if ext == ".pdf" {
|
|
found = true
|
|
break
|
|
}
|
|
}
|
|
if !found {
|
|
t.Fatal("expected default knowledge config to support .pdf")
|
|
}
|
|
}
|