Initial qiwei secondary development handoff

This commit is contained in:
2026-06-23 21:11:20 +08:00
commit 858cb68f4f
207 changed files with 52782 additions and 0 deletions

558
helper/auto_reply_media.go Normal file
View File

@@ -0,0 +1,558 @@
package main
import (
	"encoding/base64"
	"fmt"
	"io"
	"mime"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"time"
)
// Function-valued package variables used as call-through points by the media
// pipeline in this file. Each is initialised to its default implementation.
// NOTE(review): presumably these exist so tests can substitute fakes — confirm
// against the test files.
var (
// autoReplyVisionRecognizer is what prepareMediaMessage calls to turn an
// image, emoji or video message into text.
autoReplyVisionRecognizer = defaultAutoReplyVisionRecognizer
// autoReplyAudioTranscriber is what prepareMediaMessage calls to turn a voice
// message into text when the message carries no transcript of its own.
autoReplyAudioTranscriber = defaultAutoReplyAudioTranscriber
// audioFindSilkDecoder locates the silk decoder executable for convertSilkToWav.
audioFindSilkDecoder = findSilkDecoder
// audioConvertSilkToWav is the first conversion optionalSilkToStandardAudio tries.
audioConvertSilkToWav = convertSilkToWav
// audioFindFFmpeg is consulted by optionalSilkToStandardAudio before it falls
// back to the mp3 conversion.
audioFindFFmpeg = findFFmpeg
// audioConvertSilkToMp3 is the fallback conversion used when the wav
// conversion fails and audioFindFFmpeg succeeds.
audioConvertSilkToMp3 = convertSilkToMp3
)
// prepareMediaMessage rewrites a non-text message in place so that msg.Content
// holds text, and sets msg.MessageType for the voice, image, emoji and video
// kinds.
//
// A nil msg is a no-op. An empty MediaKind is derived from RawType, and a
// message with raw type 11047 that looks like a sticker or image is always
// handled as "emoji". Errors from transcription (voice) and recognition
// (image, emoji) are returned to the caller. For video the recognition step is
// best-effort: its error is ignored and only a non-blank result is appended.
// Any other kind falls back to mediaTextDescription and fails with an
// "unsupported media message type" error when that yields nothing.
func (e *AutoReplyEngine) prepareMediaMessage(msg *autoReplyMessage) error {
	// Raw type that mediaKindForRawType maps to "link".
	const linkRawType = 11047

	if msg == nil {
		return nil
	}
	if msg.MediaKind == "" {
		msg.MediaKind = mediaKindForRawType(msg.RawType)
	}
	if msg.RawType == linkRawType && looksLikeStickerOrImage(*msg) {
		msg.MediaKind = "emoji"
	}

	if msg.MediaKind == "voice" {
		// Prefer a transcript that arrived with the message; only call the
		// transcriber when there is none.
		transcript := strings.TrimSpace(msg.VoiceText)
		if transcript == "" {
			recognized, err := autoReplyAudioTranscriber(e, *msg)
			if err != nil {
				return err
			}
			transcript = strings.TrimSpace(recognized)
		}
		msg.Content = transcript
		msg.MessageType = "voice"
		return nil
	}

	if msg.MediaKind == "image" || msg.MediaKind == "emoji" {
		recognized, err := autoReplyVisionRecognizer(e, *msg)
		if err != nil {
			return err
		}
		msg.Content = strings.TrimSpace(recognized)
		msg.MessageType = msg.MediaKind
		return nil
	}

	if msg.MediaKind == "video" {
		if summary := mediaTextDescription(*msg); summary != "" {
			msg.Content = summary
		}
		hasMedia := msg.MediaURL != "" || msg.MediaLocalPath != ""
		if hasMedia {
			cover, err := autoReplyVisionRecognizer(e, *msg)
			if err == nil && strings.TrimSpace(cover) != "" {
				msg.Content = strings.TrimSpace(msg.Content + "\n视频封面识别" + cover)
			}
		}
		msg.MessageType = "video"
		return nil
	}

	summary := mediaTextDescription(*msg)
	if summary == "" {
		return fmt.Errorf("unsupported media message type: %s", msg.MediaKind)
	}
	msg.Content = summary
	return nil
}
// looksLikeStickerOrImageText reports whether content contains one of the
// marker substrings used to recognise sticker/image placeholder text.
//
// NOTE(review): the last two markers look like the first two ("表情", "图片")
// after a UTF-8/GBK mis-decoding; they are matched exactly as written —
// confirm whether upstream really delivers text in that form.
func looksLikeStickerOrImageText(content string) bool {
	trimmed := strings.TrimSpace(content)
	for _, marker := range [...]string{"表情", "图片", "琛ㄦ儏", "鍥剧墖"} {
		if strings.Contains(trimmed, marker) {
			return true
		}
	}
	return false
}
// looksLikeStickerOrImage reports whether msg should be treated as a sticker
// or image. Content with a sticker/image marker always qualifies; any other
// non-blank content disqualifies the message; with blank content the message
// qualifies when it carries a media URL, a file id or a local path.
func looksLikeStickerOrImage(msg autoReplyMessage) bool {
	switch {
	case looksLikeStickerOrImageText(msg.Content):
		return true
	case strings.TrimSpace(msg.Content) != "":
		return false
	}
	for _, ref := range []string{msg.MediaURL, msg.MediaFileID, msg.MediaLocalPath} {
		if strings.TrimSpace(ref) != "" {
			return true
		}
	}
	return false
}
// defaultAutoReplyVisionRecognizer sends the message's image to the
// OpenAI-compatible vision chat call and returns the trimmed answer.
//
// The image reference starts as the trimmed MediaURL and is replaced by an
// inline data URL whenever the media can be obtained as a local file and
// encoded; failures on that path are ignored so the URL remains the fallback.
// It returns an error when no reference is available or when the vision call
// fails (the call error is wrapped together with the model name).
func defaultAutoReplyVisionRecognizer(e *AutoReplyEngine, msg autoReplyMessage) (string, error) {
	cfg := e.getConfig()

	imageRef := strings.TrimSpace(msg.MediaURL)
	localPath, pathErr := ensureAutoReplyMediaLocalPath(msg)
	if pathErr == nil && localPath != "" {
		inline, encodeErr := imageDataURLFromFile(localPath)
		if encodeErr == nil && inline != "" {
			imageRef = inline
		}
	}
	if imageRef == "" {
		return "", fmt.Errorf("missing image url or local file")
	}

	result, err := callOpenAICompatibleVisionChat(
		cfg.AI,
		buildVisionRecognitionSystemPrompt(cfg),
		buildNonTextAutoReplyUserPrompt(msg),
		imageRef,
	)
	if err != nil {
		return "", fmt.Errorf("vision recognition failed (model=%s): %w", visionRequestConfig(cfg.AI).Model, err)
	}
	return strings.TrimSpace(result.Answer), nil
}
// defaultAutoReplyAudioTranscriber obtains the voice message as a local file,
// converts it when it is a .silk file, and then tries the transcription
// backends selected by inferAudioMode until one succeeds.
//
// Backend order per mode:
//   - audioModeParaformer: DashScope Paraformer, then the OpenAI-compatible
//     transcription call;
//   - audioModeTranscription / audioModeCustomHTTP: the OpenAI-compatible
//     transcription call only;
//   - anything else: the OpenAI-compatible audio chat call, then the
//     transcription call.
//
// Every failure message (plus any configuration warning) is collected and
// reported in the final error. A download error is returned as-is, and a silk
// conversion error aborts before any backend is tried.
func defaultAutoReplyAudioTranscriber(e *AutoReplyEngine, msg autoReplyMessage) (string, error) {
	cfg := e.getConfig()
	audioPath, err := ensureAutoReplyMediaLocalPath(msg)
	if err != nil {
		return "", err
	}

	var failures []string
	if warning := audioConfigWarning(cfg.AI); warning != "" {
		failures = append(failures, warning)
	}
	mode := inferAudioMode(cfg.AI)

	if strings.ToLower(filepath.Ext(audioPath)) == ".silk" {
		converted, ok, convErr := optionalSilkToStandardAudio(audioPath)
		if convErr != nil {
			failures = append(failures, "silk 转码失败: "+convErr.Error())
			return "", fmt.Errorf("voice recognition failed (mode=%s model=%s): 缺少可用的企微 silk 语音转码能力或转码失败%s",
				mode, fallbackString(cfg.AI.AudioModel, defaultAudioModel), formatAudioFailures(failures))
		}
		if ok {
			audioPath = converted
		}
	}

	switch mode {
	case audioModeParaformer:
		text, paraformerErr := callDashScopeParaformerTranscription(cfg.AI, audioSourceURLForParaformer(msg, audioPath))
		if paraformerErr == nil {
			return text, nil
		}
		failures = append(failures, paraformerErr.Error())

		fallbackText, fallbackErr := callOpenAICompatibleAudioTranscription(cfg.AI, audioPath)
		if fallbackErr == nil {
			return fallbackText, nil
		}
		failures = append(failures, fallbackErr.Error())

	case audioModeTranscription, audioModeCustomHTTP:
		text, transcribeErr := callOpenAICompatibleAudioTranscription(cfg.AI, audioPath)
		if transcribeErr == nil {
			return text, nil
		}
		failures = append(failures, transcribeErr.Error())

	default:
		chatText, chatErr := callOpenAICompatibleAudioChatTranscription(cfg.AI, audioPath)
		if chatErr == nil {
			return chatText, nil
		}
		failures = append(failures, chatErr.Error())

		text, transcribeErr := callOpenAICompatibleAudioTranscription(cfg.AI, audioPath)
		if transcribeErr == nil {
			return text, nil
		}
		failures = append(failures, transcribeErr.Error())
	}

	return "", fmt.Errorf("voice recognition failed (mode=%s model=%s): %s",
		mode, fallbackString(cfg.AI.AudioModel, defaultAudioModel), strings.Join(failures, " | "))
}
// optionalSilkToStandardAudio converts a .silk file (extension matched
// case-insensitively) to a standard audio file and returns the new path.
//
// Paths with any other extension are returned unchanged with ok == false.
// For .silk input the wav conversion is tried first; if it fails and ffmpeg
// can be located, the mp3 conversion is used as a fallback. The returned error
// text carries both the wav failure and the reason the fallback was
// unavailable or failed. Note that ok is false when ffmpeg is missing but true
// when the mp3 fallback itself fails; callers should check the error first.
func optionalSilkToStandardAudio(path string) (string, bool, error) {
	if !strings.EqualFold(filepath.Ext(path), ".silk") {
		return path, false, nil
	}

	wavPath, wavErr := audioConvertSilkToWav(path)
	if wavErr == nil {
		return wavPath, true, nil
	}

	if _, ffmpegErr := audioFindFFmpeg(); ffmpegErr != nil {
		return "", false, fmt.Errorf("内置 silk 解码失败: %v也未找到可用 ffmpeg: %v", wavErr, ffmpegErr)
	}

	mp3Path, mp3Err := audioConvertSilkToMp3(path)
	if mp3Err != nil {
		return "", true, fmt.Errorf("内置 silk 解码失败: %vffmpeg 兜底也失败: %v", wavErr, mp3Err)
	}
	return mp3Path, true, nil
}
// convertSilkToWav runs the silk decoder on silkPath and returns the path of
// the resulting .wav file, which is written next to the input with the
// extension swapped.
//
// It fails when no decoder can be located, when the decoder exits with an
// error (its combined output, truncated, is included in the message), when the
// output file is missing, or when the output is 44 bytes or smaller —
// presumably a bare WAV header with no samples.
func convertSilkToWav(silkPath string) (string, error) {
	decoder, err := audioFindSilkDecoder()
	if err != nil {
		return "", err
	}

	wavPath := strings.TrimSuffix(silkPath, filepath.Ext(silkPath)) + ".wav"
	out, runErr := exec.Command(decoder, "-in", silkPath, "-out", wavPath).CombinedOutput()
	if runErr != nil {
		return "", fmt.Errorf("silkdecode执行失败: %v, 输出: %s", runErr, truncateText(string(out), 240))
	}

	info, statErr := os.Stat(wavPath)
	switch {
	case statErr != nil:
		return "", fmt.Errorf("silkdecode未生成wav: %w", statErr)
	case info.Size() <= 44:
		return "", fmt.Errorf("silkdecode生成的wav为空或损坏: %s", wavPath)
	}
	return wavPath, nil
}
// findSilkDecoder returns the path of a silk decoder executable.
//
// Search order: the current working directory, then the directory of the
// running executable. Within each directory every known decoder file name is
// tried in turn, first under tools/audio and then directly in the directory.
// If none of those files exist, "silkdecode" is looked up on PATH. An error is
// returned when nothing is found.
func findSilkDecoder() (string, error) {
	decoderNames := [...]string{"silkdecode.exe", "silk_decoder.exe", "silk-v3-decoder.exe"}

	// A directory that cannot be determined is simply left out of the search.
	var roots []string
	if workDir, err := os.Getwd(); err == nil {
		roots = append(roots, workDir)
	}
	if exePath, err := os.Executable(); err == nil {
		roots = append(roots, filepath.Dir(exePath))
	}

	for _, root := range roots {
		for _, name := range decoderNames {
			locations := [...]string{
				filepath.Join(root, "tools", "audio", name),
				filepath.Join(root, name),
			}
			for _, location := range locations {
				if _, err := os.Stat(location); err == nil {
					return location, nil
				}
			}
		}
	}

	if onPath, err := exec.LookPath("silkdecode"); err == nil {
		return onPath, nil
	}
	return "", fmt.Errorf("缺少随包语音转码组件 silkdecode.exe")
}
// audioSourceURLForParaformer picks the first of msg.MediaURL and path that,
// after trimming, starts with http://, https:// or oss:// (compared
// case-insensitively) and returns it trimmed. It returns "" when neither
// value is such a URL.
func audioSourceURLForParaformer(msg autoReplyMessage, path string) string {
	remoteSchemes := [...]string{"http://", "https://", "oss://"}
	for _, raw := range []string{msg.MediaURL, path} {
		candidate := strings.TrimSpace(raw)
		lowered := strings.ToLower(candidate)
		for _, scheme := range remoteSchemes {
			if strings.HasPrefix(lowered, scheme) {
				return candidate
			}
		}
	}
	return ""
}
// formatAudioFailures renders the non-blank entries of failures as a suffix of
// the form ";附加信息: a | b". Entries are trimmed, blank ones are skipped, and
// the result is "" when nothing remains.
func formatAudioFailures(failures []string) string {
	var joined strings.Builder
	for _, raw := range failures {
		item := strings.TrimSpace(raw)
		if item == "" {
			continue
		}
		if joined.Len() > 0 {
			joined.WriteString(" | ")
		}
		joined.WriteString(item)
	}
	if joined.Len() == 0 {
		return ""
	}
	return ";附加信息: " + joined.String()
}
// mediaKindForRawType maps a raw message type code to its media kind:
// 11042 image, 11043 video, 11044 voice, 11045 file, 11046 location,
// 11047 link. Every other code yields "non_text".
func mediaKindForRawType(rawType int) string {
	// The known codes are contiguous, so the kind is looked up by offset from
	// the first one.
	const firstKnownRawType = 11042
	kinds := [...]string{"image", "video", "voice", "file", "location", "link"}

	if offset := rawType - firstKnownRawType; offset >= 0 && offset < len(kinds) {
		return kinds[offset]
	}
	return "non_text"
}
// mediaTextDescription builds a newline-separated text summary of msg.
//
// It includes the trimmed content unless that content is blank or starts with
// "[", and a "文件:<name>" line when a file name is present. When neither
// applies and the message has a media kind, the result is whatever
// nonTextMessageDescription returns for the message.
func mediaTextDescription(msg autoReplyMessage) string {
	var lines []string

	body := strings.TrimSpace(msg.Content)
	if body != "" && !strings.HasPrefix(body, "[") {
		lines = append(lines, body)
	}
	if msg.MediaFileName != "" {
		lines = append(lines, "文件:"+msg.MediaFileName)
	}
	if len(lines) == 0 && msg.MediaKind != "" {
		lines = append(lines, nonTextMessageDescription(msg))
	}
	return strings.Join(lines, "\n")
}
// mediaRecognitionFallbackAnswer returns the fixed reply text that asks the
// user for a written description. There is one wording for voice, one shared
// by image, emoji and video, and a generic one for every other media kind.
func mediaRecognitionFallbackAnswer(msg autoReplyMessage) string {
	kind := msg.MediaKind
	if kind == "voice" {
		return "我这边暂时无法识别这条语音内容,麻烦您补充一句文字说明,我继续帮您处理。"
	}
	if kind == "image" || kind == "emoji" || kind == "video" {
		return "我这边暂时无法识别这条图片/视频内容,麻烦您补充一句文字说明,我继续帮您处理。"
	}
	return "我这边暂时无法识别这条内容,麻烦您补充一句文字说明,我继续帮您处理。"
}
// ensureAutoReplyMediaLocalPath returns a path to a local copy of the
// message's media, downloading it when necessary.
//
// An existing MediaLocalPath is returned directly. Otherwise a save path is
// generated from the file id (or the last URL element, or "<kind>_<rawType>")
// and the sources are tried in order:
//  1. DownloadMediaFileForClient, only when a URL is present together with an
//     AES key, an auth key or a positive size;
//  2. a plain HTTP download of the URL;
//  3. DownloadFileByFileIdForClient, when a file id is present.
//
// When a client download reports success but the file is not on disk, that is
// returned as an error immediately rather than falling through to the next
// source.
func ensureAutoReplyMediaLocalPath(msg autoReplyMessage) (string, error) {
	if local := strings.TrimSpace(msg.MediaLocalPath); local != "" {
		if _, err := os.Stat(local); err == nil {
			return local, nil
		}
	}

	ext := mediaExtForMessage(msg)
	name := msg.MediaFileID
	if name == "" {
		name = filepath.Base(strings.TrimSpace(msg.MediaURL))
	}
	switch name {
	case "", ".", string(filepath.Separator):
		// Nothing usable to name the file after; fall back to kind and type.
		name = fmt.Sprintf("%s_%d", msg.MediaKind, msg.RawType)
	}

	savePath := generateSavePath("auto_reply_media", name, ext)
	if savePath == "" {
		return "", fmt.Errorf("failed to create media save path")
	}
	saved := func() bool {
		_, err := os.Stat(savePath)
		return err == nil
	}

	if msg.MediaURL != "" {
		hasDownloadHints := msg.MediaAESKey != "" || msg.MediaAuthKey != "" || msg.MediaSize > 0
		if hasDownloadHints && DownloadMediaFileForClient(uint32(msg.ClientID), msg.MediaURL, msg.MediaAuthKey, msg.MediaAESKey, int(msg.MediaSize), savePath) {
			if saved() {
				return savePath, nil
			}
			return "", fmt.Errorf("media download reported success but file missing: %s", savePath)
		}
		if err := downloadPlainMedia(msg.MediaURL, savePath); err == nil {
			return savePath, nil
		}
	}

	if msg.MediaFileID != "" && DownloadFileByFileIdForClient(uint32(msg.ClientID), msg.MediaAESKey, msg.MediaFileID, savePath, int(msg.MediaSize), msg.MediaFileType) {
		if saved() {
			return savePath, nil
		}
		return "", fmt.Errorf("file_id download reported success but file missing: %s", savePath)
	}

	return "", fmt.Errorf("media download failed")
}
// downloadPlainMedia fetches url with a plain HTTP GET and writes the response
// body to savePath, creating the parent directory if needed.
//
// It returns an error when the request fails, when the status is not 2xx, or
// when the file cannot be created, fully written, or closed. On a write or
// close failure the partially written file is removed, so savePath never
// holds a truncated download.
func downloadPlainMedia(url string, savePath string) error {
	// The whole exchange, including reading the body, is bounded so a stalled
	// server cannot block the caller forever. A nil Transport means the client
	// still shares http.DefaultTransport's connection pool.
	client := &http.Client{Timeout: 2 * time.Minute}

	resp, err := client.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("download status %d", resp.StatusCode)
	}
	if err := os.MkdirAll(filepath.Dir(savePath), 0755); err != nil {
		return err
	}

	file, err := os.Create(savePath)
	if err != nil {
		return err
	}
	if _, err := io.Copy(file, resp.Body); err != nil {
		// Best-effort cleanup; the copy error is the one worth reporting.
		_ = file.Close()
		_ = os.Remove(savePath)
		return err
	}
	// Close can surface a deferred write error, so it is checked explicitly.
	if err := file.Close(); err != nil {
		_ = os.Remove(savePath)
		return err
	}
	return nil
}
// imageDataURLFromFile reads the file at path and returns it as a base64
// "data:" URL with an image MIME type.
//
// The MIME type comes from the file extension when that maps to an image
// type. Otherwise (no mapping, or a non-image mapping such as .bin or .txt)
// the content is sniffed, so an image saved under a misleading extension is
// still labelled correctly. If neither source yields an image type,
// "image/jpeg" is used. It returns an error when the file cannot be read or
// is empty.
func imageDataURLFromFile(path string) (string, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return "", err
	}
	if len(data) == 0 {
		return "", fmt.Errorf("empty image file")
	}

	mimeType := mime.TypeByExtension(strings.ToLower(filepath.Ext(path)))
	if !strings.HasPrefix(mimeType, "image/") {
		// Extension is unknown or not an image type: trust the bytes instead.
		mimeType = http.DetectContentType(data)
	}
	if !strings.HasPrefix(mimeType, "image/") {
		mimeType = "image/jpeg"
	}
	return "data:" + mimeType + ";base64," + base64.StdEncoding.EncodeToString(data), nil
}
// audioDataURLFromFile reads the file at path and returns it as a base64
// "data:" URL.
//
// The MIME type is taken from the lower-cased file extension via the mime
// package. When that has no mapping, a small built-in table covers .silk,
// .amr, .mp3, .wav and .m4a, and any other extension is resolved by sniffing
// the content. It returns an error when the file cannot be read or is empty.
func audioDataURLFromFile(path string) (string, error) {
	raw, err := os.ReadFile(path)
	switch {
	case err != nil:
		return "", err
	case len(raw) == 0:
		return "", fmt.Errorf("empty audio file")
	}

	ext := strings.ToLower(filepath.Ext(path))
	mimeType := mime.TypeByExtension(ext)
	if mimeType == "" {
		knownAudioTypes := map[string]string{
			".silk": "audio/silk",
			".amr":  "audio/amr",
			".mp3":  "audio/mpeg",
			".wav":  "audio/wav",
			".m4a":  "audio/mp4",
		}
		if known, ok := knownAudioTypes[ext]; ok {
			mimeType = known
		} else {
			mimeType = http.DetectContentType(raw)
		}
	}
	if mimeType == "" {
		mimeType = "application/octet-stream"
	}

	encoded := base64.StdEncoding.EncodeToString(raw)
	return "data:" + mimeType + ";base64," + encoded, nil
}
// mediaExtForMessage chooses the file extension (including the leading dot)
// to use when saving the message's media.
//
// Preference order: the extension of MediaFileName; the extension of the
// MediaURL path, accepted only when it is 2 to 8 bytes long; a default by
// media kind (.silk for voice, .mp4 for video, .bin for file, .jpg otherwise).
func mediaExtForMessage(msg autoReplyMessage) string {
	if ext := filepath.Ext(msg.MediaFileName); ext != "" {
		return ext
	}

	// Drop any query string or fragment before looking for an extension.
	// Otherwise "a.jpg?x=1" yields ".jpg?x=1" and "?v=1.2" yields ".2", and a
	// "?" in the extension is not a valid file name character on Windows.
	rawURL := strings.TrimSpace(msg.MediaURL)
	if cut := strings.IndexAny(rawURL, "?#"); cut >= 0 {
		rawURL = rawURL[:cut]
	}
	// len(ext) > 1 rejects a bare "." left by a path that ends in a dot.
	if ext := filepath.Ext(rawURL); len(ext) > 1 && len(ext) <= 8 {
		return ext
	}

	switch msg.MediaKind {
	case "voice":
		return ".silk"
	case "video":
		return ".mp4"
	case "file":
		return ".bin"
	default:
		return ".jpg"
	}
}
// fillMediaFieldsFromValue copies media metadata from the first CDN map found
// inside value into msg.
//
// Nothing happens when msg is nil or no non-empty CDN map is found. The AES
// key, auth key, file id, file name, file type and size are always
// overwritten (each accepts snake_case and camelCase keys). MediaLocalPath is
// only set when an absolute path is found, and MediaURL is only filled in when
// it is currently empty.
func fillMediaFieldsFromValue(msg *autoReplyMessage, value interface{}) {
	if msg == nil {
		return
	}
	cdn := firstMediaCdnMap(value)
	if len(cdn) == 0 {
		return
	}

	// text returns the first non-blank string stored under any of keys.
	text := func(keys ...string) string {
		candidates := make([]interface{}, 0, len(keys))
		for _, key := range keys {
			candidates = append(candidates, cdn[key])
		}
		return firstNonEmptyString(candidates...)
	}

	msg.MediaAESKey = text("aes_key", "aesKey")
	msg.MediaAuthKey = text("auth_key", "authKey")
	msg.MediaFileID = text("file_id", "fileId")
	msg.MediaFileName = text("file_name", "fileName", "name")

	if localPath := firstLocalMediaPathFromValue(cdn); localPath != "" {
		msg.MediaLocalPath = localPath
	}

	fileType := firstNonNil(cdn["file_type"], cdn["fileType"])
	msg.MediaFileType = intFromAny(fileType)
	size := firstNonNil(cdn["size"], cdn["file_size"], cdn["fileSize"])
	msg.MediaSize = int64(intFromAny(size))

	if msg.MediaURL == "" {
		msg.MediaURL = firstMediaURLFromValue(cdn)
	}
}
// firstVoiceTextFromValue searches a decoded JSON-like value for a voice
// transcript and returns the first cleaned, non-empty one, or "".
//
// For a map, the known transcript keys are checked on the map itself before
// its values are searched recursively. Slices are searched element by
// element. Any other value yields "". Because Go map iteration order is
// unspecified, which nested transcript wins is not deterministic when several
// sibling values contain one.
func firstVoiceTextFromValue(value interface{}) string {
	var found string
	// descend searches one child and remembers a hit in found.
	descend := func(child interface{}) bool {
		found = firstVoiceTextFromValue(child)
		return found != ""
	}

	switch node := value.(type) {
	case map[string]interface{}:
		transcriptKeys := [...]string{
			"voice_text", "voiceText", "voice_to_text", "voiceToText",
			"translate_text", "translateText", "translated_text", "translatedText",
			"trans_text", "transText", "transcript", "transcription",
			"recognition_text", "recognitionText", "asr_text", "asrText",
			"speech_text", "speechText", "text_content", "textContent",
		}
		for _, key := range transcriptKeys {
			if text := cleanVoiceTranscript(stringFromAny(node[key])); text != "" {
				return text
			}
		}
		for _, child := range node {
			if descend(child) {
				return found
			}
		}
	case []interface{}:
		for _, child := range node {
			if descend(child) {
				return found
			}
		}
	}
	return ""
}
// cleanVoiceTranscript normalises a transcript string. It trims surrounding
// whitespace, returns "" for blank input and for text wrapped in "{{" ... "}}",
// and strips the known leading labels.
//
// The labels are applied once each, in list order, re-trimming after every
// step. The list is kept exactly as-is, including repeated entries, because a
// repeated entry strips a second consecutive occurrence of that label.
func cleanVoiceTranscript(text string) string {
	cleaned := strings.TrimSpace(text)
	switch {
	case cleaned == "":
		return ""
	case strings.HasPrefix(cleaned, "{{") && strings.HasSuffix(cleaned, "}}"):
		return ""
	}

	labels := [...]string{"转文字完成", "转文字:", "转文字:", "语音转文字:", "语音转文字:", "转写:", "转写:"}
	for i := range labels {
		cleaned = strings.TrimSpace(strings.TrimPrefix(cleaned, labels[i]))
	}
	return cleaned
}
// firstMediaCdnMap searches a decoded JSON-like value for a CDN description
// map and returns it, or nil when there is none.
//
// On a map, a map stored directly under "cdn", "cdnData" or "c2cCdnData"
// (checked in that order) is returned as-is, even when empty. Otherwise the
// map's values, or a slice's elements, are searched recursively and the first
// non-empty result is returned.
func firstMediaCdnMap(value interface{}) map[string]interface{} {
	var children []interface{}

	if obj, isMap := value.(map[string]interface{}); isMap {
		for _, key := range [...]string{"cdn", "cdnData", "c2cCdnData"} {
			if direct, ok := obj[key].(map[string]interface{}); ok {
				return direct
			}
		}
		children = make([]interface{}, 0, len(obj))
		for _, child := range obj {
			children = append(children, child)
		}
	} else if list, isList := value.([]interface{}); isList {
		children = list
	}

	for _, child := range children {
		if found := firstMediaCdnMap(child); len(found) != 0 {
			return found
		}
	}
	return nil
}
// firstNonEmptyString converts each value with stringFromAny and returns the
// first result that is non-blank, trimmed of surrounding whitespace. It
// returns "" when every value is blank.
func firstNonEmptyString(values ...interface{}) string {
	for i := range values {
		if trimmed := strings.TrimSpace(stringFromAny(values[i])); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
// firstLocalMediaPathFromValue searches a decoded JSON-like value for an
// absolute local file path and returns the first one found, or "".
//
// On a map, the keys "local_path", "localPath", "path", "file_name" and
// "fileName" are checked in that order and a value is accepted only when
// filepath.IsAbs reports it absolute; otherwise the map's values are searched
// recursively. Slices are searched element by element. Any other value
// yields "".
func firstLocalMediaPathFromValue(value interface{}) string {
	if obj, isMap := value.(map[string]interface{}); isMap {
		for _, key := range [...]string{"local_path", "localPath", "path", "file_name", "fileName"} {
			candidate := strings.TrimSpace(stringFromAny(obj[key]))
			if candidate == "" || !filepath.IsAbs(candidate) {
				continue
			}
			return candidate
		}
		for _, child := range obj {
			if found := firstLocalMediaPathFromValue(child); found != "" {
				return found
			}
		}
		return ""
	}

	// A failed assertion leaves list nil, which makes the loop a no-op.
	list, _ := value.([]interface{})
	for _, child := range list {
		if found := firstLocalMediaPathFromValue(child); found != "" {
			return found
		}
	}
	return ""
}