package main import ( "encoding/base64" "fmt" "io" "mime" "net/http" "os" "os/exec" "path/filepath" "strings" ) var ( autoReplyVisionRecognizer = defaultAutoReplyVisionRecognizer autoReplyAudioTranscriber = defaultAutoReplyAudioTranscriber audioFindSilkDecoder = findSilkDecoder audioConvertSilkToWav = convertSilkToWav audioFindFFmpeg = findFFmpeg audioConvertSilkToMp3 = convertSilkToMp3 ) func (e *AutoReplyEngine) prepareMediaMessage(msg *autoReplyMessage) error { if msg == nil { return nil } if msg.MediaKind == "" { msg.MediaKind = mediaKindForRawType(msg.RawType) } if msg.RawType == 11047 && looksLikeStickerOrImage(*msg) { msg.MediaKind = "emoji" } switch msg.MediaKind { case "voice": if text := strings.TrimSpace(msg.VoiceText); text != "" { msg.Content = text msg.MessageType = "voice" return nil } text, err := autoReplyAudioTranscriber(e, *msg) if err != nil { return err } msg.Content = strings.TrimSpace(text) msg.MessageType = "voice" return nil case "image", "emoji": text, err := autoReplyVisionRecognizer(e, *msg) if err != nil { return err } msg.Content = strings.TrimSpace(text) msg.MessageType = msg.MediaKind return nil case "video": desc := mediaTextDescription(*msg) if desc != "" { msg.Content = desc } if msg.MediaURL != "" || msg.MediaLocalPath != "" { if text, err := autoReplyVisionRecognizer(e, *msg); err == nil && strings.TrimSpace(text) != "" { msg.Content = strings.TrimSpace(msg.Content + "\n视频封面识别:" + text) } } msg.MessageType = "video" return nil default: if desc := mediaTextDescription(*msg); desc != "" { msg.Content = desc return nil } return fmt.Errorf("unsupported media message type: %s", msg.MediaKind) } } func looksLikeStickerOrImageText(content string) bool { content = strings.TrimSpace(content) return strings.Contains(content, "表情") || strings.Contains(content, "图片") || strings.Contains(content, "琛ㄦ儏") || strings.Contains(content, "鍥剧墖") } func looksLikeStickerOrImage(msg autoReplyMessage) bool { if looksLikeStickerOrImageText(msg.Content) { return true } if strings.TrimSpace(msg.Content) != "" { return false } return strings.TrimSpace(msg.MediaURL) != "" || strings.TrimSpace(msg.MediaFileID) != "" || strings.TrimSpace(msg.MediaLocalPath) != "" } func defaultAutoReplyVisionRecognizer(e *AutoReplyEngine, msg autoReplyMessage) (string, error) { cfg := e.getConfig() imageRef := strings.TrimSpace(msg.MediaURL) if path, err := ensureAutoReplyMediaLocalPath(msg); err == nil && path != "" { if dataURL, err := imageDataURLFromFile(path); err == nil && dataURL != "" { imageRef = dataURL } } if imageRef == "" { return "", fmt.Errorf("missing image url or local file") } systemPrompt := buildVisionRecognitionSystemPrompt(cfg) userPrompt := buildNonTextAutoReplyUserPrompt(msg) result, err := callOpenAICompatibleVisionChat(cfg.AI, systemPrompt, userPrompt, imageRef) if err != nil { return "", fmt.Errorf("vision recognition failed (model=%s): %w", visionRequestConfig(cfg.AI).Model, err) } return strings.TrimSpace(result.Answer), nil } func defaultAutoReplyAudioTranscriber(e *AutoReplyEngine, msg autoReplyMessage) (string, error) { cfg := e.getConfig() path, err := ensureAutoReplyMediaLocalPath(msg) if err != nil { return "", err } var failures []string if warning := audioConfigWarning(cfg.AI); warning != "" { failures = append(failures, warning) } mode := inferAudioMode(cfg.AI) ext := strings.ToLower(filepath.Ext(path)) if ext == ".silk" { if converted, ok, err := optionalSilkToStandardAudio(path); err != nil { failures = append(failures, "silk 转码失败: "+err.Error()) return "", fmt.Errorf("voice recognition failed (mode=%s model=%s): 缺少可用的企微 silk 语音转码能力或转码失败%s", mode, fallbackString(cfg.AI.AudioModel, defaultAudioModel), formatAudioFailures(failures)) } else if ok { path = converted ext = strings.ToLower(filepath.Ext(path)) } } switch mode { case audioModeParaformer: text, err := callDashScopeParaformerTranscription(cfg.AI, audioSourceURLForParaformer(msg, path)) if err == nil { return text, nil } failures = append(failures, err.Error()) if text, fallbackErr := callOpenAICompatibleAudioTranscription(cfg.AI, path); fallbackErr == nil { return text, nil } else { failures = append(failures, fallbackErr.Error()) } case audioModeTranscription, audioModeCustomHTTP: if text, err := callOpenAICompatibleAudioTranscription(cfg.AI, path); err == nil { return text, nil } else { failures = append(failures, err.Error()) } default: if text, err := callOpenAICompatibleAudioChatTranscription(cfg.AI, path); err == nil { return text, nil } else { failures = append(failures, err.Error()) } if text, err := callOpenAICompatibleAudioTranscription(cfg.AI, path); err == nil { return text, nil } else { failures = append(failures, err.Error()) } } return "", fmt.Errorf("voice recognition failed (mode=%s model=%s): %s", mode, fallbackString(cfg.AI.AudioModel, defaultAudioModel), strings.Join(failures, " | ")) } func optionalSilkToStandardAudio(path string) (string, bool, error) { if strings.EqualFold(filepath.Ext(path), ".silk") { if converted, err := audioConvertSilkToWav(path); err == nil { return converted, true, nil } else { if _, ffmpegErr := audioFindFFmpeg(); ffmpegErr != nil { return "", false, fmt.Errorf("内置 silk 解码失败: %v;也未找到可用 ffmpeg: %v", err, ffmpegErr) } converted, mp3Err := audioConvertSilkToMp3(path) if mp3Err != nil { return "", true, fmt.Errorf("内置 silk 解码失败: %v;ffmpeg 兜底也失败: %v", err, mp3Err) } return converted, true, nil } } return path, false, nil } func convertSilkToWav(silkPath string) (string, error) { decoder, err := audioFindSilkDecoder() if err != nil { return "", err } wavPath := strings.TrimSuffix(silkPath, filepath.Ext(silkPath)) + ".wav" cmd := exec.Command(decoder, "-in", silkPath, "-out", wavPath) output, err := cmd.CombinedOutput() if err != nil { return "", fmt.Errorf("silkdecode执行失败: %v, 输出: %s", err, truncateText(string(output), 240)) } info, err := os.Stat(wavPath) if err != nil { return "", fmt.Errorf("silkdecode未生成wav: %w", err) } if info.Size() <= 44 { return "", fmt.Errorf("silkdecode生成的wav为空或损坏: %s", wavPath) } return wavPath, nil } func findSilkDecoder() (string, error) { names := []string{"silkdecode.exe", "silk_decoder.exe", "silk-v3-decoder.exe"} candidates := make([]string, 0, 12) if currentDir, err := os.Getwd(); err == nil { for _, name := range names { candidates = append(candidates, filepath.Join(currentDir, "tools", "audio", name), filepath.Join(currentDir, name), ) } } if exePath, err := os.Executable(); err == nil { exeDir := filepath.Dir(exePath) for _, name := range names { candidates = append(candidates, filepath.Join(exeDir, "tools", "audio", name), filepath.Join(exeDir, name), ) } } for _, candidate := range candidates { if _, err := os.Stat(candidate); err == nil { return candidate, nil } } if path, err := exec.LookPath("silkdecode"); err == nil { return path, nil } return "", fmt.Errorf("缺少随包语音转码组件 silkdecode.exe") } func audioSourceURLForParaformer(msg autoReplyMessage, path string) string { for _, candidate := range []string{msg.MediaURL, path} { candidate = strings.TrimSpace(candidate) if strings.HasPrefix(strings.ToLower(candidate), "http://") || strings.HasPrefix(strings.ToLower(candidate), "https://") || strings.HasPrefix(strings.ToLower(candidate), "oss://") { return candidate } } return "" } func formatAudioFailures(failures []string) string { cleaned := make([]string, 0, len(failures)) for _, failure := range failures { if failure = strings.TrimSpace(failure); failure != "" { cleaned = append(cleaned, failure) } } if len(cleaned) == 0 { return "" } return ";附加信息: " + strings.Join(cleaned, " | ") } func mediaKindForRawType(rawType int) string { switch rawType { case 11042: return "image" case 11043: return "video" case 11044: return "voice" case 11045: return "file" case 11046: return "location" case 11047: return "link" default: return "non_text" } } func mediaTextDescription(msg autoReplyMessage) string { parts := make([]string, 0, 4) if content := strings.TrimSpace(msg.Content); content != "" && !strings.HasPrefix(content, "[") { parts = append(parts, content) } if msg.MediaFileName != "" { parts = append(parts, "文件:"+msg.MediaFileName) } if msg.MediaKind != "" && len(parts) == 0 { parts = append(parts, nonTextMessageDescription(msg)) } return strings.Join(parts, "\n") } func mediaRecognitionFallbackAnswer(msg autoReplyMessage) string { switch msg.MediaKind { case "voice": return "我这边暂时无法识别这条语音内容,麻烦您补充一句文字说明,我继续帮您处理。" case "image", "emoji", "video": return "我这边暂时无法识别这条图片/视频内容,麻烦您补充一句文字说明,我继续帮您处理。" default: return "我这边暂时无法识别这条内容,麻烦您补充一句文字说明,我继续帮您处理。" } } func ensureAutoReplyMediaLocalPath(msg autoReplyMessage) (string, error) { if path := strings.TrimSpace(msg.MediaLocalPath); path != "" { if _, err := os.Stat(path); err == nil { return path, nil } } ext := mediaExtForMessage(msg) base := msg.MediaFileID if base == "" { base = filepath.Base(strings.TrimSpace(msg.MediaURL)) } if base == "" || base == "." || base == string(filepath.Separator) { base = fmt.Sprintf("%s_%d", msg.MediaKind, msg.RawType) } savePath := generateSavePath("auto_reply_media", base, ext) if savePath == "" { return "", fmt.Errorf("failed to create media save path") } if msg.MediaURL != "" { if msg.MediaAESKey != "" || msg.MediaAuthKey != "" || msg.MediaSize > 0 { if DownloadMediaFileForClient(uint32(msg.ClientID), msg.MediaURL, msg.MediaAuthKey, msg.MediaAESKey, int(msg.MediaSize), savePath) { if _, err := os.Stat(savePath); err == nil { return savePath, nil } return "", fmt.Errorf("media download reported success but file missing: %s", savePath) } } if err := downloadPlainMedia(msg.MediaURL, savePath); err == nil { return savePath, nil } } if msg.MediaFileID != "" { if DownloadFileByFileIdForClient(uint32(msg.ClientID), msg.MediaAESKey, msg.MediaFileID, savePath, int(msg.MediaSize), msg.MediaFileType) { if _, err := os.Stat(savePath); err == nil { return savePath, nil } return "", fmt.Errorf("file_id download reported success but file missing: %s", savePath) } } return "", fmt.Errorf("media download failed") } func downloadPlainMedia(url string, savePath string) error { resp, err := http.Get(url) if err != nil { return err } defer resp.Body.Close() if resp.StatusCode < 200 || resp.StatusCode >= 300 { return fmt.Errorf("download status %d", resp.StatusCode) } if err := os.MkdirAll(filepath.Dir(savePath), 0755); err != nil { return err } file, err := os.Create(savePath) if err != nil { return err } defer file.Close() _, err = io.Copy(file, resp.Body) return err } func imageDataURLFromFile(path string) (string, error) { data, err := os.ReadFile(path) if err != nil { return "", err } if len(data) == 0 { return "", fmt.Errorf("empty image file") } mimeType := mime.TypeByExtension(strings.ToLower(filepath.Ext(path))) if mimeType == "" { mimeType = http.DetectContentType(data) } if !strings.HasPrefix(mimeType, "image/") { mimeType = "image/jpeg" } return "data:" + mimeType + ";base64," + base64.StdEncoding.EncodeToString(data), nil } func audioDataURLFromFile(path string) (string, error) { data, err := os.ReadFile(path) if err != nil { return "", err } if len(data) == 0 { return "", fmt.Errorf("empty audio file") } mimeType := mime.TypeByExtension(strings.ToLower(filepath.Ext(path))) if mimeType == "" { switch strings.ToLower(filepath.Ext(path)) { case ".silk": mimeType = "audio/silk" case ".amr": mimeType = "audio/amr" case ".mp3": mimeType = "audio/mpeg" case ".wav": mimeType = "audio/wav" case ".m4a": mimeType = "audio/mp4" default: mimeType = http.DetectContentType(data) } } if mimeType == "" || mimeType == "application/octet-stream" { mimeType = "application/octet-stream" } return "data:" + mimeType + ";base64," + base64.StdEncoding.EncodeToString(data), nil } func mediaExtForMessage(msg autoReplyMessage) string { if ext := filepath.Ext(msg.MediaFileName); ext != "" { return ext } if ext := filepath.Ext(strings.TrimSpace(msg.MediaURL)); ext != "" && len(ext) <= 8 { return ext } switch msg.MediaKind { case "voice": return ".silk" case "video": return ".mp4" case "file": return ".bin" default: return ".jpg" } } func fillMediaFieldsFromValue(msg *autoReplyMessage, value interface{}) { if msg == nil { return } cdn := firstMediaCdnMap(value) if len(cdn) == 0 { return } msg.MediaAESKey = firstNonEmptyString(cdn["aes_key"], cdn["aesKey"]) msg.MediaAuthKey = firstNonEmptyString(cdn["auth_key"], cdn["authKey"]) msg.MediaFileID = firstNonEmptyString(cdn["file_id"], cdn["fileId"]) msg.MediaFileName = firstNonEmptyString(cdn["file_name"], cdn["fileName"], cdn["name"]) if path := firstLocalMediaPathFromValue(cdn); path != "" { msg.MediaLocalPath = path } msg.MediaFileType = intFromAny(firstNonNil(cdn["file_type"], cdn["fileType"])) msg.MediaSize = int64(intFromAny(firstNonNil(cdn["size"], cdn["file_size"], cdn["fileSize"]))) if msg.MediaURL == "" { msg.MediaURL = firstMediaURLFromValue(cdn) } } func firstVoiceTextFromValue(value interface{}) string { switch v := value.(type) { case map[string]interface{}: for _, key := range []string{ "voice_text", "voiceText", "voice_to_text", "voiceToText", "translate_text", "translateText", "translated_text", "translatedText", "trans_text", "transText", "transcript", "transcription", "recognition_text", "recognitionText", "asr_text", "asrText", "speech_text", "speechText", "text_content", "textContent", } { if text := cleanVoiceTranscript(stringFromAny(v[key])); text != "" { return text } } for _, item := range v { if text := firstVoiceTextFromValue(item); text != "" { return text } } case []interface{}: for _, item := range v { if text := firstVoiceTextFromValue(item); text != "" { return text } } } return "" } func cleanVoiceTranscript(text string) string { text = strings.TrimSpace(text) if text == "" { return "" } if strings.HasPrefix(text, "{{") && strings.HasSuffix(text, "}}") { return "" } for _, prefix := range []string{"转文字完成", "转文字:", "转文字:", "语音转文字:", "语音转文字:", "转写:", "转写:"} { text = strings.TrimSpace(strings.TrimPrefix(text, prefix)) } return text } func firstMediaCdnMap(value interface{}) map[string]interface{} { switch v := value.(type) { case map[string]interface{}: for _, key := range []string{"cdn", "cdnData", "c2cCdnData"} { if child, ok := v[key].(map[string]interface{}); ok { return child } } for _, item := range v { if child := firstMediaCdnMap(item); len(child) > 0 { return child } } case []interface{}: for _, item := range v { if child := firstMediaCdnMap(item); len(child) > 0 { return child } } } return nil } func firstNonEmptyString(values ...interface{}) string { for _, value := range values { text := stringFromAny(value) if strings.TrimSpace(text) != "" { return strings.TrimSpace(text) } } return "" } func firstLocalMediaPathFromValue(value interface{}) string { switch v := value.(type) { case map[string]interface{}: for _, key := range []string{"local_path", "localPath", "path", "file_name", "fileName"} { text := strings.TrimSpace(stringFromAny(v[key])) if text != "" && filepath.IsAbs(text) { return text } } for _, item := range v { if path := firstLocalMediaPathFromValue(item); path != "" { return path } } case []interface{}: for _, item := range v { if path := firstLocalMediaPathFromValue(item); path != "" { return path } } } return "" }