Files
qiweimanager-master/helper/auto_reply_media.go

559 lines
16 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package main
import (
	"encoding/base64"
	"fmt"
	"io"
	"mime"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"time"
)
// Package-level function hooks used by the media pipeline in this file.
// Each variable is initialised with the function named on its right-hand
// side, and the code below calls through the variable instead of calling the
// function directly, so a different implementation can be assigned.
// NOTE(review): presumably these are swapped out in tests — confirm.
var (
// autoReplyVisionRecognizer is called by prepareMediaMessage for image,
// emoji and video messages.
autoReplyVisionRecognizer = defaultAutoReplyVisionRecognizer
// autoReplyAudioTranscriber is called by prepareMediaMessage for voice
// messages whose VoiceText is empty.
autoReplyAudioTranscriber = defaultAutoReplyAudioTranscriber
// audioFindSilkDecoder is called by convertSilkToWav to locate the decoder
// executable.
audioFindSilkDecoder = findSilkDecoder
// audioConvertSilkToWav is the first conversion tried by
// optionalSilkToStandardAudio.
audioConvertSilkToWav = convertSilkToWav
// audioFindFFmpeg is consulted by optionalSilkToStandardAudio before it
// attempts the MP3 fallback.
audioFindFFmpeg = findFFmpeg
// audioConvertSilkToMp3 is the fallback conversion used by
// optionalSilkToStandardAudio when the WAV conversion fails.
audioConvertSilkToMp3 = convertSilkToMp3
)
// prepareMediaMessage turns a non-text message into text in place.
//
// A nil msg is a no-op. When msg.MediaKind is empty it is derived from
// msg.RawType; a raw type of 11047 that looksLikeStickerOrImage accepts is
// always re-labelled "emoji". Depending on the resulting kind:
//
//   - "voice": Content becomes the trimmed VoiceText if present, otherwise the
//     trimmed result of autoReplyAudioTranscriber.
//   - "image"/"emoji": Content becomes the trimmed result of
//     autoReplyVisionRecognizer.
//   - "video": Content becomes mediaTextDescription (when non-empty) and, if a
//     URL or local path exists, the vision result is appended best-effort.
//   - anything else: Content becomes mediaTextDescription, or an error is
//     returned when that description is empty.
//
// Recognizer errors for voice and image/emoji are returned unchanged and leave
// Content and MessageType untouched.
func (e *AutoReplyEngine) prepareMediaMessage(msg *autoReplyMessage) error {
	if msg == nil {
		return nil
	}
	if msg.MediaKind == "" {
		msg.MediaKind = mediaKindForRawType(msg.RawType)
	}
	if msg.RawType == 11047 && looksLikeStickerOrImage(*msg) {
		msg.MediaKind = "emoji"
	}

	kind := msg.MediaKind
	switch kind {
	case "voice":
		transcript := strings.TrimSpace(msg.VoiceText)
		if transcript == "" {
			recognized, err := autoReplyAudioTranscriber(e, *msg)
			if err != nil {
				return err
			}
			transcript = strings.TrimSpace(recognized)
		}
		msg.Content = transcript
		msg.MessageType = "voice"
		return nil

	case "image", "emoji":
		recognized, err := autoReplyVisionRecognizer(e, *msg)
		if err != nil {
			return err
		}
		msg.Content = strings.TrimSpace(recognized)
		msg.MessageType = kind
		return nil

	case "video":
		if summary := mediaTextDescription(*msg); summary != "" {
			msg.Content = summary
		}
		hasMedia := msg.MediaURL != "" || msg.MediaLocalPath != ""
		if hasMedia {
			// Recognition of the video is best-effort: failures and blank
			// answers are ignored and the textual description stands alone.
			cover, err := autoReplyVisionRecognizer(e, *msg)
			if err == nil && strings.TrimSpace(cover) != "" {
				msg.Content = strings.TrimSpace(msg.Content + "\n视频封面识别" + cover)
			}
		}
		msg.MessageType = "video"
		return nil
	}

	summary := mediaTextDescription(*msg)
	if summary == "" {
		return fmt.Errorf("unsupported media message type: %s", msg.MediaKind)
	}
	msg.Content = summary
	return nil
}
// looksLikeStickerOrImageText reports whether content contains one of the
// marker substrings for a sticker ("表情") or picture ("图片") placeholder.
// NOTE(review): the last two markers look like mis-decoded variants of the
// first two, kept so that garbled payloads still match — confirm.
func looksLikeStickerOrImageText(content string) bool {
	trimmed := strings.TrimSpace(content)
	for _, marker := range [...]string{"表情", "图片", "琛ㄦ儏", "鍥剧墖"} {
		if strings.Contains(trimmed, marker) {
			return true
		}
	}
	return false
}
// looksLikeStickerOrImage reports whether msg appears to carry a sticker or
// picture. Content that matches looksLikeStickerOrImageText wins outright;
// any other non-blank content means "no". With blank content the message
// qualifies when it has a non-blank media URL, file ID or local path.
func looksLikeStickerOrImage(msg autoReplyMessage) bool {
	switch {
	case looksLikeStickerOrImageText(msg.Content):
		return true
	case strings.TrimSpace(msg.Content) != "":
		return false
	}
	for _, ref := range [...]string{msg.MediaURL, msg.MediaFileID, msg.MediaLocalPath} {
		if strings.TrimSpace(ref) != "" {
			return true
		}
	}
	return false
}
// defaultAutoReplyVisionRecognizer sends the message's media to the vision
// chat endpoint and returns the trimmed answer.
//
// The image reference starts as the trimmed MediaURL. If a local copy can be
// obtained and encoded as a data URL, that data URL replaces it; failures on
// that path are ignored so the remote URL is still used. An error is returned
// when no reference is available at all, or when the vision call fails (the
// underlying error is wrapped and the model name is included).
func defaultAutoReplyVisionRecognizer(e *AutoReplyEngine, msg autoReplyMessage) (string, error) {
	cfg := e.getConfig()

	imageRef := strings.TrimSpace(msg.MediaURL)
	localPath, pathErr := ensureAutoReplyMediaLocalPath(msg)
	if pathErr == nil && localPath != "" {
		inlined, encodeErr := imageDataURLFromFile(localPath)
		if encodeErr == nil && inlined != "" {
			imageRef = inlined
		}
	}
	if imageRef == "" {
		return "", fmt.Errorf("missing image url or local file")
	}

	result, err := callOpenAICompatibleVisionChat(
		cfg.AI,
		buildVisionRecognitionSystemPrompt(cfg),
		buildNonTextAutoReplyUserPrompt(msg),
		imageRef,
	)
	if err != nil {
		return "", fmt.Errorf("vision recognition failed (model=%s): %w", visionRequestConfig(cfg.AI).Model, err)
	}
	return strings.TrimSpace(result.Answer), nil
}
// defaultAutoReplyAudioTranscriber obtains a local copy of the voice message
// and returns the text produced by the first transcription backend that
// succeeds.
//
// A ".silk" file is converted first via optionalSilkToStandardAudio; a
// conversion error aborts immediately. The backends tried depend on the mode
// returned by inferAudioMode:
//
//   - audioModeParaformer: Paraformer, then the transcription endpoint.
//   - audioModeTranscription / audioModeCustomHTTP: the transcription endpoint.
//   - any other mode: the audio chat endpoint, then the transcription endpoint.
//
// When every backend fails, the returned error lists the configuration
// warning (if any) followed by each backend's error, joined with " | ".
func defaultAutoReplyAudioTranscriber(e *AutoReplyEngine, msg autoReplyMessage) (string, error) {
	cfg := e.getConfig()
	audioPath, err := ensureAutoReplyMediaLocalPath(msg)
	if err != nil {
		return "", err
	}

	var problems []string
	if warning := audioConfigWarning(cfg.AI); warning != "" {
		problems = append(problems, warning)
	}
	mode := inferAudioMode(cfg.AI)

	if strings.ToLower(filepath.Ext(audioPath)) == ".silk" {
		converted, ok, convErr := optionalSilkToStandardAudio(audioPath)
		if convErr != nil {
			problems = append(problems, "silk 转码失败: "+convErr.Error())
			return "", fmt.Errorf("voice recognition failed (mode=%s model=%s): 缺少可用的企微 silk 语音转码能力或转码失败%s",
				mode, fallbackString(cfg.AI.AudioModel, defaultAudioModel), formatAudioFailures(problems))
		}
		if ok {
			audioPath = converted
		}
	}

	viaParaformer := func() (string, error) {
		return callDashScopeParaformerTranscription(cfg.AI, audioSourceURLForParaformer(msg, audioPath))
	}
	viaTranscription := func() (string, error) {
		return callOpenAICompatibleAudioTranscription(cfg.AI, audioPath)
	}
	viaAudioChat := func() (string, error) {
		return callOpenAICompatibleAudioChatTranscription(cfg.AI, audioPath)
	}

	// Backends are listed in the order they are attempted.
	var attempts []func() (string, error)
	switch mode {
	case audioModeParaformer:
		attempts = append(attempts, viaParaformer, viaTranscription)
	case audioModeTranscription, audioModeCustomHTTP:
		attempts = append(attempts, viaTranscription)
	default:
		attempts = append(attempts, viaAudioChat, viaTranscription)
	}
	for _, attempt := range attempts {
		text, attemptErr := attempt()
		if attemptErr == nil {
			return text, nil
		}
		problems = append(problems, attemptErr.Error())
	}

	return "", fmt.Errorf("voice recognition failed (mode=%s model=%s): %s", mode, fallbackString(cfg.AI.AudioModel, defaultAudioModel), strings.Join(problems, " | "))
}
// optionalSilkToStandardAudio converts a ".silk" file (extension compared
// case-insensitively) into a standard audio file.
//
// Returns:
//   - (path, false, nil) unchanged when path is not a silk file;
//   - (converted, true, nil) when audioConvertSilkToWav succeeds, or when it
//     fails but audioFindFFmpeg succeeds and audioConvertSilkToMp3 succeeds;
//   - ("", false, err) on every failure. The error text contains the WAV
//     conversion error and the follow-up error, separated by ";".
//
// The boolean is false on all error paths so that callers never see a
// "converted" flag paired with an empty path.
func optionalSilkToStandardAudio(path string) (string, bool, error) {
	if !strings.EqualFold(filepath.Ext(path), ".silk") {
		return path, false, nil
	}

	converted, wavErr := audioConvertSilkToWav(path)
	if wavErr == nil {
		return converted, true, nil
	}

	// The built-in decoder failed; fall back to ffmpeg if it is available.
	if _, ffmpegErr := audioFindFFmpeg(); ffmpegErr != nil {
		return "", false, fmt.Errorf("内置 silk 解码失败: %v;也未找到可用 ffmpeg: %v", wavErr, ffmpegErr)
	}
	converted, mp3Err := audioConvertSilkToMp3(path)
	if mp3Err != nil {
		return "", false, fmt.Errorf("内置 silk 解码失败: %v;ffmpeg 兜底也失败: %v", wavErr, mp3Err)
	}
	return converted, true, nil
}
// convertSilkToWav runs the decoder returned by audioFindSilkDecoder as
// "<decoder> -in <silkPath> -out <wavPath>", where wavPath is silkPath with
// its extension replaced by ".wav", and returns wavPath.
//
// It fails when no decoder is found, when the command exits with an error
// (a truncated copy of its combined output is included), when the output file
// is missing, or when the output is 44 bytes or smaller.
// NOTE(review): 44 is presumably the size of a bare WAV header — confirm.
func convertSilkToWav(silkPath string) (string, error) {
	decoder, err := audioFindSilkDecoder()
	if err != nil {
		return "", err
	}

	wavPath := strings.TrimSuffix(silkPath, filepath.Ext(silkPath)) + ".wav"
	output, runErr := exec.Command(decoder, "-in", silkPath, "-out", wavPath).CombinedOutput()
	if runErr != nil {
		return "", fmt.Errorf("silkdecode执行失败: %v, 输出: %s", runErr, truncateText(string(output), 240))
	}

	info, statErr := os.Stat(wavPath)
	switch {
	case statErr != nil:
		return "", fmt.Errorf("silkdecode未生成wav: %w", statErr)
	case info.Size() <= 44:
		return "", fmt.Errorf("silkdecode生成的wav为空或损坏: %s", wavPath)
	}
	return wavPath, nil
}
// findSilkDecoder returns the path of a silk decoder executable.
//
// It searches the current working directory first and the directory of the
// running executable second (either is skipped if it cannot be determined).
// Within a directory each known binary name is tried in order, first under
// "tools/audio" and then directly in the directory. The first path that
// os.Stat accepts is returned. If nothing matches, "silkdecode" is looked up
// on PATH; if that also fails an error is returned.
func findSilkDecoder() (string, error) {
	var searchDirs []string
	if cwd, err := os.Getwd(); err == nil {
		searchDirs = append(searchDirs, cwd)
	}
	if exePath, err := os.Executable(); err == nil {
		searchDirs = append(searchDirs, filepath.Dir(exePath))
	}

	binaries := [...]string{"silkdecode.exe", "silk_decoder.exe", "silk-v3-decoder.exe"}
	for _, dir := range searchDirs {
		for _, bin := range binaries {
			bundled := filepath.Join(dir, "tools", "audio", bin)
			if _, err := os.Stat(bundled); err == nil {
				return bundled, nil
			}
			adjacent := filepath.Join(dir, bin)
			if _, err := os.Stat(adjacent); err == nil {
				return adjacent, nil
			}
		}
	}

	if onPath, err := exec.LookPath("silkdecode"); err == nil {
		return onPath, nil
	}
	return "", fmt.Errorf("缺少随包语音转码组件 silkdecode.exe")
}
// audioSourceURLForParaformer returns the first of msg.MediaURL and path
// (in that order, whitespace-trimmed) that starts with "http://", "https://"
// or "oss://", compared case-insensitively. The trimmed candidate is returned
// with its original casing; "" is returned when neither qualifies.
func audioSourceURLForParaformer(msg autoReplyMessage, path string) string {
	remoteSchemes := [...]string{"http://", "https://", "oss://"}
	for _, raw := range [...]string{msg.MediaURL, path} {
		candidate := strings.TrimSpace(raw)
		lowered := strings.ToLower(candidate)
		for _, scheme := range remoteSchemes {
			if strings.HasPrefix(lowered, scheme) {
				return candidate
			}
		}
	}
	return ""
}
// formatAudioFailures renders the non-blank entries of failures as a suffix
// for an error message: ";附加信息: " followed by the trimmed entries joined
// with " | ". It returns "" when no entry has content.
func formatAudioFailures(failures []string) string {
	var out strings.Builder
	for _, raw := range failures {
		entry := strings.TrimSpace(raw)
		if entry == "" {
			continue
		}
		if out.Len() == 0 {
			out.WriteString(";附加信息: ")
		} else {
			out.WriteString(" | ")
		}
		out.WriteString(entry)
	}
	return out.String()
}
// mediaKindForRawType maps a raw message type code to a media kind name.
// Codes 11042 through 11047 map to "image", "video", "voice", "file",
// "location" and "link" respectively; every other code yields "non_text".
func mediaKindForRawType(rawType int) string {
	const (
		firstKnown = 11042
		lastKnown  = 11047
	)
	// Indexed by rawType - firstKnown.
	kinds := [...]string{"image", "video", "voice", "file", "location", "link"}
	if rawType < firstKnown || rawType > lastKnown {
		return "non_text"
	}
	return kinds[rawType-firstKnown]
}
// mediaTextDescription builds a newline-separated textual summary of msg.
// It includes the trimmed Content unless that is blank or starts with "[",
// then "文件:" plus MediaFileName when a file name is set. If neither
// produced anything and MediaKind is set, nonTextMessageDescription(msg) is
// used instead. The result may be "".
func mediaTextDescription(msg autoReplyMessage) string {
	var lines []string

	body := strings.TrimSpace(msg.Content)
	if body != "" && !strings.HasPrefix(body, "[") {
		lines = append(lines, body)
	}
	if msg.MediaFileName != "" {
		lines = append(lines, "文件:"+msg.MediaFileName)
	}
	if len(lines) == 0 && msg.MediaKind != "" {
		lines = append(lines, nonTextMessageDescription(msg))
	}
	return strings.Join(lines, "\n")
}
// mediaRecognitionFallbackAnswer returns the canned reply that asks the
// sender for a text explanation. The wording depends on msg.MediaKind: one
// message for "voice", one shared by "image", "emoji" and "video", and a
// generic one for everything else.
func mediaRecognitionFallbackAnswer(msg autoReplyMessage) string {
	kind := msg.MediaKind
	if kind == "voice" {
		return "我这边暂时无法识别这条语音内容,麻烦您补充一句文字说明,我继续帮您处理。"
	}
	if kind == "image" || kind == "emoji" || kind == "video" {
		return "我这边暂时无法识别这条图片/视频内容,麻烦您补充一句文字说明,我继续帮您处理。"
	}
	return "我这边暂时无法识别这条内容,麻烦您补充一句文字说明,我继续帮您处理。"
}
// ensureAutoReplyMediaLocalPath returns a local file path for msg's media,
// downloading it if necessary.
//
// An existing MediaLocalPath is returned as-is. Otherwise a save path is
// generated under "auto_reply_media" from the file ID (or the last element of
// the URL, or "<kind>_<rawType>" as a last resort) and the extension chosen by
// mediaExtForMessage. Download sources are tried in this order:
//
//  1. DownloadMediaFileForClient, only when a URL is set and an AES key, auth
//     key or positive size is present;
//  2. downloadPlainMedia on the URL;
//  3. DownloadFileByFileIdForClient, when a file ID is set.
//
// If step 1 or 3 reports success but the file is absent, an error is returned
// immediately without trying later sources.
func ensureAutoReplyMediaLocalPath(msg autoReplyMessage) (string, error) {
	if existing := strings.TrimSpace(msg.MediaLocalPath); existing != "" {
		if _, err := os.Stat(existing); err == nil {
			return existing, nil
		}
	}

	ext := mediaExtForMessage(msg)
	base := msg.MediaFileID
	if base == "" {
		base = filepath.Base(strings.TrimSpace(msg.MediaURL))
	}
	switch base {
	case "", ".", string(filepath.Separator):
		base = fmt.Sprintf("%s_%d", msg.MediaKind, msg.RawType)
	}

	savePath := generateSavePath("auto_reply_media", base, ext)
	if savePath == "" {
		return "", fmt.Errorf("failed to create media save path")
	}

	clientID := uint32(msg.ClientID)
	size := int(msg.MediaSize)

	if msg.MediaURL != "" {
		hasCDNMetadata := msg.MediaAESKey != "" || msg.MediaAuthKey != "" || msg.MediaSize > 0
		if hasCDNMetadata && DownloadMediaFileForClient(clientID, msg.MediaURL, msg.MediaAuthKey, msg.MediaAESKey, size, savePath) {
			if _, err := os.Stat(savePath); err != nil {
				return "", fmt.Errorf("media download reported success but file missing: %s", savePath)
			}
			return savePath, nil
		}
		if downloadPlainMedia(msg.MediaURL, savePath) == nil {
			return savePath, nil
		}
	}

	if msg.MediaFileID != "" && DownloadFileByFileIdForClient(clientID, msg.MediaAESKey, msg.MediaFileID, savePath, size, msg.MediaFileType) {
		if _, err := os.Stat(savePath); err != nil {
			return "", fmt.Errorf("file_id download reported success but file missing: %s", savePath)
		}
		return savePath, nil
	}

	return "", fmt.Errorf("media download failed")
}
// downloadPlainMedia fetches url with a plain HTTP GET and writes the body to
// savePath, creating the parent directory if needed.
//
// The whole exchange (connect, headers and body) is bounded by a timeout so a
// stalled server cannot block the caller forever. A non-2xx status is an
// error. If writing the body or closing the file fails, the partially written
// file is removed so a truncated download is never left at savePath.
func downloadPlainMedia(url string, savePath string) error {
	// Generous enough for large media; the default client has no limit at all.
	const downloadTimeout = 2 * time.Minute

	client := &http.Client{Timeout: downloadTimeout}
	resp, err := client.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("download status %d", resp.StatusCode)
	}
	if err := os.MkdirAll(filepath.Dir(savePath), 0755); err != nil {
		return err
	}

	file, err := os.Create(savePath)
	if err != nil {
		return err
	}
	if _, err := io.Copy(file, resp.Body); err != nil {
		// Best-effort cleanup; the copy error is the one worth reporting.
		_ = file.Close()
		_ = os.Remove(savePath)
		return err
	}
	if err := file.Close(); err != nil {
		// A failed close can mean buffered data never reached disk.
		_ = os.Remove(savePath)
		return err
	}
	return nil
}
// imageDataURLFromFile reads the file at path and returns it as a
// "data:<mime>;base64,<payload>" URL.
//
// The MIME type is taken from the file content when the content is
// recognisably an image, because the extension is not trustworthy here:
// downloaded media may be saved under a default or unrelated extension. When
// sniffing does not yield an image type, the extension is consulted; if that
// is not an image type either, "image/jpeg" is used.
//
// An error is returned when the file cannot be read or is empty.
func imageDataURLFromFile(path string) (string, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return "", err
	}
	if len(data) == 0 {
		return "", fmt.Errorf("empty image file")
	}

	mimeType := http.DetectContentType(data)
	if !strings.HasPrefix(mimeType, "image/") {
		// Formats the sniffer does not know are resolved by extension.
		mimeType = mime.TypeByExtension(strings.ToLower(filepath.Ext(path)))
	}
	if !strings.HasPrefix(mimeType, "image/") {
		mimeType = "image/jpeg"
	}
	return "data:" + mimeType + ";base64," + base64.StdEncoding.EncodeToString(data), nil
}
// audioDataURLFromFile reads the file at path and returns it as a
// "data:<mime>;base64,<payload>" URL.
//
// The MIME type comes from the registered type for the lower-cased extension.
// When none is registered, a built-in table covers .silk, .amr, .mp3, .wav
// and .m4a, and any other extension is resolved by sniffing the content.
// An error is returned when the file cannot be read or is empty.
func audioDataURLFromFile(path string) (string, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return "", err
	}
	if len(raw) == 0 {
		return "", fmt.Errorf("empty audio file")
	}

	ext := strings.ToLower(filepath.Ext(path))
	mimeType := mime.TypeByExtension(ext)
	if mimeType == "" {
		knownAudio := map[string]string{
			".silk": "audio/silk",
			".amr":  "audio/amr",
			".mp3":  "audio/mpeg",
			".wav":  "audio/wav",
			".m4a":  "audio/mp4",
		}
		if mapped, ok := knownAudio[ext]; ok {
			mimeType = mapped
		} else {
			mimeType = http.DetectContentType(raw)
		}
	}
	if mimeType == "" {
		mimeType = "application/octet-stream"
	}

	encoded := base64.StdEncoding.EncodeToString(raw)
	return "data:" + mimeType + ";base64," + encoded, nil
}
// mediaExtForMessage picks the file extension (including the leading dot)
// under which msg's media should be saved.
//
// Preference order:
//  1. the extension of MediaFileName;
//  2. the extension of MediaURL, ignoring any "?query" or "#fragment" so that
//     a signed URL such as ".../a.png?token=..." yields ".png". The extension
//     must be 2 to 8 bytes long including the dot;
//  3. a default by MediaKind: ".silk" for voice, ".mp4" for video, ".bin" for
//     file and ".jpg" for everything else.
func mediaExtForMessage(msg autoReplyMessage) string {
	if ext := filepath.Ext(msg.MediaFileName); ext != "" {
		return ext
	}

	rawURL := strings.TrimSpace(msg.MediaURL)
	// Query strings and fragments are not part of the file name.
	if cut := strings.IndexAny(rawURL, "?#"); cut >= 0 {
		rawURL = rawURL[:cut]
	}
	if ext := filepath.Ext(rawURL); len(ext) > 1 && len(ext) <= 8 {
		return ext
	}

	switch msg.MediaKind {
	case "voice":
		return ".silk"
	case "video":
		return ".mp4"
	case "file":
		return ".bin"
	default:
		return ".jpg"
	}
}
// fillMediaFieldsFromValue copies media metadata from the first CDN map found
// in value (see firstMediaCdnMap) onto msg. It does nothing when msg is nil
// or no non-empty CDN map is found.
//
// The AES key, auth key, file ID, file name, file type and size are always
// overwritten (possibly with empty/zero values). MediaLocalPath is only set
// when a local path is found, and MediaURL only when it is currently empty.
// Both snake_case and camelCase key spellings are accepted.
func fillMediaFieldsFromValue(msg *autoReplyMessage, value interface{}) {
	if msg == nil {
		return
	}
	cdnFields := firstMediaCdnMap(value)
	if len(cdnFields) == 0 {
		return
	}

	// text returns the first non-blank string stored under any of keys.
	text := func(keys ...string) string {
		candidates := make([]interface{}, 0, len(keys))
		for _, key := range keys {
			candidates = append(candidates, cdnFields[key])
		}
		return firstNonEmptyString(candidates...)
	}

	msg.MediaAESKey = text("aes_key", "aesKey")
	msg.MediaAuthKey = text("auth_key", "authKey")
	msg.MediaFileID = text("file_id", "fileId")
	msg.MediaFileName = text("file_name", "fileName", "name")

	if localPath := firstLocalMediaPathFromValue(cdnFields); localPath != "" {
		msg.MediaLocalPath = localPath
	}

	msg.MediaFileType = intFromAny(firstNonNil(cdnFields["file_type"], cdnFields["fileType"]))
	msg.MediaSize = int64(intFromAny(firstNonNil(cdnFields["size"], cdnFields["file_size"], cdnFields["fileSize"])))

	if msg.MediaURL == "" {
		msg.MediaURL = firstMediaURLFromValue(cdnFields)
	}
}
// firstVoiceTextFromValue searches a decoded JSON-like value for a voice
// transcript and returns the first non-empty one after cleanVoiceTranscript.
//
// For a map, the known transcript keys are checked on the map itself first,
// in the order listed; only then are the map's values searched recursively.
// Go does not define map iteration order, so when several nested values hold
// a transcript, which one is returned is not fixed. Slices are searched in
// element order. Any other value yields "".
func firstVoiceTextFromValue(value interface{}) string {
	if object, ok := value.(map[string]interface{}); ok {
		transcriptKeys := [...]string{
			"voice_text", "voiceText", "voice_to_text", "voiceToText",
			"translate_text", "translateText", "translated_text", "translatedText",
			"trans_text", "transText", "transcript", "transcription",
			"recognition_text", "recognitionText", "asr_text", "asrText",
			"speech_text", "speechText", "text_content", "textContent",
		}
		for _, key := range transcriptKeys {
			if found := cleanVoiceTranscript(stringFromAny(object[key])); found != "" {
				return found
			}
		}
		for _, child := range object {
			if found := firstVoiceTextFromValue(child); found != "" {
				return found
			}
		}
		return ""
	}

	if list, ok := value.([]interface{}); ok {
		for i := range list {
			if found := firstVoiceTextFromValue(list[i]); found != "" {
				return found
			}
		}
	}
	return ""
}
// cleanVoiceTranscript normalises a transcript string.
//
// It trims whitespace, returns "" for blank input and for unresolved
// "{{...}}" template placeholders, and then strips the known status/label
// prefixes one after another (trimming whitespace after each). Labels are
// accepted with either an ASCII colon or a full-width colon (U+FF1A); the
// full-width forms are written as escapes so they cannot be confused with the
// ASCII ones in source.
func cleanVoiceTranscript(text string) string {
	text = strings.TrimSpace(text)
	if text == "" {
		return ""
	}
	if strings.HasPrefix(text, "{{") && strings.HasSuffix(text, "}}") {
		return ""
	}
	for _, prefix := range []string{
		"转文字完成",
		"转文字:", "转文字\uff1a",
		"语音转文字:", "语音转文字\uff1a",
		"转写:", "转写\uff1a",
	} {
		text = strings.TrimSpace(strings.TrimPrefix(text, prefix))
	}
	return text
}
// firstMediaCdnMap searches a decoded JSON-like value for a CDN descriptor.
//
// For a map, a map-typed value stored directly under "cdn", "cdnData" or
// "c2cCdnData" (checked in that order) is returned even when it is empty.
// Otherwise the map's values are searched recursively and the first non-empty
// result is returned; Go does not define map iteration order, so with several
// nested matches the choice is not fixed. Slices are searched in element
// order. nil is returned when nothing is found.
func firstMediaCdnMap(value interface{}) map[string]interface{} {
	if object, ok := value.(map[string]interface{}); ok {
		for _, key := range [...]string{"cdn", "cdnData", "c2cCdnData"} {
			if nested, isMap := object[key].(map[string]interface{}); isMap {
				return nested
			}
		}
		for _, child := range object {
			if found := firstMediaCdnMap(child); len(found) > 0 {
				return found
			}
		}
		return nil
	}

	if list, ok := value.([]interface{}); ok {
		for i := range list {
			if found := firstMediaCdnMap(list[i]); len(found) > 0 {
				return found
			}
		}
	}
	return nil
}
// firstNonEmptyString converts each value with stringFromAny, in order, and
// returns the first result that is non-blank, with surrounding whitespace
// removed. It returns "" when every value is blank.
func firstNonEmptyString(values ...interface{}) string {
	for i := range values {
		if trimmed := strings.TrimSpace(stringFromAny(values[i])); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
// firstLocalMediaPathFromValue searches a decoded JSON-like value for an
// absolute file path.
//
// For a map, the keys "local_path", "localPath", "path", "file_name" and
// "fileName" are checked in that order and the first trimmed value that
// filepath.IsAbs accepts is returned. Otherwise the map's values are searched
// recursively; Go does not define map iteration order, so with several nested
// matches the choice is not fixed. Slices are searched in element order.
// "" is returned when nothing is found.
func firstLocalMediaPathFromValue(value interface{}) string {
	if object, ok := value.(map[string]interface{}); ok {
		for _, key := range [...]string{"local_path", "localPath", "path", "file_name", "fileName"} {
			candidate := strings.TrimSpace(stringFromAny(object[key]))
			if candidate == "" || !filepath.IsAbs(candidate) {
				continue
			}
			return candidate
		}
		for _, child := range object {
			if found := firstLocalMediaPathFromValue(child); found != "" {
				return found
			}
		}
		return ""
	}

	if list, ok := value.([]interface{}); ok {
		for i := range list {
			if found := firstLocalMediaPathFromValue(list[i]); found != "" {
				return found
			}
		}
	}
	return ""
}