559 lines
16 KiB
Go
559 lines
16 KiB
Go
package main
|
||
|
||
import (
	"encoding/base64"
	"fmt"
	"io"
	"mime"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"time"
)
|
||
|
||
// Overridable hooks for the media pipeline. Each variable is initialised with
// the default implementation and the code in this file calls through the
// variable, so an alternative implementation can be assigned at runtime
// (presumably by tests — confirm against the test files).
var (
	// autoReplyVisionRecognizer produces a text description for image, emoji
	// and video messages.
	autoReplyVisionRecognizer = defaultAutoReplyVisionRecognizer
	// autoReplyAudioTranscriber produces a transcript for voice messages.
	autoReplyAudioTranscriber = defaultAutoReplyAudioTranscriber
	// audioFindSilkDecoder locates the external silk decoder executable.
	audioFindSilkDecoder = findSilkDecoder
	// audioConvertSilkToWav converts a .silk file to .wav.
	audioConvertSilkToWav = convertSilkToWav
	// audioFindFFmpeg locates ffmpeg; used before the mp3 fallback conversion.
	audioFindFFmpeg = findFFmpeg
	// audioConvertSilkToMp3 is the fallback conversion used when the wav
	// conversion fails.
	audioConvertSilkToMp3 = convertSilkToMp3
)
|
||
|
||
func (e *AutoReplyEngine) prepareMediaMessage(msg *autoReplyMessage) error {
|
||
if msg == nil {
|
||
return nil
|
||
}
|
||
if msg.MediaKind == "" {
|
||
msg.MediaKind = mediaKindForRawType(msg.RawType)
|
||
}
|
||
if msg.RawType == 11047 && looksLikeStickerOrImage(*msg) {
|
||
msg.MediaKind = "emoji"
|
||
}
|
||
switch msg.MediaKind {
|
||
case "voice":
|
||
if text := strings.TrimSpace(msg.VoiceText); text != "" {
|
||
msg.Content = text
|
||
msg.MessageType = "voice"
|
||
return nil
|
||
}
|
||
text, err := autoReplyAudioTranscriber(e, *msg)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
msg.Content = strings.TrimSpace(text)
|
||
msg.MessageType = "voice"
|
||
return nil
|
||
case "image", "emoji":
|
||
text, err := autoReplyVisionRecognizer(e, *msg)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
msg.Content = strings.TrimSpace(text)
|
||
msg.MessageType = msg.MediaKind
|
||
return nil
|
||
case "video":
|
||
desc := mediaTextDescription(*msg)
|
||
if desc != "" {
|
||
msg.Content = desc
|
||
}
|
||
if msg.MediaURL != "" || msg.MediaLocalPath != "" {
|
||
if text, err := autoReplyVisionRecognizer(e, *msg); err == nil && strings.TrimSpace(text) != "" {
|
||
msg.Content = strings.TrimSpace(msg.Content + "\n视频封面识别:" + text)
|
||
}
|
||
}
|
||
msg.MessageType = "video"
|
||
return nil
|
||
default:
|
||
if desc := mediaTextDescription(*msg); desc != "" {
|
||
msg.Content = desc
|
||
return nil
|
||
}
|
||
return fmt.Errorf("unsupported media message type: %s", msg.MediaKind)
|
||
}
|
||
}
|
||
|
||
// looksLikeStickerOrImageText reports whether content contains one of the
// sticker/image marker words. Besides the two Chinese words, the list holds
// two further marker strings that appear to be mis-decoded (mojibake)
// renderings of the same words — confirm against real payloads.
func looksLikeStickerOrImageText(content string) bool {
	trimmed := strings.TrimSpace(content)
	for _, marker := range [...]string{"表情", "图片", "琛ㄦ儏", "鍥剧墖"} {
		if strings.Contains(trimmed, marker) {
			return true
		}
	}
	return false
}
|
||
|
||
func looksLikeStickerOrImage(msg autoReplyMessage) bool {
|
||
if looksLikeStickerOrImageText(msg.Content) {
|
||
return true
|
||
}
|
||
if strings.TrimSpace(msg.Content) != "" {
|
||
return false
|
||
}
|
||
return strings.TrimSpace(msg.MediaURL) != "" ||
|
||
strings.TrimSpace(msg.MediaFileID) != "" ||
|
||
strings.TrimSpace(msg.MediaLocalPath) != ""
|
||
}
|
||
|
||
func defaultAutoReplyVisionRecognizer(e *AutoReplyEngine, msg autoReplyMessage) (string, error) {
|
||
cfg := e.getConfig()
|
||
imageRef := strings.TrimSpace(msg.MediaURL)
|
||
if path, err := ensureAutoReplyMediaLocalPath(msg); err == nil && path != "" {
|
||
if dataURL, err := imageDataURLFromFile(path); err == nil && dataURL != "" {
|
||
imageRef = dataURL
|
||
}
|
||
}
|
||
if imageRef == "" {
|
||
return "", fmt.Errorf("missing image url or local file")
|
||
}
|
||
systemPrompt := buildVisionRecognitionSystemPrompt(cfg)
|
||
userPrompt := buildNonTextAutoReplyUserPrompt(msg)
|
||
result, err := callOpenAICompatibleVisionChat(cfg.AI, systemPrompt, userPrompt, imageRef)
|
||
if err != nil {
|
||
return "", fmt.Errorf("vision recognition failed (model=%s): %w", visionRequestConfig(cfg.AI).Model, err)
|
||
}
|
||
return strings.TrimSpace(result.Answer), nil
|
||
}
|
||
|
||
func defaultAutoReplyAudioTranscriber(e *AutoReplyEngine, msg autoReplyMessage) (string, error) {
|
||
cfg := e.getConfig()
|
||
path, err := ensureAutoReplyMediaLocalPath(msg)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
var failures []string
|
||
if warning := audioConfigWarning(cfg.AI); warning != "" {
|
||
failures = append(failures, warning)
|
||
}
|
||
mode := inferAudioMode(cfg.AI)
|
||
ext := strings.ToLower(filepath.Ext(path))
|
||
if ext == ".silk" {
|
||
if converted, ok, err := optionalSilkToStandardAudio(path); err != nil {
|
||
failures = append(failures, "silk 转码失败: "+err.Error())
|
||
return "", fmt.Errorf("voice recognition failed (mode=%s model=%s): 缺少可用的企微 silk 语音转码能力或转码失败%s",
|
||
mode, fallbackString(cfg.AI.AudioModel, defaultAudioModel), formatAudioFailures(failures))
|
||
} else if ok {
|
||
path = converted
|
||
ext = strings.ToLower(filepath.Ext(path))
|
||
}
|
||
}
|
||
switch mode {
|
||
case audioModeParaformer:
|
||
text, err := callDashScopeParaformerTranscription(cfg.AI, audioSourceURLForParaformer(msg, path))
|
||
if err == nil {
|
||
return text, nil
|
||
}
|
||
failures = append(failures, err.Error())
|
||
if text, fallbackErr := callOpenAICompatibleAudioTranscription(cfg.AI, path); fallbackErr == nil {
|
||
return text, nil
|
||
} else {
|
||
failures = append(failures, fallbackErr.Error())
|
||
}
|
||
case audioModeTranscription, audioModeCustomHTTP:
|
||
if text, err := callOpenAICompatibleAudioTranscription(cfg.AI, path); err == nil {
|
||
return text, nil
|
||
} else {
|
||
failures = append(failures, err.Error())
|
||
}
|
||
default:
|
||
if text, err := callOpenAICompatibleAudioChatTranscription(cfg.AI, path); err == nil {
|
||
return text, nil
|
||
} else {
|
||
failures = append(failures, err.Error())
|
||
}
|
||
if text, err := callOpenAICompatibleAudioTranscription(cfg.AI, path); err == nil {
|
||
return text, nil
|
||
} else {
|
||
failures = append(failures, err.Error())
|
||
}
|
||
}
|
||
return "", fmt.Errorf("voice recognition failed (mode=%s model=%s): %s", mode, fallbackString(cfg.AI.AudioModel, defaultAudioModel), strings.Join(failures, " | "))
|
||
}
|
||
|
||
func optionalSilkToStandardAudio(path string) (string, bool, error) {
|
||
if strings.EqualFold(filepath.Ext(path), ".silk") {
|
||
if converted, err := audioConvertSilkToWav(path); err == nil {
|
||
return converted, true, nil
|
||
} else {
|
||
if _, ffmpegErr := audioFindFFmpeg(); ffmpegErr != nil {
|
||
return "", false, fmt.Errorf("内置 silk 解码失败: %v;也未找到可用 ffmpeg: %v", err, ffmpegErr)
|
||
}
|
||
converted, mp3Err := audioConvertSilkToMp3(path)
|
||
if mp3Err != nil {
|
||
return "", true, fmt.Errorf("内置 silk 解码失败: %v;ffmpeg 兜底也失败: %v", err, mp3Err)
|
||
}
|
||
return converted, true, nil
|
||
}
|
||
}
|
||
return path, false, nil
|
||
}
|
||
|
||
func convertSilkToWav(silkPath string) (string, error) {
|
||
decoder, err := audioFindSilkDecoder()
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
wavPath := strings.TrimSuffix(silkPath, filepath.Ext(silkPath)) + ".wav"
|
||
cmd := exec.Command(decoder, "-in", silkPath, "-out", wavPath)
|
||
output, err := cmd.CombinedOutput()
|
||
if err != nil {
|
||
return "", fmt.Errorf("silkdecode执行失败: %v, 输出: %s", err, truncateText(string(output), 240))
|
||
}
|
||
info, err := os.Stat(wavPath)
|
||
if err != nil {
|
||
return "", fmt.Errorf("silkdecode未生成wav: %w", err)
|
||
}
|
||
if info.Size() <= 44 {
|
||
return "", fmt.Errorf("silkdecode生成的wav为空或损坏: %s", wavPath)
|
||
}
|
||
return wavPath, nil
|
||
}
|
||
|
||
// findSilkDecoder returns the path of the silk decoder executable.
//
// Search order: the current working directory, then the directory of the
// running executable. Within each directory every known binary name is tried
// first under tools/audio and then directly in the directory. If none of
// those files exists, "silkdecode" is looked up on PATH. An error is returned
// when nothing is found.
func findSilkDecoder() (string, error) {
	binaries := [...]string{"silkdecode.exe", "silk_decoder.exe", "silk-v3-decoder.exe"}

	// Directories that cannot be determined are simply skipped.
	roots := make([]string, 0, 2)
	if cwd, err := os.Getwd(); err == nil {
		roots = append(roots, cwd)
	}
	if self, err := os.Executable(); err == nil {
		roots = append(roots, filepath.Dir(self))
	}

	for _, root := range roots {
		for _, binary := range binaries {
			locations := [2]string{
				filepath.Join(root, "tools", "audio", binary),
				filepath.Join(root, binary),
			}
			for _, location := range locations {
				if _, err := os.Stat(location); err == nil {
					return location, nil
				}
			}
		}
	}

	if onPath, err := exec.LookPath("silkdecode"); err == nil {
		return onPath, nil
	}
	return "", fmt.Errorf("缺少随包语音转码组件 silkdecode.exe")
}
|
||
|
||
func audioSourceURLForParaformer(msg autoReplyMessage, path string) string {
|
||
for _, candidate := range []string{msg.MediaURL, path} {
|
||
candidate = strings.TrimSpace(candidate)
|
||
if strings.HasPrefix(strings.ToLower(candidate), "http://") || strings.HasPrefix(strings.ToLower(candidate), "https://") || strings.HasPrefix(strings.ToLower(candidate), "oss://") {
|
||
return candidate
|
||
}
|
||
}
|
||
return ""
|
||
}
|
||
|
||
// formatAudioFailures renders failure reasons as an error-message suffix.
// Entries are trimmed and blank ones dropped; the remaining entries are
// joined with " | " behind a fixed prefix. Returns "" when nothing remains.
func formatAudioFailures(failures []string) string {
	var kept []string
	for i := range failures {
		reason := strings.TrimSpace(failures[i])
		if reason == "" {
			continue
		}
		kept = append(kept, reason)
	}
	if len(kept) == 0 {
		return ""
	}
	return ";附加信息: " + strings.Join(kept, " | ")
}
|
||
|
||
// mediaKindForRawType maps a raw message type code to a media kind name.
// Codes 11042 through 11047 map, in order, to image, video, voice, file,
// location and link; every other code yields "non_text".
func mediaKindForRawType(rawType int) string {
	const firstCode, lastCode = 11042, 11047
	kinds := [...]string{"image", "video", "voice", "file", "location", "link"}
	if rawType < firstCode || rawType > lastCode {
		return "non_text"
	}
	return kinds[rawType-firstCode]
}
|
||
|
||
func mediaTextDescription(msg autoReplyMessage) string {
|
||
parts := make([]string, 0, 4)
|
||
if content := strings.TrimSpace(msg.Content); content != "" && !strings.HasPrefix(content, "[") {
|
||
parts = append(parts, content)
|
||
}
|
||
if msg.MediaFileName != "" {
|
||
parts = append(parts, "文件:"+msg.MediaFileName)
|
||
}
|
||
if msg.MediaKind != "" && len(parts) == 0 {
|
||
parts = append(parts, nonTextMessageDescription(msg))
|
||
}
|
||
return strings.Join(parts, "\n")
|
||
}
|
||
|
||
func mediaRecognitionFallbackAnswer(msg autoReplyMessage) string {
|
||
switch msg.MediaKind {
|
||
case "voice":
|
||
return "我这边暂时无法识别这条语音内容,麻烦您补充一句文字说明,我继续帮您处理。"
|
||
case "image", "emoji", "video":
|
||
return "我这边暂时无法识别这条图片/视频内容,麻烦您补充一句文字说明,我继续帮您处理。"
|
||
default:
|
||
return "我这边暂时无法识别这条内容,麻烦您补充一句文字说明,我继续帮您处理。"
|
||
}
|
||
}
|
||
|
||
func ensureAutoReplyMediaLocalPath(msg autoReplyMessage) (string, error) {
|
||
if path := strings.TrimSpace(msg.MediaLocalPath); path != "" {
|
||
if _, err := os.Stat(path); err == nil {
|
||
return path, nil
|
||
}
|
||
}
|
||
ext := mediaExtForMessage(msg)
|
||
base := msg.MediaFileID
|
||
if base == "" {
|
||
base = filepath.Base(strings.TrimSpace(msg.MediaURL))
|
||
}
|
||
if base == "" || base == "." || base == string(filepath.Separator) {
|
||
base = fmt.Sprintf("%s_%d", msg.MediaKind, msg.RawType)
|
||
}
|
||
savePath := generateSavePath("auto_reply_media", base, ext)
|
||
if savePath == "" {
|
||
return "", fmt.Errorf("failed to create media save path")
|
||
}
|
||
if msg.MediaURL != "" {
|
||
if msg.MediaAESKey != "" || msg.MediaAuthKey != "" || msg.MediaSize > 0 {
|
||
if DownloadMediaFileForClient(uint32(msg.ClientID), msg.MediaURL, msg.MediaAuthKey, msg.MediaAESKey, int(msg.MediaSize), savePath) {
|
||
if _, err := os.Stat(savePath); err == nil {
|
||
return savePath, nil
|
||
}
|
||
return "", fmt.Errorf("media download reported success but file missing: %s", savePath)
|
||
}
|
||
}
|
||
if err := downloadPlainMedia(msg.MediaURL, savePath); err == nil {
|
||
return savePath, nil
|
||
}
|
||
}
|
||
if msg.MediaFileID != "" {
|
||
if DownloadFileByFileIdForClient(uint32(msg.ClientID), msg.MediaAESKey, msg.MediaFileID, savePath, int(msg.MediaSize), msg.MediaFileType) {
|
||
if _, err := os.Stat(savePath); err == nil {
|
||
return savePath, nil
|
||
}
|
||
return "", fmt.Errorf("file_id download reported success but file missing: %s", savePath)
|
||
}
|
||
}
|
||
return "", fmt.Errorf("media download failed")
|
||
}
|
||
|
||
// downloadPlainMedia fetches url with a plain HTTP GET and writes the response
// body to savePath, creating the parent directory when needed.
//
// It returns an error when the request fails, the status is not 2xx, the
// directory or file cannot be created, or the body cannot be fully written.
// The whole transfer is bounded by a timeout so an unresponsive server cannot
// block the caller forever, and a partially written file is removed on
// failure so callers never pick up truncated media.
func downloadPlainMedia(url string, savePath string) error {
	// A client with a nil Transport uses http.DefaultTransport, so connection
	// reuse is unaffected by building the client per call.
	client := &http.Client{Timeout: 2 * time.Minute}
	resp, err := client.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("download status %d", resp.StatusCode)
	}
	if err := os.MkdirAll(filepath.Dir(savePath), 0755); err != nil {
		return err
	}
	file, err := os.Create(savePath)
	if err != nil {
		return err
	}

	if _, err := io.Copy(file, resp.Body); err != nil {
		// Best-effort cleanup; the copy error is the one worth reporting.
		_ = file.Close()
		_ = os.Remove(savePath)
		return err
	}
	// Close can report a deferred write failure, so it must be checked.
	if err := file.Close(); err != nil {
		_ = os.Remove(savePath)
		return err
	}
	return nil
}
|
||
|
||
// imageDataURLFromFile reads the file at path and returns it as a base64
// "data:" URL with an image MIME type.
//
// The MIME type is taken from the file content when the content is
// recognised as an image, because downloaded media is often stored under a
// default extension that does not match the real format. Otherwise the file
// extension is consulted, and "image/jpeg" is the last resort.
//
// Returns an error when the file cannot be read or is empty.
func imageDataURLFromFile(path string) (string, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return "", err
	}
	if len(data) == 0 {
		return "", fmt.Errorf("empty image file")
	}

	mimeType := http.DetectContentType(data)
	if !strings.HasPrefix(mimeType, "image/") {
		mimeType = mime.TypeByExtension(strings.ToLower(filepath.Ext(path)))
		// Drop any parameters such as "; charset=..." from the table entry.
		if semi := strings.IndexByte(mimeType, ';'); semi >= 0 {
			mimeType = strings.TrimSpace(mimeType[:semi])
		}
	}
	if !strings.HasPrefix(mimeType, "image/") {
		mimeType = "image/jpeg"
	}
	return "data:" + mimeType + ";base64," + base64.StdEncoding.EncodeToString(data), nil
}
|
||
|
||
// audioDataURLFromFile reads the file at path and returns it as a base64
// "data:" URL.
//
// The MIME type for the audio extensions this pipeline produces (.silk, .amr,
// .mp3, .wav, .m4a; matched case-insensitively) comes from a fixed table so
// the result does not depend on the host's MIME database. Other extensions
// use the system table and, failing that, content sniffing, which itself
// falls back to "application/octet-stream".
//
// Returns an error when the file cannot be read or is empty.
func audioDataURLFromFile(path string) (string, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return "", err
	}
	if len(data) == 0 {
		return "", fmt.Errorf("empty audio file")
	}

	ext := strings.ToLower(filepath.Ext(path))
	var mimeType string
	switch ext {
	case ".silk":
		mimeType = "audio/silk"
	case ".amr":
		mimeType = "audio/amr"
	case ".mp3":
		mimeType = "audio/mpeg"
	case ".wav":
		mimeType = "audio/wav"
	case ".m4a":
		mimeType = "audio/mp4"
	default:
		mimeType = mime.TypeByExtension(ext)
		if mimeType == "" {
			mimeType = http.DetectContentType(data)
		}
	}
	return "data:" + mimeType + ";base64," + base64.StdEncoding.EncodeToString(data), nil
}
|
||
|
||
func mediaExtForMessage(msg autoReplyMessage) string {
|
||
if ext := filepath.Ext(msg.MediaFileName); ext != "" {
|
||
return ext
|
||
}
|
||
if ext := filepath.Ext(strings.TrimSpace(msg.MediaURL)); ext != "" && len(ext) <= 8 {
|
||
return ext
|
||
}
|
||
switch msg.MediaKind {
|
||
case "voice":
|
||
return ".silk"
|
||
case "video":
|
||
return ".mp4"
|
||
case "file":
|
||
return ".bin"
|
||
default:
|
||
return ".jpg"
|
||
}
|
||
}
|
||
|
||
func fillMediaFieldsFromValue(msg *autoReplyMessage, value interface{}) {
|
||
if msg == nil {
|
||
return
|
||
}
|
||
cdn := firstMediaCdnMap(value)
|
||
if len(cdn) == 0 {
|
||
return
|
||
}
|
||
msg.MediaAESKey = firstNonEmptyString(cdn["aes_key"], cdn["aesKey"])
|
||
msg.MediaAuthKey = firstNonEmptyString(cdn["auth_key"], cdn["authKey"])
|
||
msg.MediaFileID = firstNonEmptyString(cdn["file_id"], cdn["fileId"])
|
||
msg.MediaFileName = firstNonEmptyString(cdn["file_name"], cdn["fileName"], cdn["name"])
|
||
if path := firstLocalMediaPathFromValue(cdn); path != "" {
|
||
msg.MediaLocalPath = path
|
||
}
|
||
msg.MediaFileType = intFromAny(firstNonNil(cdn["file_type"], cdn["fileType"]))
|
||
msg.MediaSize = int64(intFromAny(firstNonNil(cdn["size"], cdn["file_size"], cdn["fileSize"])))
|
||
if msg.MediaURL == "" {
|
||
msg.MediaURL = firstMediaURLFromValue(cdn)
|
||
}
|
||
}
|
||
|
||
func firstVoiceTextFromValue(value interface{}) string {
|
||
switch v := value.(type) {
|
||
case map[string]interface{}:
|
||
for _, key := range []string{
|
||
"voice_text", "voiceText", "voice_to_text", "voiceToText",
|
||
"translate_text", "translateText", "translated_text", "translatedText",
|
||
"trans_text", "transText", "transcript", "transcription",
|
||
"recognition_text", "recognitionText", "asr_text", "asrText",
|
||
"speech_text", "speechText", "text_content", "textContent",
|
||
} {
|
||
if text := cleanVoiceTranscript(stringFromAny(v[key])); text != "" {
|
||
return text
|
||
}
|
||
}
|
||
for _, item := range v {
|
||
if text := firstVoiceTextFromValue(item); text != "" {
|
||
return text
|
||
}
|
||
}
|
||
case []interface{}:
|
||
for _, item := range v {
|
||
if text := firstVoiceTextFromValue(item); text != "" {
|
||
return text
|
||
}
|
||
}
|
||
}
|
||
return ""
|
||
}
|
||
|
||
// cleanVoiceTranscript normalises a transcript candidate.
//
// The text is trimmed; blank text and text wrapped in "{{" ... "}}" yield "".
// Otherwise each known label prefix is stripped in list order, trimming
// whitespace after every step, and the remainder is returned.
func cleanVoiceTranscript(text string) string {
	cleaned := strings.TrimSpace(text)
	switch {
	case cleaned == "":
		return ""
	case strings.HasPrefix(cleaned, "{{") && strings.HasSuffix(cleaned, "}}"):
		return ""
	}

	labels := [...]string{"转文字完成", "转文字:", "转文字:", "语音转文字:", "语音转文字:", "转写:", "转写:"}
	for i := 0; i < len(labels); i++ {
		cleaned = strings.TrimSpace(strings.TrimPrefix(cleaned, labels[i]))
	}
	return cleaned
}
|
||
|
||
// firstMediaCdnMap searches a decoded JSON-like value for a CDN descriptor.
//
// For a map, the value stored under "cdn", "cdnData" or "c2cCdnData" (checked
// in that order) is returned as soon as it is itself a map, even an empty
// one. Otherwise the map's values are searched recursively, accepting only
// non-empty results. Slices are searched element by element in order. Returns
// nil when nothing is found.
func firstMediaCdnMap(value interface{}) map[string]interface{} {
	switch node := value.(type) {
	case []interface{}:
		for i := range node {
			if found := firstMediaCdnMap(node[i]); len(found) > 0 {
				return found
			}
		}

	case map[string]interface{}:
		for _, key := range [...]string{"cdn", "cdnData", "c2cCdnData"} {
			child, isMap := node[key].(map[string]interface{})
			if isMap {
				return child
			}
		}
		for key := range node {
			if found := firstMediaCdnMap(node[key]); len(found) > 0 {
				return found
			}
		}
	}
	return nil
}
|
||
|
||
func firstNonEmptyString(values ...interface{}) string {
|
||
for _, value := range values {
|
||
text := stringFromAny(value)
|
||
if strings.TrimSpace(text) != "" {
|
||
return strings.TrimSpace(text)
|
||
}
|
||
}
|
||
return ""
|
||
}
|
||
|
||
func firstLocalMediaPathFromValue(value interface{}) string {
|
||
switch v := value.(type) {
|
||
case map[string]interface{}:
|
||
for _, key := range []string{"local_path", "localPath", "path", "file_name", "fileName"} {
|
||
text := strings.TrimSpace(stringFromAny(v[key]))
|
||
if text != "" && filepath.IsAbs(text) {
|
||
return text
|
||
}
|
||
}
|
||
for _, item := range v {
|
||
if path := firstLocalMediaPathFromValue(item); path != "" {
|
||
return path
|
||
}
|
||
}
|
||
case []interface{}:
|
||
for _, item := range v {
|
||
if path := firstLocalMediaPathFromValue(item); path != "" {
|
||
return path
|
||
}
|
||
}
|
||
}
|
||
return ""
|
||
}
|