Files
get_wechat/chatlog_fastAPI/services/media_parser.py
yuanzhipeng 646efa132e ```
feat(api): 添加万川平台模型配置获取和同步功能

- 新增 getWanchuanModelConfig 函数,按模型编码获取平台模型配置
- 新增 syncWanchuanModelToSettings 函数,从万川平台拉取模型配置并写入后端 AI 设置
- 支持按用途分多个模型编码(generic/vision/voice)分别同步配置
- 配置失败时跳过对应字段,不影响其他模型同步

feat(settings): 重构AI模型配置界面支持多模块分组

- 将AI配置按话题分析、报告生成、视觉、语音四个模块分组展示
- 每个模块独立配置接口地址、密钥和模型名称
- 添加从万川平台获取配置的按钮和同步功能
- 优化配置状态指示和错误提示信息

refactor(config): 扩展AI配置支持独立的语音视觉报告网关

- 新增 voice_base_url/voice_api_key 配置项
- 新增 vision_base_url/vision_api_key 配置项
- 新增 summary_base_url/summary_api_key 配置项
- 留空时回退到 ai_base_url/ai_api_key 兼容单网关场景

refactor(http): 统一使用共享HTTP客户端减少连接开销

- 替换各处 httpx.AsyncClient 为 shared_client
- 在 lifespan 中正确关闭共享客户端资源
- 优化 get_current_wxid 和 health 检查中的HTTP请求

refactor(ai): 按用途缓存AI客户端支持不同网关配置

- 重构 get_openai_client 支持按(base_url, api_key)缓存
- 新增 get_client_for 函数按用途获取对应客户端
- 支持语音、视觉、报告等不同用途使用独立网关和密钥
```
2026-06-24 20:34:10 +08:00

183 lines
7.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import asyncio
import base64
import logging
import httpx
from fastapi import HTTPException
from services.ai_client import get_client_for
from services.media_resolver import resolve_media
from services.runtime_settings import get_ai_settings
log = logging.getLogger(__name__)
# Default gateway for asynchronous voice ASR (Alibaba Cloud). Used as the
# fallback when voice_base_url is empty. The submit/poll sub-paths are appended
# by the code, so configuration only needs to go as far as the .../api/v1 level.
DEFAULT_ASR_BASE_URL = "https://dashscope.aliyuncs.com/api/v1"
async def parse_media(kind: str, key: str) -> dict:
    """Convert one chatlog media object into text.

    Args:
        kind: Media category; one of "voice", "image" or "video".
        key: Chatlog media key identifying the object to resolve.

    Returns:
        A dict with a single "text" entry: the transcription for voice, or
        the generated description for image/video.

    Raises:
        HTTPException: 400 for an unsupported kind or an empty key; 503 when
            the API key or model name needed for this kind is not configured.
    """
    if kind not in {"voice", "image", "video"}:
        raise HTTPException(400, "不支持的媒体类型")
    if not key:
        raise HTTPException(400, "媒体 key 不能为空")

    settings = await get_ai_settings()
    is_voice = kind == "voice"

    # Voice and vision each have their own key; an empty one falls back to
    # the global ai_api_key. The model name has no fallback.
    if is_voice:
        voice_key = settings.get("voice_api_key") or settings.get("ai_api_key")
        if not voice_key:
            raise HTTPException(503, "AI 服务未配置,请在设置页填写语音密钥或 AI API Key")
        if not settings.get("voice_model"):
            raise HTTPException(503, "语音模型未配置,请在设置页填写语音模型名称,例如 paraformer-v2")
    else:
        vision_key = settings.get("vision_api_key") or settings.get("ai_api_key")
        if not vision_key:
            raise HTTPException(503, "AI 服务未配置,请在设置页填写视觉密钥或 AI API Key")
        if not settings.get("vision_model"):
            raise HTTPException(503, "视觉模型未配置,请在设置页填写视觉模型名称,例如 qwen-vl-plus")

    resolved = await resolve_media(kind, key)
    if is_voice:
        text = await _parse_voice(resolved.bytes, resolved.content_type)
    else:
        text = await _parse_visual(kind, resolved.bytes, resolved.content_type)
    return {"text": text}
def _audio_mime(content_type: str) -> str:
"""由 chatlog 返回的 content_type 推断音频 MIME用于 data URI"""
ct = content_type.lower()
if "silk" in ct or "x-silk" in ct:
return "audio/silk"
if "amr" in ct:
return "audio/amr"
if "ogg" in ct or "opus" in ct:
return "audio/ogg"
if "wav" in ct:
return "audio/wav"
return "audio/mpeg"
def _asr_json(resp: httpx.Response, url: str) -> dict:
    """Decode an ASR HTTP response as a JSON object, failing with a useful error.

    A non-JSON response (empty body, HTML error page, gateway 404) would make
    a bare ``resp.json()`` raise a decode error that hides the real cause.
    This helper instead raises an ``HTTPException`` carrying the HTTP status
    code and a snippet of the body, which makes misconfiguration (for example
    a base URL pointing at ``compatible-mode/v1``) easy to diagnose.

    Args:
        resp: The HTTP response to decode.
        url: The requested URL; used only in the error message.

    Returns:
        The decoded JSON object.

    Raises:
        HTTPException: 500 when the body is not valid JSON, or when it is
            valid JSON but not an object (callers rely on ``.get``).
    """
    try:
        data = resp.json()
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError.
        body = (resp.text or "").strip()[:300]
        raise HTTPException(
            500,
            f"ASR 接口返回非 JSON (HTTP {resp.status_code}) @ {url}: {body or '(空响应)'}。"
            "请检查语音接口地址是否为异步 ASR 网关(如 .../api/v1)及密钥是否正确。",
        ) from exc
    if not isinstance(data, dict):
        # Valid JSON but not an object (e.g. a list or string): the declared
        # return type is dict and callers would otherwise crash on .get().
        raise HTTPException(
            500,
            f"ASR 接口返回的 JSON 不是对象 (HTTP {resp.status_code}) @ {url}: {str(data)[:300]}",
        )
    return data
async def _parse_voice(media_bytes: bytes, content_type: str) -> str:
    """Transcribe a voice clip via the asynchronous ASR protocol.

    The flow is: submit a task, poll it once per second (up to 30 times),
    then read the transcription from the task result.

    Configuration is read from the AI settings:

    * Base URL: ``voice_base_url``; when empty (or blank) it falls back to
      ``DEFAULT_ASR_BASE_URL``. It deliberately never falls back to
      ``ai_base_url``: per the original author's note that is an
      OpenAI-compatible chat endpoint, a different service, and appending the
      ASR path to it results in a 404.
    * Submit endpoint: ``{base}/services/audio/asr/transcription``;
      poll endpoint: ``{base}/tasks/{task_id}``. Both sub-paths are appended
      here, so the configured URL only goes as far as ``.../api/v1``.
    * API key: ``voice_api_key``, falling back to ``ai_api_key``.
    * Model: ``voice_model``.

    Args:
        media_bytes: Raw audio bytes; sent inline as a base64 data URI.
        content_type: Content type of the audio, used to pick the data-URI
            MIME type.

    Returns:
        The recognized text, or the placeholder "(识别结果为空)" when the task
        succeeded but produced no text.

    Raises:
        HTTPException: 500 when submission fails, no task id is returned, a
            response is not JSON, the task fails or is cancelled, or the task
            has not finished after 30 polls.
    """
    ai = await get_ai_settings()
    # Strip configured values: leading/trailing whitespace has been observed
    # in synced settings (a leading space in the API key broke authentication).
    # Strip *before* applying the default so a blank voice_base_url falls back
    # instead of producing an empty base URL.
    base = ((ai.get("voice_base_url") or "").strip() or DEFAULT_ASR_BASE_URL).rstrip("/")
    api_key = (ai.get("voice_api_key") or ai.get("ai_api_key") or "").strip()
    voice_model = (ai.get("voice_model") or "").strip()

    b64_audio = base64.b64encode(media_bytes).decode()
    data_uri = f"data:{_audio_mime(content_type)};base64,{b64_audio}"

    asr_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    submit_url = f"{base}/services/audio/asr/transcription"

    async with httpx.AsyncClient(timeout=60) as http:
        submit = await http.post(
            submit_url,
            headers={**asr_headers, "X-DashScope-Async": "enable"},
            json={
                "model": voice_model,
                "input": {"file_urls": [data_uri]},
                "parameters": {"language_hints": ["zh", "en"]},
            },
            timeout=30,
        )
        submit_data = _asr_json(submit, submit_url)
        if submit.status_code not in (200, 201):
            raise HTTPException(500, f"提交识别任务失败 (HTTP {submit.status_code}): {submit_data.get('message', submit_data)}")
        # `or {}` also covers an explicit JSON null for "output".
        task_id = (submit_data.get("output") or {}).get("task_id")
        if not task_id:
            raise HTTPException(500, f"未获取到 task_id: {submit_data}")

        poll_url = f"{base}/tasks/{task_id}"
        for _ in range(30):
            await asyncio.sleep(1)
            poll = await http.get(poll_url, headers=asr_headers, timeout=10)
            poll_data = _asr_json(poll, poll_url)
            output = poll_data.get("output") or {}
            status = output.get("task_status", "")

            if status == "SUCCEEDED":
                results = output.get("results") or []
                log.info("[media_parser] ASR SUCCEEDED results: %s", results)
                if not results:
                    return "(识别结果为空)"
                trans_url = results[0].get("transcription_url", "")
                if trans_url:
                    trans_resp = await http.get(trans_url, timeout=10)
                    # Use the shared helper so a non-JSON body (error page,
                    # expired link) yields a diagnosable error rather than a
                    # raw decode exception. The query string is dropped from
                    # the reported URL in case it carries a signature
                    # (assumption - TODO confirm against the ASR provider).
                    trans_data = _asr_json(trans_resp, trans_url.split("?", 1)[0])
                    log.info("[media_parser] transcription_url content: %s", str(trans_data)[:500])
                    transcripts = trans_data.get("transcripts") or []
                    text = transcripts[0].get("text", "") if transcripts else ""
                else:
                    text = results[0].get("transcription", "")
                return text or "(识别结果为空)"

            # NOTE(review): the DashScope task API appears to report the
            # single-L spelling "CANCELED"; accept both spellings so a
            # cancelled task fails fast instead of polling until the timeout.
            if status in ("FAILED", "CANCELED", "CANCELLED"):
                raise HTTPException(500, f"识别任务失败: {output.get('message', status)}")

    raise HTTPException(500, "语音识别超时30秒")
async def _parse_visual(kind: str, media_bytes: bytes, content_type: str) -> str:
    """Ask the vision chat model for a short Chinese description of an image.

    The media bytes are embedded in the request as a base64 data URL. The
    MIME type is PNG or WebP when the content type mentions it, otherwise
    JPEG. For any kind other than "image" the prompt refers to a video
    screenshot.

    Args:
        kind: "image" selects the image prompt; anything else selects the
            video-screenshot prompt.
        media_bytes: Raw bytes of the picture to describe.
        content_type: Content type of the media, used to pick the MIME type.

    Returns:
        The model's reply text, or an empty string when the reply has no
        content.
    """
    encoded = base64.b64encode(media_bytes).decode()

    lowered = content_type.lower()
    known_mimes = (("png", "image/png"), ("webp", "image/webp"))
    # First marker found in the content type wins; JPEG is the default.
    mime = next(
        (candidate for marker, candidate in known_mimes if marker in lowered),
        "image/jpeg",
    )
    data_url = f"data:{mime};base64,{encoded}"

    if kind == "image":
        prompt = "请用中文简洁描述这张图片的内容。"
    else:
        prompt = "请用中文简洁描述这个视频截图的内容。"

    vision_client, settings = await get_client_for("vision")
    user_message = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": data_url}},
            {"type": "text", "text": prompt},
        ],
    }
    completion = await vision_client.chat.completions.create(
        model=settings["vision_model"],
        messages=[user_message],
        max_tokens=300,
    )
    return completion.choices[0].message.content or ""