import asyncio
import base64
import logging

import httpx
from fastapi import HTTPException

from services.ai_client import get_client_for
from services.media_resolver import resolve_media
from services.runtime_settings import get_ai_settings

log = logging.getLogger(__name__)

# Default gateway for asynchronous speech recognition (Alibaba Cloud).
# Used when voice_base_url is empty. The submit/poll sub-paths are appended by
# the code, so the setting only needs to be filled in up to the .../api/v1 level.
DEFAULT_ASR_BASE_URL = "https://dashscope.aliyuncs.com/api/v1"


def _setting(ai, name: str) -> str:
    """Return setting ``name`` from ``ai`` as a stripped string.

    ``ai`` is the settings object returned by ``get_ai_settings()``; only its
    ``.get`` method is used. Missing, ``None`` and non-string values yield
    ``""``, so a whitespace-only value counts as "not configured" and the
    callers' ``or`` fallbacks behave as intended.
    """
    value = ai.get(name)
    return value.strip() if isinstance(value, str) else ""


async def parse_media(kind: str, key: str) -> dict:
    """Parse one chatlog media object into text.

    Args:
        kind: ``"voice"``, ``"image"`` or ``"video"``.
        key: chatlog media key.

    Returns:
        A dict of the form ``{"text": ...}``.

    Raises:
        HTTPException: 400 for an unsupported ``kind`` or an empty ``key``;
            503 when the API key or model name needed for ``kind`` is not
            configured.
    """
    if kind not in {"voice", "image", "video"}:
        raise HTTPException(400, "不支持的媒体类型")
    if not key:
        raise HTTPException(400, "媒体 key 不能为空")

    ai = await get_ai_settings()

    # Voice and vision each have their own url/key; an empty key falls back
    # to the global ai_api_key.
    if kind == "voice":
        if not (_setting(ai, "voice_api_key") or _setting(ai, "ai_api_key")):
            raise HTTPException(503, "AI 服务未配置,请在设置页填写语音密钥或 AI API Key")
        if not _setting(ai, "voice_model"):
            raise HTTPException(503, "语音模型未配置,请在设置页填写语音模型名称,例如 paraformer-v2")
    else:  # "image" or "video" (validated above)
        if not (_setting(ai, "vision_api_key") or _setting(ai, "ai_api_key")):
            raise HTTPException(503, "AI 服务未配置,请在设置页填写视觉密钥或 AI API Key")
        if not _setting(ai, "vision_model"):
            raise HTTPException(503, "视觉模型未配置,请在设置页填写视觉模型名称,例如 qwen-vl-plus")

    media = await resolve_media(kind, key)

    if kind == "voice":
        return {"text": await _parse_voice(media.bytes, media.content_type)}
    return {"text": await _parse_visual(kind, media.bytes, media.content_type)}


def _audio_mime(content_type: str) -> str:
    """Infer the audio MIME type (for a data URI) from chatlog's content type.

    Matching is by substring on the lower-cased value; anything unrecognised
    (including an empty or ``None`` content type) maps to ``audio/mpeg``.
    """
    ct = (content_type or "").lower()
    # "x-silk" contains "silk", so a single substring test covers both.
    if "silk" in ct:
        return "audio/silk"
    if "amr" in ct:
        return "audio/amr"
    if "ogg" in ct or "opus" in ct:
        return "audio/ogg"
    if "wav" in ct:
        return "audio/wav"
    return "audio/mpeg"


def _asr_json(resp: httpx.Response, url: str) -> dict:
    """Safely decode an ASR response into a JSON object.

    When the response is not JSON (empty body / HTML error page / gateway
    404), calling ``.json()`` directly raises a decode error that hides the
    real cause (HTTP status code + body). This helper raises an
    ``HTTPException`` carrying the status code and a body excerpt instead,
    which makes misconfiguration (for example an address pointing at
    compatible-mode/v1) easy to diagnose. A body that is valid JSON but not
    an object is reported the same way, because callers rely on ``.get``.

    Args:
        resp: The HTTP response to decode.
        url: The requested URL, included in the error message.

    Returns:
        The decoded JSON object.

    Raises:
        HTTPException: 500 when the body is not a JSON object.
    """
    try:
        data = resp.json()
    except ValueError:
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        data = None
    if isinstance(data, dict):
        return data

    body = (resp.text or "").strip()[:300]
    raise HTTPException(
        500,
        f"ASR 接口返回非 JSON (HTTP {resp.status_code}) @ {url}:{body or '(空响应)'}。"
        "请检查语音接口地址是否为异步 ASR 网关(如 .../api/v1)及密钥是否正确。",
    )


async def _parse_voice(media_bytes: bytes, content_type: str) -> str:
    """Transcribe speech using the asynchronous ASR protocol.

    Flow: submit a task, poll it, then fetch the result.

    Endpoints are derived from ``voice_base_url``; when that is empty the
    default native gateway ``DEFAULT_ASR_BASE_URL`` is used (never
    ``ai_base_url``). Submit endpoint:
    ``{base}/services/audio/asr/transcription``; poll endpoint:
    ``{base}/tasks/{id}``. The API key is ``voice_api_key``, falling back to
    ``ai_api_key`` when empty.

    Args:
        media_bytes: Raw audio bytes.
        content_type: Content type reported for the media; used to pick the
            MIME type of the data URI.

    Returns:
        The recognised text, or a placeholder string when the result is empty.

    Raises:
        HTTPException: 500 when submission fails, no task id is returned, the
            task ends as FAILED/CANCELLED, a response is not a JSON object, or
            the task has not finished after 30 polls.
    """
    ai = await get_ai_settings()

    # Values are stripped BEFORE the fallback: configuration/sync has been
    # observed to introduce leading/trailing spaces (a leading space in the
    # API key broke authentication), and a whitespace-only value must fall
    # back rather than be used as-is.
    # Async ASR goes through the native /api/v1 gateway, which is a different
    # service from ai_base_url (the OpenAI-compatible chat endpoint under
    # .../compatible-mode/...). An empty voice_base_url must fall back to
    # DEFAULT_ASR_BASE_URL and never to ai_base_url, otherwise the URL becomes
    # .../compatible-mode/.../asr and returns 404.
    base = _setting(ai, "voice_base_url").rstrip("/") or DEFAULT_ASR_BASE_URL
    api_key = _setting(ai, "voice_api_key") or _setting(ai, "ai_api_key")
    voice_model = _setting(ai, "voice_model")

    encoded_audio = base64.b64encode(media_bytes).decode()
    # NOTE(review): the audio is sent inline as a data: URI in file_urls —
    # confirm the configured gateway accepts this rather than requiring an
    # http(s) URL.
    data_uri = f"data:{_audio_mime(content_type)};base64,{encoded_audio}"

    asr_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    submit_url = f"{base}/services/audio/asr/transcription"

    async with httpx.AsyncClient(timeout=60) as http:
        submit = await http.post(
            submit_url,
            headers={**asr_headers, "X-DashScope-Async": "enable"},
            json={
                "model": voice_model,
                "input": {"file_urls": [data_uri]},
                "parameters": {"language_hints": ["zh", "en"]},
            },
            timeout=30,
        )
        submit_data = _asr_json(submit, submit_url)
        if submit.status_code not in (200, 201):
            raise HTTPException(
                500,
                f"提交识别任务失败 (HTTP {submit.status_code}): {submit_data.get('message', submit_data)}",
            )

        # `or {}` also covers an explicit `"output": null` in the payload.
        task_id = (submit_data.get("output") or {}).get("task_id")
        if not task_id:
            raise HTTPException(500, f"未获取到 task_id: {submit_data}")

        poll_url = f"{base}/tasks/{task_id}"
        for _ in range(30):
            await asyncio.sleep(1)
            poll = await http.get(poll_url, headers=asr_headers, timeout=10)
            poll_data = _asr_json(poll, poll_url)
            output = poll_data.get("output") or {}
            status = output.get("task_status", "")

            if status == "SUCCEEDED":
                results = output.get("results") or []
                log.info("[media_parser] ASR SUCCEEDED results: %s", results)
                if not results:
                    return "(识别结果为空)"

                trans_url = results[0].get("transcription_url", "")
                if trans_url:
                    trans_resp = await http.get(trans_url, timeout=10)
                    # Decode via the shared helper so a non-JSON body surfaces
                    # as a diagnostic HTTPException instead of a raw decode error.
                    trans_data = _asr_json(trans_resp, trans_url)
                    log.info(
                        "[media_parser] transcription_url content: %s",
                        str(trans_data)[:500],
                    )
                    transcripts = trans_data.get("transcripts") or []
                    text = transcripts[0].get("text", "") if transcripts else ""
                else:
                    text = results[0].get("transcription", "")
                return text or "(识别结果为空)"

            if status in ("FAILED", "CANCELLED"):
                raise HTTPException(500, f"识别任务失败: {output.get('message', status)}")

    raise HTTPException(500, "语音识别超时(30秒)")


async def _parse_visual(kind: str, media_bytes: bytes, content_type: str) -> str:
    """Describe an image (or video screenshot) with the vision chat model.

    The bytes are always sent as an image data URL. The MIME type is inferred
    from ``content_type`` by substring: ``png`` -> image/png, ``webp`` ->
    image/webp, anything else (including empty/``None``) -> image/jpeg.

    Args:
        kind: ``"image"`` selects the image prompt; any other value selects
            the video-screenshot prompt.
        media_bytes: Raw media bytes. For ``"video"`` these are presumably a
            still frame rather than the video itself — TODO confirm against
            ``resolve_media``.
        content_type: Content type reported for the media.

    Returns:
        The model's description, or ``""`` when the response has no content.
    """
    ct = (content_type or "").lower()
    if "png" in ct:
        mime = "image/png"
    elif "webp" in ct:
        mime = "image/webp"
    else:
        mime = "image/jpeg"

    b64 = base64.b64encode(media_bytes).decode()
    data_url = f"data:{mime};base64,{b64}"
    prompt = (
        "请用中文简洁描述这张图片的内容。"
        if kind == "image"
        else "请用中文简洁描述这个视频截图的内容。"
    )

    client, ai = await get_client_for("vision")
    resp_ai = await client.chat.completions.create(
        model=ai["vision_model"],
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": data_url}},
                    {"type": "text", "text": prompt},
                ],
            }
        ],
        max_tokens=300,
    )
    # An empty choices list would otherwise raise IndexError.
    if not resp_ai.choices:
        return ""
    return resp_ai.choices[0].message.content or ""