Initial upload for secondary development
This commit is contained in:
0
chatlog_fastAPI/services/__init__.py
Normal file
0
chatlog_fastAPI/services/__init__.py
Normal file
31
chatlog_fastAPI/services/ai_client.py
Normal file
31
chatlog_fastAPI/services/ai_client.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import httpx
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
from services.runtime_settings import get_ai_settings
|
||||
|
||||
# Both caches are keyed by (base_url, api_key) so a settings change yields a new client.
_client_cache: dict[tuple[str, str], AsyncOpenAI] = {}
_http_client_cache: dict[tuple[str, str], httpx.AsyncClient] = {}


async def get_openai_client() -> tuple[AsyncOpenAI, dict]:
    """Return a cached ``AsyncOpenAI`` client together with the current AI settings.

    The client is cached per ``(ai_base_url, ai_api_key)`` pair. When the
    settings no longer match a cached entry, every previously cached HTTP
    client is closed and a fresh client is built.

    Returns:
        A ``(client, settings)`` tuple, where ``settings`` is the dict returned
        by ``get_ai_settings()``.
    """
    settings = await get_ai_settings()
    api_key = settings.get("ai_api_key") or ""
    cache_key = (settings.get("ai_base_url") or "", api_key)

    cached = _client_cache.get(cache_key)
    if cached is not None:
        return cached, settings

    # Detach the stale entries BEFORE awaiting: awaiting aclose() while
    # iterating the live dict lets a concurrent call mutate it mid-iteration
    # ("dictionary changed size during iteration").
    stale_http_clients = list(_http_client_cache.values())
    _client_cache.clear()
    _http_client_cache.clear()
    for stale in stale_http_clients:
        await stale.aclose()

    # Another task may have created the client for this key while we awaited.
    if cache_key not in _client_cache:
        http_client = httpx.AsyncClient(timeout=httpx.Timeout(600.0, connect=30.0))
        _http_client_cache[cache_key] = http_client
        _client_cache[cache_key] = AsyncOpenAI(
            # The SDK needs a non-empty key; "missing" is a placeholder.
            api_key=api_key or "missing",
            base_url=settings.get("ai_base_url"),
            http_client=http_client,
        )

    return _client_cache[cache_key], settings
|
||||
203
chatlog_fastAPI/services/chatlog_client.py
Normal file
203
chatlog_fastAPI/services/chatlog_client.py
Normal file
@@ -0,0 +1,203 @@
|
||||
import httpx
|
||||
import asyncio
|
||||
from typing import List
|
||||
from config import settings
|
||||
|
||||
|
||||
class ChatlogHTTPError(RuntimeError):
    """Error raised when the chatlog service answers with an HTTP error status.

    The status code, HTTP method, request path and extracted error detail are
    kept as attributes so callers can branch on them.
    """

    def __init__(self, status_code: int, method: str, path: str, detail: str):
        message = f"chatlog HTTP {status_code}: {method} {path} body={detail!r}"
        super().__init__(message)
        self.status_code, self.method = status_code, method
        self.path, self.detail = path, detail
|
||||
|
||||
|
||||
class MessageIndexNotReady(RuntimeError):
    """Signal that chatlog's message time index cannot be queried yet.

    Raised when chatlog keeps answering "time range not found" for a message
    query even after a retry.
    """
|
||||
|
||||
|
||||
class ChatlogClient:
    """Async client for the chatlog service HTTP API.

    Every request opens a short-lived ``httpx.AsyncClient`` with
    ``trust_env=False`` against ``settings.chatlog_base_url``.
    """

    # User-facing text raised when the message time index is still unusable.
    _INDEX_NOT_READY_MESSAGE = (
        "自动解密仍在处理消息库,请稍后刷新聊天记录;如果长时间为空,请在微信里打开该聊天并翻看历史消息。"
    )

    def __init__(self):
        self.base = settings.chatlog_base_url
        # None = not looked up yet; "" = lookup failed or no contact DB listed.
        self._contact_db_file = None

    async def _request(
        self,
        method: str,
        path: str,
        *,
        timeout: float,
        params: dict | None = None,
        body: dict | None = None,
    ) -> dict:
        """Send one request and return the decoded JSON body.

        Raises:
            ChatlogHTTPError: chatlog answered with a 4xx/5xx status.
            RuntimeError: the request timed out or failed for any other reason.
        """
        try:
            async with httpx.AsyncClient(timeout=timeout, trust_env=False) as client:
                r = await client.request(method, f"{self.base}{path}", params=params, json=body)
                r.raise_for_status()
                return r.json()
        except httpx.TimeoutException as e:
            raise RuntimeError(f"chatlog timeout: {method} {path}") from e
        except httpx.HTTPStatusError as e:
            detail = self._response_detail(e.response)
            raise ChatlogHTTPError(e.response.status_code, method, path, detail) from e
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise RuntimeError(f"chatlog request failed: {e}") from e

    async def _get(self, path: str, params: dict, timeout: float = 30.0) -> dict:
        """GET ``path`` with query ``params`` and return the JSON body."""
        return await self._request("GET", path, timeout=timeout, params=params)

    async def _post(self, path: str, body: dict, timeout: float = 30.0) -> dict:
        """POST ``body`` as JSON to ``path`` and return the JSON body."""
        return await self._request("POST", path, timeout=timeout, body=body)

    def _response_detail(self, response: httpx.Response) -> str:
        """Extract a readable error detail from an error response.

        Prefers the ``error`` or ``detail`` field of a JSON object body and
        falls back to the stringified body or the raw response text.
        """
        try:
            body = response.json()
            if isinstance(body, dict):
                return str(body.get("error") or body.get("detail") or body)
            return str(body)
        except Exception:
            return response.text

    async def get_messages(
        self,
        talker: str,
        time: str = "",
        sender: str = "",
        keyword: str = "",
        min_seq: int = 0,
        limit: int = 100,
        offset: int = 0,
    ) -> dict:
        """Query messages of ``talker``.

        A 404 whose detail contains "not found" is retried once after 200 ms.

        Returns:
            The response dict, or ``{"total": ..., "items": ...}`` when chatlog
            returned a bare list. An empty result is returned when a plain
            "not found" persists after the retry.

        Raises:
            MessageIndexNotReady: the retry still reports "time range not found".
            ChatlogHTTPError: any other HTTP error.
        """
        params: dict = {
            "talker": talker,
            "limit": limit,
            "offset": offset,
            "format": "json",
            # Without an explicit range the widest possible window is sent.
            "time": time or "1970-01-01,2099-12-31",
        }
        if sender:
            params["sender"] = sender
        if keyword:
            params["keyword"] = keyword
        if min_seq > 0:
            params["min_seq"] = min_seq

        try:
            data = await self._get("/api/v1/chatlog", params)
        except ChatlogHTTPError as first_error:
            first_detail = first_error.detail.lower()
            if first_error.status_code != 404 or "not found" not in first_detail:
                raise
            first_was_time_range = "time range not found" in first_detail
            # chatlog sometimes reports a valid date window as missing while it is warming/querying.
            await asyncio.sleep(0.2)
            try:
                data = await self._get("/api/v1/chatlog", params)
            except ChatlogHTTPError as retry_error:
                retry_detail = retry_error.detail.lower()
                if retry_error.status_code != 404:
                    raise
                if "time range not found" in retry_detail:
                    raise MessageIndexNotReady(self._INDEX_NOT_READY_MESSAGE) from retry_error
                # Only a plain "not found" on BOTH attempts counts as "no messages".
                if not first_was_time_range and "not found" in retry_detail:
                    return {"total": 0, "items": []}
                raise

        if isinstance(data, dict):
            return data
        return {"total": len(data), "items": data}

    async def get_message(self, talker: str, seq: int) -> dict | None:
        """Fetch a single message by ``seq``; return ``None`` on HTTP 404.

        Raises:
            RuntimeError: on timeout or any other request failure.
        """
        try:
            async with httpx.AsyncClient(timeout=10.0, trust_env=False) as client:
                r = await client.get(
                    f"{self.base}/api/v1/chatlog/message",
                    params={"talker": talker, "seq": seq},
                )
                if r.status_code == 404:
                    return None
                r.raise_for_status()
                return r.json()
        except httpx.TimeoutException as e:
            raise RuntimeError("chatlog timeout: get_message") from e
        except Exception as e:
            raise RuntimeError(f"chatlog request failed: {e}") from e

    async def get_messages_batch(self, talker: str, seqs: List[int]) -> dict:
        """Fetch several messages of ``talker`` by their ``seqs`` in one call."""
        return await self._post("/api/v1/chatlog/batch", {"talker": talker, "seqs": seqs})

    async def get_chatrooms(self, keyword: str = "", limit: int = 100, offset: int = 0) -> dict:
        """List chatrooms, optionally filtered by ``keyword``."""
        params: dict = {"limit": limit, "offset": offset, "format": "json"}
        if keyword:
            params["keyword"] = keyword
        return await self._get("/api/v1/chatroom", params, timeout=10.0)

    async def get_contacts(self, keyword: str = "", limit: int = 100, offset: int = 0) -> dict:
        """List contacts, optionally filtered by ``keyword``."""
        params: dict = {"limit": limit, "offset": offset, "format": "json"}
        if keyword:
            params["keyword"] = keyword
        return await self._get("/api/v1/contact", params, timeout=10.0)

    async def get_chatroom_members(self, talker: str, time: str = "") -> dict:
        """Return the members of chatroom ``talker``, optionally for ``time``."""
        params: dict = {"talker": talker}
        if time:
            params["time"] = time
        return await self._get("/api/v1/chatroom/members", params)

    async def get_sessions(self, keyword: str = "", limit: int = 500) -> list:
        """Return the session list, unwrapping an ``items``/``data`` envelope."""
        params: dict = {"limit": limit, "format": "json"}
        if keyword:
            params["keyword"] = keyword
        data = await self._get("/api/v1/session", params, timeout=15.0)
        if isinstance(data, list):
            return data
        if not isinstance(data, dict):
            # e.g. a JSON ``null`` body: there is nothing to unwrap.
            return []
        return data.get("items", data.get("data", []))

    async def get_avatar_url(self, wxid: str) -> str:
        """Best-effort avatar URL lookup for ``wxid``; returns "" on any failure."""
        if self._contact_db_file is None:
            try:
                db_list = await self._get("/api/v1/db", {})
                self._contact_db_file = (db_list.get("contact") or [""])[0]
            except Exception:
                # NOTE(review): a transient failure is cached as "" for the
                # lifetime of this client, so avatars stay disabled until the
                # process restarts - confirm this is intended.
                self._contact_db_file = ""
        if not self._contact_db_file:
            return ""

        # The query endpoint takes raw SQL, so single quotes are doubled by hand.
        safe_wxid = wxid.replace("'", "''")
        sql = f"SELECT small_head_url, big_head_url FROM contact WHERE username='{safe_wxid}' LIMIT 1"
        params = {"group": "contact", "file": self._contact_db_file, "sql": sql}
        try:
            rows = await self._get("/api/v1/db/query", params, timeout=5.0)
            if rows:
                url = rows[0].get("small_head_url") or rows[0].get("big_head_url") or ""
                if url:
                    return url
        except Exception:
            # Avatars are optional decoration; never fail the caller over them.
            pass
        return ""

    async def get_db_paths(self) -> dict:
        """Return the database listing from chatlog, or ``{}`` if it is not a dict."""
        data = await self._get("/api/v1/db", {}, timeout=10.0)
        return data if isinstance(data, dict) else {}


# Module-level singleton shared by importers of this module.
chatlog_client = ChatlogClient()
|
||||
35
chatlog_fastAPI/services/chatlog_context.py
Normal file
35
chatlog_fastAPI/services/chatlog_context.py
Normal file
@@ -0,0 +1,35 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import asdict, dataclass
|
||||
|
||||
|
||||
@dataclass
class ChatlogContext:
    """Snapshot of the chatlog runtime environment reported by the host."""

    account: str = ""
    work_dir: str = ""
    data_dir: str = ""
    platform: str = "windows"
    version: int = 4
    chatlog_exe: str = ""
    chatlog_version: str = ""


# Process-wide context; replaced wholesale by update_chatlog_context().
_context = ChatlogContext()


def _coerce_version(value: object, default: int = 4) -> int:
    """Turn a reported version into an int, falling back to ``default``.

    Accepts ints, numeric strings and dotted strings such as ``"4.0.3"``
    (the leading component is used). Anything unparsable yields ``default``
    instead of raising.
    """
    if not value:
        return default
    try:
        return int(value)
    except (TypeError, ValueError):
        pass
    leading = str(value).strip().split(".", 1)[0]
    try:
        return int(leading)
    except ValueError:
        return default


def update_chatlog_context(payload: dict) -> dict:
    """Replace the stored context with values from ``payload``.

    Both camelCase (``workDir``) and snake_case (``work_dir``) keys are
    accepted; missing or empty values fall back to the dataclass defaults.

    Returns:
        The new context as a plain dict.
    """
    global _context

    def text(*keys: str, default: str = "") -> str:
        # First non-empty value among the alias keys, stringified.
        for key in keys:
            value = payload.get(key)
            if value:
                return str(value)
        return default

    _context = ChatlogContext(
        account=text("account"),
        work_dir=text("workDir", "work_dir"),
        data_dir=text("dataDir", "data_dir"),
        platform=text("platform", default="windows"),
        version=_coerce_version(payload.get("version")),
        chatlog_exe=text("chatlogExe", "chatlog_exe"),
        chatlog_version=text("chatlogVersion", "chatlog_version"),
    )
    return get_chatlog_context()


def get_chatlog_context() -> dict:
    """Return the current context as a plain dict."""
    return asdict(_context)
|
||||
25
chatlog_fastAPI/services/fts.py
Normal file
25
chatlog_fastAPI/services/fts.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import jieba
|
||||
import re
|
||||
|
||||
def tokenize(text: str) -> str:
    """Segment ``text`` with jieba and return the segments joined by spaces."""
    segments = jieba.cut(text)
    return " ".join(segments)
|
||||
|
||||
|
||||
def build_match_query(text: str, limit: int = 12) -> str:
    """Build a safe FTS5 MATCH query from user/model text.

    The text is tokenized, tokens without any word character and the FTS5
    operator words are dropped, duplicates are removed, and each remaining
    token is emitted as a double-quoted phrase. At most ``limit`` phrases are
    joined with ``OR``; an empty string is returned when nothing survives.
    """
    reserved = {"AND", "OR", "NOT", "NEAR"}
    # Insertion-ordered dict doubles as the "seen" set and the output list.
    phrases: dict[str, str] = {}
    for raw in tokenize(text or "").split():
        word = raw.strip()
        if not word:
            continue
        if re.search(r"\w", word, flags=re.UNICODE) is None:
            continue
        if word.upper() in reserved or word in phrases:
            continue
        # Embedded double quotes are doubled, as FTS5 string syntax requires.
        phrases[word] = '"' + word.replace('"', '""') + '"'
        if len(phrases) >= limit:
            break
    return " OR ".join(phrases.values())
|
||||
142
chatlog_fastAPI/services/media_parser.py
Normal file
142
chatlog_fastAPI/services/media_parser.py
Normal file
@@ -0,0 +1,142 @@
|
||||
import base64
|
||||
import logging
|
||||
|
||||
import httpx
|
||||
from fastapi import HTTPException
|
||||
|
||||
from services.ai_client import get_openai_client
|
||||
from services.media_resolver import resolve_media
|
||||
from services.runtime_settings import get_ai_settings
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def _get_ai_client():
    """Return whatever ``get_openai_client()`` yields (client and settings)."""
    client_and_settings = await get_openai_client()
    return client_and_settings
|
||||
|
||||
|
||||
async def parse_media(kind: str, key: str) -> dict:
    """Turn one chatlog media object into text.

    Args:
        kind: ``"voice"``, ``"image"`` or ``"video"``.
        key: chatlog media key.

    Returns:
        ``{"text": ...}`` with the transcription (voice) or description
        (image/video).

    Raises:
        HTTPException: 400 for an unsupported kind or empty key, 503 when the
            API key or the model needed for this kind is not configured.
    """
    if kind not in {"voice", "image", "video"}:
        raise HTTPException(400, "不支持的媒体类型")
    if not key:
        raise HTTPException(400, "媒体 key 不能为空")

    ai = await get_ai_settings()
    is_voice = kind == "voice"
    if not ai.get("ai_api_key"):
        raise HTTPException(503, "AI 服务未配置,请在设置页填写 AI API Key")
    if is_voice and not ai.get("voice_model"):
        raise HTTPException(503, "语音模型未配置,请在设置页填写语音模型名称,例如 paraformer-v2")
    if kind in ("image", "video") and not ai.get("vision_model"):
        raise HTTPException(503, "视觉模型未配置,请在设置页填写视觉模型名称,例如 qwen-vl-plus")

    media = await resolve_media(kind, key)
    if is_voice:
        text = await _parse_voice(media.bytes, media.content_type)
    else:
        text = await _parse_visual(kind, media.bytes, media.content_type)
    return {"text": text}
|
||||
|
||||
|
||||
def _audio_mime_for(content_type: str) -> str:
    """Map a media content type onto the MIME type used in the ASR data URI."""
    lowered = content_type.lower()
    # "x-silk" always contains "silk", so one substring check covers both.
    for needles, mime in (
        (("silk",), "audio/silk"),
        (("amr",), "audio/amr"),
        (("ogg", "opus"), "audio/ogg"),
        (("wav",), "audio/wav"),
    ):
        if any(needle in lowered for needle in needles):
            return mime
    return "audio/mpeg"


def _json_or_http_500(response: httpx.Response, failure_prefix: str) -> dict:
    """Decode a JSON body, converting a non-JSON body into an HTTP 500."""
    try:
        return response.json()
    except ValueError as exc:
        raise HTTPException(
            500,
            f"{failure_prefix}: HTTP {response.status_code} {response.text[:200]}",
        ) from exc


async def _parse_voice(media_bytes: bytes, content_type: str) -> str:
    """Transcribe a voice clip through the DashScope async ASR API.

    The audio is submitted as a base64 data URI, then the task is polled once
    per second for up to 30 attempts.

    Returns:
        The recognised text, or a placeholder string when the result is empty.

    Raises:
        HTTPException: 500 when submission fails, no task id is returned, the
            task fails or is cancelled, a response is not JSON, or polling
            runs out of attempts.
    """
    # Imported once here; this module has no file-level asyncio import.
    import asyncio

    b64_audio = base64.b64encode(media_bytes).decode()
    data_uri = f"data:{_audio_mime_for(content_type)};base64,{b64_audio}"

    _, ai = await _get_ai_client()
    asr_headers = {
        "Authorization": f"Bearer {ai['ai_api_key']}",
        "Content-Type": "application/json",
    }

    async with httpx.AsyncClient(timeout=60) as http:
        submit = await http.post(
            "https://dashscope.aliyuncs.com/api/v1/services/audio/asr/transcription",
            headers={**asr_headers, "X-DashScope-Async": "enable"},
            json={
                "model": ai["voice_model"],
                "input": {"file_urls": [data_uri]},
                "parameters": {"language_hints": ["zh", "en"]},
            },
            timeout=30,
        )
        # An error page may not be JSON; report it as a submission failure.
        submit_data = _json_or_http_500(submit, "提交识别任务失败")
        if submit.status_code not in (200, 201):
            raise HTTPException(500, f"提交识别任务失败: {submit_data.get('message', submit_data)}")

        task_id = submit_data.get("output", {}).get("task_id")
        if not task_id:
            raise HTTPException(500, f"未获取到 task_id: {submit_data}")

        for _ in range(30):
            await asyncio.sleep(1)
            poll = await http.get(
                f"https://dashscope.aliyuncs.com/api/v1/tasks/{task_id}",
                headers=asr_headers,
                timeout=10,
            )
            poll_data = _json_or_http_500(poll, "识别任务失败")
            output = poll_data.get("output", {})
            status = output.get("task_status", "")

            if status == "SUCCEEDED":
                results = output.get("results", [])
                log.info("[media_parser] ASR SUCCEEDED results: %s", results)
                if not results:
                    return "(识别结果为空)"
                trans_url = results[0].get("transcription_url", "")
                if trans_url:
                    # The transcript lives in a separate JSON document.
                    trans_resp = await http.get(trans_url, timeout=10)
                    trans_data = _json_or_http_500(trans_resp, "读取识别结果失败")
                    log.info("[media_parser] transcription_url content: %s", str(trans_data)[:500])
                    transcripts = trans_data.get("transcripts", [])
                    text = transcripts[0].get("text", "") if transcripts else ""
                else:
                    text = results[0].get("transcription", "")
                return text or "(识别结果为空)"

            if status in ("FAILED", "CANCELLED"):
                raise HTTPException(500, f"识别任务失败: {output.get('message', status)}")

    raise HTTPException(500, "语音识别超时(30秒)")
|
||||
|
||||
|
||||
async def _parse_visual(kind: str, media_bytes: bytes, content_type: str) -> str:
    """Ask the configured vision model for a short Chinese description.

    The media bytes are embedded as a base64 data URL; the MIME type is PNG or
    WebP when the content type says so, otherwise JPEG.

    Returns:
        The model's reply text, or "" when the reply has no content.
    """
    lowered = content_type.lower()
    # PNG is checked before WebP; everything else is sent as JPEG.
    mime = next(
        (f"image/{fmt}" for fmt in ("png", "webp") if fmt in lowered),
        "image/jpeg",
    )
    encoded = base64.b64encode(media_bytes).decode()
    data_url = f"data:{mime};base64,{encoded}"

    if kind == "image":
        prompt = "请用中文简洁描述这张图片的内容。"
    else:
        prompt = "请用中文简洁描述这个视频截图的内容。"

    client, ai = await _get_ai_client()
    user_message = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": data_url}},
            {"type": "text", "text": prompt},
        ],
    }
    completion = await client.chat.completions.create(
        model=ai["vision_model"],
        messages=[user_message],
        max_tokens=300,
    )
    return completion.choices[0].message.content or ""
|
||||
174
chatlog_fastAPI/services/media_resolver.py
Normal file
174
chatlog_fastAPI/services/media_resolver.py
Normal file
@@ -0,0 +1,174 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sqlite3
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from fastapi import HTTPException
|
||||
|
||||
from config import settings
|
||||
from services.chatlog_context import get_chatlog_context
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ResolvedMedia:
    """A media object downloaded from chatlog."""

    # Raw response body of the download.
    bytes: bytes
    # Value of the response's content-type header (or a generic fallback).
    content_type: str
    # URL the media was fetched from.
    url: str
|
||||
|
||||
|
||||
def _media_url(kind: str, key: str, thumb: bool = False) -> str:
    """Build the chatlog URL for media ``key`` of ``kind``.

    ``?thumb=1`` is appended when ``thumb`` is true.
    """
    suffix = "?thumb=1" if thumb else ""
    return f"{settings.chatlog_base_url}/{kind}/{key}{suffix}"
|
||||
|
||||
|
||||
def _read_voice_resource_status(key: str) -> dict:
    """Inspect the decrypted ``message_resource.db`` for voice message ``key``.

    Returns a diagnostics dict. ``checked`` is False (with a ``reason``) when
    the work dir is unknown, the database file is missing, or reading it
    failed; otherwise ``found`` tells whether a resource record exists and
    ``resources`` lists its detail rows.
    """
    work_dir = get_chatlog_context().get("work_dir") or ""
    if not work_dir:
        return {"checked": False, "reason": "missing_work_dir"}

    db_path = Path(work_dir).joinpath("db_storage", "message", "message_resource.db")
    location = str(db_path)
    if not db_path.exists():
        return {"checked": False, "reason": "message_resource_db_missing", "path": location}

    try:
        # mode=ro: open read-only through an SQLite URI filename.
        conn = sqlite3.connect(f"file:{db_path.as_posix()}?mode=ro", uri=True)
        conn.row_factory = sqlite3.Row
        try:
            info = conn.execute(
                "SELECT * FROM MessageResourceInfo WHERE message_svr_id=?",
                (int(key),),
            ).fetchone()
            if info:
                detail_rows = conn.execute(
                    "SELECT type,size,status,data_index FROM MessageResourceDetail WHERE message_id=?",
                    (info["message_id"],),
                ).fetchall()
                return {
                    "checked": True,
                    "found": True,
                    "path": location,
                    "message_id": info["message_id"],
                    "resources": [dict(detail) for detail in detail_rows],
                }
            return {
                "checked": True,
                "found": False,
                "path": location,
                "message": "当前已解密资源库里没有这条语音的媒体资源记录",
            }
        finally:
            conn.close()
    except Exception as exc:
        # Includes a non-numeric key (int() failure) and any sqlite error.
        return {"checked": False, "reason": "resource_db_read_failed", "error": str(exc), "path": location}
|
||||
|
||||
|
||||
def _download_failure_message(kind: str, key: str, status_code: int | None, body: str = "") -> str:
|
||||
if kind == "voice":
|
||||
base = "底层语音文件未读取成功"
|
||||
if status_code:
|
||||
base += f"(chatlog /voice 返回 HTTP {status_code})"
|
||||
return (
|
||||
f"{base}。请先确认已安装新版程序并重新识别当前微信账号;"
|
||||
"如果仍失败,说明当前 chatlog 版本还不能解析该 WeChat 4.x 语音资源。"
|
||||
)
|
||||
if status_code:
|
||||
return f"从 chatlog 下载媒体失败: HTTP {status_code}"
|
||||
return f"从 chatlog 下载媒体失败: {body or 'unknown error'}"
|
||||
|
||||
|
||||
async def diagnose_media(kind: str, key: str) -> dict:
    """Probe the chatlog media endpoint and report what happened.

    Images and videos are requested as thumbnails. The returned dict always
    carries the request parameters and chatlog context; after a response it
    also has status, content type and length; on failure it has an ``error``.
    Voice diagnostics additionally include the resource database status.

    Raises:
        HTTPException: 400 for an unsupported kind or an empty key.
    """
    if kind not in {"voice", "image", "video"}:
        raise HTTPException(400, "不支持的媒体类型")
    if not key:
        raise HTTPException(400, "媒体 key 不能为空")

    url = _media_url(kind, key, thumb=kind in {"image", "video"})
    result = {
        "ok": False,
        "kind": kind,
        "key": key,
        "url": url,
        "chatlog_base_url": settings.chatlog_base_url,
        "chatlog_context": get_chatlog_context(),
    }

    async with httpx.AsyncClient(timeout=20, trust_env=False, follow_redirects=True) as client:
        try:
            resp = await client.get(url)
            http_error = resp.status_code >= 400
            result.update(
                status_code=resp.status_code,
                content_type=resp.headers.get("content-type", ""),
                content_length=len(resp.content or b""),
                ok=(not http_error) and bool(resp.content),
            )
            if http_error:
                preview = resp.text[:500]
                result["error"] = _download_failure_message(kind, key, resp.status_code, preview)
                result["response_preview"] = preview
            elif not resp.content:
                result["error"] = "chatlog 返回了空媒体文件"
        except Exception as exc:
            # Diagnostics must not raise: record the failure instead.
            result.update({"error": f"无法连接 chatlog 媒体接口: {exc}", "exception": str(exc)})

    if kind == "voice":
        result["resource_db"] = _read_voice_resource_status(key)
    return result
|
||||
|
||||
|
||||
async def resolve_media(kind: str, key: str) -> ResolvedMedia:
    """Download one media object from chatlog.

    Images and videos are fetched as thumbnails. On any failure the endpoint
    is probed again through ``diagnose_media`` and the diagnostics are
    attached to the raised error.

    Raises:
        HTTPException: 400 for an unsupported kind or empty key; 502 when the
            download fails, raises, or returns an empty body.
    """
    if kind not in {"voice", "image", "video"}:
        raise HTTPException(400, "不支持的媒体类型")
    if not key:
        raise HTTPException(400, "媒体 key 不能为空")

    wants_thumbnail = kind in {"image", "video"}
    url = _media_url(kind, key, thumb=wants_thumbnail)
    async with httpx.AsyncClient(timeout=60, trust_env=False, follow_redirects=True) as client:
        try:
            resp = await client.get(url)
            resp.raise_for_status()
        except httpx.HTTPStatusError as exc:
            failed = exc.response
            diagnostics = await diagnose_media(kind, key)
            log.warning("[media_resolver] media download failed: %s", diagnostics)
            detail = {
                "message": _download_failure_message(kind, key, failed.status_code, failed.text[:500]),
                "diagnostics": diagnostics,
            }
            raise HTTPException(502, detail)
        except Exception as exc:
            diagnostics = await diagnose_media(kind, key)
            log.warning("[media_resolver] media download exception: %s", diagnostics)
            detail = {
                "message": _download_failure_message(kind, key, None, str(exc)),
                "diagnostics": diagnostics,
            }
            raise HTTPException(502, detail)

        payload = resp.content
        if not payload:
            diagnostics = await diagnose_media(kind, key)
            detail = {"message": "chatlog 返回了空媒体文件", "diagnostics": diagnostics}
            raise HTTPException(502, detail)

        return ResolvedMedia(
            bytes=payload,
            content_type=resp.headers.get("content-type", "application/octet-stream"),
            url=url,
        )
|
||||
253
chatlog_fastAPI/services/message_formatter.py
Normal file
253
chatlog_fastAPI/services/message_formatter.py
Normal file
@@ -0,0 +1,253 @@
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Maximum number of characters kept by clean_message_text() before "..." is appended.
QUOTE_CONTENT_LIMIT = 600


def extract_contents(item: dict) -> dict:
    """Return the ``contents`` / ``Contents`` mapping of ``item``, or ``{}``.

    Anything that is not a dict (including a missing value) yields ``{}``.
    """
    candidate = item.get("contents") or item.get("Contents") or {}
    if isinstance(candidate, dict):
        return candidate
    return {}


def clean_message_text(value: Any) -> str:
    """Normalise ``value`` into a single-line, length-limited string.

    HTML entities are unescaped, surrounding whitespace is removed, inner
    whitespace runs collapse to one space, and text longer than
    ``QUOTE_CONTENT_LIMIT`` is cut and suffixed with ``...``. Falsy values
    become an empty string.
    """
    unescaped = html.unescape(str(value or ""))
    collapsed = re.sub(r"\s+", " ", unescaped.strip())
    if len(collapsed) <= QUOTE_CONTENT_LIMIT:
        return collapsed
    return collapsed[:QUOTE_CONTENT_LIMIT] + "..."
|
||||
|
||||
|
||||
def _local_name(tag: str) -> str:
|
||||
return tag.rsplit("}", 1)[-1]
|
||||
|
||||
|
||||
def _safe_int(value: Any) -> int | None:
|
||||
if value in (None, ""):
|
||||
return None
|
||||
try:
|
||||
return int(str(value).strip())
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _first(data: dict, *keys: str) -> Any:
|
||||
for key in keys:
|
||||
value = data.get(key)
|
||||
if value not in (None, ""):
|
||||
return value
|
||||
return None
|
||||
|
||||
|
||||
def _has_quote_indicator(data: dict) -> bool:
|
||||
keys = {str(key) for key in data.keys()}
|
||||
indicators = {
|
||||
"quote",
|
||||
"refermsg",
|
||||
"referMsg",
|
||||
"refer",
|
||||
"recordInfo",
|
||||
"recordinfo",
|
||||
"fromusr",
|
||||
"fromUser",
|
||||
"chatusr",
|
||||
"chatUser",
|
||||
"displayname",
|
||||
"displayName",
|
||||
"referContent",
|
||||
"svrid",
|
||||
"newmsgid",
|
||||
"newMsgId",
|
||||
}
|
||||
return bool(keys & indicators)
|
||||
|
||||
|
||||
def _decode_json(value: str) -> Any:
|
||||
try:
|
||||
return json.loads(value)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _xml_node_text(node: ET.Element, names: set[str]) -> str:
|
||||
for child in node.iter():
|
||||
if _local_name(child.tag) in names:
|
||||
text = "".join(child.itertext()).strip()
|
||||
if text:
|
||||
return text
|
||||
return ""
|
||||
|
||||
|
||||
def _quote_from_xml(value: str) -> dict | None:
    """Extract a quote from XML that contains a ``refermsg`` element.

    The input is HTML-unescaped first. It is parsed as-is and, if that fails,
    once more wrapped in a synthetic ``<root>`` element. Returns the
    normalised quote, or ``None`` when the text is not XML, has no
    ``refermsg`` element, or the quote has no content.
    """
    text = html.unescape(value or "").strip()
    if not ("<" in text and ">" in text):
        return None

    # Fragments without a single root element only parse when wrapped.
    root = None
    for candidate in (text, f"<root>{text}</root>"):
        try:
            root = ET.fromstring(candidate)
        except Exception:
            continue
        break
    if root is None:
        return None

    refer_node = next(
        (node for node in root.iter() if _local_name(node.tag).lower() == "refermsg"),
        None,
    )
    if refer_node is None:
        return None

    fields = {
        "sender": _xml_node_text(refer_node, {"fromusr", "chatusr", "sender"}),
        "sender_name": _xml_node_text(refer_node, {"displayname", "nickname", "fromnickname"}),
        "content": _xml_node_text(refer_node, {"content", "title", "desc"}),
        "type": _safe_int(_xml_node_text(refer_node, {"type"})),
        "seq": _safe_int(_xml_node_text(refer_node, {"seq", "msgid", "newmsgid", "svrid"})),
    }
    return _normalize_quote(fields)
|
||||
|
||||
|
||||
def _find_quote_payload(value: Any, allow_plain_text: bool = False) -> dict | None:
    """Search ``value`` recursively for a quoted-message payload.

    Strings are tried as JSON (when they start with ``{`` or ``[``), then as
    XML, and finally - only if ``allow_plain_text`` - as the quote text
    itself. Lists return the first hit among their items. Dicts are checked
    under the well-known quote keys first, then as a quote themselves, then
    through all of their values.

    Returns:
        The normalised quote dict, or ``None`` when nothing is found.
    """
    if value in (None, ""):
        return None

    if isinstance(value, str):
        text = value.strip()
        if not text:
            return None
        if text[:1] in ("{", "["):
            decoded = _decode_json(text)
            if decoded is not None:
                return _find_quote_payload(decoded, allow_plain_text=allow_plain_text)
        from_xml = _quote_from_xml(text)
        if from_xml:
            return from_xml
        if not allow_plain_text:
            return None
        return _normalize_quote({"content": text})

    if isinstance(value, list):
        for element in value:
            found = _find_quote_payload(element, allow_plain_text=allow_plain_text)
            if found:
                return found
        return None

    if not isinstance(value, dict):
        return None

    # 1) Explicit quote containers: their content may be plain text.
    for container_key in ("quote", "refermsg", "referMsg", "refer", "recordInfo", "recordinfo"):
        if container_key not in value:
            continue
        found = _find_quote_payload(value.get(container_key), allow_plain_text=True)
        if found:
            return found

    # 2) The dict itself, when the caller allows it or its keys look like a quote.
    if allow_plain_text or _has_quote_indicator(value):
        found = _normalize_quote(value)
        if found:
            return found

    # 3) Any nested value, never treating bare strings as quotes.
    for child in value.values():
        found = _find_quote_payload(child, allow_plain_text=False)
        if found:
            return found
    return None
|
||||
|
||||
|
||||
def _normalize_quote(data: dict) -> dict | None:
    """Map a raw quote dict onto the canonical quote shape.

    For each output field the first non-empty value among several alias keys
    is used. Returns ``None`` when no content can be found; otherwise a dict
    with ``sender``, ``sender_name``, ``content``, ``type`` and ``seq``.
    """
    content_keys = (
        "content",
        "Content",
        "text",
        "title",
        "desc",
        "digest",
        "displayContent",
        "referContent",
    )
    content = clean_message_text(_first(data, *content_keys))
    if not content:
        return None

    sender_keys = ("sender", "Sender", "fromusr", "fromUser", "chatusr", "chatUser", "from")
    name_keys = (
        "sender_name",
        "senderName",
        "SenderName",
        "displayname",
        "displayName",
        "nickname",
        "nickName",
    )
    type_keys = ("type", "Type", "msgType", "subType")
    seq_keys = ("seq", "Seq", "sort_seq", "msgid", "msgId", "newmsgid", "newMsgId", "svrid")

    return {
        "sender": clean_message_text(_first(data, *sender_keys)),
        "sender_name": clean_message_text(_first(data, *name_keys)),
        "content": content,
        "type": _safe_int(_first(data, *type_keys)),
        "seq": _safe_int(_first(data, *seq_keys)),
    }
|
||||
|
||||
|
||||
def extract_quote(item: dict | None) -> dict | None:
    """Find the quoted message carried by a chatlog message ``item``.

    Explicit quote fields on the item and in its ``contents`` are tried first
    and may be plain text; ``contents["appmsg"]`` and the raw message content
    are tried afterwards and must contain a structured quote.

    Returns:
        The normalised quote, or ``None`` when ``item`` is not a dict or
        carries no quote.
    """
    if not isinstance(item, dict):
        return None

    contents = extract_contents(item)

    # (source, allow_plain_text) pairs, in lookup priority order.
    candidates = [(item.get(name), True) for name in ("quote", "Quote", "refer", "recordInfo")]
    candidates.extend(
        (contents.get(name), True)
        for name in ("quote", "refer", "refermsg", "referMsg", "recordInfo", "recordinfo")
    )
    candidates.extend(
        (
            (contents.get("appmsg"), False),
            (item.get("content"), False),
            (item.get("Content"), False),
        )
    )

    for source, plain_ok in candidates:
        found = _find_quote_payload(source, allow_plain_text=plain_ok)
        if found:
            return found
    return None
|
||||
|
||||
|
||||
def attach_quote(item: dict) -> dict:
    """Store the extracted quote under ``item["quote"]`` and return ``item``.

    The dict is modified in place; the key is set to ``None`` when no quote
    is found.
    """
    found = extract_quote(item)
    item["quote"] = found
    return item
|
||||
|
||||
|
||||
def quote_to_text(quote: dict | None) -> str:
    """Render a normalised quote as one line of text.

    Returns "" for a missing quote. The sender falls back from
    ``sender_name`` to ``sender`` to a placeholder, and the sequence number
    is shown only when it is truthy.
    """
    if not quote:
        return ""
    who = quote.get("sender_name") or quote.get("sender") or "未知"
    seq = quote.get("seq")
    marker = f"[引用消息 seq={seq}]" if seq else "[引用消息]"
    body = quote.get("content") or ""
    return f"{marker} {who}: {body}".strip()
|
||||
|
||||
|
||||
def append_quote_text(base_text: str, item: dict) -> str:
    """Join ``base_text`` with the rendered quote of ``item``.

    Empty parts are dropped; when both are present they are separated by a
    full-width semicolon.
    """
    trimmed = base_text.strip() if base_text else ""
    rendered_quote = quote_to_text(extract_quote(item))
    return ";".join(piece for piece in (trimmed, rendered_quote) if piece)
|
||||
139
chatlog_fastAPI/services/report_learning.py
Normal file
139
chatlog_fastAPI/services/report_learning.py
Normal file
@@ -0,0 +1,139 @@
|
||||
import re
|
||||
import aiosqlite
|
||||
|
||||
from services.fts import build_match_query
|
||||
|
||||
MAX_EXAMPLES = 3
|
||||
MAX_EXAMPLE_CHARS = 1800
|
||||
MAX_CONTEXT_CHARS = 5200
|
||||
|
||||
|
||||
def _compact(text: str, limit: int = MAX_EXAMPLE_CHARS) -> str:
|
||||
text = re.sub(r"\n{3,}", "\n\n", (text or "").strip())
|
||||
if len(text) <= limit:
|
||||
return text
|
||||
return text[:limit].rstrip() + "\n..."
|
||||
|
||||
|
||||
def _format_examples(rows: list[aiosqlite.Row], purpose: str) -> str:
    """Render curated report rows as a prompt section.

    A heading chosen by ``purpose`` is followed by one numbered block per
    row. Blocks are added until the next one would push the total past
    ``MAX_CONTEXT_CHARS``. Returns "" when there are no rows.
    """
    if not rows:
        return ""

    headings = {
        "topic": "历史人工修订报告参考(用于学习话题命名和分类口径)",
        "summary": "历史人工修订报告参考(只学习结构、措辞和关注点,不得照抄历史事实)",
    }
    heading = headings.get(purpose, "历史人工修订报告参考")

    pieces = [heading]
    used_chars = len(heading)
    for number, row in enumerate(rows, start=1):
        # Best available label for the group: name, then talker, then id.
        group_label = row["group_name"] or row["talker"] or row["group_id"]
        block = "".join(
            (
                f"\n\n--- 示例 {number} ---\n",
                f"群聊:{group_label}\n",
                f"话题标题:{row['title']}\n",
                f"报告内容:\n{_compact(row['content'])}",
            )
        )
        if used_chars + len(block) > MAX_CONTEXT_CHARS:
            break
        pieces.append(block)
        used_chars += len(block)
    return "".join(pieces).strip()
|
||||
|
||||
|
||||
async def build_report_learning_context(
    db: aiosqlite.Connection,
    *,
    group_id: int | None,
    query: str = "",
    exclude_topic_id: int | None = None,
    purpose: str = "summary",
    limit: int = MAX_EXAMPLES,
) -> str:
    """Build a prompt section of human-curated reports to learn from.

    Only ``knowledge_docs`` rows with a non-NULL ``curated_at`` are used. Up to
    *limit* rows are collected in three stages, each stage only filling the
    slots the previous ones left open:

    1. the most recently curated reports of *group_id* (skipped when None);
    2. reports matching *query* in ``knowledge_fts``, same-group rows first;
    3. any curated reports, same-group rows first.

    Args:
        db: open connection; rows must be addressable by column name.
        group_id: group whose reports are preferred, or None.
        query: free text turned into an FTS match query by ``build_match_query``.
        exclude_topic_id: topic to leave out of every stage, or None.
        purpose: forwarded to ``_format_examples`` to choose the heading.
        limit: maximum number of examples; a value <= 0 yields "".

    Returns:
        The formatted examples section, or "" when nothing was selected.
    """
    # SQLite treats a negative LIMIT as "no upper bound", so bail out early
    # instead of forwarding a non-positive limit to the queries below.
    if limit <= 0:
        return ""

    select_sql = (
        "SELECT k.id, k.content, k.updated_at, t.id AS topic_id, t.title, t.group_id, "
        "g.name AS group_name, g.talker "
        "FROM knowledge_docs k "
        "JOIN topics t ON t.id = k.topic_id "
        "LEFT JOIN groups g ON g.id = t.group_id "
        "WHERE k.curated_at IS NOT NULL"
    )
    group_first_order = (
        "CASE WHEN t.group_id=? THEN 0 ELSE 1 END, k.curated_at DESC, k.updated_at DESC"
    )

    exclude_sql = ""
    exclude_params: list[object] = []
    if exclude_topic_id is not None:
        exclude_sql = " AND t.id<>?"
        exclude_params.append(exclude_topic_id)

    selected: list[aiosqlite.Row] = []
    seen_doc_ids: set[int] = set()

    async def _fetch(filter_sql: str, order_sql: str, args: list[object]) -> list[aiosqlite.Row]:
        """Run the shared SELECT; *args* follow filter, exclude, order, limit."""
        sql = f"{select_sql}{filter_sql}{exclude_sql} ORDER BY {order_sql} LIMIT ?"
        async with db.execute(sql, args) as cur:
            return list(await cur.fetchall())

    def _take(rows: list[aiosqlite.Row]) -> None:
        """Append not-yet-seen rows to ``selected`` until *limit* is reached."""
        for row in rows:
            if len(selected) >= limit:
                break
            doc_id = int(row["id"])
            if doc_id in seen_doc_ids:
                continue
            selected.append(row)
            seen_doc_ids.add(doc_id)

    # Stage 1: newest curated reports of the same group.
    if group_id is not None:
        _take(
            await _fetch(
                " AND t.group_id=?",
                "k.curated_at DESC, k.updated_at DESC",
                [group_id, *exclude_params, limit],
            )
        )

    # Stage 2: full-text matches. Over-fetch (x3) because some rows may
    # already have been selected and get dropped by the dedupe.
    if len(selected) < limit:
        fts_query = build_match_query(query or "")
        if fts_query:
            remaining = limit - len(selected)
            _take(
                await _fetch(
                    " AND k.id IN (SELECT doc_id FROM knowledge_fts WHERE knowledge_fts MATCH ?)",
                    group_first_order,
                    # -1 stands in for "no group" so the CASE never matches.
                    [fts_query, *exclude_params, group_id or -1, remaining * 3],
                )
            )

    # Stage 3: any curated report, same-group rows first.
    if len(selected) < limit:
        remaining = limit - len(selected)
        _take(
            await _fetch(
                "",
                group_first_order,
                [*exclude_params, group_id or -1, remaining * 3],
            )
        )

    return _format_examples(selected[:limit], purpose)
|
||||
45
chatlog_fastAPI/services/runtime_settings.py
Normal file
45
chatlog_fastAPI/services/runtime_settings.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import logging
|
||||
import aiosqlite
|
||||
from config import settings as default_settings
|
||||
from database import get_active_db_path
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Module-level cache filled by get_ai_settings(); None means nothing is cached.
_cache: dict | None = None
|
||||
|
||||
|
||||
def invalidate_cache() -> None:
    """Forget the cached settings so the next ``get_ai_settings`` call reloads them."""
    global _cache
    _cache = None
|
||||
|
||||
|
||||
async def get_ai_settings() -> dict:
    """Return the runtime AI settings, loading them from the database once.

    The result starts from built-in defaults and is overlaid with rows of the
    ``app_settings`` table: a row is applied only when its key is one of the
    known setting names and its value is non-empty. A successful read is
    cached in the module-level ``_cache`` until ``invalidate_cache`` is called.

    If the database cannot be read, the defaults are returned but NOT cached,
    so a transient failure does not pin empty settings; the next call retries.

    Returns:
        dict with keys ai_base_url, ai_api_key, ai_model, summary_model,
        vision_model, voice_model and topic_analysis_prompt.
    """
    global _cache
    if _cache is not None:
        return _cache

    # ai_base_url keeps its default (an Aliyun endpoint in OpenAI-compatible
    # format); every other field must be configured by the user on the
    # settings page.
    result = {
        "ai_base_url": default_settings.ai_base_url,
        "ai_api_key": "",
        "ai_model": "",
        "summary_model": "",
        "vision_model": "",
        "voice_model": "",
        "topic_analysis_prompt": "",
    }

    try:
        path = get_active_db_path()
        async with aiosqlite.connect(path) as db:
            db.row_factory = aiosqlite.Row
            async with db.execute("SELECT key, value FROM app_settings") as cur:
                rows = await cur.fetchall()
    except Exception as e:
        log.warning(f"Failed to read runtime settings: {e}")
        # Do not cache: these are bare defaults, not what the user configured.
        return result

    for row in rows:
        if row["key"] in result and row["value"]:
            result[row["key"]] = row["value"]

    _cache = result
    return result
|
||||
476
chatlog_fastAPI/services/summary_engine.py
Normal file
476
chatlog_fastAPI/services/summary_engine.py
Normal file
@@ -0,0 +1,476 @@
|
||||
"""
|
||||
售后报告生成引擎
|
||||
- 从 topic_messages 拿到所有 msg_seq
|
||||
- 通过 chatlog batch 接口批量拉回消息原文
|
||||
- 用配置的总结模型生成 Markdown 售后事件报告
|
||||
- 写入 knowledge_docs + knowledge_fts(jieba 分词)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import json
|
||||
import aiosqlite
|
||||
from urllib.parse import quote
|
||||
|
||||
from database import get_active_db_path
|
||||
from services.ai_client import get_openai_client
|
||||
from services.fts import tokenize
|
||||
from services.message_formatter import append_quote_text, extract_contents, extract_quote
|
||||
from services.report_learning import build_report_learning_context
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Number of message seqs sent to chatlog's batch endpoint per request.
CHATLOG_BATCH_SIZE = 80
# Timeout, in seconds, wrapped around the summary model call.
SUMMARY_LLM_TIMEOUT_SECONDS = 300
|
||||
|
||||
|
||||
async def _get_client():
    """Return the ``(client, settings)`` pair produced by ``get_openai_client``."""
    client_and_settings = await get_openai_client()
    return client_and_settings
|
||||
|
||||
|
||||
def _message_line(item: dict, fallback_seq: int = 0) -> tuple[int, str] | None:
    """Format one chatlog message as ``(seq, "[time] sender: text")``.

    Returns None when *item* is empty or ``_message_text`` yields no text.
    Seq, time and sender are resolved by ``_message_meta`` (several key
    spellings are accepted; *fallback_seq* is used when the item has no seq).
    """
    if not item:
        return None
    text = _message_text(item)
    if not text:
        return None
    meta = _message_meta(item, fallback_seq)
    return meta["seq"], f"[{meta['time']}] {meta['sender']}: {text}"
|
||||
|
||||
|
||||
def _message_meta(item: dict, fallback_seq: int = 0) -> dict:
    """Extract seq, time, sender and type from a chatlog message dict.

    Each field is read from the first key spelling that holds a truthy value.
    Seq falls back to *fallback_seq* and then 0 and is converted with
    ``int``; time and sender default to ""; type defaults to 1.
    """

    def first_truthy(keys: tuple[str, ...], default):
        # Mirrors an ``a or b or default`` chain over item.get(key).
        for key in keys:
            value = item.get(key)
            if value:
                return value
        return default

    raw_seq = first_truthy(("seq", "Seq", "sort_seq"), None) or fallback_seq or 0
    return {
        "seq": int(raw_seq),
        "time": first_truthy(("create_time", "time", "CreateTime"), ""),
        "sender": first_truthy(
            ("sender_name", "senderName", "SenderName", "sender", "Sender"), ""
        ),
        "type": first_truthy(("type", "Type"), 1),
    }
|
||||
|
||||
|
||||
def _extract_contents(item: dict) -> dict:
    """Thin module-local alias for ``extract_contents`` from the message formatter."""
    contents = extract_contents(item)
    return contents
|
||||
|
||||
|
||||
def _message_text(item: dict) -> str:
    """Build the display text of a message, including link cards and quotes.

    Content that looks like markup (starts with ``<``) is dropped when the
    message carries a quote. If a link title is available the text becomes a
    link-card summary (title, description, source, URL, then the remaining
    content). The result is always passed through ``append_quote_text``.
    """
    raw = item.get("content") or item.get("Content") or ""
    card = _extract_contents(item)

    looks_like_markup = isinstance(raw, str) and raw.lstrip().startswith("<")
    if looks_like_markup and extract_quote(item):
        raw = ""

    title = card.get("title") or item.get("link_title") or ""
    if not title:
        return append_quote_text(raw, item)

    desc = card.get("desc") or item.get("link_desc") or ""
    source = card.get("sourceName") or card.get("source_name") or item.get("link_source") or ""
    url = card.get("url") or item.get("link_url") or ""

    segments = [f"[链接卡片] {title}"]
    if desc:
        segments.append(desc)
    if source:
        segments.append(f"来源:{source}")
    if url:
        segments.append(f"URL:{url}")
    # Skip the raw content when it merely repeats one of the segments.
    if raw and raw not in segments:
        segments.append(raw)
    return append_quote_text(";".join(segments), item)
|
||||
|
||||
|
||||
def _extract_image_key(item: dict) -> str:
    """Return the image file key of a message, or "" when none is present.

    Candidates are tried in order: ``rawmd5``, ``md5`` and ``path`` from the
    extracted contents, then ``media_key``, ``mediaKey`` and ``image_path``
    from the message itself. Backslashes in the key are turned into ``/``.
    """
    contents = _extract_contents(item)
    candidates = (
        (contents, "rawmd5"),
        (contents, "md5"),
        (contents, "path"),
        (item, "media_key"),
        (item, "mediaKey"),
        (item, "image_path"),
    )
    for mapping, field in candidates:
        value = mapping.get(field)
        if value:
            return str(value).replace("\\", "/")
    return ""
|
||||
|
||||
|
||||
def _is_image_message(item: dict) -> bool:
    """Return True when the message's ``type``/``Type`` field equals 3.

    Any failure while reading or converting the type counts as "not an image".
    """
    try:
        message_type = int(item.get("type") or item.get("Type") or 0)
    except Exception:
        return False
    return message_type == 3
|
||||
|
||||
|
||||
def _media_path(kind: str, key: str) -> str:
    """Build ``/<kind>/<key>`` with every ``/``-separated key segment URL-quoted."""
    encoded_key = "/".join(map(quote, key.split("/")))
    return f"/{kind}/" + encoded_key
|
||||
|
||||
|
||||
def _image_url(key: str) -> str:
    """Return the image path for *key* with the ``thumb=1`` query appended."""
    return _media_path("image", key) + "?thumb=1"
|
||||
|
||||
|
||||
def _collect_image_evidence(messages: list[dict]) -> tuple[list[dict], list[dict]]:
    """Split the image messages into displayable images and failures.

    Non-image messages are ignored. An image with a file key becomes a
    metadata dict extended with ``key`` and ``url``; an image without a key
    becomes a failure entry with an empty ``url`` and a ``reason``.

    Returns:
        ``(images, failures)``, each in message order.
    """
    shown: list[dict] = []
    missing: list[dict] = []

    for message in filter(_is_image_message, messages):
        meta = _message_meta(message)
        key = _extract_image_key(message)
        if key:
            shown.append({**meta, "key": key, "url": _image_url(key)})
        else:
            missing.append({**meta, "url": "", "reason": "图片无法展示,缺少图片文件标识"})

    return shown, missing
|
||||
|
||||
|
||||
def _image_evidence_context(images: list[dict], failures: list[dict]) -> str:
    """Describe image evidence as plain text for the prompt.

    Emits one heading plus one bullet per displayable image, then one heading
    plus one bullet per failed image (with a link when the entry has a URL).
    Returns "" when both lists are empty.
    """
    out: list[str] = []
    if images:
        out.append("系统将作为原始材料插入报告的现场图片:")
        out.extend(
            f"- [{entry['time']}] {entry['sender']} seq={entry['seq']} url={entry['url']}"
            for entry in images
        )
    if failures:
        out.append("无法展示的图片清单:")
        for entry in failures:
            suffix = f",查看图片:{entry['url']}" if entry.get("url") else ""
            out.append(
                f"- [{entry['time']}] {entry['sender']} seq={entry['seq']}:{entry['reason']}{suffix}"
            )
    return "\n".join(out)
|
||||
|
||||
|
||||
def _image_success_markdown(images: list[dict]) -> str:
    """Render displayable images as a "### 现场图片" Markdown section.

    Each image contributes a Markdown image line (alt text built from its time
    and sender, target taken from its ``url``), a source line, and a blank
    separator. Returns "" when *images* is empty.
    """
    if not images:
        return ""
    blocks = ["### 现场图片"]
    for img in images:
        alt = f"现场图片 - {img['time']} {img['sender']}".strip()
        blocks.extend(
            [
                # Embed the image itself; without this line the section would
                # list sources but show no picture.
                f"![{alt}]({img['url']})",
                f"来源:{img['time']} {img['sender']} seq={img['seq']}",
                "",
            ]
        )
    return "\n".join(blocks).strip()
|
||||
|
||||
|
||||
def _image_failure_markdown(failures: list[dict]) -> str:
    """Render failed images as a "## 图片展示提示" Markdown section.

    One bullet per failure with its time, sender, seq and reason, plus a link
    when the entry has a URL. Returns "" when *failures* is empty.
    """
    if not failures:
        return ""

    def bullet(entry: dict) -> str:
        suffix = f",查看图片:{entry['url']}" if entry.get("url") else ""
        return f"- [{entry['time']}] {entry['sender']} seq={entry['seq']}:{entry['reason']}{suffix}"

    return "\n".join(["## 图片展示提示", *map(bullet, failures)])
|
||||
|
||||
|
||||
def _insert_after_heading(content: str, heading: str, addition: str) -> str:
    """Insert *addition* right below *heading* inside Markdown *content*.

    - Empty *addition*: *content* is returned unchanged.
    - *heading* found (line equal to it after stripping): *addition* goes
      directly under the first such line, padded by blank lines.
    - Otherwise, if a level-1 title (line starting with "# ") exists, the
      heading and the addition are placed under the first one.
    - Otherwise the heading and the addition are prepended to the content.

    Every non-trivial result is stripped of surrounding whitespace.
    """
    if not addition:
        return content

    rows = content.splitlines()

    def splice(index: int, inserted: list[str]) -> str:
        # Place *inserted* immediately after rows[index].
        return "\n".join(rows[: index + 1] + inserted + rows[index + 1 :]).strip()

    heading_at = next((i for i, row in enumerate(rows) if row.strip() == heading), None)
    if heading_at is not None:
        return splice(heading_at, ["", addition, ""])

    title_at = next((i for i, row in enumerate(rows) if row.startswith("# ")), None)
    if title_at is not None:
        return splice(title_at, ["", heading, "", addition, ""])

    return f"{heading}\n\n{addition}\n\n{content}".strip()
|
||||
|
||||
|
||||
def _merge_image_sections(content: str, successes: list[dict], failures: list[dict]) -> str:
    """Merge image sections into a generated report.

    Displayable images are inserted under the "## 关键聊天依据" heading; the
    failure notice, if any, is appended at the end of the report.
    """
    merged = _insert_after_heading(content, "## 关键聊天依据", _image_success_markdown(successes))
    notice = _image_failure_markdown(failures)
    if not notice:
        return merged.strip()
    return f"{merged.rstrip()}\n\n{notice}".strip()
|
||||
|
||||
|
||||
def _line_from_snapshot(raw: str | None, fallback_seq: int) -> str | None:
|
||||
if not raw:
|
||||
return None
|
||||
try:
|
||||
item = json.loads(raw)
|
||||
except Exception:
|
||||
return None
|
||||
line = _message_line(item, fallback_seq)
|
||||
return line[1] if line else None
|
||||
|
||||
MARKDOWN_TEMPLATE = """\
|
||||
# {title}
|
||||
|
||||
请按聊天记录中的实际内容生成一份【具体售后问题点】报告,不要照抄固定字段,也不要输出占位文案。
|
||||
|
||||
必须围绕以下结构组织,按内容决定是否保留章节,不要输出空章节:
|
||||
## 问题摘要
|
||||
## 关键聊天依据
|
||||
## 当前处理状态
|
||||
## 是否解决
|
||||
## AI 建议/解决方法
|
||||
|
||||
输出规则:
|
||||
- 只写聊天记录中能直接识别或合理归纳的信息。
|
||||
- 没有识别到的客户、门店、联系人、合同、订单、物流、日期、价格、原因等信息直接省略。
|
||||
- 不要写“未从聊天记录中识别”“待补充”“未知”“无”等占位内容。
|
||||
- “是否解决”只能从聊天记录判断,取值限定为:已解决、未解决、处理中、待确认。
|
||||
- 如果聊天内容不足以形成明确售后问题点,仍然按当前话题内容整理,但用更保守的“待确认”结论。
|
||||
- “AI 建议/解决方法”必须放在文档下方,并附注:注:此方法由 AI 生成,仅供参考,请以人工复核和现场实际情况为准。
|
||||
- 只输出 Markdown 报告,不要输出这些规则本身。
|
||||
"""
|
||||
|
||||
|
||||
async def _mark_summarize_failed(topic_id: int, task_id: int | None, error: str):
    """Record a failed summarize run on the topic and, if given, its task.

    Sets the topic status to 'error'. When *task_id* is not None the matching
    ``ai_tasks`` row gets status 'error', a reset progress JSON and the error
    message (a generic message is used when *error* is empty). Database
    failures are logged as warnings and not re-raised.
    """
    db_path = get_active_db_path()
    reason = error or "AI 报告生成失败"
    try:
        async with aiosqlite.connect(db_path) as conn:
            await conn.execute(
                "UPDATE topics SET status = 'error', updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                (topic_id,),
            )
            if task_id is not None:
                progress_json = json.dumps({"processed": 0, "total": 1})
                await conn.execute(
                    """
                    UPDATE ai_tasks
                    SET status='error', progress=?, error=?, updated_at=CURRENT_TIMESTAMP
                    WHERE id=?
                    """,
                    (progress_json, reason, task_id),
                )
            await conn.commit()
    except Exception as exc:
        log.warning(f"[summarize] 标记失败状态失败 topic={topic_id} task={task_id}: {exc}")
|
||||
|
||||
|
||||
async def _run_summarize_impl(topic_id: int, topic: dict, task_id: int | None = None):
    """
    Generate or refresh the Markdown after-sales report for one topic.

    Called by POST /api/topics/{id}/summarize (manual trigger).

    Steps: mark the topic 'processing'; load its message seqs; fetch message
    bodies from chatlog in batches, falling back to the JSON snapshots stored
    in topic_messages; build the prompt (chat text, image evidence, curated
    report examples, template); call the configured summary model; write the
    result to knowledge_docs / knowledge_fts and mark the topic 'completed'.
    Every handled failure marks the topic 'error' and records the reason on
    the task.

    topic: must provide "title"; "group_id" is read when present.
    task_id: when provided, status and progress are mirrored to ai_tasks.
    """
    path = get_active_db_path()

    async def _update_task(status: str, processed: int = 0, total: int = 1, error: str = ""):
        """Best-effort update of the ai_tasks row; a no-op without task_id."""
        if task_id is None:
            return
        try:
            async with aiosqlite.connect(path) as _db:
                await _db.execute(
                    """
                    UPDATE ai_tasks
                    SET status=?, progress=?, error=?, updated_at=CURRENT_TIMESTAMP
                    WHERE id=?
                    """,
                    (status, json.dumps({"processed": processed, "total": total}), error or None, task_id),
                )
                await _db.commit()
        except Exception as e:
            log.warning(f"[summarize] 更新 task {task_id} 失败: {e}")

    async with aiosqlite.connect(path) as db:
        db.row_factory = aiosqlite.Row

        async def _fail(error: str):
            """Mark the topic as failed and mirror the reason to the task."""
            await db.execute(
                "UPDATE topics SET status = 'error', updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                (topic_id,),
            )
            await db.commit()
            await _update_task("error", 0, 1, error)

        # Put the topic into the 'processing' state.
        await db.execute(
            "UPDATE topics SET status = 'processing', updated_at = CURRENT_TIMESTAMP WHERE id = ?",
            (topic_id,),
        )
        await db.commit()
        await _update_task("running", 0, 1)

        # 1. Load every message seq of the topic together with the group talker.
        async with db.execute(
            """
            SELECT tm.msg_seq, tm.talker, tm.message_json
            FROM topic_messages tm
            WHERE tm.topic_id = ?
            ORDER BY tm.msg_seq
            """,
            (topic_id,),
        ) as cur:
            msg_rows = await cur.fetchall()

        if not msg_rows:
            log.warning(f"[summarize] topic={topic_id} 没有消息,跳过")
            await _fail("该话题没有关联消息,无法生成 AI 报告")
            return

        seqs = [r["msg_seq"] for r in msg_rows]
        # message_items / fetched_lines are keyed by int seq, so keep an
        # int-normalised copy for every lookup below.
        int_seqs = [int(s) for s in seqs]
        # topic_messages.talker stores the group id (chatlog calls it "talker").
        group_talker = msg_rows[0]["talker"]

        # 2. Fetch the message bodies from chatlog, CHATLOG_BATCH_SIZE seqs per call.
        from services.chatlog_client import chatlog_client

        messages_text: list[str] = []
        message_items: dict[int, dict] = {}
        fetched_lines: dict[int, str] = {}

        for i in range(0, len(seqs), CHATLOG_BATCH_SIZE):
            chunk_seqs = seqs[i: i + CHATLOG_BATCH_SIZE]
            try:
                result = await chatlog_client.get_messages_batch(group_talker, chunk_seqs)
                for m in result.get("items", []):
                    meta = _message_meta(m)
                    if meta["seq"]:
                        message_items[meta["seq"]] = m
                    line = _message_line(m)
                    if line:
                        fetched_lines[line[0]] = line[1]
            except Exception as e:
                # A failed batch is tolerated; the snapshots below fill the gap.
                log.error(f"[summarize] batch 拉取失败 topic={topic_id}: {e}")

        for r, seq in zip(msg_rows, int_seqs):
            if seq in fetched_lines:
                messages_text.append(fetched_lines[seq])
                continue

            # Fall back to the JSON snapshot stored with the topic message.
            snap_raw = r["message_json"] if "message_json" in r.keys() else None
            if not snap_raw:
                continue
            try:
                snap_item = json.loads(snap_raw)
            except (TypeError, ValueError):
                continue
            # Only JSON objects are usable message snapshots.
            if not isinstance(snap_item, dict):
                continue
            message_items.setdefault(seq, snap_item)
            snap_line = _message_line(snap_item, seq)
            if snap_line:
                messages_text.append(snap_line[1])

        image_successes, image_failures = _collect_image_evidence(
            [message_items[seq] for seq in int_seqs if seq in message_items]
        )

        if not messages_text and not image_successes and not image_failures:
            log.warning(f"[summarize] topic={topic_id} 从 chatlog 获取到 0 条有效消息")
            await _fail("未能从 chatlog 获取到有效消息,无法生成 AI 报告")
            return

        chat_text = "\n".join(messages_text) if messages_text else "无文字消息,仅有图片或媒体证据。"
        image_context = _image_evidence_context(image_successes, image_failures)
        learning_context = await build_report_learning_context(
            db,
            group_id=topic.get("group_id"),
            query=f"{topic.get('title', '')}\n{chat_text[:2000]}",
            exclude_topic_id=topic_id,
            purpose="summary",
        )

        # 3. Build the prompt.
        template_filled = MARKDOWN_TEMPLATE.format(title=topic["title"])
        prompt = (
            f"售后问题点话题:{topic['title']}\n\n"
            f"以下是该售后问题点关联的完整微信群聊天记录(按时间顺序):\n\n"
            f"{chat_text}\n\n"
            f"以下是系统将插入报告的现场图片信息(如有):\n\n{image_context or '无现场图片。'}\n\n"
            "请根据上述聊天记录输出一份 Markdown 报告。\n"
            "报告要求:\n"
            "1. 保持售后问题点口径,优先提炼问题现象、涉及产品/部件、现场材料、处理过程和处理结果。\n"
            "2. 只能使用聊天记录中能直接识别或合理归纳的信息,不要编造客户、合同、订单、物流、日期、价格、原因或处理结果。\n"
            "3. 不要输出空字段、空项目、空章节、空表格;某个章节没有有效内容时整段省略。\n"
            "4. 「是否解决」必须写在文档中,并使用:已解决 / 未解决 / 处理中 / 待确认。\n"
            "5. 「AI 建议/解决方法」必须写在文档中,且在段末附上固定注释:注:此方法由 AI 生成,仅供参考,请以人工复核和现场实际情况为准。\n"
            "6. 如果聊天内容不足以形成明确售后问题点,也不要编造结论;只按聊天中已有事实给出保守的待确认判断。\n"
            "7. 图片会由系统作为「现场图片」原始材料插入「关键聊天依据」;你不要猜测图片内容,也不要自行输出图片 Markdown 或图片说明。\n"
            "8. 如果聊天文字中有人描述图片内容,可以引用这些文字;但不要根据图片本身编造故障细节。\n"
            "9. 聊天记录中的「[引用消息]」属于当前回复的上下文证据,可以用于理解被回复的问题和处理过程。\n"
            "10. 只输出 Markdown 报告,不要输出模板说明或额外解释。\n\n"
            f"以下是本企业报告库中人工修订过的历史报告示例(如有)。请只学习它们的栏目结构、措辞风格、问题关注点和结论表达方式;不得复制历史事实、客户名、设备状态或处理结果到当前报告:\n\n{learning_context or '暂无可学习的人工修订报告。'}\n\n"
            f"{template_filled}"
        )

        # 4. Call the LLM.
        try:
            _client, _ai = await _get_client()
            async with asyncio.timeout(SUMMARY_LLM_TIMEOUT_SECONDS):
                resp = await _client.chat.completions.create(
                    model=_ai["summary_model"],
                    messages=[
                        {
                            "role": "system",
                            "content": (
                                "你是资深售后运营与设备服务工程师,负责根据微信群聊天记录整理具体售后问题点报告。"
                                "你必须忠实依据聊天记录,只输出已识别到的有效信息,缺失信息直接省略,不得编造。"
                                "你要在文档中明确给出是否解决结论,并给出 AI 建议/解决方法和免责声明。只输出 Markdown 报告,不要有任何额外说明。"
                            ),
                        },
                        {"role": "user", "content": prompt},
                    ],
                    temperature=0.2,
                )
            # The completion text may be None or blank; treat that as a failed
            # run instead of crashing on .strip() or storing an empty report.
            content = (resp.choices[0].message.content or "").strip()
            if not content:
                raise ValueError("AI 未返回有效的报告内容")
            content = _merge_image_sections(content, image_successes, image_failures)
        except TimeoutError:
            log.error(f"[summarize] LLM 调用超时 topic={topic_id}")
            await _fail("AI 报告生成超时,请检查模型/API或稍后重试")
            return
        except Exception as e:
            log.error(f"[summarize] LLM 调用失败 topic={topic_id}: {e}", exc_info=True)
            await _fail(str(e) or "LLM 调用失败")
            return

        # 5. Write the report into knowledge_docs (update in place if present).
        async with db.execute(
            "SELECT id FROM knowledge_docs WHERE topic_id = ?", (topic_id,)
        ) as cur:
            existing = await cur.fetchone()

        if existing:
            doc_id = existing["id"]
            await db.execute(
                "UPDATE knowledge_docs SET content = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                (content, doc_id),
            )
        else:
            await db.execute(
                "INSERT INTO knowledge_docs (topic_id, content) VALUES (?, ?)",
                (topic_id, content),
            )
            async with db.execute("SELECT last_insert_rowid() AS id") as cur:
                doc_id = (await cur.fetchone())["id"]

        # 6. Refresh the FTS row (delete, then insert the tokenized text).
        await db.execute("DELETE FROM knowledge_fts WHERE doc_id = ?", (doc_id,))
        await db.execute(
            "INSERT INTO knowledge_fts (doc_id, title, content) VALUES (?, ?, ?)",
            (doc_id, tokenize(topic["title"]), tokenize(content)),
        )

        await db.execute(
            "UPDATE topics SET status = 'completed', updated_at = CURRENT_TIMESTAMP WHERE id = ?",
            (topic_id,),
        )
        await db.commit()
        await _update_task("done", 1, 1)
        log.info(f"[summarize] topic={topic_id} doc={doc_id} 生成完成({len(content)} 字符)")
|
||||
|
||||
|
||||
async def run_summarize(topic_id: int, topic: dict, task_id: int | None = None):
    """Run the summarize pipeline and convert any escaped exception into a failed state.

    Delegates to ``_run_summarize_impl``. If it raises, the error is logged
    with its traceback and ``_mark_summarize_failed`` records it on the topic
    and task; the exception is not re-raised.
    """
    try:
        await _run_summarize_impl(topic_id, topic, task_id)
    except Exception as exc:
        # Some exceptions stringify to "", so fall back to the class name.
        reason = str(exc) or type(exc).__name__
        log.error(f"[summarize] 未捕获异常 topic={topic_id}: {reason}", exc_info=True)
        await _mark_summarize_failed(topic_id, task_id, reason)
|
||||
1094
chatlog_fastAPI/services/topic_engine.py
Normal file
1094
chatlog_fastAPI/services/topic_engine.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user