Files
get_wechat/chatlog_fastAPI/services/summary_engine.py

477 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
售后报告生成引擎
- 从 topic_messages 拿到所有 msg_seq
- 通过 chatlog batch 接口批量拉回消息原文
- 用配置的总结模型生成 Markdown 售后事件报告
- 写入 knowledge_docs + knowledge_ftsjieba 分词)
"""
import asyncio
import logging
import json
import aiosqlite
from urllib.parse import quote
from database import get_active_db_path
from services.ai_client import get_openai_client
from services.fts import tokenize
from services.message_formatter import append_quote_text, extract_contents, extract_quote
from services.report_learning import build_report_learning_context
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Number of message seqs requested from the chatlog batch endpoint per call.
CHATLOG_BATCH_SIZE = 80
# Time limit, in seconds, applied to the summary model call via asyncio.timeout.
SUMMARY_LLM_TIMEOUT_SECONDS = 300
async def _get_client():
    """Return the result of ``get_openai_client()``.

    The summarize flow in this module unpacks the result as a
    ``(client, ai_config)`` pair.
    """
    client_bundle = await get_openai_client()
    return client_bundle
def _message_line(item: dict, fallback_seq: int = 0) -> tuple[int, str] | None:
if not item:
return None
seq = item.get("seq") or item.get("Seq") or item.get("sort_seq") or fallback_seq or 0
time_str = item.get("create_time") or item.get("time") or item.get("CreateTime") or ""
sender = (
item.get("sender_name")
or item.get("senderName")
or item.get("SenderName")
or item.get("sender")
or item.get("Sender")
or ""
)
content = _message_text(item)
if not content:
return None
return int(seq), f"[{time_str}] {sender}: {content}"
def _message_meta(item: dict, fallback_seq: int = 0) -> dict:
return {
"seq": int(item.get("seq") or item.get("Seq") or item.get("sort_seq") or fallback_seq or 0),
"time": item.get("create_time") or item.get("time") or item.get("CreateTime") or "",
"sender": (
item.get("sender_name")
or item.get("senderName")
or item.get("SenderName")
or item.get("sender")
or item.get("Sender")
or ""
),
"type": item.get("type") or item.get("Type") or 1,
}
def _extract_contents(item: dict) -> dict:
    """Return ``extract_contents(item)`` from ``services.message_formatter``.

    Kept as a local indirection so the helpers in this module share a single
    call site for the structured-contents lookup.
    """
    parsed = extract_contents(item)
    return parsed
def _message_text(item: dict) -> str:
    """Build the plain-text rendering of one message.

    For link cards (a title is present) the title, description, source and
    URL are rendered as one line, separated by ``;`` so the individual fields
    stay distinguishable. Any remaining body text is appended after them.
    The result is passed through ``append_quote_text`` (presumably adds the
    quoted message, if any — verify against ``services.message_formatter``).

    Args:
        item: Message payload.

    Returns:
        The text produced by ``append_quote_text`` for this message.
    """
    content = item.get("content") or item.get("Content") or ""
    contents = _extract_contents(item)

    # A body that looks like markup on a message for which extract_quote()
    # reports a quote is dropped, so the raw payload is not shown as text.
    if isinstance(content, str) and content.lstrip().startswith("<") and extract_quote(item):
        content = ""

    link_title = contents.get("title") or item.get("link_title") or ""
    link_desc = contents.get("desc") or item.get("link_desc") or ""
    link_source = contents.get("sourceName") or contents.get("source_name") or item.get("link_source") or ""
    link_url = contents.get("url") or item.get("link_url") or ""

    if not link_title:
        return append_quote_text(content, item)

    parts = [f"[链接卡片] {link_title}"]
    if link_desc:
        parts.append(link_desc)
    if link_source:
        parts.append(f"来源:{link_source}")
    if link_url:
        # Keep a delimiter between the label and the address.
        parts.append(f"URL:{link_url}")
    if content and content not in parts:
        parts.append(content)
    # Coerce to str so a non-string field cannot break the join.
    return append_quote_text(";".join(str(part) for part in parts), item)
def _extract_image_key(item: dict) -> str:
    """Return the identifier used to address a message's image file.

    Candidates are tried in order: ``rawmd5``, ``md5`` and ``path`` from the
    extracted contents, then ``media_key``, ``mediaKey`` and ``image_path``
    from the message itself. The first truthy one is stringified with
    backslashes turned into forward slashes; an empty string means none was
    found.
    """
    contents = _extract_contents(item)
    candidates = (
        contents.get("rawmd5"),
        contents.get("md5"),
        contents.get("path"),
        item.get("media_key"),
        item.get("mediaKey"),
        item.get("image_path"),
    )
    chosen = next((candidate for candidate in candidates if candidate), "")
    return str(chosen).replace("\\", "/")
def _is_image_message(item: dict) -> bool:
try:
return int(item.get("type") or item.get("Type") or 0) == 3
except Exception:
return False
def _media_path(kind: str, key: str) -> str:
return f"/{kind}/" + "/".join(quote(part) for part in key.split("/"))
def _image_url(key: str) -> str:
    """Return the image path for ``key`` with the ``thumb=1`` query appended."""
    base_path = _media_path("image", key)
    return base_path + "?thumb=1"
def _collect_image_evidence(messages: list[dict]) -> tuple[list[dict], list[dict]]:
images: list[dict] = []
failures: list[dict] = []
for item in messages:
if not _is_image_message(item):
continue
meta = _message_meta(item)
key = _extract_image_key(item)
if not key:
failures.append({**meta, "url": "", "reason": "图片无法展示,缺少图片文件标识"})
continue
url = _image_url(key)
images.append({**meta, "key": key, "url": url})
return images, failures
def _image_evidence_context(images: list[dict], failures: list[dict]) -> str:
lines: list[str] = []
if images:
lines.append("系统将作为原始材料插入报告的现场图片:")
for img in images:
lines.append(f"- [{img['time']}] {img['sender']} seq={img['seq']} url={img['url']}")
if failures:
lines.append("无法展示的图片清单:")
for img in failures:
link = f",查看图片:{img['url']}" if img.get("url") else ""
lines.append(f"- [{img['time']}] {img['sender']} seq={img['seq']}{img['reason']}{link}")
return "\n".join(lines)
def _image_success_markdown(images: list[dict]) -> str:
if not images:
return ""
blocks = ["### 现场图片"]
for img in images:
alt = f"现场图片 - {img['time']} {img['sender']}".strip()
blocks.extend(
[
f"![{alt}]({img['url']})",
f"来源:{img['time']} {img['sender']} seq={img['seq']}",
"",
]
)
return "\n".join(blocks).strip()
def _image_failure_markdown(failures: list[dict]) -> str:
if not failures:
return ""
lines = ["## 图片展示提示"]
for img in failures:
link = f",查看图片:{img['url']}" if img.get("url") else ""
lines.append(f"- [{img['time']}] {img['sender']} seq={img['seq']}{img['reason']}{link}")
return "\n".join(lines)
def _insert_after_heading(content: str, heading: str, addition: str) -> str:
if not addition:
return content
lines = content.splitlines()
for i, line in enumerate(lines):
if line.strip() == heading:
return "\n".join(lines[: i + 1] + ["", addition, ""] + lines[i + 1 :]).strip()
for i, line in enumerate(lines):
if line.startswith("# "):
return "\n".join(lines[: i + 1] + ["", heading, "", addition, ""] + lines[i + 1 :]).strip()
return f"{heading}\n\n{addition}\n\n{content}".strip()
def _merge_image_sections(content: str, successes: list[dict], failures: list[dict]) -> str:
    """Combine the report text with the image sections.

    Displayable images are inserted below the ``## 关键聊天依据`` heading;
    the failure list, when there is one, is appended at the end.
    """
    success_md = _image_success_markdown(successes)
    merged = _insert_after_heading(content, "## 关键聊天依据", success_md)
    failure_md = _image_failure_markdown(failures)
    if not failure_md:
        return merged.strip()
    return f"{merged.rstrip()}\n\n{failure_md}".strip()
def _line_from_snapshot(raw: str | None, fallback_seq: int) -> str | None:
if not raw:
return None
try:
item = json.loads(raw)
except Exception:
return None
line = _message_line(item, fallback_seq)
return line[1] if line else None
# Instruction template placed at the end of the summary prompt. ``{title}`` is
# its only placeholder and is filled with the topic title through str.format,
# so any literal brace added to this text must be doubled.
MARKDOWN_TEMPLATE = """\
# {title}
请按聊天记录中的实际内容生成一份【具体售后问题点】报告,不要照抄固定字段,也不要输出占位文案。
必须围绕以下结构组织,按内容决定是否保留章节,不要输出空章节:
## 问题摘要
## 关键聊天依据
## 当前处理状态
## 是否解决
## AI 建议/解决方法
输出规则:
- 只写聊天记录中能直接识别或合理归纳的信息。
- 没有识别到的客户、门店、联系人、合同、订单、物流、日期、价格、原因等信息直接省略。
- 不要写“未从聊天记录中识别”“待补充”“未知”“无”等占位内容。
- “是否解决”只能从聊天记录判断,取值限定为:已解决、未解决、处理中、待确认。
- 如果聊天内容不足以形成明确售后问题点,仍然按当前话题内容整理,但用更保守的“待确认”结论。
- “AI 建议/解决方法”必须放在文档下方,并附注:注:此方法由 AI 生成,仅供参考,请以人工复核和现场实际情况为准。
- 只输出 Markdown 报告,不要输出这些规则本身。
"""
async def _mark_summarize_failed(topic_id: int, task_id: int | None, error: str):
    """Record a failed summarize run on the topic and, if given, its task.

    Sets ``topics.status`` to ``'error'``. When ``task_id`` is not ``None``
    the ``ai_tasks`` row also gets status ``'error'``, a 0/1 progress and the
    error text (a default message when ``error`` is empty). Database errors
    are logged as warnings and not re-raised.
    """
    db_path = get_active_db_path()
    failure_text = error if error else "AI 报告生成失败"
    try:
        async with aiosqlite.connect(db_path) as conn:
            await conn.execute(
                "UPDATE topics SET status = 'error', updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                (topic_id,),
            )
            if task_id is not None:
                progress_json = json.dumps({"processed": 0, "total": 1})
                await conn.execute(
                    """
                    UPDATE ai_tasks
                    SET status='error', progress=?, error=?, updated_at=CURRENT_TIMESTAMP
                    WHERE id=?
                    """,
                    (progress_json, failure_text, task_id),
                )
            await conn.commit()
    except Exception as err:
        log.warning(f"[summarize] 标记失败状态失败 topic={topic_id} task={task_id}: {err}")
def _build_summary_prompt(
    topic: dict,
    chat_text: str,
    image_context: str,
    learning_context: str,
) -> str:
    """Assemble the user prompt sent to the summary model.

    Concatenates the topic title, the chat transcript, the image evidence
    listing, the numbered report requirements, the learned report examples
    and the filled-in ``MARKDOWN_TEMPLATE``.
    """
    template_filled = MARKDOWN_TEMPLATE.format(title=topic["title"])
    return (
        f"售后问题点话题:{topic['title']}\n\n"
        "以下是该售后问题点关联的完整微信群聊天记录(按时间顺序):\n\n"
        f"{chat_text}\n\n"
        f"以下是系统将插入报告的现场图片信息(如有):\n\n{image_context or '无现场图片。'}\n\n"
        "请根据上述聊天记录输出一份 Markdown 报告。\n"
        "报告要求:\n"
        "1. 保持售后问题点口径,优先提炼问题现象、涉及产品/部件、现场材料、处理过程和处理结果。\n"
        "2. 只能使用聊天记录中能直接识别或合理归纳的信息,不要编造客户、合同、订单、物流、日期、价格、原因或处理结果。\n"
        "3. 不要输出空字段、空项目、空章节、空表格;某个章节没有有效内容时整段省略。\n"
        "4. 「是否解决」必须写在文档中,并使用:已解决 / 未解决 / 处理中 / 待确认。\n"
        "5. 「AI 建议/解决方法」必须写在文档中,且在段末附上固定注释:注:此方法由 AI 生成,仅供参考,请以人工复核和现场实际情况为准。\n"
        "6. 如果聊天内容不足以形成明确售后问题点,也不要编造结论;只按聊天中已有事实给出保守的待确认判断。\n"
        "7. 图片会由系统作为「现场图片」原始材料插入「关键聊天依据」;你不要猜测图片内容,也不要自行输出图片 Markdown 或图片说明。\n"
        "8. 如果聊天文字中有人描述图片内容,可以引用这些文字;但不要根据图片本身编造故障细节。\n"
        "9. 聊天记录中的「[引用消息]」属于当前回复的上下文证据,可以用于理解被回复的问题和处理过程。\n"
        "10. 只输出 Markdown 报告,不要输出模板说明或额外解释。\n\n"
        f"以下是本企业报告库中人工修订过的历史报告示例(如有)。请只学习它们的栏目结构、措辞风格、问题关注点和结论表达方式;不得复制历史事实、客户名、设备状态或处理结果到当前报告:\n\n{learning_context or '暂无可学习的人工修订报告。'}\n\n"
        f"{template_filled}"
    )


async def _run_summarize_impl(topic_id: int, topic: dict, task_id: int | None = None):
    """Generate or refresh the Markdown after-sales report for one topic.

    Invoked for POST /api/topics/{id}/summarize (manual trigger).

    Steps: mark the topic as processing, load its message seqs, fetch the
    message bodies from chatlog in batches (falling back to the stored JSON
    snapshot per message), build the prompt, call the summary model, then
    store the report in knowledge_docs / knowledge_fts and mark the topic
    completed. Every handled failure marks the topic (and task) as ``error``
    and returns without raising.

    Args:
        topic_id: Id of the topic to summarize.
        topic: Topic record; ``title`` is required, ``group_id`` is optional.
        task_id: When given, the matching ai_tasks row has its status and
            progress updated along the way.
    """
    path = get_active_db_path()

    async def _update_task(status: str, processed: int = 0, total: int = 1, error: str = ""):
        """Best-effort update of the ai_tasks row; a no-op without a task_id."""
        if task_id is None:
            return
        try:
            async with aiosqlite.connect(path) as _db:
                await _db.execute(
                    """
                    UPDATE ai_tasks
                    SET status=?, progress=?, error=?, updated_at=CURRENT_TIMESTAMP
                    WHERE id=?
                    """,
                    (status, json.dumps({"processed": processed, "total": total}), error or None, task_id),
                )
                await _db.commit()
        except Exception as e:
            log.warning(f"[summarize] 更新 task {task_id} 失败: {e}")

    async with aiosqlite.connect(path) as db:
        db.row_factory = aiosqlite.Row

        async def _fail(error: str) -> None:
            """Mark the topic as failed and mirror the error on the task."""
            await db.execute(
                "UPDATE topics SET status = 'error', updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                (topic_id,),
            )
            await db.commit()
            await _update_task("error", 0, 1, error)

        # Flag the topic as being processed.
        await db.execute(
            "UPDATE topics SET status = 'processing', updated_at = CURRENT_TIMESTAMP WHERE id = ?",
            (topic_id,),
        )
        await db.commit()
        await _update_task("running", 0, 1)

        # 1. Load every message seq of the topic together with the group talker.
        async with db.execute(
            """
            SELECT tm.msg_seq, tm.talker, tm.message_json
            FROM topic_messages tm
            WHERE tm.topic_id = ?
            ORDER BY tm.msg_seq
            """,
            (topic_id,),
        ) as cur:
            msg_rows = await cur.fetchall()

        if not msg_rows:
            log.warning(f"[summarize] topic={topic_id} 没有消息,跳过")
            await _fail("该话题没有关联消息,无法生成 AI 报告")
            return

        # Normalise to int once: message_items and fetched_lines are keyed by int.
        seqs = [int(r["msg_seq"]) for r in msg_rows]
        # topic_messages.talker holds the group id (chatlog calls it "talker").
        group_talker = msg_rows[0]["talker"]

        # 2. Fetch the message bodies from chatlog, CHATLOG_BATCH_SIZE seqs per call.
        from services.chatlog_client import chatlog_client

        messages_text: list[str] = []
        message_items: dict[int, dict] = {}
        fetched_lines: dict[int, str] = {}
        for i in range(0, len(seqs), CHATLOG_BATCH_SIZE):
            chunk_seqs = seqs[i: i + CHATLOG_BATCH_SIZE]
            try:
                result = await chatlog_client.get_messages_batch(group_talker, chunk_seqs)
                for m in result.get("items", []):
                    meta = _message_meta(m)
                    if meta["seq"]:
                        message_items[meta["seq"]] = m
                    line = _message_line(m)
                    if line:
                        fetched_lines[line[0]] = line[1]
            except Exception as e:
                # A failed batch is logged; its messages fall back to snapshots below.
                log.error(f"[summarize] batch 拉取失败 topic={topic_id}: {e}")

        for r, seq in zip(msg_rows, seqs):
            if seq in fetched_lines:
                messages_text.append(fetched_lines[seq])
                continue
            # Fall back to the JSON snapshot stored alongside the topic message.
            snap_raw = r["message_json"]
            if not snap_raw:
                continue
            try:
                snap_item = json.loads(snap_raw)
            except (TypeError, ValueError):
                continue
            # Only JSON objects are usable; anything else has no message fields.
            if not isinstance(snap_item, dict):
                continue
            if seq not in message_items:
                message_items[seq] = snap_item
            snap_line = _message_line(snap_item, seq)
            if snap_line:
                messages_text.append(snap_line[1])

        image_successes, image_failures = _collect_image_evidence(
            [message_items[seq] for seq in seqs if seq in message_items]
        )

        if not messages_text and not image_successes and not image_failures:
            log.warning(f"[summarize] topic={topic_id} 从 chatlog 获取到 0 条有效消息")
            await _fail("未能从 chatlog 获取到有效消息,无法生成 AI 报告")
            return

        chat_text = "\n".join(messages_text) if messages_text else "无文字消息,仅有图片或媒体证据。"
        image_context = _image_evidence_context(image_successes, image_failures)
        learning_context = await build_report_learning_context(
            db,
            group_id=topic.get("group_id"),
            query=f"{topic.get('title', '')}\n{chat_text[:2000]}",
            exclude_topic_id=topic_id,
            purpose="summary",
        )

        # 3. Build the prompt.
        prompt = _build_summary_prompt(topic, chat_text, image_context, learning_context)

        # 4. Call the LLM.
        try:
            _client, _ai = await _get_client()
            async with asyncio.timeout(SUMMARY_LLM_TIMEOUT_SECONDS):
                resp = await _client.chat.completions.create(
                    model=_ai["summary_model"],
                    messages=[
                        {
                            "role": "system",
                            "content": (
                                "你是资深售后运营与设备服务工程师,负责根据微信群聊天记录整理具体售后问题点报告。"
                                "你必须忠实依据聊天记录,只输出已识别到的有效信息,缺失信息直接省略,不得编造。"
                                "你要在文档中明确给出是否解决结论,并给出 AI 建议/解决方法和免责声明。只输出 Markdown 报告,不要有任何额外说明。"
                            ),
                        },
                        {"role": "user", "content": prompt},
                    ],
                    temperature=0.2,
                )
            # The message content may be None; treat that the same as an empty answer.
            content = (resp.choices[0].message.content or "").strip()
            if content:
                content = _merge_image_sections(content, image_successes, image_failures)
        except TimeoutError:
            log.error(f"[summarize] LLM 调用超时 topic={topic_id}")
            await _fail("AI 报告生成超时,请检查模型/API或稍后重试")
            return
        except Exception as e:
            log.error(f"[summarize] LLM 调用失败 topic={topic_id}: {e}", exc_info=True)
            await _fail(str(e) or "LLM 调用失败")
            return

        if not content:
            # Do not store an empty report as "completed".
            log.error(f"[summarize] LLM 返回空内容 topic={topic_id}")
            await _fail("AI 模型返回内容为空,无法生成报告")
            return

        # 5. Write the report into knowledge_docs (update in place if present).
        async with db.execute(
            "SELECT id FROM knowledge_docs WHERE topic_id = ?", (topic_id,)
        ) as cur:
            existing = await cur.fetchone()

        if existing:
            doc_id = existing["id"]
            await db.execute(
                "UPDATE knowledge_docs SET content = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                (content, doc_id),
            )
        else:
            await db.execute(
                "INSERT INTO knowledge_docs (topic_id, content) VALUES (?, ?)",
                (topic_id, content),
            )
            async with db.execute("SELECT last_insert_rowid() AS id") as cur:
                doc_id = (await cur.fetchone())["id"]

        # 6. Refresh the FTS row (delete, then insert).
        await db.execute("DELETE FROM knowledge_fts WHERE doc_id = ?", (doc_id,))
        await db.execute(
            "INSERT INTO knowledge_fts (doc_id, title, content) VALUES (?, ?, ?)",
            (doc_id, tokenize(topic["title"]), tokenize(content)),
        )
        await db.execute(
            "UPDATE topics SET status = 'completed', updated_at = CURRENT_TIMESTAMP WHERE id = ?",
            (topic_id,),
        )
        await db.commit()
        await _update_task("done", 1, 1)
        log.info(f"[summarize] topic={topic_id} doc={doc_id} 生成完成({len(content)} 字符)")
async def run_summarize(topic_id: int, topic: dict, task_id: int | None = None):
    """Run the summarize flow and turn any escaping exception into a failed state.

    Delegates to ``_run_summarize_impl``. An exception that reaches this
    level is logged with its traceback and recorded through
    ``_mark_summarize_failed``; it is not re-raised.
    """
    try:
        await _run_summarize_impl(topic_id, topic, task_id)
    except Exception as exc:
        reason = str(exc) or type(exc).__name__
        log.error(f"[summarize] 未捕获异常 topic={topic_id}: {reason}", exc_info=True)
        await _mark_summarize_failed(topic_id, task_id, reason)