Initial upload for secondary development

2026-06-08 19:00:03 +08:00
commit b913b8c78c
81 changed files with 27139 additions and 0 deletions
--- a/chatlog_fastAPI/services/summary_engine.py
+++ b/chatlog_fastAPI/services/summary_engine.py
@@ -0,0 +1,476 @@
+"""
+售后报告生成引擎
+- 从 topic_messages 拿到所有 msg_seq
+- 通过 chatlog batch 接口批量拉回消息原文
+- 用配置的总结模型生成 Markdown 售后事件报告
+- 写入 knowledge_docs + knowledge_fts（jieba 分词）
+"""
+
+import asyncio
+import logging
+import json
+import aiosqlite
+from urllib.parse import quote
+
+from database import get_active_db_path
+from services.ai_client import get_openai_client
+from services.fts import tokenize
+from services.message_formatter import append_quote_text, extract_contents, extract_quote
+from services.report_learning import build_report_learning_context
+
+# Module-level logger, named after this module.
+log = logging.getLogger(__name__)
+
+# Number of message seqs sent per chatlog batch request when fetching message bodies.
+CHATLOG_BATCH_SIZE = 80
+# Upper bound, in seconds, applied via asyncio.timeout to the summary LLM call.
+SUMMARY_LLM_TIMEOUT_SECONDS = 300
+
+
+async def _get_client():
+    return await get_openai_client()
+
+
+def _message_line(item: dict, fallback_seq: int = 0) -> tuple[int, str] | None:
+    if not item:
+        return None
+    seq = item.get("seq") or item.get("Seq") or item.get("sort_seq") or fallback_seq or 0
+    time_str = item.get("create_time") or item.get("time") or item.get("CreateTime") or ""
+    sender = (
+        item.get("sender_name")
+        or item.get("senderName")
+        or item.get("SenderName")
+        or item.get("sender")
+        or item.get("Sender")
+        or ""
+    )
+    content = _message_text(item)
+    if not content:
+        return None
+    return int(seq), f"[{time_str}] {sender}: {content}"
+
+
+def _message_meta(item: dict, fallback_seq: int = 0) -> dict:
+    return {
+        "seq": int(item.get("seq") or item.get("Seq") or item.get("sort_seq") or fallback_seq or 0),
+        "time": item.get("create_time") or item.get("time") or item.get("CreateTime") or "",
+        "sender": (
+            item.get("sender_name")
+            or item.get("senderName")
+            or item.get("SenderName")
+            or item.get("sender")
+            or item.get("Sender")
+            or ""
+        ),
+        "type": item.get("type") or item.get("Type") or 1,
+    }
+
+
+def _extract_contents(item: dict) -> dict:
+    return extract_contents(item)
+
+
+def _message_text(item: dict) -> str:
+    content = item.get("content") or item.get("Content") or ""
+    contents = _extract_contents(item)
+    if isinstance(content, str) and content.lstrip().startswith("<") and extract_quote(item):
+        content = ""
+
+    link_title = contents.get("title") or item.get("link_title") or ""
+    link_desc = contents.get("desc") or item.get("link_desc") or ""
+    link_source = contents.get("sourceName") or contents.get("source_name") or item.get("link_source") or ""
+    link_url = contents.get("url") or item.get("link_url") or ""
+
+    if link_title:
+        parts = [f"[链接卡片] {link_title}"]
+        if link_desc:
+            parts.append(link_desc)
+        if link_source:
+            parts.append(f"来源：{link_source}")
+        if link_url:
+            parts.append(f"URL：{link_url}")
+        if content and content not in parts:
+            parts.append(content)
+        return append_quote_text("；".join(parts), item)
+
+    return append_quote_text(content, item)
+
+
+def _extract_image_key(item: dict) -> str:
+    contents = _extract_contents(item)
+    key = (
+        contents.get("rawmd5")
+        or contents.get("md5")
+        or contents.get("path")
+        or item.get("media_key")
+        or item.get("mediaKey")
+        or item.get("image_path")
+        or ""
+    )
+    return str(key).replace("\\", "/")
+
+
+def _is_image_message(item: dict) -> bool:
+    try:
+        return int(item.get("type") or item.get("Type") or 0) == 3
+    except Exception:
+        return False
+
+
+def _media_path(kind: str, key: str) -> str:
+    return f"/{kind}/" + "/".join(quote(part) for part in key.split("/"))
+
+
+def _image_url(key: str) -> str:
+    return f"{_media_path('image', key)}?thumb=1"
+
+
+def _collect_image_evidence(messages: list[dict]) -> tuple[list[dict], list[dict]]:
+    images: list[dict] = []
+    failures: list[dict] = []
+
+    for item in messages:
+        if not _is_image_message(item):
+            continue
+        meta = _message_meta(item)
+        key = _extract_image_key(item)
+        if not key:
+            failures.append({**meta, "url": "", "reason": "图片无法展示，缺少图片文件标识"})
+            continue
+
+        url = _image_url(key)
+        images.append({**meta, "key": key, "url": url})
+
+    return images, failures
+
+
+def _image_evidence_context(images: list[dict], failures: list[dict]) -> str:
+    lines: list[str] = []
+    if images:
+        lines.append("系统将作为原始材料插入报告的现场图片：")
+        for img in images:
+            lines.append(f"- [{img['time']}] {img['sender']} seq={img['seq']} url={img['url']}")
+    if failures:
+        lines.append("无法展示的图片清单：")
+        for img in failures:
+            link = f"，查看图片：{img['url']}" if img.get("url") else ""
+            lines.append(f"- [{img['time']}] {img['sender']} seq={img['seq']}：{img['reason']}{link}")
+    return "\n".join(lines)
+
+
+def _image_success_markdown(images: list[dict]) -> str:
+    if not images:
+        return ""
+    blocks = ["### 现场图片"]
+    for img in images:
+        alt = f"现场图片 - {img['time']} {img['sender']}".strip()
+        blocks.extend(
+            [
+                f"![{alt}]({img['url']})",
+                f"来源：{img['time']} {img['sender']} seq={img['seq']}",
+                "",
+            ]
+        )
+    return "\n".join(blocks).strip()
+
+
+def _image_failure_markdown(failures: list[dict]) -> str:
+    if not failures:
+        return ""
+    lines = ["## 图片展示提示"]
+    for img in failures:
+        link = f"，查看图片：{img['url']}" if img.get("url") else ""
+        lines.append(f"- [{img['time']}] {img['sender']} seq={img['seq']}：{img['reason']}{link}")
+    return "\n".join(lines)
+
+
+def _insert_after_heading(content: str, heading: str, addition: str) -> str:
+    if not addition:
+        return content
+    lines = content.splitlines()
+    for i, line in enumerate(lines):
+        if line.strip() == heading:
+            return "\n".join(lines[: i + 1] + ["", addition, ""] + lines[i + 1 :]).strip()
+    for i, line in enumerate(lines):
+        if line.startswith("# "):
+            return "\n".join(lines[: i + 1] + ["", heading, "", addition, ""] + lines[i + 1 :]).strip()
+    return f"{heading}\n\n{addition}\n\n{content}".strip()
+
+
+def _merge_image_sections(content: str, successes: list[dict], failures: list[dict]) -> str:
+    result = _insert_after_heading(content, "## 关键聊天依据", _image_success_markdown(successes))
+    failure_md = _image_failure_markdown(failures)
+    if failure_md:
+        result = f"{result.rstrip()}\n\n{failure_md}"
+    return result.strip()
+
+
+def _line_from_snapshot(raw: str | None, fallback_seq: int) -> str | None:
+    if not raw:
+        return None
+    try:
+        item = json.loads(raw)
+    except Exception:
+        return None
+    line = _message_line(item, fallback_seq)
+    return line[1] if line else None
+
+# Report instructions appended to the end of the LLM prompt. ``{title}`` is the
+# only placeholder and is filled with the topic title via ``str.format``.
+MARKDOWN_TEMPLATE = """\
+# {title}
+
+请按聊天记录中的实际内容生成一份【具体售后问题点】报告，不要照抄固定字段，也不要输出占位文案。
+
+必须围绕以下结构组织，按内容决定是否保留章节，不要输出空章节：
+## 问题摘要
+## 关键聊天依据
+## 当前处理状态
+## 是否解决
+## AI 建议/解决方法
+
+输出规则：
+- 只写聊天记录中能直接识别或合理归纳的信息。
+- 没有识别到的客户、门店、联系人、合同、订单、物流、日期、价格、原因等信息直接省略。
+- 不要写“未从聊天记录中识别”“待补充”“未知”“无”等占位内容。
+- “是否解决”只能从聊天记录判断，取值限定为：已解决、未解决、处理中、待确认。
+- 如果聊天内容不足以形成明确售后问题点，仍然按当前话题内容整理，但用更保守的“待确认”结论。
+- “AI 建议/解决方法”必须放在文档下方，并附注：注：此方法由 AI 生成，仅供参考，请以人工复核和现场实际情况为准。
+- 只输出 Markdown 报告，不要输出这些规则本身。
+"""
+
+
+async def _mark_summarize_failed(topic_id: int, task_id: int | None, error: str):
+    """Record a failed summarize run on the topic and, when given, on its task.
+
+    Sets ``topics.status`` to ``'error'``; if *task_id* is not ``None`` the
+    ``ai_tasks`` row also gets status ``'error'``, a 0/1 progress payload and
+    the error text (a default text is used when *error* is empty). Database
+    errors are logged as warnings and not re-raised.
+    """
+    db_path = get_active_db_path()
+    reason = error or "AI 报告生成失败"
+    try:
+        async with aiosqlite.connect(db_path) as conn:
+            await conn.execute(
+                "UPDATE topics SET status = 'error', updated_at = CURRENT_TIMESTAMP WHERE id = ?",
+                (topic_id,),
+            )
+            if task_id is not None:
+                progress_json = json.dumps({"processed": 0, "total": 1})
+                await conn.execute(
+                    """
+                    UPDATE ai_tasks
+                    SET status='error', progress=?, error=?, updated_at=CURRENT_TIMESTAMP
+                    WHERE id=?
+                    """,
+                    (progress_json, reason, task_id),
+                )
+            await conn.commit()
+    except Exception as err:
+        log.warning(f"[summarize] 标记失败状态失败 topic={topic_id} task={task_id}: {err}")
+
+
+async def _run_summarize_impl(topic_id: int, topic: dict, task_id: int | None = None):
+    """
+    为指定话题生成/更新 Markdown 售后事件报告。
+    由 POST /api/topics/{id}/summarize（手动触发）调用。
+    task_id: 若提供，则更新 ai_tasks 表的状态和进度。
+    """
+    path = get_active_db_path()
+
+    async def _update_task(status: str, processed: int = 0, total: int = 1, error: str = ""):
+        """辅助函数：更新 ai_tasks 状态和进度"""
+        if task_id is None:
+            return
+        try:
+            async with aiosqlite.connect(path) as _db:
+                _db.row_factory = aiosqlite.Row
+                await _db.execute(
+                    """
+                    UPDATE ai_tasks
+                    SET status=?, progress=?, error=?, updated_at=CURRENT_TIMESTAMP
+                    WHERE id=?
+                    """,
+                    (status, json.dumps({"processed": processed, "total": total}), error or None, task_id)
+                )
+                await _db.commit()
+        except Exception as e:
+            log.warning(f"[summarize] 更新 task {task_id} 失败: {e}")
+    path = get_active_db_path()
+    async with aiosqlite.connect(path) as db:
+        db.row_factory = aiosqlite.Row
+
+        # 将话题状态置为 processing
+        await db.execute("UPDATE topics SET status = 'processing', updated_at = CURRENT_TIMESTAMP WHERE id = ?", (topic_id,))
+        await db.commit()
+        await _update_task("running", 0, 1)
+
+        # 1. 拿到该话题的所有消息 seq 和群 talker
+        async with db.execute(
+            """
+            SELECT tm.msg_seq, tm.talker, tm.message_json
+            FROM topic_messages tm
+            WHERE tm.topic_id = ?
+            ORDER BY tm.msg_seq
+            """,
+            (topic_id,),
+        ) as cur:
+            msg_rows = await cur.fetchall()
+
+        if not msg_rows:
+            log.warning(f"[summarize] topic={topic_id} 没有消息，跳过")
+            error = "该话题没有关联消息，无法生成 AI 报告"
+            await db.execute("UPDATE topics SET status = 'error', updated_at = CURRENT_TIMESTAMP WHERE id = ?", (topic_id,))
+            await db.commit()
+            await _update_task("error", 0, 1, error)
+            return
+
+        seqs = [r["msg_seq"] for r in msg_rows]
+        # talker 在 topic_messages 里存的是群 ID（chatlog 叫 talker）
+        group_talker = msg_rows[0]["talker"]
+
+        # 2. 批量从 chatlog 拉取消息原文（最多 100 条/批）
+        from services.chatlog_client import chatlog_client
+        messages_text: list[str] = []
+        message_items: dict[int, dict] = {}
+
+        fetched_lines: dict[int, str] = {}
+        for i in range(0, len(seqs), CHATLOG_BATCH_SIZE):
+            chunk_seqs = seqs[i: i + CHATLOG_BATCH_SIZE]
+            try:
+                result = await chatlog_client.get_messages_batch(group_talker, chunk_seqs)
+                for m in result.get("items", []):
+                    meta = _message_meta(m)
+                    if meta["seq"]:
+                        message_items[meta["seq"]] = m
+                    line = _message_line(m)
+                    if line:
+                        fetched_lines[line[0]] = line[1]
+            except Exception as e:
+                log.error(f"[summarize] batch 拉取失败 topic={topic_id}: {e}")
+
+        for r in msg_rows:
+            seq = int(r["msg_seq"])
+            if seq in fetched_lines:
+                messages_text.append(fetched_lines[seq])
+                continue
+            snap_raw = r["message_json"] if "message_json" in r.keys() else None
+            if seq not in message_items and snap_raw:
+                try:
+                    snap_item = json.loads(snap_raw)
+                    if isinstance(snap_item, dict):
+                        message_items[seq] = snap_item
+                except Exception:
+                    pass
+            snap_line = _line_from_snapshot(snap_raw, seq)
+            if snap_line:
+                messages_text.append(snap_line)
+
+        image_successes, image_failures = _collect_image_evidence(
+            [message_items[seq] for seq in seqs if seq in message_items]
+        )
+
+        if not messages_text and not image_successes and not image_failures:
+            log.warning(f"[summarize] topic={topic_id} 从 chatlog 获取到 0 条有效消息")
+            error = "未能从 chatlog 获取到有效消息，无法生成 AI 报告"
+            await db.execute("UPDATE topics SET status = 'error', updated_at = CURRENT_TIMESTAMP WHERE id = ?", (topic_id,))
+            await db.commit()
+            await _update_task("error", 0, 1, error)
+            return
+
+        chat_text = "\n".join(messages_text) if messages_text else "无文字消息，仅有图片或媒体证据。"
+        image_context = _image_evidence_context(image_successes, image_failures)
+        learning_context = await build_report_learning_context(
+            db,
+            group_id=topic.get("group_id"),
+            query=f"{topic.get('title', '')}\n{chat_text[:2000]}",
+            exclude_topic_id=topic_id,
+            purpose="summary",
+        )
+
+        # 3. 构建 Prompt
+        template_filled = MARKDOWN_TEMPLATE.format(title=topic["title"])
+        prompt = (
+            f"售后问题点话题：{topic['title']}\n\n"
+            f"以下是该售后问题点关联的完整微信群聊天记录（按时间顺序）：\n\n"
+            f"{chat_text}\n\n"
+            f"以下是系统将插入报告的现场图片信息（如有）：\n\n{image_context or '无现场图片。'}\n\n"
+            "请根据上述聊天记录输出一份 Markdown 报告。\n"
+            "报告要求：\n"
+            "1. 保持售后问题点口径，优先提炼问题现象、涉及产品/部件、现场材料、处理过程和处理结果。\n"
+            "2. 只能使用聊天记录中能直接识别或合理归纳的信息，不要编造客户、合同、订单、物流、日期、价格、原因或处理结果。\n"
+            "3. 不要输出空字段、空项目、空章节、空表格；某个章节没有有效内容时整段省略。\n"
+            "4. 「是否解决」必须写在文档中，并使用：已解决 / 未解决 / 处理中 / 待确认。\n"
+            "5. 「AI 建议/解决方法」必须写在文档中，且在段末附上固定注释：注：此方法由 AI 生成，仅供参考，请以人工复核和现场实际情况为准。\n"
+            "6. 如果聊天内容不足以形成明确售后问题点，也不要编造结论；只按聊天中已有事实给出保守的待确认判断。\n"
+            "7. 图片会由系统作为「现场图片」原始材料插入「关键聊天依据」；你不要猜测图片内容，也不要自行输出图片 Markdown 或图片说明。\n"
+            "8. 如果聊天文字中有人描述图片内容，可以引用这些文字；但不要根据图片本身编造故障细节。\n"
+            "9. 聊天记录中的「[引用消息]」属于当前回复的上下文证据，可以用于理解被回复的问题和处理过程。\n"
+            "10. 只输出 Markdown 报告，不要输出模板说明或额外解释。\n\n"
+            f"以下是本企业报告库中人工修订过的历史报告示例（如有）。请只学习它们的栏目结构、措辞风格、问题关注点和结论表达方式；不得复制历史事实、客户名、设备状态或处理结果到当前报告：\n\n{learning_context or '暂无可学习的人工修订报告。'}\n\n"
+            f"{template_filled}"
+        )
+
+        # 4. 调用 LLM
+        try:
+            _client, _ai = await _get_client()
+            async with asyncio.timeout(SUMMARY_LLM_TIMEOUT_SECONDS):
+                resp = await _client.chat.completions.create(
+                    model=_ai["summary_model"],
+                    messages=[
+                        {
+                            "role": "system",
+                            "content": (
+                                "你是资深售后运营与设备服务工程师，负责根据微信群聊天记录整理具体售后问题点报告。"
+                                "你必须忠实依据聊天记录，只输出已识别到的有效信息，缺失信息直接省略，不得编造。"
+                                "你要在文档中明确给出是否解决结论，并给出 AI 建议/解决方法和免责声明。只输出 Markdown 报告，不要有任何额外说明。"
+                            ),
+                        },
+                        {"role": "user", "content": prompt},
+                    ],
+                    temperature=0.2,
+                )
+            content = resp.choices[0].message.content.strip()
+            content = _merge_image_sections(content, image_successes, image_failures)
+        except TimeoutError:
+            error = "AI 报告生成超时，请检查模型/API或稍后重试"
+            log.error(f"[summarize] LLM 调用超时 topic={topic_id}")
+            await db.execute("UPDATE topics SET status = 'error', updated_at = CURRENT_TIMESTAMP WHERE id = ?", (topic_id,))
+            await db.commit()
+            await _update_task("error", 0, 1, error)
+            return
+        except Exception as e:
+            log.error(f"[summarize] LLM 调用失败 topic={topic_id}: {e}", exc_info=True)
+            await db.execute("UPDATE topics SET status = 'error', updated_at = CURRENT_TIMESTAMP WHERE id = ?", (topic_id,))
+            await db.commit()
+            await _update_task("error", 0, 1, str(e) or "LLM 调用失败")
+            return
+
+        # 5. 写入 knowledge_docs
+        async with db.execute(
+            "SELECT id FROM knowledge_docs WHERE topic_id = ?", (topic_id,)
+        ) as cur:
+            existing = await cur.fetchone()
+
+        if existing:
+            doc_id = existing["id"]
+            await db.execute(
+                "UPDATE knowledge_docs SET content = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
+                (content, doc_id),
+            )
+        else:
+            await db.execute(
+                "INSERT INTO knowledge_docs (topic_id, content) VALUES (?, ?)",
+                (topic_id, content),
+            )
+            async with db.execute("SELECT last_insert_rowid() AS id") as cur:
+                doc_id = (await cur.fetchone())["id"]
+
+        # 6. 更新 FTS（先删后插）
+        await db.execute("DELETE FROM knowledge_fts WHERE doc_id = ?", (doc_id,))
+        await db.execute(
+            "INSERT INTO knowledge_fts (doc_id, title, content) VALUES (?, ?, ?)",
+            (doc_id, tokenize(topic["title"]), tokenize(content)),
+        )
+
+        await db.execute("UPDATE topics SET status = 'completed', updated_at = CURRENT_TIMESTAMP WHERE id = ?", (topic_id,))
+        await db.commit()
+        await _update_task("done", 1, 1)
+        log.info(f"[summarize] topic={topic_id} doc={doc_id} 生成完成（{len(content)} 字符）")
+
+
+async def run_summarize(topic_id: int, topic: dict, task_id: int | None = None):
+    try:
+        await _run_summarize_impl(topic_id, topic, task_id)
+    except Exception as e:
+        error = str(e) or e.__class__.__name__
+        log.error(f"[summarize] 未捕获异常 topic={topic_id}: {error}", exc_info=True)
+        await _mark_summarize_failed(topic_id, task_id, error)