get_wechat/chatlog_fastAPI/services/topic_engine.py

"""
话题分类引擎

用户点击 AI 分析后，按所选时间段全量分页拉取消息，解析可用媒体证据，
再分批抽取、跨批合并并校验消息归属。
"""

import asyncio
import json
import logging
import re
from datetime import datetime
import aiosqlite
from fastapi import HTTPException

from database import get_active_db_path
from services.ai_client import get_openai_client
from services.chatlog_client import chatlog_client
from services.message_formatter import append_quote_text, extract_quote
from services.media_parser import parse_media
from services.report_learning import build_report_learning_context
from services.runtime_settings import get_ai_settings

log = logging.getLogger(__name__)

_classify_lock = asyncio.Lock()
_classifying_group: int | None = None

FETCH_PAGE_SIZE = 500
CLASSIFY_BATCH_SIZE = 40
CONTEXT_BEFORE = 4
CONTEXT_AFTER = 4
CLASSIFY_CONTENT_LIMIT = 900
MEDIA_PARSE_CONCURRENCY = 2
UNCATEGORIZED_BUSINESS = "未归类设备问题（需人工复核）"
DAILY_TOPIC = "日常交流/待确认"
DEVICE_ISSUE_EXAMPLES = (
    "报警/故障代码", "主轴异常", "刀库/换刀异常", "回零/限位异常",
    "伺服/驱动异常", "加工精度异常", "气压/液压异常", "润滑/冷却异常",
    "电气线路/IO异常", "系统参数/程序操作问题",
)

BUSINESS_KEYWORDS = (
    "设备", "机床", "数控", "加工中心", "CNC", "cnc", "大铁", "客户", "公司", "厂家", "现场",
    "报警", "警报", "报错", "故障", "故障码", "代码", "异常", "报修", "维修", "售后", "工程师",
    "主轴", "转速", "拉刀", "松刀", "刀库", "换刀", "刀臂", "刀盘", "刀号", "卡刀", "掉刀",
    "回零", "原点", "限位", "行程", "伺服", "驱动", "驱动器", "电机", "编码器", "变频器",
    "丝杆", "导轨", "精度", "尺寸", "跑偏", "震刀", "振刀", "圆度", "平面度", "加工",
    "气压", "液压", "油压", "润滑", "冷却", "水泵", "切削液", "漏油", "漏水", "异响",
    "电柜", "线路", "电源", "跳闸", "断电", "IO", "I/O", "PLC", "系统", "参数", "程序",
    "G代码", "M代码", "面板", "手轮", "急停", "排屑", "卡住", "调试", "安装", "处理",
)


async def _get_client():
    return await get_openai_client()


def get_classifying_group() -> int | None:
    """返回正在分析的 group_id，没有则 None。供 routers 检查串行状态。"""
    return _classifying_group


def _msg_seq(m: dict) -> int:
    return int(m.get("sort_seq") or m.get("seq") or m.get("Seq") or m.get("id") or 0)


def _msg_time(m: dict) -> str:
    return m.get("create_time") or m.get("time") or m.get("CreateTime") or ""


def _msg_sender(m: dict) -> str:
    return (
        m.get("sender_name")
        or m.get("senderName")
        or m.get("SenderName")
        or m.get("sender")
        or m.get("Sender")
        or ""
    )


def _raw_sender_id(m: dict) -> str:
    return str(m.get("sender") or m.get("Sender") or "").strip()


def _looks_like_raw_sender_id(value: str | None) -> bool:
    value = (value or "").strip()
    return (
        not value
        or value.startswith("wxid_")
        or value.startswith("gh_")
        or value.endswith("@chatroom")
        or value.startswith("chatroom_")
    )


def _sender_display_name(m: dict, sender: str = "") -> str:
    for key in (
        "sender_name",
        "senderName",
        "SenderName",
        "accountName",
        "groupNickname",
        "displayName",
        "nickName",
        "remark",
    ):
        value = str(m.get(key) or "").strip()
        if value and value != sender and not _looks_like_raw_sender_id(value):
            return value
    return ""


def _fill_message_sender_names(messages: list[dict]) -> None:
    names: dict[str, str] = {}
    for m in messages:
        sender = _raw_sender_id(m)
        name = _sender_display_name(m, sender)
        if sender and name:
            names[sender] = name
    if not names:
        return
    for m in messages:
        sender = _raw_sender_id(m)
        if not sender or sender not in names:
            continue
        current = str(m.get("sender_name") or m.get("senderName") or m.get("SenderName") or "").strip()
        if not current or current == sender or _looks_like_raw_sender_id(current):
            m["sender_name"] = names[sender]
            m["senderName"] = names[sender]


def _msg_type(m: dict) -> int:
    try:
        return int(m.get("type") or m.get("Type") or 1)
    except Exception:
        return 1


def _msg_sub_type(m: dict) -> int:
    try:
        return int(m.get("sub_type") or m.get("subType") or m.get("SubType") or 0)
    except Exception:
        return 0


def _contents(m: dict) -> dict:
    contents = m.get("contents") or m.get("Contents") or {}
    return contents if isinstance(contents, dict) else {}


def _media_key(m: dict) -> str:
    contents = _contents(m)
    key = (
        contents.get("rawmd5")
        or contents.get("md5")
        or contents.get("path")
        or m.get("media_key")
        or m.get("mediaKey")
        or m.get("image_path")
        or ""
    )
    return str(key).replace("\\", "/")


def _voice_key(m: dict) -> str:
    contents = _contents(m)
    if contents.get("voice"):
        return str(contents.get("voice"))
    return str(m.get("voice_key") or m.get("voiceKey") or "")


def _message_snapshot(m: dict) -> str:
    """保存分类时拿到的原始消息快照，供后续 chatlog batch 查不到时兜底显示。"""
    return json.dumps(m, ensure_ascii=False, separators=(",", ":"))


def _base_msg_content(m: dict) -> str:
    content = m.get("content") or m.get("Content") or ""
    contents = _contents(m)
    if isinstance(content, str) and content.lstrip().startswith("<") and extract_quote(m):
        content = ""

    link_title = contents.get("title") or m.get("link_title") or ""
    link_desc = contents.get("desc") or m.get("link_desc") or ""
    link_source = contents.get("sourceName") or contents.get("source_name") or m.get("link_source") or ""
    link_url = contents.get("url") or m.get("link_url") or ""
    file_name = contents.get("fileName") or contents.get("filename") or contents.get("title") or ""

    parts: list[str] = []
    if link_title:
        parts.append(f"[链接/文件] {link_title}")
    if file_name and file_name not in parts:
        parts.append(f"文件名：{file_name}")
    if link_desc:
        parts.append(f"描述：{link_desc}")
    if link_source:
        parts.append(f"来源：{link_source}")
    if link_url:
        parts.append(f"URL：{link_url}")
    if content and content not in parts:
        parts.append(content)
    return append_quote_text("；".join(parts) if parts else content, m)


def _is_image_message(m: dict) -> bool:
    return _msg_type(m) == 3


def _is_voice_message(m: dict) -> bool:
    return _msg_type(m) == 34


def _is_video_message(m: dict) -> bool:
    return _msg_type(m) == 43


def _is_file_or_link_message(m: dict) -> bool:
    return _msg_type(m) == 49


def _is_file_message(m: dict) -> bool:
    return _msg_type(m) == 49 and _msg_sub_type(m) == 6


def _media_kind(m: dict) -> str | None:
    if _is_image_message(m):
        return "image"
    if _is_voice_message(m):
        return "voice"
    if _is_video_message(m):
        return "video"
    return None


def _media_parse_key(m: dict) -> str:
    kind = _media_kind(m)
    if kind == "voice":
        return _voice_key(m)
    if kind in {"image", "video"}:
        return _media_key(m)
    return ""


def _message_analysis_text(m: dict) -> str:
    base = _base_msg_content(m).strip()
    parsed = str(m.get("_ai_media_text") or "").strip()
    parse_error = str(m.get("_ai_media_error") or "").strip()
    parts = []
    if base:
        parts.append(base)

    kind = _media_kind(m)
    if parsed:
        label = {"image": "图片描述", "voice": "语音转写", "video": "视频截图描述"}.get(kind or "", "媒体解析")
        parts.append(f"[{label}] {parsed}")
    elif parse_error:
        parts.append(f"[媒体解析失败] {parse_error}")
    elif kind:
        parts.append(f"[{kind}消息] key={_media_parse_key(m) or '无'}")

    if _is_file_or_link_message(m) and not parts:
        parts.append("[文件/链接消息]")

    neighbor_context = str(m.get("_neighbor_context") or "").strip()
    if neighbor_context and (kind or _is_file_or_link_message(m) or parse_error):
        parts.append(f"[附近上下文] {neighbor_context}")

    text = "；".join(parts).strip()
    if len(text) > CLASSIFY_CONTENT_LIMIT:
        text = text[:CLASSIFY_CONTENT_LIMIT] + "..."
    return text


def _prompt_item(m: dict, core: bool = True) -> dict:
    return {
        "seq": _msg_seq(m),
        "time": _msg_time(m),
        "sender": _msg_sender(m),
        "core": core,
        "type": _msg_type(m),
        "content": _message_analysis_text(m),
    }


def _extract_json_array(text: str) -> list:
    array_start = text.find("[")
    object_start = text.find("{")
    if array_start >= 0 and (object_start < 0 or array_start < object_start):
        end = text.rfind("]") + 1
        if end <= 0:
            raise ValueError(f"LLM 返回内容无法解析为 JSON 数组: {text[:200]}")
        data = json.loads(text[array_start:end])
    elif object_start >= 0:
        end = text.rfind("}") + 1
        if end <= 0:
            raise ValueError(f"LLM 返回内容无法解析为 JSON 对象: {text[:200]}")
        data = json.loads(text[object_start:end])
    else:
        raise ValueError(f"LLM 返回内容无法解析为 JSON: {text[:200]}")
    if isinstance(data, dict):
        data = data.get("topics") or data.get("items") or []
    if not isinstance(data, list):
        raise ValueError("LLM JSON 不是数组")
    return data


def _parse_msg_time(raw: str) -> float | None:
    """把 chatlog 的 ISO 时间字符串解析为 unix 秒。失败返回 None。"""
    if not raw:
        return None
    try:
        s = raw.replace("/", "-")
        if "T" not in s and " " in s:
            s = s.replace(" ", "T", 1)
        return datetime.fromisoformat(s).timestamp()
    except Exception:
        return None


def _looks_business_message(m: dict) -> bool:
    text = _message_analysis_text(m)
    if _is_file_or_link_message(m) or _is_voice_message(m):
        return True
    if (_is_image_message(m) or _is_video_message(m)) and str(m.get("_neighbor_context") or "").strip():
        return True
    if (_is_image_message(m) or _is_video_message(m)) and str(m.get("_ai_media_text") or "").strip():
        return True
    low = text.lower()
    if any(k.lower() in low for k in BUSINESS_KEYWORDS):
        return True
    if re.search(r"\b(a20\d+|ap\d+|sp\d+|\d+p|[a-z]{2,}\d{2,})\b", low):
        return True
    if re.search(r"\.(dwg|xlsx?|pdf|docx?|zip|rar)\b", low):
        return True
    return False


def _has_text_signal(m: dict) -> bool:
    text = _base_msg_content(m).strip()
    if text:
        return True
    contents = _contents(m)
    return bool(contents.get("desc") or contents.get("title") or contents.get("refer") or contents.get("recordInfo"))


def _attach_neighbor_context(messages: list[dict], radius: int = 4) -> None:
    """给图片/视频/语音/文件消息补充临近文本，供无法解析媒体时按上下文归类。"""
    for idx, m in enumerate(messages):
        if not (_media_kind(m) or _is_file_or_link_message(m)):
            continue
        lines: list[str] = []
        sender = _msg_sender(m)
        for j in range(max(0, idx - radius), min(len(messages), idx + radius + 1)):
            if j == idx:
                continue
            n = messages[j]
            if not _has_text_signal(n):
                continue
            # 同一发送者或时间邻近的文本最能解释媒体；不同发送者也保留少量上下文。
            prefix = "前文" if j < idx else "后文"
            relation = "同发送人" if sender and _msg_sender(n) == sender else "邻近"
            text = _base_msg_content(n).strip()
            if not text:
                text = _message_analysis_text(n).strip()
            if not text:
                continue
            if len(text) > 180:
                text = text[:180] + "..."
            lines.append(f"{prefix}/{relation} seq={_msg_seq(n)} {text}")
        if lines:
            m["_neighbor_context"] = " | ".join(lines[:6])


async def _fetch_window_messages(talker: str, start_ts: int, end_ts: int) -> list[dict]:
    start_dt = datetime.fromtimestamp(start_ts)
    end_dt = datetime.fromtimestamp(end_ts)
    time_str = f"{start_dt.strftime('%Y-%m-%d')},{end_dt.strftime('%Y-%m-%d')}"

    all_items: list[dict] = []
    offset = 0
    seen: set[int] = set()
    while True:
        data = await chatlog_client.get_messages(
            talker,
            time=time_str,
            limit=FETCH_PAGE_SIZE,
            offset=offset,
        )
        items = data.get("items", []) or []
        total = int(data.get("total") or 0)
        if not items:
            break
        for m in items:
            seq = _msg_seq(m)
            if seq and seq in seen:
                continue
            t = _msg_time(m)
            ts = _parse_msg_time(t)
            if ts is None or (start_ts <= ts <= end_ts):
                all_items.append(m)
                if seq:
                    seen.add(seq)
        offset += len(items)
        if total and offset >= total:
            break
        if len(items) < FETCH_PAGE_SIZE:
            break

    all_items.sort(key=lambda m: (_parse_msg_time(_msg_time(m)) or 0, _msg_seq(m)))
    return all_items


def _validate_media_settings(messages: list[dict], ai_settings: dict) -> str | None:
    has_visual = any(_media_kind(m) in {"image", "video"} for m in messages)
    has_voice = any(_media_kind(m) == "voice" for m in messages)
    if has_visual and not ai_settings.get("vision_model"):
        return "所选时间段包含图片/视频消息，请先在「设置」页面填入视觉模型"
    if has_voice and not ai_settings.get("voice_model"):
        return "所选时间段包含语音消息，请先在「设置」页面填入语音模型"
    return None


async def _parse_message_media(messages: list[dict], update_progress, base_processed: int, total: int) -> int:
    semaphore = asyncio.Semaphore(MEDIA_PARSE_CONCURRENCY)
    media_messages = [m for m in messages if _media_kind(m) and _media_parse_key(m)]
    parsed_count = 0
    cache: dict[tuple[str, str], str] = {}

    async def parse_one(m: dict):
        nonlocal parsed_count
        kind = _media_kind(m)
        key = _media_parse_key(m)
        if not kind or not key:
            return
        async with semaphore:
            try:
                cache_key = (kind, key)
                if cache_key not in cache:
                    parsed = await parse_media(kind, key)
                    cache[cache_key] = parsed.get("text") or ""
                m["_ai_media_text"] = cache[cache_key]
            except HTTPException as e:
                m["_ai_media_error"] = str(e.detail)
            except Exception as e:
                log.warning(f"[classify] media parse failed seq={_msg_seq(m)}: {e}", exc_info=True)
                m["_ai_media_error"] = str(e)
            finally:
                parsed_count += 1
                await update_progress("running", base_processed + parsed_count, total)

    await asyncio.gather(*(parse_one(m) for m in media_messages))
    return len(media_messages)


async def _manual_assigned_seqs(db: aiosqlite.Connection, group_id: int) -> set[int]:
    async with db.execute(
        """
        SELECT DISTINCT tm.msg_seq
        FROM topic_messages tm
        JOIN topics t ON t.id = tm.topic_id
        WHERE t.group_id = ? AND COALESCE(t.source, 'manual') = 'manual'
        """,
        (group_id,),
    ) as cur:
        return {int(row["msg_seq"]) for row in await cur.fetchall()}


async def _delete_ai_topics(db: aiosqlite.Connection, group_id: int) -> None:
    await db.execute(
        """
        DELETE FROM knowledge_fts WHERE doc_id IN (
            SELECT d.id FROM knowledge_docs d
            JOIN topics t ON t.id = d.topic_id
            WHERE t.group_id=? AND COALESCE(t.source, 'manual')='ai'
        )
        """,
        (group_id,),
    )
    await db.execute(
        """
        DELETE FROM knowledge_docs
        WHERE topic_id IN (
            SELECT id FROM topics WHERE group_id=? AND COALESCE(source, 'manual')='ai'
        )
        """,
        (group_id,),
    )
    await db.execute(
        """
        DELETE FROM topic_messages
        WHERE topic_id IN (
            SELECT id FROM topics WHERE group_id=? AND COALESCE(source, 'manual')='ai'
        )
        """,
        (group_id,),
    )
    await db.execute("DELETE FROM topics WHERE group_id=? AND COALESCE(source, 'manual')='ai'", (group_id,))
    await db.commit()


def _chunk_messages(messages: list[dict]) -> list[tuple[list[dict], list[dict]]]:
    chunks = []
    for start in range(0, len(messages), CLASSIFY_BATCH_SIZE):
        end = min(len(messages), start + CLASSIFY_BATCH_SIZE)
        ctx_start = max(0, start - CONTEXT_BEFORE)
        ctx_end = min(len(messages), end + CONTEXT_AFTER)
        chunks.append((messages[start:end], messages[ctx_start:ctx_end]))
    return chunks


def _batch_classify_prompt(core_messages: list[dict], context_messages: list[dict], guidance: str = "") -> str:
    core_seqs = {_msg_seq(m) for m in core_messages}
    payload = [_prompt_item(m, _msg_seq(m) in core_seqs) for m in context_messages]
    guidance_block = f"\n\n{guidance.strip()}\n\n" if guidance and guidance.strip() else ""
    prompt = (
        "你是广东大铁数控机械有限公司设备售后群话题分析助手。这个微信群里主要是大铁设备客户反馈问题，售后工程师对接处理。\n"
        "请只为 core=true 的消息按【设备问题/故障现象】聚类，而不是按客户公司、地域、日期、工程师或单次对话聚类。\n"
        "同类设备问题即使来自不同公司、不同地区，也要合并到同一话题；不同设备问题必须准确分开。\n"
        f"常见问题口径包括：{', '.join(DEVICE_ISSUE_EXAMPLES)}。\n\n"
        f"消息列表：\n{json.dumps(payload, ensure_ascii=False)}\n\n"
        "规则：\n"
        "1. 只输出 core=true 消息的归属，context 只用来理解上下文。\n"
        "2. 标题必须体现设备部件和问题现象，建议如「主轴报警/故障代码问题」「刀库换刀卡顿」「回零/限位异常」。不要把客户公司名作为标题核心。\n"
        "3. 不同公司出现相同设备问题时合并；同一公司出现不同问题时拆开。\n"
        "4. [引用消息] 是当前回复的强上下文，必须用于理解当前消息含义，但归类对象仍然只能是当前 core seq。\n"
        "5. 工程师回复、客户补充说明、图片、语音、视频、文件，必须跟随其对应的设备问题归类。\n"
        "6. 图片/视频无法识别时，不要编造内容，但必须根据附近上下文判断所属设备问题。\n"
        "7. 真正寒暄、入群通知、撤回、无设备售后含义短句才可归入「日常交流/待确认」；设备咨询和报修不得归入日常交流。\n"
        "8. 每条 core 消息最多归入一个话题，尽量覆盖所有 core seq。\n\n"
        "输出严格 JSON 数组，不要解释。格式：\n"
        '[{"topic_key":"稳定短键","title":"具体话题标题","seqs":[1,2],"reason":"分类依据"}]'
    )
    return prompt + guidance_block


async def _classify_batches(
    messages: list[dict],
    update_progress,
    base_processed: int,
    total: int,
    guidance: str = "",
) -> list[dict]:
    chunks = _chunk_messages(messages)
    results: list[dict] = []
    _client, _ai = await _get_client()

    for i, (core, context) in enumerate(chunks, start=1):
        prompt = _batch_classify_prompt(core, context, guidance)
        try:
            resp = await _client.chat.completions.create(
                model=_ai["ai_model"],
                messages=[
                    {
                        "role": "system",
                        "content": "你是广东大铁数控机械有限公司设备售后问题分类器。你只输出 JSON 数组，不输出解释、Markdown 或额外文字。",
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=0.05,
            )
            batch_items = _extract_json_array(resp.choices[0].message.content.strip())
            for item in batch_items:
                if isinstance(item, dict):
                    item["_batch"] = i
                    results.append(item)
        except Exception as e:
            log.warning(f"[classify] batch {i} classify failed: {e}", exc_info=True)
        await update_progress("running", base_processed + i, total)
    return results


def _sanitize_topic_items(items: list[dict], seq_to_msg: dict[int, dict], allowed_seqs: set[int]) -> list[dict]:
    assigned: set[int] = set()
    clean: list[dict] = []
    for item in items:
        title = str(item.get("title") or item.get("new_topic") or item.get("topic") or "").strip()
        seqs = item.get("seqs") or item.get("message_seqs") or item.get("messages") or []
        topic_key = str(item.get("topic_key") or title).strip()
        reason = str(item.get("reason") or item.get("evidence") or "").strip()
        clean_seqs: list[int] = []
        for seq in seqs:
            try:
                n = int(seq)
            except Exception:
                continue
            if n in allowed_seqs and n in seq_to_msg and n not in assigned:
                clean_seqs.append(n)
        if not title or not clean_seqs:
            continue
        if title == DAILY_TOPIC:
            business = [n for n in clean_seqs if _looks_business_message(seq_to_msg[n])]
            non_business = [n for n in clean_seqs if n not in business]
            if business:
                clean.append({
                    "topic_key": UNCATEGORIZED_BUSINESS,
                    "title": UNCATEGORIZED_BUSINESS,
                    "seqs": business,
                    "reason": "模型归入日常交流但消息含业务信号，转入人工复核。",
                })
                assigned.update(business)
            clean_seqs = non_business
            if not clean_seqs:
                continue
        if title != DAILY_TOPIC and len(title) > 80:
            title = title[:80]
        clean.append({"topic_key": topic_key, "title": title, "seqs": clean_seqs, "reason": reason})
        assigned.update(clean_seqs)
    return clean


def _merge_prompt(candidates: list[dict]) -> str:
    compact = [
        {
            "id": i,
            "topic_key": c.get("topic_key", ""),
            "title": c.get("title", ""),
            "seqs": c.get("seqs", []),
            "reason": c.get("reason", ""),
        }
        for i, c in enumerate(candidates)
    ]
    return (
        "你是广东大铁数控机械有限公司设备售后话题合并审核员。请把分批识别出的候选话题做跨批次合并。\n"
        "合并粒度是【设备问题/故障现象】。不要按客户公司、地域、日期、工程师或单次对话合并。\n"
        "不同公司出现相同设备问题时必须合并，例如两个客户都反馈主轴报警，应归为同一类主轴报警问题。\n"
        "不同设备部件、不同故障现象、不同处理路径必须分开，例如主轴报警、刀库卡刀、回零限位异常、加工精度异常不能互相合并。\n\n"
        f"候选话题：\n{json.dumps(compact, ensure_ascii=False)}\n\n"
        "输出严格 JSON 数组，不要解释。格式：\n"
        '[{"title":"合并后的具体话题标题","candidate_ids":[0,1],"reason":"合并依据"}]'
    )


async def _merge_candidates(candidates: list[dict], update_progress, processed: int, total: int) -> list[dict]:
    if not candidates:
        return []
    if len(candidates) == 1:
        await update_progress("running", processed + 1, total)
        return candidates

    _client, _ai = await _get_client()
    resp = await _client.chat.completions.create(
        model=_ai["ai_model"],
        messages=[
            {
                "role": "system",
                "content": "你是广东大铁数控机械有限公司设备售后话题合并器。你只输出 JSON 数组，不输出解释、Markdown 或额外文字。",
            },
            {"role": "user", "content": _merge_prompt(candidates)},
        ],
        temperature=0.05,
    )
    merged_raw = _extract_json_array(resp.choices[0].message.content.strip())
    used: set[int] = set()
    merged: list[dict] = []
    for item in merged_raw:
        if not isinstance(item, dict):
            continue
        ids = item.get("candidate_ids") or item.get("ids") or []
        valid_ids = []
        for raw_id in ids:
            try:
                idx = int(raw_id)
            except Exception:
                continue
            if 0 <= idx < len(candidates) and idx not in used:
                valid_ids.append(idx)
        if not valid_ids:
            continue
        title = str(item.get("title") or candidates[valid_ids[0]].get("title") or "").strip()
        if not title:
            continue
        seqs: list[int] = []
        for idx in valid_ids:
            used.add(idx)
            seqs.extend(candidates[idx].get("seqs", []))
        merged.append({
            "title": title[:80] if title != DAILY_TOPIC else title,
            "seqs": seqs,
            "reason": str(item.get("reason") or "").strip(),
        })

    for idx, candidate in enumerate(candidates):
        if idx not in used:
            merged.append(candidate)

    await update_progress("running", processed + 1, total)
    return merged


def _supplement_prompt(unassigned: list[dict], topics: list[dict]) -> str:
    payload = [_prompt_item(m, True) for m in unassigned]
    topic_payload = [{"index": i, "title": t["title"], "seqs": t.get("seqs", [])[:8]} for i, t in enumerate(topics)]
    return (
        "你是广东大铁数控机械有限公司设备售后消息补充分配审核员。请把未归属消息分配到已有话题，或新建设备问题话题。\n"
        "分配粒度是设备问题/故障现象，不是客户公司、地域、日期、工程师或单次对话。\n"
        "不同公司出现相同设备问题时分配到同一话题；不同设备部件、故障现象或处理路径要新建不同话题。\n"
        "[引用消息] 是当前回复的强上下文，必须用于理解当前消息含义，但归类对象仍然只能是当前消息 seq。\n"
        "图片/视频内容无法识别时，必须优先使用消息里的[附近上下文]归入最相关已有设备问题；只有完全孤立且无上下文线索才新建「未归类设备问题（需人工复核）」。\n"
        "设备咨询、报修、售后处理消息不得归入「日常交流/待确认」。只有寒暄、通知、撤回等无售后含义消息才可归入日常交流。\n\n"
        f"已有话题：\n{json.dumps(topic_payload, ensure_ascii=False)}\n\n"
        f"未归属消息：\n{json.dumps(payload, ensure_ascii=False)}\n\n"
        "输出严格 JSON 数组，不要解释。格式：\n"
        '[{"seq":1,"topic_index":0,"new_topic":"","reason":"依据"}]\n'
        '若需要新建话题，topic_index 用 null，并填写 new_topic。'
    )


async def _supplement_assignments(
    topics: list[dict],
    messages: list[dict],
    seq_to_msg: dict[int, dict],
    allowed_seqs: set[int],
    update_progress,
    processed: int,
    total: int,
) -> list[dict]:
    assigned = {seq for t in topics for seq in t.get("seqs", [])}
    missing = [seq_to_msg[s] for s in allowed_seqs if s not in assigned]
    if not missing:
        await update_progress("running", processed + 1, total)
        return topics

    try:
        _client, _ai = await _get_client()
        resp = await _client.chat.completions.create(
            model=_ai["ai_model"],
            messages=[
                {
                    "role": "system",
                    "content": "你是广东大铁数控机械有限公司设备售后消息补充分配器。你只输出 JSON 数组，不输出解释、Markdown 或额外文字。",
                },
                {"role": "user", "content": _supplement_prompt(missing, topics)},
            ],
            temperature=0.05,
        )
        results = _extract_json_array(resp.choices[0].message.content.strip())
    except Exception as e:
        log.warning(f"[classify] supplement assignment failed: {e}", exc_info=True)
        results = []

    topic_by_title = {t["title"]: t for t in topics}
    for item in results:
        if not isinstance(item, dict):
            continue
        try:
            seq = int(item.get("seq"))
        except Exception:
            continue
        if seq not in allowed_seqs or seq in assigned:
            continue
        title = ""
        idx = item.get("topic_index")
        try:
            if idx is not None and 0 <= int(idx) < len(topics):
                title = topics[int(idx)]["title"]
        except Exception:
            title = ""
        if not title:
            title = str(item.get("new_topic") or "").strip()
        if not title or title == DAILY_TOPIC:
            title = UNCATEGORIZED_BUSINESS if _looks_business_message(seq_to_msg[seq]) else DAILY_TOPIC
        if title != DAILY_TOPIC and len(title) > 80:
            title = title[:80]
        if title not in topic_by_title:
            topic_by_title[title] = {"title": title, "seqs": [], "reason": str(item.get("reason") or "").strip()}
            topics.append(topic_by_title[title])
        topic_by_title[title]["seqs"].append(seq)
        assigned.add(seq)

    for m in missing:
        seq = _msg_seq(m)
        if seq in assigned:
            continue
        title = UNCATEGORIZED_BUSINESS if _looks_business_message(m) else DAILY_TOPIC
        if title not in topic_by_title:
            topic_by_title[title] = {"title": title, "seqs": [], "reason": "补充分配后仍无法归入具体话题"}
            topics.append(topic_by_title[title])
        topic_by_title[title]["seqs"].append(seq)
        assigned.add(seq)

    await update_progress("running", processed + 1, total)
    return topics


def _finalize_topics(topics: list[dict], seq_to_msg: dict[int, dict], allowed_seqs: set[int]) -> list[dict]:
    assigned: set[int] = set()
    grouped: dict[str, dict] = {}
    for topic in topics:
        title = str(topic.get("title") or "").strip()
        if not title:
            continue
        seqs: list[int] = []
        for seq in topic.get("seqs", []):
            try:
                n = int(seq)
            except Exception:
                continue
            if n not in allowed_seqs or n not in seq_to_msg or n in assigned:
                continue
            if title == DAILY_TOPIC and _looks_business_message(seq_to_msg[n]):
                title = UNCATEGORIZED_BUSINESS
            seqs.append(n)
            assigned.add(n)
        if not seqs:
            continue
        title = title[:80] if title != DAILY_TOPIC else title
        if title not in grouped:
            grouped[title] = {"title": title, "seqs": [], "reason": str(topic.get("reason") or "")}
        grouped[title]["seqs"].extend(seqs)

    for seq in allowed_seqs:
        if seq in assigned:
            continue
        title = UNCATEGORIZED_BUSINESS if _looks_business_message(seq_to_msg[seq]) else DAILY_TOPIC
        if title not in grouped:
            grouped[title] = {"title": title, "seqs": [], "reason": "最终兜底归属"}
        grouped[title]["seqs"].append(seq)

    result = list(grouped.values())
    result.sort(key=lambda t: min(t["seqs"]) if t["seqs"] else 0)
    return result


def _topic_text(topic: dict, seq_to_msg: dict[int, dict]) -> str:
    parts = [str(topic.get("title") or ""), str(topic.get("reason") or "")]
    for seq in topic.get("seqs", []):
        try:
            m = seq_to_msg[int(seq)]
        except Exception:
            continue
        parts.append(_message_analysis_text(m))
    return "\n".join(parts)


def _strip_customer_prefix(title: str) -> str:
    title = title.strip()
    patterns = (
        r"^[\u4e00-\u9fa5A-Za-z0-9（）()#＃_-]{2,30}(?:公司|工厂|厂|客户|现场|车间)[-—:：/]+",
        r"^(?:客户|厂家|现场|公司)[\u4e00-\u9fa5A-Za-z0-9（）()#＃_-]{0,20}[-—:：/]+",
    )
    for pattern in patterns:
        title = re.sub(pattern, "", title).strip()
    return title


def _infer_device_issue_title(topic: dict, seq_to_msg: dict[int, dict]) -> str:
    original = str(topic.get("title") or "").strip()
    if not original or original == DAILY_TOPIC:
        return original

    original = _strip_customer_prefix(original)
    text = _topic_text(topic, seq_to_msg)
    low = text.lower()

    broad_titles = (
        UNCATEGORIZED_BUSINESS,
        "现场故障/售后处理",
        "业务事项跟进",
        "沟通响应跟进",
    )
    is_broad = original in broad_titles or any(original.endswith(f"-{title}") for title in broad_titles)
    if not is_broad:
        return original[:80] if original != DAILY_TOPIC else original

    if any(k in text for k in ("主轴", "转速", "拉刀", "松刀")):
        if any(k in text for k in ("报警", "警报", "报错", "故障码", "代码")):
            return "主轴报警/故障代码问题"
        return "主轴异常问题"

    if any(k in text for k in ("刀库", "换刀", "刀臂", "刀盘", "刀号", "卡刀", "掉刀")):
        return "刀库/换刀异常问题"

    if any(k in text for k in ("回零", "原点", "限位", "行程")):
        return "回零/限位异常问题"

    if any(k in text for k in ("伺服", "驱动", "驱动器", "电机", "编码器", "变频器")):
        return "伺服/驱动异常问题"

    if any(k in text for k in ("精度", "尺寸", "跑偏", "震刀", "振刀", "圆度", "平面度")):
        return "加工精度异常问题"

    if any(k in text for k in ("气压", "液压", "油压", "漏油")):
        return "气压/液压异常问题"

    if any(k in text for k in ("润滑", "冷却", "水泵", "切削液", "漏水")):
        return "润滑/冷却异常问题"

    if any(k in text for k in ("电柜", "线路", "电源", "跳闸", "断电", "IO", "I/O", "PLC", "急停")):
        return "电气线路/IO异常问题"

    if any(k in text for k in ("系统", "参数", "程序", "G代码", "M代码", "面板", "手轮")):
        return "系统参数/程序操作问题"

    if "报警" in text or "警报" in text or "报错" in text or "故障码" in text or re.search(r"\b(error|alarm|err|e\d+)\b", low):
        return "设备报警/故障代码问题"

    if any(k in text for k in ("异响", "卡住", "排屑")):
        return "设备机械异常问题"

    return original


def _coalesce_device_issue_topics(topics: list[dict], seq_to_msg: dict[int, dict]) -> list[dict]:
    grouped: dict[str, dict] = {}
    for topic in topics:
        title = _infer_device_issue_title(topic, seq_to_msg)
        if not title:
            continue
        if title not in grouped:
            grouped[title] = {"title": title, "seqs": [], "reason": "按设备问题/故障现象自动合并"}
        grouped[title]["seqs"].extend(topic.get("seqs", []))

    result: list[dict] = []
    seen_global: set[int] = set()
    for title, topic in grouped.items():
        seqs: list[int] = []
        for seq in topic["seqs"]:
            try:
                n = int(seq)
            except Exception:
                continue
            if n in seen_global:
                continue
            seqs.append(n)
            seen_global.add(n)
        if seqs:
            result.append({"title": title[:80] if title != DAILY_TOPIC else title, "seqs": seqs, "reason": topic["reason"]})
    result.sort(key=lambda t: min(t["seqs"]) if t["seqs"] else 0)
    return result


async def _save_topics(db: aiosqlite.Connection, group_id: int, talker: str, topics: list[dict], seq_to_msg: dict[int, dict]) -> None:
    for topic in topics:
        title = topic["title"]
        seqs = topic.get("seqs", [])
        if not seqs:
            continue
        await db.execute(
            "INSERT INTO topics (group_id, title, source) VALUES (?, ?, 'ai')",
            (group_id, title),
        )
        await db.commit()
        async with db.execute("SELECT last_insert_rowid() AS id") as cur:
            row = await cur.fetchone()
        topic_id = row["id"]
        for seq in seqs:
            await db.execute(
                """
                INSERT OR IGNORE INTO topic_messages
                    (topic_id, msg_seq, talker, added_by, message_json)
                VALUES (?, ?, ?, 'ai', ?)
                """,
                (topic_id, seq, talker, _message_snapshot(seq_to_msg[seq])),
            )
    await db.commit()


async def run_classify_window(
    group_id: int,
    task_id: int,
    group: dict,
    start_ts: int,
    end_ts: int,
):
    """
    对指定时间区间内全部消息做 AI 话题分类。

    - 串行执行：同一时间只允许一个分类任务。
    - 重跑时删除旧 AI 话题，保留人工话题。
    """
    global _classifying_group

    path = get_active_db_path()

    async def _update_task(status: str, processed: int = 0, total: int = 0, error: str = ""):
        try:
            async with aiosqlite.connect(path) as _db:
                _db.row_factory = aiosqlite.Row
                if error:
                    await _db.execute(
                        "UPDATE ai_tasks SET status=?, progress=?, error=?, updated_at=CURRENT_TIMESTAMP WHERE id=?",
                        (status, json.dumps({"processed": processed, "total": total}), error, task_id),
                    )
                else:
                    await _db.execute(
                        "UPDATE ai_tasks SET status=?, progress=?, updated_at=CURRENT_TIMESTAMP WHERE id=?",
                        (status, json.dumps({"processed": processed, "total": total}), task_id),
                    )
                await _db.commit()
        except Exception as e:
            log.warning(f"[classify] 更新 task {task_id} 失败: {e}")

    _s = await get_ai_settings()
    if not _s.get("ai_api_key") or not _s.get("ai_model"):
        await _update_task("error", 0, 0, "AI 未配置，请在「设置」页面填入 API Key 和话题分析模型")
        log.warning(f"[classify] group={group_id} aborted: AI not configured")
        return

    async with _classify_lock:
        _classifying_group = group_id
        try:
            try:
                messages = await _fetch_window_messages(group["talker"], start_ts, end_ts)
            except Exception as e:
                log.error(f"[classify] group={group_id} fetch error: {e}", exc_info=True)
                await _update_task("error", 0, 0, f"拉取聊天记录失败：{e}")
                return

            total_messages = len(messages)
            log.info(f"[classify] group={group_id} got {total_messages} msgs in selected window")
            if total_messages == 0:
                await _update_task("done", 0, 0)
                return
            _fill_message_sender_names(messages)

            media_error = _validate_media_settings(messages, _s)
            if media_error:
                await _update_task("error", 0, total_messages, media_error)
                return

            _attach_neighbor_context(messages)

            media_count = sum(1 for m in messages if _media_kind(m) and _media_parse_key(m))
            chunks_count = len(_chunk_messages(messages))
            total_units = total_messages + media_count + chunks_count + 2
            await _update_task("running", total_messages, total_units)

            async with aiosqlite.connect(path) as db:
                db.row_factory = aiosqlite.Row
                manual_seqs = await _manual_assigned_seqs(db, group_id)
                await _delete_ai_topics(db, group_id)

                parsed_units = await _parse_message_media(messages, _update_task, total_messages, total_units)
                classifiable = [m for m in messages if _msg_seq(m) and _msg_seq(m) not in manual_seqs]
                seq_to_msg = {_msg_seq(m): m for m in classifiable if _msg_seq(m)}
                allowed_seqs = set(seq_to_msg)
                if not classifiable:
                    await _update_task("done", total_units, total_units)
                    return

                configured_prompt = (group.get("analysis_prompt") or _s.get("topic_analysis_prompt") or "").strip()
                sample_query = "\n".join(_message_analysis_text(m) for m in classifiable[:80])
                learning_context = await build_report_learning_context(
                    db,
                    group_id=group_id,
                    query=sample_query,
                    purpose="topic",
                )
                guidance_parts = []
                if configured_prompt:
                    guidance_parts.append(f"客户自定义话题分析提示词：\n{configured_prompt}")
                if learning_context:
                    guidance_parts.append(
                        "报告库学习参考：以下是人工修订过的历史报告，请学习其话题命名、分类颗粒度和关注点；不要复制历史事实。\n"
                        f"{learning_context}"
                    )
                analysis_guidance = "\n\n".join(guidance_parts)

                base = total_messages + parsed_units
                candidates_raw = await _classify_batches(
                    classifiable,
                    _update_task,
                    base,
                    total_units,
                    analysis_guidance,
                )
                candidates = _sanitize_topic_items(candidates_raw, seq_to_msg, allowed_seqs)
                processed_after_batches = base + chunks_count
                merged = await _merge_candidates(candidates, _update_task, processed_after_batches, total_units)
                merged = _finalize_topics(merged, seq_to_msg, allowed_seqs)
                supplemented = await _supplement_assignments(
                    merged,
                    classifiable,
                    seq_to_msg,
                    allowed_seqs,
                    _update_task,
                    processed_after_batches + 1,
                    total_units,
                )
                final_topics = _finalize_topics(supplemented, seq_to_msg, allowed_seqs)
                final_topics = _coalesce_device_issue_topics(final_topics, seq_to_msg)
                final_topics = _finalize_topics(final_topics, seq_to_msg, allowed_seqs)
                await _save_topics(db, group_id, group["talker"], final_topics, seq_to_msg)

            await _update_task("done", total_units, total_units)
            log.info(f"[classify] group={group_id} done, messages={total_messages}, topics={len(final_topics)}")

        except Exception as e:
            log.error(f"[classify] group={group_id} error: {e}", exc_info=True)
            await _update_task("error", 0, 0, str(e))
        finally:
            _classifying_group = None