# get_wechat/chatlog_fastAPI/routers/files.py

import mimetypes
import os
import re
import shutil
import sqlite3
import tempfile
from pathlib import Path
from urllib.parse import quote

import httpx
from fastapi import APIRouter, HTTPException, Query
from fastapi.responses import FileResponse, StreamingResponse

from config import settings
from services.chatlog_client import chatlog_client

# Router for the file-download endpoints; every route declared on it is
# served under the /api/files prefix and tagged "files".
router = APIRouter(prefix="/api/files", tags=["files"])


# Explicit extension -> MIME type overrides. Keys are lower-case suffixes
# including the leading dot; _guess_media_type() consults this table before
# falling back to the platform-dependent `mimetypes` database.
OFFICE_MEDIA_TYPES: dict[str, str] = {
    ".xls": "application/vnd.ms-excel",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".ppt": "application/vnd.ms-powerpoint",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ".doc": "application/msword",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".pdf": "application/pdf",
    ".dwg": "application/acad",
}


def _connect_hardlink_db(hardlink_db: Path) -> sqlite3.Connection:
    """
    Open a connection to a private snapshot of *hardlink_db*.

    chatlog may keep hardlink.db open. Copying a tiny snapshot avoids transient
    "unable to open database file" errors on Windows while keeping reads safe.

    The snapshot lives in the system temp directory and is keyed by process id,
    a digest of the source path and the source mtime, so it is reused until the
    source file changes and two different databases never share a snapshot.

    Args:
        hardlink_db: Path of the source SQLite database.

    Returns:
        A connection to the snapshot with ``row_factory`` set to
        ``sqlite3.Row``. The caller is responsible for closing it.

    Raises:
        OSError: If the source cannot be stat'ed or copied.
        sqlite3.Error: If the snapshot cannot be opened.
    """
    import hashlib  # local import: only needed to derive the snapshot name

    # Stat once so the snapshot name and the size check describe the same state.
    source_stat = hardlink_db.stat()
    # Several hardlink DBs may be probed per request; without the path digest two
    # DBs sharing an mtime and size would silently read each other's snapshot.
    path_tag = hashlib.sha256(os.fsencode(os.path.abspath(hardlink_db))).hexdigest()[:12]

    tmp_dir = Path(tempfile.gettempdir())
    prefix = f"chatlab_hardlink_{os.getpid()}_{path_tag}_"
    tmp = tmp_dir / f"{prefix}{source_stat.st_mtime_ns}.db"

    if not tmp.exists() or tmp.stat().st_size != source_stat.st_size:
        # A new snapshot is about to be written: drop this process's outdated
        # snapshots of the same DB so they do not pile up in the temp directory.
        for stale in tmp_dir.glob(f"{prefix}*.db"):
            if stale == tmp:
                continue
            try:
                stale.unlink()
            except OSError:
                # Best effort only (e.g. the file is still open on Windows).
                pass
        shutil.copy2(hardlink_db, tmp)

    con = sqlite3.connect(tmp)
    con.row_factory = sqlite3.Row
    return con


def _safe_download_name(name: str, fallback: str) -> str:
    """
    Return a download name with CR/LF removed and outer whitespace trimmed.

    *fallback* is used when *name* is empty, and is returned unchanged when
    cleaning leaves nothing behind.
    """
    candidate = name if name else fallback
    # Drop carriage returns and line feeds in a single pass.
    cleaned = candidate.translate({ord("\r"): None, ord("\n"): None}).strip()
    if cleaned:
        return cleaned
    return fallback


def _content_disposition(filename: str) -> str:
    """
    Build an ``attachment`` Content-Disposition header value for *filename*.

    Two forms are emitted: a plain ``filename="..."`` in which every run of
    characters outside ``[A-Za-z0-9._-]`` is collapsed to ``_`` (or
    ``download`` when nothing is left), and a percent-encoded UTF-8
    ``filename*`` carrying the full name.
    """
    # quote() treats "/" as safe by default, but "/" is not a valid attr-char in
    # an RFC 8187 ext-value, so request that every reserved character be encoded.
    encoded = quote(filename, safe="")
    ascii_fallback = re.sub(r"[^A-Za-z0-9._-]+", "_", filename) or "download"
    return f"attachment; filename=\"{ascii_fallback}\"; filename*=UTF-8''{encoded}"


def _guess_media_type(filename: str, fallback: str = "") -> str:
    """
    Pick a media type for *filename*.

    Lookup order: the OFFICE_MEDIA_TYPES override table (by lower-cased
    suffix), then ``mimetypes``, then *fallback*, and finally
    ``application/octet-stream``.
    """
    suffix = Path(filename or "").suffix.lower()
    override = OFFICE_MEDIA_TYPES.get(suffix)
    if override:
        return override

    guessed, _encoding = mimetypes.guess_type(filename)
    if guessed:
        return guessed

    if fallback:
        return fallback
    return "application/octet-stream"


async def _proxy_chatlog_file(md5: str, filename: str = ""):
    """
    Try to fetch the file for *md5* from the chatlog HTTP service.

    Returns a StreamingResponse wrapping the fully downloaded body, or None
    when the request raises, the status is not 200, or the body is exactly
    the literal ``"media not found"`` marker.

    When *filename* is non-empty it is used for the Content-Disposition
    header and for guessing the media type; the upstream content-type is the
    fallback for the latter.
    """
    encoded_md5 = quote(md5, safe="")
    upstream_url = f"{settings.chatlog_base_url}/file/{encoded_md5}"
    client_options = {"timeout": 30, "trust_env": False, "follow_redirects": True}

    try:
        async with httpx.AsyncClient(**client_options) as session:
            upstream = await session.get(upstream_url)
    except Exception:
        # Any failure here simply means "not available from chatlog".
        return None

    if upstream.status_code != 200:
        return None
    payload = upstream.content
    if payload == b'"media not found"':
        return None

    response_headers = {
        "Content-Length": str(len(payload)),
        "X-ChatLab-File-Source": "chatlog",
    }
    if filename:
        response_headers["Content-Disposition"] = _content_disposition(filename)

    upstream_type = upstream.headers.get("content-type") or ""
    return StreamingResponse(
        iter([payload]),
        media_type=_guess_media_type(filename, upstream_type),
        headers=response_headers,
    )


def _xwechat_roots_from_hardlink_db(hardlink_db: Path) -> list[Path]:
    """
    Return candidate root directories to search for original files.

    The first candidate, when available, is a Windows-style path ending in
    ``xwechat_files`` extracted from the ``uuid`` row of the ``db_info`` table.
    Two locations under the user's home directory are always appended. The
    result is de-duplicated case-insensitively with the original order kept.

    Reading the database is best effort: on any I/O or SQLite failure only the
    home-directory candidates are returned.
    """
    raw = ""
    try:
        con = _connect_hardlink_db(hardlink_db)
        try:
            row = con.execute("SELECT ValueStdStr FROM db_info WHERE Key='uuid'").fetchone()
        finally:
            # Always release the snapshot handle; it was previously left open.
            con.close()
        raw = (row["ValueStdStr"] if row else "") or ""
    except (OSError, sqlite3.Error):
        raw = ""

    # sqlite3 returns bytes for BLOB values; decode so the str regex below
    # never receives bytes.
    if isinstance(raw, bytes):
        raw = raw.decode("utf-8", "ignore")

    roots: list[Path] = []
    if raw:
        m = re.search(r"([A-Za-z]:\\[^|]+?xwechat_files)", raw)
        if m:
            roots.append(Path(m.group(1)))

    roots.extend([
        Path.home() / "xwechat_files",
        Path.home() / "Documents" / "WeChat Files",
    ])

    # First occurrence wins; comparison ignores case.
    unique: dict[str, Path] = {}
    for root in roots:
        unique.setdefault(str(root).lower(), root)
    return list(unique.values())


def _iter_files_named(root: Path, name: str):
    """Yield every path below *root* whose base name equals *name* literally."""
    # normcase() makes the comparison case-insensitive on Windows only.
    wanted = os.path.normcase(name)
    for dirpath, _dirnames, filenames in os.walk(root):
        for entry in filenames:
            if os.path.normcase(entry) == wanted:
                yield Path(dirpath, entry)


def _find_local_file(hardlink_db: Path, md5: str, requested_name: str = "") -> Path | None:
    """
    Locate the on-disk original of the file recorded for *md5*.

    The newest ``file_hardlink_info_v4`` row for *md5* supplies the recorded
    file name and size. Each root from ``_xwechat_roots_from_hardlink_db`` is
    then searched, first for files named *requested_name* and then for the
    recorded name. A non-zero recorded size must match the file's size.

    Args:
        hardlink_db: Path of the hardlink SQLite database.
        md5: Hash used to look up the row.
        requested_name: Optional name supplied by the client; tried before the
            recorded name.

    Returns:
        The first matching file, or None when the row is missing, the database
        cannot be read, or nothing on disk matches.
    """
    try:
        con = _connect_hardlink_db(hardlink_db)
        try:
            row = con.execute(
                """
                SELECT md5, file_name, file_size, dir1, dir2
                FROM file_hardlink_info_v4
                WHERE md5=?
                ORDER BY _rowid_ DESC
                LIMIT 1
                """,
                (md5,),
            ).fetchone()
        finally:
            # Always release the snapshot handle; it was previously left open.
            con.close()
    except (OSError, sqlite3.Error):
        row = None
    if not row:
        return None

    # De-duplicate while keeping priority order so an identical requested and
    # recorded name does not trigger two full directory walks.
    names = list(dict.fromkeys(n for n in (requested_name, row["file_name"]) if n))
    size = int(row["file_size"] or 0)

    for root in _xwechat_roots_from_hardlink_db(hardlink_db):
        if not root.exists():
            continue
        for name in names:
            # Names are compared literally instead of being handed to rglob()
            # as a pattern: a client-supplied name containing "[", "*", "?" or
            # ".." must neither fail to match itself nor match other files.
            for candidate in _iter_files_named(root, name):
                try:
                    if candidate.is_file() and (not size or candidate.stat().st_size == size):
                        return candidate
                except OSError:
                    continue
        if size:
            # Fallback by size in the common file store. This is intentionally limited
            # to msg/file to avoid scanning unrelated huge trees for every request.
            # With no names at all, the first file of the recorded size is returned.
            for file_root in root.glob("*/msg/file"):
                if not file_root.is_dir():
                    continue
                for candidate in file_root.rglob("*"):
                    try:
                        if (
                            candidate.is_file()
                            and candidate.stat().st_size == size
                            and (not names or candidate.name in names)
                        ):
                            return candidate
                    except OSError:
                        continue
    return None


@router.get("/{md5}")
async def get_file(md5: str, filename: str = Query("")):
    """
    Download the original file identified by *md5*.

    The chatlog service is asked first. If it cannot supply the file, every
    existing database listed under the ``media`` key of
    ``chatlog_client.get_db_paths()`` is used to look for a local copy.

    Args:
        md5: 8-64 hexadecimal characters; surrounding whitespace is ignored.
        filename: Optional download name supplied by the client.

    Raises:
        HTTPException: 400 when *md5* is malformed, 404 when no source has
            the file.
    """
    md5 = md5.strip()
    if not re.fullmatch(r"[0-9a-fA-F]{8,64}", md5):
        raise HTTPException(400, "文件 md5 不合法")

    # Keep the client-supplied name separate from the md5 placeholder. If the
    # placeholder replaced it up front, the local branch below could never fall
    # back to the real on-disk name and would search for a file named after
    # the md5.
    requested_name = _safe_download_name(filename, "")

    proxied = await _proxy_chatlog_file(md5, requested_name or md5)
    if proxied:
        return proxied

    db_paths = await chatlog_client.get_db_paths()
    hardlink_paths = db_paths.get("media") or []
    for raw_path in hardlink_paths:
        hardlink_db = Path(raw_path)
        if not hardlink_db.exists():
            continue
        # NOTE(review): this lookup walks the filesystem synchronously inside
        # the coroutine; consider moving it to a worker thread if it proves slow.
        local_file = _find_local_file(hardlink_db, md5, requested_name)
        if not local_file:
            continue

        download_name = requested_name or local_file.name
        return FileResponse(
            path=str(local_file),
            filename=download_name,
            media_type=_guess_media_type(download_name),
            headers={
                "Content-Disposition": _content_disposition(download_name),
                "Content-Length": str(local_file.stat().st_size),
                "X-ChatLab-File-Source": "local-hardlink",
            },
        )

    raise HTTPException(404, "原文件未找到，可能未解密或已清理")