"""File download routes: proxy a file from chatlog, else look it up on local disk."""

import asyncio
import mimetypes
import os
import re
import shutil
import sqlite3
import tempfile
from collections.abc import Iterator
from contextlib import closing
from pathlib import Path
from urllib.parse import quote

import httpx
from fastapi import APIRouter, HTTPException, Query
from fastapi.responses import FileResponse, StreamingResponse

from config import settings
from services.chatlog_client import chatlog_client

router = APIRouter(prefix="/api/files", tags=["files"])

# Explicit media types for extensions that are checked before falling back to
# ``mimetypes.guess_type``.
OFFICE_MEDIA_TYPES = {
    ".xls": "application/vnd.ms-excel",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".ppt": "application/vnd.ms-powerpoint",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ".doc": "application/msword",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".pdf": "application/pdf",
    ".dwg": "application/acad",
}


def _connect_hardlink_db(hardlink_db: Path) -> sqlite3.Connection:
    """Open a read connection to a temp-dir snapshot of *hardlink_db*.

    chatlog may keep hardlink.db open. Copying a tiny snapshot avoids transient
    "unable to open database file" errors on Windows while keeping reads safe.

    The snapshot name embeds the process id and the source file's mtime, and the
    copy is refreshed when the snapshot is missing or its size differs.

    Returns:
        A connection with ``row_factory`` set to ``sqlite3.Row``. The caller
        owns it and must close it.

    Raises:
        OSError: if the source file cannot be stat'ed or copied.
        sqlite3.Error: if the snapshot cannot be opened.
    """
    tmp = Path(tempfile.gettempdir()) / f"chatlab_hardlink_{os.getpid()}_{hardlink_db.stat().st_mtime_ns}.db"
    if not tmp.exists() or tmp.stat().st_size != hardlink_db.stat().st_size:
        shutil.copy2(hardlink_db, tmp)
    con = sqlite3.connect(tmp)
    con.row_factory = sqlite3.Row
    return con


def _safe_download_name(name: str, fallback: str) -> str:
    """Return *name* (or *fallback* when empty) with CR/LF removed and whitespace trimmed.

    Stripping CR/LF keeps the value from breaking out of a response header line.
    """
    name = (name or fallback).replace("\r", "").replace("\n", "").strip()
    return name or fallback


def _content_disposition(filename: str) -> str:
    """Build an ``attachment`` Content-Disposition value for *filename*.

    Emits an ASCII-only ``filename`` plus a percent-encoded UTF-8 ``filename*``.
    """
    # safe="" so that "/" is percent-encoded too; it is not a legal bare
    # character in the extended ``filename*`` value.
    quoted = quote(filename, safe="")
    ascii_fallback = re.sub(r"[^A-Za-z0-9._-]+", "_", filename) or "download"
    return f"attachment; filename=\"{ascii_fallback}\"; filename*=UTF-8''{quoted}"


def _guess_media_type(filename: str, fallback: str = "") -> str:
    """Pick a media type for *filename*.

    Order: ``OFFICE_MEDIA_TYPES`` by extension, then ``mimetypes``, then
    *fallback*, then ``application/octet-stream``.
    """
    ext = Path(filename or "").suffix.lower()
    return OFFICE_MEDIA_TYPES.get(ext) or mimetypes.guess_type(filename)[0] or fallback or "application/octet-stream"


async def _proxy_chatlog_file(md5: str, filename: str = ""):
    """Fetch ``/file/<md5>`` from the chatlog service and wrap it in a response.

    Returns:
        A ``StreamingResponse`` carrying the fetched bytes, or ``None`` when the
        request fails, the status is not 200, or the body is the literal
        ``"media not found"`` marker.
    """
    url = f"{settings.chatlog_base_url}/file/{quote(md5, safe='')}"
    try:
        async with httpx.AsyncClient(timeout=30, trust_env=False, follow_redirects=True) as client:
            resp = await client.get(url)
    except Exception:
        # Best effort: any failure here just means the caller falls back to
        # the local-disk lookup.
        return None
    if resp.status_code != 200 or resp.content == b'"media not found"':
        return None
    headers = {
        "Content-Length": str(len(resp.content)),
        "X-ChatLab-File-Source": "chatlog",
    }
    if filename:
        headers["Content-Disposition"] = _content_disposition(filename)
    media_type = _guess_media_type(filename, resp.headers.get("content-type") or "")
    return StreamingResponse(iter([resp.content]), media_type=media_type, headers=headers)


def _xwechat_roots_from_hardlink_db(hardlink_db: Path) -> list[Path]:
    """List candidate data roots to search for files referenced by *hardlink_db*.

    The first candidate, when present, is a ``<drive>:\\...xwechat_files`` path
    extracted from the ``uuid`` entry of the ``db_info`` table. Two locations
    under the user's home directory are always appended.

    Returns:
        Candidate roots, de-duplicated case-insensitively, in priority order.
        Existence is not checked here.
    """
    roots: list[Path] = []
    raw = ""
    try:
        # closing() releases the connection (and the snapshot file handle).
        with closing(_connect_hardlink_db(hardlink_db)) as con:
            row = con.execute("SELECT ValueStdStr FROM db_info WHERE Key='uuid'").fetchone()
            raw = row["ValueStdStr"] if row else ""
    except Exception:
        raw = ""
    # Guard the type: a non-text column value would make re.search raise.
    if raw and isinstance(raw, str):
        m = re.search(r"([A-Za-z]:\\[^|]+?xwechat_files)", raw)
        if m:
            roots.append(Path(m.group(1)))
    roots.extend([
        Path.home() / "xwechat_files",
        Path.home() / "Documents" / "WeChat Files",
    ])
    uniq: list[Path] = []
    seen = set()
    for root in roots:
        s = str(root).lower()
        if s not in seen:
            uniq.append(root)
            seen.add(s)
    return uniq


def _plain_file_name(name: str | None) -> str:
    """Return the last path component of *name*, or ``""`` if it is not a usable file name."""
    base = re.split(r"[\\/]", name or "")[-1].strip()
    return "" if base in ("", ".", "..") else base


def _escape_glob(name: str) -> str:
    """Escape glob metacharacters so *name* is matched literally by ``Path.glob``."""
    return re.sub(r"([*?\[])", r"[\1]", name)


def _iter_glob(root: Path, pattern: str, *, recursive: bool = False) -> Iterator[Path]:
    """Yield glob matches under *root*, ending quietly if the pattern or the walk fails."""
    try:
        yield from (root.rglob(pattern) if recursive else root.glob(pattern))
    except (OSError, ValueError, NotImplementedError):
        # Unreadable directories or patterns pathlib rejects end the walk
        # instead of failing the whole request.
        return


def _find_local_file(hardlink_db: Path, md5: str, requested_name: str = "") -> Path | None:
    """Locate on disk the file that *hardlink_db* records for *md5*.

    The newest ``file_hardlink_info_v4`` row for *md5* supplies a file name and
    size. Each candidate root is searched recursively for *requested_name* and
    the recorded name; a hit must be a regular file and, when a size is
    recorded, have that exact size.

    Args:
        hardlink_db: Path to the hardlink database.
        md5: Hash used as the lookup key.
        requested_name: Optional caller-supplied name to try first. It is
            treated as a literal file name, never as a glob pattern.

    Returns:
        The first matching path, or ``None`` when the row or the file is missing.
    """
    try:
        with closing(_connect_hardlink_db(hardlink_db)) as con:
            row = con.execute(
                """
                SELECT md5, file_name, file_size, dir1, dir2
                FROM file_hardlink_info_v4
                WHERE md5=?
                ORDER BY _rowid_ DESC
                LIMIT 1
                """,
                (md5,),
            ).fetchone()
    except Exception:
        row = None
    if not row:
        return None

    # Literal base names only, de-duplicated so an identical requested and
    # recorded name does not trigger two full tree walks.
    names = list(dict.fromkeys(
        base
        for base in (_plain_file_name(requested_name), _plain_file_name(row["file_name"]))
        if base
    ))
    size = int(row["file_size"] or 0)

    for root in _xwechat_roots_from_hardlink_db(hardlink_db):
        if not root.exists():
            continue
        for name in names:
            for candidate in _iter_glob(root, _escape_glob(name), recursive=True):
                try:
                    if candidate.is_file() and (not size or candidate.stat().st_size == size):
                        return candidate
                except (OSError, ValueError):
                    continue

        if size:
            # Fallback by size in the common file store. This is intentionally limited
            # to msg/file to avoid scanning unrelated huge trees for every request.
            for file_root in _iter_glob(root, "*/msg/file"):
                if not file_root.exists():
                    continue
                for candidate in _iter_glob(file_root, "*", recursive=True):
                    try:
                        if candidate.is_file() and candidate.stat().st_size == size:
                            if not names or candidate.name in names:
                                return candidate
                    except (OSError, ValueError):
                        continue
    return None


@router.get("/{md5}")
async def get_file(md5: str, filename: str = Query("")):
    """Serve the file identified by *md5* as a download.

    Tries the chatlog proxy first. If that yields nothing, each database path
    listed under ``"media"`` by ``chatlog_client.get_db_paths()`` is used to
    look the file up on local disk.

    Raises:
        HTTPException: 400 when *md5* is not 8-64 hex characters; 404 when the
            file cannot be found by either route.
    """
    md5 = md5.strip()
    if not re.fullmatch(r"[0-9a-fA-F]{8,64}", md5):
        raise HTTPException(400, "文件 md5 不合法")
    # Falls back to the md5 itself, so ``filename`` is never empty below.
    filename = _safe_download_name(filename, md5)

    proxied = await _proxy_chatlog_file(md5, filename)
    if proxied:
        return proxied

    db_paths = await chatlog_client.get_db_paths()
    hardlink_paths = db_paths.get("media") or []
    for raw_path in hardlink_paths:
        hardlink_db = Path(raw_path)
        if not hardlink_db.exists():
            continue
        # The lookup does blocking SQLite and directory-walk I/O; run it in a
        # worker thread so the event loop stays responsive.
        local_file = await asyncio.to_thread(_find_local_file, hardlink_db, md5, filename)
        if local_file:
            download_name = filename or local_file.name
            media_type = _guess_media_type(download_name)
            return FileResponse(
                path=str(local_file),
                filename=download_name,
                media_type=media_type,
                headers={
                    "Content-Disposition": _content_disposition(download_name),
                    "Content-Length": str(local_file.stat().st_size),
                    "X-ChatLab-File-Source": "local-hardlink",
                },
            )

    raise HTTPException(404, "原文件未找到，可能未解密或已清理")