Files
get_wechat/chatlog_fastAPI/routers/files.py

191 lines
6.5 KiB
Python

import hashlib
import mimetypes
import os
import re
import shutil
import sqlite3
import tempfile
from pathlib import Path
from urllib.parse import quote

import httpx
from fastapi import APIRouter, HTTPException, Query
from fastapi.responses import FileResponse, StreamingResponse

from config import settings
from services.chatlog_client import chatlog_client
# Router for the file download endpoints; every route declared on it is
# served under the /api/files prefix.
router = APIRouter(prefix="/api/files", tags=["files"])

# Explicit Content-Type overrides keyed by lower-case file extension
# (including the leading dot). _guess_media_type consults this table first
# and only falls back to the mimetypes module when the extension is absent.
OFFICE_MEDIA_TYPES = {
    ".xls": "application/vnd.ms-excel",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".ppt": "application/vnd.ms-powerpoint",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ".doc": "application/msword",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".pdf": "application/pdf",
    ".dwg": "application/acad",
}
def _connect_hardlink_db(hardlink_db: Path) -> sqlite3.Connection:
    """Open a private snapshot of *hardlink_db* and return a connection to it.

    chatlog may keep hardlink.db open. Copying a small snapshot into the
    temp directory avoids transient "unable to open database file" errors on
    Windows while keeping reads safe.

    The snapshot name is derived from the process id, a digest of the source
    path and the source mtime, so it is reused until the source changes and
    two different databases never share a snapshot. Older snapshots of the
    same source are removed on a best-effort basis whenever a new one is made.

    Args:
        hardlink_db: Path of the database to snapshot.

    Returns:
        A connection to the snapshot with ``row_factory`` set to
        ``sqlite3.Row``. The caller is responsible for closing it.

    Raises:
        OSError: If the source cannot be stat'ed or copied.
        sqlite3.Error: If the snapshot cannot be opened.
    """
    # One stat call so the name and the size check describe the same state.
    src_stat = hardlink_db.stat()
    src_key = os.path.abspath(hardlink_db).encode("utf-8", "surrogatepass")
    digest = hashlib.sha1(src_key).hexdigest()[:12]

    tmp_dir = Path(tempfile.gettempdir())
    prefix = f"chatlab_hardlink_{os.getpid()}_{digest}_"
    snapshot = tmp_dir / f"{prefix}{src_stat.st_mtime_ns}.db"

    try:
        fresh = snapshot.stat().st_size == src_stat.st_size
    except OSError:
        fresh = False

    if not fresh:
        # Copy under a scratch name and rename, so a reader never sees a
        # half-written snapshot under the final name.
        partial = snapshot.with_name(snapshot.name + ".part")
        shutil.copy2(hardlink_db, partial)
        os.replace(partial, snapshot)

        # Drop snapshots of earlier versions of this same source file.
        for stale in tmp_dir.glob(f"{prefix}*.db"):
            if stale == snapshot:
                continue
            try:
                stale.unlink()
            except OSError:
                # Still open elsewhere (Windows) or already gone; try later.
                pass

    con = sqlite3.connect(snapshot)
    con.row_factory = sqlite3.Row
    return con
def _safe_download_name(name: str, fallback: str) -> str:
    """Return *name* cleaned for use as a download file name.

    All ASCII control characters (including CR, LF, TAB, NUL and DEL) are
    removed and surrounding whitespace is stripped, so the value cannot
    break an HTTP header line or smuggle unprintable bytes into a path
    lookup.

    Args:
        name: Name supplied by the caller; may be empty or None.
        fallback: Value used when *name* is empty, and returned as-is when
            nothing is left after cleaning.

    Returns:
        The cleaned name, or *fallback* when the cleaned name is empty.
    """
    cleaned = re.sub(r"[\x00-\x1f\x7f]", "", name or fallback).strip()
    return cleaned or fallback
def _content_disposition(filename: str) -> str:
    """Build an ``attachment`` Content-Disposition header value.

    Two forms of the name are emitted: a plain ``filename`` in which every
    run of characters outside ``[A-Za-z0-9._-]`` is replaced by ``_``, and a
    ``filename*`` carrying the full name as percent-encoded UTF-8.

    Args:
        filename: Desired download name. An empty value is treated as
            ``"download"`` so neither parameter is ever empty.

    Returns:
        The header value as a string.
    """
    filename = filename or "download"
    ascii_fallback = re.sub(r"[^A-Za-z0-9._-]+", "_", filename)
    # safe="" so that "/" is percent-encoded too; quote() leaves it as-is by
    # default, and it is not a legal bare character in the filename* value.
    encoded = quote(filename, safe="")
    return f"attachment; filename=\"{ascii_fallback}\"; filename*=UTF-8''{encoded}"
def _guess_media_type(filename: str, fallback: str = "") -> str:
    """Choose a Content-Type for *filename*.

    Candidates are tried in order and the first non-empty one wins:
    the OFFICE_MEDIA_TYPES entry for the lower-cased extension, the guess
    from the ``mimetypes`` module, the caller's *fallback*, and finally
    ``application/octet-stream``.
    """
    suffix = Path(filename or "").suffix.lower()

    office_type = OFFICE_MEDIA_TYPES.get(suffix)
    if office_type:
        return office_type

    guessed, _encoding = mimetypes.guess_type(filename)
    if guessed:
        return guessed

    if fallback:
        return fallback
    return "application/octet-stream"
async def _proxy_chatlog_file(md5: str, filename: str = ""):
    """Try to fetch the file for *md5* from the chatlog service.

    Args:
        md5: File hash; it is percent-encoded before being put in the URL.
        filename: Optional download name. When given, a Content-Disposition
            header is added and the name is used to guess the media type.

    Returns:
        A StreamingResponse carrying the fetched body, or None when the
        request fails, the status is not 200, or the body is the
        ``"media not found"`` marker. None lets the caller fall back to
        another source.
    """
    # Tolerate a configured base URL that ends with "/" (avoids "//file/...").
    base_url = str(settings.chatlog_base_url).rstrip("/")
    url = f"{base_url}/file/{quote(md5, safe='')}"

    try:
        async with httpx.AsyncClient(timeout=30, trust_env=False, follow_redirects=True) as client:
            resp = await client.get(url)
    except (httpx.HTTPError, httpx.InvalidURL):
        # Upstream unreachable or misconfigured: treat as "not available".
        return None

    # The whole body is already in memory at this point.
    body = resp.content
    # Compare after stripping so a trailing newline on the marker is accepted.
    if resp.status_code != 200 or body.strip() == b'"media not found"':
        return None

    headers = {
        "Content-Length": str(len(body)),
        "X-ChatLab-File-Source": "chatlog",
    }
    if filename:
        headers["Content-Disposition"] = _content_disposition(filename)
    media_type = _guess_media_type(filename, resp.headers.get("content-type") or "")
    return StreamingResponse(iter([body]), media_type=media_type, headers=headers)
def _xwechat_roots_from_hardlink_db(hardlink_db: Path) -> list[Path]:
    """List candidate root directories in which to search for local files.

    The ``uuid`` entry of the ``db_info`` table is read and, if it contains
    a Windows drive path ending in ``xwechat_files``, that path is listed
    first. Two default locations under the user's home directory are always
    appended. Duplicates are removed case-insensitively, keeping the first
    occurrence.

    Args:
        hardlink_db: Path of the hardlink database to inspect.

    Returns:
        Candidate roots in priority order. The directories are not checked
        for existence here.
    """
    raw = ""
    try:
        con = _connect_hardlink_db(hardlink_db)
        try:
            row = con.execute("SELECT ValueStdStr FROM db_info WHERE Key='uuid'").fetchone()
        finally:
            # Release the snapshot handle whether or not the query worked.
            con.close()
        if row:
            raw = row["ValueStdStr"]
    except (OSError, sqlite3.Error):
        # Best effort: an unreadable database just means no extra root.
        raw = ""

    roots: list[Path] = []
    # Guard the type so a NULL/BLOB value cannot crash the regex search.
    if isinstance(raw, str) and raw:
        m = re.search(r"([A-Za-z]:\\[^|]+?xwechat_files)", raw)
        if m:
            roots.append(Path(m.group(1)))
    roots.extend([
        Path.home() / "xwechat_files",
        Path.home() / "Documents" / "WeChat Files",
    ])

    # dict preserves insertion order, so the first spelling of a path wins.
    unique: dict[str, Path] = {}
    for root in roots:
        unique.setdefault(str(root).lower(), root)
    return list(unique.values())
def _first_named_file(root: Path, wanted: dict[str, int], size: int) -> Path | None:
    """Return the best file under *root* whose name is a key of *wanted*.

    *wanted* maps a normalised file name to its priority (0 is best). Names
    are compared literally, never as glob patterns. When *size* is non-zero
    the file must have exactly that size.
    """
    best: Path | None = None
    best_rank = len(wanted)
    for dirpath, _dirnames, filenames in os.walk(root):
        for fname in filenames:
            rank = wanted.get(os.path.normcase(fname))
            if rank is None or rank >= best_rank:
                continue
            candidate = Path(dirpath) / fname
            try:
                if not candidate.is_file():
                    continue
                if size and candidate.stat().st_size != size:
                    continue
            except OSError:
                continue
            if rank == 0:
                # Nothing can beat the top-priority name; stop walking.
                return candidate
            best, best_rank = candidate, rank
    return best


def _first_sized_file(root: Path, size: int) -> Path | None:
    """Return the first file under ``root/*/msg/file`` with exactly *size* bytes."""
    # Limited to msg/file to avoid scanning unrelated huge trees per request.
    for file_root in root.glob("*/msg/file"):
        for dirpath, _dirnames, filenames in os.walk(file_root):
            for fname in filenames:
                candidate = Path(dirpath) / fname
                try:
                    if candidate.is_file() and candidate.stat().st_size == size:
                        return candidate
                except OSError:
                    continue
    return None


def _find_local_file(hardlink_db: Path, md5: str, requested_name: str = "") -> Path | None:
    """Locate the on-disk file recorded for *md5* in the hardlink database.

    The newest ``file_hardlink_info_v4`` row for the hash supplies the
    recorded file name and size. Each existing candidate root is then
    searched for a file named like *requested_name* (preferred) or like the
    recorded name; when a non-zero size is recorded the file must match it.
    If no name is available at all, a file of the recorded size under
    ``*/msg/file`` is accepted instead.

    Names are matched literally on their final path component, so
    wildcards, brackets or directory parts in *requested_name* cannot widen
    the search.

    Args:
        hardlink_db: Path of the hardlink database.
        md5: Hash to look up.
        requested_name: Name the caller asked for, if any.

    Returns:
        The matching file path, or None when the hash is unknown, the
        database cannot be read, or no file matches.
    """
    try:
        con = _connect_hardlink_db(hardlink_db)
        try:
            row = con.execute(
                """
                SELECT md5, file_name, file_size, dir1, dir2
                FROM file_hardlink_info_v4
                WHERE md5=?
                ORDER BY _rowid_ DESC
                LIMIT 1
                """,
                (md5,),
            ).fetchone()
        finally:
            # Release the snapshot handle whether or not the query worked.
            con.close()
    except (OSError, sqlite3.Error):
        row = None
    if not row:
        return None

    size = int(row["file_size"] or 0)

    # Priority-ordered literal names: the caller's name first, then the
    # recorded one. Only the last path component is kept.
    wanted: dict[str, int] = {}
    for raw_name in (requested_name, row["file_name"]):
        base = re.split(r"[\\/]", raw_name or "")[-1]
        if base:
            wanted.setdefault(os.path.normcase(base), len(wanted))

    for root in _xwechat_roots_from_hardlink_db(hardlink_db):
        if not root.exists():
            continue
        if wanted:
            found = _first_named_file(root, wanted, size)
        elif size:
            found = _first_sized_file(root, size)
        else:
            found = None
        if found:
            return found
    return None
@router.get("/{md5}")
async def get_file(md5: str, filename: str = Query("")):
    """Serve the file identified by *md5*.

    The chatlog service is tried first. If it has nothing, each existing
    database path listed under ``media`` by the chatlog client is searched
    for a local copy.

    Args:
        md5: Hex hash of the file, 8 to 64 hex digits.
        filename: Optional download name. When omitted, a proxied file is
            named after the hash and a local file keeps its on-disk name.

    Raises:
        HTTPException: 400 when *md5* is not a valid hex string, 404 when
            the file cannot be found in any source.
    """
    md5 = md5.strip()
    if not re.fullmatch(r"[0-9a-fA-F]{8,64}", md5):
        raise HTTPException(400, "文件 md5 不合法")

    # Empty when the client supplied no usable name. Kept separate from the
    # md5 fallback so a local file can still be served under its real name.
    requested_name = _safe_download_name(filename, "")
    lookup_name = requested_name or md5

    proxied = await _proxy_chatlog_file(md5, lookup_name)
    if proxied:
        return proxied

    db_paths = (await chatlog_client.get_db_paths()) or {}
    for raw_path in db_paths.get("media") or []:
        hardlink_db = Path(raw_path)
        if not hardlink_db.exists():
            continue
        local_file = _find_local_file(hardlink_db, md5, lookup_name)
        if not local_file:
            continue
        try:
            file_size = local_file.stat().st_size
        except OSError:
            # The file vanished after it was found; try the next database.
            continue
        download_name = requested_name or local_file.name
        return FileResponse(
            path=str(local_file),
            filename=download_name,
            media_type=_guess_media_type(download_name),
            headers={
                "Content-Disposition": _content_disposition(download_name),
                "Content-Length": str(file_size),
                "X-ChatLab-File-Source": "local-hardlink",
            },
        )
    raise HTTPException(404, "原文件未找到,可能未解密或已清理")