Initial commit
This commit is contained in:
248
backend/routes/health.py
Normal file
248
backend/routes/health.py
Normal file
@@ -0,0 +1,248 @@
|
||||
"""Health and infrastructure endpoints."""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import signal
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from fastapi import APIRouter, Depends
|
||||
from fastapi.responses import FileResponse
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from .. import config, models
|
||||
from ..services import tts
|
||||
from ..database import get_db
|
||||
from ..utils.platform_detect import get_backend_type
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Frontend build directory — present in Docker, absent in dev/API-only mode
|
||||
_frontend_dir = Path(__file__).resolve().parent.parent.parent / "frontend"
|
||||
|
||||
|
||||
@router.get("/")
async def root():
    """Serve the SPA's index.html when a frontend build exists, else a JSON banner.

    In the Docker image the frontend build sits next to the backend; in
    dev/API-only mode it is absent and we fall back to a version banner.
    """
    from .. import __version__

    index_path = _frontend_dir / "index.html"
    if not index_path.is_file():
        # No frontend build on disk — API-only mode.
        return {"message": "voicebox API", "version": __version__}
    return FileResponse(index_path, media_type="text/html")
|
||||
|
||||
|
||||
@router.post("/shutdown")
async def shutdown():
    """Gracefully shutdown the server.

    Schedules a SIGTERM to this process shortly after responding so the
    HTTP response can be flushed to the client before teardown begins.
    """
    loop = asyncio.get_running_loop()
    # Use call_later rather than a bare create_task: the event loop keeps
    # only a weak reference to tasks, so an unreferenced shutdown task
    # could be garbage-collected before it ever runs. TimerHandles from
    # call_later are held strongly by the loop until they fire.
    loop.call_later(0.1, os.kill, os.getpid(), signal.SIGTERM)
    return {"message": "Shutting down..."}
|
||||
|
||||
|
||||
@router.post("/watchdog/disable")
async def watchdog_disable():
    """Disable the parent-process watchdog so the server keeps running standalone."""
    # Imported lazily — presumably to avoid a circular import with
    # backend.server, which mounts this router. TODO confirm.
    from backend.server import disable_watchdog as _disable

    _disable()
    return {"message": "Watchdog disabled"}
|
||||
|
||||
|
||||
def _detect_xpu():
    """Probe for an Intel XPU device via IPEX.

    Returns a ``(available, device_name)`` tuple; ``device_name`` is None
    when no device is present.
    """
    try:
        import intel_extension_for_pytorch as ipex  # noqa: F401 -- side-effect import enables XPU
    except ImportError:
        return False, None
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        try:
            return True, torch.xpu.get_device_name(0)
        except Exception:
            # Device is usable even if the name query fails on this IPEX build.
            return True, "Intel GPU"
    return False, None


def _detect_directml():
    """Probe for a DirectML device.

    Returns a ``(available, device_name)`` tuple; ``device_name`` is None
    when no device is present.
    """
    try:
        import torch_directml
    except ImportError:
        return False, None
    if torch_directml.device_count() > 0:
        try:
            return True, torch_directml.device_name(0)
        except Exception:
            return True, "DirectML GPU"
    return False, None


def _default_model_downloaded():
    """Best-effort check that the default TTS model exists in the HF cache.

    Returns True when the model is found, False when the cache directory
    exists but holds no weight files, and None when it cannot be determined
    (check failed, or the repo simply is not in the scanned cache).
    """
    from huggingface_hub import constants as hf_constants

    try:
        from ..backends import get_model_config

        default_config = get_model_config("qwen-tts-1.7B")
        default_model_id = (
            default_config.hf_repo_id if default_config else "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
        )

        try:
            from huggingface_hub import scan_cache_dir

            for repo in scan_cache_dir().repos:
                if repo.repo_id == default_model_id:
                    return True
        except Exception:
            # scan_cache_dir unavailable or failed: fall back to checking the
            # HF cache directory layout directly for weight files.
            cache_dir = hf_constants.HF_HUB_CACHE
            repo_cache = Path(cache_dir) / ("models--" + default_model_id.replace("/", "--"))
            if repo_cache.exists():
                return any(
                    any(repo_cache.rglob(pattern))
                    for pattern in ("*.bin", "*.safetensors", "*.pt", "*.pth", "*.npz")
                )
    except Exception:
        pass
    return None


@router.get("/health", response_model=models.HealthResponse)
async def health():
    """Health check endpoint.

    Reports model load/download state, GPU availability across backends
    (CUDA / MPS / MLX-Metal / XPU / DirectML), VRAM usage, backend variant,
    and any GPU compatibility warning.
    """
    tts_model = tts.get_tts_model()
    backend_type = get_backend_type()

    has_cuda = torch.cuda.is_available()
    has_mps = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    has_xpu, xpu_name = _detect_xpu()
    has_directml, directml_name = _detect_directml()

    gpu_compat_warning = None
    if has_cuda:
        from ..backends.base import check_cuda_compatibility

        _compatible, gpu_compat_warning = check_cuda_compatibility()

    gpu_available = has_cuda or has_mps or has_xpu or has_directml or backend_type == "mlx"

    # First match wins: CUDA > MPS > MLX > XPU > DirectML.
    gpu_type = None
    if has_cuda:
        gpu_type = f"CUDA ({torch.cuda.get_device_name(0)})"
    elif has_mps:
        gpu_type = "MPS (Apple Silicon)"
    elif backend_type == "mlx":
        gpu_type = "Metal (Apple Silicon via MLX)"
    elif has_xpu:
        gpu_type = f"XPU ({xpu_name})"
    elif has_directml:
        gpu_type = f"DirectML ({directml_name})"

    vram_used = None
    if has_cuda:
        vram_used = torch.cuda.memory_allocated() / 1024 / 1024
    elif has_xpu:
        try:
            vram_used = torch.xpu.memory_allocated() / 1024 / 1024
        except Exception:
            pass  # memory_allocated() may not be available on all IPEX versions

    model_loaded = False
    model_size = None
    try:
        if tts_model.is_loaded():
            model_loaded = True
            model_size = getattr(tts_model, "_current_model_size", None)
            if not model_size:
                model_size = getattr(tts_model, "model_size", None)
    except Exception:
        # A broken model object should read as "not loaded", not crash health.
        model_loaded = False
        model_size = None

    model_downloaded = _default_model_downloaded()

    return models.HealthResponse(
        status="healthy",
        model_loaded=model_loaded,
        model_downloaded=model_downloaded,
        model_size=model_size,
        gpu_available=gpu_available,
        gpu_type=gpu_type,
        vram_used_mb=vram_used,
        backend_type=backend_type,
        backend_variant=os.environ.get(
            "VOICEBOX_BACKEND_VARIANT",
            # Reuse has_cuda rather than querying torch.cuda.is_available() again.
            "cuda" if has_cuda else ("xpu" if has_xpu else "cpu"),
        ),
        gpu_compatibility_warning=gpu_compat_warning,
    )
|
||||
|
||||
|
||||
@router.get("/health/filesystem", response_model=models.FilesystemHealthResponse)
async def filesystem_health():
    """Check filesystem health: directory existence, write permissions, and disk space."""
    import shutil

    targets = {
        "generations": config.get_generations_dir(),
        "profiles": config.get_profiles_dir(),
        "data": config.get_data_dir(),
    }

    checks: list[models.DirectoryCheck] = []
    all_ok = True

    for _label, directory in targets.items():
        dir_exists = directory.exists()
        can_write = False
        failure = None

        if not dir_exists:
            failure = "Directory does not exist"
        else:
            # Prove writability by creating and removing a small probe file.
            probe_file = directory / ".voicebox_probe"
            try:
                probe_file.write_text("ok")
                probe_file.unlink()
                can_write = True
            except PermissionError:
                failure = "Permission denied"
            except OSError as exc:
                failure = str(exc)
            finally:
                # Best-effort cleanup if the probe was written but not removed.
                try:
                    probe_file.unlink(missing_ok=True)
                except Exception:
                    pass

        all_ok = all_ok and dir_exists and can_write

        checks.append(
            models.DirectoryCheck(
                path=str(directory.resolve()),
                exists=dir_exists,
                writable=can_write,
                error=failure,
            )
        )

    disk_free_mb = None
    disk_total_mb = None
    try:
        usage = shutil.disk_usage(str(config.get_data_dir()))
        disk_free_mb = round(usage.free / (1024 * 1024), 1)
        disk_total_mb = round(usage.total / (1024 * 1024), 1)
        if disk_free_mb < 500:
            # Below ~500 MB free we consider the host unhealthy.
            all_ok = False
    except OSError:
        all_ok = False

    return models.FilesystemHealthResponse(
        healthy=all_ok,
        disk_free_mb=disk_free_mb,
        disk_total_mb=disk_total_mb,
        directories=checks,
    )
|
||||
Reference in New Issue
Block a user