Initial commit
This commit is contained in:
458
backend/build_binary.py
Normal file
458
backend/build_binary.py
Normal file
@@ -0,0 +1,458 @@
|
||||
"""
|
||||
PyInstaller build script for creating standalone Python server binary.
|
||||
|
||||
Usage:
|
||||
python build_binary.py # Build default (CPU) server binary
|
||||
python build_binary.py --cuda # Build CUDA-enabled server binary
|
||||
"""
|
||||
|
||||
import PyInstaller.__main__
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Module-level logger for build progress messages; handler/level configuration
# is left to whoever runs this script.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def is_apple_silicon():
    """Return True when the host is an ARM64 (Apple Silicon) Mac."""
    on_macos = platform.system() == "Darwin"
    on_arm64 = platform.machine() == "arm64"
    return on_macos and on_arm64
|
||||
|
||||
|
||||
def _pip_install_torch(index_url):
    """Force-reinstall torch/torchvision/torchaudio from the given pip index.

    Used to swap between CPU-only and CUDA torch wheels around a build so the
    same command list is not duplicated at the swap and restore sites.
    """
    import subprocess

    subprocess.run(
        [
            sys.executable,
            "-m",
            "pip",
            "install",
            "torch",
            "torchvision",
            "torchaudio",
            "--index-url",
            index_url,
            "--force-reinstall",
            "-q",
        ],
        check=True,
    )


def build_server(cuda=False):
    """Build Python server as standalone binary.

    Args:
        cuda: If True, build with CUDA support and name the binary
            voicebox-server-cuda instead of voicebox-server.
    """
    # Imported once here (instead of twice, at the swap and restore sites)
    # because this script's fast path — `--help` — never needs it.
    import subprocess

    backend_dir = Path(__file__).parent

    binary_name = "voicebox-server-cuda" if cuda else "voicebox-server"

    # PyInstaller arguments
    # CUDA builds use --onedir so we can split the output into two archives:
    #   1. Server core (~200-400MB) — versioned with the app
    #   2. CUDA libs (~2GB) — versioned independently (only redownloaded on
    #      CUDA toolkit / torch major version changes)
    # CPU builds remain --onefile for simplicity.
    pack_mode = "--onedir" if cuda else "--onefile"
    args = [
        "server.py",  # Use server.py as entry point instead of main.py
        pack_mode,
        "--name",
        binary_name,
    ]

    # Hide console window on Windows only. On macOS/Linux the sidecar needs
    # stdout/stderr for Tauri to capture logs.
    if platform.system() == "Windows":
        args.append("--noconsole")

    # numpy 2.x / torch ABI mismatch fix: install memmove fallback for
    # torch.from_numpy() before the app starts. Runtime hooks run after
    # FrozenImporter is registered so frozen torch/numpy are importable.
    # Paths are passed relative to backend_dir because os.chdir(backend_dir)
    # runs before PyInstaller. Absolute paths would get baked into the
    # generated .spec, breaking reproducible builds on other machines / CI.
    args.extend(
        [
            "--runtime-hook",
            "pyi_rth_numpy_compat.py",
            # Stub torch.compiler.disable before transformers imports
            # flex_attention, which otherwise triggers torch._dynamo →
            # torch._numpy._ufuncs and crashes at module load under
            # PyInstaller. See pyi_rth_torch_compiler_disable.py.
            "--runtime-hook",
            "pyi_rth_torch_compiler_disable.py",
            # Per-module collection overrides (e.g. forcing scipy.stats._distn_infrastructure
            # to bundle .py source alongside .pyc so the runtime hook can source-patch it).
            "--additional-hooks-dir",
            "pyi_hooks",
        ]
    )

    # Add local qwen_tts path if specified (for editable installs)
    qwen_tts_path = os.getenv("QWEN_TTS_PATH")
    if qwen_tts_path and Path(qwen_tts_path).exists():
        args.extend(["--paths", str(qwen_tts_path)])
        logger.info("Using local qwen_tts source from: %s", qwen_tts_path)

    # Add common hidden imports
    args.extend(
        [
            "--hidden-import",
            "backend",
            "--hidden-import",
            "backend.main",
            "--hidden-import",
            "backend.config",
            "--hidden-import",
            "backend.database",
            "--hidden-import",
            "backend.models",
            "--hidden-import",
            "backend.services.profiles",
            "--hidden-import",
            "backend.services.history",
            "--hidden-import",
            "backend.services.tts",
            "--hidden-import",
            "backend.services.transcribe",
            "--hidden-import",
            "backend.utils.platform_detect",
            "--hidden-import",
            "backend.backends",
            "--hidden-import",
            "backend.backends.pytorch_backend",
            "--hidden-import",
            "backend.backends.qwen_custom_voice_backend",
            "--hidden-import",
            "backend.utils.audio",
            "--hidden-import",
            "backend.utils.cache",
            "--hidden-import",
            "backend.utils.progress",
            "--hidden-import",
            "backend.utils.hf_progress",
            "--hidden-import",
            "backend.services.cuda",
            "--hidden-import",
            "backend.services.effects",
            "--hidden-import",
            "backend.utils.effects",
            "--hidden-import",
            "backend.services.versions",
            "--hidden-import",
            "pedalboard",
            "--hidden-import",
            "chatterbox",
            "--hidden-import",
            "chatterbox.tts_turbo",
            "--hidden-import",
            "chatterbox.mtl_tts",
            "--hidden-import",
            "backend.backends.chatterbox_backend",
            "--hidden-import",
            "backend.backends.chatterbox_turbo_backend",
            # chatterbox multilingual uses spacy_pkuseg for Chinese word
            # segmentation, which ships pickled dict files (dicts/default.pkl)
            # and native .so extensions that --hidden-import alone won't bundle.
            "--collect-all",
            "spacy_pkuseg",
            "--hidden-import",
            "backend.backends.luxtts_backend",
            "--hidden-import",
            "zipvoice",
            "--hidden-import",
            "zipvoice.luxvoice",
            "--collect-all",
            "zipvoice",
            "--collect-all",
            "linacodec",
            "--hidden-import",
            "torch",
            "--hidden-import",
            "transformers",
            "--hidden-import",
            "fastapi",
            "--hidden-import",
            "uvicorn",
            "--hidden-import",
            "sqlalchemy",
            # librosa uses lazy_loader which generates .pyi stub files at
            # install time and reads them at runtime to discover submodules.
            # --hidden-import alone doesn't bundle the stubs, causing
            # "Cannot load imports from non-existent stub" at runtime.
            "--collect-all",
            "lazy_loader",
            "--collect-all",
            "librosa",
            "--hidden-import",
            "soundfile",
            "--hidden-import",
            "qwen_tts",
            "--hidden-import",
            "qwen_tts.inference",
            "--hidden-import",
            "qwen_tts.inference.qwen3_tts_model",
            "--hidden-import",
            "qwen_tts.inference.qwen3_tts_tokenizer",
            "--hidden-import",
            "qwen_tts.core",
            "--hidden-import",
            "qwen_tts.cli",
            "--copy-metadata",
            "qwen-tts",
            "--copy-metadata",
            "requests",
            "--copy-metadata",
            "transformers",
            "--copy-metadata",
            "huggingface-hub",
            "--copy-metadata",
            "tokenizers",
            "--copy-metadata",
            "safetensors",
            "--copy-metadata",
            "tqdm",
            "--hidden-import",
            "requests",
            # qwen_tts uses inspect.getsource() at runtime to locate
            # modeling_qwen3_tts.py — needs physical .py source files bundled
            "--collect-all",
            "qwen_tts",
            # Fix for pkg_resources and jaraco namespace packages
            "--hidden-import",
            "pkg_resources.extern",
            "--collect-submodules",
            "jaraco",
            # inflect uses typeguard @typechecked which calls inspect.getsource()
            # at import time — needs .py source files, not just .pyc bytecode
            "--collect-all",
            "inflect",
            # perth ships pretrained watermark model files (hparams.yaml, .pth.tar)
            # in perth/perth_net/pretrained/ — needed by chatterbox at runtime
            "--collect-all",
            "perth",
            # piper_phonemize ships espeak-ng-data/ (phoneme tables, language dicts)
            # needed by LuxTTS for text-to-phoneme conversion
            "--collect-all",
            "piper_phonemize",
            # HumeAI TADA — speech-language model using Llama + flow matching
            "--hidden-import",
            "backend.backends.hume_backend",
            "--hidden-import",
            "tada",
            "--hidden-import",
            "tada.modules",
            "--hidden-import",
            "tada.modules.tada",
            "--hidden-import",
            "tada.modules.encoder",
            "--hidden-import",
            "tada.modules.decoder",
            "--hidden-import",
            "tada.modules.aligner",
            "--hidden-import",
            "tada.modules.acoustic_spkr_verf",
            "--hidden-import",
            "tada.nn",
            "--hidden-import",
            "tada.nn.vibevoice",
            "--hidden-import",
            "tada.utils",
            "--hidden-import",
            "tada.utils.gray_code",
            "--hidden-import",
            "tada.utils.text",
            # DAC shim — provides dac.nn.layers.Snake1d without the real
            # descript-audio-codec package (which pulls onnx/tensorboard via
            # descript-audiotools). The shim is in backend/utils/dac_shim.py.
            "--hidden-import",
            "backend.utils.dac_shim",
            "--hidden-import",
            "torchaudio",
            "--collect-submodules",
            "tada",
            # Kokoro 82M — lightweight TTS engine using misaki G2P
            # collect-all is required because transformers introspects .py source
            # files at runtime (e.g. _can_set_attn_implementation opens the class
            # file); hidden-import alone only bundles bytecode.
            "--hidden-import",
            "backend.backends.kokoro_backend",
            "--collect-all",
            "kokoro",
            # misaki ships G2P data files (dictionaries, phoneme tables)
            # that must be bundled for espeak/en/ja/zh G2P to work
            "--collect-all",
            "misaki",
            # language_tags ships JSON data files (index.json etc.) loaded at
            # runtime via: misaki → phonemizer → segments → csvw → language_tags
            "--collect-all",
            "language_tags",
            # espeakng_loader ships the entire espeak-ng-data directory (369 files)
            # loaded at import time by misaki.espeak via get_data_path()
            "--collect-all",
            "espeakng_loader",
            # spacy en_core_web_sm model — misaki.en tries to spacy.cli.download()
            # at runtime if not found, which calls pip as a subprocess and crashes
            # the frozen binary. Bundle the model so spacy.util.is_package() passes.
            "--collect-all",
            "en_core_web_sm",
            "--copy-metadata",
            "en_core_web_sm",
            "--hidden-import",
            "en_core_web_sm",
            # unidic-lite ships the MeCab dictionary used by fugashi (pulled in
            # by misaki[ja]). The dict lives in unidic_lite/dicdir/ and is
            # discovered via the package's DICDIR constant, so the data files
            # must be collected or Japanese Kokoro voices crash at runtime.
            "--collect-all",
            "unidic_lite",
            "--hidden-import",
            "loguru",
        ]
    )

    # Add CUDA-specific hidden imports
    if cuda:
        logger.info("Building with CUDA support")
        args.extend(
            [
                "--hidden-import",
                "torch.cuda",
                "--hidden-import",
                "torch.backends.cudnn",
            ]
        )
    else:
        # Exclude NVIDIA CUDA packages from CPU-only builds to keep binary small.
        # When building from a venv with CUDA torch installed, PyInstaller would
        # bundle ~3GB of NVIDIA shared libraries. We exclude both the Python
        # modules and the binary DLLs.
        nvidia_packages = [
            "nvidia",
            "nvidia.cublas",
            "nvidia.cuda_cupti",
            "nvidia.cuda_nvrtc",
            "nvidia.cuda_runtime",
            "nvidia.cudnn",
            "nvidia.cufft",
            "nvidia.curand",
            "nvidia.cusolver",
            "nvidia.cusparse",
            "nvidia.nccl",
            "nvidia.nvjitlink",
            "nvidia.nvtx",
        ]
        for pkg in nvidia_packages:
            args.extend(["--exclude-module", pkg])

    # Add MLX-specific imports if building on Apple Silicon (never for CUDA builds)
    if is_apple_silicon() and not cuda:
        logger.info("Building for Apple Silicon - including MLX dependencies")
        args.extend(
            [
                "--hidden-import",
                "backend.backends.mlx_backend",
                "--hidden-import",
                "mlx",
                "--hidden-import",
                "mlx.core",
                "--hidden-import",
                "mlx.nn",
                "--hidden-import",
                "mlx_audio",
                "--hidden-import",
                "mlx_audio.tts",
                "--hidden-import",
                "mlx_audio.stt",
                "--collect-submodules",
                "mlx",
                "--collect-submodules",
                "mlx_audio",
                # Use --collect-all so PyInstaller bundles both data files AND
                # native shared libraries (.dylib, .metallib) for MLX.
                # Previously only --collect-data was used, which caused MLX to
                # raise OSError at runtime inside the bundled binary because
                # the Metal shader libraries were missing.
                "--collect-all",
                "mlx",
                "--collect-all",
                "mlx_audio",
            ]
        )
    elif not cuda:
        logger.info("Building for non-Apple Silicon platform - PyTorch only")

    dist_dir = str(backend_dir / "dist")
    build_dir = str(backend_dir / "build")

    args.extend(
        [
            "--distpath",
            dist_dir,
            "--workpath",
            build_dir,
            "--noconfirm",
            "--clean",
        ]
    )

    # Change to backend directory so the relative hook/entry-point paths resolve
    os.chdir(backend_dir)

    # For CPU builds on Windows, ensure we're using CPU-only torch.
    # If CUDA torch is installed (local dev), swap to CPU torch before building,
    # then restore CUDA torch after. This prevents PyInstaller from bundling
    # ~3GB of CUDA DLLs into the CPU binary.
    restore_cuda = False
    if not cuda and platform.system() == "Windows":
        # torch.version.cuda is None on CPU wheels, so empty stdout means
        # "no CUDA torch" (including when torch is not importable at all).
        result = subprocess.run(
            [sys.executable, "-c", "import torch; print(torch.version.cuda or '')"],
            capture_output=True,
            text=True,
        )
        if result.stdout.strip():
            logger.info("CUDA torch detected — installing CPU torch for CPU build...")
            _pip_install_torch("https://download.pytorch.org/whl/cpu")
            restore_cuda = True

    # Run PyInstaller
    try:
        PyInstaller.__main__.run(args)
    finally:
        # Restore CUDA torch if we swapped it out (even on build failure)
        if restore_cuda:
            logger.info("Restoring CUDA torch...")
            _pip_install_torch("https://download.pytorch.org/whl/cu128")

    logger.info("Binary built in %s", backend_dir / "dist" / binary_name)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Configure logging so this script's logger.info progress messages are
    # actually shown: without a handler, Python's last-resort handler only
    # emits WARNING and above, silently dropping every INFO line.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(description="Build voicebox-server binary")
    parser.add_argument(
        "--cuda",
        action="store_true",
        help="Build CUDA-enabled binary (voicebox-server-cuda)",
    )
    cli_args = parser.parse_args()
    build_server(cuda=cli_args.cuda)
|
||||
Reference in New Issue
Block a user