commit fbcbe08696e8458ef652592f33c04d6ccbda1741 Author: BorderArea01 <3109362852@qq.com> Date: Fri Apr 24 19:18:15 2026 +0800 Initial commit diff --git a/.agents/skills/add-tts-engine/SKILL.md b/.agents/skills/add-tts-engine/SKILL.md new file mode 100644 index 0000000..b28a73f --- /dev/null +++ b/.agents/skills/add-tts-engine/SKILL.md @@ -0,0 +1,120 @@ +--- +name: add-tts-engine +description: Use this skill to add a new TTS engine to Voicebox. It walks through dependency research, backend implementation, frontend wiring, PyInstaller bundling, and frozen-build testing. Always start with Phase 0 (dependency audit) before writing any code. +--- + +# Add TTS Engine + +## Goal + +Integrate a new text-to-speech engine into Voicebox end-to-end: dependency research, backend protocol implementation, frontend UI wiring, PyInstaller bundling, and frozen-build verification. The user should only need to test the final build locally. + +## Reference Doc + +The full phased guide lives at `docs/content/docs/developer/tts-engines.mdx`. **Read this file in its entirety before starting.** It contains: + +- Phase 0: Dependency research (mandatory before writing code) +- Phase 1: Backend implementation (`TTSBackend` protocol) +- Phase 2: Route and service integration (usually zero changes) +- Phase 3: Frontend integration (5 files) +- Phase 4: Dependencies (`requirements.txt`, justfile, CI, Docker) +- Phase 5: PyInstaller bundling (`build_binary.py` + `server.py`) +- Phase 6: Common upstream workarounds +- Implementation checklist (gate between phases) + +## Workflow + +### 1. Read the guide + +```bash +# Read the full TTS engines doc +cat docs/content/docs/developer/tts-engines.mdx +``` + +Internalize all phases, especially Phase 0 and Phase 5. The v0.2.3 release was three patch releases because Phase 0 was skipped. + +### 2. Dependency research (Phase 0) + +Clone the model library into a temporary directory and audit it. Do NOT skip this. 
+```bash
+mkdir /tmp/engine-research && cd /tmp/engine-research
+git clone <repo-url>
+```
+
+Run the grep searches from Phase 0.2 in the guide against the cloned source and its transitive dependencies. Produce a written dependency audit covering:
+
+1. PyPI vs non-PyPI packages
+2. PyInstaller directives needed (`--collect-all`, `--copy-metadata`, `--hidden-import`)
+3. Runtime data files that must be bundled
+4. Native library paths that need env var overrides in frozen builds
+5. Monkey-patches needed (`torch.load`, float64, MPS, HF token)
+6. Sample rate
+7. Model download method (`from_pretrained` vs `snapshot_download` + `from_local`)
+
+Test model loading and generation on CPU in the throwaway venv before proceeding.
+
+### 3. Implement (Phases 1–4)
+
+Follow the guide's phases in order. Key files to modify:
+
+**Backend (Phase 1):**
+- Create `backend/backends/<engine>_backend.py`
+- Register in `backend/backends/__init__.py` (ModelConfig + TTS_ENGINES + factory)
+- Update regex in `backend/models.py`
+
+**Frontend (Phase 3):**
+- `app/src/lib/api/types.ts` — engine union type
+- `app/src/lib/constants/languages.ts` — ENGINE_LANGUAGES
+- `app/src/components/Generation/EngineModelSelector.tsx` — ENGINE_OPTIONS, ENGINE_DESCRIPTIONS
+- `app/src/lib/hooks/useGenerationForm.ts` — Zod schema, model-name mapping
+- `app/src/components/ServerSettings/ModelManagement.tsx` — MODEL_DESCRIPTIONS
+
+**Dependencies (Phase 4):**
+- `backend/requirements.txt`
+- `justfile` (setup-python, setup-python-release targets)
+- `.github/workflows/release.yml`
+- `Dockerfile` (if applicable)
+
+### 4. 
PyInstaller bundling (Phase 5) + +Register the engine in `backend/build_binary.py`: +- `--hidden-import` for the backend module and model package +- `--collect-all` for packages using `inspect.getsource`, shipping data files, or native libraries +- `--copy-metadata` for packages using `importlib.metadata` + +If the engine has native data paths, add `os.environ.setdefault()` in `backend/server.py` inside the `if getattr(sys, 'frozen', False):` block. + +### 5. Verify in dev mode + +```bash +just dev +``` + +Test the full chain: model download → load → generate → voice cloning. + +### 6. Use the checklist + +Walk through the Implementation Checklist at the bottom of `tts-engines.mdx`. Every item must be checked before handing the build to the user. + +## Key Lessons (from v0.2.3) + +These are the most common failure modes. Phase 0 research catches all of them: + +| Pattern | Symptom in Frozen Build | Fix | +|---------|------------------------|-----| +| `@typechecked` / `inspect.getsource()` | "could not get source code" | `--collect-all ` | +| Package ships pretrained model files | `FileNotFoundError` for `.pth.tar`, `.yaml` | `--collect-all ` | +| C library with hardcoded system paths | `FileNotFoundError` for `/usr/share/...` | `--collect-all` + env var in `server.py` | +| `importlib.metadata.version()` | "No package metadata found" | `--copy-metadata ` | +| `torch.load` without `map_location` | CUDA device not available on CPU build | Monkey-patch `torch.load` | +| `torch.from_numpy` on float64 data | dtype mismatch RuntimeError | Cast to `.float()` | +| `token=True` in HF download calls | Auth failure without stored HF token | Use `snapshot_download(token=None)` + `from_local()` | + +## Notes + +- The route and service layers have zero per-engine dispatch points. `main.py` requires zero changes. +- The model config registry in `backends/__init__.py` handles all dispatch automatically. 
+- Use `get_torch_device()` and `model_load_progress()` from `backends/base.py` — don't reimplement device detection or progress tracking. +- Always test with a **clean HuggingFace cache** (no pre-downloaded models from dev). +- Do NOT push or create a release. Hand the build to the user for local testing. diff --git a/.agents/skills/draft-release-notes/SKILL.md b/.agents/skills/draft-release-notes/SKILL.md new file mode 100644 index 0000000..b844c70 --- /dev/null +++ b/.agents/skills/draft-release-notes/SKILL.md @@ -0,0 +1,94 @@ +--- +name: draft-release-notes +description: Use this skill to draft or update the [Unreleased] section of CHANGELOG.md from the actual changes since the last tag. Run this at any point during development to keep a working copy of the release narrative. Does NOT bump versions or create tags. +--- + +# Draft Release Notes + +## Goal + +Update the `[Unreleased]` section at the top of `CHANGELOG.md` with a narrative release story based on the real changes since the last tag. This is a **non-destructive working copy** — run it as many times as you want during development. + +## Workflow + +1. **Identify the last release tag and gather changes.** + + ```bash + LAST_TAG=$(git tag --list "v*" --sort=-v:refname | head -n 1) + echo "Last tag: $LAST_TAG" + ``` + + Then collect raw material from three sources: + + a. **Commit log since last tag:** + ```bash + git log --oneline "$LAST_TAG"..HEAD + ``` + + b. **GitHub-generated release notes preview** (PR titles, new contributors): + ```bash + gh api repos/:owner/:repo/releases/generate-notes \ + -f tag_name="vNEXT" \ + -f target_commitish="$(git rev-parse HEAD)" \ + -f previous_tag_name="$LAST_TAG" \ + --jq '.body' + ``` + + c. **Diff stat for theme analysis:** + ```bash + git diff --stat "$LAST_TAG"..HEAD + ``` + +2. **Draft the release narrative.** + + Write markdown for the `[Unreleased]` section following the format below. 
Do not include the `## [Unreleased]` heading itself — just the body content. + +3. **Update CHANGELOG.md.** + + Replace everything between `## [Unreleased]` and the next `## [` heading with the new draft. Preserve the HTML comment header and all existing release sections below. + + The `[Unreleased]` section must always exist and always be the first section after the header comments. + +4. **Do NOT commit, tag, or bump versions.** Just leave the file modified in the working tree. + +## Release Story Format + +Structure the `[Unreleased]` section like this: + +```markdown +## [Unreleased] + + + + + +### +- Bullet points with specifics +- Reference PRs where available: ([#123](https://github.com/jamiepine/voicebox/pull/123)) + +### +- ... + +### Bug Fixes +- ... +``` + +### Style Guidelines + +- **Factual and specific.** Every claim should trace to a real commit or PR. +- **Narrative over list.** Lead with paragraphs that tell the story, then support with bullets. +- **Group by theme, not by commit.** Cluster related changes under descriptive headings. +- **Reference PRs** where they exist, but don't fabricate them. +- **Skip trivial chores** (typo fixes, CI tweaks) unless they're the bulk of the release. +- **Match the voice of existing releases** — look at the v0.2.1 and v0.2.3 entries in CHANGELOG.md for tone reference. + +## When There Are No Changes + +If `git log "$LAST_TAG"..HEAD` is empty, leave the `[Unreleased]` section empty (just the heading) and tell the user there's nothing to draft. + +## Notes + +- This skill only touches the `[Unreleased]` section. It never modifies stamped release sections. +- The agent can be asked to run this skill at any point — mid-feature, before a PR, or right before cutting a release. +- The `release-bump` skill depends on this draft being up to date before it finalizes. 
diff --git a/.agents/skills/release-bump/SKILL.md b/.agents/skills/release-bump/SKILL.md new file mode 100644 index 0000000..8b52f87 --- /dev/null +++ b/.agents/skills/release-bump/SKILL.md @@ -0,0 +1,124 @@ +--- +name: release-bump +description: Use this skill to finalize a release. It stamps the [Unreleased] changelog section with a version and date, runs bumpversion to update all version files, and creates the release commit and tag. Only run this when you're ready to ship. +--- + +# Release Bump + +## Goal + +Finalize the changelog draft, bump the version across all tracked files, and create a tagged release commit. After this skill runs, the repo has a clean release commit and tag ready to push. + +## Prerequisites + +- `gh` CLI installed and authenticated (`gh auth status`). +- `bumpversion` installed (`pip install bumpversion` or available in the project venv). +- The `[Unreleased]` section of `CHANGELOG.md` should already contain the release narrative. If it's empty or stale, run the `draft-release-notes` skill first. + +## Workflow + +1. **Verify the working tree is clean** (except `CHANGELOG.md` which may have the draft). + + ```bash + git status --porcelain + ``` + + Only `CHANGELOG.md` (and optionally `.agents/` files) should be modified. If there are other uncommitted changes, stop and ask the user to commit or stash them first. + +2. **Determine the bump level.** + + Ask the user if not specified: `patch`, `minor`, or `major`. Check the current version: + + ```bash + grep '^current_version' .bumpversion.cfg + ``` + +3. **Stamp the changelog.** + + Read the current `[Unreleased]` content from `CHANGELOG.md`. Compute the new version (based on bump level and current version). Then: + + a. Replace the `## [Unreleased]` section body with an empty placeholder. + b. Insert a new stamped section immediately after `## [Unreleased]`: + + ```markdown + ## [Unreleased] + + ## [X.Y.Z] - YYYY-MM-DD + + + ``` + + c. 
Update the reference links at the bottom of the file:
+   - Change the `[Unreleased]` link to compare against the new tag
+   - Add a new link for the new version
+
+   ```markdown
+   [Unreleased]: https://github.com/jamiepine/voicebox/compare/vX.Y.Z...HEAD
+   [X.Y.Z]: https://github.com/jamiepine/voicebox/compare/vPREVIOUS...vX.Y.Z
+   ```
+
+4. **Stage the changelog.**
+
+   ```bash
+   git add CHANGELOG.md
+   ```
+
+5. **Run bumpversion.**
+
+   ```bash
+   bumpversion <part> --allow-dirty
+   ```
+
+   The `--allow-dirty` flag is needed because `CHANGELOG.md` is already staged. bumpversion will:
+   - Update version strings in all tracked files (see `.bumpversion.cfg`)
+   - Create a commit with message `Bump version: X.Y.Z -> A.B.C`
+   - Create a tag `vA.B.C`
+
+   The staged `CHANGELOG.md` will be included in this commit automatically.
+
+6. **Verify results.**
+
+   ```bash
+   git show --name-only --stat HEAD
+   git tag --list "v*" --sort=-v:refname | head -n 5
+   ```
+
+   Confirm the commit contains:
+   - `CHANGELOG.md`
+   - `.bumpversion.cfg`
+   - `tauri/src-tauri/tauri.conf.json`
+   - `tauri/src-tauri/Cargo.toml`
+   - `package.json`
+   - `app/package.json`
+   - `tauri/package.json`
+   - `landing/package.json`
+   - `web/package.json`
+   - `backend/__init__.py`
+
+   Confirm the new tag exists.
+
+7. **Do NOT push** unless the user explicitly asks. Report the tag name and suggest:
+
+   ```
+   Ready to push. When you're ready:
+   git push origin main --follow-tags
+   ```
+
+## Version Calculation Reference
+
+Given current version `X.Y.Z`:
+- `patch` -> `X.Y.(Z+1)`
+- `minor` -> `X.(Y+1).0`
+- `major` -> `(X+1).0.0`
+
+## Error Recovery
+
+- If bumpversion fails, the tag won't exist. Fix the issue and re-run — bumpversion is idempotent as long as the tag doesn't already exist.
+- If you need to undo a release commit (before pushing): `git tag -d vX.Y.Z && git reset --soft HEAD~1`
+- Never amend a release commit that has been pushed. 
+ +## Notes + +- When the tag is pushed, the release CI (`.github/workflows/release.yml`) automatically extracts the matching version section from `CHANGELOG.md` and uses it as the GitHub Release body. No manual copy-paste needed. +- The release commit message is controlled by `.bumpversion.cfg` (`Bump version: X.Y.Z -> A.B.C`). Do not override it. +- If you need to manually update the GitHub Release body after the fact: `gh release edit vX.Y.Z --notes-file <(sed -n '/## \[X.Y.Z\]/,/## \[/p' CHANGELOG.md | head -n -1)` diff --git a/.agents/skills/triage-prs/SKILL.md b/.agents/skills/triage-prs/SKILL.md new file mode 100644 index 0000000..2b83b94 --- /dev/null +++ b/.agents/skills/triage-prs/SKILL.md @@ -0,0 +1,299 @@ +--- +name: triage-prs +description: Use this skill to triage the open PR queue before a release. Classifies every open PR into must-merge, candidate, superseded, or deferred; writes a working triage doc; and runs the merge loop end-to-end. Designed for the pre-release "PR speedrun" pass where a solo maintainer wants to clear the inbound backlog in a single session. +--- + +# Triage PRs + +## Goal + +Turn a backlog of open PRs into a shipped set of merges in a single focused session. Produce a tracked, resumable plan (`_PR_TRIAGE.md`), then work it — rebasing where needed, merging in isolation-safe batches, applying post-merge follow-ups, and closing superseded or partially-applicable PRs with credit to their authors. + +This skill pairs with `draft-release-notes` and `release-bump`: triage first, then draft notes against the new main, then cut the release. 
+ +## When to use + +- Before a minor or major release when 10+ open PRs have accumulated +- When you want to unblock merging without losing the narrative of what's landing +- When you know you can't personally review every PR deeply, but need to land the critical subset fast + +## Prerequisites + +- `gh` CLI authenticated against the repo +- A dedicated worktree for PR review (avoid contaminating `main` with checkouts of contributor branches) +- Clarity on the target version — the triage doc is named after it (e.g. `0.4.0_PR_TRIAGE.md`) + +## Workflow + +### 1. Set up an isolated PR-review worktree + +```bash +git worktree list # check for stale ones first +git worktree prune +git worktree add ../voicebox-pr-review -b pr-review- main +``` + +Keep the main worktree for release-prep work (changelog drafts, direct-to-main follow-ups). Keep the review worktree for `gh pr checkout` — each checkout moves HEAD to a contributor branch, which you don't want to do in the main worktree. + +### 2. Gather metadata for every open PR + +```bash +gh pr list --state open --limit 50 --json \ + number,title,author,isDraft,mergeable,mergeStateStatus,files,additions,deletions,reviewDecision,statusCheckRollup,maintainerCanModify \ + --jq '.[] | {num: .number, title, author: .author.login, mergeable, state: .mergeStateStatus, canModify: .maintainerCanModify, changes: "+\(.additions)/-\(.deletions)", files: [.files[].path]}' +``` + +You want, for each PR: +- Size (`+additions/-deletions`) +- Mergeable state (`CLEAN`, `UNSTABLE`, `DIRTY` = conflicts, `UNKNOWN` = GitHub still computing) +- Whether maintainer edits are allowed on the branch (needed later if you rebase for the author) +- File paths touched (helps spot overlaps between PRs) + +`UNKNOWN` is common right after a push to main — just try the merge and see. + +### 3. Classify into tiers + +Sort each PR into exactly one bucket: + +**Tier 1 — Merge:** small, mergeable, fixes a real bug, clean CI, low review cost. 
One-liners, dependency relaxations, targeted safety hardening. These are the easy wins. + +**Tier 2 — Candidate, review:** medium size (50-200 lines), touches more surface area, looks sound but needs a closer read. New user-facing features that fit the product direction. + +**Supersede:** the fix or feature is already covered by something merged. Close with a comment pointing to the superseding PR. Check carefully — "similar title" isn't proof; compare the actual diffs. + +**Defer to next release:** big features, dirty conflicts, draft PRs, anything touching the release pipeline in ways that would introduce risk. Don't merge these in a speedrun — they need dedicated focus. + +### 4. Write the triage doc + +Create `_PR_TRIAGE.md` in the PR-review worktree root. Structure: + +```markdown +# — PR Triage + +Working doc for tracking which open PRs land in . Delete after release cut. + +Last updated: + +## Progress + +**Tier 1: 0 / N merged** +**Tier 2: 0 / M handled** +**Supersede triage: pending** + +--- + +## Merge for — critical bug fixes + +| PR | Status | Size | What it fixes | Why must-have | +|---|---|---|---|---| +| [#123](url) | [ ] | +5/-0 | ... | ... | + +## Strong candidate — needs a quick review + +| PR | Status | Size | Summary | +|---|---|---|---| + +## Close as superseded + +| PR | Status | Reason | +|---|---|---| + +## Defer to + +- [#xxx](url) ... — reason + +--- + +## Order of attack + +1. Close superseded PRs (one-liner comments) +2. Merge tier-1 in dependency-free batches — check file paths don't overlap +3. Review tier-2 individually +4. Rerun `draft-release-notes` to pick up everything +5. Run `release-bump` +``` + +The **Progress** header is the most important part — it's your scoreboard and lets you resume cleanly if the session gets interrupted. + +### 5. Work the loop — per PR + +For each PR in the tier-1 / tier-2 list: + +**a. 
Checkout in the review worktree:** +```bash +cd ../voicebox-pr-review +git checkout pr-review- # reset to neutral base +gh pr checkout +``` + +**b. Read the *actual* commit, not `main..HEAD`:** + +```bash +git show HEAD # the PR's actual changes +git show --stat HEAD # files touched + line counts +``` + +**Do NOT review via `git diff main..HEAD`** if the PR branch is older than main. That diff includes *every commit that landed on main after the PR was forked* as `-` (deletion) lines. A 3-line PR can look like a 700-line revert. This is the single easiest way to misjudge a PR. + +**c. Evaluate concerns:** correctness, scope, interaction with already-merged work, version compatibility (e.g. can't use an API that requires a dependency version we don't yet pin). + +**d. Rebase if the branch is behind main:** +```bash +git fetch origin main +git rebase origin/main +``` + +This is **essential** before squash-merging. GitHub's squash computes `diff(PR-head, merge-base)` — on a stale branch, that diff includes reverting every in-between commit. Rebasing moves the merge-base forward so the squash is clean. + +**e. If maintainer edits are allowed, push the rebase back to the contributor's fork:** +```bash +git remote add https://github.com//.git +git fetch # get their ref first +git push HEAD: --force-with-lease +``` + +This keeps GitHub's PR UI in sync with the rebased state and makes the merge clean from the GitHub side. + +**f. Merge:** +```bash +gh pr merge --squash +``` + +**g. Update the triage doc** — flip the checkbox to `✅ merged ` (use the short SHA from `gh pr view --json mergeCommit --jq '.mergeCommit.oid[0:7]'`). Update the Progress header. + +### 6. Batch tiny fixes + +PRs with ≤5 line changes, clean CI, non-overlapping file paths, and obviously-correct intent (e.g. 
one-line dependency relax, env var add, import path fix) can be merged in a single loop without the review-per-PR ceremony: + +```bash +for pr in 425 384 416 429; do + echo "=== Merging PR $pr ===" + gh pr merge $pr --squash +done +``` + +Verify afterward that each landed cleanly: +```bash +for pr in 425 384 416 429; do + gh pr view $pr --json state,mergeCommit --jq "{pr: $pr, state, sha: .mergeCommit.oid[0:7]}" +done +``` + +### 7. Post-merge follow-ups + +Sometimes a PR is worth merging despite a known minor issue (e.g. incomplete dtype map, stale sentinel cleanup). Don't block the merge; apply the follow-up as a normal branch + PR right after: + +```bash +cd +git pull --ff-only origin main +git checkout -b fix/ +# edit... +git commit -m "fix(): " +git push -u origin fix/ +gh pr create --title "..." --body "Follow-up to #. ..." +``` + +Record both SHAs in the triage doc (`✅ merged + follow-up `). + +**Direct-to-main exception:** only under an explicit, scoped policy (e.g. "release speedrun"). Don't default to it. + +### 8. Supersede: close with a credit-pointing comment + +```bash +gh pr close --comment "Closing — superseded by merged # which landed . Thanks!" +``` + +Check the diffs first — "similar title" is not enough. If the PR is *partially* superseded (the diagnosis is right but only half the changes are still needed), do a partial-apply instead. + +### 9. Partial-apply pattern + +When a PR has both valuable and questionable changes bundled: + +```bash +cd +git pull --ff-only origin main + +# Cherry-pick specific files from the PR branch +git checkout -- + +# Review the staged changes, adjust as needed +git diff --cached + +# Apply any surgical edits to files you don't want to bulk-replace +# (e.g. the PR's file predates a recent main commit you need to preserve) + +# Commit with a trailer crediting the original author +git commit -m "$(cat <<'EOF' + + + + +Co-Authored-By: +EOF +)" +git push ... 
# branch + PR, unless under the direct-to-main exception +``` + +Then close the PR with a comment explaining what was applied and what was dropped, referencing the commit SHA. + +### 10. Keep the doc current + +Every merge, every close, every follow-up → update `_PR_TRIAGE.md`. The doc is your session log. If you're interrupted and resume tomorrow, the doc is the only source of truth for "where am I." + +### 11. When triage is done + +- Every PR in the doc has a terminal status (✅ merged / ✅ closed / deferred) +- Progress header shows N/N for each tier +- Next skill to run is `draft-release-notes` (to regenerate `[Unreleased]` against the new main), then `release-bump` + +You can delete the triage doc after the release ships, or keep it in version history as a record. + +## Gotchas + +- **`main..HEAD` on a stale branch lies.** It shows everything main gained since the branch split as deletions. Always review via `git show HEAD` for the PR's actual commit. +- **Squash-merging an unrebased branch reverts in-between work.** The squash computes `diff(PR-head, merge-base)`. Rebase moves the merge-base forward. +- **`mergeable=UNKNOWN`** is transient — GitHub is recomputing after a push. Just try the merge. +- **Route ordering matters (FastAPI and similar):** `DELETE /history/failed` must be registered *before* `DELETE /history/{id}`, or the parameterized path will consume `"failed"` as an ID. +- **Apple's `-weak_framework` overrides `-framework`** for the same framework, regardless of order — use it via `cargo:rustc-link-arg=-Wl,-weak_framework,Name` when a dependency hard-links something optional. +- **Dependency version floors constrain what you can apply.** Before accepting a kwarg rename like `torch_dtype=` → `dtype=`, check the min-version pin supports it. Sometimes the right move is to cherry-pick half the PR. +- **`cpal::Stream` and similar `!Send` audio types** can't cross `await` points or `spawn_blocking`. 
Sometimes a "not-ideal but correct" sync wait is the best available fix; flag but don't block. +- **PyTorch nightly builds are not shippable for releases** — non-deterministic, can regress between runs. If a PR suggests switching to nightly to fix a GPU issue, prefer `TORCH_CUDA_ARCH_LIST=...+PTX` or wait for stable support instead. + +## Canonical commands reference + +```bash +# Bulk PR metadata +gh pr list --state open --limit 50 --json number,title,author,mergeable,mergeStateStatus,additions,deletions,maintainerCanModify,files + +# Detailed single-PR view +gh pr view --json body,author,headRefName,baseRefName,mergeable,maintainerCanModify,files,statusCheckRollup + +# The actual commit, not the branch-vs-main diff +git show HEAD +git show --stat HEAD +gh pr diff + +# Rebase contributor branch onto current main +git fetch origin main && git rebase origin/main + +# Push rebase back to contributor fork (maintainerCanModify=true required) +git remote add https://github.com//.git +git fetch +git push HEAD: --force-with-lease + +# Merge +gh pr merge --squash + +# Confirm merge SHA for triage doc +gh pr view --json state,mergeCommit --jq '{state, sha: .mergeCommit.oid[0:7]}' + +# Close superseded +gh pr close --comment "Closing — superseded by merged #. Thanks!" +``` + +## Notes + +- **Never review a stale branch via `main..HEAD`.** This is the single most important line in this skill. +- **The triage doc is the session state.** Lose the doc, lose the session. Update it after every action. +- **Credit contributors even on partial-applies.** Use `Co-Authored-By:` trailers and close comments that link to the applied commit. +- **Don't let perfect be the enemy of shipped.** A fix that goes from "broken" to "works with a minor known issue" is a strict improvement. Flag the issue, file a follow-up, merge the fix. 
diff --git a/.biomeignore b/.biomeignore new file mode 100644 index 0000000..5e32ca6 --- /dev/null +++ b/.biomeignore @@ -0,0 +1,18 @@ +# Dependencies +node_modules +bun.lockb + +# Build outputs +dist +target +.tauri + +# Generated files +app/src/lib/api + +# Config files (don't lint/format) +*.config.js +*.config.ts + +# Tailwind CSS files (contains @tailwind directives) +**/index.css diff --git a/.bumpversion.cfg b/.bumpversion.cfg new file mode 100644 index 0000000..241a703 --- /dev/null +++ b/.bumpversion.cfg @@ -0,0 +1,39 @@ +[bumpversion] +current_version = 0.4.5 +commit = True +tag = True +tag_name = v{new_version} +tag_message = Release v{new_version} +message = Bump version: {current_version} → {new_version} + +[bumpversion:file:tauri/src-tauri/tauri.conf.json] +search = "version": "{current_version}" +replace = "version": "{new_version}" + +[bumpversion:file:tauri/src-tauri/Cargo.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:package.json] +search = "version": "{current_version}" +replace = "version": "{new_version}" + +[bumpversion:file:app/package.json] +search = "version": "{current_version}" +replace = "version": "{new_version}" + +[bumpversion:file:tauri/package.json] +search = "version": "{current_version}" +replace = "version": "{new_version}" + +[bumpversion:file:landing/package.json] +search = "version": "{current_version}" +replace = "version": "{new_version}" + +[bumpversion:file:web/package.json] +search = "version": "{current_version}" +replace = "version": "{new_version}" + +[bumpversion:file:backend/__init__.py] +search = __version__ = "{current_version}" +replace = __version__ = "{new_version}" diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..fed7d0a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,45 @@ +# Version control +.git +.github +.gitignore + +# Desktop-only (not needed in web container) +tauri/ +landing/ +docs/ +mlx-test/ +scripts/ + +# 
Dependencies & build artifacts (rebuilt in Docker) +node_modules/ +__pycache__/ +*.pyc +*.pyo +*.egg-info/ +dist/ +build/ +*.spec + +# Data (will be bind-mounted) +data/ +backend/data/ + +# IDE & OS +.vscode/ +.idea/ +*.swp +*.swo +.DS_Store +Thumbs.db + +# Config files not needed in container +biome.json +.biomeignore +.bumpversion.cfg +.npmrc +Makefile +CONTRIBUTING.md +SECURITY.md +LICENSE +README.md +backend/README.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bcc1927 --- /dev/null +++ b/.gitignore @@ -0,0 +1,70 @@ +# Dependencies +node_modules/ +bun.lockb +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +venv/ +env/ +ENV/ +*.prompt +# Build outputs +dist/ +build/ +*.egg-info/ +*.egg +target/ +*.app +*.dmg +*.exe +*.msi +*.deb +*.AppImage + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Data (user-generated) +data/ +!data/.gitkeep + +# Logs +*.log +logs/ + +# Environment +.env +.env.local + +# Generated files +app/openapi.json +tauri/src-tauri/binaries/* +tauri/src-tauri/gen/Assets.car +tauri/src-tauri/gen/voicebox.icns +tauri/src-tauri/gen/partial.plist + +# PyInstaller +*.spec + +# Windows artifacts +nul + +# Temporary +tmp/ +temp/ +*.tmp + +# E2E test artifacts +backend/tests/results/ +backend/tests/fixtures/reference_voice.wav +backend/tests/fixtures/reference_voice.txt diff --git a/.npmrc b/.npmrc new file mode 100644 index 0000000..f3ecbbc --- /dev/null +++ b/.npmrc @@ -0,0 +1,2 @@ +# Force bun usage +engine-strict=true diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..1559d69 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,684 @@ + + + + + +# Changelog + +## [Unreleased] + +## [0.4.5] - 2026-04-22 + +Second hotfix for the "offline mode is enabled" crash on model load. 
0.4.4 reverted the inference-path offline guards but kept the same trap on the load path, so users who updated to 0.4.4 kept hitting the exact error the release was supposed to fix ([#526](https://github.com/jamiepine/voicebox/issues/526)). This release removes the load-path guards and patches the transformers tokenizer load to be robust to HuggingFace metadata failures at the source, so the class of bug can't recur. + +### Reliability + +- **Load no longer fails with "offline mode is enabled"** ([#530](https://github.com/jamiepine/voicebox/pull/530), fixes [#526](https://github.com/jamiepine/voicebox/issues/526)). transformers 4.57.x added an unconditional `huggingface_hub.model_info()` call inside `AutoTokenizer.from_pretrained` (via `_patch_mistral_regex`) that runs for every non-local repo load, regardless of cache state or whether the target model is actually a Mistral variant. The load-time `HF_HUB_OFFLINE` guard from 0.4.2 turned that into a hard crash for cached online users the moment 0.4.4 removed the inference-path guard that had been masking the problem. Fix wraps `_patch_mistral_regex` so any exception from the HF metadata check is caught and the tokenizer is returned unchanged — matching the success-path behavior for non-Mistral repos. The wrapper installs at `backend.backends` import time so it covers Qwen Base, Qwen CustomVoice, TADA, and every other transformers-backed engine on Windows, Linux, and CUDA alike. The load-time `force_offline_if_cached` guards were removed — with the wrapper in place they provide zero value and only risk re-introducing the same failure mode. +- **No more 30s pause when generating without a network.** The HuggingFace metadata timeout called out as a known caveat in 0.4.4 is covered by the same patch; offline users no longer wait for the check to time out before load completes. 
+ +## [0.4.4] - 2026-04-21 + +Hotfix for a regression in 0.4.3 where generation and transcription could fail outright with "offline mode is enabled" even when the user was online. + +### Reliability + +- **Inference no longer fails with "offline mode is enabled" while online** ([#524](https://github.com/jamiepine/voicebox/pull/524), reverts the inference-path guards from [#503](https://github.com/jamiepine/voicebox/pull/503)). 0.4.3 wrapped every inference body (`generate`, `transcribe`, `create_voice_clone_prompt`) with a process-wide `HF_HUB_OFFLINE` flip to stop lazy HuggingFace lookups from hanging when the network drops mid-inference ([#462](https://github.com/jamiepine/voicebox/issues/462)). That flag also blocks legitimate metadata calls (e.g. `HfApi().model_info` for revision resolution) so online users started seeing generation fail outright. Inference now runs with the process's default HF state. Load-time offline guards — which weren't the source of the regression — stay in place. + +**Known caveat**: users generating without an internet connection may see brief pauses during inference while HuggingFace metadata lookups time out (typically ~30s, after which the library recovers). A proper offline-mode toggle is planned for 0.4.5. + +## [0.4.3] - 2026-04-20 + +A patch focused on two user-impacting reliability fixes: macOS DMG notarization (unblocks `brew install voicebox` on macOS 15 Sequoia and fixes spurious "app isn't signed" Gatekeeper dialogs on older Intel Macs) and Kokoro Japanese voice initialization on fresh installs. + +### macOS + +- **DMGs are now notarized and stapled** ([#523](https://github.com/jamiepine/voicebox/pull/523)). Tauri's bundler notarizes the `.app` inside the DMG but ships the DMG wrapper itself unnotarized. 
Gatekeeper rejects that on macOS 15 Sequoia (confirmed by Homebrew Cask CI failing on both ARM and Intel Sequoia runners) and causes the "the app is not signed" dialog on older Intel Macs when Apple's notarization servers are slow or unreachable ([#509](https://github.com/jamiepine/voicebox/issues/509)).
No partial/English-fallback surfaces. +- **Every user-visible surface translated**: Stories (list, content editor, dialogs, toasts), Effects (list, detail, chain editor, built-in preset names), Voices (table, search, inspector, Create/Edit modal, audio sample panels), Audio Channels (list, dialogs, device picker), history + story dropdown menus, ProfileCard / ProfileList / HistoryTable, and the unsupported-model note. +- **Relative dates** localize via `date-fns` locale objects (`3 days ago` → `3 天前` / `3 日前`) — `Intl.RelativeTimeFormat` doesn't produce the phrasing we use in the history table. +- **Dev-build version suffix** (`v0.4.2 (dev)` / `(开发版)` / `(開發版)` / `(開発版)`) is now locale-aware. +- **559 translation keys** across all four locales. + +### Reliability +- **`HF_HUB_OFFLINE` now guards every inference path** ([#503](https://github.com/jamiepine/voicebox/pull/503)) — some engines were still attempting a HuggingFace metadata roundtrip on first load when offline mode was enabled, causing hangs on airgapped or flaky networks. +- **Chatterbox reference samples are preprocessed instead of rejected** ([#502](https://github.com/jamiepine/voicebox/pull/502)) — samples outside the expected sample rate or channel layout are resampled to match, rather than failing with an opaque error. +- **MLX Qwen 0.6B repo path fixed** ([#501](https://github.com/jamiepine/voicebox/pull/501)) — now points at the published `mlx-community` repo so the model actually downloads on Apple Silicon. +- **macOS system audio survives backgrounding** ([#486](https://github.com/jamiepine/voicebox/pull/486), closes [#41](https://github.com/jamiepine/voicebox/issues/41)) — WKWebView was tearing down the audio session when the app lost focus, silently killing system-audio capture. 
+- **MLX backend `miniaudio` dependency pinned** ([#506](https://github.com/jamiepine/voicebox/pull/506)) — `mlx_audio.stt` needs it at runtime and nothing else transitively pulled it in, so `--no-deps` installs were breaking on first use. + +### Landing / Docs +- **New `/download` page** ([#487](https://github.com/jamiepine/voicebox/pull/487)) — no more dumping first-time visitors onto the GitHub releases list. The API example snippet on the landing page also got an accuracy pass. +- **Download redirects work behind reverse proxies** ([#498](https://github.com/jamiepine/voicebox/pull/498)) — uses the public origin instead of `localhost` when resolving platform-specific installer URLs. +- **MDX docs audited against the multi-engine backend** ([#484](https://github.com/jamiepine/voicebox/pull/484)) — stale single-engine assumptions removed. +- **Three more tutorials + mobile navbar / hero CTA fixes** ([#483](https://github.com/jamiepine/voicebox/pull/483)). + +### Linux +- **Still not shipping.** The re-enable attempt ([#488](https://github.com/jamiepine/voicebox/pull/488)) landed on `main` but CI still hangs in the `tauri-action` bundler step on `ubuntu-22.04` — no output for 25+ minutes after `rpm` bundling, even with `createUpdaterArtifacts: false` and `--bundles deb,rpm`. The matrix entry is disabled again for 0.4.2; the ubuntu-specific setup steps stay in the workflow so re-enabling is a one-line change once we identify the hang. Next release will take another pass. + +### New Contributors +- [@shekharyv](https://github.com/shekharyv) — download redirects behind reverse proxies ([#498](https://github.com/jamiepine/voicebox/pull/498)) + +## [0.4.1] - 2026-04-18 + +A fast follow-up to 0.4.0 focused on making the new engines actually load in the production binary — plus generation cancellation, Linux system-audio capture, and the repo's first PR-time type check. Five first-time contributors shipped in this release. 
+ +0.4.0 introduced three new TTS engines, but the frozen PyInstaller binary tripped over several Python-ecosystem quirks that don't show up in the dev venv: `transformers` opening `.py` sources at runtime, `scipy.stats._distn_infrastructure` hitting a frozen-importer `NameError`, and `chatterbox-multilingual` failing to find its Chinese segmenter dictionary. This release patches all of those in one sweep. + +### Frozen-Binary Reliability ([#438](https://github.com/jamiepine/voicebox/pull/438)) +- **Kokoro** now bundles `.py` sources alongside `.pyc` via `--collect-all kokoro` so `transformers`' `_can_set_attn_implementation` regex scan can read them — previously `FileNotFoundError: kokoro/modules.py` killed Kokoro loading in production builds +- **Chatterbox Multilingual** now bundles `spacy_pkuseg/dicts/default.pkl` and the package's native `.so` extensions via `--collect-all spacy_pkuseg` — previously the Chinese word segmenter crashed with `FileNotFoundError` on first load +- **scipy.stats._distn_infrastructure** — new runtime hook source-patches the trailing `del obj` (which raises `NameError` under PyInstaller's frozen importer because the preceding list comprehension evaluates empty) to `globals().pop('obj', None)`, unblocking `librosa` → `scipy.signal` → `scipy.stats` for every TTS engine that depends on librosa +- **transformers.masking_utils** — same runtime hook forces `_is_torch_greater_or_equal_than_2_6 = False` so the older `sdpa_mask_older_torch` path is selected; the 2.6+ path uses `TransformGetItemToIndex()`, a real `torch._dynamo` graph transform our permissive stub can't reproduce +- **torch._dynamo** — no-op stub replaces the real module before `transformers` imports it, preventing the `torch._numpy._ufuncs` import crash (`NameError: name 'name' is not defined`) that blocked Kokoro and every engine pulling in `flex_attention` +- `.spec` paths are now repo-relative instead of absolute, so the generated spec is portable across machines and CI + 
+### Generation +- **Cancel queued or running generations** ([#444](https://github.com/jamiepine/voicebox/pull/444)) — new `/generate/{id}/cancel` endpoint and a Stop button on the history row while generating. The serial queue now tracks per-ID state (queued / running / cancelled) so queued jobs are skipped before the worker picks them up and running jobs are `.cancel()`-ed mid-flight; `run_generation` catches `CancelledError` and marks the row `failed` with a "cancelled" error. +- **Legacy `data/` path prefix resolution** ([#440](https://github.com/jamiepine/voicebox/pull/440)) — generations stored with the old `data/` prefix under pre-0.4 installs now resolve correctly after the storage root moved, fixing 404s for historical audio. + +### Model Migration +- Migration dialog no longer hangs when the cache is empty ([#439](https://github.com/jamiepine/voicebox/pull/439)) — the backend now emits a completion SSE event even when zero models are moved. +- Storage-change flow surfaces a toast when there's nothing to migrate ([#433](https://github.com/jamiepine/voicebox/pull/433)) instead of proceeding with a no-op move and restarting the server. +- Deleting all generations from a voice profile now deletes the associated version files and DB rows too ([#447](https://github.com/jamiepine/voicebox/pull/447)) — previously orphaned versions accumulated in storage. + +### Platform +- **Linux system audio capture** ([#457](https://github.com/jamiepine/voicebox/pull/457)) — `cpal`'s ALSA backend doesn't expose PulseAudio/PipeWire monitor sources by name, so the previous device-name search never matched and silently fell back to the microphone. Detection now uses `pactl get-default-sink` + `pactl list short sources` and routes via `PULSE_SOURCE`, with the name-based search retained as a fallback when `pactl` is absent. 
+ +### Frontend CI +- First PR-time quality gate ([#418](https://github.com/jamiepine/voicebox/pull/418)) — new `.github/workflows/ci.yml` runs `bun run typecheck` + `bun run build:web` on every PR. Fixed pre-existing type issues that were being suppressed with `@ts-expect-error`, cleaned up a dep-array typo (`[platform.metadata.isTauricheckOnMountcheckForUpdates]`) in `useAutoUpdater`, and removed 100+ lines of dead `ModelItem` code from `ModelManagement.tsx`. +- Follow-up: widened `apiClient.migrateModels()` return type to include `moved` and `errors` so the storage-change handler typechecks against the real backend response ([#470](https://github.com/jamiepine/voicebox/pull/470)). + +### Docs +- Clarified in the Quick Start + README that paralinguistic tags (`[laugh]`, `[sigh]`) only work with Chatterbox Turbo; other engines read them as literal text ([#450](https://github.com/jamiepine/voicebox/pull/450)). + +### New Contributors +- [@Bortlesboat](https://github.com/Bortlesboat) — generation cancellation (#444) +- [@gaojulong](https://github.com/gaojulong) — migration dialog hang fix (#439) +- [@fuleinist](https://github.com/fuleinist) — migration no-op toast (#433) +- [@erionjuniordeandrade-a11y](https://github.com/erionjuniordeandrade-a11y) — frontend CI + type hardening (#418) +- [@estefrac](https://github.com/estefrac) — Linux pactl system-audio capture (#457) + +## [0.4.0] - 2026-04-16 + +The biggest Voicebox release yet. Three new TTS engines bring the lineup to **seven** — HumeAI TADA, Kokoro 82M, and Qwen CustomVoice join Qwen3-TTS, LuxTTS, Chatterbox Multilingual, and Chatterbox Turbo. GPU support broadens to Intel Arc (XPU) and NVIDIA Blackwell (RTX 50-series), with runtime diagnostics that warn when your PyTorch build doesn't match your GPU. The CUDA backend is now split into independently versioned server and library archives, so upgrading no longer redownloads 4 GB of PyTorch/CUDA DLLs. 
+ +This release also marks a big community moment: **13 new contributors** shipped fixes and features in 0.4.0. Thirty-plus bug fixes target the most-reported issues in the tracker — numpy 2.x TTS crashes, Windows background-server reliability, macOS 11 launch failures, audio playback silence, Stories clip-splitting races, history status staleness, and more. + +### New TTS Engines + +#### HumeAI TADA — Expressive English & Multilingual ([#296](https://github.com/jamiepine/voicebox/pull/296)) +- Added `tada-1b` (English) and `tada-3b-ml` (multilingual) backends +- Replaced `descript-audio-codec` with a lightweight DAC shim to cut dependencies +- Switched audio decoding to `soundfile` to sidestep `torchcodec` bundling issues +- Redirected gated Llama tokenizer lookups to an ungated mirror so model loading works out of the box +- Fixed tokenizer patch that was corrupting `AutoTokenizer` for other engines +- Fixed TorchScript error in frozen builds + +#### Kokoro 82M — Fast Lightweight TTS ([#325](https://github.com/jamiepine/voicebox/pull/325)) +- Added Kokoro 82M engine with a new voice profile type system that distinguishes preset voices from cloned profiles +- Profile grid now handles engine compatibility directly — removed redundant dropdown filtering +- Tightened Kokoro profile handling so preset voices can't be edited like cloned profiles + +#### Qwen CustomVoice ([#328](https://github.com/jamiepine/voicebox/pull/328)) +- Added `qwen-custom-voice` preset engine backed by Qwen3-TTS +- Enforced preset/profile engine compatibility across the generation flow +- Floating generator now shows all engines instead of silently filtering + +### Voice Profile UX + +Until 0.4, every engine in Voicebox was a cloning model, so every voice profile was usable with every engine and the profile grid just showed them all. Introducing Kokoro and Qwen CustomVoice — which work from preset voices rather than cloned samples — broke that assumption for the first time. 
An early cut on `main` filtered the grid by the selected engine, which left users running pre-release builds thinking their cloned voices had vanished whenever they switched to a preset-only engine. + +This release ships the resolution before it ever reaches a tagged version: + +- **Grey-out instead of filter** — all profiles are always visible; unsupported ones render dimmed with a compatibility hint at the bottom of the grid +- **Auto-switch on selection** — clicking a greyed-out profile selects it AND switches the engine to a compatible one, instead of silently doing nothing +- **Instruct toggle restored for Qwen CustomVoice** — the floating generate box now reveals a delivery-instructions input (tone, emotion, pace) when CustomVoice is selected. Hidden across the board while the new multi-engine lineup was stabilizing because most engines don't honor the kwarg; now conditionally exposed only for the one engine that was actually trained for instruction-based style control +- Supported profiles sort first; the grid scrolls the selected profile into view after engine/sort changes +- Fixed engine desync on tab navigation — the form now initializes its engine from the store +- Fixed the disabled-and-selected card click edge case by bouncing selection to re-trigger the auto-switch +- Cleaned up scroll effect timers (requestAnimationFrame + setTimeout) to prevent stale DOM writes on unmount or rapid selection changes + +### GPU & Platform + +#### Intel Arc (XPU) Support ([#320](https://github.com/jamiepine/voicebox/pull/320)) +- First-class Intel Arc support across all PyTorch-based backends +- Device-aware seeding, XPU detection in the GPU status panel, and setup flow detection +- Reports correct device name and VRAM in settings + +#### Blackwell / RTX 50-series Support ([#316](https://github.com/jamiepine/voicebox/pull/316), [#401](https://github.com/jamiepine/voicebox/pull/401)) +- Upgraded the CUDA backend from cu126 → cu128 for RTX 50-series support +- Added 
`sm_120+PTX` to the CUDA build via `TORCH_CUDA_ARCH_LIST` for forward-compatibility with Blackwell architectures (closes 5 open reports: #386, #395, #396, #399, #400) +- GPU settings UI fixes around install/uninstall state + +#### GPU Compatibility Diagnostics ([#367](https://github.com/jamiepine/voicebox/pull/367), adapted) +- New `check_cuda_compatibility()` compares the current device's compute capability against the bundled PyTorch's architecture list +- Health endpoint exposes a `gpu_compatibility_warning` field so the UI can surface mismatches +- Startup logs a `WARN` when the installed PyTorch build doesn't support the detected GPU +- GPU status label shows `[UNSUPPORTED - see logs]` — no more silent "no kernel image" failures + +#### Split CUDA Backend ([#298](https://github.com/jamiepine/voicebox/pull/298)) +- CUDA backend now ships as two independently versioned archives: a small server binary and a large libs archive (the ~4 GB of PyTorch/CUDA DLLs) +- Upgrading Voicebox no longer redownloads the libs archive when only the server binary changed +- Added `asyncio.Lock` around `download_cuda_binary()` so auto-update and manual download can't race on the same temp file ([#428](https://github.com/jamiepine/voicebox/pull/428)) +- Updated `package_cuda.py` for PyInstaller 6.18 onedir layout +- Temp archives are always cleaned up on failure, even when the install aborts mid-extract + +### Bug Fixes + +#### Critical: TTS Generation +- **numpy 2.x `torch.from_numpy` crash** ([#361](https://github.com/jamiepine/voicebox/pull/361)) — torch compiled against numpy 1.x ABI fails silently when paired with numpy 2.x, causing `RuntimeError: Numpy is not available` / `Unable to create tensor` on every TTS request in bundled macOS Intel / Rosetta builds. Pinned `numpy<2.0` in requirements and added a PyInstaller runtime hook with a `ctypes.memmove` fallback as belt-and-suspenders. 
Hardened afterward to raise on unknown dtypes instead of silently reinterpreting bytes as float32. + +#### Platform Reliability +- **Windows background server** ([#402](https://github.com/jamiepine/voicebox/pull/402)) — "keep server running after close" now actually keeps the server running. The HTTP `/watchdog/disable` request could lose the race against process exit on Windows; added a `.keep-running` sentinel file as a synchronous fallback, with stale-sentinel cleanup on startup to avoid orphan server processes +- **macOS 11 launch crash** ([#424](https://github.com/jamiepine/voicebox/pull/424)) — weak-linked ScreenCaptureKit so the app can launch on macOS < 12.3 instead of crashing at dyld resolution. Gated system audio capture behind a real `sw_vers` version check so unsupported systems cleanly advertise "not available" rather than crashing at runtime +- **macOS Intel (x86_64) setup** ([#416](https://github.com/jamiepine/voicebox/pull/416)) — relaxed `torch>=2.7.0` → `torch>=2.2.0`. PyTorch dropped pre-built x86_64 wheels after 2.2.2, so Intel Mac devs could no longer `pip install`. 
Now resolves to the latest compatible torch per platform +- **Offline model loading** ([#318](https://github.com/jamiepine/voicebox/pull/318)) — Qwen TTS and Whisper force offline mode when loading cached models, so startup works without network access +- **GUI startup with external server** ([#319](https://github.com/jamiepine/voicebox/pull/319)) — fixed GUI launch when pointed at a remote/external server, and added data refresh on server switch; hardened health validation and error handling +- **Qwen3-TTS cache split on Windows** (adapted from [#218](https://github.com/jamiepine/voicebox/pull/218)) — route `Qwen3TTSModel.from_pretrained` through `hf_constants.HF_HUB_CACHE` so the speech tokenizer and `preprocessor_config.json` resolve from a single cache root +- **Qwen3-TTS bundling** ([#305](https://github.com/jamiepine/voicebox/pull/305)) — bundle `qwen_tts` source files in the PyInstaller build to fix `inspect.getsource` errors in frozen builds +- **Backend import paths** ([#345](https://github.com/jamiepine/voicebox/pull/345)) — moved lazy imports to top-level with absolute paths to resolve the "Failed to Save" preset error caused by `ModuleNotFoundError` in production builds +- **Effects service import** ([#384](https://github.com/jamiepine/voicebox/pull/384)) — fixed `ModuleNotFoundError` on preset create/update by switching to relative imports (#349) + +#### Audio & Playback +- **cpal stream silent playback** ([#405](https://github.com/jamiepine/voicebox/pull/405)) — `cpal::Stream` was dropped on function return immediately after `play()`, causing every playback to fall silent. Now holds the stream until either the buffer drains or the stop flag fires (#404) + +#### Stories & History +- **Clip-splitting race** ([#403](https://github.com/jamiepine/voicebox/pull/403)) — rapid double-clicks on split could race through `split_story_item` with inconsistent state. 
Added `with_for_update()` row locking on the backend and an `isPending` guard on the frontend (#366) +- **History `status` staleness** ([#394](https://github.com/jamiepine/voicebox/pull/394)) — `GET /history/{id}` was hardcoding `status="completed"` regardless of the DB row, breaking any client polling for job completion. Now returns `status`, `error`, `engine`, `model_size`, and `is_favorited` from the actual row +- **"Clear failed" bulk button** ([#412](https://github.com/jamiepine/voicebox/pull/412)) — new `DELETE /history/failed` endpoint and a header strip showing `"N failed generations"` with a Clear button, complementing the per-row trash icon added in #321 (#410) +- **Delete failed generations** ([#321](https://github.com/jamiepine/voicebox/pull/321)) — added a trash icon next to the retry button so failed entries can be cleaned up without having to retry first + +#### Security & Safety +- **Voice prompt cache hardening** ([#429](https://github.com/jamiepine/voicebox/pull/429)) — `torch.load(weights_only=True)` on cached voice prompts per PyTorch 2.6 recommendation; replaced string-based SPA path guard with `Path.is_relative_to()` for more robust path-traversal protection + +#### Infrastructure & Docker +- **Docker web build** ([#344](https://github.com/jamiepine/voicebox/pull/344)) — include `CHANGELOG.md` in the Docker web build so the in-app changelog page works in Docker deployments +- **Docker numba cache** ([#425](https://github.com/jamiepine/voicebox/pull/425)) — set `NUMBA_CACHE_DIR` in docker-compose so numba can write its JIT cache in container runtime (#308) +- **Relative media paths** ([#332](https://github.com/jamiepine/voicebox/pull/332)) — media paths now stored relative to the configured data dir rather than resolved against CWD, so the data directory is portable between installs + +### Developer Tooling + +- New `triage-prs` agent skill — encodes the end-to-end PR-speedrun workflow (classification → triage doc → rebase → squash-merge → 
follow-ups) so future release cycles can reproduce it +- Rewrote the TTS engine guide with the patterns learned from adding TADA and Kokoro +- Added the API refactor plan and CUDA libs addon design doc +- Fixed broken links in the Get Started section ([#332](https://github.com/jamiepine/voicebox/pull/332)) + +### New Contributors + +Huge thank you to everyone who contributed their first PR to Voicebox in this release: + +[@liorshahverdi](https://github.com/liorshahverdi), [@nicoschtein](https://github.com/nicoschtein), [@ArfianID](https://github.com/ArfianID), [@aimaaaimaa](https://github.com/aimaaaimaa), [@maxmcoding](https://github.com/maxmcoding), [@Khalodddd](https://github.com/Khalodddd), [@LuisSambrano](https://github.com/LuisSambrano), [@shaun0927](https://github.com/shaun0927), [@malletfils](https://github.com/malletfils), [@mvanhorn](https://github.com/mvanhorn), [@kuishou68](https://github.com/kuishou68), [@txhno](https://github.com/txhno), [@MukundaKatta](https://github.com/MukundaKatta) + +## [0.3.0] - 2026-03-17 + +This release rewrites the backend into a modular architecture, overhauls the settings UI into routed sub-pages, fixes audio player freezing, migrates documentation to Fumadocs, and ships a batch of bug fixes targeting the most-reported issues from the tracker. + +The backend's 3,000-line monolith `main.py` has been decomposed into domain routers, a services layer, and a proper database package. A style guide and ruff configuration now enforce consistency. On the frontend, settings have been split into dedicated routed pages with server logs, a changelog viewer, and an about page. The audio player no longer freezes mid-playback, and model loading status is now visible in the UI. Seven user-reported bugs have been fixed, including server crashes during sample uploads, generation list staleness, cryptic error messages, and CUDA support for RTX 50-series GPUs. 
+ +### Settings Overhaul ([#294](https://github.com/jamiepine/voicebox/pull/294)) +- Split settings into routed sub-tabs: General, Generation, GPU, Logs, Changelog, About +- Added live server log viewer with auto-scroll +- Added in-app changelog page that parses `CHANGELOG.md` at build time +- Added About page with version info, license, and generation folder quick-open +- Extracted reusable `SettingRow` component for consistent setting layouts + +### Audio Player Fix ([#293](https://github.com/jamiepine/voicebox/pull/293)) +- Fixed audio player freezing during playback +- Improved playback UX with better state management and listener cleanup +- Fixed restart race condition during regeneration +- Added stable keys for audio element re-rendering +- Improved accessibility across player controls + +### Backend Refactor ([#285](https://github.com/jamiepine/voicebox/pull/285)) +- Extracted all routes from `main.py` into 13 domain routers under `backend/routes/` — `main.py` dropped from ~3,100 lines to ~10 +- Moved CRUD and service modules into `backend/services/`, platform detection into `backend/utils/` +- Split monolithic `database.py` into a `database/` package with separate `models`, `session`, `migrations`, and `seed` modules +- Added `backend/STYLE_GUIDE.md` and `pyproject.toml` with ruff linting config +- Removed dead code: unused `_get_cuda_dll_excludes`, stale `studio.py`, `example_usage.py`, old `Makefile` +- Deduplicated shared logic across TTS backends into `backends/base.py` +- Improved startup logging with version, platform, data directory, and database stats +- Fixed startup database session leak — sessions now rollback and close in `finally` block +- Isolated shutdown unload calls so one backend failure doesn't block the others +- Handled null duration in `story_items` migration +- Reject model migration when target is a subdirectory of source cache + +### Documentation Rewrite ([#288](https://github.com/jamiepine/voicebox/pull/288)) +- Migrated docs 
site from Mintlify to Fumadocs (Next.js-based) +- Rewrote introduction and root page with content from README +- Added "Edit on GitHub" links and last-updated timestamps on all pages +- Generated OpenAPI spec and auto-generated API reference pages +- Removed stale planning docs (`CUDA_BACKEND_SWAP`, `EXTERNAL_PROVIDERS`, `MLX_AUDIO`, `TTS_PROVIDER_ARCHITECTURE`, etc.) +- Sidebar groups now expand by default; root redirects to `/docs` +- Added OG image metadata and `/og` preview page + +### UI & Frontend +- Added model loading status indicator and effects preset dropdown ([3187344](https://github.com/jamiepine/voicebox/commit/3187344)) +- Fixed take-label race condition during regeneration +- Added accessible focus styling to select component +- Softened select focus indicator opacity +- Addressed 4 critical and 12 major issues from CodeRabbit review + +### Bug Fixes ([#295](https://github.com/jamiepine/voicebox/pull/295)) +- Fixed sample uploads crashing the server — audio decoding now runs in a thread pool instead of blocking the async event loop ([#278](https://github.com/jamiepine/voicebox/issues/278)) +- Fixed generation list not updating when a generation completes — switched to `refetchQueries` for reliable cache busting, added SSE error fallback, and page reset on completion ([#231](https://github.com/jamiepine/voicebox/issues/231)) +- Fixed error toasts showing `[object Object]` instead of the actual error message ([#290](https://github.com/jamiepine/voicebox/issues/290)) +- Added Whisper model selection (`base`, `small`, `medium`, `large`, `turbo`) and expanded language support to the `/transcribe` endpoint ([#233](https://github.com/jamiepine/voicebox/issues/233)) +- Upgraded CUDA backend build from cu121 to cu126 for RTX 50-series (Blackwell) GPU support ([#289](https://github.com/jamiepine/voicebox/issues/289)) +- Handled client disconnects in SSE and streaming endpoints to suppress `[Errno 32] Broken Pipe` errors 
([#248](https://github.com/jamiepine/voicebox/issues/248)) +- Fixed Docker build failure from pip hash mismatch on Qwen3-TTS dependencies ([#286](https://github.com/jamiepine/voicebox/issues/286)) +- Added 50 MB upload size limit with chunked reads to prevent unbounded memory allocation on sample uploads +- Eliminated redundant double audio decode in sample processing pipeline + +### Platform Fixes +- Replaced `netstat` with `TcpStream` + PowerShell for Windows port detection ([#277](https://github.com/jamiepine/voicebox/pull/277)) +- Fixed Docker frontend build and cleaned up Docker docs +- Fixed macOS download links to use `.dmg` instead of `.app.tar.gz` +- Added dynamic download redirect routes to landing site + +### Release Tooling +- Added `draft-release-notes` and `release-bump` agent skills +- Wired CI release workflow to extract notes from `CHANGELOG.md` for GitHub Releases +- Backfilled changelog with all historical releases + +## [0.2.3] - 2026-03-15 + +The "it works in dev but not in prod" release. This version fixes a series of PyInstaller bundling issues that prevented model downloading, loading, generation, and progress tracking from working in production builds. + +### Model Downloads Now Actually Work + +The v0.2.1/v0.2.2 builds could not download or load models that weren't already cached from a dev install. This release fixes the entire chain: + +- **Chatterbox, Chatterbox Turbo, and LuxTTS** all download, load, and generate correctly in bundled builds +- **Real-time download progress** — byte-level progress bars now work in production. The root cause: `huggingface_hub` silently disables tqdm progress bars based on logger level, which prevented our progress tracker from receiving byte updates. We now force-enable the internal counter regardless. 
+- **Fixed Python 3.12.0 `code.replace()` bug** — the macOS build was on Python 3.12.0, which has a [known CPython bug](https://github.com/pyinstaller/pyinstaller/issues/7992) that corrupts bytecode when PyInstaller rewrites code objects. This caused `NameError: name 'obj' is not defined` crashes during scipy/torch imports. Upgraded to Python 3.12.13. + +### PyInstaller Fixes + +- Collect all `inflect` files — `typeguard`'s `@typechecked` decorator calls `inspect.getsource()` at import time, which needs `.py` source files, not just bytecode. Fixes LuxTTS "could not get source code" error. +- Collect all `perth` files — bundles the pretrained watermark model (`hparams.yaml`, `.pth.tar`) needed by Chatterbox at runtime +- Collect all `piper_phonemize` files — bundles `espeak-ng-data/` (phoneme tables, language dicts) needed by LuxTTS for text-to-phoneme conversion +- Set `ESPEAK_DATA_PATH` in frozen builds so the espeak-ng C library finds the bundled data instead of looking at `/usr/share/espeak-ng-data/` +- Collect all `linacodec` files — fixes `inspect.getsource` error in Vocos codec +- Collect all `zipvoice` files — fixes source code lookup in LuxTTS voice cloning +- Copy metadata for `requests`, `transformers`, `huggingface-hub`, `tokenizers`, `safetensors`, `tqdm` — fixes `importlib.metadata` lookups in frozen binary +- Add hidden imports for `chatterbox`, `chatterbox_turbo`, `luxtts`, `zipvoice` backends +- Add `multiprocessing.freeze_support()` to fix resource_tracker subprocess crash in frozen binary +- `--noconsole` now only applied on Windows — macOS/Linux need stdout/stderr for Tauri sidecar log capture +- Hardened `sys.stdout`/`sys.stderr` devnull redirect to test writability, not just `None` check + +### Updater + +- Fixed updater artifact generation with `v1Compatible` for `tauri-action` signature files +- Updated `tauri-action` to v0.6 to fix updater JSON and `.sig` generation + +### Other Fixes + +- Full traceback logging on all backend model loading 
errors (was just `str(e)` before) + +## [0.2.2] - 2026-03-15 + +- Fix Chatterbox model support in bundled builds +- Fix LuxTTS/ZipVoice support in bundled builds +- Auto-update CUDA binary when app version changes +- CUDA download progress bar +- Fix server process staying alive on macOS (SIGHUP handling, watchdog grace period) +- Hide console window when running CUDA binary on Windows + +## [0.2.1] - 2026-03-15 + +Voicebox v0.1.x was a single-engine voice cloning app built around Qwen3-TTS. v0.2.0 is a ground-up rethink: four TTS engines, 23 languages, paralinguistic emotion controls, a post-processing effects pipeline, unlimited generation length, an async generation queue, and support for every major GPU vendor. Plus Docker. + +### New TTS Engines + +#### Multi-Engine Architecture + +Voicebox now runs **four independent TTS engines** behind a thread-safe per-engine backend registry. Switch engines per-generation from a single dropdown — no restart required. + +| Engine | Languages | Size | Key Strengths | +| --------------------------- | --------- | ------- | --------------------------------------------- | +| **Qwen3-TTS 1.7B** | 10 | ~3.5 GB | Highest quality, delivery instructions | +| **Qwen3-TTS 0.6B** | 10 | ~1.2 GB | Lighter, faster variant | +| **LuxTTS** | English | ~300 MB | CPU-friendly, 48 kHz output, 150x realtime | +| **Chatterbox Multilingual** | 23 | ~3.2 GB | Broadest language coverage, zero-shot cloning | +| **Chatterbox Turbo** | English | ~1.5 GB | 350M params, low latency, paralinguistic tags | + +#### Chatterbox Multilingual — 23 Languages ([#257](https://github.com/jamiepine/voicebox/pull/257)) + +Zero-shot voice cloning in Arabic, Chinese, Danish, Dutch, English, Finnish, French, German, Greek, Hebrew, Hindi, Italian, Japanese, Korean, Malay, Norwegian, Polish, Portuguese, Russian, Spanish, Swahili, Swedish, and Turkish. 
+ +#### LuxTTS — Lightweight English TTS ([#254](https://github.com/jamiepine/voicebox/pull/254)) + +A fast, CPU-friendly English engine. ~300 MB download, 48 kHz output, runs at 150x realtime on CPU. + +#### Chatterbox Turbo — Expressive English ([#258](https://github.com/jamiepine/voicebox/pull/258)) + +A fast 350M-parameter English model with inline paralinguistic tags. + +#### Paralinguistic Tags Autocomplete ([#265](https://github.com/jamiepine/voicebox/pull/265)) + +Type `/` in the text input with Chatterbox Turbo selected to open an autocomplete for **9 expressive tags**: `[laugh]` `[chuckle]` `[gasp]` `[cough]` `[sigh]` `[groan]` `[sniff]` `[shush]` `[clear throat]` + +### Generation + +#### Unlimited Generation Length — Auto-Chunking ([#266](https://github.com/jamiepine/voicebox/pull/266)) + +Long text is now automatically split at sentence boundaries, generated per-chunk, and crossfaded back together. Engine-agnostic. + +- Auto-chunking limit slider — 100–5,000 chars (default 800) +- Crossfade slider — 0–200ms (default 50ms) +- Max text length raised to 50,000 characters +- Smart splitting respects abbreviations, CJK punctuation, and `[tags]` + +#### Asynchronous Generation Queue ([#269](https://github.com/jamiepine/voicebox/pull/269)) + +Generation is now fully non-blocking. Serial execution queue prevents GPU contention. Real-time SSE status streaming. + +#### Generation Versions + +Every generation now supports multiple versions with provenance tracking — original, effects versions, takes, source tracking, version pinning in stories, and favorites. + +### Post-Processing Effects ([#271](https://github.com/jamiepine/voicebox/pull/271)) + +A full audio effects system powered by Spotify's `pedalboard` library: Pitch Shift, Reverb, Delay, Chorus/Flanger, Compressor, Gain, High-Pass Filter, Low-Pass Filter. 4 built-in presets, custom presets, per-profile default effects, and live preview. 
+ +### Platform Support + +- **Windows Support** ([#272](https://github.com/jamiepine/voicebox/pull/272)) — Full Windows support with CUDA GPU detection +- **Linux** ([#262](https://github.com/jamiepine/voicebox/pull/262)) — AMD ROCm, NVIDIA GBM fix, WebKitGTK mic access (build from source) +- **NVIDIA CUDA Backend Swap** ([#252](https://github.com/jamiepine/voicebox/pull/252)) — Download and swap in CUDA backend from within the app +- **Intel Arc (XPU) and DirectML** — PyTorch backend supports Intel Arc and DirectML +- **Docker + Web Deployment** ([#161](https://github.com/jamiepine/voicebox/pull/161)) — 3-stage build, non-root runtime, health checks +- **Whisper Turbo** — Added `openai/whisper-large-v3-turbo` as a transcription model option + +### Model Management ([#268](https://github.com/jamiepine/voicebox/pull/268)) + +Per-model unload, custom models directory, model folder migration, download cancel/clear UI ([#238](https://github.com/jamiepine/voicebox/pull/238)), restructured settings UI. + +### Security & Reliability + +- CORS hardening ([#88](https://github.com/jamiepine/voicebox/pull/88)) +- Network access toggle ([#133](https://github.com/jamiepine/voicebox/pull/133)) +- Offline crash fix ([#152](https://github.com/jamiepine/voicebox/pull/152)) +- Atomic audio saves ([#263](https://github.com/jamiepine/voicebox/pull/263)) +- Filesystem health endpoint +- Chatterbox float64 dtype fix ([#264](https://github.com/jamiepine/voicebox/pull/264)) + +### Accessibility ([#243](https://github.com/jamiepine/voicebox/pull/243)) + +Screen reader support, keyboard navigation, state-aware `aria-label` attributes on all interactive controls. 
+ +### UI Polish + +- Redesigned landing page ([#274](https://github.com/jamiepine/voicebox/pull/274)) +- Voices tab overhaul with inline inspector +- Responsive layout improvements +- Duplicate profile name validation ([#175](https://github.com/jamiepine/voicebox/pull/175)) + +### Community Contributors + +[@haosenwang1018](https://github.com/haosenwang1018), [@Balneario-de-Cofrentes](https://github.com/Balneario-de-Cofrentes), [@ageofalgo](https://github.com/ageofalgo), [@mikeswann](https://github.com/mikeswann), [@rayl15](https://github.com/rayl15), [@mpecanha](https://github.com/mpecanha), [@ways2read](https://github.com/ways2read), [@ieguiguren](https://github.com/ieguiguren), [@Vaibhavee89](https://github.com/Vaibhavee89), [@pandego](https://github.com/pandego), [@luminest-llc](https://github.com/luminest-llc) + +## [0.1.13] - 2026-02-23 + +### Stability and reliability + +- [#95](https://github.com/jamiepine/voicebox/pull/95) Fix: selecting 0.6B model still downloads and uses 1.7B +- [#93](https://github.com/jamiepine/voicebox/pull/93) fix(mlx): bundle native libs and broaden error handling for Apple Silicon +- [#79](https://github.com/jamiepine/voicebox/pull/79) fix: handle non-ASCII filenames in Content-Disposition headers +- [#78](https://github.com/jamiepine/voicebox/pull/78) fix: guard getUserMedia call against undefined mediaDevices in non-secure contexts +- [#77](https://github.com/jamiepine/voicebox/pull/77) fix: await for confirmation before deleting voices and channels +- [#128](https://github.com/jamiepine/voicebox/pull/128) fix: resolve multiple issues (#96, #119, #111, #108, #121, #125, #127) +- [#40](https://github.com/jamiepine/voicebox/pull/40) Fix: audio export path resolution + +### Build and packaging + +- [#122](https://github.com/jamiepine/voicebox/pull/122) fix(web): add @tailwindcss/vite plugin to web config +- [#126](https://github.com/jamiepine/voicebox/pull/126) Create requirements.txt + +### UX and docs + +- 
[#44](https://github.com/jamiepine/voicebox/pull/44) Enhances floating generate box UX +- [#57](https://github.com/jamiepine/voicebox/pull/57) chore: updates repo URL in README +- [#146](https://github.com/jamiepine/voicebox/pull/146) Add Spacebot banner to landing page +- [#1](https://github.com/jamiepine/voicebox/pull/1) Improvements + +## [0.1.12] - 2026-01-31 + +### Model Download UX Overhaul + +- Real-time download progress tracking with accurate percentage and speed info +- No more downloading notifications during generation even when its not downloading +- Better error handling and status reporting throughout the download process + +### Other Improvements + +- Enhanced health check endpoint with GPU type information +- Improved model caching verification +- More reliable SSE progress updates +- Actual update notifications — no need to manually check in settings anymore + +## [0.1.11] - 2026-01-30 + +- Fixed transcriptions on MLX +- Fixed model download progress (finally) + +## [0.1.10] - 2026-01-30 + +### Faster generation on Apple Silicon + +Massive speed gains, from around 20s per generation to 2-3s. Added native MLX backend support for Apple Silicon, providing significantly faster TTS and STT generation on M-series macOS machines. 
+ +- **MLX Backend** — New backend implementation optimized for Apple Silicon using MLX framework +- **Dynamic Backend Selection** — Automatically detects platform and selects between MLX (macOS) and PyTorch (other platforms) +- Refactored TTS and STT logic into modular backend implementations +- Updated build process to include MLX-specific dependencies for macOS builds + +## [0.1.9] - 2026-01-30 + +### Improved voice profile creation flow + +- Voice create drafts: No longer lose work if you close the modal +- Fixed whisper only transcribing English or Chinese, now has support for all languages + +### Improved Stories editor + +- Added spacebar for play/pause +- Timeline now auto-scrolls to follow playhead during playback +- Fixed misalignment of the items with mouse when picking up +- Fixed hitbox for selecting an item +- Fixed playhead jumping forward when pressing play + +### Generation box improvements + +- Instruct mode no longer wipes prompt text +- Improved UI cleanliness + +### Misc + +- Fixed "Model downloading" toast during generation when model is already downloaded + +## [0.1.8] - 2026-01-29 + +### Model Download Timeout Issues + +Fixed critical issue where model downloads would fail with "Failed to fetch" errors on Windows. Refactored download endpoints to return immediately and continue downloads in background. + +### Cross-Platform Cache Path Issues + +Fixed hardcoded `~/.cache/huggingface/hub` paths that don't work on Windows. All cache paths now use `hf_constants.HF_HUB_CACHE` for proper cross-platform support. 
+ +### Windows Process Management + +- Added `/shutdown` endpoint for graceful server shutdown on Windows +- Added `gpu_type` field to health check response + +## [0.1.7] - 2026-01-29 + +- Trim and split audio clips in Story Editor +- Auto-activation of stories in Story Editor with visible playhead +- Conditional auto-play support in AudioPlayer for better user control +- Refactored audio loading across HistoryTable, SampleList, and generation forms +- Audio now only auto-plays when explicitly intended, preventing unexpected playback + +## [0.1.6] - 2026-01-29 + +### Introducing Stories + +A full voice editor for composing podcasts and generated conversations. + +- **Stories Editor** — Create multi-voice narratives, podcasts, or conversations with a timeline-based editor +- Compose tracks with different voices +- Edit and arrange audio segments inline +- Build generated conversations with multiple participants +- **Improved Voice Generation UI** — Auto-resizing input, default voice selection, better layout +- **Track Editor Integration** — Inline track editing within story items + +## [0.1.5] - 2026-01-28 + +Fixed recording length limit at 0:29 to auto stop instead of passing the limit and getting an error, which would cause users to lose their recording. 
+ +## [0.1.4] - 2026-01-28 + +- Audio channel management system +- Native audio playback handling in AudioPlayer component +- Refactored ConnectionForm and Checkbox components +- Improved layout consistency and responsiveness +- Added safe area constants for better responsive design + +## [0.1.3] - 2026-01-27 + +- Improved the generate textbox +- Maybe fixed Windows autoupdate restarting entire computer + +## [0.1.2] - 2026-01-27 + +### Audio Capture & Format Conversion + +- Added audio format conversion util +- Enhanced system audio capture on macOS and Windows +- Improved audio recording hooks +- Added audio input entitlement for macOS +- Added audio capture tests + +### Update System + +- Enhanced auto-updater functionality and update status display + +## [0.1.1] - 2026-01-27 + +### Platform Support + +- **macOS Audio Capture** — Native audio capture support for sample creation +- **Windows Audio Capture** — WASAPI implementation with improved thread safety +- **Linux Support** — Temporarily removed builds due to runner disk space constraints + +### Audio Features + +- Play/pause for audio samples across all components +- Three new sample components: Recording, System capture, Upload with drag-and-drop +- Audio validation, error handling, and consistent cleanup + +### Voice Profile Management + +- Profile import with file size validation (100MB limit) +- Enhanced profile form with new audio sample components +- Drag-and-drop support for audio file uploads + +### Server Management + +- Changed default URL from `localhost:8000` to `127.0.0.1:17493` +- Server reuse logic, "keep server running" preference, orphaned process handling + +### Build & Release + +- Added `.bumpversion.cfg` for automated version management +- Enhanced icon generation script for multi-size Windows icons + +### Bug Fixes + +- Fixed date formatting for timezone-less date strings +- Fixed getLatestRelease file filtering +- Improved audio duration metadata on Windows + +## [0.1.0] - 2026-01-27 
+ +The first public release of Voicebox — an open-source voice synthesis studio powered by Qwen3-TTS. + +### Voice Cloning with Qwen3-TTS + +- Automatic model download from HuggingFace +- Multiple model sizes (1.7B and 0.6B) +- Voice prompt caching for instant regeneration +- English and Chinese support + +### Voice Profile Management + +- Create profiles from audio files or record directly in the app +- Multiple samples per profile for higher quality cloning +- Import/Export profiles +- Automatic transcription via Whisper + +### Speech Generation + +- Simple text-to-speech with profile selection +- Seed control for reproducible generations +- Long-form support up to 5,000 characters + +### Generation History + +- Full history with metadata +- Search by text content +- Inline playback and download + +### Flexible Deployment + +- Local mode with bundled backend +- Remote mode for GPU servers on your network +- One-click server setup + +### Desktop Experience + +- Built with Tauri v2 (Rust) — native performance, not Electron +- Cross-platform: macOS and Windows +- No Python installation required + +### Tech Stack + +Tauri v2, React, TypeScript, Tailwind CSS, FastAPI, Qwen3-TTS, Whisper, SQLite + +[Unreleased]: https://github.com/jamiepine/voicebox/compare/v0.4.5...HEAD +[0.4.5]: https://github.com/jamiepine/voicebox/compare/v0.4.4...v0.4.5 +[0.4.4]: https://github.com/jamiepine/voicebox/compare/v0.4.3...v0.4.4 +[0.4.3]: https://github.com/jamiepine/voicebox/compare/v0.4.2...v0.4.3 +[0.4.2]: https://github.com/jamiepine/voicebox/compare/v0.4.1...v0.4.2 +[0.4.1]: https://github.com/jamiepine/voicebox/compare/v0.4.0...v0.4.1 +[0.4.0]: https://github.com/jamiepine/voicebox/compare/v0.3.0...v0.4.0 +[0.3.0]: https://github.com/jamiepine/voicebox/compare/v0.2.3...v0.3.0 +[0.2.3]: https://github.com/jamiepine/voicebox/compare/v0.2.2...v0.2.3 +[0.2.2]: https://github.com/jamiepine/voicebox/compare/v0.2.1...v0.2.2 +[0.2.1]: 
https://github.com/jamiepine/voicebox/compare/v0.1.13...v0.2.1 +[0.1.13]: https://github.com/jamiepine/voicebox/compare/v0.1.12...v0.1.13 +[0.1.12]: https://github.com/jamiepine/voicebox/compare/v0.1.11...v0.1.12 +[0.1.11]: https://github.com/jamiepine/voicebox/compare/v0.1.10...v0.1.11 +[0.1.10]: https://github.com/jamiepine/voicebox/compare/v0.1.9...v0.1.10 +[0.1.9]: https://github.com/jamiepine/voicebox/compare/v0.1.8...v0.1.9 +[0.1.8]: https://github.com/jamiepine/voicebox/compare/v0.1.7...v0.1.8 +[0.1.7]: https://github.com/jamiepine/voicebox/compare/v0.1.6...v0.1.7 +[0.1.6]: https://github.com/jamiepine/voicebox/compare/v0.1.5...v0.1.6 +[0.1.5]: https://github.com/jamiepine/voicebox/compare/v0.1.4...v0.1.5 +[0.1.4]: https://github.com/jamiepine/voicebox/compare/v0.1.3...v0.1.4 +[0.1.3]: https://github.com/jamiepine/voicebox/compare/v0.1.2...v0.1.3 +[0.1.2]: https://github.com/jamiepine/voicebox/compare/v0.1.1...v0.1.2 +[0.1.1]: https://github.com/jamiepine/voicebox/compare/v0.1.0...v0.1.1 +[0.1.0]: https://github.com/jamiepine/voicebox/releases/tag/v0.1.0 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..b4a3b0e --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,392 @@ +# Contributing to Voicebox + +Thank you for your interest in contributing to Voicebox! This document provides guidelines and instructions for contributing. 
+ +## Code of Conduct + +- Be respectful and inclusive +- Welcome newcomers and help them learn +- Focus on constructive feedback +- Respect different viewpoints and experiences + +## Getting Started + +### Prerequisites + +- **[Bun](https://bun.sh)** - Fast JavaScript runtime and package manager + ```bash + curl -fsSL https://bun.sh/install | bash + ``` + +- **[Python 3.11+](https://python.org)** - For backend development + ```bash + python --version # Should be 3.11 or higher + ``` + +- **[Rust](https://rustup.rs)** - For Tauri desktop app (installed automatically by Tauri CLI) + ```bash + rustc --version # Check if installed + ``` +- **[Tauri Prerequisites](https://v2.tauri.app/start/prerequisites)** - Tauri-specific system dependencies (varies by OS). + +- **Git** - Version control + +### Development Setup + +Install [just](https://github.com/casey/just) (`brew install just`, `cargo install just`, or `winget install Casey.Just`), then: + +```bash +git clone https://github.com/YOUR_USERNAME/voicebox.git +cd voicebox + +just setup # creates venv, installs Python + JS deps +just dev # starts backend + desktop app +``` + +`just setup` handles everything automatically, including: +- Creating a Python virtual environment +- Installing Python dependencies (with CUDA PyTorch on Windows if an NVIDIA GPU is detected) +- Installing MLX dependencies on Apple Silicon +- Installing JavaScript dependencies + +`just dev` starts the backend and desktop app together. If a backend is already running (e.g. from `just dev-backend` in another terminal), it detects it and only starts the frontend. 
+ +Other useful commands: + +```bash +just dev-web # backend + web app (no Tauri/Rust build) +just dev-backend # backend only +just dev-frontend # Tauri app only (backend must be running) +just kill # stop all dev processes +just clean-all # nuke everything and start fresh +just --list # see all available commands +``` + +> **Note:** In dev mode, the app connects to a manually-started Python server. +> The bundled server binary is only used in production builds. + +#### Windows Notes + +The justfile works natively on Windows via PowerShell. No WSL or Git Bash required. On Windows with an NVIDIA GPU, `just setup` automatically installs CUDA-enabled PyTorch for GPU acceleration. + +### Model Downloads + +Models are automatically downloaded from HuggingFace Hub on first use: +- **Whisper** (transcription): Auto-downloads on first transcription +- **Qwen3-TTS** (voice cloning): Auto-downloads on first generation (~2-4GB) + +First-time usage will be slower due to model downloads, but subsequent runs will use cached models. + +### Building + +**Build production app:** + +```bash +just build # Build CPU server binary + Tauri installer +``` + +On Windows, to build with CUDA support for local testing: + +```bash +just build-local # Build CPU + CUDA server binaries + Tauri installer +``` + +This builds the CPU sidecar (bundled with the app), the CUDA binary (placed in `%APPDATA%/com.voicebox.app/backends/` for runtime GPU switching), and the installable Tauri app. + +Creates platform-specific installers (`.dmg`, `.msi`, `.AppImage`) in `tauri/src-tauri/target/release/bundle/`. 
+ +**Individual build targets:** + +```bash +just build-server # CPU server binary only +just build-server-cuda # CUDA server binary only (Windows) +just build-tauri # Tauri desktop app only +just build-web # Web app only +``` + +**Building with local Qwen3-TTS development version:** + +If you're actively developing or modifying the Qwen3-TTS library, set the `QWEN_TTS_PATH` environment variable to point to your local clone: + +```bash +export QWEN_TTS_PATH=~/path/to/your/Qwen3-TTS +just build-server +``` + +This makes PyInstaller use your local qwen-tts version instead of the pip-installed package. + +### Generate OpenAPI Client + +After starting the backend server: +```bash +./scripts/generate-api.sh +``` +This downloads the OpenAPI schema and generates the TypeScript client in `app/src/lib/api/` + +### Convert Assets to Web Formats + +To optimize images and videos for the web, run: +```bash +bun run convert:assets +``` + +This script: +- Converts PNG → WebP (better compression, same quality) +- Converts MOV → WebM (VP9 codec, smaller file size) +- Processes files in `landing/public/` and `docs/public/` +- **Deletes original files** after successful conversion + +**Requirements:** Install `webp` and `ffmpeg`: +```bash +brew install webp ffmpeg +``` + +> **Note:** Run this before committing new images or videos to keep the repository size small. + +## Development Workflow + +### 1. Create a Branch + +```bash +git checkout -b feature/your-feature-name +# or +git checkout -b fix/your-bug-fix +``` + +### 2. Make Your Changes + +- Write clean, readable code +- Follow existing code style +- Add comments for complex logic +- Update documentation as needed + +### 3. Test Your Changes + +- Test manually in the app +- Ensure backend API endpoints work +- Check for TypeScript/Python errors +- Verify UI components render correctly + +### 4. 
Commit Your Changes + +Write clear, descriptive commit messages: + +```bash +git commit -m "Add feature: voice profile export" +git commit -m "Fix: audio playback stops after 30 seconds" +``` + +### 5. Push and Create Pull Request + +```bash +git push origin feature/your-feature-name +``` + +Then create a pull request on GitHub with: +- Clear description of changes +- Screenshots (for UI changes) +- Reference to related issues + +## Code Style + +### TypeScript/React + +- Use TypeScript strict mode +- Follow React best practices +- Use functional components with hooks +- Prefer named exports +- Format with Biome (runs automatically) + +```typescript +// Good +export function ProfileCard({ profile }: { profile: Profile }) { + return
{profile.name}
; +} + +// Avoid +export const ProfileCard = (props) => { ... } +``` + +### Python + +- Follow PEP 8 style guide +- Use type hints +- Use async/await for I/O operations +- Format with Black (if configured) + +```python +# Good +async def create_profile(name: str, language: str) -> Profile: + """Create a new voice profile.""" + ... + +# Avoid +def create_profile(name, language): + ... +``` + +### Rust + +- Follow Rust conventions +- Use meaningful variable names +- Handle errors explicitly +- Format with `rustfmt` + +## Project Structure + +``` +voicebox/ +├── app/ # Shared React frontend +│ └── src/ +│ ├── components/ # UI components +│ ├── lib/ # Utilities and API client +│ └── hooks/ # React hooks +├── backend/ # Python FastAPI server +│ ├── main.py # API routes +│ ├── tts.py # Voice synthesis +│ └── ... +├── tauri/ # Desktop app wrapper +│ └── src-tauri/ # Rust backend +└── scripts/ # Build scripts +``` + +## Areas for Contribution + +### 🐛 Bug Fixes + +- Check existing issues for bugs to fix +- Test your fix thoroughly +- Add tests if possible + +### ✨ New Features + +- Check the roadmap in README.md and the engineering status in [`docs/PROJECT_STATUS.md`](docs/PROJECT_STATUS.md) before proposing work — it lists prioritized tasks (Tier 1 → 3), known architectural bottlenecks, and candidate TTS engines already under evaluation (including why some have been backlogged) +- Discuss major features in an issue first +- Keep features focused and well-scoped + +### 📚 Documentation + +- Improve README clarity +- Add code comments +- Write API documentation +- Create tutorials or guides + +### 🎨 UI/UX Improvements + +- Improve accessibility +- Enhance visual design +- Optimize performance +- Add animations/transitions + +### 🔧 Infrastructure + +- Improve build process +- Add CI/CD improvements +- Optimize bundle size +- Add testing infrastructure + +## API Development + +When adding new API endpoints: + +1. **Add route in `backend/main.py`** +2. 
**Create Pydantic models in `backend/models.py`** +3. **Implement business logic in appropriate module** +4. **Update OpenAPI schema** (automatic with FastAPI) +5. **Regenerate TypeScript client:** + ```bash + bun run generate:api + ``` +6. **Update `backend/README.md`** with endpoint documentation + +## Testing + +Currently, testing is primarily manual. When adding tests: + +- **Backend**: Use pytest for Python tests +- **Frontend**: Use Vitest for React component tests +- **E2E**: Use Playwright for end-to-end tests (future) + +## Pull Request Process + +1. **Update documentation** if needed +2. **Ensure code follows style guidelines** +3. **Test your changes thoroughly** +4. **Update CHANGELOG.md** with your changes +5. **Request review** from maintainers + +### PR Checklist + +- [ ] Code follows style guidelines +- [ ] Documentation updated +- [ ] Changes tested +- [ ] No breaking changes (or documented) +- [ ] CHANGELOG.md updated + +## Release Process + +Releases are managed by maintainers: + +1. **Bump version using bumpversion:** + ```bash + # Install bumpversion (if not already installed) + pip install bumpversion + + # Bump patch version (0.1.0 -> 0.1.1) + bumpversion patch + + # Or bump minor version (0.1.0 -> 0.2.0) + bumpversion minor + + # Or bump major version (0.1.0 -> 1.0.0) + bumpversion major + ``` + + This automatically: + - Updates version numbers in all files (`tauri.conf.json`, `Cargo.toml`, all `package.json` files, `backend/main.py`) + - Creates a git commit with the version bump + - Creates a git tag (e.g., `v0.1.1`, `v0.2.0`) + +2. **Update CHANGELOG.md** with release notes + +3. **Push commits and tags:** + ```bash + git push + git push --tags + ``` + +4. **GitHub Actions builds and releases** automatically when tags are pushed + +## Troubleshooting + +See [docs/content/docs/overview/troubleshooting.mdx](docs/content/docs/overview/troubleshooting.mdx) for common issues and solutions. 
+ +**Quick fixes:** + +- **Backend won't start:** Check Python version (3.11+), ensure venv is activated, install dependencies +- **Tauri build fails:** Ensure Rust is installed, clean build with `cd tauri/src-tauri && cargo clean` +- **OpenAPI client generation fails:** Ensure backend is running, check `curl http://localhost:17493/openapi.json` + +## Questions? + +- Open an issue for bugs or feature requests +- Check existing issues and discussions +- Review the codebase to understand patterns +- See [docs/content/docs/overview/troubleshooting.mdx](docs/content/docs/overview/troubleshooting.mdx) for common issues + +## Additional Resources + +- [README.md](README.md) - Project overview +- [backend/README.md](backend/README.md) - API documentation +- [docs/PROJECT_STATUS.md](docs/PROJECT_STATUS.md) - Living engineering roadmap: architecture, shipped vs in-flight work, prioritized open issues, candidate TTS engines under evaluation, architectural bottlenecks. Keep this updated when you ship significant features, close or backlog a model integration, or identify new bottlenecks. +- [docs/AUTOUPDATER_QUICKSTART.md](docs/AUTOUPDATER_QUICKSTART.md) - Auto-updater setup +- [SECURITY.md](SECURITY.md) - Security policy +- [CHANGELOG.md](CHANGELOG.md) - Version history + +## License + +By contributing, you agree that your contributions will be licensed under the MIT License. + +--- + +Thank you for contributing to Voicebox! 
🎉 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..ae143e2 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,87 @@ +# ============================================================ +# Voicebox — Local TTS Server with Web UI (CPU) +# 3-stage build: Frontend → Python deps → Runtime +# ============================================================ + +# === Stage 1: Build frontend === +FROM oven/bun:1 AS frontend + +WORKDIR /build + +# Copy workspace config and frontend source +COPY package.json bun.lock ./ +COPY CHANGELOG.md ./CHANGELOG.md +COPY app/ ./app/ +COPY web/ ./web/ + +# Strip workspaces not needed for web build, and fix trailing comma +RUN sed -i '/"tauri"/d; /"landing"/d' package.json && \ + sed -i -z 's/,\n ]/\n ]/' package.json +RUN bun install --no-save +# Build frontend (skip tsc — upstream has pre-existing type errors) +RUN cd web && bunx --bun vite build + + +# === Stage 2: Build Python dependencies === +FROM python:3.11-slim AS backend-builder + +WORKDIR /build + +RUN apt-get update && apt-get install -y --no-install-recommends \ + git \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir --upgrade pip + +COPY backend/requirements.txt . 
+RUN pip install --no-cache-dir --prefix=/install -r requirements.txt +RUN pip install --no-cache-dir --prefix=/install --no-deps chatterbox-tts +RUN pip install --no-cache-dir --prefix=/install --no-deps hume-tada +RUN pip install --no-cache-dir --prefix=/install \ + git+https://github.com/QwenLM/Qwen3-TTS.git + +ENV MODELS_DIR=/app/models +ENV TTS_MODE=local +ENV WHISPER_MODE=remote + +# === Stage 3: Runtime === +FROM python:3.11-slim + +# Create non-root user for security +RUN groupadd -r voicebox && \ + useradd -r -g voicebox -m -s /bin/bash voicebox + +WORKDIR /app + +# Install only runtime system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + ffmpeg \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy installed Python packages from builder stage +COPY --from=backend-builder /install /usr/local + +# Copy backend application code +COPY --chown=voicebox:voicebox backend/ /app/backend/ + +# Copy built frontend from frontend stage +COPY --from=frontend --chown=voicebox:voicebox /build/web/dist /app/frontend/ + +# Create data directories owned by non-root user +RUN mkdir -p /app/data/generations /app/data/profiles /app/data/cache \ + && chown -R voicebox:voicebox /app/data + +# Switch to non-root user +USER voicebox + +# Expose the API port +EXPOSE 17493 + +# Health check — auto-restart if the server hangs +HEALTHCHECK --interval=30s --timeout=10s --retries=3 --start-period=60s \ + CMD curl -f http://localhost:17493/health || exit 1 + +# Start the FastAPI server +CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "17493"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..5eea3a0 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Voicebox Contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without 
limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..8d22020 --- /dev/null +++ b/README.md @@ -0,0 +1,339 @@ +

+ Voicebox +

+ +

Voicebox

+ +

+ The open-source voice synthesis studio.
+ Clone voices. Generate speech. Apply effects. Build voice-powered apps.
+ All running locally on your machine. +

+ +

+ + Downloads + + + Release + + + Stars + + + License + + + Ask DeepWiki + +

+ +

+ voicebox.sh • + Docs • + Download • + Features • + API • + Troubleshooting +

+ +
+ +

+ + Voicebox App Screenshot + +

+ +

+ Click the image above to watch the demo video on voicebox.sh +

+ +
+ +

+ Voicebox Screenshot 2 +

+ +

+ Voicebox Screenshot 3 +

+ +
+ +## What is Voicebox? + +Voicebox is a **local-first voice cloning studio** — a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio or pick from 50+ preset voices, generate speech in 23 languages across 7 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor. + +- **Complete privacy** — models and voice data stay on your machine +- **7 TTS engines** — Qwen3-TTS, Qwen CustomVoice, LuxTTS, Chatterbox Multilingual, Chatterbox Turbo, HumeAI TADA, and Kokoro +- **Cloning and preset voices** — zero-shot cloning from a reference sample, or curated preset voices via Kokoro (50 voices) and Qwen CustomVoice (9 voices) +- **23 languages** — from English to Arabic, Japanese, Hindi, Swahili, and more +- **Post-processing effects** — pitch shift, reverb, delay, chorus, compression, and filters +- **Expressive speech** — paralinguistic tags like `[laugh]`, `[sigh]`, `[gasp]` via Chatterbox Turbo; natural-language delivery control via Qwen CustomVoice +- **Unlimited length** — auto-chunking with crossfade for scripts, articles, and chapters +- **Stories editor** — multi-track timeline for conversations, podcasts, and narratives +- **API-first** — REST API for integrating voice synthesis into your own projects +- **Native performance** — built with Tauri (Rust), not Electron +- **Runs everywhere** — macOS (MLX/Metal), Windows (CUDA), Linux, AMD ROCm, Intel Arc, Docker + +--- + +## Download + +| Platform | Download | +| --------------------- | ------------------------------------------------------ | +| macOS (Apple Silicon) | [Download DMG](https://voicebox.sh/download/mac-arm) | +| macOS (Intel) | [Download DMG](https://voicebox.sh/download/mac-intel) | +| Windows | [Download MSI](https://voicebox.sh/download/windows) | +| Docker | `docker compose up` | + +> **[View all binaries →](https://github.com/jamiepine/voicebox/releases/latest)** + +> **Linux** — Pre-built binaries are not yet available. 
See [voicebox.sh/linux-install](https://voicebox.sh/linux-install) for build-from-source instructions. + +> **Having trouble?** See the [Troubleshooting Guide](docs/content/docs/overview/troubleshooting.mdx) for common install, generation, model-download, and GPU issues. + +--- + +## Features + +### Multi-Engine Voice Cloning + +Seven TTS engines with different strengths, switchable per-generation: + +| Engine | Languages | Strengths | +| --------------------------- | --------- | ---------------------------------------------------------------------------------------------------------------------------------------- | +| **Qwen3-TTS** (0.6B / 1.7B) | 10 | High-quality multilingual cloning, delivery instructions ("speak slowly", "whisper") | +| **Qwen CustomVoice** | 10 | 9 curated preset voices with natural-language delivery control — no reference audio required | +| **LuxTTS** | English | Lightweight (~1GB VRAM), 48kHz output, 150x realtime on CPU | +| **Chatterbox Multilingual** | 23 | Broadest language coverage — Arabic, Danish, Finnish, Greek, Hebrew, Hindi, Malay, Norwegian, Polish, Swahili, Swedish, Turkish and more | +| **Chatterbox Turbo** | English | Fast 350M model with paralinguistic emotion/sound tags | +| **TADA** (1B / 3B) | 10 | HumeAI speech-language model — 700s+ coherent audio, text-acoustic dual alignment | +| **Kokoro** | 8 | 50 curated preset voices, tiny 82M model, fast CPU inference | + +### Emotions & Paralinguistic Tags + +Only **Chatterbox Turbo** interprets paralinguistic tags like `[laugh]` and +`[sigh]`. Qwen3-TTS, LuxTTS, Chatterbox Multilingual, and HumeAI TADA read them +literally as text. + +With **Chatterbox Turbo** selected, type `/` in the text input to open the tag +inserter and add expressive tags inline with speech: + +`[laugh]` `[chuckle]` `[gasp]` `[cough]` `[sigh]` `[groan]` `[sniff]` `[shush]` `[clear throat]` + +### Post-Processing Effects + +8 audio effects powered by Spotify's `pedalboard` library. 
Apply after generation, preview in real time, build reusable presets. + +| Effect | Description | +| ---------------- | --------------------------------------------- | +| Pitch Shift | Up or down by up to 12 semitones | +| Reverb | Configurable room size, damping, wet/dry mix | +| Delay | Echo with adjustable time, feedback, and mix | +| Chorus / Flanger | Modulated delay for metallic or lush textures | +| Compressor | Dynamic range compression | +| Gain | Volume adjustment (-40 to +40 dB) | +| High-Pass Filter | Remove low frequencies | +| Low-Pass Filter | Remove high frequencies | + +Ships with 4 built-in presets (Robotic, Radio, Echo Chamber, Deep Voice) and supports custom presets. Effects can be assigned per-profile as defaults. + +### Unlimited Generation Length + +Text is automatically split at sentence boundaries and each chunk is generated independently, then crossfaded together. Works with all engines. + +- Configurable auto-chunking limit (100–5,000 chars) +- Crossfade slider (0–200ms) for smooth transitions +- Max text length: 50,000 characters +- Smart splitting respects abbreviations, CJK punctuation, and `[tags]` + +### Generation Versions + +Every generation supports multiple versions with provenance tracking: + +- **Original** — clean TTS output, always preserved +- **Effects versions** — apply different effects chains from any source version +- **Takes** — regenerate with a new seed for variation +- **Source tracking** — each version records its lineage +- **Favorites** — star generations for quick access + +### Async Generation Queue + +Generation is non-blocking. Submit and immediately start typing the next one. 
+ +- Serial execution queue prevents GPU contention +- Real-time SSE status streaming +- Failed generations can be retried +- Stale generations from crashes auto-recover on startup + +### Voice Profile Management + +- Create profiles from audio files or record directly in-app +- Import/export profiles to share or back up +- Multi-sample support for higher quality cloning +- Per-profile default effects chains +- Organize with descriptions and language tags + +### Stories Editor + +Multi-voice timeline editor for conversations, podcasts, and narratives. + +- Multi-track composition with drag-and-drop +- Inline audio trimming and splitting +- Auto-playback with synchronized playhead +- Version pinning per track clip + +### Recording & Transcription + +- In-app recording with waveform visualization +- System audio capture (macOS and Windows) +- Automatic transcription powered by Whisper (including Whisper Turbo) +- Export recordings in multiple formats + +### Model Management + +- Per-model unload to free GPU memory without deleting downloads +- Custom models directory via `VOICEBOX_MODELS_DIR` +- Model folder migration with progress tracking +- Download cancel/clear UI + +### GPU Support + +| Platform | Backend | Notes | +| ------------------------ | -------------- | ---------------------------------------------- | +| macOS (Apple Silicon) | MLX (Metal) | 4-5x faster via the Metal GPU | +| Windows / Linux (NVIDIA) | PyTorch (CUDA) | Auto-downloads CUDA binary from within the app | +| Linux (AMD) | PyTorch (ROCm) | Auto-configures HSA_OVERRIDE_GFX_VERSION | +| Windows (any GPU) | DirectML | Universal Windows GPU support | +| Intel Arc | IPEX/XPU | Intel discrete GPU acceleration | +| Any | CPU | Works everywhere, just slower | + +--- + +## API + +Voicebox exposes a full REST API for integrating voice synthesis into your own apps. 
+ +```bash +# Generate speech +curl -X POST http://localhost:17493/generate \ + -H "Content-Type: application/json" \ + -d '{"text": "Hello world", "profile_id": "abc123", "language": "en"}' + +# List voice profiles +curl http://localhost:17493/profiles + +# Create a profile +curl -X POST http://localhost:17493/profiles \ + -H "Content-Type: application/json" \ + -d '{"name": "My Voice", "language": "en"}' +``` + +**Use cases:** game dialogue, podcast production, accessibility tools, voice assistants, content automation. + +Full API documentation available at `http://localhost:17493/docs`. + +--- + +## Tech Stack + +| Layer | Technology | +| ------------- | ------------------------------------------------- | +| Desktop App | Tauri (Rust) | +| Frontend | React, TypeScript, Tailwind CSS | +| State | Zustand, React Query | +| Backend | FastAPI (Python) | +| TTS Engines | Qwen3-TTS, Qwen CustomVoice, LuxTTS, Chatterbox, Chatterbox Turbo, TADA, Kokoro | +| Effects | Pedalboard (Spotify) | +| Transcription | Whisper / Whisper Turbo (PyTorch or MLX) | +| Inference | MLX (Apple Silicon) / PyTorch (CUDA/ROCm/XPU/CPU) | +| Database | SQLite | +| Audio | WaveSurfer.js, librosa | + +--- + +## Roadmap + +| Feature | Description | +| ----------------------- | ---------------------------------------------- | +| **Real-time Streaming** | Stream audio as it generates, word by word | +| **Voice Design** | Create new voices from text descriptions | +| **More Models** | XTTS, Bark, and other open-source voice models | +| **Plugin Architecture** | Extend with custom models and effects | +| **Mobile Companion** | Control Voicebox from your phone | + +For the **full engineering status, open-issue triage, and prioritized work queue**, see [`docs/PROJECT_STATUS.md`](docs/PROJECT_STATUS.md) — a living document that tracks what's shipped, what's in-flight, candidate TTS engines under evaluation, and why we've accepted or backlogged specific integrations. 
+ +--- + +## Development + +See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed setup and contribution guidelines. + +### Quick Start + +```bash +git clone https://github.com/jamiepine/voicebox.git +cd voicebox + +just setup # creates Python venv, installs all deps +just dev # starts backend + desktop app +``` + +Install [just](https://github.com/casey/just): `brew install just` or `cargo install just`. Run `just --list` to see all commands. + +**Prerequisites:** [Bun](https://bun.sh), [Rust](https://rustup.rs), [Python 3.11+](https://python.org), [Tauri Prerequisites](https://v2.tauri.app/start/prerequisites/), and [Xcode](https://developer.apple.com/xcode/) on macOS. + +### Building Locally + +```bash +just build # Build CPU server binary + Tauri app +just build-local # (Windows) Build CPU + CUDA server binaries + Tauri app +``` + +### Adding New Voice Models + +The multi-engine architecture makes adding new TTS engines straightforward. A [step-by-step guide](docs/content/docs/developer/tts-engines.mdx) covers the full process: dependency research, backend protocol implementation, frontend wiring, and PyInstaller bundling. + +The guide is optimized for AI coding agents. An [agent skill](.agents/skills/add-tts-engine/SKILL.md) can pick up a model name and handle the entire integration autonomously — you just test the build locally. + +### Project Structure + +``` +voicebox/ +├── app/ # Shared React frontend +├── tauri/ # Desktop app (Tauri + Rust) +├── web/ # Web deployment +├── backend/ # Python FastAPI server +├── landing/ # Marketing website +└── scripts/ # Build & release scripts +``` + +--- + +## Contributing + +Contributions welcome! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. + +1. Fork the repo +2. Create a feature branch +3. Make your changes +4. Submit a PR + +## Security + +Found a security vulnerability? Please report it responsibly. See [SECURITY.md](SECURITY.md) for details. 
+ +--- + +## License + +MIT License — see [LICENSE](LICENSE) for details. + +--- + +

+ voicebox.sh +

diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..049dd77 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,92 @@ +# Security Policy + +## Supported Versions + +We release patches for security vulnerabilities. Which versions are eligible for receiving such patches depends on the CVSS v3.0 Rating: + +| Version | Supported | +| ------- | ------------------ | +| 0.3.x | :white_check_mark: | +| < 0.3 | :x: | + +## Reporting a Vulnerability + +If you discover a security vulnerability, please report it responsibly: + +1. **Do not** open a public GitHub issue +2. Email security details to: [security@voicebox.sh](mailto:security@voicebox.sh) +3. Include: + - Description of the vulnerability + - Steps to reproduce + - Potential impact + - Suggested fix (if any) + +We will: +- Acknowledge receipt within 48 hours +- Provide a timeline for addressing the issue +- Keep you informed of progress +- Credit you in the security advisory (if desired) + +## Security Best Practices + +### For Users + +- **Keep Voicebox updated** - Updates include security patches +- **Verify downloads** - Only download from official releases +- **Local processing** - Voice data stays on your machine +- **Network security** - Use HTTPS when connecting to remote servers + +### For Developers + +- **Dependencies** - Keep all dependencies up to date +- **Code review** - All PRs require review before merging +- **Secrets** - Never commit API keys or signing keys +- **Signing** - All releases are cryptographically signed + +## Known Security Considerations + +### Local Processing + +Voicebox processes all audio locally by default. Your voice data never leaves your machine unless you explicitly enable remote server mode. 
+ +### Remote Server Mode + +When connecting to a remote server: +- Ensure the server is on a trusted network +- Use HTTPS for remote connections +- Verify server identity before connecting + +### Auto-Updates + +- Updates are cryptographically signed +- Signature verification happens before installation +- Only HTTPS endpoints are allowed + +### Python Server + +The embedded Python server: +- Runs locally by default (localhost only) +- Can be configured for remote access +- Uses standard FastAPI security practices + +## Disclosure Timeline + +- **Day 0**: Vulnerability reported +- **Day 1-2**: Initial assessment and acknowledgment +- **Day 3-7**: Investigation and fix development +- **Day 8-14**: Testing and release preparation +- **Day 15+**: Public disclosure (if applicable) + +Timeline may vary based on severity and complexity. + +## Security Updates + +Security updates will be: +- Released as patch versions (e.g., 0.3.2) +- Documented in CHANGELOG.md +- Announced via GitHub releases +- Automatically delivered via auto-updater + +--- + +Thank you for helping keep Voicebox secure! 🔒 diff --git a/app/components.json b/app/components.json new file mode 100644 index 0000000..ba9305d --- /dev/null +++ b/app/components.json @@ -0,0 +1,20 @@ +{ + "$schema": "https://ui.shadcn.com/schema.json", + "style": "new-york", + "rsc": false, + "tsx": true, + "tailwind": { + "config": "tailwind.config.js", + "css": "src/index.css", + "baseColor": "slate", + "cssVariables": true, + "prefix": "" + }, + "aliases": { + "components": "@/components", + "utils": "@/lib/utils", + "ui": "@/components/ui", + "lib": "@/lib", + "hooks": "@/lib/hooks" + } +} diff --git a/app/index.html b/app/index.html new file mode 100644 index 0000000..c7a4be9 --- /dev/null +++ b/app/index.html @@ -0,0 +1,13 @@ + + + + + + + voicebox + + +
+ + + diff --git a/app/package.json b/app/package.json new file mode 100644 index 0000000..a94b30b --- /dev/null +++ b/app/package.json @@ -0,0 +1,70 @@ +{ + "name": "@voicebox/app", + "version": "0.4.5", + "private": true, + "type": "module", + "scripts": { + "dev": "vite", + "build": "vite build", + "typecheck": "tsc -p tsconfig.json --noEmit", + "preview": "vite preview", + "lint": "biome lint src", + "lint:fix": "biome lint --write src", + "format": "biome format --write src", + "check": "biome check --write src" + }, + "dependencies": { + "@dnd-kit/core": "^6.3.1", + "@dnd-kit/sortable": "^10.0.0", + "@dnd-kit/utilities": "^3.2.2", + "@hookform/resolvers": "^3.9.0", + "@radix-ui/react-alert-dialog": "^1.1.1", + "@radix-ui/react-avatar": "^1.1.0", + "@radix-ui/react-dialog": "^1.1.1", + "@radix-ui/react-dropdown-menu": "^2.1.1", + "@radix-ui/react-label": "^2.1.0", + "@radix-ui/react-popover": "^1.1.1", + "@radix-ui/react-progress": "^1.1.0", + "@radix-ui/react-scroll-area": "^1.1.0", + "@radix-ui/react-select": "^2.1.1", + "@radix-ui/react-separator": "^1.1.0", + "@radix-ui/react-slider": "^1.3.6", + "@radix-ui/react-slot": "^1.1.0", + "@radix-ui/react-tabs": "^1.1.0", + "@radix-ui/react-toast": "^1.2.1", + "@tanstack/react-query": "^5.0.0", + "@tanstack/react-query-devtools": "^5.0.0", + "@tanstack/react-router": "^1.157.16", + "@tauri-apps/api": "^2.0.0", + "@tauri-apps/plugin-dialog": "^2.0.0", + "@tauri-apps/plugin-fs": "^2.0.0", + "@tauri-apps/plugin-process": "^2.3.1", + "@tauri-apps/plugin-updater": "^2.9.0", + "class-variance-authority": "^0.7.0", + "clsx": "^2.1.1", + "date-fns": "^3.6.0", + "framer-motion": "^12.29.0", + "i18next": "^26.0.6", + "i18next-browser-languagedetector": "^8.2.1", + "lucide-react": "^0.454.0", + "motion": "^12.29.0", + "react": "^18.3.0", + "react-dom": "^18.3.0", + "react-hook-form": "^7.53.0", + "react-i18next": "^17.0.4", + "react-sound-visualizer": "^1.4.0", + "tailwind-merge": "^2.5.4", + "wavesurfer.js": "^7.0.0", + 
"zod": "^3.23.8", + "zustand": "^4.5.0" + }, + "devDependencies": { + "@tailwindcss/vite": "^4.1.18", + "@types/react": "^18.3.0", + "@types/react-dom": "^18.3.0", + "@vitejs/plugin-react": "^4.3.0", + "tailwindcss": "^4.1.0", + "typescript": "^5.6.0", + "vite": "^5.4.0" + } +} diff --git a/app/plugins/changelog.ts b/app/plugins/changelog.ts new file mode 100644 index 0000000..d131fe5 --- /dev/null +++ b/app/plugins/changelog.ts @@ -0,0 +1,23 @@ +import { readFileSync } from 'node:fs'; +import path from 'node:path'; +import type { Plugin } from 'vite'; + +/** Vite plugin that exposes CHANGELOG.md as `virtual:changelog`. */ +export function changelogPlugin(repoRoot: string): Plugin { + const virtualId = 'virtual:changelog'; + const resolvedId = '\0' + virtualId; + const changelogPath = path.resolve(repoRoot, 'CHANGELOG.md'); + + return { + name: 'changelog', + resolveId(id) { + if (id === virtualId) return resolvedId; + }, + load(id) { + if (id === resolvedId) { + const raw = readFileSync(changelogPath, 'utf-8'); + return `export default ${JSON.stringify(raw)};`; + } + }, + }; +} diff --git a/app/src/App.tsx b/app/src/App.tsx new file mode 100644 index 0000000..d277c80 --- /dev/null +++ b/app/src/App.tsx @@ -0,0 +1,274 @@ +import { RouterProvider } from '@tanstack/react-router'; +import { useEffect, useRef, useState } from 'react'; +import voiceboxLogo from '@/assets/voicebox-logo.png'; +import ShinyText from '@/components/ShinyText'; +import { TitleBarDragRegion } from '@/components/TitleBarDragRegion'; +import { useAutoUpdater } from '@/hooks/useAutoUpdater'; +import { apiClient } from '@/lib/api/client'; +import type { HealthResponse } from '@/lib/api/types'; +import { TOP_SAFE_AREA_PADDING } from '@/lib/constants/ui'; +import { cn } from '@/lib/utils/cn'; +import { usePlatform } from '@/platform/PlatformContext'; +import { router } from '@/router'; +import { useLogStore } from '@/stores/logStore'; +import { useServerStore } from '@/stores/serverStore'; + +/** + 
* Validate that a health response has the expected Voicebox-specific shape. + * Prevents misidentifying an unrelated service on the same port. + */ +function isVoiceboxHealthResponse(health: HealthResponse): boolean { + return ( + health?.status === 'healthy' && + typeof health.model_loaded === 'boolean' && + typeof health.gpu_available === 'boolean' + ); +} + +/** + * Check whether a startup error indicates the port is occupied by an external + * server (which we should try to reuse via health-check polling) vs. a real + * failure (missing sidecar, signing issue, etc.) that should surface immediately. + */ +function isPortInUseError(error: unknown): boolean { + const msg = error instanceof Error ? error.message : String(error); + return ( + msg.includes('already in use') || + msg.includes('port') || + msg.includes('EADDRINUSE') || + msg.includes('address already in use') + ); +} + +const LOADING_MESSAGES = [ + 'Warming up tensors...', + 'Calibrating synthesizer engine...', + 'Initializing voice models...', + 'Loading neural networks...', + 'Preparing audio pipelines...', + 'Optimizing waveform generators...', + 'Tuning frequency analyzers...', + 'Building voice embeddings...', + 'Configuring text-to-speech cores...', + 'Syncing audio buffers...', + 'Establishing model connections...', + 'Preprocessing training data...', + 'Validating voice samples...', + 'Compiling inference engines...', + 'Mapping phoneme sequences...', + 'Aligning prosody parameters...', + 'Activating speech synthesis...', + 'Fine-tuning acoustic models...', + 'Preparing voice cloning matrices...', + 'Initializing Qwen TTS framework...', +]; + +function App() { + const platform = usePlatform(); + const [serverReady, setServerReady] = useState(false); + const [startupError, setStartupError] = useState(null); + const [loadingMessageIndex, setLoadingMessageIndex] = useState(0); + const serverStartingRef = useRef(false); + + // Automatically check for app updates on startup and show toast 
notifications + useAutoUpdater({ checkOnMount: true, showToast: true }); + + // Sync stored setting to Rust on startup + useEffect(() => { + if (platform.metadata.isTauri) { + const keepRunning = useServerStore.getState().keepServerRunningOnClose; + platform.lifecycle.setKeepServerRunning(keepRunning).catch((error) => { + console.error('Failed to sync initial setting to Rust:', error); + }); + } + // Empty dependency array - platform is stable from context, only run once + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [platform.metadata.isTauri, platform.lifecycle]); + + // Setup lifecycle callbacks + useEffect(() => { + platform.lifecycle.onServerReady = () => { + setServerReady(true); + }; + // Empty dependency array - platform is stable from context, only run once + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [platform.lifecycle]); + + // Subscribe to server logs + useEffect(() => { + const unsubscribe = platform.lifecycle.subscribeToServerLogs((entry) => { + useLogStore.getState().addEntry(entry); + }); + return unsubscribe; + }, [platform.lifecycle]); + + // Setup window close handler and auto-start server when running in Tauri (production only) + useEffect(() => { + if (!platform.metadata.isTauri) { + setServerReady(true); // Web assumes server is running + return; + } + + // Setup window close handler to check setting and stop server if needed + // This works in both dev and prod, but will only stop server if it was started by the app + platform.lifecycle.setupWindowCloseHandler().catch((error) => { + console.error('Failed to setup window close handler:', error); + }); + + // Only auto-start server in production mode + // In dev mode, user runs server separately + if (!import.meta.env?.PROD) { + console.log('Dev mode: Skipping auto-start of server (run it separately)'); + setServerReady(true); // Mark as ready so UI doesn't show loading screen + // Mark that server was not started by app (so we don't try to stop it on close) + 
window.__voiceboxServerStartedByApp = false; + return; + } + + // Auto-start server in production + if (serverStartingRef.current) { + return; + } + + serverStartingRef.current = true; + const isRemote = useServerStore.getState().mode === 'remote'; + const customModelsDir = useServerStore.getState().customModelsDir; + console.log(`Production mode: Starting bundled server... (remote: ${isRemote})`); + + platform.lifecycle + .startServer(isRemote, customModelsDir) + .then((serverUrl) => { + console.log('Server is ready at:', serverUrl); + // Update the server URL in the store with the dynamically assigned port + useServerStore.getState().setServerUrl(serverUrl); + setServerReady(true); + // Mark that we started the server (so we know to stop it on close) + window.__voiceboxServerStartedByApp = true; + }) + .catch((error) => { + console.error('Failed to auto-start server:', error); + serverStartingRef.current = false; + window.__voiceboxServerStartedByApp = false; + + // Only fall back to health-check polling when the error indicates the + // port is occupied (likely an external server). For real failures + // (missing sidecar, signing issues, etc.) surface the error immediately. + if (!isPortInUseError(error)) { + const msg = error instanceof Error ? error.message : String(error); + console.error('Real startup failure — not polling:', msg); + setStartupError(msg); + return; + } + + // Fall back to polling: the server may already be running externally + // (e.g. started via python/uvicorn/Docker). Poll the health endpoint + // until it responds with a valid Voicebox payload, then transition to + // the main UI. 
+ console.log('Falling back to health-check polling...'); + const pollInterval = setInterval(async () => { + try { + const health = await apiClient.getHealth(); + if (!isVoiceboxHealthResponse(health)) { + console.log('Health response is not from a Voicebox server, keep polling...'); + return; + } + console.log('External Voicebox server detected via health check'); + clearInterval(pollInterval); + setServerReady(true); + } catch { + // Server not ready yet, keep polling + } + }, 2000); + + // Stop polling after 2 minutes and surface the failure + setTimeout(() => { + clearInterval(pollInterval); + serverStartingRef.current = false; + setStartupError( + 'Could not connect to a Voicebox server within 2 minutes. ' + + 'Please check that the server is running and try again.', + ); + }, 120_000); + }); + + // Cleanup: stop server on actual unmount (not StrictMode remount) + // Note: Window close is handled separately in Tauri Rust code + return () => { + // Window close event handles server shutdown based on setting + serverStartingRef.current = false; + }; + // Empty dependency array - platform is stable from context, only run once + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [platform.metadata.isTauri, platform.lifecycle]); + + // Cycle through loading messages every 3 seconds + useEffect(() => { + if (!platform.metadata.isTauri || serverReady) { + return; + } + + const interval = setInterval(() => { + setLoadingMessageIndex((prev) => (prev + 1) % LOADING_MESSAGES.length); + }, 3000); + + return () => clearInterval(interval); + }, [serverReady, platform.metadata.isTauri]); + + // Show loading screen while server is starting in Tauri + if (platform.metadata.isTauri && !serverReady) { + return ( +
+ +
+
+
+
+
+ Voicebox +
+ {startupError ? ( +
+

Server startup failed

+

{startupError}

+ +
+ ) : ( +
+ +
+ )} +
+
+ ); + } + + return ; +} + +export default App; diff --git a/app/src/assets/voicebox-logo.png b/app/src/assets/voicebox-logo.png new file mode 100644 index 0000000..eebe543 Binary files /dev/null and b/app/src/assets/voicebox-logo.png differ diff --git a/app/src/components/AppFrame/AppFrame.tsx b/app/src/components/AppFrame/AppFrame.tsx new file mode 100644 index 0000000..8e6c42a --- /dev/null +++ b/app/src/components/AppFrame/AppFrame.tsx @@ -0,0 +1,39 @@ +import { useRouterState } from '@tanstack/react-router'; +import { TitleBarDragRegion } from '@/components/TitleBarDragRegion'; +import { AudioKeepAlive } from '@/components/AudioPlayer/AudioKeepAlive'; +import { AudioPlayer } from '@/components/AudioPlayer/AudioPlayer'; +import { StoryTrackEditor } from '@/components/StoriesTab/StoryTrackEditor'; +import { TOP_SAFE_AREA_PADDING } from '@/lib/constants/ui'; +import { cn } from '@/lib/utils/cn'; +import { useStoryStore } from '@/stores/storyStore'; +import { useStory } from '@/lib/hooks/useStories'; + +interface AppFrameProps { + children: React.ReactNode; +} + +export function AppFrame({ children }: AppFrameProps) { + const routerState = useRouterState(); + const isStoriesRoute = routerState.location.pathname === '/stories'; + + const selectedStoryId = useStoryStore((state) => state.selectedStoryId); + const { data: story } = useStory(selectedStoryId); + + // Show track editor when on stories route with a selected story that has items + const showTrackEditor = isStoriesRoute && selectedStoryId && story && story.items.length > 0; + + return ( +
+ + + {children} + {showTrackEditor ? ( + + ) : ( + + )} +
+ ); +} diff --git a/app/src/components/AudioPlayer/AudioKeepAlive.tsx b/app/src/components/AudioPlayer/AudioKeepAlive.tsx new file mode 100644 index 0000000..f86cc4a --- /dev/null +++ b/app/src/components/AudioPlayer/AudioKeepAlive.tsx @@ -0,0 +1,85 @@ +import { useEffect, useRef } from 'react'; +import { debug } from '@/lib/utils/debug'; + +// WKWebView tears down the app's CoreAudio output when idle for long enough, +// and a JS-level reload (cmd+R) does NOT restore it — only relaunching the +// Tauri app does. Keeping a silent
+ ); +} + +function ChannelVoicesList({ channelId }: { channelId: string }) { + const { t } = useTranslation(); + const { data: voices } = useQuery({ + queryKey: ['channel-voices', channelId], + queryFn: () => apiClient.getChannelVoices(channelId), + }); + + const { data: profiles } = useQuery({ + queryKey: ['profiles'], + queryFn: () => apiClient.listProfiles(), + }); + + const voiceNames = + voices?.profile_ids.map((id) => profiles?.find((p) => p.id === id)?.name).filter(Boolean) || []; + + return ( +
+ {voiceNames.length > 0 ? ( + voiceNames.map((name) => ( + + {name} + + )) + ) : ( + {t('audioChannels.noVoicesAssigned')} + )} +
+ ); +} + +interface CreateChannelDialogProps { + open: boolean; + onOpenChange: (open: boolean) => void; + devices: AudioDevice[]; + onCreate: (name: string, deviceIds: string[]) => void; +} + +function CreateChannelDialog({ open, onOpenChange, devices, onCreate }: CreateChannelDialogProps) { + const { t } = useTranslation(); + const [name, setName] = useState(''); + const [selectedDevices, setSelectedDevices] = useState([]); + + const handleSubmit = () => { + if (name.trim()) { + onCreate(name.trim(), selectedDevices); + setName(''); + setSelectedDevices([]); + } + }; + + return ( + + + + {t('audioChannels.createDialog.title')} + {t('audioChannels.createDialog.description')} + +
+
+ + setName(e.target.value)} + placeholder={t('audioChannels.fields.namePlaceholder')} + /> +
+
+ + + {selectedDevices.length > 0 && ( +
+ {selectedDevices.map((deviceId) => { + const device = devices.find((d) => d.id === deviceId); + return ( +
+ {device?.name || deviceId} + +
+ ); + })} +
+ )} +
+
+ + + + +
+
+ ); +} + +interface EditChannelDialogProps { + open: boolean; + onOpenChange: (open: boolean) => void; + channel: { + id: string; + name: string; + device_ids: string[]; + }; + devices: AudioDevice[]; + profiles: Array<{ id: string; name: string }>; + channelVoices: string[]; + onUpdate: (name: string, deviceIds: string[]) => void; + onSetVoices: (profileIds: string[]) => void; +} + +function EditChannelDialog({ + open, + onOpenChange, + channel, + devices, + profiles, + channelVoices, + onUpdate, + onSetVoices, +}: EditChannelDialogProps) { + const { t } = useTranslation(); + const [name, setName] = useState(channel.name); + const [selectedDevices, setSelectedDevices] = useState(channel.device_ids); + const [selectedVoices, setSelectedVoices] = useState(channelVoices); + + const handleSubmit = () => { + if (name.trim()) { + onUpdate(name.trim(), selectedDevices); + onSetVoices(selectedVoices); + } + }; + + return ( + + + + {t('audioChannels.editDialog.title')} + {t('audioChannels.editDialog.description')} + +
+
+ + setName(e.target.value)} /> +
+
+ + + {selectedDevices.length > 0 && ( +
+ {selectedDevices.map((deviceId) => { + const device = devices.find((d) => d.id === deviceId); + return ( +
+ {device?.name || deviceId} + +
+ ); + })} +
+ )} +
+
+ + + {selectedVoices.length > 0 && ( +
+ {selectedVoices.map((profileId) => { + const profile = profiles.find((p) => p.id === profileId); + return ( +
+ {profile?.name || profileId} + +
+ ); + })} +
+ )} +
+
+ + + + +
+
+ ); +} diff --git a/app/src/components/Effects/EffectsChainEditor.tsx b/app/src/components/Effects/EffectsChainEditor.tsx new file mode 100644 index 0000000..0472807 --- /dev/null +++ b/app/src/components/Effects/EffectsChainEditor.tsx @@ -0,0 +1,394 @@ +import { + closestCenter, + DndContext, + type DragEndEvent, + KeyboardSensor, + PointerSensor, + useSensor, + useSensors, +} from '@dnd-kit/core'; +import { + arrayMove, + SortableContext, + sortableKeyboardCoordinates, + useSortable, + verticalListSortingStrategy, +} from '@dnd-kit/sortable'; +import { CSS } from '@dnd-kit/utilities'; +import { useQuery } from '@tanstack/react-query'; +import { ChevronDown, ChevronRight, GripVertical, Plus, Power, Trash2 } from 'lucide-react'; +import { useCallback, useMemo, useRef, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Button } from '@/components/ui/button'; +import { Label } from '@/components/ui/label'; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from '@/components/ui/select'; +import { Slider } from '@/components/ui/slider'; +import { apiClient } from '@/lib/api/client'; +import type { AvailableEffect, EffectConfig, EffectPresetResponse } from '@/lib/api/types'; +import { cn } from '@/lib/utils/cn'; + +// Each effect in the chain gets a stable ID for dnd-kit +interface EffectWithId extends EffectConfig { + _id: string; +} + +let nextId = 0; +function makeId() { + return `fx-${++nextId}`; +} + +interface EffectsChainEditorProps { + value: EffectConfig[]; + onChange: (chain: EffectConfig[]) => void; + compact?: boolean; + showPresets?: boolean; +} + +export function EffectsChainEditor({ + value, + onChange, + compact = false, + showPresets = true, +}: EffectsChainEditorProps) { + const { t } = useTranslation(); + const [expandedId, setExpandedId] = useState(null); + + // Maintain stable IDs for each effect across renders. 
+ // We use a ref to map value items to IDs, rebuilding when length changes. + const idsRef = useRef([]); + const items: EffectWithId[] = useMemo(() => { + // Grow ID array if effects were added + while (idsRef.current.length < value.length) { + idsRef.current.push(makeId()); + } + // Shrink if effects were removed + if (idsRef.current.length > value.length) { + idsRef.current = idsRef.current.slice(0, value.length); + } + return value.map((e, i) => ({ ...e, _id: idsRef.current[i] })); + }, [value]); + + const sensors = useSensors( + useSensor(PointerSensor, { activationConstraint: { distance: 5 } }), + useSensor(KeyboardSensor, { coordinateGetter: sortableKeyboardCoordinates }), + ); + + const { data: availableEffects } = useQuery({ + queryKey: ['available-effects'], + queryFn: () => apiClient.getAvailableEffects(), + staleTime: Infinity, + }); + + const { data: presets } = useQuery({ + queryKey: ['effect-presets'], + queryFn: () => apiClient.listEffectPresets(), + staleTime: 30_000, + }); + + const effectsMap = useMemo(() => { + const m = new Map(); + if (availableEffects) { + for (const e of availableEffects.effects) { + m.set(e.type, e); + } + } + return m; + }, [availableEffects]); + + function addEffect(type: string) { + const def = effectsMap.get(type); + if (!def) return; + const params: Record = {}; + for (const [key, p] of Object.entries(def.params)) { + params[key] = p.default; + } + const newEffect: EffectConfig = { type, enabled: true, params }; + const newId = makeId(); + idsRef.current = [...idsRef.current, newId]; + onChange([...value, newEffect]); + setExpandedId(newId); + } + + const removeEffect = useCallback( + (index: number) => { + const removedId = idsRef.current[index]; + idsRef.current = idsRef.current.filter((_, i) => i !== index); + onChange(value.filter((_, i) => i !== index)); + if (expandedId === removedId) setExpandedId(null); + }, + [value, onChange, expandedId], + ); + + const toggleEnabled = useCallback( + (index: number) => { + 
onChange(value.map((e, i) => (i === index ? { ...e, enabled: !e.enabled } : e))); + }, + [value, onChange], + ); + + const updateParam = useCallback( + (index: number, paramName: string, paramValue: number) => { + onChange( + value.map((e, i) => + i === index ? { ...e, params: { ...e.params, [paramName]: paramValue } } : e, + ), + ); + }, + [value, onChange], + ); + + function loadPreset(preset: EffectPresetResponse) { + idsRef.current = preset.effects_chain.map(() => makeId()); + onChange(preset.effects_chain); + setExpandedId(null); + } + + function clearAll() { + idsRef.current = []; + onChange([]); + setExpandedId(null); + } + + function handleDragEnd(event: DragEndEvent) { + const { active, over } = event; + if (!over || active.id === over.id) return; + + const oldIndex = idsRef.current.indexOf(active.id as string); + const newIndex = idsRef.current.indexOf(over.id as string); + if (oldIndex === -1 || newIndex === -1) return; + + idsRef.current = arrayMove(idsRef.current, oldIndex, newIndex); + onChange(arrayMove([...value], oldIndex, newIndex)); + } + + return ( +
+ {/* Preset selector row */} + {showPresets && ( +
+ + + {value.length > 0 && ( + + )} +
+ )} + + {/* Sortable effects chain */} + + i._id)} strategy={verticalListSortingStrategy}> + {items.map((effect, index) => ( + setExpandedId(expandedId === effect._id ? null : effect._id)} + onRemove={() => removeEffect(index)} + onToggleEnabled={() => toggleEnabled(index)} + onUpdateParam={(paramName, paramValue) => updateParam(index, paramName, paramValue)} + /> + ))} + + + + {/* Add effect */} + {availableEffects && ( + + )} +
+ ); +} + +// --------------------------------------------------------------------------- +// Sortable effect item +// --------------------------------------------------------------------------- + +interface SortableEffectItemProps { + id: string; + effect: EffectConfig; + index: number; + effectDef?: AvailableEffect; + isExpanded: boolean; + onToggleExpand: () => void; + onRemove: () => void; + onToggleEnabled: () => void; + onUpdateParam: (paramName: string, paramValue: number) => void; +} + +function SortableEffectItem({ + id, + effect, + effectDef, + isExpanded, + onToggleExpand, + onRemove, + onToggleEnabled, + onUpdateParam, +}: SortableEffectItemProps) { + const { t } = useTranslation(); + const { attributes, listeners, setNodeRef, transform, transition, isDragging } = useSortable({ + id, + }); + + const style = { + transform: CSS.Transform.toString(transform), + transition, + zIndex: isDragging ? 10 : undefined, + }; + + const label = t(`effects.types.${effect.type}.label`, { + defaultValue: effectDef?.label ?? effect.type, + }); + + return ( +
+ {/* Header */} +
+ + + + + + {label} + + + + + +
+ + {/* Params */} + {isExpanded && effectDef && ( +
+ {Object.entries(effectDef.params).map(([paramName, paramDef]) => { + const currentValue = effect.params[paramName] ?? paramDef.default; + return ( +
+
+ + + {currentValue.toFixed( + paramDef.step < 1 ? Math.max(1, -Math.floor(Math.log10(paramDef.step))) : 0, + )} + +
+ onUpdateParam(paramName, v)} + /> +
+ ); + })} +
+ )} +
+ ); +} diff --git a/app/src/components/Effects/GenerationPicker.tsx b/app/src/components/Effects/GenerationPicker.tsx new file mode 100644 index 0000000..d8ab3d2 --- /dev/null +++ b/app/src/components/Effects/GenerationPicker.tsx @@ -0,0 +1,103 @@ +import { ChevronDown, Search } from 'lucide-react'; +import { useMemo, useState } from 'react'; +import { Button } from '@/components/ui/button'; +import { Input } from '@/components/ui/input'; +import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/popover'; +import type { HistoryResponse } from '@/lib/api/types'; +import { useHistory } from '@/lib/hooks/useHistory'; +import { cn } from '@/lib/utils/cn'; + +interface GenerationPickerProps { + selectedId: string | null; + onSelect: (generation: HistoryResponse) => void; + className?: string; +} + +export function GenerationPicker({ selectedId, onSelect, className }: GenerationPickerProps) { + const [open, setOpen] = useState(false); + const [searchQuery, setSearchQuery] = useState(''); + + const { data: historyData } = useHistory({ limit: 50 }); + + const completedGenerations = useMemo(() => { + if (!historyData?.items) return []; + return historyData.items.filter((gen) => gen.status === 'completed'); + }, [historyData]); + + const filtered = useMemo(() => { + if (!searchQuery) return completedGenerations; + const q = searchQuery.toLowerCase(); + return completedGenerations.filter( + (gen) => gen.text.toLowerCase().includes(q) || gen.profile_name.toLowerCase().includes(q), + ); + }, [completedGenerations, searchQuery]); + + const selectedGeneration = completedGenerations.find((g) => g.id === selectedId); + + return ( + + + + + +
+
+ + setSearchQuery(e.target.value)} + className="h-8 pl-7 text-xs" + /> +
+
+
+ {filtered.length === 0 ? ( +
+ No generations found +
+ ) : ( + filtered.map((gen) => ( + + )) + )} +
+
+
+ ); +} diff --git a/app/src/components/EffectsTab/EffectsDetail.tsx b/app/src/components/EffectsTab/EffectsDetail.tsx new file mode 100644 index 0000000..32f78f2 --- /dev/null +++ b/app/src/components/EffectsTab/EffectsDetail.tsx @@ -0,0 +1,435 @@ +import { useQuery, useQueryClient } from '@tanstack/react-query'; +import { Loader2, Play, Save, Trash2, Wand2 } from 'lucide-react'; +import { useEffect, useRef, useState } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { EffectsChainEditor } from '@/components/Effects/EffectsChainEditor'; +import { GenerationPicker } from '@/components/Effects/GenerationPicker'; +import { Button } from '@/components/ui/button'; +import { + Dialog, + DialogContent, + DialogDescription, + DialogFooter, + DialogHeader, + DialogTitle, +} from '@/components/ui/dialog'; +import { Input } from '@/components/ui/input'; +import { Label } from '@/components/ui/label'; +import { Separator } from '@/components/ui/separator'; +import { Textarea } from '@/components/ui/textarea'; +import { useToast } from '@/components/ui/use-toast'; +import { apiClient } from '@/lib/api/client'; +import type { HistoryResponse } from '@/lib/api/types'; +import { useHistory } from '@/lib/hooks/useHistory'; +import { useEffectsStore } from '@/stores/effectsStore'; +import { usePlayerStore } from '@/stores/playerStore'; + +export function EffectsDetail() { + const { t } = useTranslation(); + const selectedPresetId = useEffectsStore((s) => s.selectedPresetId); + const isCreatingNew = useEffectsStore((s) => s.isCreatingNew); + const workingChain = useEffectsStore((s) => s.workingChain); + const setWorkingChain = useEffectsStore((s) => s.setWorkingChain); + const setSelectedPresetId = useEffectsStore((s) => s.setSelectedPresetId); + const setIsCreatingNew = useEffectsStore((s) => s.setIsCreatingNew); + + const [name, setName] = useState(''); + const [description, setDescription] = useState(''); + const [saving, setSaving] = useState(false); + 
const [deleting, setDeleting] = useState(false); + + // "Save as Custom" dialog state + const [saveAsDialogOpen, setSaveAsDialogOpen] = useState(false); + const [saveAsName, setSaveAsName] = useState(''); + const [saveAsDescription, setSaveAsDescription] = useState(''); + + // Preview state + const [previewGenId, setPreviewGenId] = useState(null); + const [previewLoading, setPreviewLoading] = useState(false); + const blobUrlRef = useRef(null); + const setAudioWithAutoPlay = usePlayerStore((s) => s.setAudioWithAutoPlay); + + const { toast } = useToast(); + const queryClient = useQueryClient(); + + // Auto-select the most recent generation as preview source + const { data: historyData } = useHistory({ limit: 1 }); + useEffect(() => { + if (!previewGenId && historyData?.items?.length) { + const first = historyData.items.find((g) => g.status === 'completed'); + if (first) setPreviewGenId(first.id); + } + }, [historyData, previewGenId]); + + const { data: preset } = useQuery({ + queryKey: ['effect-preset', selectedPresetId], + queryFn: () => + selectedPresetId + ? apiClient + .listEffectPresets() + .then((all) => all.find((p) => p.id === selectedPresetId) ?? null) + : null, + enabled: !!selectedPresetId, + staleTime: 30_000, + }); + + // Sync name/description when selecting a preset + useEffect(() => { + if (preset) { + setName(preset.name); + setDescription(preset.description ?? ''); + } else if (isCreatingNew) { + setName(''); + setDescription(''); + } + }, [preset, isCreatingNew]); + + // Cleanup blob URL on unmount + useEffect(() => { + return () => { + if (blobUrlRef.current) { + URL.revokeObjectURL(blobUrlRef.current); + blobUrlRef.current = null; + } + }; + }, []); + + const isEditing = !!selectedPresetId || isCreatingNew; + const isBuiltIn = preset?.is_builtin ?? false; + const presetName = preset + ? preset.is_builtin + ? t(`effects.builtinPresets.${preset.name}.name`, { defaultValue: preset.name }) + : preset.name + : ''; + const presetDescription = preset + ? 
preset.is_builtin + ? t(`effects.builtinPresets.${preset.name}.description`, { + defaultValue: preset.description ?? '', + }) + : preset.description + : ''; + + async function handlePreview() { + if (!previewGenId || workingChain.length === 0) return; + + setPreviewLoading(true); + try { + const blob = await apiClient.previewEffects(previewGenId, workingChain); + + // Revoke old blob URL + if (blobUrlRef.current) { + URL.revokeObjectURL(blobUrlRef.current); + } + + const url = URL.createObjectURL(blob); + blobUrlRef.current = url; + + // Play through the main audio player + setAudioWithAutoPlay(url, `preview-${Date.now()}`, null, 'Effects Preview'); + } catch (error) { + toast({ + title: t('effects.toast.previewFailed'), + description: error instanceof Error ? error.message : t('common.unknownError'), + variant: 'destructive', + }); + } finally { + setPreviewLoading(false); + } + } + + function handleSelectGeneration(gen: HistoryResponse) { + setPreviewGenId(gen.id); + } + + async function handleSaveNew() { + if (!name.trim()) { + toast({ title: t('effects.toast.nameRequired'), variant: 'destructive' }); + return; + } + setSaving(true); + try { + const created = await apiClient.createEffectPreset({ + name: name.trim(), + description: description.trim() || undefined, + effects_chain: workingChain, + }); + queryClient.invalidateQueries({ queryKey: ['effect-presets'] }); + setIsCreatingNew(false); + setSelectedPresetId(created.id); + toast({ + title: t('effects.toast.saved'), + description: t('effects.toast.createdDescription', { name: created.name }), + }); + } catch (error) { + toast({ + title: t('effects.toast.saveFailed'), + description: error instanceof Error ? 
error.message : t('common.unknownError'), + variant: 'destructive', + }); + } finally { + setSaving(false); + } + } + + async function handleSaveExisting() { + if (!selectedPresetId || !name.trim()) return; + setSaving(true); + try { + await apiClient.updateEffectPreset(selectedPresetId, { + name: name.trim(), + description: description.trim() || undefined, + effects_chain: workingChain, + }); + queryClient.invalidateQueries({ queryKey: ['effect-presets'] }); + queryClient.invalidateQueries({ queryKey: ['effect-preset', selectedPresetId] }); + toast({ title: t('effects.toast.updated') }); + } catch (error) { + toast({ + title: t('effects.toast.saveFailed'), + description: error instanceof Error ? error.message : t('common.unknownError'), + variant: 'destructive', + }); + } finally { + setSaving(false); + } + } + + function handleSaveAsNew() { + const sourceName = isBuiltIn ? presetName : name; + setSaveAsName(t('effects.saveAs.suggestedName', { name: sourceName })); + setSaveAsDescription(description); + setSaveAsDialogOpen(true); + } + + async function handleSaveAsConfirm() { + if (!saveAsName.trim()) { + toast({ title: t('effects.toast.nameRequired'), variant: 'destructive' }); + return; + } + setSaving(true); + try { + const created = await apiClient.createEffectPreset({ + name: saveAsName.trim(), + description: saveAsDescription.trim() || undefined, + effects_chain: workingChain, + }); + queryClient.invalidateQueries({ queryKey: ['effect-presets'] }); + setSaveAsDialogOpen(false); + setSelectedPresetId(created.id); + toast({ + title: t('effects.toast.saved'), + description: t('effects.toast.createdDescription', { name: created.name }), + }); + } catch (error) { + toast({ + title: t('effects.toast.saveFailed'), + description: error instanceof Error ? 
error.message : t('common.unknownError'), + variant: 'destructive', + }); + } finally { + setSaving(false); + } + } + + async function handleDelete() { + if (!selectedPresetId) return; + setDeleting(true); + try { + await apiClient.deleteEffectPreset(selectedPresetId); + queryClient.invalidateQueries({ queryKey: ['effect-presets'] }); + setSelectedPresetId(null); + setWorkingChain([]); + toast({ title: t('effects.toast.deleted') }); + } catch (error) { + toast({ + title: t('effects.toast.deleteFailed'), + description: error instanceof Error ? error.message : t('common.unknownError'), + variant: 'destructive', + }); + } finally { + setDeleting(false); + } + } + + if (!isEditing) { + return ( +
+
+ +

{t('effects.placeholder')}

+
+
+ ); + } + + return ( +
+
+

+ {isCreatingNew + ? t('effects.detail.newTitle') + : isBuiltIn + ? presetName + : t('effects.detail.editTitle')} +

+
+ {!isBuiltIn && !isCreatingNew && ( + <> + + + + )} + {isCreatingNew && ( + + )} + {isBuiltIn && ( + + )} +
+
+ +
+ {(isCreatingNew || !isBuiltIn) && ( +
+
+ + setName(e.target.value)} + placeholder={t('effects.fields.namePlaceholder')} + className="h-9" + /> +
+
+ +