Initial commit

This commit is contained in:
2026-04-24 19:18:15 +08:00
commit fbcbe08696
555 changed files with 96692 additions and 0 deletions

6085
tauri/src-tauri/Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,49 @@
# Cargo manifest for the voicebox Tauri backend (src-tauri).
[package]
name = "voicebox"
version = "0.4.5"
description = "A production-quality desktop app for Qwen3-TTS voice cloning and generation"
authors = ["you"]
license = ""
repository = ""
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

# Compile-time dependencies used by build.rs and Tauri's code generation.
[build-dependencies]
tauri-build = { version = "2.0", features = [] }

# Runtime dependencies shared by all platforms.
[dependencies]
tauri = { version = "2.0", features = [] }
tauri-plugin-dialog = "2.0"
tauri-plugin-fs = "2.0"
tauri-plugin-shell = "2.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
tokio = { version = "1", features = ["full"] }
reqwest = { version = "0.12", features = ["blocking", "json"] }
# Audio stack: hound writes WAV files, cpal handles audio device I/O,
# base64 encodes captured audio for transport to the frontend.
hound = "3.5"
base64 = "0.22"
cpal = "0.15"
symphonia = { version = "0.5", features = ["all"] }
scopeguard = "1.2.0"

# macOS-only: system-audio capture (ScreenCaptureKit) and native interop.
[target.'cfg(target_os = "macos")'.dependencies]
screencapturekit = { version = "1", features = ["async"] }
coreaudio-sys = "0.2"
objc = "0.2"
core-foundation-sys = "0.8"

# Windows-only: loopback capture via WASAPI + COM initialization.
[target.'cfg(target_os = "windows")'.dependencies]
wasapi = "0.22"
windows = { version = "0.62", features = ["Win32_Foundation", "Win32_UI_WindowsAndMessaging", "Win32_System_Com"] }

[target.'cfg(target_os = "linux")'.dependencies]
webkit2gtk = "2.0"

# Auto-update support is desktop-only (excluded on mobile targets).
[target.'cfg(not(any(target_os = "android", target_os = "ios")))'.dependencies]
tauri-plugin-updater = "2.0"
tauri-plugin-process = "2.0"

[features]
# This feature is used for production builds or when `devPath` points to the filesystem
custom-protocol = ["tauri/custom-protocol"]

View File

@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<!-- Hardened-runtime entitlements for the voicebox macOS app bundle. -->
<plist version="1.0">
<dict>
	<!-- Allow JIT-compiled code pages. -->
	<key>com.apple.security.cs.allow-jit</key>
	<true/>
	<!-- Allow writable+executable memory without MAP_JIT.
	     NOTE(review): together with disable-library-validation below these
	     relax code-signing protections — confirm which embedded component
	     actually requires each exception. -->
	<key>com.apple.security.cs.allow-unsigned-executable-memory</key>
	<true/>
	<!-- Permit loading libraries not signed by the same team. -->
	<key>com.apple.security.cs.disable-library-validation</key>
	<true/>
	<!-- Microphone access (voice-sample recording). -->
	<key>com.apple.security.device.audio-input</key>
	<true/>
	<!-- Read/write access to files the user picks in open/save dialogs. -->
	<key>com.apple.security.files.user-selected.read-write</key>
	<true/>
</dict>
</plist>

View File

@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<!-- Extra Info.plist entries merged into the bundle: icon identifiers and the
     privacy usage strings macOS shows in permission prompts. -->
<plist version="1.0">
<dict>
	<key>CFBundleIconFile</key>
	<string>voicebox</string>
	<key>CFBundleIconName</key>
	<string>voicebox</string>
	<!-- Shown when the app first requests microphone access.
	     NOTE(review): "voicebox" is lower-case here but "Voicebox" in the
	     screen-capture string below — consider unifying the product name. -->
	<key>NSMicrophoneUsageDescription</key>
	<string>voicebox needs microphone access to record voice samples for voice cloning.</string>
	<!-- Shown when requesting screen recording (used for system-audio capture). -->
	<key>NSScreenCaptureUsageDescription</key>
	<string>Voicebox needs screen capture access to record system audio for voice samples.</string>
</dict>
</plist>

170
tauri/src-tauri/build.rs Normal file
View File

@@ -0,0 +1,170 @@
#[cfg(target_os = "macos")]
use std::process::Command;
/// Build script for the voicebox Tauri app.
///
/// Responsibilities:
/// * macOS: weak-link ScreenCaptureKit and add Swift runtime RPATHs so the
///   binary can still launch on macOS versions lacking the framework.
/// * macOS: compile the `.icon` bundle with `actool` and generate a `.icns`
///   via `sips` + `iconutil`.
/// * All platforms: stub any missing generated resources so Tauri's bundler
///   never fails, then delegate to `tauri_build::build()`.
fn main() {
    // Link Swift runtime libraries for screencapturekit crate
    #[cfg(target_os = "macos")]
    {
        // ScreenCaptureKit does not exist on macOS 11, so weak-link it to
        // allow the app to launch and gate usage at runtime instead.
        println!("cargo:rustc-link-arg=-Wl,-weak_framework,ScreenCaptureKit");
        // Add Swift runtime library paths to RPATH
        println!("cargo:rustc-link-arg=-Wl,-rpath,/usr/lib/swift");
        println!("cargo:rustc-link-arg=-L/usr/lib/swift");
        // Also try Xcode's Swift libraries
        if let Ok(output) = Command::new("xcode-select").arg("-p").output() {
            if output.status.success() {
                let xcode_path = String::from_utf8_lossy(&output.stdout).trim().to_string();
                let swift_lib_path = format!(
                    "{}/Toolchains/XcodeDefault.xctoolchain/usr/lib/swift/macosx",
                    xcode_path
                );
                println!("cargo:rustc-link-arg=-Wl,-rpath,{}", swift_lib_path);
                println!("cargo:rustc-link-arg=-L{}", swift_lib_path);
            }
        }
    }
    // All generated artifacts (compiled icons, stub resources) live in ./gen.
    let project_root = env!("CARGO_MANIFEST_DIR");
    let gen_dir = format!("{}/gen", project_root);
    std::fs::create_dir_all(&gen_dir).expect("Failed to create gen directory");
    // Compile macOS Liquid Glass icon
    #[cfg(target_os = "macos")]
    {
        // voicebox.icon is in tauri/assets/voicebox.icon (one level up from src-tauri)
        let icon_source = format!("{}/../assets/voicebox.icon", project_root);
        if std::path::Path::new(&icon_source).exists() {
            // Re-run this script whenever the icon bundle changes.
            println!("cargo:rerun-if-changed={}", icon_source);
            println!("cargo:rerun-if-changed={}/icon.json", icon_source);
            println!("cargo:rerun-if-changed={}/Assets", icon_source);
            let partial_plist = format!("{}/partial.plist", gen_dir);
            // actool compiles the .icon bundle into Assets.car plus a partial
            // Info.plist fragment.
            let output = Command::new("xcrun")
                .args([
                    "actool",
                    "--compile",
                    &gen_dir,
                    "--output-format",
                    "human-readable-text",
                    "--output-partial-info-plist",
                    &partial_plist,
                    "--app-icon",
                    "voicebox",
                    "--include-all-app-icons",
                    "--target-device",
                    "mac",
                    "--minimum-deployment-target",
                    "11.0",
                    "--platform",
                    "macosx",
                    &icon_source,
                ])
                .output();
            match output {
                Ok(output) => {
                    if !output.status.success() {
                        eprintln!("actool stderr: {}", String::from_utf8_lossy(&output.stderr));
                        eprintln!("actool stdout: {}", String::from_utf8_lossy(&output.stdout));
                        panic!("actool failed to compile icon");
                    }
                    println!("Successfully compiled icon to {}", gen_dir);
                }
                Err(e) => {
                    eprintln!("Failed to execute xcrun actool: {}", e);
                    eprintln!("Make sure you have Xcode Command Line Tools installed");
                    panic!("Icon compilation failed");
                }
            }
            // Generate voicebox.icns from the source PNG via sips + iconutil
            let icns_path = format!("{}/voicebox.icns", gen_dir);
            if !std::path::Path::new(&icns_path).exists() {
                let source_png = format!("{}/Assets/Voicebox.png", icon_source);
                if std::path::Path::new(&source_png).exists() {
                    let iconset_dir = format!("{}/voicebox.iconset", gen_dir);
                    std::fs::create_dir_all(&iconset_dir).ok();
                    // Standard macOS iconset entries: (pixel size, file name).
                    let sizes: &[(u32, &str)] = &[
                        (16, "icon_16x16.png"),
                        (32, "icon_16x16@2x.png"),
                        (32, "icon_32x32.png"),
                        (64, "icon_32x32@2x.png"),
                        (128, "icon_128x128.png"),
                        (256, "icon_128x128@2x.png"),
                        (256, "icon_256x256.png"),
                        (512, "icon_256x256@2x.png"),
                        (512, "icon_512x512.png"),
                        (1024, "icon_512x512@2x.png"),
                    ];
                    for (size, name) in sizes {
                        let dest = format!("{}/{}", iconset_dir, name);
                        // sips resizes the source PNG to each iconset entry;
                        // failures are logged but non-fatal.
                        let status = Command::new("sips")
                            .args([
                                "-z",
                                &size.to_string(),
                                &size.to_string(),
                                &source_png,
                                "--out",
                                &dest,
                            ])
                            .output();
                        if let Ok(out) = status {
                            if !out.status.success() {
                                eprintln!(
                                    "sips failed for {}: {}",
                                    name,
                                    String::from_utf8_lossy(&out.stderr)
                                );
                            }
                        }
                    }
                    // Pack the iconset folder into a single .icns file.
                    let iconutil_output = Command::new("iconutil")
                        .args(["-c", "icns", "-o", &icns_path, &iconset_dir])
                        .output();
                    match iconutil_output {
                        Ok(out) if out.status.success() => {
                            println!("Generated voicebox.icns");
                        }
                        Ok(out) => {
                            eprintln!("iconutil failed: {}", String::from_utf8_lossy(&out.stderr));
                        }
                        Err(e) => {
                            eprintln!("Failed to run iconutil: {}", e);
                        }
                    }
                    // Clean up iconset directory
                    std::fs::remove_dir_all(&iconset_dir).ok();
                }
            }
        } else {
            println!(
                "cargo:warning=Icon source not found at {}, skipping icon compilation",
                icon_source
            );
        }
    }
    // Ensure all resource files exist so Tauri's bundler doesn't fail.
    // On non-macOS these are always stubs. On macOS, actool may not produce
    // Assets.car if the Xcode version doesn't support the .icon format.
    {
        let required = ["Assets.car", "voicebox.icns", "partial.plist"];
        for name in required {
            let path = format!("{}/{}", gen_dir, name);
            if !std::path::Path::new(&path).exists() {
                // An empty placeholder satisfies the bundler's existence check.
                std::fs::write(&path, b"").ok();
            }
        }
    }
    tauri_build::build()
}

View File

@@ -0,0 +1,28 @@
{
"$schema": "https://schema.tauri.app/config/2",
"identifier": "default",
"description": "Default permissions for voicebox",
"platforms": ["linux", "macOS", "windows"],
"windows": ["main"],
"remote": {
"urls": ["http://localhost:*"]
},
"permissions": [
"core:default",
"core:window:default",
"core:window:allow-start-dragging",
"core:webview:default",
"core:webview:allow-internal-toggle-devtools",
"shell:allow-open",
"shell:allow-execute",
"shell:allow-spawn",
"updater:default",
"process:default",
"dialog:default",
"dialog:allow-save",
"dialog:allow-open",
"fs:default",
"fs:read-all",
"fs:write-all"
]
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1 @@
{"default":{"identifier":"default","description":"Default permissions for voicebox","remote":{"urls":["http://localhost:*"]},"local":true,"windows":["main"],"permissions":["core:default","core:window:default","core:window:allow-start-dragging","core:webview:default","core:webview:allow-internal-toggle-devtools","shell:allow-open","shell:allow-execute","shell:allow-spawn","updater:default","process:default","dialog:default","dialog:allow-save","dialog:allow-open","fs:default","fs:read-all","fs:write-all"],"platforms":["linux","macOS","windows"]}}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 90 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 112 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 128 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 10 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.1 KiB

View File

@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Android adaptive launcher icon: mipmap foreground layered over a solid
     color background resource. -->
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
    <foreground android:drawable="@mipmap/ic_launcher_foreground"/>
    <background android:drawable="@color/ic_launcher_background"/>
</adaptive-icon>

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 54 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 54 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 54 KiB

View File

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Background color referenced by the adaptive launcher icon (white). -->
<resources>
    <color name="ic_launcher_background">#fff</color>
</resources>

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 95 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 311 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 296 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 514 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 514 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 831 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 388 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 793 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 793 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 514 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.3 KiB

View File

@@ -0,0 +1,406 @@
use crate::audio_capture::AudioCaptureState;
use base64::{engine::general_purpose, Engine as _};
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use cpal::{SampleFormat, StreamConfig};
use hound::{WavSpec, WavWriter};
use std::io::Cursor;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread;
/// Locate a PulseAudio/PipeWire "monitor" source by shelling out to `pactl`.
///
/// Prefers the monitor belonging to the default sink; otherwise returns the
/// first listed source whose name ends in ".monitor". Returns `None` when
/// `pactl` is unavailable, fails, or reports no usable source.
fn find_monitor_source_via_pactl() -> Option<String> {
    // `pactl list short sources` prints one tab-separated row per source.
    let listing = std::process::Command::new("pactl")
        .args(["list", "short", "sources"])
        .output()
        .ok()
        .filter(|o| o.status.success())?;
    let listing = String::from_utf8_lossy(&listing.stdout).into_owned();

    // Source names live in the second tab-separated column.
    let source_names: Vec<&str> = listing
        .lines()
        .filter_map(|line| line.split('\t').nth(1))
        .collect();

    // Ask for the default sink so we can prefer its dedicated monitor.
    let default_sink = std::process::Command::new("pactl")
        .args(["get-default-sink"])
        .output()
        .ok()
        .filter(|o| o.status.success())
        .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string());

    if let Some(sink) = default_sink {
        let wanted = format!("{}.monitor", sink);
        if source_names.iter().any(|&n| n == wanted) {
            eprintln!(
                "Linux audio capture: Found default sink monitor via pactl: {}",
                wanted
            );
            return Some(wanted);
        }
    }

    // Fallback: accept any monitor source at all.
    let fallback = source_names.iter().find(|n| n.ends_with(".monitor"))?;
    let name = fallback.to_string();
    eprintln!(
        "Linux audio capture: Found monitor source via pactl: {}",
        name
    );
    Some(name)
}
/// Start capturing system audio on Linux using PulseAudio monitor sources.
///
/// On modern Linux with PulseAudio or PipeWire, we first try to detect the
/// monitor source via `pactl` and set the `PULSE_SOURCE` environment variable.
/// This tells PulseAudio's ALSA plugin to use the monitor as the default input
/// source for this process. If `pactl` is unavailable, we fall back to searching
/// cpal device names for "monitor".
///
/// Samples are appended to `state.samples` by the cpal callback until a stop
/// signal arrives (via `state.stop_tx`) or `max_duration_secs` elapses.
/// Errors raised on the capture thread are recorded in `state.error` rather
/// than returned here, because the thread outlives this call; `stop_capture`
/// surfaces them.
pub async fn start_capture(
    state: &AudioCaptureState,
    max_duration_secs: u32,
) -> Result<(), String> {
    // Reset previous samples
    state.reset();
    // Clone the shared handles the capture thread will own.
    let samples = state.samples.clone();
    let sample_rate_arc = state.sample_rate.clone();
    let channels_arc = state.channels.clone();
    let stop_tx = state.stop_tx.clone();
    let error_arc = state.error.clone();
    // Use AtomicBool for stop signal (works across threads)
    let stop_flag = Arc::new(AtomicBool::new(false));
    let stop_flag_clone = stop_flag.clone();
    // Create tokio channel and spawn a task to bridge it to the AtomicBool
    let (tx, mut rx) = tokio::sync::mpsc::channel::<()>(1);
    *stop_tx.lock().unwrap() = Some(tx);
    tokio::spawn(async move {
        // recv() returns on an explicit message OR when the sender is
        // dropped; either way the stop flag is raised.
        rx.recv().await;
        stop_flag_clone.store(true, Ordering::Relaxed);
    });
    // Spawn capture on a dedicated thread
    thread::spawn(move || {
        // Try to set PULSE_SOURCE to a monitor before initializing cpal.
        // This tells PulseAudio/PipeWire's ALSA plugin to use the monitor
        // as the default input source for this process.
        let monitor_source = find_monitor_source_via_pactl();
        if let Some(ref source_name) = monitor_source {
            eprintln!(
                "Linux audio capture: Setting PULSE_SOURCE={}",
                source_name
            );
            std::env::set_var("PULSE_SOURCE", source_name);
        }
        let host = cpal::default_host();
        // Select the capture device.
        // If PULSE_SOURCE was set, the default input device IS the monitor.
        // Otherwise, fall back to searching device names for "monitor".
        let device = if monitor_source.is_some() {
            // PULSE_SOURCE was set — default input IS the monitor now
            match host.default_input_device() {
                Some(d) => {
                    let name = d.name().unwrap_or_default();
                    eprintln!(
                        "Linux audio capture: Using PULSE_SOURCE monitor device: {}",
                        name
                    );
                    d
                }
                None => {
                    let error_msg = "No audio input device available".to_string();
                    eprintln!("{}", error_msg);
                    *error_arc.lock().unwrap() = Some(error_msg);
                    return;
                }
            }
        } else {
            // pactl not available — try to find monitor by name (original approach)
            let mut monitor_device = None;
            if let Ok(devices) = host.input_devices() {
                for d in devices {
                    if let Ok(name) = d.name() {
                        let name_lower = name.to_lowercase();
                        if name_lower.contains("monitor") {
                            eprintln!(
                                "Linux audio capture: Found monitor device by name: {}",
                                name
                            );
                            monitor_device = Some(d);
                            break;
                        }
                    }
                }
            }
            match monitor_device {
                Some(d) => d,
                None => {
                    eprintln!("Linux audio capture: No monitor device found, falling back to default input");
                    match host.default_input_device() {
                        Some(d) => d,
                        None => {
                            let error_msg = "No audio input device available".to_string();
                            eprintln!("{}", error_msg);
                            *error_arc.lock().unwrap() = Some(error_msg);
                            return;
                        }
                    }
                }
            }
        };
        let device_name = device.name().unwrap_or_else(|_| "unknown".to_string());
        eprintln!("Linux audio capture: Using device: {}", device_name);
        // Get supported config
        let config = match device.default_input_config() {
            Ok(c) => c,
            Err(e) => {
                let error_msg = format!("Failed to get default input config: {}", e);
                eprintln!("{}", error_msg);
                *error_arc.lock().unwrap() = Some(error_msg);
                return;
            }
        };
        let sample_rate = config.sample_rate().0;
        let channels = config.channels();
        let sample_format = config.sample_format();
        eprintln!(
            "Linux audio capture: Config - {}Hz, {} channels, format: {:?}",
            sample_rate, channels, sample_format
        );
        // Publish the negotiated format so stop_capture can build the WAV.
        *sample_rate_arc.lock().unwrap() = sample_rate;
        *channels_arc.lock().unwrap() = channels;
        let stream_config = StreamConfig {
            channels,
            sample_rate: cpal::SampleRate(sample_rate),
            buffer_size: cpal::BufferSize::Default,
        };
        let samples_clone = samples.clone();
        let error_arc_clone = error_arc.clone();
        let stop_flag_for_stream = stop_flag.clone();
        let err_fn = {
            let error_arc = error_arc.clone();
            move |err: cpal::StreamError| {
                let error_msg = format!("Stream error: {}", err);
                eprintln!("{}", error_msg);
                *error_arc.lock().unwrap() = Some(error_msg);
            }
        };
        // Build the input stream for whichever sample format the device uses,
        // normalizing everything to f32 samples.
        let stream = match sample_format {
            SampleFormat::F32 => {
                let samples = samples_clone.clone();
                let stop = stop_flag_for_stream.clone();
                device.build_input_stream(
                    &stream_config,
                    move |data: &[f32], _: &cpal::InputCallbackInfo| {
                        if stop.load(Ordering::Relaxed) {
                            return;
                        }
                        let mut guard = samples.lock().unwrap();
                        guard.extend_from_slice(data);
                    },
                    err_fn,
                    None,
                )
            }
            SampleFormat::I16 => {
                let samples = samples_clone.clone();
                let stop = stop_flag_for_stream.clone();
                device.build_input_stream(
                    &stream_config,
                    move |data: &[i16], _: &cpal::InputCallbackInfo| {
                        if stop.load(Ordering::Relaxed) {
                            return;
                        }
                        let mut guard = samples.lock().unwrap();
                        // i16 → f32 by scaling into [-1.0, 1.0)
                        for &s in data {
                            guard.push(s as f32 / 32768.0);
                        }
                    },
                    err_fn,
                    None,
                )
            }
            SampleFormat::U16 => {
                let samples = samples_clone.clone();
                let stop = stop_flag_for_stream.clone();
                device.build_input_stream(
                    &stream_config,
                    move |data: &[u16], _: &cpal::InputCallbackInfo| {
                        if stop.load(Ordering::Relaxed) {
                            return;
                        }
                        let mut guard = samples.lock().unwrap();
                        // u16 (0..=65535) → f32 by scaling then re-centering on 0
                        for &s in data {
                            guard.push((s as f32 / 32768.0) - 1.0);
                        }
                    },
                    err_fn,
                    None,
                )
            }
            _ => {
                let error_msg = format!("Unsupported sample format: {:?}", sample_format);
                eprintln!("{}", error_msg);
                *error_arc_clone.lock().unwrap() = Some(error_msg);
                return;
            }
        };
        let stream = match stream {
            Ok(s) => s,
            Err(e) => {
                let error_msg = format!("Failed to build input stream: {}", e);
                eprintln!("{}", error_msg);
                *error_arc_clone.lock().unwrap() = Some(error_msg);
                return;
            }
        };
        if let Err(e) = stream.play() {
            let error_msg = format!("Failed to start stream: {}", e);
            eprintln!("{}", error_msg);
            *error_arc_clone.lock().unwrap() = Some(error_msg);
            return;
        }
        eprintln!("Linux audio capture: Stream started successfully");
        // Keep thread alive until stop signal
        loop {
            if stop_flag.load(Ordering::Relaxed) {
                break;
            }
            std::thread::sleep(std::time::Duration::from_millis(100));
        }
        // Stream will be dropped here, stopping capture
        eprintln!("Linux audio capture: Stream stopped");
    });
    // Spawn timeout task
    // After max_duration_secs it takes the sender (if a manual stop hasn't
    // already) and signals it, which trips the AtomicBool via the bridge task.
    let stop_tx_clone = state.stop_tx.clone();
    tokio::spawn(async move {
        tokio::time::sleep(tokio::time::Duration::from_secs(max_duration_secs as u64)).await;
        let tx = stop_tx_clone.lock().unwrap().take();
        if let Some(tx) = tx {
            let _ = tx.send(()).await;
        }
    });
    Ok(())
}
/// Stop the Linux capture and return the recording as base64-encoded
/// 16-bit WAV data.
///
/// Returns an error if the capture thread recorded a failure or if no
/// samples were collected.
pub async fn stop_capture(state: &AudioCaptureState) -> Result<String, String> {
    // Signal the capture thread to stop.
    // FIX: `Sender::send` is async; the previous code dropped the un-awaited
    // future, so no message was ever sent — stopping only worked because
    // dropping the sender closes the channel. `try_send` delivers the message
    // synchronously, and dropping `tx` afterwards still closes the channel as
    // a fallback.
    if let Some(tx) = state.stop_tx.lock().unwrap().take() {
        let _ = tx.try_send(());
    }
    // Give the cpal stream a moment to wind down.
    tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
    // Surface any error the capture thread recorded.
    if let Some(error) = state.error.lock().unwrap().as_ref() {
        return Err(error.clone());
    }
    // Snapshot the captured audio and its negotiated format.
    let samples = state.samples.lock().unwrap().clone();
    let sample_rate = *state.sample_rate.lock().unwrap();
    let channels = *state.channels.lock().unwrap();
    if samples.is_empty() {
        return Err(
            "No audio samples captured. Make sure audio is playing on your system during recording."
                .to_string(),
        );
    }
    // Encode as 16-bit WAV, then base64 for transport to the frontend.
    let wav_data = samples_to_wav(&samples, sample_rate, channels)?;
    let base64_data = general_purpose::STANDARD.encode(&wav_data);
    Ok(base64_data)
}
/// Report whether system-audio loopback capture looks possible on this system.
pub fn is_supported() -> bool {
    // Best signal: pactl can name a monitor source.
    if find_monitor_source_via_pactl().is_some() {
        return true;
    }
    // Next best: cpal exposes a device with "monitor" in its name.
    let host = cpal::default_host();
    let has_monitor_by_name = host
        .input_devices()
        .map(|mut devices| {
            devices.any(|d| {
                d.name()
                    .map(|n| n.to_lowercase().contains("monitor"))
                    .unwrap_or(false)
            })
        })
        .unwrap_or(false);
    if has_monitor_by_name {
        return true;
    }
    // Last resort: any default input device at all.
    host.default_input_device().is_some()
}
/// Encode interleaved f32 PCM samples as an in-memory 16-bit integer WAV file.
///
/// Each sample is clamped to [-1.0, 1.0] and scaled into the i16 range.
fn samples_to_wav(samples: &[f32], sample_rate: u32, channels: u16) -> Result<Vec<u8>, String> {
    let spec = WavSpec {
        channels,
        sample_rate,
        bits_per_sample: 16,
        sample_format: hound::SampleFormat::Int,
    };
    let mut bytes = Vec::new();
    let mut writer = WavWriter::new(Cursor::new(&mut bytes), spec)
        .map_err(|e| format!("Failed to create WAV writer: {}", e))?;
    for &sample in samples {
        let quantized = (sample.clamp(-1.0, 1.0) * 32767.0) as i16;
        writer
            .write_sample(quantized)
            .map_err(|e| format!("Failed to write sample: {}", e))?;
    }
    writer
        .finalize()
        .map_err(|e| format!("Failed to finalize WAV: {}", e))?;
    Ok(bytes)
}

View File

@@ -0,0 +1,265 @@
use crate::audio_capture::AudioCaptureState;
use base64::{engine::general_purpose, Engine as _};
use hound::{WavSpec, WavWriter};
use screencapturekit::{
cm::CMSampleBuffer,
shareable_content::SCShareableContent,
stream::{
configuration::SCStreamConfiguration,
content_filter::SCContentFilter,
output_trait::SCStreamOutputTrait,
output_type::SCStreamOutputType,
sc_stream::SCStream,
},
};
use std::io::Cursor;
use std::process::Command;
use std::sync::{Arc, Mutex};
use tokio::sync::mpsc;
/// Start capturing system audio on macOS via ScreenCaptureKit.
///
/// Builds an audio-only `SCStream` filtered on the first display, registers a
/// handler that appends extracted f32 samples to `state.samples`, and spawns
/// a watchdog task that stops the stream after `max_duration_secs` or when a
/// stop signal arrives on `state.stop_tx`.
///
/// Errors if ScreenCaptureKit is unavailable (macOS < 12.3), no display is
/// present, shareable content can't be read, or the stream fails to start.
pub async fn start_capture(
    state: &AudioCaptureState,
    max_duration_secs: u32,
) -> Result<(), String> {
    if !is_supported() {
        return Err("System audio capture requires macOS 12.3 or newer.".to_string());
    }
    // Reset previous samples
    state.reset();
    // Get shareable content
    let content = SCShareableContent::get()
        .map_err(|e| format!("Failed to get shareable content: {}", e))?;
    // Get first display
    let displays = content.displays();
    if displays.is_empty() {
        return Err("No displays available".to_string());
    }
    let display = &displays[0];
    // Create content filter for desktop audio
    let filter = SCContentFilter::create()
        .with_display(display)
        .with_excluding_windows(&[])
        .build();
    // Create stream configuration - audio only
    let mut config = SCStreamConfiguration::default();
    config.set_captures_audio(true);
    config.set_excludes_current_process_audio(false);
    config.set_sample_rate(48000); // Use i32 directly
    config.set_channel_count(2); // Use i32 directly
    // Create stream using builder
    let (tx, mut rx) = mpsc::channel::<()>(1);
    *state.stop_tx.lock().unwrap() = Some(tx);
    let samples = state.samples.clone();
    let sample_rate = state.sample_rate.clone();
    let channels = state.channels.clone();
    // Set sample rate and channels
    // (fixed by the stream configuration above, so they can be recorded now)
    *sample_rate.lock().unwrap() = 48000;
    *channels.lock().unwrap() = 2;
    // Create output handler struct
    // Receives CMSampleBuffers from ScreenCaptureKit and appends the
    // extracted PCM data to the shared sample buffer.
    struct AudioHandler {
        samples: Arc<Mutex<Vec<f32>>>,
    }
    impl SCStreamOutputTrait for AudioHandler {
        fn did_output_sample_buffer(
            &self,
            sample: CMSampleBuffer,
            _type: SCStreamOutputType,
        ) {
            // Only audio buffers matter; extraction failures are dropped
            // silently so one bad buffer doesn't kill the capture.
            if _type == SCStreamOutputType::Audio {
                if let Ok(audio_samples) = extract_audio_samples(sample) {
                    let mut samples_guard = self.samples.lock().unwrap();
                    samples_guard.extend_from_slice(&audio_samples);
                }
            }
        }
    }
    let handler = AudioHandler {
        samples: samples.clone(),
    };
    // Create stream
    let mut stream = SCStream::new(&filter, &config);
    // Add output handler for audio (order: handler, then output_type)
    stream.add_output_handler(handler, SCStreamOutputType::Audio);
    // Store stream reference
    // (stop_capture uses it to stop the stream directly)
    *state.stream.lock().unwrap() = Some(stream.clone());
    stream.start_capture().map_err(|e| format!("Failed to start capture: {}", e))?;
    // Spawn task to stop after max duration
    let stream_clone = stream.clone();
    tokio::spawn(async move {
        tokio::select! {
            _ = tokio::time::sleep(tokio::time::Duration::from_secs(max_duration_secs as u64)) => {
                // Timeout reached
            }
            _ = rx.recv() => {
                // Manual stop (also fires when the sender is dropped)
            }
        }
        let _ = stream_clone.stop_capture();
    });
    Ok(())
}
/// Stop the macOS capture and return the recording as base64-encoded
/// 16-bit WAV data.
///
/// Returns an error when no samples were collected.
pub async fn stop_capture(state: &AudioCaptureState) -> Result<String, String> {
    // Signal the watchdog task to stop.
    // FIX: `Sender::send` is async; the previous code dropped the un-awaited
    // future, so no message was ever sent — stopping only worked because
    // dropping the sender closes the channel. `try_send` delivers the message
    // synchronously, and dropping `tx` afterwards still closes the channel as
    // a fallback.
    if let Some(tx) = state.stop_tx.lock().unwrap().take() {
        let _ = tx.try_send(());
    }
    // Stop the ScreenCaptureKit stream directly if it is still alive.
    if let Some(stream) = state.stream.lock().unwrap().take() {
        let _ = stream.stop_capture();
    }
    // Give in-flight sample-buffer callbacks a moment to drain.
    tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
    // Snapshot the captured audio and its format.
    let samples = state.samples.lock().unwrap().clone();
    let sample_rate = *state.sample_rate.lock().unwrap();
    let channels = *state.channels.lock().unwrap();
    if samples.is_empty() {
        return Err("No audio samples captured".to_string());
    }
    // Encode as 16-bit WAV, then base64 for transport to the frontend.
    let wav_data = samples_to_wav(&samples, sample_rate, channels)?;
    let base64_data = general_purpose::STANDARD.encode(&wav_data);
    Ok(base64_data)
}
/// ScreenCaptureKit-based system audio capture needs macOS 12.3 or newer.
pub fn is_supported() -> bool {
    macos_version_at_least(12, 3)
}

/// Compare the running macOS version (read from `sw_vers -productVersion`)
/// against a required major.minor pair.
///
/// Returns `false` whenever the version cannot be determined (e.g. `sw_vers`
/// is missing or exits unsuccessfully); unparsable components count as 0.
fn macos_version_at_least(required_major: u64, required_minor: u64) -> bool {
    let probe = Command::new("sw_vers").arg("-productVersion").output();
    let output = match probe {
        Ok(out) if out.status.success() => out,
        _ => return false,
    };
    let text = String::from_utf8_lossy(&output.stdout);
    let mut fields = text.trim().split('.');
    let mut next_component = || fields.next().and_then(|p| p.parse::<u64>().ok()).unwrap_or(0);
    let major = next_component();
    let minor = next_component();
    // Lexicographic tuple comparison encodes "newer major, or same major and
    // at-least-as-new minor" — identical to the explicit boolean form.
    (major, minor) >= (required_major, required_minor)
}
/// Pull interleaved f32 PCM samples out of a ScreenCaptureKit sample buffer.
///
/// Handles both layouts the buffer list may carry:
/// * a single buffer of already-interleaved samples, returned as-is;
/// * one buffer per channel (planar), interleaved here as L0, R0, L1, R1, …
///   with zero-padding for any channel shorter than the longest.
///
/// Returns an empty Vec when the buffer carries no audio data, and an error
/// only when the audio buffer list itself cannot be obtained.
fn extract_audio_samples(sample_buffer: CMSampleBuffer) -> Result<Vec<f32>, String> {
    // Use the crate's built-in method to get audio buffer list
    let audio_buffer_list = sample_buffer
        .audio_buffer_list()
        .ok_or_else(|| "Failed to get audio buffer list".to_string())?;
    let buffers: Vec<_> = audio_buffer_list.iter().collect();
    let num_buffers = buffers.len();
    if num_buffers == 0 {
        return Ok(Vec::new());
    }
    // ScreenCaptureKit on macOS provides audio in Float32 format
    // The audio can be either:
    // - Interleaved (1 buffer with L,R,L,R,... samples)
    // - Planar (2 buffers, one for L channel, one for R channel)
    if num_buffers == 1 {
        // Interleaved stereo or mono in a single buffer
        let buffer = &buffers[0];
        let data_bytes = buffer.data();
        let num_samples = data_bytes.len() / std::mem::size_of::<f32>();
        if num_samples > 0 {
            // SAFETY: reinterprets the raw byte buffer as f32s; num_samples is
            // derived from the byte length, so the slice stays in bounds.
            // Assumes the payload really is Float32 and suitably aligned, per
            // the comment above — TODO confirm against the SCK audio format.
            unsafe {
                let data_ptr = data_bytes.as_ptr() as *const f32;
                let data = std::slice::from_raw_parts(data_ptr, num_samples);
                return Ok(data.to_vec());
            }
        }
    } else {
        // Planar format - separate buffer for each channel
        // We need to interleave them: L0, R0, L1, R1, ...
        let mut channel_data: Vec<Vec<f32>> = Vec::new();
        let mut max_samples = 0;
        for buffer in &buffers {
            let data_bytes = buffer.data();
            let num_samples = data_bytes.len() / std::mem::size_of::<f32>();
            if num_samples > 0 {
                // SAFETY: same reinterpretation as the interleaved case; the
                // slice length is bounded by the buffer's byte length.
                unsafe {
                    let data_ptr = data_bytes.as_ptr() as *const f32;
                    let data = std::slice::from_raw_parts(data_ptr, num_samples);
                    channel_data.push(data.to_vec());
                    max_samples = max_samples.max(num_samples);
                }
            }
        }
        // Interleave the channels
        let mut interleaved = Vec::with_capacity(max_samples * num_buffers);
        for i in 0..max_samples {
            for channel in &channel_data {
                if i < channel.len() {
                    interleaved.push(channel[i]);
                } else {
                    interleaved.push(0.0); // Pad with silence if needed
                }
            }
        }
        return Ok(interleaved);
    }
    Ok(Vec::new())
}
/// Serialize interleaved f32 PCM into a 16-bit integer WAV held in memory.
///
/// Samples are clamped to [-1.0, 1.0] before scaling to i16.
fn samples_to_wav(samples: &[f32], sample_rate: u32, channels: u16) -> Result<Vec<u8>, String> {
    let mut out = Vec::new();
    let mut writer = WavWriter::new(
        Cursor::new(&mut out),
        WavSpec {
            channels,
            sample_rate,
            bits_per_sample: 16,
            sample_format: hound::SampleFormat::Int,
        },
    )
    .map_err(|e| format!("Failed to create WAV writer: {}", e))?;
    samples.iter().try_for_each(|&s| {
        let as_i16 = (s.clamp(-1.0, 1.0) * 32767.0) as i16;
        writer
            .write_sample(as_i16)
            .map_err(|e| format!("Failed to write sample: {}", e))
    })?;
    writer
        .finalize()
        .map_err(|e| format!("Failed to finalize WAV: {}", e))?;
    Ok(out)
}

View File

@@ -0,0 +1,47 @@
#[cfg(target_os = "macos")]
mod macos;
#[cfg(target_os = "windows")]
mod windows;
#[cfg(target_os = "linux")]
mod linux;
#[cfg(target_os = "macos")]
pub use macos::*;
#[cfg(target_os = "windows")]
pub use windows::*;
#[cfg(target_os = "linux")]
pub use linux::*;
use std::sync::{Arc, Mutex};
#[cfg(target_os = "macos")]
use screencapturekit::stream::sc_stream::SCStream;
/// Shared state for an in-progress system-audio capture session.
///
/// Every field is wrapped in `Arc<Mutex<…>>` so the platform capture
/// thread/stream callbacks and the async commands can access it concurrently.
pub struct AudioCaptureState {
    // Captured f32 PCM samples, appended by the platform capture backend.
    pub samples: Arc<Mutex<Vec<f32>>>,
    // Sample rate (Hz) of the active capture; defaults to 44100.
    pub sample_rate: Arc<Mutex<u32>>,
    // Channel count of the active capture; defaults to 2.
    pub channels: Arc<Mutex<u16>>,
    // Capacity-1 channel sender used to signal the capture task to stop.
    pub stop_tx: Arc<Mutex<Option<tokio::sync::mpsc::Sender<()>>>>,
    // First error reported by the capture backend, if any.
    pub error: Arc<Mutex<Option<String>>>,
    // Live ScreenCaptureKit stream handle (macOS only).
    #[cfg(target_os = "macos")]
    pub stream: Arc<Mutex<Option<SCStream>>>,
}
impl AudioCaptureState {
pub fn new() -> Self {
Self {
samples: Arc::new(Mutex::new(Vec::new())),
sample_rate: Arc::new(Mutex::new(44100)),
channels: Arc::new(Mutex::new(2)),
stop_tx: Arc::new(Mutex::new(None)),
error: Arc::new(Mutex::new(None)),
#[cfg(target_os = "macos")]
stream: Arc::new(Mutex::new(None)),
}
}
pub fn reset(&self) {
*self.samples.lock().unwrap() = Vec::new();
*self.error.lock().unwrap() = None;
}
}

View File

@@ -0,0 +1,288 @@
use crate::audio_capture::AudioCaptureState;
use base64::{engine::general_purpose, Engine as _};
use hound::{WavSpec, WavWriter};
use std::io::Cursor;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::thread;
use wasapi::*;
use windows::Win32::System::Com::{CoInitializeEx, CoUninitialize, COINIT_MULTITHREADED};
/// Start capturing system (loopback) audio on Windows via WASAPI,
/// accumulating interleaved f32 samples into `state.samples` until either
/// `stop_capture` is invoked or `max_duration_secs` elapses.
///
/// All WASAPI/COM objects are created and used on one dedicated OS thread
/// because they are not `Send`. A tokio task bridges the async stop channel
/// to an `AtomicBool` the capture thread polls; a second task enforces the
/// maximum duration. Errors inside the capture thread are published through
/// `state.error` so `stop_capture` can report them to the caller.
pub async fn start_capture(
    state: &AudioCaptureState,
    max_duration_secs: u32,
) -> Result<(), String> {
    // Reset previous samples
    state.reset();
    let samples = state.samples.clone();
    let sample_rate_arc = state.sample_rate.clone();
    let channels_arc = state.channels.clone();
    let stop_tx = state.stop_tx.clone();
    let error_arc = state.error.clone();
    // Use AtomicBool for stop signal (works with non-Send types)
    let stop_flag = Arc::new(AtomicBool::new(false));
    let stop_flag_clone = stop_flag.clone();
    // Create tokio channel and spawn a task to bridge it to the AtomicBool.
    // Note: `recv()` returns both on a delivered message and when all
    // senders are dropped, so the flag gets set either way.
    let (tx, mut rx) = tokio::sync::mpsc::channel::<()>(1);
    *stop_tx.lock().unwrap() = Some(tx);
    tokio::spawn(async move {
        rx.recv().await;
        stop_flag_clone.store(true, Ordering::Relaxed);
    });
    // Spawn capture task on a dedicated thread (WASAPI COM objects are not Send)
    // All WASAPI objects must be created and used on the same thread
    thread::spawn(move || {
        // Initialize COM for this thread
        unsafe {
            let hr = CoInitializeEx(None, COINIT_MULTITHREADED);
            if hr.is_err() {
                // Fix: record the error so the frontend can see it
                // (previously this path only logged to stderr).
                let error_msg = format!("Failed to initialize COM: {:?}", hr);
                eprintln!("{}", error_msg);
                *error_arc.lock().unwrap() = Some(error_msg);
                return;
            }
        }
        // Ensure COM is uninitialized when thread exits
        let _com_guard = scopeguard::guard((), |_| unsafe {
            CoUninitialize();
        });
        // Initialize WASAPI on this thread.
        // Loopback capture uses the default *render* device.
        let device = match DeviceEnumerator::new()
            .and_then(|enumerator| enumerator.get_default_device(&Direction::Render))
        {
            Ok(d) => d,
            Err(e) => {
                let error_msg = format!("Failed to get audio device: {}", e);
                eprintln!("{}", error_msg);
                *error_arc.lock().unwrap() = Some(error_msg);
                return;
            }
        };
        let mut audio_client = match device.get_iaudioclient() {
            Ok(client) => client,
            Err(e) => {
                let error_msg = format!("Failed to get audio client: {}", e);
                eprintln!("{}", error_msg);
                *error_arc.lock().unwrap() = Some(error_msg);
                return;
            }
        };
        let mix_format = match audio_client.get_mixformat() {
            Ok(format) => format,
            Err(e) => {
                let error_msg = format!("Failed to get mix format: {}", e);
                eprintln!("{}", error_msg);
                *error_arc.lock().unwrap() = Some(error_msg);
                return;
            }
        };
        // Publish the mix format's sample rate and channel count so
        // `stop_capture` can build a correct WAV header.
        let channels = mix_format.get_nchannels() as usize;
        let bytes_per_sample = (mix_format.get_bitspersample() / 8) as usize;
        *sample_rate_arc.lock().unwrap() = mix_format.get_samplespersec();
        *channels_arc.lock().unwrap() = mix_format.get_nchannels();
        // Get device period
        let (_def_period, min_period) = match audio_client.get_device_period() {
            Ok(periods) => periods,
            Err(e) => {
                // Fix: record the error like the other failure paths
                // (previously only logged to stderr, leaving the caller blind).
                let error_msg = format!("Failed to get device period: {}", e);
                eprintln!("{}", error_msg);
                *error_arc.lock().unwrap() = Some(error_msg);
                return;
            }
        };
        // Initialize audio client for loopback with StreamMode
        // For loopback mode: get Render device, initialize with Capture direction
        // This triggers AUDCLNT_STREAMFLAGS_LOOPBACK in the wasapi crate
        let stream_mode = StreamMode::EventsShared {
            autoconvert: true, // Enable automatic format conversion
            buffer_duration_hns: min_period, // Use minimum period
        };
        if let Err(e) = audio_client.initialize_client(&mix_format, &Direction::Capture, &stream_mode) {
            let error_msg = format!("Failed to initialize audio client: {}", e);
            eprintln!("{}", error_msg);
            *error_arc.lock().unwrap() = Some(error_msg);
            return;
        }
        // Set up event handle for EventsShared mode
        let h_event = match audio_client.set_get_eventhandle() {
            Ok(event) => event,
            Err(e) => {
                // Fix: record the error like the other failure paths.
                let error_msg = format!("Failed to set event handle: {}", e);
                eprintln!("{}", error_msg);
                *error_arc.lock().unwrap() = Some(error_msg);
                return;
            }
        };
        let capture_client = match audio_client.get_audiocaptureclient() {
            Ok(client) => client,
            Err(e) => {
                let error_msg = format!("Failed to get capture client: {}", e);
                eprintln!("{}", error_msg);
                *error_arc.lock().unwrap() = Some(error_msg);
                return;
            }
        };
        if let Err(e) = audio_client.start_stream() {
            let error_msg = format!("Failed to start stream: {}", e);
            eprintln!("{}", error_msg);
            *error_arc.lock().unwrap() = Some(error_msg);
            return;
        }
        loop {
            // Check if stop signal was received
            if stop_flag.load(Ordering::Relaxed) {
                break;
            }
            // Try to get available data
            match capture_client.get_next_packet_size() {
                Ok(Some(frames_available)) => {
                    if frames_available > 0 {
                        // Calculate buffer size needed (frames * channels * bytes_per_sample)
                        let buffer_size = frames_available as usize * channels * bytes_per_sample;
                        let mut buffer = vec![0u8; buffer_size];
                        match capture_client.read_from_device(&mut buffer) {
                            Ok((frames_read, _buffer_info)) => {
                                if frames_read > 0 {
                                    // Convert bytes to f32 samples
                                    let samples_read = frames_read as usize * channels;
                                    let mut samples_guard = samples.lock().unwrap();
                                    // Assuming 32-bit float format (the shared-mode
                                    // mix format on Windows is normally f32).
                                    if bytes_per_sample == 4 {
                                        for i in 0..samples_read {
                                            let byte_offset = i * 4;
                                            if byte_offset + 4 <= buffer.len() {
                                                let sample = f32::from_le_bytes([
                                                    buffer[byte_offset],
                                                    buffer[byte_offset + 1],
                                                    buffer[byte_offset + 2],
                                                    buffer[byte_offset + 3],
                                                ]);
                                                samples_guard.push(sample);
                                            }
                                        }
                                    }
                                }
                            }
                            Err(e) => {
                                eprintln!("Error reading from device: {}", e);
                            }
                        }
                    }
                }
                Ok(None) => {
                    // Exclusive mode - handle differently if needed
                }
                Err(e) => {
                    eprintln!("Error getting next packet size: {}", e);
                }
            }
            // Wait for event signal (with timeout to allow checking stop flag)
            if h_event.wait_for_event(100).is_err() {
                // Timeout is expected - just continue to check stop flag
            }
        }
        // Stop the stream when done
        audio_client.stop_stream().ok();
    });
    // Spawn timeout task that auto-stops capture after the max duration.
    let stop_tx_clone = state.stop_tx.clone();
    tokio::spawn(async move {
        tokio::time::sleep(tokio::time::Duration::from_secs(max_duration_secs as u64)).await;
        // Take the sender out of the mutex before awaiting
        let tx = stop_tx_clone.lock().unwrap().take();
        if let Some(tx) = tx {
            let _ = tx.send(()).await;
        }
    });
    Ok(())
}
/// Stop an in-progress loopback capture and return the captured audio as a
/// base64-encoded 16-bit PCM WAV file.
///
/// Returns `Err` if the capture thread recorded an error, or if no samples
/// were captured (e.g. nothing was playing during the recording).
pub async fn stop_capture(state: &AudioCaptureState) -> Result<String, String> {
    // Signal the capture thread to stop. Fix: the original wrote
    // `let _ = tx.send(());`, which builds the async send future without
    // polling it — the message was never delivered, and stopping only
    // worked by accident because dropping the sender closes the channel.
    // `try_send` delivers synchronously (capacity 1, channel is empty).
    if let Some(tx) = state.stop_tx.lock().unwrap().take() {
        let _ = tx.try_send(());
    }
    // Give the capture thread a moment to observe the stop flag and finish
    // its final packet before we snapshot the buffers.
    tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
    // Surface any error the capture thread recorded.
    if let Some(error) = state.error.lock().unwrap().as_ref() {
        return Err(error.clone());
    }
    // Snapshot the captured samples and format parameters.
    let samples = state.samples.lock().unwrap().clone();
    let sample_rate = *state.sample_rate.lock().unwrap();
    let channels = *state.channels.lock().unwrap();
    if samples.is_empty() {
        return Err("No audio samples captured. Make sure audio is playing on your system during recording.".to_string());
    }
    // Encode as WAV, then base64 for transport to the frontend.
    let wav_data = samples_to_wav(&samples, sample_rate, channels)?;
    let base64_data = general_purpose::STANDARD.encode(&wav_data);
    Ok(base64_data)
}
/// Reports whether system-audio loopback capture is available in this
/// build — the WASAPI backend only exists on Windows.
pub fn is_supported() -> bool {
    cfg!(target_os = "windows")
}
/// Encode interleaved f32 samples as an in-memory 16-bit PCM WAV file.
///
/// Each sample is clamped to [-1.0, 1.0] and scaled to i16 before writing.
fn samples_to_wav(samples: &[f32], sample_rate: u32, channels: u16) -> Result<Vec<u8>, String> {
    let spec = WavSpec {
        channels,
        sample_rate,
        bits_per_sample: 16,
        sample_format: hound::SampleFormat::Int,
    };
    let mut buffer = Vec::new();
    let mut writer = WavWriter::new(Cursor::new(&mut buffer), spec)
        .map_err(|e| format!("Failed to create WAV writer: {}", e))?;
    for &sample in samples {
        // Quantize to signed 16-bit range.
        let quantized = (sample.clamp(-1.0, 1.0) * 32767.0) as i16;
        writer
            .write_sample(quantized)
            .map_err(|e| format!("Failed to write sample: {}", e))?;
    }
    writer
        .finalize()
        .map_err(|e| format!("Failed to finalize WAV: {}", e))?;
    Ok(buffer)
}

View File

@@ -0,0 +1,481 @@
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use cpal::{Device, Host, SampleFormat, StreamConfig};
use std::sync::{Arc, Mutex};
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
/// A selectable audio output device, serialized to the frontend.
#[derive(Debug, Clone, serde::Serialize)]
pub struct AudioOutputDevice {
    /// Identifier derived from the device name (cpal exposes no stable IDs).
    pub id: String,
    /// Human-readable device name as reported by cpal.
    pub name: String,
    /// Whether this is the host's current default output device.
    pub is_default: bool,
}
/// Playback state: the cpal host plus a flag used to silence and terminate
/// all currently playing output streams.
pub struct AudioOutputState {
    // cpal host used for device enumeration and stream creation.
    host: Host,
    // When set, active output callbacks emit silence and playback loops exit.
    stop_flag: Arc<AtomicBool>,
}
impl AudioOutputState {
/// Create a playback state bound to the platform's default cpal host,
/// with the stop flag initially cleared.
pub fn new() -> Self {
    let host = cpal::default_host();
    let stop_flag = Arc::new(AtomicBool::new(false));
    Self { host, stop_flag }
}
/// Signal every active output stream to go silent.
///
/// Output callbacks poll `stop_flag`; once set they write silence and the
/// per-device playback loop in `play_to_device` exits, dropping the stream.
pub fn stop_all_playback(&self) -> Result<(), String> {
    eprintln!("stop_all_playback: Setting stop flag");
    self.stop_flag.store(true, Ordering::Relaxed);
    eprintln!("stop_all_playback: Stop flag set - active streams will output silence");
    // Always succeeds; Result kept for a uniform command-style interface.
    Ok(())
}
/// Enumerate all output devices known to the cpal host.
///
/// IDs are derived deterministically from device names (cpal provides no
/// stable identifiers); `is_default` marks the host's current default
/// output device by name comparison.
pub fn list_output_devices(&self) -> Result<Vec<AudioOutputDevice>, String> {
    // Resolve the default device's name once, up front.
    let default_name = self
        .host
        .default_output_device()
        .map(|d| d.name().unwrap_or_default());
    self.host
        .output_devices()
        .map_err(|e| format!("Failed to enumerate output devices: {}", e))?
        .map(|device| {
            let name = device
                .name()
                .map_err(|e| format!("Failed to get device name: {}", e))?;
            // Generate a stable ID from the device name (cpal doesn't provide stable IDs)
            let id = format!("device_{}", name.replace(' ', "_").to_lowercase());
            Ok(AudioOutputDevice {
                id,
                is_default: default_name.as_deref() == Some(name.as_str()),
                name,
            })
        })
        .collect()
}
/// Decode `audio_data` and play it on every output device whose generated
/// ID appears in `device_ids`.
///
/// Device IDs are re-derived from device names here and must stay in sync
/// with the scheme used by `list_output_devices`.
///
/// NOTE(review): `play_to_device` blocks until playback on that device
/// finishes, so multiple devices are played sequentially rather than
/// simultaneously — and the executor thread is blocked despite this being
/// `async`. Confirm whether concurrent playback was intended.
pub async fn play_audio_to_devices(
    &self,
    audio_data: Vec<u8>,
    device_ids: Vec<String>,
) -> Result<(), String> {
    eprintln!("play_audio_to_devices called with {} bytes, {} device IDs", audio_data.len(), device_ids.len());
    eprintln!("Requested device IDs: {:?}", device_ids);
    // Decode audio file (assuming WAV format)
    eprintln!("Decoding audio data...");
    let (samples, sample_rate, channels) = self.decode_wav(&audio_data)?;
    eprintln!("Audio decoded: {} samples, {}Hz, {} channels", samples.len(), sample_rate, channels);
    // Find devices by ID
    eprintln!("Enumerating output devices...");
    let devices: Vec<Device> = self
        .host
        .output_devices()
        .map_err(|e| format!("Failed to enumerate devices: {}", e))?
        .filter_map(|device| {
            // Devices whose name can't be read are silently skipped.
            let name = device.name().ok()?;
            let id = format!("device_{}", name.replace(' ', "_").to_lowercase());
            eprintln!("Found device: {} (id: {})", name, id);
            if device_ids.contains(&id) {
                eprintln!("  -> Matched! Will play to this device");
                Some(device)
            } else {
                None
            }
        })
        .collect();
    if devices.is_empty() {
        eprintln!("ERROR: No matching devices found");
        return Err("No matching devices found".to_string());
    }
    eprintln!("Playing to {} device(s)", devices.len());
    // Stop any existing playback first
    self.stop_all_playback().ok();
    // Reset stop flag for new playback
    self.stop_flag.store(false, Ordering::Relaxed);
    // Play to each device
    for (i, device) in devices.iter().enumerate() {
        let device_name = device.name().unwrap_or_else(|_| "unknown".to_string());
        eprintln!("Playing to device {}/{}: {}", i + 1, devices.len(), device_name);
        self.play_to_device(device, samples.clone(), sample_rate, channels, self.stop_flag.clone())
            .map_err(|e| format!("Failed to play to device {}: {}", device_name, e))?;
        eprintln!("Successfully started playback on device: {}", device_name);
    }
    eprintln!("play_audio_to_devices completed successfully");
    Ok(())
}
/// Decode an in-memory audio file into interleaved f32 samples.
///
/// Despite the name, symphonia's probe is format-agnostic, so any
/// container symphonia supports will decode; WAV is simply the expected
/// common case. Returns `(samples, sample_rate, channels)`, where
/// `samples` is frame-major interleaved (frame 0 ch 0, frame 0 ch 1, ...).
fn decode_wav(&self, data: &[u8]) -> Result<(Vec<f32>, u32, u16), String> {
    use symphonia::core::formats::FormatOptions;
    use symphonia::core::io::MediaSourceStream;
    use symphonia::core::meta::MetadataOptions;
    eprintln!("decode_wav: Creating MediaSourceStream from {} bytes", data.len());
    // The stream takes ownership of its source, so the input is copied here.
    let mss = MediaSourceStream::new(
        Box::new(std::io::Cursor::new(data.to_vec())),
        Default::default(),
    );
    eprintln!("decode_wav: Probing audio format...");
    let mut format = symphonia::default::get_probe()
        .format(
            &Default::default(),
            mss,
            &FormatOptions::default(),
            &MetadataOptions::default(),
        )
        .map_err(|e| {
            eprintln!("decode_wav: Failed to probe audio: {}", e);
            format!("Failed to probe audio: {}", e)
        })?
        .format;
    eprintln!("decode_wav: Audio format probed successfully");
    eprintln!("decode_wav: Finding audio track...");
    // Pick the first track that has a real codec attached.
    let track = format
        .tracks()
        .iter()
        .find(|t| t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL)
        .ok_or_else(|| {
            eprintln!("decode_wav: No audio track found");
            "No audio track found".to_string()
        })?;
    let sample_rate = track
        .codec_params
        .sample_rate
        .ok_or_else(|| {
            eprintln!("decode_wav: No sample rate found in track");
            "No sample rate found".to_string()
        })?;
    let channels = track
        .codec_params
        .channels
        .ok_or_else(|| {
            eprintln!("decode_wav: No channels found in track");
            "No channels found".to_string()
        })?
        .count() as u16;
    eprintln!("decode_wav: Track info - sample_rate: {}, channels: {}", sample_rate, channels);
    eprintln!("decode_wav: Creating decoder...");
    let mut decoder = symphonia::default::get_codecs()
        .make(&track.codec_params, &Default::default())
        .map_err(|e| {
            eprintln!("decode_wav: Failed to create decoder: {}", e);
            format!("Failed to create decoder: {}", e)
        })?;
    eprintln!("decode_wav: Decoder created successfully");
    let mut samples = Vec::new();
    let mut packet_count = 0;
    eprintln!("decode_wav: Starting packet decoding loop...");
    loop {
        // Any `next_packet` error — including the normal end-of-stream I/O
        // error — terminates the loop rather than failing the decode.
        let packet = match format.next_packet() {
            Ok(packet) => packet,
            Err(e) => {
                eprintln!("decode_wav: End of stream or error: {:?}", e);
                break;
            }
        };
        packet_count += 1;
        let decoded = decoder
            .decode(&packet)
            .map_err(|e| {
                eprintln!("decode_wav: Decode error on packet {}: {}", packet_count, e);
                format!("Decode error: {}", e)
            })?;
        // Convert to f32 samples by matching on the buffer type
        use symphonia::core::audio::{AudioBufferRef, Signal};
        use symphonia::core::conv::FromSample;
        let spec = *decoded.spec();
        let num_channels = spec.channels.count();
        let num_frames = decoded.frames();
        eprintln!("decode_wav: Packet {} - {} frames, {} channels", packet_count, num_frames, num_channels);
        // Interleave samples from all channels
        for frame_idx in 0..num_frames {
            for ch in 0..num_channels {
                let sample_f32 = match &decoded {
                    AudioBufferRef::U8(buf) => f32::from_sample(buf.chan(ch)[frame_idx]),
                    AudioBufferRef::U16(buf) => f32::from_sample(buf.chan(ch)[frame_idx]),
                    AudioBufferRef::U24(buf) => f32::from_sample(buf.chan(ch)[frame_idx]),
                    AudioBufferRef::U32(buf) => f32::from_sample(buf.chan(ch)[frame_idx]),
                    AudioBufferRef::S8(buf) => f32::from_sample(buf.chan(ch)[frame_idx]),
                    AudioBufferRef::S16(buf) => f32::from_sample(buf.chan(ch)[frame_idx]),
                    AudioBufferRef::S24(buf) => f32::from_sample(buf.chan(ch)[frame_idx]),
                    AudioBufferRef::S32(buf) => f32::from_sample(buf.chan(ch)[frame_idx]),
                    AudioBufferRef::F32(buf) => buf.chan(ch)[frame_idx],
                    AudioBufferRef::F64(buf) => buf.chan(ch)[frame_idx] as f32,
                };
                samples.push(sample_f32);
            }
        }
    }
    eprintln!("decode_wav: Decoded {} packets, total {} samples", packet_count, samples.len());
    eprintln!("decode_wav: Returning sample_rate={}, channels={}", sample_rate, channels);
    Ok((samples, sample_rate, channels))
}
/// Build an output stream on `device` and play `samples` (interleaved f32
/// at `sample_rate`/`channels`) to completion.
///
/// The samples are resampled and channel-remapped to the device's default
/// output config, then streamed via a cpal callback. This call BLOCKS the
/// current thread until the buffer is exhausted or `stop_flag` is set,
/// because dropping the cpal `Stream` stops output immediately.
fn play_to_device(
    &self,
    device: &Device,
    samples: Vec<f32>,
    sample_rate: u32,
    channels: u16,
    stop_flag: Arc<AtomicBool>,
) -> Result<(), String> {
    let device_name = device.name().unwrap_or_else(|_| "unknown".to_string());
    eprintln!("play_to_device: Starting playback to device: {}", device_name);
    eprintln!("play_to_device: Input - {} samples, {}Hz, {} channels", samples.len(), sample_rate, channels);
    let config = device
        .default_output_config()
        .map_err(|e| format!("Failed to get default config: {}", e))?;
    // Prepare samples for the device's format
    let device_sample_rate = config.sample_rate().0;
    let device_channels = config.channels();
    let device_sample_format = config.sample_format();
    eprintln!("play_to_device: Device config - {}Hz, {} channels, format: {:?}",
        device_sample_rate, device_channels, device_sample_format);
    // Resample if needed (nearest-neighbor; see `resample` — no interpolation)
    let resampled = if device_sample_rate != sample_rate {
        eprintln!("play_to_device: Resampling from {}Hz to {}Hz", sample_rate, device_sample_rate);
        let result = self.resample(&samples, sample_rate, device_sample_rate);
        eprintln!("play_to_device: Resampled {} samples to {} samples", samples.len(), result.len());
        result
    } else {
        eprintln!("play_to_device: No resampling needed");
        samples
    };
    // Interleave/convert channels if needed
    eprintln!("play_to_device: Interleaving channels from {} to {} channels", channels, device_channels);
    let interleaved = self.interleave_channels(&resampled, channels, device_channels);
    eprintln!("play_to_device: Interleaved to {} samples", interleaved.len());
    // Create shared buffer for playback; `position` is the read cursor
    // advanced by the audio callback.
    let buffer: Arc<Mutex<Vec<f32>>> = Arc::new(Mutex::new(interleaved));
    let position = Arc::new(AtomicUsize::new(0));
    let buffer_clone = buffer.clone();
    let position_clone = position.clone();
    let err_fn = |err| eprintln!("Playback error: {}", err);
    let stream_config = StreamConfig {
        channels: device_channels,
        sample_rate: cpal::SampleRate(device_sample_rate),
        buffer_size: cpal::BufferSize::Default,
    };
    let stop_flag_clone = stop_flag.clone();
    // One callback per sample format; each copies from the shared buffer,
    // advances the shared read position, and pads with silence once the
    // buffer is exhausted.
    let stream = match config.sample_format() {
        SampleFormat::F32 => {
            let buffer = buffer_clone.clone();
            let pos = position_clone.clone();
            device
                .build_output_stream(
                    &stream_config,
                    move |data: &mut [f32], _: &cpal::OutputCallbackInfo| {
                        // Check stop flag - if set, output silence
                        if stop_flag_clone.load(Ordering::Relaxed) {
                            for sample in data.iter_mut() {
                                *sample = 0.0;
                            }
                            return;
                        }
                        let mut idx = pos.load(Ordering::Relaxed);
                        let buf = buffer.lock().unwrap();
                        for sample in data.iter_mut() {
                            if idx < buf.len() {
                                *sample = buf[idx];
                                idx += 1;
                            } else {
                                *sample = 0.0;
                            }
                        }
                        pos.store(idx, Ordering::Relaxed);
                    },
                    err_fn,
                    None,
                )
                .map_err(|e| format!("Failed to build stream: {}", e))?
        }
        SampleFormat::I16 => {
            let buffer = buffer_clone.clone();
            let pos = position_clone.clone();
            device
                .build_output_stream(
                    &stream_config,
                    move |data: &mut [i16], _: &cpal::OutputCallbackInfo| {
                        // Check stop flag - if set, output silence
                        if stop_flag_clone.load(Ordering::Relaxed) {
                            for sample in data.iter_mut() {
                                *sample = 0;
                            }
                            return;
                        }
                        let mut idx = pos.load(Ordering::Relaxed);
                        let buf = buffer.lock().unwrap();
                        for sample in data.iter_mut() {
                            if idx < buf.len() {
                                // Scale f32 [-1, 1] to signed 16-bit range.
                                *sample = (buf[idx] * 32767.0) as i16;
                                idx += 1;
                            } else {
                                *sample = 0;
                            }
                        }
                        pos.store(idx, Ordering::Relaxed);
                    },
                    err_fn,
                    None,
                )
                .map_err(|e| format!("Failed to build stream: {}", e))?
        }
        SampleFormat::U16 => {
            let buffer = buffer_clone.clone();
            let pos = position_clone.clone();
            device
                .build_output_stream(
                    &stream_config,
                    move |data: &mut [u16], _: &cpal::OutputCallbackInfo| {
                        // Check stop flag - if set, output silence
                        // (for unsigned 16-bit, 32768 is the zero-amplitude midpoint)
                        if stop_flag_clone.load(Ordering::Relaxed) {
                            for sample in data.iter_mut() {
                                *sample = 32768;
                            }
                            return;
                        }
                        let mut idx = pos.load(Ordering::Relaxed);
                        let buf = buffer.lock().unwrap();
                        for sample in data.iter_mut() {
                            if idx < buf.len() {
                                // Offset f32 [-1, 1] into the unsigned range.
                                *sample = ((buf[idx] + 1.0) * 32767.5) as u16;
                                idx += 1;
                            } else {
                                *sample = 32768;
                            }
                        }
                        pos.store(idx, Ordering::Relaxed);
                    },
                    err_fn,
                    None,
                )
                .map_err(|e| format!("Failed to build stream: {}", e))?
        }
        _ => return Err("Unsupported sample format".to_string()),
    };
    eprintln!("play_to_device: Starting stream playback...");
    stream.play().map_err(|e| {
        eprintln!("play_to_device: Failed to play stream: {}", e);
        format!("Failed to play stream: {}", e)
    })?;
    eprintln!("play_to_device: Stream started successfully");
    // Keep the stream alive until playback finishes.
    // Previously the stream was dropped immediately on function return,
    // causing silent playback (cpal stops output when its Stream is dropped).
    let total_samples = {
        buffer.lock().unwrap().len()
    };
    loop {
        let pos = position.load(std::sync::atomic::Ordering::Relaxed);
        if pos >= total_samples || stop_flag.load(std::sync::atomic::Ordering::Relaxed) {
            break;
        }
        std::thread::sleep(std::time::Duration::from_millis(10));
    }
    // stream is dropped here, after audio has finished playing
    drop(stream);
    eprintln!("play_to_device: Function completed successfully");
    Ok(())
}
/// Nearest-neighbor resampling from `from_rate` to `to_rate`.
///
/// Each output sample copies the input sample at the proportional source
/// position; no interpolation or filtering is applied. Operates on the raw
/// (possibly interleaved) sample vector.
fn resample(&self, samples: &[f32], from_rate: u32, to_rate: u32) -> Vec<f32> {
    if from_rate == to_rate {
        return samples.to_vec();
    }
    let ratio = to_rate as f64 / from_rate as f64;
    let target_len = (samples.len() as f64 * ratio) as usize;
    (0..target_len)
        .map(|i| {
            // Map the output index back onto the source timeline.
            let src_idx = (i as f64 / ratio) as usize;
            samples.get(src_idx).copied().unwrap_or(0.0)
        })
        .collect()
}
/// Remap interleaved samples from `src_channels` to `dst_channels`.
///
/// Extra destination channels repeat the last source channel; source
/// channels beyond `dst_channels` are dropped. Any trailing partial frame
/// in the input is ignored.
fn interleave_channels(
    &self,
    samples: &[f32],
    src_channels: u16,
    dst_channels: u16,
) -> Vec<f32> {
    if src_channels == dst_channels {
        return samples.to_vec();
    }
    let src = src_channels as usize;
    let frames = samples.len() / src;
    let mut remapped = Vec::with_capacity(frames * dst_channels as usize);
    for frame in 0..frames {
        for ch in 0..dst_channels {
            // Clamp the destination channel onto a valid source channel.
            let src_ch = ch.min(src_channels - 1) as usize;
            let flat = frame * src + src_ch;
            remapped.push(samples.get(flat).copied().unwrap_or(0.0));
        }
    }
    remapped
}
}
impl Default for AudioOutputState {
fn default() -> Self {
Self::new()
}
}

View File

@@ -0,0 +1 @@
pub mod audio_capture;

926
tauri/src-tauri/src/main.rs Normal file
View File

@@ -0,0 +1,926 @@
// Prevents additional console window on Windows in release, DO NOT REMOVE!!
#![cfg_attr(not(debug_assertions), windows_subsystem = "windows")]
mod audio_capture;
mod audio_output;
use std::sync::Mutex;
use tauri::{command, State, Manager, WindowEvent, Emitter, Listener, RunEvent};
use tauri_plugin_shell::ShellExt;
use tokio::sync::mpsc;
// Port used by older app versions; orphaned servers found here are killed at startup.
const LEGACY_PORT: u16 = 8000;
// Port the managed voicebox-server sidecar listens on.
const SERVER_PORT: u16 = 17493;
/// Locate a voicebox-server process listening on `port` (Windows only).
///
/// Asks PowerShell's `Get-NetTCPConnection` for the PID owning the port,
/// then confirms via `tasklist` that the process image is a voicebox
/// binary. The caller is responsible for checking port occupancy first
/// (e.g. `TcpStream::connect_timeout`). PowerShell replaces the previous
/// `netstat -ano` approach, which failed on systems with corrupted system
/// DLLs (see #277).
#[cfg(windows)]
fn find_voicebox_pid_on_port(port: u16) -> Option<u32> {
    use std::process::Command;
    // Built-in cmdlet — no dependency on netstat.exe.
    let ps_script = format!(
        "Get-NetTCPConnection -LocalPort {} -State Listen -ErrorAction SilentlyContinue | Select-Object -ExpandProperty OwningProcess",
        port
    );
    let output = Command::new("powershell")
        .args(["-NoProfile", "-Command", &ps_script])
        .output()
        .ok()?;
    let stdout = String::from_utf8_lossy(&output.stdout);
    stdout
        .lines()
        .filter_map(|line| line.trim().parse::<u32>().ok())
        .find(|&pid| {
            // Accept the PID only if tasklist reports a voicebox image name.
            Command::new("tasklist")
                .args(["/FI", &format!("PID eq {}", pid), "/FO", "CSV", "/NH"])
                .output()
                .map(|t| String::from_utf8_lossy(&t.stdout).to_lowercase().contains("voicebox"))
                .unwrap_or(false)
        })
}
/// Check whether a Voicebox server is responding on the given port.
///
/// Issues an HTTP GET to `/health` and returns `true` only when the
/// response is valid JSON matching the Voicebox `HealthResponse` schema:
/// `status` must equal `"healthy"`, and both `model_loaded` and
/// `gpu_available` must be present as booleans. This guards against
/// mistaking an unrelated service with its own `/health` endpoint for a
/// Voicebox server.
#[allow(dead_code)] // Used in platform-specific cfg blocks
fn check_health(port: u16) -> bool {
    let url = format!("http://127.0.0.1:{}/health", port);
    // Each fallible step short-circuits to `false` via an early return.
    let client = match reqwest::blocking::Client::builder()
        .timeout(std::time::Duration::from_secs(3))
        .build()
    {
        Ok(c) => c,
        Err(_) => return false,
    };
    let resp = match client.get(&url).send() {
        Ok(r) => r,
        Err(_) => return false,
    };
    if !resp.status().is_success() {
        return false;
    }
    let body: serde_json::Value = match resp.json() {
        Ok(b) => b,
        Err(_) => return false,
    };
    // Validate the Voicebox-specific schema fields.
    body.get("status").and_then(|v| v.as_str()) == Some("healthy")
        && body.get("model_loaded").map(|v| v.is_boolean()).unwrap_or(false)
        && body.get("gpu_available").map(|v| v.is_boolean()).unwrap_or(false)
}
/// State for the managed voicebox-server sidecar process.
struct ServerState {
    // Child handle for a sidecar spawned by this app instance, if any.
    child: Mutex<Option<tauri_plugin_shell::process::CommandChild>>,
    // PID of an externally discovered/reused voicebox-server, if any.
    server_pid: Mutex<Option<u32>>,
    // NOTE(review): presumably keeps the server alive when the window
    // closes — set and read outside this chunk; confirm against the
    // window-close handler.
    keep_running_on_close: Mutex<bool>,
    // Custom models directory override; `None` means the default location.
    models_dir: Mutex<Option<String>>,
}
#[command]
async fn start_server(
app: tauri::AppHandle,
state: State<'_, ServerState>,
remote: Option<bool>,
models_dir: Option<String>,
) -> Result<String, String> {
// Store models_dir for use on restart (empty string means reset to default)
if let Some(ref dir) = models_dir {
if dir.is_empty() {
*state.models_dir.lock().unwrap() = None;
} else {
*state.models_dir.lock().unwrap() = Some(dir.clone());
}
}
// Check if server is already running (managed by this app instance)
if state.child.lock().unwrap().is_some() {
return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
}
// Check if a voicebox server is already running on our port (from previous session with keep_running=true,
// or an externally started server e.g. via `python`, `uvicorn`, Docker, etc.)
#[cfg(unix)]
{
use std::process::Command;
if let Ok(output) = Command::new("lsof")
.args(["-i", &format!(":{}", SERVER_PORT), "-sTCP:LISTEN"])
.output()
{
let output_str = String::from_utf8_lossy(&output.stdout);
for line in output_str.lines().skip(1) {
let parts: Vec<&str> = line.split_whitespace().collect();
if parts.len() >= 2 {
let command = parts[0];
let pid_str = parts[1];
if command.contains("voicebox") {
if let Ok(pid) = pid_str.parse::<u32>() {
println!("Found existing voicebox-server on port {} (PID: {}), reusing it", SERVER_PORT, pid);
// Store the PID so we can kill it on exit if needed
*state.server_pid.lock().unwrap() = Some(pid);
return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
}
} else {
// Process name doesn't contain "voicebox" — could be an external
// Python/uvicorn/Docker server. Verify via HTTP health check.
println!("Port {} in use by '{}' (PID: {}), checking if it's a Voicebox server...", SERVER_PORT, command, pid_str);
if check_health(SERVER_PORT) {
println!("Health check passed — reusing external server on port {}", SERVER_PORT);
return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
}
println!("Health check failed — port is occupied by a non-Voicebox process");
return Err(format!(
"Port {} is already in use by another application ({}). \
Close it or change the Voicebox server port.",
SERVER_PORT, command
));
}
}
}
}
}
#[cfg(windows)]
{
use std::net::TcpStream;
if TcpStream::connect_timeout(
&format!("127.0.0.1:{}", SERVER_PORT).parse().unwrap(),
std::time::Duration::from_secs(1),
).is_ok() {
// Port is in use — check if it's a voicebox process by name first
if let Some(pid) = find_voicebox_pid_on_port(SERVER_PORT) {
println!("Found existing voicebox-server on port {} (PID: {}), reusing it", SERVER_PORT, pid);
*state.server_pid.lock().unwrap() = Some(pid);
return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
}
// Process name doesn't match — could be an external Python/Docker server.
// Verify via HTTP health check before giving up.
println!("Port {} in use by unknown process, checking if it's a Voicebox server...", SERVER_PORT);
if check_health(SERVER_PORT) {
println!("Health check passed — reusing external server on port {}", SERVER_PORT);
return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
}
return Err(format!(
"Port {} is already in use by another application. \
Close the other application or change the Voicebox port.",
SERVER_PORT
));
}
}
// Kill any orphaned voicebox-server from previous session on legacy port 8000
// This handles upgrades from older versions that used a fixed port
#[cfg(unix)]
{
use std::process::Command;
if let Ok(output) = Command::new("lsof")
.args(["-i", &format!(":{}", LEGACY_PORT), "-sTCP:LISTEN"])
.output()
{
let output_str = String::from_utf8_lossy(&output.stdout);
for line in output_str.lines().skip(1) {
let parts: Vec<&str> = line.split_whitespace().collect();
if parts.len() >= 2 {
let command = parts[0];
let pid_str = parts[1];
if command.contains("voicebox") {
if let Ok(pid) = pid_str.parse::<i32>() {
println!("Found orphaned voicebox-server on legacy port {} (PID: {}, CMD: {}), killing it...", LEGACY_PORT, pid, command);
let _ = Command::new("kill")
.args(["-9", "--", &format!("-{}", pid)])
.output();
let _ = Command::new("kill")
.args(["-9", &pid.to_string()])
.output();
}
} else {
println!("Legacy port {} is in use by non-voicebox process: {} (PID: {}), not killing", LEGACY_PORT, command, pid_str);
}
}
}
}
}
#[cfg(windows)]
{
use std::net::TcpStream;
if TcpStream::connect_timeout(
&format!("127.0.0.1:{}", LEGACY_PORT).parse().unwrap(),
std::time::Duration::from_secs(1),
).is_ok() {
if let Some(pid) = find_voicebox_pid_on_port(LEGACY_PORT) {
println!("Found orphaned voicebox-server on legacy port {} (PID: {}), killing it...", LEGACY_PORT, pid);
let _ = std::process::Command::new("taskkill")
.args(["/PID", &pid.to_string(), "/T", "/F"])
.output();
}
}
}
// Brief wait for port to be released
std::thread::sleep(std::time::Duration::from_millis(200));
// Get app data directory
let data_dir = app
.path()
.app_data_dir()
.map_err(|e| format!("Failed to get app data dir: {}", e))?;
// Ensure data directory exists
std::fs::create_dir_all(&data_dir)
.map_err(|e| format!("Failed to create data dir: {}", e))?;
println!("=================================================================");
println!("Starting voicebox-server sidecar");
println!("Data directory: {:?}", data_dir);
println!("Remote mode: {}", remote.unwrap_or(false));
// Check for CUDA backend in data directory (onedir layout: backends/cuda/)
let cuda_binary = {
let cuda_dir = data_dir.join("backends").join("cuda");
let cuda_name = if cfg!(windows) {
"voicebox-server-cuda.exe"
} else {
"voicebox-server-cuda"
};
let exe_path = cuda_dir.join(cuda_name);
if exe_path.exists() {
println!("Found CUDA backend at {:?}", cuda_dir);
// Version check: run --version from the onedir directory so
// PyInstaller can find its support files for the fast --version path
let app_version = app.config().version.clone().unwrap_or_default();
let version_ok = match std::process::Command::new(&exe_path)
.arg("--version")
.current_dir(&cuda_dir)
.output()
{
Ok(output) => {
// Output format: "voicebox-server X.Y.Z\n"
let version_str = String::from_utf8_lossy(&output.stdout);
let binary_version = version_str.trim().split_whitespace().last().unwrap_or("");
if binary_version == app_version {
println!("CUDA binary version {} matches app version", binary_version);
true
} else {
println!(
"CUDA binary version mismatch: binary={}, app={}. Falling back to CPU.",
binary_version, app_version
);
false
}
}
Err(e) => {
println!("Failed to check CUDA binary version: {}. Falling back to CPU.", e);
false
}
};
if version_ok {
Some(exe_path)
} else {
None
}
} else {
println!("No CUDA backend found, using bundled CPU binary");
None
}
};
let sidecar_result = app.shell().sidecar("voicebox-server");
let mut sidecar = match sidecar_result {
Ok(s) => s,
Err(e) => {
eprintln!("Failed to get sidecar: {}", e);
// In dev mode, check if the server is already running (started manually)
#[cfg(debug_assertions)]
{
eprintln!("Dev mode: Checking if server is already running on port {}...", SERVER_PORT);
// Try to connect to the server port
use std::net::TcpStream;
if TcpStream::connect_timeout(
&format!("127.0.0.1:{}", SERVER_PORT).parse().unwrap(),
std::time::Duration::from_secs(1),
).is_ok() {
println!("Found server already running on port {}", SERVER_PORT);
return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
}
eprintln!("");
eprintln!("=================================================================");
eprintln!("DEV MODE: No server found on port {}", SERVER_PORT);
eprintln!("");
eprintln!("Start the Python server in a separate terminal:");
eprintln!(" bun run dev:server");
eprintln!("=================================================================");
eprintln!("");
}
return Err(format!("Failed to start server. In dev mode, run 'bun run dev:server' in a separate terminal."));
}
};
println!("Sidecar command created successfully");
// Build common args
let data_dir_str = data_dir
.to_str()
.ok_or_else(|| "Invalid data dir path".to_string())?
.to_string();
let port_str = SERVER_PORT.to_string();
let parent_pid_str = std::process::id().to_string();
let is_remote = remote.unwrap_or(false);
// Resolve the custom models directory from the parameter or stored state
let effective_models_dir = models_dir.or_else(|| state.models_dir.lock().unwrap().clone());
if let Some(ref dir) = effective_models_dir {
println!("Custom models directory: {}", dir);
}
// If CUDA binary exists, launch it from the onedir directory.
// .current_dir() is critical: PyInstaller onedir expects all DLLs and
// support files (nvidia/, _internal/, etc.) relative to the exe.
let spawn_result = if let Some(ref cuda_path) = cuda_binary {
let cuda_dir = cuda_path.parent().unwrap();
println!("Launching CUDA backend: {:?} (cwd: {:?})", cuda_path, cuda_dir);
let mut cmd = app.shell().command(cuda_path.to_str().unwrap());
cmd = cmd.current_dir(cuda_dir);
cmd = cmd.args(["--data-dir", &data_dir_str, "--port", &port_str, "--parent-pid", &parent_pid_str]);
if is_remote {
cmd = cmd.args(["--host", "0.0.0.0"]);
}
if let Some(ref dir) = effective_models_dir {
cmd = cmd.env("VOICEBOX_MODELS_DIR", dir);
}
cmd.spawn()
} else {
// Use the bundled CPU sidecar
sidecar = sidecar.args(["--data-dir", &data_dir_str, "--port", &port_str, "--parent-pid", &parent_pid_str]);
if is_remote {
sidecar = sidecar.args(["--host", "0.0.0.0"]);
}
if let Some(ref dir) = effective_models_dir {
sidecar = sidecar.env("VOICEBOX_MODELS_DIR", dir);
}
println!("Spawning server process...");
sidecar.spawn()
};
let (mut rx, child) = match spawn_result {
Ok(result) => result,
Err(e) => {
eprintln!("Failed to spawn server process: {}", e);
// In dev mode, check if a manually-started server is available
#[cfg(debug_assertions)]
{
use std::net::TcpStream;
if TcpStream::connect_timeout(
&format!("127.0.0.1:{}", SERVER_PORT).parse().unwrap(),
std::time::Duration::from_secs(1),
).is_ok() {
println!("Found manually-started server on port {}", SERVER_PORT);
return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
}
eprintln!("");
eprintln!("=================================================================");
eprintln!("DEV MODE: Server binary failed to start");
eprintln!("");
eprintln!("Start the Python server in a separate terminal:");
eprintln!(" bun run dev:server");
eprintln!("=================================================================");
eprintln!("");
return Err("Dev mode: Start server manually with 'bun run dev:server'".to_string());
}
#[cfg(not(debug_assertions))]
{
eprintln!("This could be due to:");
eprintln!(" - Missing or corrupted binary");
eprintln!(" - Missing execute permissions");
eprintln!(" - Code signing issues on macOS");
eprintln!(" - Missing dependencies");
return Err(format!("Failed to spawn: {}", e));
}
}
};
println!("Server process spawned, waiting for ready signal...");
println!("=================================================================");
// Store child process and PID
let process_pid = child.pid();
*state.server_pid.lock().unwrap() = Some(process_pid);
*state.child.lock().unwrap() = Some(child);
// Wait for server to be ready by listening for startup log
// PyInstaller bundles can be slow on first import, especially torch/transformers
let timeout = tokio::time::Duration::from_secs(120);
let start_time = tokio::time::Instant::now();
let mut error_output = Vec::new();
loop {
if start_time.elapsed() > timeout {
eprintln!("Server startup timeout after 120 seconds");
if !error_output.is_empty() {
eprintln!("Collected error output:");
for line in &error_output {
eprintln!(" {}", line);
}
}
// In dev mode, check if a manual server came up during the wait
#[cfg(debug_assertions)]
{
use std::net::TcpStream;
if TcpStream::connect_timeout(
&format!("127.0.0.1:{}", SERVER_PORT).parse().unwrap(),
std::time::Duration::from_secs(1),
).is_ok() {
// Kill the placeholder process
let _ = state.child.lock().unwrap().take();
println!("Found manually-started server on port {}", SERVER_PORT);
return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
}
}
return Err("Server startup timeout - check Console.app for detailed logs".to_string());
}
match tokio::time::timeout(tokio::time::Duration::from_millis(100), rx.recv()).await {
Ok(Some(event)) => {
match event {
tauri_plugin_shell::process::CommandEvent::Stdout(line) => {
let line_str = String::from_utf8_lossy(&line);
println!("Server output: {}", line_str);
let _ = app.emit("server-log", serde_json::json!({
"stream": "stdout",
"line": line_str.trim_end(),
}));
if line_str.contains("Uvicorn running") || line_str.contains("Application startup complete") {
println!("Server is ready!");
break;
}
}
tauri_plugin_shell::process::CommandEvent::Stderr(line) => {
let line_str = String::from_utf8_lossy(&line).to_string();
eprintln!("Server: {}", line_str);
let _ = app.emit("server-log", serde_json::json!({
"stream": "stderr",
"line": line_str.trim_end(),
}));
// Collect error lines for debugging
if line_str.contains("ERROR") || line_str.contains("Error") || line_str.contains("Failed") {
error_output.push(line_str.clone());
}
// Uvicorn logs to stderr, so check there too
if line_str.contains("Uvicorn running") || line_str.contains("Application startup complete") {
println!("Server is ready!");
break;
}
}
_ => {}
}
}
Ok(None) => {
// In dev mode, this is expected when using the placeholder binary
#[cfg(debug_assertions)]
{
use std::net::TcpStream;
eprintln!("Server process ended (dev mode placeholder detected)");
// Check if a manually-started server is available
if TcpStream::connect_timeout(
&format!("127.0.0.1:{}", SERVER_PORT).parse().unwrap(),
std::time::Duration::from_secs(1),
).is_ok() {
// Clean up state
let _ = state.child.lock().unwrap().take();
let _ = state.server_pid.lock().unwrap().take();
println!("Found manually-started server on port {}", SERVER_PORT);
return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
}
eprintln!("");
eprintln!("=================================================================");
eprintln!("DEV MODE: No bundled server binary available");
eprintln!("");
eprintln!("Start the Python server in a separate terminal:");
eprintln!(" bun run dev:server");
eprintln!("=================================================================");
eprintln!("");
return Err("Dev mode: Start server manually with 'bun run dev:server'".to_string());
}
#[cfg(not(debug_assertions))]
{
eprintln!("Server process ended unexpectedly during startup!");
eprintln!("The server binary may have crashed or exited with an error.");
eprintln!("Check Console.app logs for more details (search for 'voicebox')");
return Err("Server process ended unexpectedly".to_string());
}
}
Err(_) => {
// Timeout on this recv, continue loop
continue;
}
}
}
// Spawn task to continue reading output and emit to frontend
let app_handle = app.clone();
tokio::spawn(async move {
while let Some(event) = rx.recv().await {
match event {
tauri_plugin_shell::process::CommandEvent::Stdout(line) => {
let line_str = String::from_utf8_lossy(&line);
println!("Server: {}", line_str);
let _ = app_handle.emit("server-log", serde_json::json!({
"stream": "stdout",
"line": line_str.trim_end(),
}));
}
tauri_plugin_shell::process::CommandEvent::Stderr(line) => {
let line_str = String::from_utf8_lossy(&line);
eprintln!("Server error: {}", line_str);
let _ = app_handle.emit("server-log", serde_json::json!({
"stream": "stderr",
"line": line_str.trim_end(),
}));
}
_ => {}
}
}
});
Ok(format!("http://127.0.0.1:{}", SERVER_PORT))
}
#[command]
async fn stop_server(state: State<'_, ServerState>) -> Result<(), String> {
let pid = state.server_pid.lock().unwrap().take();
let _child = state.child.lock().unwrap().take();
if let Some(pid) = pid {
println!("stop_server: Stopping server with PID: {}", pid);
#[cfg(unix)]
{
use std::process::Command;
// Kill process group with SIGTERM first
let _ = Command::new("kill")
.args(["-TERM", "--", &format!("-{}", pid)])
.output();
// Brief wait then force kill
std::thread::sleep(std::time::Duration::from_millis(100));
let _ = Command::new("kill")
.args(["-9", "--", &format!("-{}", pid)])
.output();
let _ = Command::new("kill")
.args(["-9", &pid.to_string()])
.output();
println!("stop_server: Process group kill completed");
}
#[cfg(windows)]
{
// Send graceful shutdown via HTTP — the server's parent-pid watchdog
// will also handle cleanup if this app process exits.
println!("Sending graceful shutdown via HTTP...");
let client = reqwest::blocking::Client::builder()
.timeout(std::time::Duration::from_secs(2))
.build()
.unwrap();
let _ = client
.post(&format!("http://127.0.0.1:{}/shutdown", SERVER_PORT))
.send();
println!("Shutdown request sent (server watchdog will handle cleanup)");
}
}
Ok(())
}
/// Tauri command: restart the sidecar server, optionally changing the
/// custom models directory first.
///
/// `models_dir` semantics: `None` leaves the stored setting untouched, an
/// empty string resets it to the default, and any other value overrides it.
/// The new setting is picked up by `start_server` via `ServerState`.
#[command]
async fn restart_server(
    app: tauri::AppHandle,
    state: State<'_, ServerState>,
    models_dir: Option<String>,
) -> Result<String, String> {
    println!("restart_server: stopping current server...");
    // Record the requested models directory before the restart cycle.
    if let Some(dir) = models_dir {
        let new_value = if dir.is_empty() { None } else { Some(dir) };
        *state.models_dir.lock().unwrap() = new_value;
    }
    // Tear down the running server, then give the OS a moment to free the port.
    stop_server(state.clone()).await?;
    println!("restart_server: waiting for port release...");
    tokio::time::sleep(tokio::time::Duration::from_millis(1000)).await;
    // Relaunch; CUDA detection and the stored models_dir are applied there.
    println!("restart_server: starting server...");
    start_server(app, state, None, None).await
}
/// Tauri command: record whether the sidecar server should keep running
/// after the app window closes. Consulted during the exit flow.
#[command]
fn set_keep_server_running(state: State<'_, ServerState>, keep_running: bool) {
    println!("set_keep_server_running called with: {}", keep_running);
    let mut flag = state.keep_running_on_close.lock().unwrap();
    *flag = keep_running;
}
/// Tauri command: begin a system audio capture session, bounded by
/// `max_duration_secs` seconds. Delegates to the `audio_capture` module.
#[command]
async fn start_system_audio_capture(
    state: State<'_, audio_capture::AudioCaptureState>,
    max_duration_secs: u32,
) -> Result<(), String> {
    let capture_state = state.inner();
    audio_capture::start_capture(capture_state, max_duration_secs).await
}
/// Tauri command: end the current system audio capture session and return
/// the captured audio (string payload produced by `audio_capture::stop_capture`).
#[command]
async fn stop_system_audio_capture(
    state: State<'_, audio_capture::AudioCaptureState>,
) -> Result<String, String> {
    let capture_state = state.inner();
    audio_capture::stop_capture(capture_state).await
}
/// Tauri command: report whether system audio capture is supported on this
/// platform/build, as determined by the `audio_capture` module.
#[command]
fn is_system_audio_supported() -> bool {
    audio_capture::is_supported()
}
/// Tauri command: enumerate the audio output devices currently available
/// for playback, or an error string on failure.
#[command]
fn list_audio_output_devices(
    state: State<'_, audio_output::AudioOutputState>,
) -> Result<Vec<audio_output::AudioOutputDevice>, String> {
    state.list_output_devices()
}
/// Tauri command: play the given audio bytes on the selected output devices.
///
/// `audio_data` is the encoded audio payload; `device_ids` are device
/// identifiers (presumably the ids reported by `list_audio_output_devices`
/// — confirm against the `audio_output` module).
#[command]
async fn play_audio_to_devices(
    state: State<'_, audio_output::AudioOutputState>,
    audio_data: Vec<u8>,
    device_ids: Vec<String>,
) -> Result<(), String> {
    state.play_audio_to_devices(audio_data, device_ids).await
}
/// Tauri command: stop all in-progress audio playback started via
/// `play_audio_to_devices`.
#[command]
fn stop_audio_playback(
    state: State<'_, audio_output::AudioOutputState>,
) -> Result<(), String> {
    state.stop_all_playback()
}
/// Shared application entry point for desktop and mobile builds.
///
/// Registers plugins, shared state (sidecar-server bookkeeping plus audio
/// capture/playback state), platform-specific setup, all Tauri command
/// handlers, and the window/app lifecycle hooks that coordinate shutdown
/// of the sidecar server.
#[cfg_attr(mobile, tauri::mobile_entry_point)]
pub fn run() {
    tauri::Builder::default()
        .plugin(tauri_plugin_dialog::init())
        .plugin(tauri_plugin_fs::init())
        .plugin(tauri_plugin_shell::init())
        // Sidecar-server bookkeeping shared by the server commands above.
        .manage(ServerState {
            child: Mutex::new(None),
            server_pid: Mutex::new(None),
            keep_running_on_close: Mutex::new(false),
            models_dir: Mutex::new(None),
        })
        .manage(audio_capture::AudioCaptureState::new())
        .manage(audio_output::AudioOutputState::new())
        .setup(|app| {
            // Updater and process-restart plugins only exist on desktop targets.
            #[cfg(desktop)]
            {
                app.handle().plugin(tauri_plugin_updater::Builder::new().build())?;
                app.handle().plugin(tauri_plugin_process::init())?;
            }
            // Hide title bar icon on Windows
            #[cfg(windows)]
            {
                use windows::Win32::Foundation::HWND;
                use windows::Win32::UI::WindowsAndMessaging::{SetClassLongPtrW, GCLP_HICON, GCLP_HICONSM};
                if let Some((_, window)) = app.webview_windows().iter().next() {
                    if let Ok(hwnd) = window.hwnd() {
                        let hwnd = HWND(hwnd.0);
                        // SAFETY: hwnd was just obtained from a live Tauri
                        // window, so it is a valid window handle for the
                        // duration of these calls.
                        unsafe {
                            // Set both small and regular icons to NULL to hide the title bar icon
                            SetClassLongPtrW(hwnd, GCLP_HICON, 0);
                            SetClassLongPtrW(hwnd, GCLP_HICONSM, 0);
                        }
                    }
                }
            }
            // Enable microphone access on Linux (WebKitGTK denies getUserMedia by default)
            #[cfg(target_os = "linux")]
            {
                use tauri::Manager;
                if let Some(window) = app.get_webview_window("main") {
                    let _ = window.with_webview(|webview| {
                        use webkit2gtk::{WebViewExt, SettingsExt, PermissionRequestExt};
                        use webkit2gtk::glib::ObjectExt;
                        let wk_webview = webview.inner();
                        // Enable media stream support in WebKitGTK settings
                        if let Some(settings) = WebViewExt::settings(&wk_webview) {
                            settings.set_enable_media_stream(true);
                        }
                        // Auto-grant UserMediaPermissionRequest (microphone access)
                        // Only for trusted local origins (Tauri dev server or custom protocol)
                        wk_webview.connect_permission_request(move |webview, request: &webkit2gtk::PermissionRequest| {
                            if request.is::<webkit2gtk::UserMediaPermissionRequest>() {
                                let uri = WebViewExt::uri(webview).unwrap_or_default();
                                let is_trusted = uri.starts_with("tauri://")
                                    || uri.starts_with("https://tauri.localhost")
                                    || uri.starts_with("http://localhost")
                                    || uri.starts_with("http://127.0.0.1");
                                if is_trusted {
                                    request.allow();
                                    return true;
                                }
                                request.deny();
                                return true;
                            }
                            // Not a media permission request: use default handling.
                            false
                        });
                    });
                }
            }
            Ok(())
        })
        .invoke_handler(tauri::generate_handler![
            start_server,
            stop_server,
            restart_server,
            set_keep_server_running,
            start_system_audio_capture,
            stop_system_audio_capture,
            is_system_audio_supported,
            list_audio_output_devices,
            play_audio_to_devices,
            stop_audio_playback
        ])
        .on_window_event({
            // Guards against re-entering the close handshake while a previous
            // close request is still being negotiated with the frontend.
            let closing = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false));
            move |window, event| {
                if let WindowEvent::CloseRequested { api, .. } = event {
                    // If we're already in the close flow, let it proceed
                    if closing.load(std::sync::atomic::Ordering::SeqCst) {
                        return;
                    }
                    closing.store(true, std::sync::atomic::Ordering::SeqCst);
                    // Prevent automatic close so frontend can clean up
                    api.prevent_close();
                    // Emit event to frontend to check setting and stop server if needed
                    let app_handle = window.app_handle();
                    if let Err(e) = app_handle.emit("window-close-requested", ()) {
                        eprintln!("Failed to emit window-close-requested event: {}", e);
                        window.close().ok();
                        return;
                    }
                    // Set up listener for frontend response
                    let window_for_close = window.clone();
                    let closing_for_timeout = closing.clone();
                    let (tx, mut rx) = mpsc::unbounded_channel::<()>();
                    let listener_id = window.listen("window-close-allowed", move |_| {
                        let _ = tx.send(());
                    });
                    // Close once the frontend signals it is done, or after a
                    // 5 s timeout so a stuck frontend can't wedge the window open.
                    tauri::async_runtime::spawn(async move {
                        tokio::select! {
                            _ = rx.recv() => {
                                window_for_close.close().ok();
                            }
                            _ = tokio::time::sleep(tokio::time::Duration::from_secs(5)) => {
                                eprintln!("Window close timeout, closing anyway");
                                window_for_close.close().ok();
                            }
                        }
                        window_for_close.unlisten(listener_id);
                        closing_for_timeout.store(false, std::sync::atomic::Ordering::SeqCst);
                    });
                }
            }})
        .build(tauri::generate_context!())
        .expect("error while building tauri application")
        .run(|app, event| {
            let _ = &app; // used on unix
            match &event {
                RunEvent::Exit => {
                    let state = app.state::<ServerState>();
                    let keep_running = *state.keep_running_on_close.lock().unwrap();
                    let has_pid = state.server_pid.lock().unwrap().is_some();
                    println!("RunEvent::Exit — keep_running={}, has_pid={}", keep_running, has_pid);
                    if keep_running {
                        // Tell the server to disable its watchdog so it survives
                        // after this process exits.
                        println!("Keep server running: disabling watchdog...");
                        // Write a sentinel file as a reliable fallback. On Windows
                        // the HTTP request below can race with process exit, leaving
                        // the watchdog unaware it should stay alive. The sentinel
                        // file is checked during the watchdog grace period.
                        let data_dir = app
                            .path()
                            .app_data_dir()
                            .unwrap_or_default();
                        let sentinel = data_dir.join(".keep-running");
                        if let Err(e) = std::fs::write(&sentinel, b"1") {
                            eprintln!("Failed to write keep-running sentinel: {}", e);
                        } else {
                            println!("Wrote keep-running sentinel to {:?}", sentinel);
                        }
                        // Blocking client is acceptable here: this runs on the
                        // main run-loop callback during process exit, not on an
                        // async worker.
                        let client = reqwest::blocking::Client::builder()
                            .timeout(std::time::Duration::from_secs(2))
                            .build()
                            .unwrap();
                        match client
                            .post(&format!("http://127.0.0.1:{}/watchdog/disable", SERVER_PORT))
                            .send()
                        {
                            Ok(resp) => println!("Watchdog disable response: {}", resp.status()),
                            Err(e) => eprintln!("Failed to disable watchdog: {}", e),
                        }
                    } else {
                        // Server will self-terminate via parent-pid watchdog when
                        // this process exits. On Unix, also send SIGTERM for
                        // immediate cleanup.
                        println!("RunEvent::Exit - server will self-terminate via watchdog");
                        #[cfg(unix)]
                        {
                            if let Some(pid) = state.server_pid.lock().unwrap().take() {
                                use std::process::Command;
                                // Group kill first, then the PID itself in case
                                // the server detached from its process group.
                                let _ = Command::new("kill")
                                    .args(["-TERM", "--", &format!("-{}", pid)])
                                    .output();
                                std::thread::sleep(std::time::Duration::from_millis(100));
                                let _ = Command::new("kill")
                                    .args(["-9", "--", &format!("-{}", pid)])
                                    .output();
                                let _ = Command::new("kill")
                                    .args(["-9", &pid.to_string()])
                                    .output();
                            }
                        }
                    }
                }
                RunEvent::ExitRequested { api, .. } => {
                    println!("RunEvent::ExitRequested received");
                    // Don't prevent exit, just log it
                    let _ = api;
                }
                _ => {}
            }
        });
}
/// Binary entry point; delegates to the shared `run()` builder so the same
/// code path serves both this binary and the mobile entry point.
fn main() {
    run();
}

View File

@@ -0,0 +1,66 @@
{
"$schema": "https://schema.tauri.app/config/2",
"productName": "Voicebox",
"version": "0.4.5",
"identifier": "sh.voicebox.app",
"build": {
"beforeDevCommand": "bun run dev",
"beforeBuildCommand": "bun run build",
"frontendDist": "../dist",
"devUrl": "http://localhost:5173"
},
"bundle": {
"active": true,
"targets": "all",
"createUpdaterArtifacts": "v1Compatible",
"externalBin": ["binaries/voicebox-server"],
"icon": [
"icons/32x32.png",
"icons/128x128.png",
"icons/128x128@2x.png",
"icons/icon.icns",
"icons/icon.ico"
],
"macOS": {
"frameworks": [],
"minimumSystemVersion": "11.0",
"infoPlist": "Info.plist",
"entitlements": "Entitlements.plist"
},
"resources": {
"gen/Assets.car": "./",
"gen/voicebox.icns": "./",
"gen/partial.plist": "./"
}
},
"app": {
"security": {
"csp": null,
"capabilities": ["default"]
},
"windows": [
{
"title": "",
"width": 1200,
"height": 800,
"minWidth": 800,
"minHeight": 600,
"resizable": true,
"fullscreen": false,
"devtools": true,
"userAgent": null,
"titleBarStyle": "Overlay"
}
],
"withGlobalTauri": true
},
"plugins": {
"shell": {
"open": ".*"
},
"updater": {
"pubkey": "dW50cnVzdGVkIGNvbW1lbnQ6IG1pbmlzaWduIHB1YmxpYyBrZXk6IEUxRENBQkRBQjdBNTM1OTIKUldTU05hVzMycXZjNGJGcUxmcVVocll2QjdSaTJNdlFxR2M3VDJsMnVvbDdyZGRPMmRlOW9aWTcK",
"endpoints": ["https://github.com/jamiepine/voicebox/releases/latest/download/latest.json"]
}
}
}

View File

@@ -0,0 +1,59 @@
// NOTE: This test requires system audio to be playing during execution.
// To run this test successfully:
// 1. Start playing audio (music, video, etc.)
// 2. Run: cargo test --test audio_capture_test -- --nocapture
// 3. The test will capture audio for 5 seconds and verify the output
use voicebox::audio_capture::{AudioCaptureState, start_capture, stop_capture};
use base64::Engine;
/// Integration test: capture ~5 s of system audio and validate the result.
///
/// Requires audible system audio during the run (see file header). Fixes vs.
/// the previous version: the redundant `len() > 0` assert after `is_empty()`
/// is gone, panicky `match` arms are `expect`s, and the decoded payload is
/// now actually checked for a RIFF/WAVE header rather than just non-emptiness.
#[tokio::test]
async fn test_system_audio_capture() {
    let state = AudioCaptureState::new();

    println!("Starting system audio capture with 5 second max duration...");
    start_capture(&state, 5)
        .await
        .expect("Failed to start capture");

    println!("Capture started, waiting 5 seconds...");
    tokio::time::sleep(tokio::time::Duration::from_secs(5)).await;

    println!("Stopping capture...");
    let base64_wav = stop_capture(&state)
        .await
        .expect("Failed to stop capture or get audio data");
    println!("Capture stopped successfully");

    // Decode the base64 payload and verify it is a plausible WAV file.
    println!("Validating base64 WAV data...");
    let decoded_bytes = base64::engine::general_purpose::STANDARD
        .decode(&base64_wav)
        .expect("Failed to decode base64 data");
    assert!(!decoded_bytes.is_empty(), "Decoded bytes array is empty");
    println!("WAV data length: {} bytes", decoded_bytes.len());

    // A WAV file starts with a 12-byte RIFF chunk header: "RIFF", a
    // little-endian size, then the "WAVE" form type.
    assert!(decoded_bytes.len() >= 12, "Data too short to be a WAV file");
    assert_eq!(&decoded_bytes[0..4], b"RIFF", "Missing RIFF magic");
    assert_eq!(&decoded_bytes[8..12], b"WAVE", "Missing WAVE form type");

    println!("✓ Test passed: Audio capture produced valid WAV data");
}