Initial commit

This commit is contained in:
2026-04-24 19:18:15 +08:00
commit fbcbe08696
555 changed files with 96692 additions and 0 deletions

View File

@@ -0,0 +1,406 @@
use crate::audio_capture::AudioCaptureState;
use base64::{engine::general_purpose, Engine as _};
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use cpal::{SampleFormat, StreamConfig};
use hound::{WavSpec, WavWriter};
use std::io::Cursor;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread;
/// Locate a PulseAudio/PipeWire "monitor" source by shelling out to `pactl`.
///
/// Prefers the monitor belonging to the default sink; otherwise returns the
/// first source whose name ends in ".monitor". Returns `None` when `pactl`
/// is missing, fails, or lists no usable source.
fn find_monitor_source_via_pactl() -> Option<String> {
    let listing = std::process::Command::new("pactl")
        .args(["list", "short", "sources"])
        .output()
        .ok()
        .filter(|o| o.status.success())?;
    let listing = String::from_utf8_lossy(&listing.stdout).into_owned();

    // The second tab-separated column of each row is the source name.
    let source_names: Vec<&str> = listing
        .lines()
        .filter_map(|row| row.split('\t').nth(1))
        .collect();

    // Preferred: the monitor attached to the default sink.
    let default_sink = std::process::Command::new("pactl")
        .args(["get-default-sink"])
        .output()
        .ok()
        .filter(|o| o.status.success())
        .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string());
    if let Some(sink_name) = default_sink {
        let monitor_name = format!("{}.monitor", sink_name);
        if source_names.iter().any(|&candidate| candidate == monitor_name) {
            eprintln!(
                "Linux audio capture: Found default sink monitor via pactl: {}",
                monitor_name
            );
            return Some(monitor_name);
        }
    }

    // Fallback: any source whose name ends with ".monitor".
    if let Some(found) = source_names.iter().find(|c| c.ends_with(".monitor")) {
        let name = found.to_string();
        eprintln!(
            "Linux audio capture: Found monitor source via pactl: {}",
            name
        );
        return Some(name);
    }
    None
}
/// Start capturing system audio on Linux using PulseAudio monitor sources.
///
/// On modern Linux with PulseAudio or PipeWire, we first try to detect the
/// monitor source via `pactl` and set the `PULSE_SOURCE` environment variable.
/// This tells PulseAudio's ALSA plugin to use the monitor as the default input
/// source for this process. If `pactl` is unavailable, we fall back to searching
/// cpal device names for "monitor".
///
/// Returns immediately after spawning the capture thread; samples accumulate
/// in `state.samples` until the stop signal fires or `max_duration_secs`
/// elapses. Errors inside the capture thread are reported via `state.error`.
pub async fn start_capture(
    state: &AudioCaptureState,
    max_duration_secs: u32,
) -> Result<(), String> {
    // Reset previous samples
    state.reset();
    // Clone the shared handles the capture thread will write into.
    let samples = state.samples.clone();
    let sample_rate_arc = state.sample_rate.clone();
    let channels_arc = state.channels.clone();
    let stop_tx = state.stop_tx.clone();
    let error_arc = state.error.clone();
    // Use AtomicBool for stop signal (works across threads)
    let stop_flag = Arc::new(AtomicBool::new(false));
    let stop_flag_clone = stop_flag.clone();
    // Create tokio channel and spawn a task to bridge it to the AtomicBool.
    // NOTE: `recv()` also returns (with None) when every sender is dropped,
    // so closing the channel raises the stop flag too.
    let (tx, mut rx) = tokio::sync::mpsc::channel::<()>(1);
    *stop_tx.lock().unwrap() = Some(tx);
    tokio::spawn(async move {
        rx.recv().await;
        stop_flag_clone.store(true, Ordering::Relaxed);
    });
    // Spawn capture on a dedicated thread
    thread::spawn(move || {
        // Try to set PULSE_SOURCE to a monitor before initializing cpal.
        // This tells PulseAudio/PipeWire's ALSA plugin to use the monitor
        // as the default input source for this process.
        // NOTE(review): `set_var` mutates process-global state; any later
        // PulseAudio connection from this process sees it too.
        let monitor_source = find_monitor_source_via_pactl();
        if let Some(ref source_name) = monitor_source {
            eprintln!(
                "Linux audio capture: Setting PULSE_SOURCE={}",
                source_name
            );
            std::env::set_var("PULSE_SOURCE", source_name);
        }
        let host = cpal::default_host();
        // Select the capture device.
        // If PULSE_SOURCE was set, the default input device IS the monitor.
        // Otherwise, fall back to searching device names for "monitor".
        let device = if monitor_source.is_some() {
            // PULSE_SOURCE was set — default input IS the monitor now
            match host.default_input_device() {
                Some(d) => {
                    let name = d.name().unwrap_or_default();
                    eprintln!(
                        "Linux audio capture: Using PULSE_SOURCE monitor device: {}",
                        name
                    );
                    d
                }
                None => {
                    let error_msg = "No audio input device available".to_string();
                    eprintln!("{}", error_msg);
                    *error_arc.lock().unwrap() = Some(error_msg);
                    return;
                }
            }
        } else {
            // pactl not available — try to find monitor by name (original approach)
            let mut monitor_device = None;
            if let Ok(devices) = host.input_devices() {
                for d in devices {
                    if let Ok(name) = d.name() {
                        let name_lower = name.to_lowercase();
                        if name_lower.contains("monitor") {
                            eprintln!(
                                "Linux audio capture: Found monitor device by name: {}",
                                name
                            );
                            monitor_device = Some(d);
                            break;
                        }
                    }
                }
            }
            match monitor_device {
                Some(d) => d,
                None => {
                    // Last resort: the default input (likely a microphone),
                    // which records *something* even though it is not loopback.
                    eprintln!("Linux audio capture: No monitor device found, falling back to default input");
                    match host.default_input_device() {
                        Some(d) => d,
                        None => {
                            let error_msg = "No audio input device available".to_string();
                            eprintln!("{}", error_msg);
                            *error_arc.lock().unwrap() = Some(error_msg);
                            return;
                        }
                    }
                }
            }
        };
        let device_name = device.name().unwrap_or_else(|_| "unknown".to_string());
        eprintln!("Linux audio capture: Using device: {}", device_name);
        // Get supported config
        let config = match device.default_input_config() {
            Ok(c) => c,
            Err(e) => {
                let error_msg = format!("Failed to get default input config: {}", e);
                eprintln!("{}", error_msg);
                *error_arc.lock().unwrap() = Some(error_msg);
                return;
            }
        };
        let sample_rate = config.sample_rate().0;
        let channels = config.channels();
        let sample_format = config.sample_format();
        eprintln!(
            "Linux audio capture: Config - {}Hz, {} channels, format: {:?}",
            sample_rate, channels, sample_format
        );
        // Publish stream parameters so stop_capture can build the WAV header.
        *sample_rate_arc.lock().unwrap() = sample_rate;
        *channels_arc.lock().unwrap() = channels;
        let stream_config = StreamConfig {
            channels,
            sample_rate: cpal::SampleRate(sample_rate),
            buffer_size: cpal::BufferSize::Default,
        };
        let samples_clone = samples.clone();
        let error_arc_clone = error_arc.clone();
        let stop_flag_for_stream = stop_flag.clone();
        let err_fn = {
            let error_arc = error_arc.clone();
            move |err: cpal::StreamError| {
                let error_msg = format!("Stream error: {}", err);
                eprintln!("{}", error_msg);
                *error_arc.lock().unwrap() = Some(error_msg);
            }
        };
        // Build the input stream for whichever sample format the device uses,
        // normalizing everything to f32 in roughly [-1.0, 1.0].
        let stream = match sample_format {
            SampleFormat::F32 => {
                let samples = samples_clone.clone();
                let stop = stop_flag_for_stream.clone();
                device.build_input_stream(
                    &stream_config,
                    move |data: &[f32], _: &cpal::InputCallbackInfo| {
                        // Once stopped, drop incoming buffers instead of growing.
                        if stop.load(Ordering::Relaxed) {
                            return;
                        }
                        let mut guard = samples.lock().unwrap();
                        guard.extend_from_slice(data);
                    },
                    err_fn,
                    None,
                )
            }
            SampleFormat::I16 => {
                let samples = samples_clone.clone();
                let stop = stop_flag_for_stream.clone();
                device.build_input_stream(
                    &stream_config,
                    move |data: &[i16], _: &cpal::InputCallbackInfo| {
                        if stop.load(Ordering::Relaxed) {
                            return;
                        }
                        let mut guard = samples.lock().unwrap();
                        // i16 full scale -> [-1.0, 1.0)
                        for &s in data {
                            guard.push(s as f32 / 32768.0);
                        }
                    },
                    err_fn,
                    None,
                )
            }
            SampleFormat::U16 => {
                let samples = samples_clone.clone();
                let stop = stop_flag_for_stream.clone();
                device.build_input_stream(
                    &stream_config,
                    move |data: &[u16], _: &cpal::InputCallbackInfo| {
                        if stop.load(Ordering::Relaxed) {
                            return;
                        }
                        let mut guard = samples.lock().unwrap();
                        // u16 is offset-binary: midpoint 32768 maps to 0.0.
                        for &s in data {
                            guard.push((s as f32 / 32768.0) - 1.0);
                        }
                    },
                    err_fn,
                    None,
                )
            }
            _ => {
                let error_msg = format!("Unsupported sample format: {:?}", sample_format);
                eprintln!("{}", error_msg);
                *error_arc_clone.lock().unwrap() = Some(error_msg);
                return;
            }
        };
        let stream = match stream {
            Ok(s) => s,
            Err(e) => {
                let error_msg = format!("Failed to build input stream: {}", e);
                eprintln!("{}", error_msg);
                *error_arc_clone.lock().unwrap() = Some(error_msg);
                return;
            }
        };
        if let Err(e) = stream.play() {
            let error_msg = format!("Failed to start stream: {}", e);
            eprintln!("{}", error_msg);
            *error_arc_clone.lock().unwrap() = Some(error_msg);
            return;
        }
        eprintln!("Linux audio capture: Stream started successfully");
        // Keep thread alive until stop signal
        loop {
            if stop_flag.load(Ordering::Relaxed) {
                break;
            }
            std::thread::sleep(std::time::Duration::from_millis(100));
        }
        // Stream will be dropped here, stopping capture
        eprintln!("Linux audio capture: Stream stopped");
    });
    // Spawn timeout task so a recording cannot run forever.
    let stop_tx_clone = state.stop_tx.clone();
    tokio::spawn(async move {
        tokio::time::sleep(tokio::time::Duration::from_secs(max_duration_secs as u64)).await;
        // Take the sender out of the mutex before awaiting the send.
        let tx = stop_tx_clone.lock().unwrap().take();
        if let Some(tx) = tx {
            let _ = tx.send(()).await;
        }
    });
    Ok(())
}
/// Stop an in-progress capture and return the recording as a base64-encoded
/// 16-bit WAV file.
///
/// Returns an error if the capture thread reported a failure, or if no
/// samples were collected (e.g. nothing was playing during recording).
pub async fn stop_capture(state: &AudioCaptureState) -> Result<String, String> {
    // Signal stop. Take the sender out of the mutex first so the guard is not
    // held while signalling. `try_send` delivers the message synchronously;
    // the previous `let _ = tx.send(())` dropped the un-awaited future and
    // only worked because dropping the sender happened to close the channel.
    let sender = state.stop_tx.lock().unwrap().take();
    if let Some(sender) = sender {
        let _ = sender.try_send(());
    }
    // Give the capture thread a moment to observe the flag and shut down.
    tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
    // Surface any error recorded by the capture thread.
    if let Some(error) = state.error.lock().unwrap().as_ref() {
        return Err(error.clone());
    }
    // Snapshot the captured samples and stream parameters.
    let samples = state.samples.lock().unwrap().clone();
    let sample_rate = *state.sample_rate.lock().unwrap();
    let channels = *state.channels.lock().unwrap();
    if samples.is_empty() {
        return Err(
            "No audio samples captured. Make sure audio is playing on your system during recording."
                .to_string(),
        );
    }
    // Encode as WAV, then base64 so the result can be returned as a string.
    let wav_data = samples_to_wav(&samples, sample_rate, channels)?;
    Ok(general_purpose::STANDARD.encode(&wav_data))
}
/// Report whether system-audio (loopback) capture looks possible here.
///
/// Checks, in order: a pactl-visible monitor source, a cpal input device
/// named "*monitor*", and finally any default input device at all.
pub fn is_supported() -> bool {
    // Best signal: pactl can name a monitor source.
    if find_monitor_source_via_pactl().is_some() {
        return true;
    }
    // Next best: cpal exposes an input device with "monitor" in its name.
    let host = cpal::default_host();
    let has_monitor_device = host
        .input_devices()
        .map(|mut devices| {
            devices.any(|d| {
                d.name()
                    .map(|n| n.to_lowercase().contains("monitor"))
                    .unwrap_or(false)
            })
        })
        .unwrap_or(false);
    if has_monitor_device {
        return true;
    }
    // Last resort: any default input device (likely a microphone).
    host.default_input_device().is_some()
}
/// Encode interleaved f32 samples as a 16-bit PCM WAV file held in memory.
///
/// Samples are clamped to [-1.0, 1.0] and scaled to the i16 range.
fn samples_to_wav(samples: &[f32], sample_rate: u32, channels: u16) -> Result<Vec<u8>, String> {
    // Preallocate: standard 44-byte RIFF/fmt/data header plus 2 bytes per
    // 16-bit sample, avoiding repeated reallocation for long recordings.
    let mut buffer = Vec::with_capacity(44 + samples.len() * 2);
    let cursor = Cursor::new(&mut buffer);
    let spec = WavSpec {
        channels,
        sample_rate,
        bits_per_sample: 16,
        sample_format: hound::SampleFormat::Int,
    };
    let mut writer =
        WavWriter::new(cursor, spec).map_err(|e| format!("Failed to create WAV writer: {}", e))?;
    // Convert f32 samples to i16
    for sample in samples {
        let clamped = sample.clamp(-1.0, 1.0);
        let i16_sample = (clamped * 32767.0) as i16;
        writer
            .write_sample(i16_sample)
            .map_err(|e| format!("Failed to write sample: {}", e))?;
    }
    writer
        .finalize()
        .map_err(|e| format!("Failed to finalize WAV: {}", e))?;
    Ok(buffer)
}

View File

@@ -0,0 +1,265 @@
use crate::audio_capture::AudioCaptureState;
use base64::{engine::general_purpose, Engine as _};
use hound::{WavSpec, WavWriter};
use screencapturekit::{
cm::CMSampleBuffer,
shareable_content::SCShareableContent,
stream::{
configuration::SCStreamConfiguration,
content_filter::SCContentFilter,
output_trait::SCStreamOutputTrait,
output_type::SCStreamOutputType,
sc_stream::SCStream,
},
};
use std::io::Cursor;
use std::process::Command;
use std::sync::{Arc, Mutex};
use tokio::sync::mpsc;
/// Start capturing system audio on macOS via ScreenCaptureKit.
///
/// Builds an audio-only `SCStream` filtered to the first display, registers a
/// handler that appends decoded f32 samples to `state.samples`, and spawns a
/// task that stops the stream after `max_duration_secs` or on the manual stop
/// signal, whichever comes first. Requires macOS 12.3+ (per `is_supported`).
pub async fn start_capture(
    state: &AudioCaptureState,
    max_duration_secs: u32,
) -> Result<(), String> {
    if !is_supported() {
        return Err("System audio capture requires macOS 12.3 or newer.".to_string());
    }
    // Reset previous samples
    state.reset();
    // Get shareable content
    let content = SCShareableContent::get()
        .map_err(|e| format!("Failed to get shareable content: {}", e))?;
    // Get first display
    let displays = content.displays();
    if displays.is_empty() {
        return Err("No displays available".to_string());
    }
    let display = &displays[0];
    // Create content filter for desktop audio
    let filter = SCContentFilter::create()
        .with_display(display)
        .with_excluding_windows(&[])
        .build();
    // Create stream configuration - audio only
    let mut config = SCStreamConfiguration::default();
    config.set_captures_audio(true);
    config.set_excludes_current_process_audio(false);
    config.set_sample_rate(48000); // Use i32 directly
    config.set_channel_count(2); // Use i32 directly
    // Create stream using builder
    let (tx, mut rx) = mpsc::channel::<()>(1);
    *state.stop_tx.lock().unwrap() = Some(tx);
    let samples = state.samples.clone();
    let sample_rate = state.sample_rate.clone();
    let channels = state.channels.clone();
    // Set sample rate and channels to match the stream configuration above.
    *sample_rate.lock().unwrap() = 48000;
    *channels.lock().unwrap() = 2;
    // Handler invoked by ScreenCaptureKit for each captured sample buffer;
    // audio buffers are decoded and appended to the shared sample vector.
    struct AudioHandler {
        samples: Arc<Mutex<Vec<f32>>>,
    }
    impl SCStreamOutputTrait for AudioHandler {
        fn did_output_sample_buffer(
            &self,
            sample: CMSampleBuffer,
            _type: SCStreamOutputType,
        ) {
            if _type == SCStreamOutputType::Audio {
                // Decode failures are skipped silently to keep the stream alive.
                if let Ok(audio_samples) = extract_audio_samples(sample) {
                    let mut samples_guard = self.samples.lock().unwrap();
                    samples_guard.extend_from_slice(&audio_samples);
                }
            }
        }
    }
    let handler = AudioHandler {
        samples: samples.clone(),
    };
    // Create stream
    let mut stream = SCStream::new(&filter, &config);
    // Add output handler for audio (order: handler, then output_type)
    stream.add_output_handler(handler, SCStreamOutputType::Audio);
    // Store stream reference so stop_capture can stop it directly.
    *state.stream.lock().unwrap() = Some(stream.clone());
    stream.start_capture().map_err(|e| format!("Failed to start capture: {}", e))?;
    // Spawn task to stop after max duration or on the manual stop signal.
    let stream_clone = stream.clone();
    tokio::spawn(async move {
        tokio::select! {
            _ = tokio::time::sleep(tokio::time::Duration::from_secs(max_duration_secs as u64)) => {
                // Timeout reached
            }
            _ = rx.recv() => {
                // Manual stop
            }
        }
        let _ = stream_clone.stop_capture();
    });
    Ok(())
}
/// Stop an in-progress macOS capture and return the recording as a
/// base64-encoded 16-bit WAV file.
pub async fn stop_capture(state: &AudioCaptureState) -> Result<String, String> {
    // Signal the watchdog task. Take the sender out of the mutex so the guard
    // is not held while signalling; `try_send` actually delivers the message,
    // unlike the previous `let _ = tx.send(())` which dropped the un-awaited
    // future and relied on the channel closing as a side effect.
    let sender = state.stop_tx.lock().unwrap().take();
    if let Some(sender) = sender {
        let _ = sender.try_send(());
    }
    // Also stop the stream directly in case the task path lags.
    let stream = state.stream.lock().unwrap().take();
    if let Some(stream) = stream {
        let _ = stream.stop_capture();
    }
    // Give ScreenCaptureKit a moment to flush its last buffers.
    tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
    // Snapshot the captured samples and stream parameters.
    let samples = state.samples.lock().unwrap().clone();
    let sample_rate = *state.sample_rate.lock().unwrap();
    let channels = *state.channels.lock().unwrap();
    if samples.is_empty() {
        return Err("No audio samples captured".to_string());
    }
    // Encode as WAV, then base64 so the result can be returned as a string.
    let wav_data = samples_to_wav(&samples, sample_rate, channels)?;
    Ok(general_purpose::STANDARD.encode(&wav_data))
}
/// System audio capture here relies on ScreenCaptureKit, which requires
/// macOS 12.3 or newer (see the error message in `start_capture`).
pub fn is_supported() -> bool {
    macos_version_at_least(12, 3)
}
/// Return true when the running macOS version, as reported by
/// `sw_vers -productVersion`, is at least `required_major.required_minor`.
/// Returns false when the version cannot be determined (e.g. `sw_vers`
/// missing or failing), or when a component does not parse (treated as 0).
fn macos_version_at_least(required_major: u64, required_minor: u64) -> bool {
    let raw = match Command::new("sw_vers").arg("-productVersion").output() {
        Ok(out) if out.status.success() => out.stdout,
        _ => return false,
    };
    let text = String::from_utf8_lossy(&raw);
    let mut components = text
        .trim()
        .split('.')
        .map(|part| part.parse::<u64>().unwrap_or(0));
    let major = components.next().unwrap_or(0);
    let minor = components.next().unwrap_or(0);
    // Lexicographic tuple comparison == "major newer, or same major and
    // minor at least as new".
    (major, minor) >= (required_major, required_minor)
}
/// Pull PCM data out of a ScreenCaptureKit sample buffer as interleaved f32.
///
/// ScreenCaptureKit delivers Float32 audio either interleaved (a single
/// buffer of L,R,L,R,...) or planar (one buffer per channel). Planar input is
/// re-interleaved here; channels shorter than the longest one are padded with
/// silence.
fn extract_audio_samples(sample_buffer: CMSampleBuffer) -> Result<Vec<f32>, String> {
    // Use the crate's built-in method to get the audio buffer list.
    let audio_buffer_list = sample_buffer
        .audio_buffer_list()
        .ok_or_else(|| "Failed to get audio buffer list".to_string())?;
    let buffers: Vec<_> = audio_buffer_list.iter().collect();
    let num_buffers = buffers.len();
    if num_buffers == 0 {
        return Ok(Vec::new());
    }
    // Decode a buffer's raw bytes as native-endian f32 samples.
    // Using chunks_exact + from_ne_bytes avoids the unaligned-pointer hazard
    // of the previous `slice::from_raw_parts(ptr as *const f32, ..)`: f32
    // requires 4-byte alignment, which the raw byte buffer does not
    // guarantee, so that cast was potentially undefined behavior.
    fn bytes_to_f32(data_bytes: &[u8]) -> Vec<f32> {
        data_bytes
            .chunks_exact(std::mem::size_of::<f32>())
            .map(|b| f32::from_ne_bytes([b[0], b[1], b[2], b[3]]))
            .collect()
    }
    if num_buffers == 1 {
        // Interleaved stereo or mono: bytes are already in playback order.
        let data_bytes = buffers[0].data();
        return Ok(bytes_to_f32(&data_bytes));
    }
    // Planar format: one buffer per channel; interleave as L0, R0, L1, R1, ...
    let channel_data: Vec<Vec<f32>> = buffers
        .iter()
        .map(|buffer| bytes_to_f32(&buffer.data()))
        .collect();
    let max_samples = channel_data.iter().map(|c| c.len()).max().unwrap_or(0);
    let mut interleaved = Vec::with_capacity(max_samples * num_buffers);
    for i in 0..max_samples {
        for channel in &channel_data {
            // Pad with silence if one channel is shorter than the others.
            interleaved.push(channel.get(i).copied().unwrap_or(0.0));
        }
    }
    Ok(interleaved)
}
/// Serialize interleaved f32 PCM samples into an in-memory 16-bit WAV file.
/// Samples are clamped to [-1.0, 1.0] before quantization.
fn samples_to_wav(samples: &[f32], sample_rate: u32, channels: u16) -> Result<Vec<u8>, String> {
    let spec = WavSpec {
        channels,
        sample_rate,
        bits_per_sample: 16,
        sample_format: hound::SampleFormat::Int,
    };
    let mut wav_bytes: Vec<u8> = Vec::new();
    let mut writer = WavWriter::new(Cursor::new(&mut wav_bytes), spec)
        .map_err(|e| format!("Failed to create WAV writer: {}", e))?;
    // Quantize each float sample to i16 and append it to the stream.
    for &raw in samples {
        let quantized = (raw.clamp(-1.0, 1.0) * 32767.0) as i16;
        writer
            .write_sample(quantized)
            .map_err(|e| format!("Failed to write sample: {}", e))?;
    }
    // finalize() consumes the writer, releasing the borrow on wav_bytes.
    writer
        .finalize()
        .map_err(|e| format!("Failed to finalize WAV: {}", e))?;
    Ok(wav_bytes)
}

View File

@@ -0,0 +1,47 @@
#[cfg(target_os = "macos")]
mod macos;
#[cfg(target_os = "windows")]
mod windows;
#[cfg(target_os = "linux")]
mod linux;
#[cfg(target_os = "macos")]
pub use macos::*;
#[cfg(target_os = "windows")]
pub use windows::*;
#[cfg(target_os = "linux")]
pub use linux::*;
use std::sync::{Arc, Mutex};
#[cfg(target_os = "macos")]
use screencapturekit::stream::sc_stream::SCStream;
/// Shared, thread-safe state for an in-progress system-audio capture.
///
/// Every field is `Arc<Mutex<..>>` because the platform backends write from a
/// dedicated capture thread (or ScreenCaptureKit callback) while control
/// calls arrive from the async runtime.
pub struct AudioCaptureState {
    /// Interleaved f32 PCM samples accumulated by the capture backend.
    pub samples: Arc<Mutex<Vec<f32>>>,
    /// Sample rate of the captured stream, in Hz.
    pub sample_rate: Arc<Mutex<u32>>,
    /// Channel count of the captured stream.
    pub channels: Arc<Mutex<u16>>,
    /// One-shot stop signal; `Some` while a capture is running.
    pub stop_tx: Arc<Mutex<Option<tokio::sync::mpsc::Sender<()>>>>,
    /// Error reported by the capture backend, if any.
    pub error: Arc<Mutex<Option<String>>>,
    /// macOS only: handle to the active ScreenCaptureKit stream.
    #[cfg(target_os = "macos")]
    pub stream: Arc<Mutex<Option<SCStream>>>,
}
impl AudioCaptureState {
    /// Create an idle capture state with defaults (44.1 kHz, stereo, no
    /// samples, no error, no active capture).
    pub fn new() -> Self {
        Self {
            samples: Arc::new(Mutex::new(Vec::new())),
            sample_rate: Arc::new(Mutex::new(44100)),
            channels: Arc::new(Mutex::new(2)),
            stop_tx: Arc::new(Mutex::new(None)),
            error: Arc::new(Mutex::new(None)),
            #[cfg(target_os = "macos")]
            stream: Arc::new(Mutex::new(None)),
        }
    }

    /// Clear previously captured audio and any recorded error so a new
    /// capture starts from a clean slate. `clear()` keeps the sample buffer's
    /// allocation, so back-to-back recordings do not reallocate from scratch
    /// (the old `*.. = Vec::new()` dropped the capacity every time).
    pub fn reset(&self) {
        self.samples.lock().unwrap().clear();
        *self.error.lock().unwrap() = None;
    }
}

View File

@@ -0,0 +1,288 @@
use crate::audio_capture::AudioCaptureState;
use base64::{engine::general_purpose, Engine as _};
use hound::{WavSpec, WavWriter};
use std::io::Cursor;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::thread;
use wasapi::*;
use windows::Win32::System::Com::{CoInitializeEx, CoUninitialize, COINIT_MULTITHREADED};
/// Start capturing system audio on Windows via WASAPI loopback.
///
/// Opens the default *render* device and initializes its audio client with
/// `Direction::Capture`, which the `wasapi` crate turns into a loopback
/// capture of whatever that device is playing. Capture runs on a dedicated
/// thread (WASAPI COM objects are not `Send`); samples accumulate in
/// `state.samples` until the stop signal or `max_duration_secs` elapses.
///
/// NOTE(review): only 32-bit mix formats are converted to f32 below; for any
/// other bit depth the read loop silently collects nothing — confirm against
/// real devices whether that path can occur with `autoconvert: true`.
pub async fn start_capture(
    state: &AudioCaptureState,
    max_duration_secs: u32,
) -> Result<(), String> {
    // Reset previous samples
    state.reset();
    // Clone the shared handles the capture thread will write into.
    let samples = state.samples.clone();
    let sample_rate_arc = state.sample_rate.clone();
    let channels_arc = state.channels.clone();
    let stop_tx = state.stop_tx.clone();
    let error_arc = state.error.clone();
    // Use AtomicBool for stop signal (works with non-Send types)
    let stop_flag = Arc::new(AtomicBool::new(false));
    let stop_flag_clone = stop_flag.clone();
    // Create tokio channel and spawn a task to bridge it to the AtomicBool.
    // NOTE: `recv()` also returns (with None) when every sender is dropped,
    // so closing the channel raises the stop flag too.
    let (tx, mut rx) = tokio::sync::mpsc::channel::<()>(1);
    *stop_tx.lock().unwrap() = Some(tx);
    tokio::spawn(async move {
        rx.recv().await;
        stop_flag_clone.store(true, Ordering::Relaxed);
    });
    // Spawn capture task on a dedicated thread (WASAPI COM objects are not Send)
    // All WASAPI objects must be created and used on the same thread
    thread::spawn(move || {
        // Initialize COM for this thread
        unsafe {
            // SAFETY: called once at thread start, before any other COM call
            // on this thread; balanced by the CoUninitialize in the
            // scopeguard below.
            let hr = CoInitializeEx(None, COINIT_MULTITHREADED);
            if hr.is_err() {
                eprintln!("Failed to initialize COM: {:?}", hr);
                return;
            }
        }
        // Ensure COM is uninitialized when thread exits
        let _com_guard = scopeguard::guard((), |_| unsafe {
            // SAFETY: pairs with the successful CoInitializeEx above.
            CoUninitialize();
        });
        // Initialize WASAPI on this thread. Loopback trick: fetch the default
        // *render* device here, then initialize it with Capture direction below.
        let device = match DeviceEnumerator::new()
            .and_then(|enumerator| enumerator.get_default_device(&Direction::Render))
        {
            Ok(d) => d,
            Err(e) => {
                let error_msg = format!("Failed to get audio device: {}", e);
                eprintln!("{}", error_msg);
                *error_arc.lock().unwrap() = Some(error_msg);
                return;
            }
        };
        let mut audio_client = match device.get_iaudioclient() {
            Ok(client) => client,
            Err(e) => {
                let error_msg = format!("Failed to get audio client: {}", e);
                eprintln!("{}", error_msg);
                *error_arc.lock().unwrap() = Some(error_msg);
                return;
            }
        };
        let mix_format = match audio_client.get_mixformat() {
            Ok(format) => format,
            Err(e) => {
                let error_msg = format!("Failed to get mix format: {}", e);
                eprintln!("{}", error_msg);
                *error_arc.lock().unwrap() = Some(error_msg);
                return;
            }
        };
        // Publish stream parameters so stop_capture can build the WAV header.
        let channels = mix_format.get_nchannels() as usize;
        let bytes_per_sample = (mix_format.get_bitspersample() / 8) as usize;
        *sample_rate_arc.lock().unwrap() = mix_format.get_samplespersec();
        *channels_arc.lock().unwrap() = mix_format.get_nchannels();
        // Get device period
        let (_def_period, min_period) = match audio_client.get_device_period() {
            Ok(periods) => periods,
            Err(e) => {
                eprintln!("Failed to get device period: {}", e);
                return;
            }
        };
        // Initialize audio client for loopback with StreamMode
        // For loopback mode: get Render device, initialize with Capture direction
        // This triggers AUDCLNT_STREAMFLAGS_LOOPBACK in the wasapi crate
        let stream_mode = StreamMode::EventsShared {
            autoconvert: true, // Enable automatic format conversion
            buffer_duration_hns: min_period, // Use minimum period
        };
        if let Err(e) = audio_client.initialize_client(&mix_format, &Direction::Capture, &stream_mode) {
            let error_msg = format!("Failed to initialize audio client: {}", e);
            eprintln!("{}", error_msg);
            *error_arc.lock().unwrap() = Some(error_msg);
            return;
        }
        // Set up event handle for EventsShared mode
        let h_event = match audio_client.set_get_eventhandle() {
            Ok(event) => event,
            Err(e) => {
                eprintln!("Failed to set event handle: {}", e);
                return;
            }
        };
        let capture_client = match audio_client.get_audiocaptureclient() {
            Ok(client) => client,
            Err(e) => {
                let error_msg = format!("Failed to get capture client: {}", e);
                eprintln!("{}", error_msg);
                *error_arc.lock().unwrap() = Some(error_msg);
                return;
            }
        };
        if let Err(e) = audio_client.start_stream() {
            let error_msg = format!("Failed to start stream: {}", e);
            eprintln!("{}", error_msg);
            *error_arc.lock().unwrap() = Some(error_msg);
            return;
        }
        // Pump packets until the stop flag is raised.
        loop {
            // Check if stop signal was received
            if stop_flag.load(Ordering::Relaxed) {
                break;
            }
            // Try to get available data
            match capture_client.get_next_packet_size() {
                Ok(Some(frames_available)) => {
                    if frames_available > 0 {
                        // Calculate buffer size needed (frames * channels * bytes_per_sample)
                        let buffer_size = frames_available as usize * channels * bytes_per_sample;
                        let mut buffer = vec![0u8; buffer_size];
                        match capture_client.read_from_device(&mut buffer) {
                            Ok((frames_read, _buffer_info)) => {
                                if frames_read > 0 {
                                    // Convert bytes to f32 samples
                                    let samples_read = (frames_read as usize * channels) as usize;
                                    let mut samples_guard = samples.lock().unwrap();
                                    // Assuming 32-bit float format (see the
                                    // NOTE in the function docs); other bit
                                    // depths are skipped here.
                                    if bytes_per_sample == 4 {
                                        for i in 0..samples_read {
                                            let byte_offset = i * 4;
                                            if byte_offset + 4 <= buffer.len() {
                                                let sample = f32::from_le_bytes([
                                                    buffer[byte_offset],
                                                    buffer[byte_offset + 1],
                                                    buffer[byte_offset + 2],
                                                    buffer[byte_offset + 3],
                                                ]);
                                                samples_guard.push(sample);
                                            }
                                        }
                                    }
                                }
                            }
                            Err(e) => {
                                eprintln!("Error reading from device: {}", e);
                            }
                        }
                    }
                }
                Ok(None) => {
                    // Exclusive mode - handle differently if needed
                }
                Err(e) => {
                    eprintln!("Error getting next packet size: {}", e);
                }
            }
            // Wait for event signal (with timeout to allow checking stop flag)
            if h_event.wait_for_event(100).is_err() {
                // Timeout is expected - just continue to check stop flag
            }
        }
        // Stop the stream when done
        audio_client.stop_stream().ok();
    });
    // Spawn timeout task so a recording cannot run forever.
    let stop_tx_clone = state.stop_tx.clone();
    tokio::spawn(async move {
        tokio::time::sleep(tokio::time::Duration::from_secs(max_duration_secs as u64)).await;
        // Take the sender out of the mutex before awaiting
        let tx = stop_tx_clone.lock().unwrap().take();
        if let Some(tx) = tx {
            let _ = tx.send(()).await;
        }
    });
    Ok(())
}
/// Stop an in-progress Windows capture and return the recording as a
/// base64-encoded 16-bit WAV file.
///
/// Fails if the capture thread reported an error or no samples were captured.
pub async fn stop_capture(state: &AudioCaptureState) -> Result<String, String> {
    // Signal stop. Take the sender out before signalling so the mutex guard
    // is not held across the call; `try_send` delivers the message
    // synchronously (the old `let _ = tx.send(())` dropped the un-awaited
    // future and only worked because dropping the sender closed the channel).
    let sender = state.stop_tx.lock().unwrap().take();
    if let Some(sender) = sender {
        let _ = sender.try_send(());
    }
    // Give the capture thread time to notice the flag and stop the stream.
    tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
    // Surface any error recorded by the capture thread.
    if let Some(error) = state.error.lock().unwrap().as_ref() {
        return Err(error.clone());
    }
    // Snapshot the captured samples and stream parameters.
    let samples = state.samples.lock().unwrap().clone();
    let sample_rate = *state.sample_rate.lock().unwrap();
    let channels = *state.channels.lock().unwrap();
    if samples.is_empty() {
        return Err("No audio samples captured. Make sure audio is playing on your system during recording.".to_string());
    }
    // Encode as WAV, then base64 so the result can be returned as a string.
    let wav_data = samples_to_wav(&samples, sample_rate, channels)?;
    Ok(general_purpose::STANDARD.encode(&wav_data))
}
/// WASAPI loopback is available on every supported Windows build, so support
/// reduces to "are we compiled for Windows at all".
pub fn is_supported() -> bool {
    // cfg!() evaluates to a compile-time bool — same value as the previous
    // pair of #[cfg]-gated blocks, in expression form.
    cfg!(target_os = "windows")
}
/// Render interleaved f32 samples as an in-memory 16-bit PCM WAV file.
/// Each sample is clamped to [-1.0, 1.0] and scaled to the i16 range.
fn samples_to_wav(samples: &[f32], sample_rate: u32, channels: u16) -> Result<Vec<u8>, String> {
    let spec = WavSpec {
        channels,
        sample_rate,
        bits_per_sample: 16,
        sample_format: hound::SampleFormat::Int,
    };
    let mut wav_bytes: Vec<u8> = Vec::new();
    let mut writer = WavWriter::new(Cursor::new(&mut wav_bytes), spec)
        .map_err(|e| format!("Failed to create WAV writer: {}", e))?;
    // Quantize every float sample to i16 and append it.
    for &raw in samples {
        let pcm = (raw.clamp(-1.0, 1.0) * 32767.0) as i16;
        writer
            .write_sample(pcm)
            .map_err(|e| format!("Failed to write sample: {}", e))?;
    }
    // finalize() consumes the writer, ending the borrow on wav_bytes.
    writer
        .finalize()
        .map_err(|e| format!("Failed to finalize WAV: {}", e))?;
    Ok(wav_bytes)
}

View File

@@ -0,0 +1,481 @@
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use cpal::{Device, Host, SampleFormat, StreamConfig};
use std::sync::{Arc, Mutex};
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
/// Description of one audio output device, serializable for the frontend.
#[derive(Debug, Clone, serde::Serialize)]
pub struct AudioOutputDevice {
    /// Identifier derived from the device name (cpal exposes no stable IDs),
    /// so it is only as stable as the name itself.
    pub id: String,
    /// Human-readable device name as reported by the host.
    pub name: String,
    /// True when this is the system default output device.
    pub is_default: bool,
}
/// Playback-side state: the cpal host plus a flag used to silence all
/// currently-running playback streams.
pub struct AudioOutputState {
    host: Host,
    /// Set by `stop_all_playback`; active streams then output silence.
    stop_flag: Arc<AtomicBool>,
}
impl AudioOutputState {
/// Construct a fresh output state bound to the system default audio host,
/// with playback initially allowed (stop flag cleared).
pub fn new() -> Self {
    let host = cpal::default_host();
    let stop_flag = Arc::new(AtomicBool::new(false));
    Self { host, stop_flag }
}
/// Request that every active playback stream go silent.
///
/// This only raises the shared flag — it does not tear streams down; per the
/// log below, streams that observe the flag output silence. Always succeeds.
pub fn stop_all_playback(&self) -> Result<(), String> {
    eprintln!("stop_all_playback: Setting stop flag");
    self.stop_flag.store(true, Ordering::Relaxed);
    eprintln!("stop_all_playback: Stop flag set - active streams will output silence");
    Ok(())
}
/// Enumerate the host's audio output devices.
///
/// IDs are synthesized from device names (cpal provides no stable IDs), so
/// they are only as stable as the names themselves. The default device is
/// identified by name comparison.
pub fn list_output_devices(&self) -> Result<Vec<AudioOutputDevice>, String> {
    // Name of the default output device, if one exists (empty string when
    // its name cannot be read — mirrors the comparison semantics below).
    let default_name = self
        .host
        .default_output_device()
        .map(|d| d.name().unwrap_or_default());
    let devices = self
        .host
        .output_devices()
        .map_err(|e| format!("Failed to enumerate output devices: {}", e))?;
    let mut listed = Vec::new();
    for device in devices {
        let name = device
            .name()
            .map_err(|e| format!("Failed to get device name: {}", e))?;
        listed.push(AudioOutputDevice {
            id: format!("device_{}", name.replace(' ', "_").to_lowercase()),
            is_default: default_name.as_deref() == Some(name.as_str()),
            name,
        });
    }
    Ok(listed)
}
/// Decode `audio_data` and start playback on every requested output device.
///
/// Device IDs must match those produced by `list_output_devices` (derived
/// from device names). Any playback already in progress is silenced first,
/// then the stop flag is re-armed for the new streams. Returns once playback
/// has *started* on all matched devices; it does not wait for it to finish.
/// Errors if no requested ID matches an available device.
pub async fn play_audio_to_devices(
    &self,
    audio_data: Vec<u8>,
    device_ids: Vec<String>,
) -> Result<(), String> {
    eprintln!("play_audio_to_devices called with {} bytes, {} device IDs", audio_data.len(), device_ids.len());
    eprintln!("Requested device IDs: {:?}", device_ids);
    // Decode audio file (assuming WAV format)
    eprintln!("Decoding audio data...");
    let (samples, sample_rate, channels) = self.decode_wav(&audio_data)?;
    eprintln!("Audio decoded: {} samples, {}Hz, {} channels", samples.len(), sample_rate, channels);
    // Find devices by ID, rebuilding the same name-derived IDs that
    // list_output_devices hands out and keeping only the requested ones.
    eprintln!("Enumerating output devices...");
    let devices: Vec<Device> = self
        .host
        .output_devices()
        .map_err(|e| format!("Failed to enumerate devices: {}", e))?
        .filter_map(|device| {
            let name = device.name().ok()?;
            let id = format!("device_{}", name.replace(' ', "_").to_lowercase());
            eprintln!("Found device: {} (id: {})", name, id);
            if device_ids.contains(&id) {
                eprintln!("  -> Matched! Will play to this device");
                Some(device)
            } else {
                None
            }
        })
        .collect();
    if devices.is_empty() {
        eprintln!("ERROR: No matching devices found");
        return Err("No matching devices found".to_string());
    }
    eprintln!("Playing to {} device(s)", devices.len());
    // Stop any existing playback first
    self.stop_all_playback().ok();
    // Reset stop flag for new playback
    self.stop_flag.store(false, Ordering::Relaxed);
    // Play to each device; the first failure aborts the remaining devices.
    for (i, device) in devices.iter().enumerate() {
        let device_name = device.name().unwrap_or_else(|_| "unknown".to_string());
        eprintln!("Playing to device {}/{}: {}", i + 1, devices.len(), device_name);
        self.play_to_device(device, samples.clone(), sample_rate, channels, self.stop_flag.clone())
            .map_err(|e| format!("Failed to play to device {}: {}", device_name, e))?;
        eprintln!("Successfully started playback on device: {}", device_name);
    }
    eprintln!("play_audio_to_devices completed successfully");
    Ok(())
}
/// Decode an in-memory audio file into interleaved f32 samples.
///
/// Despite the name, this goes through symphonia's format probe, so any
/// format symphonia recognizes (not only WAV) will decode. Returns
/// `(samples, sample_rate, channels)` with samples interleaved
/// frame-by-frame across channels. Decoding stops at the first failed
/// `next_packet` (end of stream or error — the two are not distinguished).
fn decode_wav(&self, data: &[u8]) -> Result<(Vec<f32>, u32, u16), String> {
    use symphonia::core::formats::FormatOptions;
    use symphonia::core::io::MediaSourceStream;
    use symphonia::core::meta::MetadataOptions;
    eprintln!("decode_wav: Creating MediaSourceStream from {} bytes", data.len());
    let mss = MediaSourceStream::new(
        Box::new(std::io::Cursor::new(data.to_vec())),
        Default::default(),
    );
    eprintln!("decode_wav: Probing audio format...");
    let mut format = symphonia::default::get_probe()
        .format(
            &Default::default(),
            mss,
            &FormatOptions::default(),
            &MetadataOptions::default(),
        )
        .map_err(|e| {
            eprintln!("decode_wav: Failed to probe audio: {}", e);
            format!("Failed to probe audio: {}", e)
        })?
        .format;
    eprintln!("decode_wav: Audio format probed successfully");
    eprintln!("decode_wav: Finding audio track...");
    // First track with a real (non-NULL) codec is treated as "the" audio track.
    let track = format
        .tracks()
        .iter()
        .find(|t| t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL)
        .ok_or_else(|| {
            eprintln!("decode_wav: No audio track found");
            "No audio track found".to_string()
        })?;
    let sample_rate = track
        .codec_params
        .sample_rate
        .ok_or_else(|| {
            eprintln!("decode_wav: No sample rate found in track");
            "No sample rate found".to_string()
        })?;
    let channels = track
        .codec_params
        .channels
        .ok_or_else(|| {
            eprintln!("decode_wav: No channels found in track");
            "No channels found".to_string()
        })?
        .count() as u16;
    eprintln!("decode_wav: Track info - sample_rate: {}, channels: {}", sample_rate, channels);
    eprintln!("decode_wav: Creating decoder...");
    let mut decoder = symphonia::default::get_codecs()
        .make(&track.codec_params, &Default::default())
        .map_err(|e| {
            eprintln!("decode_wav: Failed to create decoder: {}", e);
            format!("Failed to create decoder: {}", e)
        })?;
    eprintln!("decode_wav: Decoder created successfully");
    let mut samples = Vec::new();
    let mut packet_count = 0;
    eprintln!("decode_wav: Starting packet decoding loop...");
    loop {
        let packet = match format.next_packet() {
            Ok(packet) => packet,
            Err(e) => {
                eprintln!("decode_wav: End of stream or error: {:?}", e);
                break;
            }
        };
        packet_count += 1;
        let decoded = decoder
            .decode(&packet)
            .map_err(|e| {
                eprintln!("decode_wav: Decode error on packet {}: {}", packet_count, e);
                format!("Decode error: {}", e)
            })?;
        // Convert to f32 samples by matching on the buffer type
        use symphonia::core::audio::{AudioBufferRef, Signal};
        use symphonia::core::conv::FromSample;
        let spec = *decoded.spec();
        let num_channels = spec.channels.count();
        let num_frames = decoded.frames();
        eprintln!("decode_wav: Packet {} - {} frames, {} channels", packet_count, num_frames, num_channels);
        // Interleave samples from all channels: for each frame, emit one
        // sample per channel in channel order.
        for frame_idx in 0..num_frames {
            for ch in 0..num_channels {
                let sample_f32 = match &decoded {
                    AudioBufferRef::U8(buf) => f32::from_sample(buf.chan(ch)[frame_idx]),
                    AudioBufferRef::U16(buf) => f32::from_sample(buf.chan(ch)[frame_idx]),
                    AudioBufferRef::U24(buf) => f32::from_sample(buf.chan(ch)[frame_idx]),
                    AudioBufferRef::U32(buf) => f32::from_sample(buf.chan(ch)[frame_idx]),
                    AudioBufferRef::S8(buf) => f32::from_sample(buf.chan(ch)[frame_idx]),
                    AudioBufferRef::S16(buf) => f32::from_sample(buf.chan(ch)[frame_idx]),
                    AudioBufferRef::S24(buf) => f32::from_sample(buf.chan(ch)[frame_idx]),
                    AudioBufferRef::S32(buf) => f32::from_sample(buf.chan(ch)[frame_idx]),
                    AudioBufferRef::F32(buf) => buf.chan(ch)[frame_idx],
                    AudioBufferRef::F64(buf) => buf.chan(ch)[frame_idx] as f32,
                };
                samples.push(sample_f32);
            }
        }
    }
    eprintln!("decode_wav: Decoded {} packets, total {} samples", packet_count, samples.len());
    eprintln!("decode_wav: Returning sample_rate={}, channels={}", sample_rate, channels);
    Ok((samples, sample_rate, channels))
}
/// Play pre-decoded audio out through a single cpal output device,
/// blocking until playback completes or `stop_flag` is raised.
///
/// `samples` are interleaved f32 frames at `sample_rate` Hz with
/// `channels` channels; they are resampled and re-channeled to match the
/// device's default output configuration before playback starts.
///
/// Returns `Err` with a human-readable message if the device config
/// cannot be read or the output stream cannot be built/started.
fn play_to_device(
    &self,
    device: &Device,
    samples: Vec<f32>,
    sample_rate: u32,
    channels: u16,
    stop_flag: Arc<AtomicBool>,
) -> Result<(), String> {
    let device_name = device.name().unwrap_or_else(|_| "unknown".to_string());
    eprintln!("play_to_device: Starting playback to device: {}", device_name);
    eprintln!("play_to_device: Input - {} samples, {}Hz, {} channels", samples.len(), sample_rate, channels);
    let config = device
        .default_output_config()
        .map_err(|e| format!("Failed to get default config: {}", e))?;
    // Prepare samples for the device's format
    let device_sample_rate = config.sample_rate().0;
    let device_channels = config.channels();
    let device_sample_format = config.sample_format();
    eprintln!("play_to_device: Device config - {}Hz, {} channels, format: {:?}",
        device_sample_rate, device_channels, device_sample_format);
    // Resample if needed (simple linear interpolation for now)
    // NOTE(review): resample() operates on the raw interleaved buffer, not
    // per channel — for multi-channel input this should really resample
    // each channel independently; confirm before relying on quality here.
    let resampled = if device_sample_rate != sample_rate {
        eprintln!("play_to_device: Resampling from {}Hz to {}Hz", sample_rate, device_sample_rate);
        let result = self.resample(&samples, sample_rate, device_sample_rate);
        eprintln!("play_to_device: Resampled {} samples to {} samples", samples.len(), result.len());
        result
    } else {
        eprintln!("play_to_device: No resampling needed");
        samples
    };
    // Interleave/convert channels if needed
    eprintln!("play_to_device: Interleaving channels from {} to {} channels", channels, device_channels);
    let interleaved = self.interleave_channels(&resampled, channels, device_channels);
    eprintln!("play_to_device: Interleaved to {} samples", interleaved.len());
    // Create shared buffer for playback; the audio callback reads from it
    // and advances `position` so the waiter loop below can tell when the
    // whole buffer has been consumed.
    let buffer: Arc<Mutex<Vec<f32>>> = Arc::new(Mutex::new(interleaved));
    let position = Arc::new(AtomicUsize::new(0));
    let buffer_clone = buffer.clone();
    let position_clone = position.clone();
    let err_fn = |err| eprintln!("Playback error: {}", err);
    let stream_config = StreamConfig {
        channels: device_channels,
        sample_rate: cpal::SampleRate(device_sample_rate),
        buffer_size: cpal::BufferSize::Default,
    };
    let stop_flag_clone = stop_flag.clone();
    // Three callbacks below are structurally identical; only the sample
    // type and the f32 -> native-format conversion differ per branch.
    let stream = match config.sample_format() {
        SampleFormat::F32 => {
            let buffer = buffer_clone.clone();
            let pos = position_clone.clone();
            device
                .build_output_stream(
                    &stream_config,
                    move |data: &mut [f32], _: &cpal::OutputCallbackInfo| {
                        // Check stop flag - if set, output silence
                        if stop_flag_clone.load(Ordering::Relaxed) {
                            for sample in data.iter_mut() {
                                *sample = 0.0;
                            }
                            return;
                        }
                        let mut idx = pos.load(Ordering::Relaxed);
                        let buf = buffer.lock().unwrap();
                        for sample in data.iter_mut() {
                            if idx < buf.len() {
                                *sample = buf[idx];
                                idx += 1;
                            } else {
                                // Past end of buffer: pad with silence.
                                *sample = 0.0;
                            }
                        }
                        pos.store(idx, Ordering::Relaxed);
                    },
                    err_fn,
                    None,
                )
                .map_err(|e| format!("Failed to build stream: {}", e))?
        }
        SampleFormat::I16 => {
            let buffer = buffer_clone.clone();
            let pos = position_clone.clone();
            device
                .build_output_stream(
                    &stream_config,
                    move |data: &mut [i16], _: &cpal::OutputCallbackInfo| {
                        // Check stop flag - if set, output silence
                        if stop_flag_clone.load(Ordering::Relaxed) {
                            for sample in data.iter_mut() {
                                *sample = 0;
                            }
                            return;
                        }
                        let mut idx = pos.load(Ordering::Relaxed);
                        let buf = buffer.lock().unwrap();
                        for sample in data.iter_mut() {
                            if idx < buf.len() {
                                // Scale [-1.0, 1.0] float into signed 16-bit range.
                                *sample = (buf[idx] * 32767.0) as i16;
                                idx += 1;
                            } else {
                                *sample = 0;
                            }
                        }
                        pos.store(idx, Ordering::Relaxed);
                    },
                    err_fn,
                    None,
                )
                .map_err(|e| format!("Failed to build stream: {}", e))?
        }
        SampleFormat::U16 => {
            let buffer = buffer_clone.clone();
            let pos = position_clone.clone();
            device
                .build_output_stream(
                    &stream_config,
                    move |data: &mut [u16], _: &cpal::OutputCallbackInfo| {
                        // Check stop flag - if set, output silence
                        // (32768 is the unsigned-16 midpoint, i.e. zero amplitude)
                        if stop_flag_clone.load(Ordering::Relaxed) {
                            for sample in data.iter_mut() {
                                *sample = 32768;
                            }
                            return;
                        }
                        let mut idx = pos.load(Ordering::Relaxed);
                        let buf = buffer.lock().unwrap();
                        for sample in data.iter_mut() {
                            if idx < buf.len() {
                                // Map [-1.0, 1.0] onto [0, 65535].
                                *sample = ((buf[idx] + 1.0) * 32767.5) as u16;
                                idx += 1;
                            } else {
                                *sample = 32768;
                            }
                        }
                        pos.store(idx, Ordering::Relaxed);
                    },
                    err_fn,
                    None,
                )
                .map_err(|e| format!("Failed to build stream: {}", e))?
        }
        _ => return Err("Unsupported sample format".to_string()),
    };
    eprintln!("play_to_device: Starting stream playback...");
    stream.play().map_err(|e| {
        eprintln!("play_to_device: Failed to play stream: {}", e);
        format!("Failed to play stream: {}", e)
    })?;
    eprintln!("play_to_device: Stream started successfully");
    // Keep the stream alive until playback finishes.
    // Previously the stream was dropped immediately on function return,
    // causing silent playback (cpal stops output when its Stream is dropped).
    let total_samples = {
        buffer.lock().unwrap().len()
    };
    loop {
        let pos = position.load(std::sync::atomic::Ordering::Relaxed);
        if pos >= total_samples || stop_flag.load(std::sync::atomic::Ordering::Relaxed) {
            break;
        }
        std::thread::sleep(std::time::Duration::from_millis(10));
    }
    // stream is dropped here, after audio has finished playing
    drop(stream);
    eprintln!("play_to_device: Function completed successfully");
    Ok(())
}
/// Resample `samples` from `from_rate` Hz to `to_rate` Hz using linear
/// interpolation between neighbouring source samples.
///
/// Returns a copy of the input when the rates already match. The previous
/// implementation truncated the fractional source index (nearest-neighbour
/// on the low side), which introduces audible aliasing; the call site even
/// described it as "linear interpolation", so this makes the code match.
///
/// NOTE(review): this interpolates over the raw sample stream. The caller
/// currently passes interleaved multi-channel data, which should really be
/// resampled per channel — confirm channel count or de-interleave first
/// for best quality.
fn resample(&self, samples: &[f32], from_rate: u32, to_rate: u32) -> Vec<f32> {
    if from_rate == to_rate {
        return samples.to_vec();
    }
    let ratio = to_rate as f64 / from_rate as f64;
    let new_len = (samples.len() as f64 * ratio) as usize;
    let mut resampled = Vec::with_capacity(new_len);
    for i in 0..new_len {
        // Fractional position of output sample i in the source stream.
        let src_pos = i as f64 / ratio;
        let idx = src_pos as usize;
        let frac = (src_pos - idx as f64) as f32;
        // Interpolate between the two surrounding source samples,
        // holding the last sample at the end of the buffer.
        let s0 = samples.get(idx).copied().unwrap_or(0.0);
        let s1 = samples.get(idx + 1).copied().unwrap_or(s0);
        resampled.push(s0 + (s1 - s0) * frac);
    }
    resampled
}
/// Convert an interleaved sample buffer from `src_channels` to
/// `dst_channels` channels, frame by frame.
///
/// - Same channel count: returns a copy.
/// - Upmix (`dst > src`): copies every source channel, then repeats the
///   last source channel into the extra destination slots (unchanged from
///   the previous behaviour).
/// - Downmix to mono: averages all source channels. The previous version
///   kept only channel 0, silently discarding half (or more) of the
///   signal — e.g. stereo→mono dropped the entire right channel.
/// - Other downmixes: keep the first `dst_channels` channels.
///
/// Any trailing partial frame (length not divisible by `src_channels`) is
/// dropped, matching the previous behaviour. Panics if `src_channels == 0`
/// (as before, via division by zero).
fn interleave_channels(
    &self,
    samples: &[f32],
    src_channels: u16,
    dst_channels: u16,
) -> Vec<f32> {
    if src_channels == dst_channels {
        return samples.to_vec();
    }
    let src = src_channels as usize;
    let dst = dst_channels as usize;
    let frames = samples.len() / src;
    let mut out = Vec::with_capacity(frames * dst);
    for frame in samples.chunks_exact(src) {
        if dst > src {
            // Upmix: all source channels, then duplicate the last one.
            out.extend_from_slice(frame);
            let last = frame[src - 1];
            for _ in src..dst {
                out.push(last);
            }
        } else if dst == 1 {
            // Proper mono downmix: average the channels instead of
            // keeping only the first.
            let sum: f32 = frame.iter().sum();
            out.push(sum / src as f32);
        } else {
            // General downmix: keep the first dst channels.
            out.extend_from_slice(&frame[..dst]);
        }
    }
    out
}
}
impl Default for AudioOutputState {
fn default() -> Self {
Self::new()
}
}

View File

@@ -0,0 +1 @@
pub mod audio_capture;

926
tauri/src-tauri/src/main.rs Normal file
View File

@@ -0,0 +1,926 @@
// Prevents additional console window on Windows in release, DO NOT REMOVE!!
#![cfg_attr(not(debug_assertions), windows_subsystem = "windows")]
mod audio_capture;
mod audio_output;
use std::sync::Mutex;
use tauri::{command, State, Manager, WindowEvent, Emitter, Listener, RunEvent};
use tauri_plugin_shell::ShellExt;
use tokio::sync::mpsc;
const LEGACY_PORT: u16 = 8000;
const SERVER_PORT: u16 = 17493;
/// Find a voicebox-server process listening on a given port (Windows only).
///
/// Uses PowerShell `Get-NetTCPConnection` to look up the PID owning the port,
/// then verifies via `tasklist` that it's a voicebox process. The caller is
/// responsible for checking port occupancy first (e.g. `TcpStream::connect_timeout`).
/// Replaces the previous `netstat -ano` approach which failed on systems with
/// corrupted system DLLs (see #277).
#[cfg(windows)]
fn find_voicebox_pid_on_port(port: u16) -> Option<u32> {
    use std::process::Command;
    // PowerShell's built-in Get-NetTCPConnection cmdlet maps the listening
    // port to its owning PID without depending on netstat.exe.
    let ps_script = format!(
        "Get-NetTCPConnection -LocalPort {} -State Listen -ErrorAction SilentlyContinue | Select-Object -ExpandProperty OwningProcess",
        port
    );
    let output = Command::new("powershell")
        .args(["-NoProfile", "-Command", &ps_script])
        .output()
        .ok()?;
    let stdout = String::from_utf8_lossy(&output.stdout);
    // Each stdout line holds at most one PID; return the first one that
    // tasklist confirms belongs to a voicebox process.
    stdout
        .lines()
        .filter_map(|line| line.trim().parse::<u32>().ok())
        .find(|pid| {
            Command::new("tasklist")
                .args(["/FI", &format!("PID eq {}", pid), "/FO", "CSV", "/NH"])
                .output()
                .map(|out| {
                    String::from_utf8_lossy(&out.stdout)
                        .to_lowercase()
                        .contains("voicebox")
                })
                .unwrap_or(false)
        })
}
/// Check if a Voicebox server is responding on the given port.
///
/// Sends an HTTP GET to `/health` and returns `true` only if the response
/// is valid JSON matching the Voicebox `HealthResponse` schema — specifically
/// `status` must be `"healthy"`, and both `model_loaded` and `gpu_available`
/// must be present as booleans. This prevents misidentifying an unrelated
/// service that happens to expose a `/health` endpoint.
#[allow(dead_code)] // Used in platform-specific cfg blocks
fn check_health(port: u16) -> bool {
    let url = format!("http://127.0.0.1:{}/health", port);
    // Any failure along the way (client build, request, status, JSON parse)
    // means we cannot confirm a Voicebox server — report false.
    let Ok(client) = reqwest::blocking::Client::builder()
        .timeout(std::time::Duration::from_secs(3))
        .build()
    else {
        return false;
    };
    let Ok(resp) = client.get(&url).send() else {
        return false;
    };
    if !resp.status().is_success() {
        return false;
    }
    let Ok(body) = resp.json::<serde_json::Value>() else {
        return false;
    };
    // Validate the Voicebox-specific HealthResponse schema: status must be
    // "healthy" and both capability flags must be present as booleans, so
    // an unrelated service with a /health endpoint is not misidentified.
    body.get("status").and_then(|v| v.as_str()) == Some("healthy")
        && body.get("model_loaded").map(|v| v.is_boolean()).unwrap_or(false)
        && body.get("gpu_available").map(|v| v.is_boolean()).unwrap_or(false)
}
/// Shared state for the backend server process managed by this app.
struct ServerState {
    // Handle to the sidecar child spawned by this app instance
    // (None when the server is external, reused, or not yet started).
    child: Mutex<Option<tauri_plugin_shell::process::CommandChild>>,
    // PID of the server process — including externally discovered ones —
    // so it can be killed on shutdown.
    server_pid: Mutex<Option<u32>>,
    // When true, the server is left alive after the app closes.
    keep_running_on_close: Mutex<bool>,
    // Custom models directory passed to the server (None = default).
    models_dir: Mutex<Option<String>>,
}
/// Start (or reuse) the voicebox backend server and return its base URL
/// (`http://127.0.0.1:<SERVER_PORT>`).
///
/// Resolution order:
/// 1. Reuse the child already managed by this app instance.
/// 2. Reuse an existing listener on `SERVER_PORT` when it is a voicebox
///    process, or when it passes the `/health` schema check; otherwise
///    error out because the port is held by an unrelated application.
/// 3. Kill any orphaned voicebox server left on the legacy port 8000.
/// 4. Spawn the CUDA onedir backend from the app data dir when present and
///    version-matched, else the bundled CPU sidecar, then wait (up to
///    120 s) for the Uvicorn startup log line before returning.
///
/// In debug builds, several failure paths fall back to a manually started
/// dev server if one is already listening on `SERVER_PORT`.
#[command]
async fn start_server(
    app: tauri::AppHandle,
    state: State<'_, ServerState>,
    remote: Option<bool>,
    models_dir: Option<String>,
) -> Result<String, String> {
    // Store models_dir for use on restart (empty string means reset to default)
    if let Some(ref dir) = models_dir {
        if dir.is_empty() {
            *state.models_dir.lock().unwrap() = None;
        } else {
            *state.models_dir.lock().unwrap() = Some(dir.clone());
        }
    }
    // Check if server is already running (managed by this app instance)
    if state.child.lock().unwrap().is_some() {
        return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
    }
    // Check if a voicebox server is already running on our port (from previous session with keep_running=true,
    // or an externally started server e.g. via `python`, `uvicorn`, Docker, etc.)
    #[cfg(unix)]
    {
        use std::process::Command;
        if let Ok(output) = Command::new("lsof")
            .args(["-i", &format!(":{}", SERVER_PORT), "-sTCP:LISTEN"])
            .output()
        {
            let output_str = String::from_utf8_lossy(&output.stdout);
            // Skip the lsof header row; columns are COMMAND then PID.
            for line in output_str.lines().skip(1) {
                let parts: Vec<&str> = line.split_whitespace().collect();
                if parts.len() >= 2 {
                    let command = parts[0];
                    let pid_str = parts[1];
                    if command.contains("voicebox") {
                        if let Ok(pid) = pid_str.parse::<u32>() {
                            println!("Found existing voicebox-server on port {} (PID: {}), reusing it", SERVER_PORT, pid);
                            // Store the PID so we can kill it on exit if needed
                            *state.server_pid.lock().unwrap() = Some(pid);
                            return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
                        }
                    } else {
                        // Process name doesn't contain "voicebox" — could be an external
                        // Python/uvicorn/Docker server. Verify via HTTP health check.
                        println!("Port {} in use by '{}' (PID: {}), checking if it's a Voicebox server...", SERVER_PORT, command, pid_str);
                        if check_health(SERVER_PORT) {
                            println!("Health check passed — reusing external server on port {}", SERVER_PORT);
                            return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
                        }
                        println!("Health check failed — port is occupied by a non-Voicebox process");
                        return Err(format!(
                            "Port {} is already in use by another application ({}). \
                            Close it or change the Voicebox server port.",
                            SERVER_PORT, command
                        ));
                    }
                }
            }
        }
    }
    #[cfg(windows)]
    {
        use std::net::TcpStream;
        if TcpStream::connect_timeout(
            &format!("127.0.0.1:{}", SERVER_PORT).parse().unwrap(),
            std::time::Duration::from_secs(1),
        ).is_ok() {
            // Port is in use — check if it's a voicebox process by name first
            if let Some(pid) = find_voicebox_pid_on_port(SERVER_PORT) {
                println!("Found existing voicebox-server on port {} (PID: {}), reusing it", SERVER_PORT, pid);
                *state.server_pid.lock().unwrap() = Some(pid);
                return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
            }
            // Process name doesn't match — could be an external Python/Docker server.
            // Verify via HTTP health check before giving up.
            println!("Port {} in use by unknown process, checking if it's a Voicebox server...", SERVER_PORT);
            if check_health(SERVER_PORT) {
                println!("Health check passed — reusing external server on port {}", SERVER_PORT);
                return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
            }
            return Err(format!(
                "Port {} is already in use by another application. \
                Close the other application or change the Voicebox port.",
                SERVER_PORT
            ));
        }
    }
    // Kill any orphaned voicebox-server from previous session on legacy port 8000
    // This handles upgrades from older versions that used a fixed port
    #[cfg(unix)]
    {
        use std::process::Command;
        if let Ok(output) = Command::new("lsof")
            .args(["-i", &format!(":{}", LEGACY_PORT), "-sTCP:LISTEN"])
            .output()
        {
            let output_str = String::from_utf8_lossy(&output.stdout);
            for line in output_str.lines().skip(1) {
                let parts: Vec<&str> = line.split_whitespace().collect();
                if parts.len() >= 2 {
                    let command = parts[0];
                    let pid_str = parts[1];
                    if command.contains("voicebox") {
                        if let Ok(pid) = pid_str.parse::<i32>() {
                            println!("Found orphaned voicebox-server on legacy port {} (PID: {}, CMD: {}), killing it...", LEGACY_PORT, pid, command);
                            // Kill the whole process group first, then the PID itself.
                            let _ = Command::new("kill")
                                .args(["-9", "--", &format!("-{}", pid)])
                                .output();
                            let _ = Command::new("kill")
                                .args(["-9", &pid.to_string()])
                                .output();
                        }
                    } else {
                        println!("Legacy port {} is in use by non-voicebox process: {} (PID: {}), not killing", LEGACY_PORT, command, pid_str);
                    }
                }
            }
        }
    }
    #[cfg(windows)]
    {
        use std::net::TcpStream;
        if TcpStream::connect_timeout(
            &format!("127.0.0.1:{}", LEGACY_PORT).parse().unwrap(),
            std::time::Duration::from_secs(1),
        ).is_ok() {
            if let Some(pid) = find_voicebox_pid_on_port(LEGACY_PORT) {
                println!("Found orphaned voicebox-server on legacy port {} (PID: {}), killing it...", LEGACY_PORT, pid);
                let _ = std::process::Command::new("taskkill")
                    .args(["/PID", &pid.to_string(), "/T", "/F"])
                    .output();
            }
        }
    }
    // Brief wait for port to be released
    std::thread::sleep(std::time::Duration::from_millis(200));
    // Get app data directory
    let data_dir = app
        .path()
        .app_data_dir()
        .map_err(|e| format!("Failed to get app data dir: {}", e))?;
    // Ensure data directory exists
    std::fs::create_dir_all(&data_dir)
        .map_err(|e| format!("Failed to create data dir: {}", e))?;
    println!("=================================================================");
    println!("Starting voicebox-server sidecar");
    println!("Data directory: {:?}", data_dir);
    println!("Remote mode: {}", remote.unwrap_or(false));
    // Check for CUDA backend in data directory (onedir layout: backends/cuda/)
    let cuda_binary = {
        let cuda_dir = data_dir.join("backends").join("cuda");
        let cuda_name = if cfg!(windows) {
            "voicebox-server-cuda.exe"
        } else {
            "voicebox-server-cuda"
        };
        let exe_path = cuda_dir.join(cuda_name);
        if exe_path.exists() {
            println!("Found CUDA backend at {:?}", cuda_dir);
            // Version check: run --version from the onedir directory so
            // PyInstaller can find its support files for the fast --version path
            let app_version = app.config().version.clone().unwrap_or_default();
            let version_ok = match std::process::Command::new(&exe_path)
                .arg("--version")
                .current_dir(&cuda_dir)
                .output()
            {
                Ok(output) => {
                    // Output format: "voicebox-server X.Y.Z\n"
                    let version_str = String::from_utf8_lossy(&output.stdout);
                    let binary_version = version_str.trim().split_whitespace().last().unwrap_or("");
                    if binary_version == app_version {
                        println!("CUDA binary version {} matches app version", binary_version);
                        true
                    } else {
                        println!(
                            "CUDA binary version mismatch: binary={}, app={}. Falling back to CPU.",
                            binary_version, app_version
                        );
                        false
                    }
                }
                Err(e) => {
                    println!("Failed to check CUDA binary version: {}. Falling back to CPU.", e);
                    false
                }
            };
            if version_ok {
                Some(exe_path)
            } else {
                None
            }
        } else {
            println!("No CUDA backend found, using bundled CPU binary");
            None
        }
    };
    let sidecar_result = app.shell().sidecar("voicebox-server");
    let mut sidecar = match sidecar_result {
        Ok(s) => s,
        Err(e) => {
            eprintln!("Failed to get sidecar: {}", e);
            // In dev mode, check if the server is already running (started manually)
            #[cfg(debug_assertions)]
            {
                eprintln!("Dev mode: Checking if server is already running on port {}...", SERVER_PORT);
                // Try to connect to the server port
                use std::net::TcpStream;
                if TcpStream::connect_timeout(
                    &format!("127.0.0.1:{}", SERVER_PORT).parse().unwrap(),
                    std::time::Duration::from_secs(1),
                ).is_ok() {
                    println!("Found server already running on port {}", SERVER_PORT);
                    return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
                }
                eprintln!("");
                eprintln!("=================================================================");
                eprintln!("DEV MODE: No server found on port {}", SERVER_PORT);
                eprintln!("");
                eprintln!("Start the Python server in a separate terminal:");
                eprintln!(" bun run dev:server");
                eprintln!("=================================================================");
                eprintln!("");
            }
            return Err(format!("Failed to start server. In dev mode, run 'bun run dev:server' in a separate terminal."));
        }
    };
    println!("Sidecar command created successfully");
    // Build common args
    let data_dir_str = data_dir
        .to_str()
        .ok_or_else(|| "Invalid data dir path".to_string())?
        .to_string();
    let port_str = SERVER_PORT.to_string();
    let parent_pid_str = std::process::id().to_string();
    let is_remote = remote.unwrap_or(false);
    // Resolve the custom models directory from the parameter or stored state
    let effective_models_dir = models_dir.or_else(|| state.models_dir.lock().unwrap().clone());
    if let Some(ref dir) = effective_models_dir {
        println!("Custom models directory: {}", dir);
    }
    // If CUDA binary exists, launch it from the onedir directory.
    // .current_dir() is critical: PyInstaller onedir expects all DLLs and
    // support files (nvidia/, _internal/, etc.) relative to the exe.
    let spawn_result = if let Some(ref cuda_path) = cuda_binary {
        let cuda_dir = cuda_path.parent().unwrap();
        println!("Launching CUDA backend: {:?} (cwd: {:?})", cuda_path, cuda_dir);
        let mut cmd = app.shell().command(cuda_path.to_str().unwrap());
        cmd = cmd.current_dir(cuda_dir);
        cmd = cmd.args(["--data-dir", &data_dir_str, "--port", &port_str, "--parent-pid", &parent_pid_str]);
        if is_remote {
            cmd = cmd.args(["--host", "0.0.0.0"]);
        }
        if let Some(ref dir) = effective_models_dir {
            cmd = cmd.env("VOICEBOX_MODELS_DIR", dir);
        }
        cmd.spawn()
    } else {
        // Use the bundled CPU sidecar
        sidecar = sidecar.args(["--data-dir", &data_dir_str, "--port", &port_str, "--parent-pid", &parent_pid_str]);
        if is_remote {
            sidecar = sidecar.args(["--host", "0.0.0.0"]);
        }
        if let Some(ref dir) = effective_models_dir {
            sidecar = sidecar.env("VOICEBOX_MODELS_DIR", dir);
        }
        println!("Spawning server process...");
        sidecar.spawn()
    };
    // rx streams the child's stdout/stderr events; child is the handle.
    let (mut rx, child) = match spawn_result {
        Ok(result) => result,
        Err(e) => {
            eprintln!("Failed to spawn server process: {}", e);
            // In dev mode, check if a manually-started server is available
            #[cfg(debug_assertions)]
            {
                use std::net::TcpStream;
                if TcpStream::connect_timeout(
                    &format!("127.0.0.1:{}", SERVER_PORT).parse().unwrap(),
                    std::time::Duration::from_secs(1),
                ).is_ok() {
                    println!("Found manually-started server on port {}", SERVER_PORT);
                    return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
                }
                eprintln!("");
                eprintln!("=================================================================");
                eprintln!("DEV MODE: Server binary failed to start");
                eprintln!("");
                eprintln!("Start the Python server in a separate terminal:");
                eprintln!(" bun run dev:server");
                eprintln!("=================================================================");
                eprintln!("");
                return Err("Dev mode: Start server manually with 'bun run dev:server'".to_string());
            }
            #[cfg(not(debug_assertions))]
            {
                eprintln!("This could be due to:");
                eprintln!(" - Missing or corrupted binary");
                eprintln!(" - Missing execute permissions");
                eprintln!(" - Code signing issues on macOS");
                eprintln!(" - Missing dependencies");
                return Err(format!("Failed to spawn: {}", e));
            }
        }
    };
    println!("Server process spawned, waiting for ready signal...");
    println!("=================================================================");
    // Store child process and PID
    let process_pid = child.pid();
    *state.server_pid.lock().unwrap() = Some(process_pid);
    *state.child.lock().unwrap() = Some(child);
    // Wait for server to be ready by listening for startup log
    // PyInstaller bundles can be slow on first import, especially torch/transformers
    let timeout = tokio::time::Duration::from_secs(120);
    let start_time = tokio::time::Instant::now();
    let mut error_output = Vec::new();
    loop {
        if start_time.elapsed() > timeout {
            eprintln!("Server startup timeout after 120 seconds");
            if !error_output.is_empty() {
                eprintln!("Collected error output:");
                for line in &error_output {
                    eprintln!(" {}", line);
                }
            }
            // In dev mode, check if a manual server came up during the wait
            #[cfg(debug_assertions)]
            {
                use std::net::TcpStream;
                if TcpStream::connect_timeout(
                    &format!("127.0.0.1:{}", SERVER_PORT).parse().unwrap(),
                    std::time::Duration::from_secs(1),
                ).is_ok() {
                    // Kill the placeholder process
                    let _ = state.child.lock().unwrap().take();
                    println!("Found manually-started server on port {}", SERVER_PORT);
                    return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
                }
            }
            return Err("Server startup timeout - check Console.app for detailed logs".to_string());
        }
        match tokio::time::timeout(tokio::time::Duration::from_millis(100), rx.recv()).await {
            Ok(Some(event)) => {
                match event {
                    tauri_plugin_shell::process::CommandEvent::Stdout(line) => {
                        let line_str = String::from_utf8_lossy(&line);
                        println!("Server output: {}", line_str);
                        let _ = app.emit("server-log", serde_json::json!({
                            "stream": "stdout",
                            "line": line_str.trim_end(),
                        }));
                        if line_str.contains("Uvicorn running") || line_str.contains("Application startup complete") {
                            println!("Server is ready!");
                            break;
                        }
                    }
                    tauri_plugin_shell::process::CommandEvent::Stderr(line) => {
                        let line_str = String::from_utf8_lossy(&line).to_string();
                        eprintln!("Server: {}", line_str);
                        let _ = app.emit("server-log", serde_json::json!({
                            "stream": "stderr",
                            "line": line_str.trim_end(),
                        }));
                        // Collect error lines for debugging
                        if line_str.contains("ERROR") || line_str.contains("Error") || line_str.contains("Failed") {
                            error_output.push(line_str.clone());
                        }
                        // Uvicorn logs to stderr, so check there too
                        if line_str.contains("Uvicorn running") || line_str.contains("Application startup complete") {
                            println!("Server is ready!");
                            break;
                        }
                    }
                    _ => {}
                }
            }
            Ok(None) => {
                // In dev mode, this is expected when using the placeholder binary
                #[cfg(debug_assertions)]
                {
                    use std::net::TcpStream;
                    eprintln!("Server process ended (dev mode placeholder detected)");
                    // Check if a manually-started server is available
                    if TcpStream::connect_timeout(
                        &format!("127.0.0.1:{}", SERVER_PORT).parse().unwrap(),
                        std::time::Duration::from_secs(1),
                    ).is_ok() {
                        // Clean up state
                        let _ = state.child.lock().unwrap().take();
                        let _ = state.server_pid.lock().unwrap().take();
                        println!("Found manually-started server on port {}", SERVER_PORT);
                        return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
                    }
                    eprintln!("");
                    eprintln!("=================================================================");
                    eprintln!("DEV MODE: No bundled server binary available");
                    eprintln!("");
                    eprintln!("Start the Python server in a separate terminal:");
                    eprintln!(" bun run dev:server");
                    eprintln!("=================================================================");
                    eprintln!("");
                    return Err("Dev mode: Start server manually with 'bun run dev:server'".to_string());
                }
                #[cfg(not(debug_assertions))]
                {
                    eprintln!("Server process ended unexpectedly during startup!");
                    eprintln!("The server binary may have crashed or exited with an error.");
                    eprintln!("Check Console.app logs for more details (search for 'voicebox')");
                    return Err("Server process ended unexpectedly".to_string());
                }
            }
            Err(_) => {
                // Timeout on this recv, continue loop
                continue;
            }
        }
    }
    // Spawn task to continue reading output and emit to frontend
    let app_handle = app.clone();
    tokio::spawn(async move {
        while let Some(event) = rx.recv().await {
            match event {
                tauri_plugin_shell::process::CommandEvent::Stdout(line) => {
                    let line_str = String::from_utf8_lossy(&line);
                    println!("Server: {}", line_str);
                    let _ = app_handle.emit("server-log", serde_json::json!({
                        "stream": "stdout",
                        "line": line_str.trim_end(),
                    }));
                }
                tauri_plugin_shell::process::CommandEvent::Stderr(line) => {
                    let line_str = String::from_utf8_lossy(&line);
                    eprintln!("Server error: {}", line_str);
                    let _ = app_handle.emit("server-log", serde_json::json!({
                        "stream": "stderr",
                        "line": line_str.trim_end(),
                    }));
                }
                _ => {}
            }
        }
    });
    Ok(format!("http://127.0.0.1:{}", SERVER_PORT))
}
/// Stop the managed server: on Unix, SIGTERM the process group then force
/// SIGKILL; on Windows, request a graceful HTTP `/shutdown` and rely on the
/// server's parent-pid watchdog as a fallback.
#[command]
async fn stop_server(state: State<'_, ServerState>) -> Result<(), String> {
    // Take both handles out of state up front so a later start_server sees
    // a clean slate even if the kill below fails.
    let pid = state.server_pid.lock().unwrap().take();
    let _child = state.child.lock().unwrap().take();
    if let Some(pid) = pid {
        println!("stop_server: Stopping server with PID: {}", pid);
        #[cfg(unix)]
        {
            use std::process::Command;
            // Kill process group with SIGTERM first
            let _ = Command::new("kill")
                .args(["-TERM", "--", &format!("-{}", pid)])
                .output();
            // Brief wait then force kill
            std::thread::sleep(std::time::Duration::from_millis(100));
            let _ = Command::new("kill")
                .args(["-9", "--", &format!("-{}", pid)])
                .output();
            // Also target the PID directly in case it isn't a group leader.
            let _ = Command::new("kill")
                .args(["-9", &pid.to_string()])
                .output();
            println!("stop_server: Process group kill completed");
        }
        #[cfg(windows)]
        {
            // Send graceful shutdown via HTTP — the server's parent-pid watchdog
            // will also handle cleanup if this app process exits.
            println!("Sending graceful shutdown via HTTP...");
            let client = reqwest::blocking::Client::builder()
                .timeout(std::time::Duration::from_secs(2))
                .build()
                .unwrap();
            let _ = client
                .post(&format!("http://127.0.0.1:{}/shutdown", SERVER_PORT))
                .send();
            println!("Shutdown request sent (server watchdog will handle cleanup)");
        }
    }
    Ok(())
}
/// Stop the running server and start it again, optionally updating the
/// stored custom models directory first (an empty string resets it to the
/// default).
#[command]
async fn restart_server(
    app: tauri::AppHandle,
    state: State<'_, ServerState>,
    models_dir: Option<String>,
) -> Result<String, String> {
    println!("restart_server: stopping current server...");
    // Persist the caller's models directory choice before restarting.
    if let Some(dir) = models_dir {
        let stored = if dir.is_empty() { None } else { Some(dir) };
        *state.models_dir.lock().unwrap() = stored;
    }
    // Tear down the current instance.
    stop_server(state.clone()).await?;
    println!("restart_server: waiting for port release...");
    // Give the OS a moment to release the listening port.
    tokio::time::sleep(tokio::time::Duration::from_millis(1000)).await;
    println!("restart_server: starting server...");
    // start_server re-detects the CUDA backend and reads the stored
    // models_dir, so both arguments are passed as None here.
    start_server(app, state, None, None).await
}
/// Record whether the backend server should be left alive when the
/// application closes (consulted by the exit handling).
#[command]
fn set_keep_server_running(state: State<'_, ServerState>, keep_running: bool) {
    println!("set_keep_server_running called with: {}", keep_running);
    let mut flag = state.keep_running_on_close.lock().unwrap();
    *flag = keep_running;
}
/// Tauri command: begin capturing system audio, capped at
/// `max_duration_secs` (see `audio_capture::start_capture` for the exact
/// capture semantics). Thin delegate to the `audio_capture` module.
#[command]
async fn start_system_audio_capture(
    state: State<'_, audio_capture::AudioCaptureState>,
    max_duration_secs: u32,
) -> Result<(), String> {
    audio_capture::start_capture(&state, max_duration_secs).await
}
/// Tauri command: stop the in-progress system audio capture and return the
/// recorded audio as a String (presumably base64-encoded — see
/// `audio_capture::stop_capture` for the payload format).
#[command]
async fn stop_system_audio_capture(
    state: State<'_, audio_capture::AudioCaptureState>,
) -> Result<String, String> {
    audio_capture::stop_capture(&state).await
}
/// Tauri command: report whether system audio capture is available on this
/// platform/build (delegates to `audio_capture::is_supported`).
#[command]
fn is_system_audio_supported() -> bool {
    audio_capture::is_supported()
}
/// Tauri command: enumerate the available audio output devices
/// (delegates to `AudioOutputState::list_output_devices`).
#[command]
fn list_audio_output_devices(
    state: State<'_, audio_output::AudioOutputState>,
) -> Result<Vec<audio_output::AudioOutputDevice>, String> {
    state.list_output_devices()
}
/// Tauri command: play an encoded audio buffer (`audio_data` bytes) to the
/// output devices selected by `device_ids` (delegates to
/// `AudioOutputState::play_audio_to_devices`).
#[command]
async fn play_audio_to_devices(
    state: State<'_, audio_output::AudioOutputState>,
    audio_data: Vec<u8>,
    device_ids: Vec<String>,
) -> Result<(), String> {
    state.play_audio_to_devices(audio_data, device_ids).await
}
/// Tauri command: stop all in-progress audio playback started via
/// `play_audio_to_devices` (delegates to
/// `AudioOutputState::stop_all_playback`).
#[command]
fn stop_audio_playback(
    state: State<'_, audio_output::AudioOutputState>,
) -> Result<(), String> {
    state.stop_all_playback()
}
#[cfg_attr(mobile, tauri::mobile_entry_point)]
pub fn run() {
tauri::Builder::default()
.plugin(tauri_plugin_dialog::init())
.plugin(tauri_plugin_fs::init())
.plugin(tauri_plugin_shell::init())
.manage(ServerState {
child: Mutex::new(None),
server_pid: Mutex::new(None),
keep_running_on_close: Mutex::new(false),
models_dir: Mutex::new(None),
})
.manage(audio_capture::AudioCaptureState::new())
.manage(audio_output::AudioOutputState::new())
.setup(|app| {
#[cfg(desktop)]
{
app.handle().plugin(tauri_plugin_updater::Builder::new().build())?;
app.handle().plugin(tauri_plugin_process::init())?;
}
// Hide title bar icon on Windows
#[cfg(windows)]
{
use windows::Win32::Foundation::HWND;
use windows::Win32::UI::WindowsAndMessaging::{SetClassLongPtrW, GCLP_HICON, GCLP_HICONSM};
if let Some((_, window)) = app.webview_windows().iter().next() {
if let Ok(hwnd) = window.hwnd() {
let hwnd = HWND(hwnd.0);
unsafe {
// Set both small and regular icons to NULL to hide the title bar icon
SetClassLongPtrW(hwnd, GCLP_HICON, 0);
SetClassLongPtrW(hwnd, GCLP_HICONSM, 0);
}
}
}
}
// Enable microphone access on Linux (WebKitGTK denies getUserMedia by default)
#[cfg(target_os = "linux")]
{
use tauri::Manager;
if let Some(window) = app.get_webview_window("main") {
let _ = window.with_webview(|webview| {
use webkit2gtk::{WebViewExt, SettingsExt, PermissionRequestExt};
use webkit2gtk::glib::ObjectExt;
let wk_webview = webview.inner();
// Enable media stream support in WebKitGTK settings
if let Some(settings) = WebViewExt::settings(&wk_webview) {
settings.set_enable_media_stream(true);
}
// Auto-grant UserMediaPermissionRequest (microphone access)
// Only for trusted local origins (Tauri dev server or custom protocol)
wk_webview.connect_permission_request(move |webview, request: &webkit2gtk::PermissionRequest| {
if request.is::<webkit2gtk::UserMediaPermissionRequest>() {
let uri = WebViewExt::uri(webview).unwrap_or_default();
let is_trusted = uri.starts_with("tauri://")
|| uri.starts_with("https://tauri.localhost")
|| uri.starts_with("http://localhost")
|| uri.starts_with("http://127.0.0.1");
if is_trusted {
request.allow();
return true;
}
request.deny();
return true;
}
false
});
});
}
}
Ok(())
})
.invoke_handler(tauri::generate_handler![
start_server,
stop_server,
restart_server,
set_keep_server_running,
start_system_audio_capture,
stop_system_audio_capture,
is_system_audio_supported,
list_audio_output_devices,
play_audio_to_devices,
stop_audio_playback
])
        .on_window_event({
            // Re-entrancy guard: the flow below calls `window.close()` again,
            // which fires a second CloseRequested that must be let through.
            let closing = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false));
            move |window, event| {
                if let WindowEvent::CloseRequested { api, .. } = event {
                    // If we're already in the close flow, let it proceed
                    if closing.load(std::sync::atomic::Ordering::SeqCst) {
                        return;
                    }
                    closing.store(true, std::sync::atomic::Ordering::SeqCst);
                    // Prevent automatic close so frontend can clean up
                    api.prevent_close();
                    // Emit event to frontend to check setting and stop server if needed
                    let app_handle = window.app_handle();
                    if let Err(e) = app_handle.emit("window-close-requested", ()) {
                        eprintln!("Failed to emit window-close-requested event: {}", e);
                        // Cannot coordinate with the frontend — close immediately.
                        window.close().ok();
                        return;
                    }
                    // Set up listener for frontend response
                    let window_for_close = window.clone();
                    let closing_for_timeout = closing.clone();
                    // Channel bridges the synchronous event listener into the
                    // async task below.
                    let (tx, mut rx) = mpsc::unbounded_channel::<()>();
                    let listener_id = window.listen("window-close-allowed", move |_| {
                        let _ = tx.send(());
                    });
                    // Close once the frontend signals it is done, or after a
                    // 5-second timeout so a hung frontend cannot block exit.
                    tauri::async_runtime::spawn(async move {
                        tokio::select! {
                            _ = rx.recv() => {
                                window_for_close.close().ok();
                            }
                            _ = tokio::time::sleep(tokio::time::Duration::from_secs(5)) => {
                                eprintln!("Window close timeout, closing anyway");
                                window_for_close.close().ok();
                            }
                        }
                        // Tidy up the listener and reset the guard for any
                        // future close attempt.
                        window_for_close.unlisten(listener_id);
                        closing_for_timeout.store(false, std::sync::atomic::Ordering::SeqCst);
                    });
                }
            }})
        .build(tauri::generate_context!())
        .expect("error while building tauri application")
        .run(|app, event| {
            let _ = &app; // used on unix
            match &event {
                RunEvent::Exit => {
                    // Last chance to manage the sidecar server before this
                    // process terminates.
                    let state = app.state::<ServerState>();
                    let keep_running = *state.keep_running_on_close.lock().unwrap();
                    let has_pid = state.server_pid.lock().unwrap().is_some();
                    println!("RunEvent::Exit — keep_running={}, has_pid={}", keep_running, has_pid);
                    if keep_running {
                        // Tell the server to disable its watchdog so it survives
                        // after this process exits.
                        println!("Keep server running: disabling watchdog...");
                        // Write a sentinel file as a reliable fallback. On Windows
                        // the HTTP request below can race with process exit, leaving
                        // the watchdog unaware it should stay alive. The sentinel
                        // file is checked during the watchdog grace period.
                        let data_dir = app
                            .path()
                            .app_data_dir()
                            .unwrap_or_default();
                        let sentinel = data_dir.join(".keep-running");
                        if let Err(e) = std::fs::write(&sentinel, b"1") {
                            eprintln!("Failed to write keep-running sentinel: {}", e);
                        } else {
                            println!("Wrote keep-running sentinel to {:?}", sentinel);
                        }
                        // A blocking client is acceptable here: the event loop is
                        // already shutting down, and the call is capped at 2s.
                        let client = reqwest::blocking::Client::builder()
                            .timeout(std::time::Duration::from_secs(2))
                            .build()
                            .unwrap();
                        match client
                            .post(&format!("http://127.0.0.1:{}/watchdog/disable", SERVER_PORT))
                            .send()
                        {
                            Ok(resp) => println!("Watchdog disable response: {}", resp.status()),
                            Err(e) => eprintln!("Failed to disable watchdog: {}", e),
                        }
                    } else {
                        // Server will self-terminate via parent-pid watchdog when
                        // this process exits. On Unix, also send SIGTERM for
                        // immediate cleanup.
                        println!("RunEvent::Exit - server will self-terminate via watchdog");
                        #[cfg(unix)]
                        {
                            if let Some(pid) = state.server_pid.lock().unwrap().take() {
                                use std::process::Command;
                                // "-{pid}" addresses the whole process group so
                                // children spawned by the server are signalled too.
                                let _ = Command::new("kill")
                                    .args(["-TERM", "--", &format!("-{}", pid)])
                                    .output();
                                // Short grace period, then escalate to SIGKILL.
                                std::thread::sleep(std::time::Duration::from_millis(100));
                                let _ = Command::new("kill")
                                    .args(["-9", "--", &format!("-{}", pid)])
                                    .output();
                                // Also target the single PID in case the server
                                // is not a process-group leader.
                                let _ = Command::new("kill")
                                    .args(["-9", &pid.to_string()])
                                    .output();
                            }
                        }
                    }
                }
                RunEvent::ExitRequested { api, .. } => {
                    println!("RunEvent::ExitRequested received");
                    // Don't prevent exit, just log it
                    let _ = api;
                }
                _ => {}
            }
        });
}
/// Binary entry point — all wiring and the Tauri event loop live in `run()`.
fn main() {
    run()
}